Jupyter Notebook 网站爬虫

问题遇到的现象和发生背景

想在 Jupyter Notebook 中爬取以下链接中的信息:希望“发行日期”每次自动填入上周六至本周五的日期范围,然后点击查询;如有结果,将查询到的全部信息自动保存到 Excel 文档中;如果没有相关结果,则直接提示 "no result"。请问怎么实现?感谢~

网站链接:https://www.shclearing.com.cn/IssuerServicePlateform/view/client/search/ISIN_search_do.jsp

我想要达到的结果

下面是一个比较笨的办法(用 Selenium 模拟浏览器操作):


import calendar
import datetime
import re
import time

import openpyxl
import parsel as parsel
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Open the ISIN search page in a Chrome session driven by a local chromedriver.
url = 'https://www.shclearing.com.cn/IssuerServicePlateform/view/client/search/ISIN_search_do.jsp'
service = Service(r"D:\Softwares\chromedriver.exe")
driver = webdriver.Chrome(service=service)
driver.get(url=url)

# For each of the two date inputs: drop the readonly attribute, then set
# maxlength to 10 (presumably so a YYYY-MM-DD value can be typed in directly
# with send_keys). The scripts run one by one, start date first.
unlock_scripts = (
    "document.getElementById('startDate').removeAttribute('readonly')",
    "document.getElementById('startDate').setAttribute('maxlength', 10)",
    "document.getElementById('endDate').removeAttribute('readonly')",
    "document.getElementById('endDate').setAttribute('maxlength', 10)",
)
for unlock_script in unlock_scripts:
    driver.execute_script(unlock_script)

def query_date_range(today=None):
    """Return the issue-date query window for the week containing *today*.

    Weeks are taken as Monday through Sunday. The window starts on the
    Saturday immediately before that week and ends on the Friday inside it,
    so every day of one week maps to the same seven-day window.

    Calendar-date arithmetic is used instead of adding multiples of 86400
    seconds to a timestamp: the latter can land on the wrong day close to
    midnight when a DST change falls inside the offset, and it needs the
    clock to be read more than once.

    Args:
        today: Reference ``datetime.date``. Defaults to the current local
            date.

    Returns:
        A ``(last_saturday, current_friday)`` tuple of ``datetime.date``.
    """
    if today is None:
        today = datetime.date.today()
    # date.weekday(): 0 is Monday ... 6 is Sunday.
    weekday = today.weekday()
    last_saturday = today + datetime.timedelta(days=-2 - weekday)
    current_friday = today + datetime.timedelta(days=4 - weekday)
    return last_saturday, current_friday


_window_start, _window_end = query_date_range()

# Last Saturday's date as YYYY-MM-DD.
last_sat = _window_start.isoformat()
# This week's Friday as YYYY-MM-DD.
cur_fri = _window_end.isoformat()

# Type the window start (last Saturday) into the start-date input.
driver.find_element(By.ID, "startDate").send_keys(last_sat)
# Short pause before touching the second input (reason not stated in the
# original; presumably lets the page react to the first entry).
time.sleep(2)

# Type the window end (this week's Friday) into the end-date input.
driver.find_element(By.ID, "endDate").send_keys(cur_fri)

# Locate the query button and click it to run the search.
driver.find_element(By.XPATH, '//*[@id="button"]').click()

# Collect every page of the search result table and export it to Excel.
#
# Flow: wait for the results, read the page count from the pager, walk the
# pages with the "next page" link, append each data row to a worksheet, then
# save the workbook -- or print "no result" when no data row was found.

# Give the page time to render the search results before reading it.
time.sleep(3)

# A data row is seven attribute-less <td> cells; each group captures the
# inner HTML of one cell. Compiled once because it is applied to every row.
row_pattern = re.compile(
    '<td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>('
    '.*?)</td></tr>')
fieldnames = ['序号', '产品代码', 'ISIN编码', '发行日', '产品中文全称', '产品中文简称', '产品英文简称']
output_path = 'test.xlsx'

try:
    html_data = driver.page_source

    # Total page count as shown by the pager. If the pager text is absent,
    # fall back to a single page instead of failing with IndexError.
    page_counts = re.findall('<span class="gray_text12"> 共(.*?)</span>', html_data)
    page_total = int(page_counts[0]) if page_counts else 1

    # A new workbook already has one active sheet; no extra sheet is needed.
    work_book = openpyxl.Workbook()
    work_sheet = work_book.active
    work_sheet.append(fieldnames)  # header row

    row_total = 0
    for page in range(1, page_total + 1):
        selector = parsel.Selector(driver.page_source)
        table_rows = selector.css('#ISINS > tbody > tr').getall()
        # The first and last <tr> are skipped (presumably the header row and
        # the pager row -- verify against the live page).
        for row_html in table_rows[1:-1]:
            match = row_pattern.search(row_html)
            if match is None:
                # Not a seven-cell data row (e.g. a placeholder row); skip it
                # rather than crash on a missing match.
                continue
            row = list(match.groups())
            print(row)
            work_sheet.append(row)
            row_total += 1
        time.sleep(3)
        if page < page_total:
            # Click "next page" and wait for the next batch of rows to load.
            driver.find_element(by=By.LINK_TEXT, value='下一页').click()
            time.sleep(5)

    if row_total:
        # Persist the collected rows; without this nothing reaches disk.
        work_book.save(output_path)
    else:
        print("no result")
finally:
    # quit() ends the whole WebDriver session (browser and driver process),
    # and runs even if scraping failed part-way.
    driver.quit()