import selenium
from selenium import webdriver
import os
import requests
from PyPDF2 import PdfFileReader
import random
import time
from selenium.webdriver.common.keys import Keys
import pandas as pd
def makedirs_D(file_path='D:' + "\\公司年报\\"):
    """Create the report output directory if it does not exist yet.

    Args:
        file_path: Directory to create. Defaults to the annual-report folder
            on the D: drive, which is the location the script writes to.

    Prints the path when a new directory was created; does nothing when the
    directory is already present.
    """
    if not os.path.exists(file_path):
        # exist_ok avoids a crash if the directory appears between the
        # existence check above and the creation call.
        os.makedirs(file_path, exist_ok=True)
        print("New file_path:", file_path)
def get_html_content(url, timeout=30):
    """Fetch a URL and return the raw response body.

    The request is sent with a desktop-Chrome User-Agent header.

    Args:
        url: Address to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the whole run indefinitely.

    Returns:
        The response body as bytes when the status code is 200, otherwise
        None (after printing a failure message).

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            timeout expires.
    """
    header = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'}
    res = requests.get(url, headers=header, timeout=timeout)
    if res.status_code == 200:
        # res.content is the undecoded byte payload, so no text encoding
        # needs to be configured on the response.
        return res.content
    print("Failed to get urlText!")
    return None
def report_save(url, pdf_name):
    """Download the PDF at *url* and save it as ``<pdf_name>.pdf``.

    The file is written into the annual-report folder on the D: drive.

    Args:
        url: Address of the PDF document.
        pdf_name: File name without the ``.pdf`` extension.

    When the download fails (``get_html_content`` returns None) nothing is
    written and a message is printed instead.
    """
    report = get_html_content(url)
    if report is None:
        # Check before opening the file: opening in 'wb' mode would leave an
        # empty file behind and writing None would raise TypeError.
        print("Failed to save PDFreport:", pdf_name)
        return
    path = "D:\\公司年报\\" + pdf_name + ".pdf"
    with open(path, 'wb') as f:
        f.write(report)
def get_num_pages(pdf_name):
    """Return the number of pages of a previously saved annual report.

    Args:
        pdf_name: File name without the ``.pdf`` extension; the file is
            looked up in the annual-report folder on the D: drive.

    Returns:
        The page count reported by the PDF reader.
    """
    pdf_path = "D:\\公司年报\\" + pdf_name + ".pdf"
    pdf = PdfFileReader(pdf_path)
    # Encrypted documents are unlocked with an empty password before the
    # page count is read.
    if pdf.isEncrypted:
        pdf.decrypt('')
    return pdf.getNumPages()
# Module-level browser setup: this runs as soon as the file is executed or
# imported. `driver` is the shared Chrome session used by main().
driver = webdriver.Chrome(r'D:\softs\python3.8\Scripts\chromedriver.exe')
# Implicit wait: element lookups keep retrying for up to 5 seconds.
driver.implicitly_wait(5)
# Search page on cninfo.com.cn; the query string pre-selects a category
# (presumably annual reports -- TODO confirm).
url = 'http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&checkedCategory=category_ndbg_szsh#sse'
driver.get(url)
# # Change the publication date range of the reports
# driver.find_element_by_xpath('//tbody/tr[4]/td[7]/div/span').click()
# driver.find_element_by_xpath('/html/body/div[5]/div[1]/div[1]/button[7]').send_keys(Keys.ENTER)
def main():
    """Download the PDF behind every row of the search-result table.

    For each table row the company code, company name, announcement title
    and publication date are read. The announcement link is then activated,
    the tab it opens is inspected for the PDF link, the PDF is stored via
    ``report_save`` and the tab is closed again before the next row is
    handled. The browser is shut down when the function ends, including
    when an error interrupts the run.
    """
    # Imported locally so the explicit-wait helpers stay scoped to the only
    # function that needs them.
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.support.ui import WebDriverWait

    makedirs_D()
    items = []
    count = 1
    try:
        while count <= 1:  # the bound limits the run to a single pass
            count += 1
            time.sleep(2)
            all_tr = driver.find_element_by_xpath(
                '//*[@id="main"]/div[2]/div[1]/div[1]/div[2]/div/div[3]/table/tbody'
            ).find_elements_by_xpath('.//tr')
            for tr in all_tr:
                time.sleep(random.random() * 3)  # random pause between rows

                # Look the cells up once per row instead of once per field.
                cells = tr.find_elements_by_xpath('./td')
                number = cells[0].find_element_by_xpath('.//span').text
                # '*' is not allowed in Windows file names, so swap it out.
                name = cells[1].find_element_by_xpath('.//span').text.replace('*', '~')
                link = cells[2].find_element_by_xpath('.//a')
                title = link.text
                date = cells[3].find_element_by_xpath('.//span').text
                # A report published in one year covers the previous year.
                year = str(int(date[0:4]) - 1)

                item = {
                    '公司代码': number,
                    '公司名称': name,
                    '公告标题': title,
                    '公告发布日期': date,
                    '年报对应年份': year,
                }
                # NOTE(review): the row metadata was built but never stored;
                # it is collected here so `items` is not left empty.
                items.append(item)
                pdf_name = number + "-" + name + "-" + year + "-" + title

                handle_main = driver.current_window_handle
                known_handles = set(driver.window_handles)
                link.send_keys(Keys.ENTER)  # open the announcement page

                # The new tab appears asynchronously; checking the handle
                # list right after the key press can miss it and leave an
                # orphaned tab behind.
                try:
                    WebDriverWait(driver, 10).until(
                        lambda d: set(d.window_handles) - known_handles
                    )
                except TimeoutException:
                    print("No new tab opened for:", pdf_name)
                    continue

                # Only visit tabs opened by this click, never stale ones.
                for handle in set(driver.window_handles) - known_handles:
                    driver.switch_to.window(handle)
                    try:
                        pdf_href = driver.find_element_by_xpath(
                            '//*[@id="noticeDetail"]/div/div[2]/div[1]/a'
                        ).get_attribute('href')
                        print(pdf_href)
                        report_save(pdf_href, pdf_name)
                    finally:
                        # Always close the detail tab and return to the list
                        # page so tabs cannot pile up after a failure.
                        driver.close()
                        driver.switch_to.window(handle_main)
    finally:
        driver.quit()
......
# Run the scraper only when the file is executed as a script.
if __name__ == "__main__":
    main()
Q:注释掉 driver.close() 则程序无法持续运行;不注释掉则 driver 中存在多个标签页,且切换标签页时会从右往左逐页过渡。请问该如何调整?先行致谢!
将 driver.close() 注释掉,程序无法持续运行;不注释掉,driver 的标签页越来越多,且切换回主句柄时标签页依次从右向左逐个切换。请求帮助!
close方法关闭了实例,重新创建driver即可