本人正在练习通过selenium爬取上市公司年报,试了很久才写出如下效果,但其中的selenium定位语句,实在不知道怎么优化,试了很多次,每次都报错,所以期待有高人可以拨冗指点,谢谢!
```python
import re
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
import requests
def main(out_fold):
    """Download the report PDFs listed on the stock's announcement page.

    Opens the listing page in Chrome, collects the title and link of every
    row in the report list, then visits each link, reads the ``src`` of the
    embedded PDF viewer and saves the PDF as ``<out_fold><title>.pdf``.

    Args:
        out_fold: Output directory prefix. It is concatenated directly with
            the file name, so it must end with a path separator.
    """
    # Imported locally so the function is self-contained: the file's import
    # block does not bring `By` into scope.
    from selenium.webdriver.common.by import By

    url = "https://gu.qq.com/sh600018/gp/jbnb/"

    chrome_options = Options()
    chrome_options.add_argument('--log-level=3')
    chrome_options.add_argument('--disable-gpu')  # disable GPU acceleration
    chrome_options.add_argument('--mute-audio')  # mute audio

    # NOTE(review): recent Selenium 4 releases no longer accept
    # `executable_path` and expect a `Service` object instead -- confirm
    # against the installed version before upgrading.
    driver = webdriver.Chrome(options=chrome_options, executable_path=r"E:/python_work/BrowseDriver/chromedriver.exe")
    try:
        # The implicit wait is a session-wide setting; configuring it once
        # is enough, it does not need to be repeated before every page load.
        driver.implicitly_wait(2)
        driver.get(url)

        # The header cell's grandparent is the container holding every row.
        report_list = driver.find_element(By.XPATH, "//strong[text()='公告标题']/../..")

        # Collect plain (title, href) pairs first: the row elements become
        # stale as soon as the driver navigates to another page.
        reports = []
        for row in report_list.find_elements(By.XPATH, './*'):
            cells = row.find_elements(By.XPATH, './*')
            if not cells:
                continue
            reports.append((cells[0].text, cells[0].get_attribute("href")))

        for title, href in reports:
            # Skip the header row and any row that carries no link.
            if title == "公告标题" or not href:
                continue

            driver.get(href)
            tab_container = driver.find_element(By.XPATH, "//a[@class='yk_on']/../..")
            # Same walk as before (3rd child -> its 1st child -> its 2nd
            # child), expressed as one relative XPath; presumably this is
            # the iframe that embeds the PDF.
            viewer = tab_container.find_element(By.XPATH, "./*[3]/*[1]/*[2]")
            pdf_src = viewer.get_attribute("src")
            if not pdf_src:
                print("skip (no pdf src): " + title)
                continue

            response = requests.get(pdf_src, timeout=30)
            if response.status_code != 200:
                # Do not save an error page under a .pdf name.
                print("skip (HTTP %d): %s" % (response.status_code, title))
                continue

            # Titles may contain characters that are illegal in file names.
            safe_title = re.sub(r'[\\/:*?"<>|]', "_", title)
            filename_full_path = out_fold + safe_title + ".pdf"
            with open(filename_full_path, "wb") as f:
                f.write(response.content)
    finally:
        # Always release the browser and chromedriver processes.
        driver.quit()

    print("over")
if __name__ == '__main__':
    # Every backslash is escaped: the original '\财' was an invalid escape
    # sequence that only worked by accident and triggers a warning on
    # current Python versions. The resulting path string is unchanged.
    out_fold = 'D:\\data_work\\财报分析模型\\财报\\tt\\'
    main(out_fold)
```
一次定位取到报告区域,使用如下语句:
ul = driver.find_element(By.XPATH,"//div[@class='content']/div/div[2]/ul")
获取pdf网址使用:
ele = driver.find_element(By.XPATH, "//div[@class='content']/div/div[3]/div/iframe").get_attribute('src')
将 main 中对应的这部分代码按如下方式调整即可(需在文件开头补充 `import time` 和 `from selenium.webdriver.common.by import By`):
# Load the listing page and locate the <ul> that holds the report rows.
driver.get(url)
report_ul = driver.find_element(By.XPATH, "//div[@class='content']/div/div[2]/ul")

# One entry per direct child of the list: the first child element of each
# row supplies both the title text and the link.
report_rows = report_ul.find_elements(By.XPATH, './*')  # one element per report row
for position, row in enumerate(report_rows):
    first_cell = row.find_elements(By.XPATH, './*')[0]
    file_list[position] = {
        "filename": first_cell.text,
        "href": first_cell.get_attribute("href"),
    }
print(file_list)

# Only the first two collected entries are processed here.
for key in list(file_list)[:2]:
    entry = file_list[key]
    if entry['filename'] == "公告标题":
        # Header row, nothing to download.
        continue

    driver.implicitly_wait(2)
    driver.get(entry['href'])

    # The iframe's src attribute is the URL of the PDF itself.
    pdf_frame = driver.find_element(By.XPATH, "//div[@class='content']/div/div[3]/div/iframe")
    pdf_url = pdf_frame.get_attribute('src')

    # Send a browser-like user agent with the download request.
    request_headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.55'}
    response = requests.get(pdf_url, headers=request_headers)

    target_path = out_fold + entry['filename'] + ".pdf"
    with open(target_path, "wb") as pdf_file:
        pdf_file.write(response.content)

    # Pause between downloads.
    time.sleep(1)
如有帮助,请点采纳。
您好,我是有问必答小助手,您的问题已经有小伙伴帮您解答,感谢您对有问必答的支持与关注!