这是我按照学习教程写的爬取斗鱼主播界面的简单代码,代码如下:
from selenium import webdriver
from selenium.webdriver.common.by import By
class Douyu():
def __init__(self):
self.url='https://www.douyu.com/directory/all'
self.driver=webdriver.Chrome()
def parse_data(self):
self.driver.implicitly_wait(2)
room_list=self.driver.find_elements(By.XPATH,'//*[@id="listAll"]/section[2]/div[2]/ul/li/div')
#遍历房间列表,从每一个房间节点中获取数据
for room in room_list:
try:
haha = {}
haha['title'] = room.find_element(By.XPATH, './a/div[2]/div[1]/h3').text
haha['type'] = room.find_element(By.XPATH, './a/div[2]/div[1]/span').text
haha['owner'] = room.find_element(By.XPATH, './a/div[2]/div[2]/h2/div').text
haha['num'] = room.find_element(By.XPATH, './a/div[2]/div[2]/span').text
haha['src'] = room.find_element(By.XPATH, './a/div[1]/div[1]/picture/img').get_attribute('src')
print(haha)
except:
pass
# def save_data(self,data_list):
# for data in data_list:
# print(data)
def run(self):
#url
#driver
#get
self.driver.get(self.url)
self.parse_data()
if __name__ == '__main__':
douyu=Douyu()
douyu.run()
可是爬取出来的却是两条,有时候一条,这是怎么回事?
{'title': 'AG 0:0 狼队', 'type': '王者荣耀', 'owner': '王者荣耀官方赛事', 'num': '708.8万', 'src': 'https://rpic.douyucdn.cn/live-cover/coverupdate/2023/08/12/e112a2e164dd4657c434a4b9bb0a5f80.jpg/dy1'}
{'title': '七圣召唤主播资格赛决赛进行中!', 'type': '原神', 'owner': '丘丘人大队队长', 'num': '69.3万', 'src': 'https://rpic.douyucdn.cn/live-cover/coverupdate/2023/08/08/8c40ce9a0f97a89813f94a38ec1324f8.jpg/dy1'}
当我去掉try时,报如下错误:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found
(Session info: chrome=115.0.5790.171); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
望解决,谢谢!
【以下回答由 GPT 生成】
根据您提供的代码,以下是我认为可以优化的几个方面:
expected_conditions
模块来定义等待条件,并传递给WebDriverWait
类来设置等待时间。from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# 在初始化方法中使用显式等待
def __init__(self):
self.url = 'https://www.douyu.com/directory/all'
self.driver = webdriver.Chrome()
# 设置显式等待
self.wait = WebDriverWait(self.driver, 10) # 最多等待10秒
def parse_data(self):
# 使用显式等待查找元素
room_list = self.wait.until(EC.visibility_of_all_elements_located((By.XPATH, '//*[@id="listAll"]/section[2]/div[2]/ul/li/div')))
data['title'] = room.find_element(By.XPATH, 'a/div[2]/div[1]/h3').text
data['type'] = room.find_element(By.XPATH, 'a/div[2]/div[1]/span').text
data['owner'] = room.find_element(By.XPATH, 'a/div[2]/div[2]/h2/div').text
data['num'] = room.find_element(By.XPATH, 'a/div[2]/div[2]/span').text
find_elements
方法一次性查找到多个元素,然后逐个处理,以减少与浏览器的通信次数。room_list = self.wait.until(EC.visibility_of_all_elements_located((By.XPATH, '//*[@id="listAll"]/section[2]/div[2]/ul/li/div')))
# 遍历房间列表
for room in room_list:
data = {}
data['title'] = room.find_element(By.XPATH, 'a/div[2]/div[1]/h3').text
data['type'] = room.find_element(By.XPATH, 'a/div[2]/div[1]/span').text
data['owner'] = room.find_element(By.XPATH, 'a/div[2]/div[2]/h2/div').text
data['num'] = room.find_element(By.XPATH, 'a/div[2]/div[2]/span').text
# 处理数据
...
def __del__(self):
self.driver.quit()
以上是我认为可以优化代码的几个方面,希望对您有所帮助。如果您有任何疑问,请随时提问。
【相关推荐】