# coding = utf-8
from selenium import webdriver
import time
from selenium.common.exceptions import NoSuchElementException, WebDriverException, TimeoutException
# from selenium.webdriver.support.wait import WebDriverWait
# 增加 get方法的检测, 怀疑即时timeout后重新唤醒 chromedriver也已经死亡,无法唤醒导致阻塞
from pymongo import MongoClient
from selenium.webdriver.support import expected_conditions
import sys
import traceback
# Default search keyword; Test.run() searches Baidu for "intitle:<title>" and
# stores it in each document's "type"/"source_type" fields. Overridden by the
# first command-line argument when the file is run as a script.
title = "注册"
# Default name of the MongoDB collection opened by Mongo.__init__. Overridden
# by the second command-line argument when the file is run as a script.
# NOTE(review): "注册" means "register" while this default is "login" —
# confirm the default pairing is intended.
title_en = "login"
class Mongo:
    """Small wrapper around the MongoDB collections used by the crawler.

    Attributes:
        mongodb_col: collection that receives the scraped documents.
        mongodb_p: the "params" collection of the same database.
    """

    def __init__(self, host='192.168.3.100', port=27017, db_name='database',
                 collection=None):
        """Bind the client to a database and open the two collections.

        Args:
            host: MongoDB server address.
            port: MongoDB server port.
            db_name: name of the database to use.
            collection: name of the result collection; when ``None`` the
                module-level ``title_en`` (read at construction time) is used.
        """
        mongodb_client = MongoClient(host=host, port=port)
        mongodb_db = mongodb_client[db_name]
        target = title_en if collection is None else collection
        self.mongodb_col = mongodb_db[target]
        self.mongodb_p = mongodb_db["params"]

    def insert(self, data):
        """Insert ``data`` (a collection of documents) into the result collection.

        Empty or ``None`` input is only logged: ``insert_many`` rejects an
        empty list with ``TypeError("documents must be a non-empty list")``.
        """
        if not data:
            print_log("[!] 数据为空")
            return
        self.mongodb_col.insert_many(data)
        print_log('[+] successfully insert')
def print_log(message):
    """Print ``message`` to stdout, prefixed with the current local time."""
    # strftime() formats the current local time when no time tuple is given.
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    print(f"{stamp} {message}")
class Test:
    """Crawl Baidu results for ``intitle:<title>`` and store each hit in MongoDB.

    Attributes:
        options: Chrome options used to start the browser.
        driver: Selenium Chrome driver shared by all methods.
        urls: final URLs already recorded; used to skip duplicates.
        data: documents collected for the result page currently processed.
        data_params: cleared for every result page; not otherwise used here.
        mongo: ``Mongo`` wrapper that receives ``data``.
    """

    _SEARCH_URL = "https://www.baidu.com"
    # NOTE(review): the 10th pager link is assumed to be the "next page"
    # link — confirm against Baidu's current markup.
    _NEXT_PAGE_XPATH = '//*[@id="page"]/div/a[10]'
    _RESULT_LINK_XPATH = '//*[@tpl="se_com_default"]/h3/a'
    _CAPTCHA_MARKER = "百度安全验证"

    def __init__(self):
        """Start Chrome, configure timeouts and create the Mongo wrapper."""
        self.options = webdriver.ChromeOptions()
        # self.options.add_argument('--headless')
        self.options.add_argument('--disable-gpu')
        self.options.add_argument('--no-sandbox')
        self.options.add_argument('log-level=3')
        if sys.platform == "win32":
            device_path = "chromedriver.exe"
        else:
            device_path = "./chromedriver"
        self.driver = webdriver.Chrome(executable_path=device_path, options=self.options)
        self.driver.execute_cdp_cmd("Network.enable", {})
        self.driver.set_window_size(1920, 1080)
        self.urls = list()
        self.data = list()
        self.data_params = list()
        self.mongo = Mongo()
        self.driver.set_page_load_timeout(30)
        self.driver.set_script_timeout(30)

    def run(self):
        """Run the whole crawl and always shut the browser down afterwards."""
        print_log("[+] 开始")
        try:
            self._crawl()
        finally:
            # quit() ends the WebDriver session (close() only closes the
            # current window), and the finally clause makes sure this also
            # happens when the crawl aborts with an exception.
            self.driver.quit()

    def _find_next_page(self):
        """Return the href of the pager link; raises NoSuchElementException if absent."""
        return self.driver.find_element_by_xpath(self._NEXT_PAGE_XPATH).get_attribute('href')

    def _crawl(self):
        """Submit the search, then walk result pages 1..100 and store their hits."""
        search_text = f"intitle:{title}"
        self.driver.get(self._SEARCH_URL)
        # "kw" is the search input box, "su" the search button.
        self.driver.find_element_by_xpath('//*[@id="kw"]').send_keys(search_text)
        self.driver.find_element_by_xpath('//*[@id="su"]').click()
        # Implicit wait instead of a fixed sleep: element lookups return as
        # soon as the element shows up. Too small a value misses elements
        # on the first page.
        self.driver.implicitly_wait(10)
        print_log("[+] 当前页面: 1")
        next_page = self._find_next_page()
        self.driver.implicitly_wait(2)
        self.get_result()
        self.mongo.insert(self.data)

        for page_no in range(2, 101):
            self.data.clear()
            self.data_params.clear()
            print_log(f"[+] 当前页面: {page_no}")
            # A visited site may have left a JavaScript alert open.
            if expected_conditions.alert_is_present()(self.driver):
                self.driver.switch_to.alert.accept()
            self.driver.get(next_page)
            try:
                next_page = self._find_next_page()
                self.driver.implicitly_wait(2)
            except NoSuchElementException:  # possibly Baidu's captcha page
                if self._CAPTCHA_MARKER in self.driver.page_source:
                    print_log("[!] 发现风控验证码 请在10s内完成验证!")
                    time.sleep(10)
                    # Look the link up again once the user had time to solve
                    # the captcha; otherwise the next iteration would reload
                    # the page that was just processed.
                    try:
                        next_page = self._find_next_page()
                    except NoSuchElementException:
                        # Keep the previous URL so the next iteration
                        # requests this page again.
                        pass
            self.get_result()
            time.sleep(1)
            try:
                self.mongo.insert(self.data)
                print_log("[+] A将数据插入MongoDB数据库")
            except TypeError:
                print_log("[!] 保存数据失败!")
                print("traceback error:%s" % traceback.format_exc())
                continue
            print_log("[+] 保存数据完成!")
        print_log("[+] 结束!")

    def get_result(self):
        """Visit every hit on the current result page and append new pages to ``self.data``.

        A document holds the final URL, the page source, a base64 screenshot
        and the search keyword. URLs already present in ``self.urls`` are
        skipped; a page that fails to load is logged and skipped.
        """
        # Read all hrefs before navigating away from the result page.
        anchors = self.driver.find_elements_by_xpath(self._RESULT_LINK_XPATH)
        result_urls = [anchor.get_attribute('href') for anchor in anchors]

        for url in result_urls:
            try:
                # Both timeouts are set before every request.
                self.driver.set_page_load_timeout(30)
                self.driver.set_script_timeout(30)
                self.driver.get(url)
                self.driver.implicitly_wait(10)
                print_log("[+] 向%s 发送请求" % url)
            except WebDriverException:  # TimeoutException is a subclass
                try:
                    # Stop the pending load so the browser can be reused.
                    self.driver.execute_script('window.stop()')
                except WebDriverException:
                    # The stop request can fail as well (e.g. an unresponsive
                    # page); one bad site must not abort the whole crawl.
                    print("traceback error:%s" % traceback.format_exc())
                print_log("[!] 网络错误! 打开目标网站失败! ")
                continue
            self._record_current_page()

    def _record_current_page(self):
        """Append the currently loaded page to ``self.data`` unless its URL is known."""
        try:
            current_url = self.driver.current_url
            if current_url in self.urls:
                return
            self.urls.append(current_url)
            page_title = self.driver.title
            print_log(f"[+] URL: {current_url} title: {page_title}")
            self.data.append({
                "url": str(current_url),
                "page_source": str(self.driver.page_source),
                "screenshot": str(self.driver.get_screenshot_as_base64()),
                "type": str(title),
                "source_type": str(title),
                "is_tag": None,
            })
            print_log('[+] 成功将%s 数据插入data列表' % page_title)
        except Exception:
            # Per-page boundary: log the full traceback and move on.
            print_log("[!] exception caught, the error detail is below")
            print("traceback error:%s" % traceback.format_exc())
if __name__ == "__main__":
    # Usage: python get_baidu_search.py <keyword> <collection name>
    # sys.argv[2] is read below, so the script name plus two arguments
    # (three entries) are required; checking "< 2" let a single argument
    # through and failed with IndexError.
    if len(sys.argv) < 3:
        print("缺失参数", "python get_baidu_search.py 登录 login")
        sys.exit(0)
    title = sys.argv[1]
    title_en = sys.argv[2]
    print("输入内容:", title, title_en)
    test = Test()
    test.run()
楼主是新人,单位需要爬取百度关键字的搜索结果并截图入库,但程序有时会出现以下错误,可是 self.data 明明不是空 list。求解。使用的数据库是 MongoDB,Chrome 版本是 91.0.4472。
报错信息如下:
traceback error:Traceback (most recent call last):
File "get_baidu_search.py", line 99, in run
time.sleep(10)
File "get_baidu_search.py", line 30, in insert
# 判断data中是否有数据
File "E:\test_project1\venv\lib\site-packages\pymongo\collection.py", line 746, in insert_many
raise TypeError("documents must be a non-empty list")
TypeError: documents must be a non-empty list
有两个问题:一是 get_result 没有返回数据,二是调用时没有把调用结果赋值给 self.data。(补充:从报错信息看,insert_many 抛出 "documents must be a non-empty list" 说明传入的列表在调用时为空,插入前应先判断列表是否为空。)
[15692:8788:0615/084914.698:ERROR:sdp_offer_answer.cc(3112)] The order of m-lines in subsequent offer doesn't match order from previous offer/answer. (INVALID_PARAMETER)
[15692:8788:0615/084914.698:ERROR:sdp_offer_answer.cc(1930)] Failed to set local offer sdp: The order of m-lines in subsequent offer doesn't match order from previous offer/answer.
这是程序运行中经常出现的 error,出现后程序不会崩溃,还会继续运行。
您好,我是有问必答小助手,您的问题已经有小伙伴解答了,您看下是否解决,可以追评进行沟通哦~
如果有您比较满意的答案 / 帮您提供解决思路的答案,可以点击【采纳】按钮,给回答的小伙伴一些鼓励哦~~
ps: 问答会员年卡【8折】购 ,限时加赠IT实体书,即可 享受50次 有问必答服务,了解详情>>>https://t.csdnimg.cn/RW5m