python3.8 使用selenium和driver对百度页面进行爬取和拍照并入库

# coding: utf-8
from selenium import webdriver
import time
from selenium.common.exceptions import NoSuchElementException, WebDriverException, TimeoutException
# from selenium.webdriver.support.wait import WebDriverWait
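# Added checks around driver.get(): the suspicion is that even after a timeout
# chromedriver is already dead and cannot be woken up again, which causes blocking.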
from pymongo import MongoClient
from selenium.webdriver.support import expected_conditions
import sys
import traceback

# Search keyword: run() queries Baidu with "intitle:<title>" and every stored
# document records it under "type" / "source_type". Overwritten from
# sys.argv[1] when the file is run as a script.
title = "注册"
# Name of the MongoDB collection that receives the crawled documents.
# Overwritten from sys.argv[2] when the file is run as a script.
# NOTE(review): the default keyword above means "register" while this default
# is "login" — confirm the pairing is intended.
title_en = "login"


class Mongo:
    """Small wrapper around the MongoDB collections used by the crawler.

    Attributes:
        mongodb_col: Collection named after the module-level ``title_en``;
            crawled documents are written here.
        mongodb_p: The ``params`` collection of the same database.
    """

    def __init__(self, host='192.168.3.100', port=27017, db_name='database'):
        """Bind to the database and select the target collections.

        ``title_en`` is read at construction time, so it must hold its final
        value before a ``Mongo`` instance is created.

        Args:
            host: MongoDB server address.
            port: MongoDB server port.
            db_name: Name of the database to use.
        """
        mongodb_client = MongoClient(host=host, port=port)
        mongodb_db = mongodb_client[db_name]
        self.mongodb_col = mongodb_db[title_en]
        self.mongodb_p = mongodb_db["params"]

    def insert(self, data):
        """Insert a batch of documents, skipping empty batches.

        ``insert_many`` raises ``TypeError`` for an empty list, so an empty
        (or ``None``) batch is only logged instead of being sent.

        Args:
            data: List of document dicts to store; may be empty or ``None``.
        """
        if not data:
            print_log("[!] 数据为空")
            return
        self.mongodb_col.insert_many(data)
        print_log('[+] successfully insert')


def print_log(message):
    """Print ``message`` prefixed with the local time as ``YYYY-mm-dd HH:MM:SS``."""
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    print(f'{timestamp} {message}')


class Test:
    """Crawl Baidu results for ``intitle:<title>`` and archive every hit.

    For each Baidu result page, every link matched by the result XPath is
    opened in the same Chrome window; its final URL, HTML source and a base64
    screenshot are collected in ``self.data`` and handed to ``Mongo.insert``.

    Attributes:
        options: ChromeOptions used to launch the browser.
        driver: The Chrome WebDriver session shared by all navigation.
        urls: Final URLs already captured during this run (de-duplication).
        data: Documents captured for the result page currently being processed.
        data_params: Cleared once per result page; not otherwise used here.
        mongo: Storage wrapper that receives ``data``.
    """

    # Pagination link whose href is followed to reach the next result page.
    _NEXT_PAGE_XPATH = '//*[@id="page"]/div/a[10]'
    # Title links of the search results on a Baidu result page.
    _RESULT_LINK_XPATH = '//*[@tpl="se_com_default"]/h3/a'

    def __init__(self):
        """Launch Chrome through a local chromedriver and reset crawl state."""
        self.options = webdriver.ChromeOptions()
        # Uncomment to run without a visible browser window.
        # self.options.add_argument('--headless')
        self.options.add_argument('--disable-gpu')
        self.options.add_argument('--no-sandbox')
        self.options.add_argument('log-level=3')
        # The driver binary name differs per platform.
        if sys.platform == "win32":
            device_path = "chromedriver.exe"
        else:
            device_path = "./chromedriver"
        self.driver = webdriver.Chrome(executable_path=device_path, options=self.options)
        self.driver.execute_cdp_cmd("Network.enable", {})
        self.driver.set_window_size(1920, 1080)
        self.urls = list()
        self.data = list()
        self.data_params = list()
        self.mongo = Mongo()
        self.driver.set_page_load_timeout(30)
        self.driver.set_script_timeout(30)

    def _next_page_href(self):
        """Return the href of the pagination link on the current page.

        Raises:
            NoSuchElementException: If the pagination link is not present.
        """
        return self.driver.find_element_by_xpath(self._NEXT_PAGE_XPATH).get_attribute('href')

    def run(self):
        """Search Baidu for the keyword and crawl result pages 1 through 100.

        Each page's captures are stored via ``self.mongo.insert`` before the
        next page is requested. The browser session is always shut down when
        this method exits, including when an exception propagates.
        """
        print_log("[+] 开始")
        url = "https://www.baidu.com"
        search_text = f"intitle:{title}"
        try:
            self.driver.get(url)
            # "kw" is the search box and "su" the search button on the Baidu home page.
            self.driver.find_element_by_xpath('//*[@id="kw"]').send_keys(search_text)
            self.driver.find_element_by_xpath('//*[@id="su"]').click()
            # Implicit wait instead of a fixed sleep: lookups return as soon as
            # the element exists. Per the original author's note, too small a
            # value makes the first-page lookups fail.
            self.driver.implicitly_wait(10)
            print_log("[+] 当前页面: 1")
            # Read the next-page link now: get_result() navigates this same
            # window away from the result page.
            next_page = self._next_page_href()
            self.driver.implicitly_wait(2)
            self.get_result()
            self.mongo.insert(self.data)
            for page_no in range(2, 101):
                self.data.clear()
                self.data_params.clear()
                print_log(f"[+] 当前页面: {page_no}")
                # Dismiss any JavaScript alert left open by the last visited site.
                if expected_conditions.alert_is_present()(self.driver):
                    self.driver.switch_to.alert.accept()
                self.driver.get(next_page)
                try:
                    next_page = self._next_page_href()
                    self.driver.implicitly_wait(2)
                except NoSuchElementException:
                    # A missing pagination link is treated as a possible Baidu
                    # security-verification page.
                    if "百度安全验证" in self.driver.page_source:
                        print_log("[!] 发现风控验证码 请在10s内完成验证!")
                        time.sleep(10)
                        # Re-read the link once the operator had time to solve
                        # the captcha; otherwise the same page would be
                        # requested and scraped again on the next iteration.
                        try:
                            next_page = self._next_page_href()
                        except NoSuchElementException:
                            # Keep the previous URL so the next iteration
                            # requests this page again.
                            print_log("[!] 验证后仍未找到下一页链接, 下一轮将重新请求当前页")
                self.get_result()
                time.sleep(1)
                try:
                    self.mongo.insert(self.data)
                    print_log("[+] A将数据插入MongoDB数据库")
                except TypeError:
                    print_log("[!] 保存数据失败!")
                    # Print the full traceback to show the concrete failure.
                    print("traceback error:%s" % traceback.format_exc())
                    continue

                print_log("[+] 保存数据完成!")

            print_log("[+] 结束!")
        finally:
            # quit() ends the whole WebDriver session (browser and driver
            # process); close() would only close the current window.
            self.driver.quit()

    def get_result(self):
        """Visit every result link on the current page and capture it.

        For each link not seen before (by final URL after redirects), a
        document with the URL, page source and base64 screenshot is appended
        to ``self.data``. Pages that fail to load or to be captured are logged
        and skipped.
        """
        search_result = self.driver.find_elements_by_xpath(self._RESULT_LINK_XPATH)
        # Hrefs are read up front because the loop below navigates this same
        # driver away from the result page.
        result_urls = [link.get_attribute('href') for link in search_result]
        for url in result_urls:
            try:
                # Original author's note: both timeouts must be set to take effect.
                self.driver.set_page_load_timeout(30)
                self.driver.set_script_timeout(30)
                self.driver.get(url)
                self.driver.implicitly_wait(10)
                print_log("[+] 向%s 发送请求" % url)
            except (WebDriverException, TimeoutException):
                # Try to stop the pending load so the session stays usable;
                # the stop itself may fail if the driver is unresponsive.
                try:
                    self.driver.execute_script('window.stop()')
                except WebDriverException:
                    print("traceback error:%s" % traceback.format_exc())
                print_log("[!] 网络错误! 打开目标网站失败! ")
                continue
            try:
                current_url = self.driver.current_url
                if current_url in self.urls:
                    continue
                page_title = self.driver.title
                print_log(f"[+] URL: {current_url} title: {page_title}")
                document = {"url": str(current_url), "page_source": str(self.driver.page_source),
                            "screenshot": str(self.driver.get_screenshot_as_base64()), "type": str(title),
                            "source_type": str(title), "is_tag": None}
                # Record the URL as seen only after the capture succeeded, so a
                # failed capture does not block a later attempt on the same URL.
                self.data.append(document)
                self.urls.append(current_url)
                print_log('[+] 成功将%s 数据插入data列表' % page_title)
            except Exception:
                print_log("[!] exception caught, the error detail is below")
                # Print the full traceback to show the concrete failure.
                print("traceback error:%s" % traceback.format_exc())
                continue


if __name__ == "__main__":
    # Two positional arguments are required: the search keyword and the
    # MongoDB collection name. sys.argv[0] is the script itself, so fewer
    # than three entries means at least one of them is missing.
    if len(sys.argv) < 3:
        print("缺失参数", "python get_baidu_search.py 登录 login")
        sys.exit(0)
    # Rebind the module-level defaults before Test() is built: the collection
    # name is read when the Mongo wrapper is constructed.
    title = sys.argv[1]
    title_en = sys.argv[2]
    print("输入内容:", title, title_en)
    test = Test()
    test.run()

楼主是新人,单位需要爬取百度关键字的结果并拍照入库。但是有时候会出现以下错误。可是明明self.data里不是空list呀。求解,使用的数据库是mongodb。chrome版本是91.0.4472

报错信息如下:

traceback error:Traceback (most recent call last):
  File "get_baidu_search.py", line 99, in run
    time.sleep(10)
  File "get_baidu_search.py", line 30, in insert
    # 判断data中是否有数据
  File "E:\test_project1\venv\lib\site-packages\pymongo\collection.py", line 746, in insert_many
    raise TypeError("documents must be a non-empty list")
TypeError: documents must be a non-empty list

有两个问题,一是get_result没有数据返回,二是在调用时没有将调用结果赋值给self.data。

[15692:8788:0615/084914.698:ERROR:sdp_offer_answer.cc(3112)] The order of m-lines in subsequent offer doesn't match order from previous offer/answer. (INVALID_PARAMETER)
[15692:8788:0615/084914.698:ERROR:sdp_offer_answer.cc(1930)] Failed to set local offer sdp: The order of m-lines in subsequent offer doesn't match order from previous offer/answer.

这是在程序运行中经常出现的error,出现后程序没有崩溃还会继续运行

 

您好,我是有问必答小助手,您的问题已经有小伙伴解答了,您看下是否解决,可以追评进行沟通哦~

如果有您比较满意的答案 / 帮您提供解决思路的答案,可以点击【采纳】按钮,给回答的小伙伴一些鼓励哦~~

ps: 问答会员年卡【8折】购 ,限时加赠IT实体书,即可 享受50次 有问必答服务,了解详情>>>https://t.csdnimg.cn/RW5m