I'm scraping data with Python and storing it in MongoDB. I've tried many times, but at most 50 records ever get stored. How can I fix this?

from selenium import webdriver
import requests
from lxml import etree
import os
import re 
from bs4 import BeautifulSoup
import pymongo

client = pymongo.MongoClient(host='127.0.0.1',port=27017)
mydb = client['oneDB']
gooood_address = mydb['gooood_5']

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #'Accept-Language' : 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip',
    'DNT': '1',
    'Connection': 'close',
    'Accept-Language': 'zh-CN'
}


path = 'F:/CodeWar/spider/gooood-3/'   # local folder for saving files

dic = {}   # empty dict to collect image names and URLs

def get_each_picture_address(each_page_adress_collection):      # get the image URLs for one project
    each_page_item_collection_response = requests.get(url=each_page_adress_collection, headers=headers, timeout=100)
    each_page_item_collection_soup = BeautifulSoup(each_page_item_collection_response.content, 'lxml')
    each_page_item_collection_data = each_page_item_collection_response.text
    each_page_item_collection_html = etree.HTML(each_page_item_collection_data)
    try:
        each_page_item_collection_href = each_page_item_collection_html.xpath('//img[@class="img-responsive"]//@lsrc')
        each_page_item_name = each_page_item_collection_html.xpath('//h1//text()')
        need_convert = str(each_page_item_name)   # the project name comes back as a list, so convert it to a string
        real_name = need_convert[2:-2]  # slice out the title text
        purge_each_page_item_name = re.sub(r'[\\/:*?"<>|.]', '-', real_name)   # replace characters that are illegal in folder and file names
        print(purge_each_page_item_name)

        newPath = os.path.join(path, purge_each_page_item_name)
        key_end = 0
        for url in each_page_item_collection_href:
            key_end += 1
            dic.update({'{}{}'.format(newPath,key_end) : url})
        
    except:
        print('--------------this is no thumb_picture----------------')
    print('--------------next folder----------------')
    return dic

mother_web = 'https://www.gooood.cn'
n = 1
page_number = 5             # number of pages to crawl
browser = webdriver.Chrome(executable_path=r'E:\chromedriver.exe')

browser.get('https://www.gooood.cn/')

while n <= page_number:
    
    url = browser.current_url
    
    print(browser.page_source)
    html = etree.HTML(browser.page_source)

    each_page_adress_collection = html.xpath(
        '//div[@class="post-item col-lg-6 col-md-6 col-sm-6 col-xs-12" or @class="post-bottom-item post-item col-lg-6 col-md-6 col-sm-6 col-xs-12"]/div/div/div/h2/a//@href')
    
    for x in each_page_adress_collection:
        each_page_item_collection = mother_web + x
        get_each_picture_address(each_page_item_collection)
        gooood_address.insert_one(dic)
        dic = {}
 
    elem = browser.find_element_by_css_selector('#column-2 > div > div.archive-content > div > div.flex-wrapper > div.paginations > a.next.page-numbers')
    elem.click()
    print('current URL: ' + browser.current_url)
    n += 1
    print(n)
    print('************************************************')

elem = browser.find_element_by_css_selector('#column-2 > div > div.archive-content > div > div.flex-wrapper > div.paginations > a.next.page-numbers')
elem.click()
print('current URL: ' + browser.current_url)

 

Could it be that only 50 records are shown by default when you query? In the mongo shell, typing "it" keeps displaying the next batch of results.
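To rule out a display limit, it may be simpler to count the documents directly from pymongo instead of paging through the shell; a minimal check, using the same database and collection names as the question's code:

import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
# count_documents({}) returns the actual number of stored documents,
# regardless of how many the shell chooses to display per page
print(client['oneDB']['gooood_5'].count_documents({}))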

I tried it and 90 records can be inserted: 18 per page, and with your code that is 5 pages of data.

I tried it with your structure (I changed what gets saved; the saved counter reaches 90), so you should be able to locate the problem. I'm sending you the code, see below.

get_each_picture_address

The problem is in the method above: an exception is raised there, and you swallow it with the bare except, which is why only 50 documents end up inserted. About 40 of the items hit an exception during processing.
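For illustration, a minimal sketch of logging the error instead of hiding it: catch the exception, print the failing URL, and print the traceback. The parse_item helper below is hypothetical and only stands in for the parsing code inside your try block:

import traceback

def parse_item(url):
    # hypothetical stand-in for the xpath/parsing logic in get_each_picture_address
    raise ValueError('simulated parse failure for ' + url)

for url in ['https://www.gooood.cn/item-1', 'https://www.gooood.cn/item-2']:
    try:
        parse_item(url)
    except Exception:
        # report which URL failed and why instead of silently skipping it
        print('failed on', url)
        traceback.print_exc()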

from selenium import webdriver
import requests
from lxml import etree
import os
import re
import pymongo

client = pymongo.MongoClient("mongodb://eman:admin123@192.168.10.45:27017/ttjj")
mydb = client.good  # select the database; it is created automatically if it does not exist
gooood_address = mydb.gooood_5

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language' : 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip',
    'DNT': '1',
    'Connection': 'close',
    'Accept-Language': 'zh-CN'
}

path = 'F:/CodeWar/spider/gooood-3/'  # local folder for saving files

dic = {}  # empty dict to collect image names and URLs


def get_each_picture_address(each_page_adress_collection):  # get the image URLs for one project
    each_page_item_collection_response = requests.get(url=each_page_adress_collection, headers=headers, timeout=100)
    each_page_item_collection_data = each_page_item_collection_response.text
    each_page_item_collection_html = etree.HTML(each_page_item_collection_data)
    try:
        each_page_item_collection_href = each_page_item_collection_html.xpath('//img[@class="img-responsive"]//@lsrc')
        each_page_item_name = each_page_item_collection_html.xpath('//h1//text()')
        need_convert = str(each_page_item_name)  # the project name comes back as a list, so convert it to a string
        real_name = need_convert[2:-2]  # slice out the title text
        purge_each_page_item_name = re.sub(r'[\\/:*?"<>|.]', '-', real_name)  # replace characters that are illegal in folder and file names
        newPath = os.path.join(path, purge_each_page_item_name)
        key_end = 0
        for url in each_page_item_collection_href:
            key_end += 1
            dic.update({'{}{}'.format(newPath, key_end): url})
    except:
        print('--------------this is no thumb_picture----------------')
    print('--------------next folder----------------')
    return dic


mother_web = 'https://www.gooood.cn'
n = 1
s = 0
page_number = 5  # number of pages to crawl

browser = webdriver.Chrome(executable_path=r"D:\sdk\chromedriver.exe")
browser.get('https://www.gooood.cn/')

while n <= page_number:
    url = browser.current_url
    html = etree.HTML(browser.page_source)
    each_page_adress_collection = html.xpath(
        '//div[@class="post-item col-lg-6 col-md-6 col-sm-6 col-xs-12" or @class="post-bottom-item post-item col-lg-6 col-md-6 col-sm-6 col-xs-12"]/div/div/div/h2/a//@href')

    for x in each_page_adress_collection:
        each_page_item_collection = mother_web + x
        s = s + 1
        print(s)
        gooood_address.insert_one({'a': s})
        dic = {}

    elem = browser.find_element_by_css_selector(
        '#column-2 > div > div.archive-content > div > div.flex-wrapper > div.paginations > a.next.page-numbers')
    elem.click()
    print('current URL: ' + browser.current_url)
    n += 1
    print(n)
    print('************************************************')

elem = browser.find_element_by_css_selector(
    '#column-2 > div > div.archive-content > div > div.flex-wrapper > div.paginations > a.next.page-numbers')
elem.click()
print('current URL: ' + browser.current_url)