from selenium import webdriver
import requests
from lxml import etree
import os
import re
import pymongo
client = pymongo.MongoClient(host='127.0.0.1', port=27017)
mydb = client['oneDB']  # database is created automatically on first use
gooood_address = mydb['gooood_5']  # collection that stores the scraped image URLs
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip',
    'DNT': '1',
    'Connection': 'close',
    'Accept-Language': 'zh-CN'
}
path = 'F:/CodeWar/spider/gooood-3/'  # directory where downloaded files are stored
dic = {}  # empty dict that accumulates the image names and URLs we collect
def get_each_picture_address(each_page_adress_collection):  # collect the image URLs of one project
    each_page_item_collection_response = requests.get(url=each_page_adress_collection, headers=headers, timeout=100)
each_page_item_collection_data = each_page_item_collection_response.text
each_page_item_collection_html = etree.HTML(each_page_item_collection_data)
try:
each_page_item_collection_href = each_page_item_collection_html.xpath('//img[@class="img-responsive"]//@lsrc')
each_page_item_name = each_page_item_collection_html.xpath('//h1//text()')
        need_convert = str(each_page_item_name)  # the project name comes back as a list, so convert it to a string
        real_name = need_convert[2:-2]  # slice off the list brackets and quotes to keep only the title
        purge_each_page_item_name = re.sub(r'[\\/:*?"<>|.]', '-', real_name)  # replace characters that are illegal in folder and file names
print(purge_each_page_item_name)
newPath = os.path.join(path, purge_each_page_item_name)
key_end = 0
for url in each_page_item_collection_href:
key_end += 1
dic.update({'{}{}'.format(newPath,key_end) : url})
except:
        print('-------------- no thumb picture found ----------------')
print('--------------next folder----------------')
return dic
mother_web = 'https://www.gooood.cn'
n = 1
page_number = 5  # number of listing pages to crawl
browser = webdriver.Chrome(executable_path=r'E:\chromedriver.exe')
browser.get('https://www.gooood.cn/')
while n <= page_number:
url = browser.current_url
    # print(browser.page_source)  # debug only: dumps the entire page source, very noisy
html = etree.HTML(browser.page_source)
each_page_adress_collection = html.xpath(
'//div[@class="post-item col-lg-6 col-md-6 col-sm-6 col-xs-12" or @class="post-bottom-item post-item col-lg-6 col-md-6 col-sm-6 col-xs-12"]/div/div/div/h2/a//@href')
for x in each_page_adress_collection:
each_page_item_collection = mother_web + x
get_each_picture_address(each_page_item_collection)
gooood_address.insert_one(dic)
dic = {}
elem = browser.find_element_by_css_selector('#column-2 > div > div.archive-content > div > div.flex-wrapper > div.paginations > a.next.page-numbers')
elem.click()
    print('current URL: ' + browser.current_url)
n += 1
print(n)
print('************************************************')
Could it be that only 50 records are displayed by default? In the mongo shell, results are paged, and typing "it" continues the display.
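One way to rule that out is to count the stored documents from Python instead of relying on the shell display; a minimal sketch, reusing the connection details from the code above:

import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
# count_documents ignores any shell-side paging, so this is the real total
print(client['oneDB']['gooood_5'].count_documents({}))

If this prints 50, the shortfall is real and not a display artifact.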
I tried it and could insert 90 records: 18 per page, and with your code 5 pages of data are crawled.
I followed your structure and tried it (I changed what gets saved; the saved counter ends at 90), so you should be able to pinpoint the problem. I'm sending you the code; see below.
The problem is in your method above, get_each_picture_address: it raises exceptions, and you except them away, so only 50 records get inserted; 40 items hit an exception while being processed.
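A minimal sketch of the same function with only the except narrowed, so the failing URL and the error are printed instead of discarded (the extraction logic is unchanged, with enumerate replacing the manual counter):

def get_each_picture_address(each_page_adress_collection):
    response = requests.get(url=each_page_adress_collection, headers=headers, timeout=100)
    html = etree.HTML(response.text)
    try:
        hrefs = html.xpath('//img[@class="img-responsive"]//@lsrc')
        name = str(html.xpath('//h1//text()'))[2:-2]  # same list-to-title slicing as above
        safe_name = re.sub(r'[\\/:*?"<>|.]', '-', name)
        new_path = os.path.join(path, safe_name)
        for key_end, url in enumerate(hrefs, start=1):
            dic['{}{}'.format(new_path, key_end)] = url
    except Exception as e:
        # keep the evidence: which project page failed, and why
        print('failed on {}: {!r}'.format(each_page_adress_collection, e))
    return dic

Run over the same 90 item pages, this shows exactly which 40 fail and whether the cause is a missing lsrc attribute, the h1 slicing, or a network error.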
from selenium import webdriver
import requests
from lxml import etree
import os
import re
import pymongo
client = pymongo.MongoClient("mongodb://eman:admin123@192.168.10.45:27017/ttjj")
mydb = client.good  # select the database; it is created automatically if it does not exist
gooood_address = mydb.gooood_5
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip',
'DNT': '1',
'Connection': 'close',
'Accept-Language': 'zh-CN'
}
path = 'F:/CodeWar/spider/gooood-3/'  # directory where downloaded files are stored
dic = {}  # empty dict that accumulates the image names and URLs we collect
def get_each_picture_address(each_page_adress_collection):  # collect the image URLs of one project
each_page_item_collection_response = requests.get(url=each_page_adress_collection, headers=headers, timeout=100)
each_page_item_collection_data = each_page_item_collection_response.text
each_page_item_collection_html = etree.HTML(each_page_item_collection_data)
try:
each_page_item_collection_href = each_page_item_collection_html.xpath('//img[@class="img-responsive"]//@lsrc')
each_page_item_name = each_page_item_collection_html.xpath('//h1//text()')
        need_convert = str(each_page_item_name)  # the project name comes back as a list, so convert it to a string
        real_name = need_convert[2:-2]  # slice off the list brackets and quotes to keep only the title
        purge_each_page_item_name = re.sub(r'[\\/:*?"<>|.]', '-', real_name)  # replace characters that are illegal in folder and file names
newPath = os.path.join(path, purge_each_page_item_name)
key_end = 0
for url in each_page_item_collection_href:
key_end += 1
dic.update({'{}{}'.format(newPath, key_end): url})
except:
        print('-------------- no thumb picture found ----------------')
print('--------------next folder----------------')
return dic
mother_web = 'https://www.gooood.cn'
n = 1
s = 0  # counter that replaces the real insert payload, to count how many items are actually processed
page_number = 5  # number of listing pages to crawl
browser = webdriver.Chrome(executable_path=r'D:\sdk\chromedriver.exe')
browser.get('https://www.gooood.cn/')
while n <= page_number:
url = browser.current_url
html = etree.HTML(browser.page_source)
each_page_adress_collection = html.xpath(
'//div[@class="post-item col-lg-6 col-md-6 col-sm-6 col-xs-12" or @class="post-bottom-item post-item col-lg-6 col-md-6 col-sm-6 col-xs-12"]/div/div/div/h2/a//@href')
for x in each_page_adress_collection:
each_page_item_collection = mother_web + x
        s = s + 1
        print(s)
        gooood_address.insert_one({'a': s})  # for this test, insert just a counter instead of the scraped dict
dic = {}
elem = browser.find_element_by_css_selector(
'#column-2 > div > div.archive-content > div > div.flex-wrapper > div.paginations > a.next.page-numbers')
elem.click()
    print('current URL: ' + browser.current_url)
n += 1
print(n)
print('************************************************')
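If the printed failures turn out to be transient network errors rather than missing page elements, a small retry helper around the request is usually enough. A sketch under that assumption; fetch_with_retries, attempts, and backoff are illustrative names, not part of the original code:

import time

def fetch_with_retries(url, attempts=3, backoff=2):
    # retry transient failures; re-raise the last error so it stays visible upstream
    for attempt in range(1, attempts + 1):
        try:
            return requests.get(url=url, headers=headers, timeout=100)
        except requests.RequestException as e:
            print('attempt {} failed for {}: {!r}'.format(attempt, url, e))
            if attempt == attempts:
                raise
            time.sleep(backoff * attempt)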