import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
s = Service('chromedriver.exe')
browser = webdriver.Chrome(service=s)
browser.get('https://search.jd.com/Search?keyword=lola%20rose&enc=utf-8&pvid=57b4722dbe2e4f9c8cf6b82478bde868%27)#%E8%AE%BF%E9%97%AEJD%E7%9A%84%E6%90%9C%E7%B4%A2%E9%A1%B5%E9%9D%A2
browser.maximize_window()#窗口最大化
urltrue_list = []
for i in range(2):
url = 'https://search.jd.com/Search?keyword=lola%20rose&qrst=1&ev=exbrand_LOLA%20ROSE%5E&pvid=57b4722dbe2e4f9c8cf6b82478bde868&page=%27+str(i*2+1)+%27&s=%27+str(i*60+1)+%27&click=1%27
urltrue_list.append(url)
def buffer():#定义拖拽页面,使其加载完成
for i in range(50):
time.sleep(0.3)
browser.execute_script('window.scrollBy(0,300)','')
def get():#定义函数,获取商品列表页所有的商品详情链接
for each in urltrue_list:
browser.get(each)
time.sleep(5)
global detail_urls
buffer()
info = browser.find_elements(By.CLASS_NAME,'gl-i-wrap')
for line in info:
detail_url = line.find_element(By.CLASS_NAME,'p-img').find_element(By.TAG_NAME,'a').get_attribute('href')
detail_urls.append(detail_url)
time.sleep(3)
print(len(detail_urls))
return detail_urls
result ={'商品名称':[],'商品价格':[],'商品信息':[],'累计评价':[]}
def get_detail(detail_urls):
for each in detail_urls:#获取商品详情链接
browser.maximize_window()
browser.get(url=each)
time.sleep(3)
buffer()
#获取商品名称
title = browser.find_element(By.CLASS_NAME,'sku-name').text
result['商品名称'].append(title) # 将标签中的商品名称依次提取出来,并添加到字典中
#获取价格
price = browser.find_element(By.CSS_SELECTOR, 'body span.p-price').text
result['商品价格'].append(price)
#获取评价数量
comment_num = browser.find_element(By.ID,'comment-count').find_element(By.TAG_NAME,'a').text
result['累计评价'].append(comment_num)
#获取信息
contents = browser.find_elements(By.CSS_SELECTOR, '#detail .p-parameter-list')
_parameters = []
for cont in contents:
_parameter = cont.text
if "\n" in _parameters:
_parameters += _parameter.split('\n')
else:
_parameters.append(_parameter)
result['商品信息'].append(_parameters) # 将标签中的商品名称依次提取出来,并添加到字典中
buffer()
if name == 'main':
poll = ThreadPoolExecutor(4)#创建容量为4的进程池
detail_urls = []
get()
poll.submit(get_detail,detail_urls)#提交任务
result =pd.DataFrame(result)
result.to_csv('lolarose.csv',index=False,encoding='utf-8-sig')
最后导出的csv是空的,读取不到东西,是什么原因呢?多线程这里该怎么改写呢
导出csv的时候 get_detail函数还没操作完毕呢 result是空值
具体可参考一下以下代码
def get_detail(detail_urls):
for each in detail_urls:#获取商品详情链接
browser = webdriver.Chrome()
browser.maximize_window()
browser.get(url=each)
time.sleep(3)
for i in range(50):
time.sleep(0.3)
browser.execute_script('window.scrollBy(0,300)', '')
# 获取商品名称
title = browser.find_element(By.CLASS_NAME, 'sku-name').text
result['商品名称'].append(title) # 将标签中的商品名称依次提取出来,并添加到字典中
# 获取价格
price = browser.find_element(By.CSS_SELECTOR, 'body span.p-price').text
result['商品价格'].append(price)
# 获取评价数量
comment_num = browser.find_element(By.ID, 'comment-count').find_element(By.TAG_NAME, 'a').text
result['累计评价'].append(comment_num)
# 获取信息
contents = browser.find_elements(By.CSS_SELECTOR, '#detail .p-parameter-list')
_parameters = []
for cont in contents:
_parameter = cont.text
if "\n" in _parameters:
_parameters += _parameter.split('\n')
else:
_parameters.append(_parameter)
result['商品信息'].append(_parameters) # 将标签中的商品名称依次提取出来,并添加到字典中
for i in range(50):
time.sleep(0.3)
browser.execute_script('window.scrollBy(0,300)', '')
browser.close()
with ThreadPoolExecutor(max_workers=10) as t: # 创建一个最大容纳数量为5的线程池
detail_urls = []
get()
step = 10 # 以10个一组分成列表
b = [detail_urls[i:i + step] for i in range(0, len(detail_urls), step)]
task1 = t.submit(get_detail, b[0]) # 每10个一组的列表分成一个线程组
task2 = t.submit(get_detail, b[1])
task3 = t.submit(get_detail, b[2])
task4 = t.submit(get_detail, b[3])
task5 = t.submit(get_detail, b[4])
task6 = t.submit(get_detail, b[5])
# 最后把result集合里面数据写入csv
result =pandas.DataFrame(result)
result.to_csv('lolarose.csv',index=False,encoding='utf-8-sig')
谢谢,我把result放到多线程里就成功了,谢谢你的思路