import requests
from lxml import etree
import re
from selenium import webdriver
import time
import csv
lst1=[]
lst2=[]
lst3=[]
url = 'https://www.mi.com/shop/category/list'
headers = {'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}
respost = requests.get(url,headers=headers).text
ele = etree.HTML(respost)
ulall = ele.xpath('//ul[@class="children-list clearix"]')
for i in ulall:
    url_all = i.xpath('./li/a/@href')  # collect every product URL in this category
    # some hrefs are protocol-relative, so patch them before requesting
    for i in url_all:
        if 'https:' in i:
            url1 = i
            headers1 = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}
            respost1 = requests.get(url1, headers=headers1).text
            ele1 = etree.HTML(respost1)
            script1 = ele1.xpath('//script[@type="text/javascript"]/text()')
            for aq in script1:
                con1 = aq.split(',')
                for aw in con1:
                    ac = re.findall('"product_id":"(.*?)"', aw)
                    if ac:  # save the matches
                        for xc in ac:
                            lst1.append(xc)
                            # print(xc)
        else:
            url2 = 'https:' + i
            drive = webdriver.Chrome()
            drive.maximize_window()
            drive.get(url2)
            time.sleep(1)  # give the page 1 second to load
            idall = drive.page_source  # HTML of the rendered page
            ida = re.findall('6.64.2.(.*?)&', idall)  # pull the page id
            for qe in ida:
                if qe.isdigit():  # keep only purely numeric ids
                    lst2.append(qe)
            # print(lst2)
            drive.quit()
lst3 = lst1 + lst2
lst4 = list(set(lst3))  # dedupe: all product ids
# print(lst4)
lst5 = []
lst6 = []
lst7 = []
lst8 = []
lst9 = []
for w2 in lst4:
    id = w2
    url3 = f'https://api2.service.order.mi.com/user_comment/get_summary?show_all_tag=1&goods_id={id}&v_pid=17972&support_start=0&support_len=10&add_start=0&add_len=10&profile_id=0&show_img=0&callback=__jp6'
    headers3 = {'referer': 'https://www.mi.com/',
                'accept': 'application/json, text/plain, */*',
                'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
                'sec-ch-ua-mobile': '?0',
                'sec-ch-ua-platform': '"Windows"',
                'sec-fetch-dest': 'script',
                'sec-fetch-mode': 'no-cors',
                'sec-fetch-site': 'same-site',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'}
    respost3 = requests.get(url3, headers=headers3)
    data = respost3.text
    con1 = data.split(',')
    for i in con1:
        idp = re.findall('"product_id":"(.*?)"', i)  # product id
        if idp:
            lst6.extend(idp)
        mani = re.findall('"comments_total":(.*)', i)  # total number of comments
        if mani:
            lst7.extend(mani)
        zop = re.findall('"comments_good":(.*)', i)  # number of positive comments
        if zop:
            lst8.extend(zop)
        hop = re.findall('"satisfy_per":"(.*?)"', i)  # satisfaction rate
        if hop:
            lst9.extend(hop)
    url4 = f'https://www.mi.com/shop/comment/{id}.html'
    headers4 = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}
    respost4 = requests.get(url4, headers=headers4).text
    name = re.findall('【(.*)怎么样,好不好】用户评价-小米商城 ', respost4)
    if name:
        lst5.append(name[0])  # product name from the page title
data_list = []
for a, b, c, d, e in zip(lst5, lst6, lst7, lst8, lst9):
    x = {}
    x['商品名称'] = a
    x['id'] = b
    x['总评数'] = c
    x['好评数'] = d
    x['满意度'] = e
    data_list.append(x)
with open('小米商城.csv', 'w', encoding='gbk', newline='') as f:
    write = csv.DictWriter(f, fieldnames=['商品名称', 'id', '总评数', '好评数', '满意度'])
    write.writeheader()
    write.writerows(data_list)
import requests
from lxml import etree
import re
from selenium import webdriver
import time
import csv
lst1=[]
lst2=[]
lst3=[]
# counter for completed passes
count = 0
# repeat the whole scrape five times
while count < 5:
    url = 'https://www.mi.com/shop/category/list'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}
    respost = requests.get(url, headers=headers).text
    ele = etree.HTML(respost)
    ulall = ele.xpath('//ul[@class="children-list clearix"]')
    for i in ulall:
        url_all = i.xpath('./li/a/@href')  # collect every product URL in this category
        # some hrefs are protocol-relative, so patch them before requesting
        for i in url_all:
            if 'https:' in i:
                url1 = i
                headers1 = {
                    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}
                respost1 = requests.get(url1, headers=headers1).text
                ele1 = etree.HTML(respost1)
                script1 = ele1.xpath('//script[@type="text/javascript"]/text()')
                for aq in script1:
                    con1 = aq.split(',')
                    for aw in con1:
                        ac = re.findall('"product_id":"(.*?)"', aw)
                        if ac:  # save the matches
                            for xc in ac:
                                lst1.append(xc)
                                # print(xc)
            else:
                url2 = 'https:' + i
                drive = webdriver.Chrome()
                drive.maximize_window()
                drive.get(url2)
                time.sleep(1)  # give the page 1 second to load
                idall = drive.page_source  # HTML of the rendered page
                ida = re.findall('6.64.2.(.*?)&', idall)  # pull the page id
                for qe in ida:
                    if qe.isdigit():  # keep only purely numeric ids
                        lst2.append(qe)
                # print(lst2)
                drive.quit()
    lst3 = lst1 + lst2
    lst4 = list(set(lst3))  # dedupe: all product ids
    # print(lst4)
    lst5 = []
    lst6 = []
    lst7 = []
    lst8 = []
    lst9 = []
    acx = 0
    for w2 in lst4:
        id = w2
        url3 = f'https://api2.service.order.mi.com/user_comment/get_summary?show_all_tag=1&goods_id={id}&v_pid=17972&support_start=0&support_len=10&add_start=0&add_len=10&profile_id=0&show_img=0&callback=__jp6'
        headers3 = {'referer': 'https://www.mi.com/',
                    'accept': 'application/json, text/plain, */*',
                    'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="86", "Google Chrome";v="86"',
                    'sec-fetch-site': 'same-origin',
                    'sec-fetch-mode': 'cors',
                    'sec-fetch-dest': 'empty',
                    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.188.400 QQBrowser/11.4.5225.400'}
        respost3 = requests.get(url3, headers=headers3).text
        # print(respost3)
        # the endpoint returns JSONP text, not HTML, so split the raw body
        con3 = respost3.split(',')
        for z in con3:
            xc3 = re.findall('"content":"(.*?)"', z)  # comment text
            if xc3:
                for ax in xc3:
                    lst5.append(ax)
        good = re.findall('"good":"(.*?)"', respost3)  # positive reviews
        for g in good:
            lst6.append(g)
        general = re.findall('"general":"(.*?)"', respost3)  # neutral reviews
        for g in general:
            lst7.append(g)
        poor = re.findall('"poor":"(.*?)"', respost3)  # negative reviews
        for p in poor:
            lst8.append(p)
        ac = re.findall('"all":"(.*?)"', respost3)  # all reviews
        for a in ac:
            lst9.append(a)
        acx = acx + 1
        print(f'Scraped record {acx}')
    # save this pass
    with open('mi.csv', 'a', newline='', encoding='utf-8') as f:
        write = csv.writer(f)
        # zip truncates to the shortest list, avoiding the IndexError that
        # indexing all six lists by len(lst4) could raise
        for row in zip(lst4, lst5, lst6, lst7, lst8, lst9):
            write.writerow(row)
    # clear the lists so the next pass starts fresh
    lst1.clear()
    lst2.clear()
    lst3.clear()
    lst4.clear()
    lst5.clear()
    lst6.clear()
    lst7.clear()
    lst8.clear()
    lst9.clear()
    acx = 0
    # advance the counter; the while condition ends the run after five passes
    count += 1
If you study how this page actually serves its data, there is a much simpler approach that is both faster and cleaner; none of this complexity is needed. The page already hands you the complete data in dictionary format.
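To make that concrete: the comment-summary endpoint queried above returns JSONP (note the callback=__jp6 parameter in url3), so stripping the wrapper leaves one complete JSON dictionary that json.loads can parse in a single call, with no comma-splitting or per-fragment regexes. Below is a minimal sketch of that approach, assuming the response body has the form __jp6({...}); the field names are taken from the regexes earlier in this script, but the exact nesting of the payload is an assumption, and the goods id is a hypothetical placeholder.

import json
import re
import requests

def fetch_summary(goods_id):
    # Same endpoint the scripts above query for each product id.
    url = ('https://api2.service.order.mi.com/user_comment/get_summary'
           f'?show_all_tag=1&goods_id={goods_id}&v_pid=17972'
           '&support_start=0&support_len=10&add_start=0&add_len=10'
           '&profile_id=0&show_img=0&callback=__jp6')
    headers = {'referer': 'https://www.mi.com/',
               'user-agent': 'Mozilla/5.0'}
    text = requests.get(url, headers=headers).text
    # Strip the __jp6( ... ) JSONP wrapper and parse the JSON inside it.
    m = re.search(r'__jp6\((.*)\)\s*;?\s*$', text, re.S)
    return json.loads(m.group(1)) if m else {}

summary = fetch_summary('10000086')  # hypothetical goods id
# Field names match the regexes above; where they sit inside the parsed
# dictionary is an assumption and may need adjusting against a real response.
data = summary.get('data', {})
print(data.get('comments_total'), data.get('comments_good'), data.get('satisfy_per'))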