import csv
import getpass
import json
import re
import time
from operator import itemgetter

import requests
from selenium import webdriver  # drives a real Chrome browser
from selenium.webdriver.common.by import By
## Path where the scraped results are saved (adjust as needed).
csv_file = open(r'C:\Users\Administrator\Desktop\demo.csv', 'w', newline='', encoding='utf-8-sig')
writer = csv.writer(csv_file)

# Start a real Chrome browser (Selenium) and open the Taobao login page.
driver = webdriver.Chrome()
driver.get('https://login.taobao.com/member/login.jhtml')
time.sleep(1)

# Ask for the credentials at runtime instead of hard-coding them in the
# source: committing a plaintext account and password is a security defect.
account = input('Taobao account: ')
password = getpass.getpass('Taobao password (hidden): ')

# `find_element_by_*` was removed in Selenium 4 — use find_element(By.*, ...).
user_box = driver.find_element(By.NAME, 'fm-login-id')
user_box.send_keys(account)
time.sleep(2)
pwd_box = driver.find_element(By.NAME, 'fm-login-password')
pwd_box.send_keys(password)
time.sleep(2)
submit = driver.find_element(By.CLASS_NAME, 'fm-btn')
submit.click()  # log in
time.sleep(5)  # wait for the post-login page (and any captcha) to settle

# Harvest the session cookies so the later requests.get() calls are
# authenticated as the logged-in user.
cookie_list = driver.get_cookies()
print(len(cookie_list))
cookies = {cookie['name']: cookie['value'] for cookie in cookie_list}
print("已经成功的获取到用户登录的cookies")
print(cookies)
driver.quit()  # quit() (unlike close()) also terminates the chromedriver process
# HTTP headers for the search requests (mobile Chrome user agent).
headers = {
    'Host': 's.taobao.com',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Mobile Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
}

# Search URL template: %(key)s is the query text, %(pnum)d the result offset.
list_url = 'http://s.taobao.com/search?q=%(key)s&ie=utf8&s=%(pnum)d'

# Regex patterns that extract fields from the JSON embedded in the result page.
titles = '"raw_title":"(.*?)"'        # item title
locations = '"item_loc":"(.*?)"'      # seller location
sales = '"view_sales":"(.*?)人付款"'   # sales count (text before "人付款")
comments = '"comment_count":"(.*?)"'  # number of comments
prices = '"view_price":"(.*?)"'       # listed price
nids = '"nid":"(.*?)"'                # item id — required to build the item URL

# Header row of the output CSV.
writer.writerow(['商品名称', '销售地', '销售量', '评论数', '销售价格', '商品链接'])

key = input('输入想要爬取的商品名称:')
Page = 5   # number of result pages to scrape; adjust as needed
data = []  # collected rows; sorted by sales count before being written out
def _sales_to_number(text):
    """Convert a raw sales string like '1500+' or '1.5万+' to a float count."""
    if text.endswith('+'):
        text = text[:-1]  # trailing '+' means "at least N" — drop it
    if text.endswith('万'):
        return float(text[:-1]) * 10000  # '万' = tens of thousands
    return float(text)

for i in range(Page):
    pnum = i * 44  # Taobao paginates 44 items per page via the `s` offset
    url = list_url % {'key': key, 'pnum': pnum}
    print(url)
    # allow_redirects=False: a redirect here usually means the cookies were
    # rejected and we are being bounced to a login/captcha page.
    res = requests.get(url, headers=headers, cookies=cookies, allow_redirects=False)
    html = res.text
    title = re.findall(titles, html)
    location = re.findall(locations, html)
    sale = re.findall(sales, html)
    comment = re.findall(comments, html)
    price = re.findall(prices, html)
    nid = re.findall(nids, html)
    # zip() instead of indexing range(len(title)): the six field lists can come
    # back with different lengths (e.g. an item without a comment count), and
    # indexing past the shorter list raised IndexError in the original.
    for t, loc, s, c, p, n in zip(title, location, sale, comment, price, nid):
        if not s:
            continue  # empty sales field — skip the item instead of crashing
        goods_url = 'https://item.taobao.com/item.htm?id=' + n + '&ns=1&abbucket=19#detail'
        data.append([t, loc, _sales_to_number(s), c, p, goods_url])
    print('-------Page%s 已经抓取完毕!--------\n\n' % (i + 1))
    time.sleep(2)  # be polite: small delay between page requests

# Sort by sales count, highest first, then write all rows at once.
data.sort(key=itemgetter(2), reverse=True)
writer.writerows(data)
csv_file.close()  # flush buffered rows to disk — the original never closed the file
# 问:运行之后就只有这个登录界面,得不到具体的商品信息,能帮忙指点一下吗?
# 答:这个需要先分析一下搜索结果网页,根据网页里面的 DOM 结构(或嵌入的 JSON 数据)来提取商品信息。