import requests
import re
def getHTML(url):
    """Fetch *url* and return the response body as text.

    The response encoding is set to ``r.apparent_encoding`` before the text
    is read, so decoding follows the charset guessed from the body rather
    than the one declared in the HTTP headers.

    Args:
        url: Address to request.

    Returns:
        The decoded page text, or ``None`` when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: report the failure and let the caller carry on.
        # Only request-related errors are handled here so that Ctrl-C and
        # programming errors are not silently swallowed.
        print("error")
        return None
def parseHTML(infolist, html):
    """Extract ``[title, price]`` pairs from page text and append them.

    The page is searched for ``"view_price":"..."`` and ``"raw_title":"..."``
    fragments. Prices and titles are paired by position; if one kind occurs
    more often than the other, the surplus entries are ignored.

    Args:
        infolist: List that receives one ``[title, price]`` entry per item
            (mutated in place). Both values are strings.
        html: Page text to scan. If it is not a string (for example ``None``
            after a failed download), ``"error"`` is printed and nothing is
            appended.
    """
    # Local import keeps this block self-contained; json is only used here.
    import json

    if not isinstance(html, str):
        print("error")
        return

    # Capture groups return just the value, so no split(':') is needed and
    # titles containing a colon are no longer truncated.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    titles = re.findall(r'"raw_title":"(.*?)"', html)

    for raw_title, price in zip(titles, prices):
        # The title is decoded as a JSON string literal to resolve escapes
        # such as \uXXXX. eval() is deliberately avoided: the text comes from
        # a remote page and must never be executed.
        try:
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the raw text rather than
            # dropping the item.
            title = raw_title
        infolist.append([title, price])
def printlist(infolist):
    """Print the collected items as a numbered, tab-separated table.

    A header row is printed first, followed by one row per entry with a
    running number starting at 1.

    Args:
        infolist: Sequence of entries where index 0 is the title and index 1
            is the price.
    """
    tplt = "{:4}\t{:8}\t{:16}"
    # Header columns: index, product name, price.
    print(tplt.format("序号", "商品名称", "价格"))
    for count, item in enumerate(infolist, start=1):
        print(tplt.format(count, item[0], item[1]))
def main():
    """Download search result pages for a fixed keyword and print the items.

    For each of ``depth`` pages a URL is built from the base address, the
    keyword and an ``s`` offset, the page is fetched with ``getHTML`` and
    scanned with ``parseHTML``. The accumulated list is printed once at the
    end with ``printlist``.
    """
    start_url = "https://s.tb.com/search?q="
    key_w = "足球鞋"
    depth = 2  # number of result pages to request
    # Step of the "s" offset between pages; presumably the number of items
    # per result page -- confirm against the site.
    page_step = 44
    infolist = []
    for page in range(depth):
        url = start_url + key_w + "&s=" + str(page_step * page)
        try:
            html = getHTML(url)
            parseHTML(infolist, html)
        except Exception:
            # Best-effort crawl: a failing page must not abort the others.
            # Exception (not a bare except) keeps Ctrl-C working.
            continue
    printlist(infolist)
# Start the crawl only when the file is executed as a script, so that
# importing this module does not trigger network requests.
if __name__ == "__main__":
    main()
URL 因为版权问题不能给出。
这个写错了吧:
start_url = "https://s.tb.com/search?q="
先确认请求确实拿到了正确的数据。其次需要注意的是,淘宝的验证机制挺蛋疼的:一旦检测到类似蜘蛛的 IP,就会出现前端验证,这个时候后端直接访问是请求不到你需要的内容的。你检查一下是不是 IP 被限制了。