为什么爬取后的plt和tlt为空,哪里出错了?


import requests
import re


def getHTML(url):
    """Download ``url`` and return the response body as text.

    The response encoding is taken from ``apparent_encoding`` (the encoding
    requests guesses from the body) before the text is decoded.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string when the request fails
        (connection problem, timeout, invalid URL or HTTP error status), so
        the caller can always treat the result as a string.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as exc:
        # Only request/HTTP failures are expected here; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        print("error:", exc)
        return ""

def parseHTML(infolist, html):
    """Extract ``[title, price]`` pairs from page text into ``infolist``.

    Every ``"view_price":"..."`` and ``"raw_title":"..."`` occurrence in
    ``html`` is collected; the n-th title is paired with the n-th price.
    If one field occurs more often than the other, the surplus is ignored.

    Args:
        infolist: List that receives one ``[title, price]`` entry per item.
            It is modified in place.
        html: Page text to scan. Anything that is not a ``str`` (for example
            ``None`` from a failed download) is reported with ``"error"``
            and leaves ``infolist`` untouched.

    Returns:
        None.
    """
    # Local import so this function does not depend on the file's import block.
    import json

    if not isinstance(html, str):
        print("error")
        return

    # Capture groups return the value directly, so there is no need to split
    # on ':' (which truncated titles containing a colon) or to eval() text
    # that came from the network.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    titles = re.findall(r'"raw_title":"(.*?)"', html)

    for raw_title, price in zip(titles, prices):
        try:
            # Decode escapes such as \uXXXX without executing anything.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the text as captured.
            title = raw_title
        infolist.append([title, price])

def printlist(infolist):
    """Print the collected items as a numbered, tab-separated table.

    Output order: a header row, the raw ``infolist`` dump, then one row per
    item with a 1-based index followed by the item's first two elements.

    Args:
        infolist: Sequence of items; each item is indexed at ``[0]`` and
            ``[1]`` for the second and third columns.

    Returns:
        None.
    """
    row_format = "{:4}\t{:8}\t{:16}"
    header = row_format.format("序号", "商品名称", "价格")
    print(header)
    # Raw dump of the whole list, emitted between the header and the rows.
    print(infolist)
    for number, item in enumerate(infolist, start=1):
        print(row_format.format(number, item[0], item[1]))

def main(key_w="足球鞋", depth=2):
    """Fetch ``depth`` result pages for ``key_w`` and print what was found.

    For each page the search URL is built, downloaded with ``getHTML`` and
    scanned with ``parseHTML``; the accumulated list is then printed with
    ``printlist``.

    Args:
        key_w: Search keyword appended to the query URL.
        depth: Number of result pages to request.

    Returns:
        None.
    """
    # NOTE(review): the host looks like a placeholder — confirm the real
    # search endpoint before expecting any matches.
    start_url = "https://s.tb.com/search?q="
    # Step of the "s" offset between consecutive pages; presumably the number
    # of results per page — TODO confirm.
    offset_step = 44
    infolist = []
    for i in range(depth):
        url = start_url + key_w + "&s=" + str(offset_step * i)
        try:
            html = getHTML(url)
            parseHTML(infolist, html)
        except Exception as exc:
            # Keep going with the next page, but report the failure instead
            # of hiding it.
            print("error on page {}: {}".format(i + 1, exc))
            continue
    printlist(infolist)


if __name__ == "__main__":
    # Run the crawl only when executed as a script, not when imported.
    main()

url因为版权的问题不能给出

这个写错了吧:

start_url = "https://s.tb.com/search?q="  

先确认请求确实拿到了正确的数据。其次需要注意的是,淘宝的验证机制挺蛋疼的:一旦检测到类似蜘蛛的 IP,就会出现前端验证,这个时候后端直接访问是请求不到你需要的内容的。你检查一下是不是 IP 被限制了。