import requests
import re
def getHTMLText(url):
    """Fetch *url* and return the response body as text.

    Sends a hard-coded Taobao/Tmall session cookie plus a minimal
    user-agent so the shop page does not reject the request.

    Returns:
        The page HTML on success, or "" on any request/HTTP failure
        (callers treat an empty string as "no page").
    """
    kv = {
        'cookie': 'thw=cn; t=040b13710117e82ba5a48a3d55259c85; UM_distinctid=17912c36d2d58b-049356f888806b-404b032d-15f900-17912c36d2e7d0; enc=OyW%2BP9L19CxbiGNnMXpbrMx0TGZZ6k%2B9%2BqEhiOjQ0VNlbOBCQcZGbxC2fgsv0rsBLfP8l1Vs%2B6Hynudz2ZoXQAxhavRsbn%2FQE%2FZWRimasdc%3D; cookie2=16dd7e08f6962d37f91fc97fed6dfcbc; Hm_lvt_96bc309cbb9c6a6b838dd38a00162b96=1625018753,1625041401,1625102021,1625189059; samesite_flag=true; _m_h5_tk=7c070d43c27bacbf08702dc8cae67500_1625197274412; _m_h5_tk_enc=6a8787e2102d5e9e2b8bab78b51912c7; tb_token=1s51CyaIsAK77y3; Hm_lpvt_96bc309cbb9c6a6b838dd38a00162b96=1625190744; xlly_s=1; cna=5soeGDxHrXECAXPBsLP9C4GS; sgcookie=E100M7puKdJsJ7Vt%2FO%2FNNNTnErJbZ2cn46T%2BNTslp09LZTXbM7WrI8aYYGGeuDJ0oh1qGh2w5CLOkb4LO7nQIKQvmg%3D%3D; unb=2670926756; uc3=id2=UU6m39INa5s0JA%3D%3D&lg2=U%2BGCWk%2F75gdr5Q%3D%3D&nk2=12eoTWgxTaR7iV4Uk493&vt3=F8dCuwzn8QHPd1O2Q3U%3D; csg=63e76ecf; lgc=%5Cu4F20%5Cu8BF4%5Cu4E2Dde%5Cu8DEF%5Cu4EBA%5Cu7532i; cookie17=UU6m39INa5s0JA%3D%3D; dnk=%5Cu4F20%5Cu8BF4%5Cu4E2Dde%5Cu8DEF%5Cu4EBA%5Cu7532i; skt=b40fd6925e39aedf; existShop=MTYyNTE5Njc0NA%3D%3D; uc4=id4=0%40U2xrciCyjNrShY5Ou31hRYxrrqM3&nk4=0%401ajx2yFZtsi7C80%2F%2Bxt6ZnViDksDJQNVRqg%3D; tracknick=%5Cu4F20%5Cu8BF4%5Cu4E2Dde%5Cu8DEF%5Cu4EBA%5Cu7532i; cc=UtASsssmfA%3D%3D; l_g=Ug%3D%3D; sg=i65; nk=%5Cu4F20%5Cu8BF4%5Cu4E2Dde%5Cu8DEF%5Cu4EBA%5Cu7532i; cookie1=BxAcBpL1nbND89NIuZlrTklJxdHebbbF5u3U6Udw2yE%3D; tfstk=cJROBON39DmGIHQpzdHnhSdRg78AZrMd0VsYMVS68O4jEFPAiXAka_1UCZW7o2C..; l=eBrp7uvljK3mOzDOBOfwnurza77tdIRVguPzaNbMiOCP9wCk5v2AW69UUWTDCnGVnsMX-3PGap0pByLowyUBQxv9-e_7XPQojdLh.; mt=ci=0_1; uc1=cookie14=Uoe2yIJ2WCWo9w%3D%3D&pas=0&cookie15=W5iHLLyFOGW7aA%3D%3D&cookie16=WqG3DMC9UpAPBHGz5QBErFxlCA%3D%3D&cookie21=W5iHLLyFeYZ1WM9hVnmS&existShop=false; isg=BDAwZ9kLJO1ujPi1hCuInsO9Af6CeRTDAJDU3CqB6Qte5dCP040uU1TXOe2F9cyb',
        'user-agent': 'Mozilla/5.0',
    }
    try:
        r = requests.get(url, headers=kv, timeout=300)
        r.raise_for_status()
        return r.text
    # Narrowed from a bare `except:` so programming errors (NameError,
    # KeyboardInterrupt, ...) are no longer silently swallowed.
    except requests.RequestException:
        return ""
def parsePage(ilt, html):
    """Extract numeric item ids from *html* and append them to *ilt*.

    Each item link of the form ``item.htm?id=<digits>&`` contributes one
    ``[goodid]`` entry (the id as a string). *ilt* is mutated in place.
    """
    try:
        # The original pattern r'item.htm?id=.*?&' left '.' and '?'
        # unescaped (so '?' made the 'm' optional), and then ran eval()
        # on the scraped text — unsafe, and it crashed on the trailing
        # '&' anyway.  Escape the metacharacters and capture the digits.
        for goodid in re.findall(r'item\.htm\?id=(\d+)&', html):
            ilt.append([goodid])
    except Exception:
        # Best-effort parse, mirroring the original's silent fallback.
        print("")
def printGoodsList(ilt):
    """Print a numbered table of collected goods ids, one row each."""
    row_fmt = "{:4}\t{:10}"
    print(row_fmt.format("序号", "商品ID"))
    for row_no, goods in enumerate(ilt, 1):
        print(row_fmt.format(row_no, goods[0]))
def main():
    """Crawl `depth` listing pages of the shop and print the item ids."""
    depth = 4
    start_url = 'https://gaozi.tmall.com/search.htm?spm='
    infoList = []
    for i in range(depth):
        try:
            # Page index is 0-based; the original's `1 * i` was a no-op.
            url = start_url + '&pageNo=' + str(i)
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            # Best-effort crawl: skip a failing page, keep going.
            continue
    printGoodsList(infoList)


if __name__ == '__main__':
    main()
ilt.append([goodid])
你这个函数里没有定义 return。严格来说,列表是可变对象,ilt.append 会原地修改调用方传入的那个列表;但显式地 return 这个变量能让数据流更清晰、不依赖副作用,所以建议 return 一下这个变量。
parsePage(infoList, html) 则修改成 infoList = parsePage(html)(下面新版的 parsePage 只接收 html 一个参数)。
原因是pri = re.findall(r'item.htm?id=.*?&', html)没有获取到数据。使用bs4,并将代码这样修改即可。
import requests
from bs4 import BeautifulSoup as bs
def getHTMLText(url):
    """Fetch *url* and return the response body as text.

    Sends a hard-coded Taobao/Tmall session cookie plus a minimal
    user-agent so the shop page does not reject the request.

    Returns:
        The page HTML on success, or "" on any request/HTTP failure
        (callers treat an empty string as "no page").
    """
    kv = {
        'cookie': 'thw=cn; t=040b13710117e82ba5a48a3d55259c85; UM_distinctid=17912c36d2d58b-049356f888806b-404b032d-15f900-17912c36d2e7d0; enc=OyW%2BP9L19CxbiGNnMXpbrMx0TGZZ6k%2B9%2BqEhiOjQ0VNlbOBCQcZGbxC2fgsv0rsBLfP8l1Vs%2B6Hynudz2ZoXQAxhavRsbn%2FQE%2FZWRimasdc%3D; cookie2=16dd7e08f6962d37f91fc97fed6dfcbc; Hm_lvt_96bc309cbb9c6a6b838dd38a00162b96=1625018753,1625041401,1625102021,1625189059; samesite_flag=true; _m_h5_tk=7c070d43c27bacbf08702dc8cae67500_1625197274412; _m_h5_tk_enc=6a8787e2102d5e9e2b8bab78b51912c7; tb_token=1s51CyaIsAK77y3; Hm_lpvt_96bc309cbb9c6a6b838dd38a00162b96=1625190744; xlly_s=1; cna=5soeGDxHrXECAXPBsLP9C4GS; sgcookie=E100M7puKdJsJ7Vt%2FO%2FNNNTnErJbZ2cn46T%2BNTslp09LZTXbM7WrI8aYYGGeuDJ0oh1qGh2w5CLOkb4LO7nQIKQvmg%3D%3D; unb=2670926756; uc3=id2=UU6m39INa5s0JA%3D%3D&lg2=U%2BGCWk%2F75gdr5Q%3D%3D&nk2=12eoTWgxTaR7iV4Uk493&vt3=F8dCuwzn8QHPd1O2Q3U%3D; csg=63e76ecf; lgc=%5Cu4F20%5Cu8BF4%5Cu4E2Dde%5Cu8DEF%5Cu4EBA%5Cu7532i; cookie17=UU6m39INa5s0JA%3D%3D; dnk=%5Cu4F20%5Cu8BF4%5Cu4E2Dde%5Cu8DEF%5Cu4EBA%5Cu7532i; skt=b40fd6925e39aedf; existShop=MTYyNTE5Njc0NA%3D%3D; uc4=id4=0%40U2xrciCyjNrShY5Ou31hRYxrrqM3&nk4=0%401ajx2yFZtsi7C80%2F%2Bxt6ZnViDksDJQNVRqg%3D; tracknick=%5Cu4F20%5Cu8BF4%5Cu4E2Dde%5Cu8DEF%5Cu4EBA%5Cu7532i; cc=UtASsssmfA%3D%3D; l_g=Ug%3D%3D; sg=i65; nk=%5Cu4F20%5Cu8BF4%5Cu4E2Dde%5Cu8DEF%5Cu4EBA%5Cu7532i; cookie1=BxAcBpL1nbND89NIuZlrTklJxdHebbbF5u3U6Udw2yE%3D; tfstk=cJROBON39DmGIHQpzdHnhSdRg78AZrMd0VsYMVS68O4jEFPAiXAka_1UCZW7o2C..; l=eBrp7uvljK3mOzDOBOfwnurza77tdIRVguPzaNbMiOCP9wCk5v2AW69UUWTDCnGVnsMX-3PGap0pByLowyUBQxv9-e_7XPQojdLh.; mt=ci=0_1; uc1=cookie14=Uoe2yIJ2WCWo9w%3D%3D&pas=0&cookie15=W5iHLLyFOGW7aA%3D%3D&cookie16=WqG3DMC9UpAPBHGz5QBErFxlCA%3D%3D&cookie21=W5iHLLyFeYZ1WM9hVnmS&existShop=false; isg=BDAwZ9kLJO1ujPi1hCuInsO9Af6CeRTDAJDU3CqB6Qte5dCP040uU1TXOe2F9cyb',
        'user-agent': 'Mozilla/5.0',
    }
    try:
        r = requests.get(url, headers=kv, timeout=300)
        r.raise_for_status()
        return r.text
    # Narrowed from a bare `except:` so programming errors (NameError,
    # KeyboardInterrupt, ...) are no longer silently swallowed.
    except requests.RequestException:
        return ""
def parsePage(html):
    """Parse category links out of *html* and return the goods ids.

    Selects anchors under ``li.cat.snd-cat h4`` and, for every href that
    carries a ``parentCatId`` query parameter, takes the first 10
    characters of the second '='-delimited field of the href.
    NOTE(review): this slicing presumably isolates a numeric category
    id from hrefs like ``...?catId=NNNNNNNNNN&parentCatId=...`` —
    confirm against a live page.

    Returns:
        A new list of id strings (empty if nothing matched or parsing
        failed).
    """
    ilt = []
    soup = bs(html, 'lxml')
    try:
        pri = [x['href'] for x in soup.select('li.cat.snd-cat h4 a')]
        for a in pri:
            if 'parentCatId' in a:
                ilt.append(a.split('=')[2][:10])
    # Narrowed from a bare `except:`: the body can raise KeyError (an
    # <a> without href) or IndexError (unexpected href shape).
    except (KeyError, IndexError):
        print("")
    return ilt
def printGoodsList(ilt):
    """Print each goods id as a row with a 1-based sequence number."""
    row_fmt = "{:4}\t{:10}"
    print(row_fmt.format("序号", "商品ID"))
    for idx, goods_id in enumerate(ilt, 1):
        print(row_fmt.format(idx, goods_id))
def main():
    """Crawl `depth` listing pages of the shop and print the goods ids."""
    depth = 1
    start_url = 'https://gaozi.tmall.com/search.htm?spm='
    infoList = []
    for i in range(depth):
        try:
            # Page index is 0-based; the original's `1 * i` was a no-op.
            url = start_url + '&pageNo=' + str(i)
            html = getHTMLText(url)
            infoList = parsePage(html)
            print(infoList)
        except Exception:
            # Best-effort crawl: skip a failing page, keep going.
            continue
    printGoodsList(infoList)


if __name__ == '__main__':
    main()
如对你有帮助,请点我回答的右上方采纳按钮,给予采纳。