这段代码模仿了嵩天老师“最好大学排名”爬虫的写法,但是用同样的方法爬取清博指数的网页数据时,就会出现这个错误,求解。
import requests
from bs4 import BeautifulSoup
import bs4,time
def getHTMLText(url, headers=None, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.
        headers: Optional mapping of extra request headers (for example a
            ``Cookie``); entries here override the defaults below.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    # The original post elided the real Cookie / User-Agent values as "too
    # long", leaving a broken dict literal that was never sent. A generic
    # User-Agent is used as a placeholder; pass real values via ``headers``.
    request_headers = {'User-Agent': 'Mozilla/5.0'}
    if headers:
        request_headers.update(headers)

    # A timeout keeps the script from hanging forever on a stalled server.
    r = requests.get(url, headers=request_headers, timeout=timeout)
    r.raise_for_status()
    # Decode using the encoding detected from the body rather than the one
    # declared in the response headers.
    r.encoding = r.apparent_encoding
    return r.text
def _cell_text(tag):
    """Return the text of *tag*, never ``None``."""
    # ``.string`` is None when a tag has more than one child node; fall back
    # to the concatenated text so rows never carry None values.
    text = tag.string
    return text if text is not None else tag.get_text()


def getinfo(ulist, html):
    """Parse the first ``<tbody>`` in *html* and append one row per ``<tr>``.

    Each appended row is a list of nine text values taken from the row's
    ``<td>`` cells, in the order 0, 1 (link text), 2, 3, 4, 5, 6, 8, 7.

    Args:
        ulist: List that receives the parsed rows; it is modified in place.
        html: Markup of the page to parse.

    Nothing is appended when the markup has no ``<tbody>`` (for example when
    the table is filled in by JavaScript or the site served a login page), so
    callers can detect failure by checking whether *ulist* grew.
    """
    soup = BeautifulSoup(html, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        return

    for tr in tbody.children:
        # ``.children`` also yields whitespace text nodes between rows.
        if not isinstance(tr, bs4.element.Tag):
            continue
        trs = tr('td')
        # Indexes 0..8 are read below, so a usable row needs nine cells.
        if len(trs) < 9:
            continue
        astr = trs[1].find('a')
        # Use the whole cell when it contains no link.
        name_tag = astr if astr is not None else trs[1]
        # NOTE(review): cells 8 and 7 are taken in swapped order, as in the
        # original; presumably to match the column order printed by
        # printinfo -- verify against the live page.
        ulist.append([
            _cell_text(trs[0]),
            _cell_text(name_tag),
            _cell_text(trs[2]),
            _cell_text(trs[3]),
            _cell_text(trs[4]),
            _cell_text(trs[5]),
            _cell_text(trs[6]),
            _cell_text(trs[8]),
            _cell_text(trs[7]),
        ])
def printinfo(ulist, num):
    """Print a header line followed by at most *num* rows of *ulist*.

    Args:
        ulist: List of rows; the first nine items of each row are printed.
        num: Maximum number of rows to print. Fewer are printed when
            *ulist* is shorter; nothing but the header when it is empty.

    Every field is left in a tab-separated, width-10 column. ``None`` items
    and missing trailing items are printed as empty strings so that a
    partially parsed row cannot abort the listing.
    """
    tplt = '{:10}\t{:10}\t{:10}\t{:10}\t{:10}\t{:10}\t{:10}\t{:10}\t{:10}\t{:10}\t'
    # chr(12288) is U+3000, the full-width (ideographic) space; it is passed
    # as the tenth field of every line.
    print(tplt.format("排名","微博号","发博/原创","转发数","评论数","原创转发","原创评论","总点赞数","BCI",chr(12288)))
    # Slicing instead of indexing by range(num) avoids IndexError when fewer
    # than num rows were collected; max() keeps a negative num printing none.
    for u in ulist[:max(num, 0)]:
        # format() rejects a width spec on None, so substitute ''.
        fields = ['' if value is None else value for value in u[:9]]
        fields.extend([''] * (9 - len(fields)))
        print(tplt.format(*fields, chr(12288)))
def main():
    """Download the gsdata.cn ``wbrank?type=day`` page and print its top rows.

    The page is fetched, parsed into rows, and the first ten rows are
    printed; after a one-second pause the type of the fetched page object
    is printed as well.
    """
    rank_url = 'http://www.gsdata.cn/rank/wbrank?type=day'
    page = getHTMLText(rank_url)

    rows = []
    getinfo(rows, page)
    printinfo(rows, 10)

    time.sleep(1)
    print(type(page))
# Run the scraper only when executed as a script, so that importing this
# module does not trigger a network request.
if __name__ == "__main__":
    main()