Python爬取中国前30名大学

Python爬取中国前30名大学


最近在学习爬虫,学习任务中有一个爬取中国前30名的大学,按着教程来爬一直有bug,网上的代码没找到一个能用的。

import requests
import re
import bs4
from bs4 import BeautifulSoup

def getHTMLText(url):
    try:
        r = requests.get(url,timeout = 30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        print("getHTMLText失败!")
        return ""

def fillUnivList(ulist,html):
    soup = BeautifulSoup(html,"html.parser")
    for tr in soup.find("tbody").children:
        if isinstance(tr,bs4.element.Tag):      #过滤非标签信息
            tds = tr("td")      #查询td标签
            tagas = tr("a")     #查询a标签
            ulist.append([tds[0].string.strip(),tagas[0].string.strip(),tds[2].string.strip(),tds[3].string.strip(),tds[4].string.strip()])
    pass

def printUnivList(ulist,num):
    a = "{0:^10}{1:{5}^10}{2:^10}{3:^10}{4:^10}"
    print(a.format("排名","学校","地域","类型","总分",chr(12288)))
    for i in range(num):
        u = ulist[i]
        print(a.format(u[0],u[1],u[2],u[3],u[4],chr(12288)))

def save(path,ulist):
    Excel = open(path,'w',encoding = 'gbk')
    Excel.write('排名\t学校\t地域\t类型\t总分\n')
    for i in range(len(ulist)):
        for j in range(len(ulist[i])):
            Excel.write(str(ulist[i][j]))
            Excel.write('\t')      #相当于Tab一下,换下一个单元格
        Excel.write('\n')          #写完一行,换行
    Excel.close()

def main():
    uinfo = []
    url = "http://www.shanghairanking.cn/rankings/bcur/2020.html"
    html = getHTMLText(url)
    print("get成功")
    fillUnivList(uinfo,html)
    print("fill成功")
    printUnivList(uinfo,20)     #20univs
    print("print成功")
    save("大学排名.xls",uinfo)
    print("save成功")

main()

 

一直会提示以下问题,尝试多种方式没有一种能解决的。求教大神们给讲解下

Traceback (most recent call last):
  File "/Volumes/Others/编程/bs4/Best University-210422.py", line 54, in <module>
    main()
  File "/Volumes/Others/编程/bs4/Best University-210422.py", line 47, in main
    fillUnivList(uinfo,html)
  File "/Volumes/Others/编程/bs4/Best University-210422.py", line 18, in fillUnivList
    for tr in soup.find("tbody").children:
AttributeError: 'NoneType' object has no attribute 'children'

 

这个排名名次的数据是动态获取的,用这个链接:https://www.shanghairanking.cn/api/pub/v1/bcur?bcur_type=11&year=2020