import requests
from bs4 import BeautifulSoup
import xlwt
import re
def main():
data_list=get_data(url_base)
path='xw.tc'
savedata(data_list,path)
find_link=re.compile(r"<a href='(.*?)'.* class='teac-name>")
find_name=re.compile(r"<a href=.*>(.*)</a>")
find_intro=re.compile(r"<p class='teac-zc'>(.*)</p>")
def get_data(url_base):
data_list=[]
html=ask_data(url_base)
soup=BeautifulSoup(html,'html.parser')
for item in soup.find_all('ul',style_='overflow:hidden;'):
data=[]
item=str(item)
link=re.findall(find_link,item)[0]
data.append(link)
name=re.findall(find_name,item)[0]
data.append(name)
intro=re.findall(find_intro,item)[0]
data_list.append(data)
return data_list
def ask_data(url_base):
headers={"User-Agent": "Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 80.0.3987.122 Safari / 537.36"}
r=requests.get(url_base,headers=headers)
try:
html=r.content.decode('utf-8')
except:
print('爬取失败')
return html
def savedata(data_list,path):
book = xlwt.Workbook(encoding="utf-8", style_compression=0)
sheet=book.add_sheet('教师信息',cell_overwrite_ok=True)
col=['信息链接','教师姓名','教师简介']
for i in range(0,3):
sheet.write(0,i,col[i])
for i in range(0,41):
print("第%d条" % (i + 1))
data = data_list[i]
for j in range(0,3):
sheet.write(i + 1, j, data[j])
book.save(path)
if __name__ == '__main__':
url_base='http://xmtxy.xjtu.edu.cn/szdw/zzjs.htm'
main()
url_base='http://xmtxy.xjtu.edu.cn/szdw/zzjs/1.htm'
main()
print('爬取完毕')
我刚看了一下,应该是正则表达式的问题,好像没有匹配到字符串
soup.find_all('ul', style='overflow:hidden;')
style没有下划线,你加了当然找不到,既然用了bs4还用正则干嘛。
初学最好还是养成写注释的习惯