import requests
from bs4 import BeautifulSoup
import re
#网页爬取
def getHtml(url, data, timeout=10):
    """Fetch *url* with query parameters *data* and return the response body.

    Args:
        url: target URL.
        data: dict of query-string parameters passed to requests.
        timeout: seconds before the request is aborted (new, defaults to 10
            so existing callers are unaffected; prevents the crawler from
            hanging forever on a stalled connection).

    Returns:
        The response text, decoded as UTF-8.
    """
    # A desktop-browser User-Agent is required: Douban rejects the default
    # python-requests UA outright.
    r = requests.get(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                               'AppleWebKit/537.36 (KHTML, like Gecko) '
                               'Chrome/94.0.4606.81 Safari/537.36'},
        params=data,
        timeout=timeout,
    )
    r.encoding = 'utf-8'
    # NOTE(review): Douban starts answering 403/418 once it flags a client as
    # a crawler — a non-200 printed here is why "it scraped fine before but
    # returns nothing after many requests".
    print(r.status_code)
    return r.text
# #parserHtml()函数为网页内容解析函数
# #数据解析
def parserHtml(html, name, attrs):
    """Parse one Douban Top250 list page into movie records.

    Args:
        html: the page HTML text.
        name: tag name to match (e.g. "div").
        attrs: attribute filter dict (e.g. {'class': 'info'}).

    Returns:
        A list of [title, director, score, vote_count, quote] lists,
        one entry per matched movie block. Empty if nothing matched
        (typically an anti-crawler / login page).
    """
    result = []
    soup = BeautifulSoup(html, "html.parser")
    for item in soup.find_all(name, attrs):
        title_tag = item.find("span", {'class': "title"})
        # First text node of the un-classed <p> holds director/actor info,
        # e.g. "导演: 弗兰克·德拉邦特 Frank Darabont\xa0..." — match up to
        # the first \xa0, then slice off the "导演:" prefix and trailing chars.
        director_info = item.find('p', {'class': ''}).contents[0]
        matches = re.findall(r'导演:.*?\xa0', director_info)
        # Guard: pages served after rate-limiting may lack the pattern;
        # the original code crashed with IndexError here.
        director = matches[0][4:-3:] if matches else "未知"
        score = item.find('span', {'class': 'rating_num'})
        # Last <span> of the star div is "NNNN人评价"; strip() drops those chars.
        number = item.find('div', class_="star").find_all('span')[-1].string.strip('人评价')
        quote = item.find('span', {'class': 'inq'})
        quote = "暂无" if quote is None else quote.string
        result.append([title_tag.string, director, score.string, number, quote])
    return result
# print(getHtml(url="http://www.szpt.edu.cn/"))
#print(parserHtml(html=getHtml(url="http://www.szpt.edu.cn/"),name='div',attrs={'class':'list clearfix'}))
#数据储存
import csv
def writeFile(data, filename):
    """Append one CSV row *data* to ./<filename>, encoded as UTF-8.

    Args:
        data: a sequence of cell values forming one row.
        filename: file name relative to the current directory.
    """
    save_path = "./" + filename  # original had a redundant double assignment here
    # newline='' stops csv from inserting blank lines on Windows; an explicit
    # utf-8 encoding keeps the Chinese text readable regardless of the locale
    # default (the scraped data is decoded as utf-8 in getHtml).
    with open(save_path, 'a', encoding='utf-8', errors='ignore', newline='') as fd:
        csv.writer(fd).writerow(data)
#数据格式化输出
def display(data):
    """Pretty-print movie records as an aligned console table.

    Args:
        data: list of [title, director, score, vote_count, quote] rows
            as produced by parserHtml.

    chr(12288) (the full-width space) is used as the fill character so
    that CJK text columns line up in the console.

    Fixes the original defect: the format strings only referenced
    placeholders {1}-{3}, so director, vote count and quote were passed
    in but never printed.
    """
    fmt = "{1:<6}{2:{0}<12}{3:<10}{4:{0}<16}{5:<12}{6}"
    print(fmt.format(chr(12288), "序号", "电影名", "豆瓣评分", "导演", "评论人数", "影评"))
    for idx, row in enumerate(data, start=1):
        print(fmt.format(chr(12288), idx, row[0], row[2], row[1], row[3], row[4]))
#main函数定义及调用
def main():
    """Crawl 8 pages (200 entries) of the Douban Top250 list and display them."""
    result = []
    page_num = 8
    for i in range(page_num):
        # Pagination: Douban pages by a 'start' offset in steps of 25.
        # The original code both baked start/filter into the URL *and* sent a
        # typo'd params key 'stars', producing "start=X&filter=&stars=X";
        # send the real parameters once, via params.
        params = {'start': i * 25, 'filter': ''}
        url = "https://movie.douban.com/top250"
        html = getHtml(url, params)
        page_items = parserHtml(html, "div", {'class': 'info'})
        if not page_items:
            # An empty parse usually means the anti-crawler kicked in
            # (non-200 response or a login/verification page).
            print("第{}页未解析到数据，可能已被反爬限制".format(i + 1))
        result += page_items
    display(result)
#主函数调用
# Entry point. The original file had the entire script accidentally pasted a
# second time starting on this line (fused as "main()import requests", a
# SyntaxError); the duplicate definitions are removed and the call is guarded
# so importing this module does not trigger a crawl.
if __name__ == "__main__":
    main()
# NOTE(review): stray forum text moved into a comment (it was a SyntaxError).
# Original question: "之前还爬取的到数据, 爬多了怎么变成这个了"
# ("it scraped data fine before — why did it stop after scraping a lot?").
# Answer: Douban rate-limits and then blocks repeat crawlers; check the status
# code printed by getHtml — 403/418 or a login page means the IP/UA is flagged.
