import re
import requests
from bs4 import BeautifulSoup
import time
import json
def get_one_page(url):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, timeout, or any other request failure).
    """
    headers = {
        # Browser-like User-Agent; must be a single string literal.
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/63.0.3239.132 Safari/537.36'
        ),
    }
    try:
        # Pass the headers (previously built but unused) and bound the wait
        # so a stalled connection cannot hang the script forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Same "no page" signal the function already uses for non-200 replies.
        return None
    if response.status_code == 200:
        return response.text
    return None
def _node_text(node):
    """Return ``node.string``, or ``None`` when *node* itself is missing."""
    return None if node is None else node.string


def parse_one_page(html):
    """Yield one dict per ``<dd>`` element found in *html*.

    Args:
        html: Page markup as a string. ``None`` or an empty string yields
            nothing instead of raising.

    Yields:
        Dicts with the keys ``index``, ``name``, ``star``, ``time`` and
        ``score``. A value is ``None`` when the corresponding tag is absent
        from the entry or has no single string child.
    """
    if not html:
        # Nothing to parse (e.g. the download failed).
        return
    soup = BeautifulSoup(html, 'lxml')
    for item in soup.find_all(name='dd'):
        star = _node_text(item.find(name='p', class_='star'))
        release = _node_text(item.find(name='p', class_='releasetime'))
        yield {
            # Ranking position of the film.
            'index': _node_text(item.find(name='i', class_='board-index')),
            # Film title.
            'name': _node_text(item.find(name='p', class_='name')),
            # Leading actors; presumably [3:] drops a 3-character label
            # prefix -- verify against the live markup.
            'star': star.strip()[3:] if star is not None else None,
            # Release date; presumably [5:] drops a 5-character label
            # prefix -- verify against the live markup.
            'time': release[5:] if release is not None else None,
            # Score, taken from the <i class="integer"> element.
            'score': _node_text(item.find(name='i', class_='integer')),
        }
def write_to_text(text, path='paimin.csv'):
    """Append *text* to a file as one JSON document per line.

    Args:
        text: Any JSON-serialisable object (the scraper passes a dict).
        path: Destination file; defaults to ``'paimin.csv'``, the location
            that was previously hard-coded. The file is opened in append
            mode with UTF-8 encoding.
    """
    # ensure_ascii=False keeps non-ASCII characters readable in the output.
    line = json.dumps(text, ensure_ascii=False) + '\n'
    with open(path, 'a', encoding='utf-8') as out:
        out.write(line)
def main(offset):
    """Download one board page, print each parsed entry and save it.

    Args:
        offset: Value placed in the ``offset`` query parameter of the board
            URL.
    """
    # The original literal was corrupted ("%27" is a URL-encoded quote);
    # the closing quote and the concatenation are restored here.
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # Download failed; there is nothing to parse for this page.
        return
    for item in parse_one_page(html):
        print(item)
        write_to_text(item)
# Script entry point: `name == 'main'` referenced an undefined variable, so
# the dunder names are restored.
if __name__ == '__main__':
    # Ten pages, with offsets 0, 10, ..., 90.
    for i in range(10):
        main(offset=i * 10)
        # Pause between requests to avoid hammering the server.
        time.sleep(1)