程序一直在报错,提示模块有问题;把报错信息复制到百度上搜索,也找不到相同的答案。我一直是照着别人的做法做的,但就是有问题。
提示模块有问题,说明运行环境没有搭建好。
试试这个,已测试通过!
import csv
import re

import requests
class DouBan:
    """Scrape the Douban Top 250 movie list and store the rows in a text file.

    ``DouBan().main()`` writes a header line to ``filename`` and then appends
    one fully quoted, comma-separated row per movie found on each downloaded
    page.
    """

    # Seconds to wait for the server before a page request is abandoned.
    REQUEST_TIMEOUT = 10

    # Placeholder stored for any field that cannot be found in the markup.
    MISSING = "-"

    # Compiled once for the class instead of once per movie entry.
    _ITEM_RE = re.compile(r'<div class="info">.*?</li>')
    _TITLE_RE = re.compile(r'<span class="title">(.*?)</span>')
    _OTHER_RE = re.compile(r'<span class="other">(.*?)</span>')
    _RATING_RE = re.compile(r'<span class="rating_num".*?>(.*?)</span>')
    _VOTES_RE = re.compile(r'<span>(.*?)人评价</span>')
    _QUOTE_RE = re.compile(r'<span class="inq">(.*?)</span>')
    _DETAIL_RE = re.compile(r'<p class="">(.*?)</p>')
    _DIGITS_RE = re.compile(r"([0-9]+)")

    def __init__(self, filename='movie.txt'):
        """Store the page URL template, request headers and output path.

        Args:
            filename: Path of the file that receives the scraped rows.
        """
        self.url = "https://movie.douban.com/top250?start={}&filter="
        self.headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"}
        self.filename = filename

    def dataGet(self, start):
        """Download one list page and return its HTML text.

        Args:
            start: Value substituted into the ``start`` query parameter.

        Returns:
            The response body as text.

        Raises:
            requests.RequestException: On timeout, connection failure or an
                HTTP error status, so a failed page is not silently parsed
                as if it were an empty list.
        """
        response = requests.get(
            self.url.format(start),
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.text

    @staticmethod
    def _firstMatch(pattern, text, default=None):
        """Return the first ``findall`` hit of ``pattern`` in ``text``, or ``default``."""
        found = pattern.findall(text)
        return found[0] if found else default

    def _parseDetails(self, detail):
        """Split the detail paragraph into director, actors, year, country, genre.

        Every value defaults to ``MISSING`` so that an unexpected layout
        yields placeholders instead of an exception or values left over
        from the previous movie.
        """
        direct = actors = year = country = genre = self.MISSING

        # NOTE(review): the single-space separators used here (and in the
        # name clean-up in dataClean) may originally have been HTML
        # "&nbsp;" entities that were lost when this snippet was pasted --
        # confirm against the live page markup before relying on the split.
        parts = detail.strip().replace("/", "").split(" ")
        if parts == [""]:
            return direct, actors, year, country, genre

        genre = parts[-1]
        if len(parts) >= 2:
            country = parts[-2]

        if len(parts) == 4:
            # Layout: "<director> <actors><br><year> <country> <genre>".
            direct = parts[0].split(":")[-1].strip()
            castAndYear = parts[1].split("<br>")
            year = castAndYear[-1].strip()
            actors = castAndYear[0].split(":")[-1]
        elif "主演" not in parts[0]:
            # Layout without an actors section: "<director><br><year> ...".
            segments = parts[0].split("<br>")
            labelAndName = segments[0].split(":")
            if len(labelAndName) > 1:
                direct = labelAndName[1].strip()
            if len(segments) > 1:
                year = segments[1].strip()

        # A year longer than four characters carries extra text; keep only
        # the last run of digits when there is one.
        if len(year) > 4:
            digits = self._DIGITS_RE.findall(year)
            if digits:
                year = digits[-1]
        return direct, actors, year, country, genre

    def dataClean(self, html: str):
        """Extract one row per movie from the HTML of a list page.

        Args:
            html: HTML text of a list page.

        Returns:
            A list of 11-item lists of strings ordered as: Chinese name,
            English name, alias, director, actors, year, country, genre,
            rating, number of ratings, quote. Fields that cannot be found
            hold ``MISSING``.
        """
        result = []
        # Newlines are removed so "." in the patterns can span what were
        # separate lines of markup.
        html = html.replace("\n", "")
        for info in self._ITEM_RE.findall(html):
            # Names: the first title span is the Chinese name; a second one,
            # when present, is treated as the English name.
            names = self._TITLE_RE.findall(info)
            chineseName = names[0] if names else self.MISSING
            if len(names) == 2:
                englishName = names[1].replace("/", "").strip().replace(" ", "")
            else:
                englishName = self.MISSING
            alias = self._firstMatch(self._OTHER_RE, info)
            if alias is None:
                otherName = self.MISSING
            else:
                otherName = alias.strip("/").replace(" ", "")

            # Rating and number of people who rated.
            star = self._firstMatch(self._RATING_RE, info, self.MISSING)
            starPeople = self.Format(
                self._firstMatch(self._VOTES_RE, info, self.MISSING)
            )

            # One-line quote; not every entry has one.
            quote = self._firstMatch(self._QUOTE_RE, info, self.MISSING)

            # Director / actors / year / country / genre paragraph.
            direct, actors, year, country, genre = self._parseDetails(
                self._firstMatch(self._DETAIL_RE, info, "")
            )

            result.append([chineseName, englishName, otherName,
                           direct, actors, year, country, genre,
                           star, starPeople, quote])
        return result

    def fileCreate(self):
        """Create (or truncate) the output file and write the header line.

        Raises:
            PermissionError: If the file cannot be opened or written; the
                underlying ``OSError`` is chained as the cause.
        """
        head = '中文名,英文名,别名,导演,演员,上映年份,上映国家,类型,评分,评价人数,语录\n'
        try:
            with open(self.filename, 'w', encoding='utf-8') as f:
                f.write(head)
        except OSError as err:
            raise PermissionError(f'写入文件失败,检查文件{self.filename}是否已被其他软件打开。') from err

    def dataSave(self, datas: list):
        """Append rows to the output file, every field wrapped in double quotes.

        Args:
            datas: Iterable of rows, each an iterable of field values.
        """
        with open(self.filename, 'a', encoding='utf-8') as f:
            # csv doubles any embedded double quote, which the previous
            # hand-built join did not. "\n" as terminator (with the file's
            # default newline handling) keeps line endings the same as the
            # header written by fileCreate.
            writer = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator="\n")
            writer.writerows(datas)

    def Format(self, num):
        """Return a count as text, abbreviating 5+ character counts in units of 万.

        Args:
            num: The count, as a string of digits or a number.

        Returns:
            ``str(num)`` when it has at most four characters or cannot be
            converted to an integer; otherwise the value divided by 10000,
            rounded to one decimal place, followed by "万".
        """
        text = str(num)
        if len(text) <= 4:
            # Short counts are returned unchanged (previously None, which
            # broke the later string join in dataSave).
            return text
        try:
            count = int(num)
        except (TypeError, ValueError):
            return text
        return str(round(count / 10000, 1)) + "万"

    def main(self, pages=10):
        """Write the header, then fetch, parse and save ``pages`` list pages.

        Args:
            pages: Number of pages to download; the start offset advances
                by 25 for each page.
        """
        self.fileCreate()
        for i in range(pages):
            print(f"正在爬取第{i+1:>3}/{pages:<3}页")
            html = self.dataGet(i*25)
            data = self.dataClean(html)
            self.dataSave(data)
        print("爬取成功!")
if __name__ == '__main__':
    # When run as a script, scrape with the default output file and page count.
    scraper = DouBan()
    scraper.main()