我是新手,完全不懂,想爬取豆瓣电影 Top 250。

程序运行时一直报错,提示模块有问题。我在百度上搜索过,但没有找到相同问题的答案;代码是一直照着别人的教程写的,可就是运行不起来。

提示模块有问题,通常说明运行环境没有搭建好(例如代码里用到的 requests 库还没有安装)。

试试下面这段代码,已测试通过!

import csv
import re

import requests

class DouBan:
    """Scrape the Douban movie Top 250 list into a comma-separated text file.

    Each list page is downloaded with ``requests``, every movie entry is
    picked apart with regular expressions, and the resulting rows are
    appended to ``filename`` as fully quoted CSV records.
    """

    # Compiled once here rather than once per movie inside the parsing loop.
    _ITEM_RE = re.compile(r'<div class="info">.*?</li>')
    _TITLE_RE = re.compile(r'<span class="title">(.*?)</span>')
    _OTHER_RE = re.compile(r'<span class="other">(.*?)</span>')
    _RATING_RE = re.compile(r'<span class="rating_num".*?>(.*?)</span>')
    _VOTES_RE = re.compile(r'<span>(\d+)人评价</span>')
    _QUOTE_RE = re.compile(r'<span class="inq">(.*?)</span>')
    _DETAIL_RE = re.compile(r'<p class="">(.*?)</p>')
    _DIGITS_RE = re.compile(r'[0-9]+')

    # Placeholder written for any field that is missing from the page.
    _MISSING = "-"
    # Seconds to wait for the server before a request is abandoned.
    _TIMEOUT = 10

    def __init__(self, filename='movie.txt'):
        """Store the request settings and the output path.

        Args:
            filename: Path of the text file that receives the scraped rows.
        """
        # "{}" is filled with the zero-based offset of the first movie on a page.
        self.url = "https://movie.douban.com/top250?start={}&filter="
        # A desktop-browser User-Agent is sent with every request.
        self.headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"}
        self.filename = filename

    def dataGet(self, start):
        """Download one list page and return its HTML text.

        Args:
            start: Offset of the first movie on the requested page.

        Returns:
            The response body decoded as text.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.
            requests.Timeout: If no answer arrives within ``_TIMEOUT`` seconds.
        """
        response = requests.get(self.url.format(start), headers=self.headers,
                                timeout=self._TIMEOUT)
        # Fail loudly on an error page instead of parsing it into zero rows.
        response.raise_for_status()
        return response.text

    @staticmethod
    def _first(pattern, text, default="-"):
        """Return the first capture of *pattern* in *text*, or *default*."""
        match = pattern.search(text)
        return match.group(1) if match else default

    @staticmethod
    def _strip_separator(text):
        """Drop ``&nbsp;`` entities and the leading "/" separator of a name."""
        # Entities must go first: the "/" only becomes the leading character
        # once the "&nbsp;" in front of it has been removed.
        return text.replace("&nbsp;", "").strip().strip("/").strip()

    def _parse_detail(self, detail):
        """Split the credits paragraph into its five fields.

        Args:
            detail: Inner HTML of the ``<p class="">`` element of one movie.

        Returns:
            A ``(director, actors, year, country, genre)`` tuple of strings;
            a field that cannot be found is reported as ``"-"``.
        """
        direct = actors = year = country = genre = self._MISSING
        parts = detail.strip().replace("/", "").split("&nbsp;&nbsp;")
        if len(parts) >= 2:
            country = parts[-2].strip() or self._MISSING
            genre = parts[-1].strip() or self._MISSING

        if len(parts) == 4:
            # Layout: director | actors<br>year | country | genre
            direct = parts[0].split(":")[-1].strip()
            lines = parts[1].split("<br>")
            year = lines[-1].strip()
            actors = lines[0].replace("&nbsp;", "").split(":")[-1].strip()
        else:
            # Director and actors (if any) share the first part with the year.
            lines = parts[0].split("<br>")
            credit = lines[0]
            if len(lines) > 1:
                year = lines[1].strip()
            if "主演" in credit:
                director_text, _, actor_text = credit.partition("主演")
                direct = director_text.replace("&nbsp;", "").split(":")[-1].strip()
                actors = actor_text.split(":")[-1].strip()
            else:
                direct = credit.split(":")[-1].strip()

        if len(year) > 4:
            # Several release dates may be listed; keep the last number found.
            digits = self._DIGITS_RE.findall(year)
            if digits:
                year = digits[-1]
        return (direct or self._MISSING, actors or self._MISSING,
                year or self._MISSING, country, genre)

    def dataClean(self, html:str):
        """Extract one row of fields for every movie found in *html*.

        Args:
            html: HTML text of one list page.

        Returns:
            A list of rows. Each row is a list of eleven strings in the order
            Chinese name, English name, alias, director, actors, year,
            country, genre, rating, number of ratings, quote. Missing fields
            are ``"-"``; entries without any title are skipped.
        """
        result = []
        # The patterns do not match across newlines, so flatten the page first.
        html = html.replace("\n", "")
        for info in self._ITEM_RE.findall(html):
            names = self._TITLE_RE.findall(info)
            if not names:
                continue
            chinese_name = names[0]
            if len(names) == 2:
                english_name = self._strip_separator(names[1])
            else:
                english_name = self._MISSING
            other_name = self._strip_separator(self._first(self._OTHER_RE, info))

            star = self._first(self._RATING_RE, info)
            star_people = self.Format(self._first(self._VOTES_RE, info))
            quote = self._first(self._QUOTE_RE, info)

            direct, actors, year, country, genre = self._parse_detail(
                self._first(self._DETAIL_RE, info, ""))

            result.append([chinese_name, english_name, other_name,
                           direct, actors, year, country, genre,
                           star, star_people, quote])
        return result

    def fileCreate(self):
        """Create (or truncate) the output file and write the header line.

        Raises:
            PermissionError: If the file cannot be opened or written; the
                underlying ``OSError`` is attached as the cause.
        """
        head = '中文名,英文名,别名,导演,演员,上映年份,上映国家,类型,评分,评价人数,语录\n'
        try:
            with open(self.filename,'w',encoding='utf-8') as f:
                f.write(head)
        except OSError as err:
            raise PermissionError(f'写入文件失败,检查文件{self.filename}是否已被其他软件打开。') from err

    def dataSave(self, datas:list):
        """Append *datas* to the output file as fully quoted CSV rows.

        Args:
            datas: Rows as returned by ``dataClean``.
        """
        # The file stays in default text mode with a "\n" terminator so the
        # rows end the same way as the header written by ``fileCreate``.
        with open(self.filename,'a',encoding='utf-8') as f:
            writer = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator='\n')
            # csv doubles embedded quotes, which plain string joining did not.
            writer.writerows(datas)

    def Format(self, num):
        """Return *num* as text, abbreviating values above four digits.

        Args:
            num: A count given as an int or a string of digits.

        Returns:
            ``"<n>万"`` (units of ten thousand, one decimal place) when *num*
            has more than four characters, otherwise *num* as a string.
        """
        text = str(num)
        if len(text) > 4:
            return str(round(int(num)/10000, 1))+"万"
        # Short values are returned as-is so the caller always gets a string.
        return text

    def main(self, pages=10):
        """Scrape *pages* list pages and save every row to the output file.

        Args:
            pages: Number of pages to fetch; page ``i`` starts at offset
                ``i * 25``.
        """
        self.fileCreate()
        for i in range(pages):
            print(f"正在爬取第{i+1:>3}/{pages:<3}页")
            html = self.dataGet(i*25)
            data = self.dataClean(html)
            self.dataSave(data)
        print("爬取成功!")


if __name__ == '__main__':
    # Script entry point: scrape with the default output file and page count.
    DouBan().main()