爬取豆瓣250,运行错误。请问这个错误怎么破?


import requests
import re
import os
from hashlib import md5
from requests.exceptions import RequestException

def get_page(url):#请求并获取豆瓣250
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def get_parse(html):#进行解析
    parse = re.compile('<li.*?div.*?"item".*?<a.*?img.*?src="(.*?)"'
                       '.*?div.*?"info".*?span.*?"title">(.*?)</span>'
                       '.*?div.*?"bd".*?p.*?>(.*?)<br>', re.S)
    parse_over = re.findall(parse, html)
    for item in parse_over:
        yield {
            "jpg": item[0].strip(),
            "title": item[1].strip(),
            "director": item[2].strip()
        }


def down_photo(photos_url):#下载图片
    if not os.path.exists('photo'):#查看当前路径下是否有这个文件
        os.mkdir('photo')#创建文件夹
    for photo_url in photos_url:
        try:
            photo = requests.get(photo_url)
            if photo.status_code == 200:
                name = 'photo' + os.path.sep + md5(photo.content).hexdigest() + '.jpg'
                with open(name, 'wb+') as f:
                    f.write(photo.content)
        except Exception:
            return None


if __name__ == '__main__':
    for num in range(0, 250, 25):
        photos_url = []
        url = "https://movie.douban.com/top250?start=" + str(num)
        html = get_page(url)
        items = get_parse(html)
        for item in items:
            photos_url.append(item["jpg"])
            print(item)
        down_photo(photos_url)

img

这是代码运行后出的错误,我也不知道是不是少了哪些模块??有人能讲解一下吗

你22行parse_over = re.findall(parse, html)中 parse匹配的是字符串,而html是bytes型只需要把html转换成string ,这样试试

parse_over = re.findall(parse, str(html))

你的代码没有加请求头 所以被反爬了,给你修改了下代码,如果对你有帮助希望点个右上角“采纳”谢谢!

import requests
import re
import os
from hashlib import md5
from requests.exceptions import RequestException
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
def get_page(url):#请求并获取豆瓣250
    try:
        response = requests.get(url,headers=headers)
        print("status_code:",response.status_code)
        if response.status_code == 200:            
            return response.text
        return None
    except RequestException:
        return None
 
 
def get_parse(html):#进行解析
    parse = re.compile('<li.*?div.*?"item".*?<a.*?img.*?src="(.*?)"'
                       '.*?div.*?"info".*?span.*?"title">(.*?)</span>'
                       '.*?div.*?"bd".*?p.*?>(.*?)<br>', re.S)
    parse_over = re.findall(parse, str(html))
    for item in parse_over:
        yield {
            "jpg": item[0].strip(),
            "title": item[1].strip(),
            "director": item[2].strip()
        }
 
 
def down_photo(photos_url):#下载图片
    if not os.path.exists('photo'):#查看当前路径下是否有这个文件
        os.mkdir('photo')#创建文件夹
    for photo_url in photos_url:
        try:
            photo = requests.get(photo_url)
            if photo.status_code == 200:
                name = 'photo' + os.path.sep + md5(photo.content).hexdigest() + '.jpg'
                with open(name, 'wb+') as f:
                    f.write(photo.content)
        except Exception:
            return None
 
 
if __name__ == '__main__':
    for num in range(0, 250, 25):
        print(num)
        photos_url = []
        url = "https://movie.douban.com/top250?start=" + str(num)
        html = get_page(url)
        print(html)
        items = get_parse(html)
        for item in items:
            photos_url.append(item["jpg"])
            print(photos_url)
            print(item)
        down_photo(photos_url)