Python 用 bs4 库解析电影网站

import urllib.request
import jsonpath
import json
import re
from lxml import etree
from bs4 import BeautifulSoup
import urllib.parse

# 取消全局证书验证

import ssl

# Replace the ssl module's default HTTPS context factory with the unverified
# one, so HTTPS certificate checks are skipped for the whole process.
# NOTE(review): both names are private ssl attributes and this disables TLS
# validation globally — acceptable for a demo scraper, not for production.
ssl._create_default_https_context = ssl._create_unverified_context

class Spider:
    """Scrape film names and scores from the ssr1.scrape.center list pages.

    The page range is read interactively when the spider is constructed.
    Each parsed page is appended to ``self.items`` and the accumulated
    records are written to ``films.json`` in the current directory.
    """

    # The page number is appended to this prefix to build each list-page URL.
    BASE_URL = "https://ssr1.scrape.center/page/"

    def __init__(self):
        """Prompt for the first and last page numbers and set up crawl state.

        Raises:
            ValueError: if either answer is not an integer.
        """
        self.begin_page = int(input('请输入起始页:'))
        self.end_page = int(input('请输入终止页:'))
        self.base_url = self.BASE_URL
        # Records collected across all parsed pages; save_file() opens the
        # output in 'w' mode, so everything gathered so far is rewritten on
        # each call instead of keeping only the last page.
        self.items = []

    def load_page(self, page):
        """Download one list page and return its HTML as text.

        Args:
            page: page number appended to ``self.base_url``.

        Returns:
            The response body decoded as UTF-8.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
        }

        url = f"{self.base_url}{page}"
        request = urllib.request.Request(url, headers=headers)
        # The context manager closes the HTTP response once the body is read.
        with urllib.request.urlopen(request) as response:
            return response.read().decode("utf-8")

    # Alternative parsers kept for reference (earlier drafts of parse_page).
    #
    # lxml version:
    # def parse_page(self, html):
    #     root = etree.HTML(html)
    #     names = root.xpath("//h2[@class='m-b-sm']/text()")
    #     score = root.xpath("//p[@class='score m-t-md m-b-n-sm']/text()")
    #     items = []
    #     for i in range(0, len(names)):
    #         item = {}
    #         item["电影名"] = names[i]
    #         item['评分'] = score[i].strip()
    #         items.append(item)
    #     print(items)
    #     self.save_file(items)
    #
    # re version (findall takes the pattern first, then the text to search):
    # def parse_page(self, html):
    #     names_list = re.findall(r'class="m-b-sm">(.*?)</h2>',html)
    #     print(len(names_list))
    #     score_list = re.findall(r'class="score m-t-md m-b-n-sm">\n(.*?)</p>',html)
    #     print(score_list)
    #     items = []
    #     for i in range(0,len(names_list)):
    #         item = {}
    #         item["电影名"] = names_list[i]
    #         item['评分'] = score_list[i].strip()
    #         items.append(item)
    #     print(items)
    #     self.save_file(items)

    def parse_page(self, html):
        """Extract film records from one page with BeautifulSoup and save them.

        Args:
            html: HTML text of a list page, as returned by ``load_page``.
        """
        soup = BeautifulSoup(html, 'html.parser')
        # NOTE(review): the [1:] slices skip the first match of each query,
        # unlike the lxml/re drafts above — confirm the first <h2>/<p> really
        # is not a film entry on this site.
        name = soup.find_all('h2', class_="m-b-sm")[1:]
        score = soup.find_all('p', class_="score m-t-md m-b-n-sm")[1:]
        print(name)
        print(score)
        # Keep only the stripped text of each tag: bs4 Tag objects are not
        # JSON-serializable and would make json.dump raise TypeError.
        items = [
            {'电影名': name_tag.text.strip(), '评分': score_tag.text.strip()}
            for name_tag, score_tag in zip(name, score)
        ]
        print(items)
        self.items.extend(items)
        self.save_file(self.items)

    def save_file(self, items):
        """Write ``items`` to ``films.json`` as indented UTF-8 JSON.

        Args:
            items: JSON-serializable list of film records.
        """
        # 'with' guarantees the file is flushed and closed after the dump.
        with open('films.json', 'w', encoding='UTF-8') as out_file:
            json.dump(items, out_file, ensure_ascii=False, indent=4)

if __name__ == '__main__':
    # Constructing the spider prompts for the page range; every page from
    # begin_page through end_page (inclusive) is then fetched and parsed.
    spider = Spider()
    for page in range(spider.begin_page, spider.end_page + 1):
        return_html = spider.load_page(page)
        spider.parse_page(return_html)

问题在于 item 的值是 bs4 的 Tag 对象(带有 HTML 标签),无法被 json 序列化。把 parse_page 中的循环改成只取标签的文本即可:

# Build one record per film from the stripped tag text, so every value is a
# plain string that json can serialize.
for idx, name_tag in enumerate(name):
    items.append({
        '电影名': name_tag.text.strip(),
        '评分': score[idx].text.strip(),
    })

[{'电影名': '少年派的奇幻漂流 - Life of Pi', '评分': '8.9'}, {'电影名': '美丽心灵 - A Beautiful Mind', '评分': '8.8'},
 {'电影名': '初恋这件小事 - สิ่งเล็กเล็กที่เรียกว่า...รัก', '评分': '8.9'}, {'电影名': '借东西的小人阿莉埃蒂 - 借りぐらしのアリエッティ', '评分': '8.8'},
 {'电影名': '一一 - Yi yi: A One and a Two', '评分': '8.8'}, {'电影名': '美丽人生 - La vita è bella', '评分': '9.1'},
 {'电影名': "海上钢琴师 - La leggenda del pianista sull'oceano", '评分': '9.1'}, {'电影名': '千与千寻 - 千と千尋の神隠し', '评分': '9.1'},
 {'电影名': '迁徙的鸟 - The Travelling Birds', '评分': '9.1'}]

如对你有帮助,请点击采纳按钮。