import urllib.request
import jsonpath
import json
import re
from lxml import etree
from bs4 import BeautifulSoup
import urllib.parse
class Spider():
    """Scrape film titles and scores from the ssr1.scrape.center list pages.

    The page range is read interactively from stdin when the spider is
    constructed. Each parsed page is written to ``films.json``.
    """

    def __init__(self):
        """Prompt for the first and last page numbers and set the URL prefix.

        Raises:
            ValueError: if either answer is not a valid integer.
        """
        self.begin_page = int(input('请输入起始页:'))
        self.end_page = int(input('请输入终止页:'))
        self.base_url = "https://ssr1.scrape.center/page/"

    def load_page(self, page):
        """Download one list page and return its HTML decoded as UTF-8.

        Args:
            page: page number; it is appended to ``self.base_url``.

        Returns:
            The response body as a ``str``.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
        }
        url = self.base_url + str(page)
        request = urllib.request.Request(url, headers=headers)
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(request) as response:
            return response.read().decode("utf-8")

    # Alternative parser using lxml (kept for reference)
    # def parse_page(self, html):
    #     root = etree.HTML(html)
    #     names = root.xpath("//h2[@class='m-b-sm']/text()")
    #     score = root.xpath("//p[@class='score m-t-md m-b-n-sm']/text()")
    #     items = []
    #     for i in range(0, len(names)):
    #         item = {}
    #         item["电影名"] = names[i]
    #         item['评分'] = score[i].strip()
    #         items.append(item)
    #     print(items)
    #     self.save_file(items)

    # Alternative parser using re (kept for reference)
    # def parse_page(self, html):
    #     # findall takes two arguments: the pattern, then the text to search
    #     names_list = re.findall(r'class="m-b-sm">(.*?)</h2>',html)
    #     print(len(names_list))
    #     score_list = re.findall(r'class="score m-t-md m-b-n-sm">\n(.*?)</p>',html)
    #     print(score_list)
    #     items = []
    #     for i in range(0,len(names_list)):
    #         item = {}
    #         item["电影名"] = names_list[i]
    #         item['评分'] = score_list[i].strip()
    #         items.append(item)
    #     print(items)
    #     self.save_file(items)

    # Parser using bs4
    def parse_page(self, html):
        """Extract title/score pairs from a list page and save them.

        Args:
            html: HTML text of one list page.

        Returns:
            A list of dicts with the keys ``'电影名'`` and ``'评分'``, both
            plain stripped strings. The same list is printed and passed to
            ``save_file``.
        """
        soup = BeautifulSoup(html, 'html.parser')
        # NOTE(review): the [1:] slices skip the first match of each selector,
        # exactly as the original code did — confirm this is intended.
        names = soup.find_all('h2', class_="m-b-sm")[1:]
        scores = soup.find_all('p', class_="score m-t-md m-b-n-sm")[1:]
        # find_all yields bs4 Tag objects, which json cannot serialize, so
        # only the stripped text of each tag is stored.
        items = [
            {'电影名': name.text.strip(), '评分': score.text.strip()}
            for name, score in zip(names, scores)
        ]
        print(items)
        self.save_file(items)
        return items

    def save_file(self, items):
        """Write ``items`` to ``films.json`` as indented UTF-8 JSON.

        Args:
            items: a JSON-serializable object (here, a list of dicts).
        """
        # NOTE(review): mode 'w' replaces the file on every call, so when
        # several pages are parsed only the last page's items remain.
        with open('films.json', 'w', encoding='UTF-8') as fh:
            json.dump(items, fh, ensure_ascii=False, indent=4)
# Script entry point: fetch and parse every page in the requested range.
if __name__ == '__main__':
    spider = Spider()
    # end_page is inclusive, hence the + 1.
    for page in range(spider.begin_page, spider.end_page + 1):
        return_html = spider.load_page(page)
        spider.parse_page(return_html)
问题在于:代码中 item 的值是 BeautifulSoup 的 Tag(标签)对象,而不是字符串,json.dump 无法将其序列化。把 parse_page 中的循环改写为取标签文本即可:
# Store the stripped text of each tag rather than the Tag object itself,
# so that the resulting items can be serialized by json.
for i in range(0, len(name)):
    item = {}
    item['电影名'] = name[i].text.strip()
    item['评分'] = score[i].text.strip()
    items.append(item)
[{'电影名': '少年派的奇幻漂流 - Life of Pi', '评分': '8.9'},
 {'电影名': '美丽心灵 - A Beautiful Mind', '评分': '8.8'},
 {'电影名': '初恋这件小事 - สิ่งเล็กเล็กที่เรียกว่า...รัก', '评分': '8.9'},
 {'电影名': '借东西的小人阿莉埃蒂 - 借りぐらしのアリエッティ', '评分': '8.8'},
 {'电影名': '一一 - Yi yi: A One and a Two', '评分': '8.8'},
 {'电影名': '美丽人生 - La vita è bella', '评分': '9.1'},
 {'电影名': "海上钢琴师 - La leggenda del pianista sull'oceano", '评分': '9.1'},
 {'电影名': '千与千寻 - 千と千尋の神隠し', '评分': '9.1'},
 {'电影名': '迁徙的鸟 - The Travelling Birds', '评分': '9.1'}]
如对你有帮助,请点击采纳按钮。