问题遇到的现象和发生背景 通过拼接网址,构造跳转网页函数,但就是不能爬取后面几页的内容
问题相关代码,请勿粘贴截图
import scrapy
import json
import requests
import re
from ..items import TencentNewsItem
from scrapy.http import Request
from lxml import etree
class NewsSpider(scrapy.Spider):
name = 'news'
allowed_domains = ['qq.com']
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537..36 Edg/91.0.864.37'}
start_urls = ['https://i.news.qq.com/trpc.qqnews_web.kv_srv.kv_srv_http_proxy/list?sub_srv_id=24hours&srv_id=pc&offset=20&limit=20&strategy=1&ext={%22pool%22:[%22top%22],%22is_filter%22:7,%22check_type%22:true}']
def parse(self, response):
item = TencentNewsItem()
data = response.body.decode("utf-8")
pat1 = '"title":"(.*?)"'
pat2 = '"url":"(.*?)"'
pat3 = '"tag_word":"(.*?)"'
pat4 = '"update_time":"(.*?)"'
pat5 = '"comment_num":(.*?),'
pat6 = '"media_name":"(.*?)"'
pat7 = '"sub_category_cn":"(.*?)"'
pat8 = '"category_cn":"(.*?)"'
item["title"] = re.compile(pat1).findall(data)
item["url"] = re.compile(pat2).findall(data)
item["tag_word"] = re.compile(pat3).findall(data)
item["update_time"] = re.compile(pat4).findall(data)
item["comment_num"] = re.compile(pat5).findall(data)
item["media_name"] = re.compile(pat6).findall(data)
item["sub_category_cn"] = re.compile(pat7).findall(data)
item["category_cn"] = re.compile(pat8).findall(data)
yield item
for i in range(0,10):
urls = ['https://i.news.qq.com/trpc.qqnews_web.kv_srv.kv_srv_http_proxy/list?sub_srv_id=24hours&srv_id=pc&offset=' + str(
i * 20) + '&limit=20&strategy=1&ext={%22pool%22:[%22top%22],%22is_filter%22:7,%22check_type%22:true}']
# urls = ['https://i.news.qq.com/trpc.qqnews_web.kv_srv.kv_srv_http_proxy/list?sub_srv_id=24hours&srv_id=pc&offset={}&limit=20&strategy=1&ext='.format(str(i)) + '{%22pool%22:[%22top%22],%22is_filter%22:2,%22check_type%22:true}']
# '{%22pool%22:[%22top%22],%22is_filter%22:2,%22check_type%22:true}'
for url in urls:
yield Request(url, callback=self.parse)
运行结果及报错内容
我的解答思路和尝试过的方法
我想要达到的结果