Scrapy框架在meta传值时少了一组结果,什么导致的?

问题:

img

img

爬取网站:http://www.52jingsai.com/bisai/keji/index.php?jsstatus=2&jssort=0
目的:按照标签爬取每个竞赛的信息
spider代码:

class CsSpider(scrapy.Spider):
    """Crawl the 52jingsai listing in two hops: level links, then sort tabs.

    A ``CompetitionsItem`` is forwarded between callbacks through
    ``Request.meta`` so the last callback sees both labels together.
    """

    name = 'cs'
    allowed_domains = ['52jingsai.com']
    start_urls = ['http://www.52jingsai.com/bisai/keji/index.php?jsstatus=2&jssort=0']

    def parse(self, response):
        """Request every level link, carrying its stripped text as ``competition_level``."""
        # The slice drops the first two <li> entries of the list.
        level_nodes = response.xpath('//div[@class="js"]/div[2]/ul/li')[2:]
        hrefs = [node.xpath('.//a/@href').get() for node in level_nodes]
        labels = [node.xpath('.//a/text()').get().strip() for node in level_nodes]
        for href, label in zip(hrefs, labels):
            seed = CompetitionsItem(competition_level=label)
            yield scrapy.Request(
                url=href,
                callback=self.order_parse,
                meta={'item': deepcopy(seed)},
            )

    def order_parse(self, response):
        """Request every sort-tab link, recording its text as ``competitions_label``."""
        carried = response.meta.get('item')
        tab_anchors = response.xpath('//div[@class="js"]/div[3]/ul/li/a')
        hrefs = [anchor.xpath('./@href').get() for anchor in tab_anchors]
        labels = [anchor.xpath('./text()').get() for anchor in tab_anchors]
        for href, label in zip(hrefs, labels):
            carried['competitions_label'] = label
            # Deep-copy so each request keeps the label assigned in its own
            # iteration instead of sharing the object mutated above.
            # NOTE(review): these requests pass through Scrapy's default
            # duplicate filter; a tab URL that was already requested would be
            # dropped silently -- confirm whether that explains a missing row.
            yield scrapy.Request(
                url=href,
                callback=self.details,
                meta={'item': deepcopy(carried)},
            )

    def details(self, response):
        """Print the item that arrived with this response."""
        print(response.meta['item'])

这是怎么回事?第一次见,求解!

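不清楚你代码中的 CompetitionsItem 类是如何定义的,请先检查它是否声明了 competition_level 和 competitions_label 这两个字段。按如下方式编写代码即可获取竞赛标签信息。另外,如果结果仍然比预期少一组,很可能是某个请求的 URL 与此前已经发出的请求相同,被 Scrapy 默认的去重过滤器(dupefilter)丢弃了;可以给对应的 scrapy.Request 加上 dont_filter=True 再试。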

import scrapy
from copy import deepcopy
class CompetitionsItem(scrapy.Item):
    """Item carrying the two labels collected while walking the listing."""

    # Text of the level link chosen on the first page.
    competition_level = scrapy.Field()
    # Text of the sort tab chosen on the second page.
    competitions_label = scrapy.Field()

class CsSpider(scrapy.Spider):
    """Crawl the 52jingsai listing and pair every level with every sort tab.

    ``parse`` follows the level links, ``order_parse`` follows the sort tabs
    found on each level page, and ``details`` prints the resulting item. The
    item travels between callbacks through ``Request.meta``.
    """

    name = 'cs'
    allowed_domains = ['52jingsai.com']
    start_urls = [
        'http://www.52jingsai.com/bisai/keji/index.php?jsstatus=2&jssort=0']

    def parse(self, response):
        """Yield one request per level link.

        The stripped link text is stored as ``competition_level`` on a new
        ``CompetitionsItem`` that is handed to ``order_parse`` via ``meta``.
        """
        # The slice drops the first two <li> entries of the list.
        for entry in response.xpath('//div[@class="js"]/div[2]/ul/li')[2:]:
            href = entry.xpath('.//a/@href').get()
            if not href:
                # Nothing to follow; scrapy.Request would raise on this URL.
                continue
            # .get() returns None when the <a> has no text node.
            level = (entry.xpath('.//a/text()').get() or '').strip()
            yield scrapy.Request(
                # urljoin keeps absolute URLs as they are and resolves
                # relative ones against the current page.
                url=response.urljoin(href),
                callback=self.order_parse,
                meta={'item': CompetitionsItem(competition_level=level)},
            )

    def order_parse(self, response):
        """Yield one request per sort tab, adding ``competitions_label``.

        Each request receives its own copy of the incoming item, so the label
        set for one tab can never leak into another tab's request.
        """
        item = response.meta['item']
        for anchor in response.xpath('//div[@class="js"]/div[3]/ul/li/a'):
            href = anchor.xpath('./@href').get()
            if not href:
                continue
            label = anchor.xpath('./text()').get()
            tagged = deepcopy(item)
            # Strip like parse() does, but keep None when there is no text.
            tagged['competitions_label'] = label.strip() if label else label
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.details,
                meta={'item': tagged},
                # Scrapy's scheduler silently drops a request whose URL it has
                # already seen. A sort tab may point at such a URL, and that
                # level/tab combination would then never reach details().
                # details() issues no further requests, so bypassing the
                # duplicate filter here cannot cause a crawl loop.
                dont_filter=True,
            )

    def details(self, response):
        """Print the item that arrived with this response."""
        print(response.meta['item'])

#输出:
{'competition_level': '全国', 'competitions_label': '热门'}
{'competition_level': '全国', 'competitions_label': '推荐'}
{'competition_level': '国际', 'competitions_label': '热门'}
{'competition_level': '全国', 'competitions_label': '最新'}
{'competition_level': '国际', 'competitions_label': '推荐'}
{'competition_level': '国际', 'competitions_label': '最新'}
{'competition_level': '各省', 'competitions_label': '推荐'}
{'competition_level': '各省', 'competitions_label': '热门'}