Python Scrapy distributed crawler: error in the slave-node spider

Spider code (slave node, excerpt):

    redis_key = 'xinfang_detail'
    # rules = (
    #     Rule(LinkExtractor(allow=r'fang\.anjuke\.com\/loupan\/\d+\.html'), callback='parse_item', follow=True),
    # )
    def parse_item(self, response):
        print("*" * 20 + " start crawling " + response.url)
        item = XinfangItem()
        # community name
        item['title'] = response.xpath('//*[@id="container"]/div[1]/div[2]/div[1]/div/div/div[1]/h1/text()').extract_first()
        # price
        item['price'] = response.xpath('//*[@id="container"]/div[1]/div[2]/div[1]/dl/dd[1]/p/em/text()').extract_first()
        # address
        yield item

Error output:

2021-05-14 10:38:46 [scrapy.core.scraper] ERROR: Spider error processing <GET https://ly.fang.anjuke.com/loupan/448966.html> (referer: https://ly.fang.anjuke.com/loupan/437877.html)
Traceback (most recent call last):
  File "d:\desktop\anjuke1.0\venv\lib\site-packages\twisted\internet\defer.py", line 662, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "d:\desktop\anjuke1.0\venv\lib\site-packages\scrapy\spiders\crawl.py", line 105, in _callback
    rule = self._rules[response.meta['rule']]
IndexError: list index out of range

This IndexError means a list index went past the end of the list, but I don't see any list access in your code. Could you post the complete code?

# Master node
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider

from xinfang.items import XinfangItem
from xinfang.utils.InsertRedis import inserintota

class XfSpider(RedisCrawlSpider):
    name = 'xf'
    redis_key = 'start_urls'
    rules = (
        Rule(LinkExtractor(allow=r'fang\.anjuke\.com\/loupan\/\d+\.html'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print(response.url)
        url = response.url
        if url:
            inserintota(url, 2)
            print('[success] the detail link ' + url + ' was inserted into the redis queue')
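
The helper inserintota (from xinfang.utils.InsertRedis) isn't shown in the thread. A minimal sketch of what it presumably does, assuming the second argument selects which Redis list to push to and that queue 2 maps to the slave's redis_key -- the key mapping, host, port, and db below are all assumptions:

import redis

# Assumed connection parameters -- adjust to the real deployment.
r = redis.Redis(host='127.0.0.1', port=6379, db=0)

# Hypothetical mapping: 1 -> the master's seed list, 2 -> the slave's redis_key.
QUEUES = {1: 'start_urls', 2: 'xinfang_detail'}

def inserintota(url, num):
    # LPUSH the URL so scrapy-redis on the other node can pop it as a start URL.
    r.lpush(QUEUES[num], url)

The master itself can be seeded the same way, e.g.: redis-cli lpush start_urls https://ly.fang.anjuke.com/?from=navigation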

# Slave node
from scrapy_redis.spiders import RedisCrawlSpider

from xinfang.items import XinfangItem

class XfSpider(RedisCrawlSpider):
    name = 'xf'
    # allowed_domains = ['anjuke.com']
    # start_urls = ['http://anjuke.com/',
    #               'https://jiao.fang.anjuke.com/?from=navigation',
    #               'https://zz.fang.anjuke.com/?from=navigation',
    #               'https://kf.fang.anjuke.com/?from=navigation',
    #               'https://ly.fang.anjuke.com/?from=navigation',
    #               'https://pds.fang.anjuke.com/?from=navigation',
    #               'https://ay.fang.anjuke.com/?from=navigation',
    #               'https://hb.fang.anjuke.com/?from=navigation',
    #               'https://xx.fang.anjuke.com/?from=navigation',
    #               'https://py.fang.anjuke.com/?from=navigation',
    #               'https://xc.fang.anjuke.com/?from=navigation',
    #               'https://lh.fang.anjuke.com/?from=navigation',
    #               'https://smx.fang.anjuke.com/?from=navigation',
    #               'https://ny.fang.anjuke.com/?from=navigation',
    #               'https://sq.fang.anjuke.com/?from=navigation',
    #               'https://xiny.fang.anjuke.com/?from=navigation',
    #               'https://zk.fang.anjuke.com/?from=navigation',
    #               'https://zmd.fang.anjuke.com/?from=navigation'
    # ]
    # Parse the pages downloaded from the start URLs.
    # Parsing a page serves two purposes:
    # 1. Extract the next-page URL and pass it to the scheduler as the spider's next request.
    # 2. Extract the detail-page URLs so the detail pages can be parsed further.
    # redis_key = 'xinfang_detail:requests'
    redis_key = 'xinfang_detail'
    # rules = (
    #     Rule(LinkExtractor(allow=r'fang\.anjuke\.com\/loupan\/\d+\.html'), callback='parse_item', follow=True),
    # )
    def parse_item(self, response):
        print("*" * 20 + " start crawling " + response.url)
        item = XinfangItem()
        # community name
        item['title'] = response.xpath('//*[@id="container"]/div[1]/div[2]/div[1]/div/div/div[1]/h1/text()').extract_first()
        # price
        item['price'] = response.xpath('//*[@id="container"]/div[1]/div[2]/div[1]/dl/dd[1]/p/em/text()').extract_first()
        # address
        item['address'] = response.xpath('//*[@id="container"]/div[1]/div[2]/div[1]/dl/dd[5]/a[1]/text()').extract_first()
        # city
        item['city'] = response.xpath('//*[@id="header"]/div[2]/div[1]/a[2]/text()').extract_first()
        # sales hotline; the listed number is a short-lived virtual one, so the real number is not captured
        item['phone'] = response.xpath('//*[@id="phone_show_soj"]/p/strong/text()').extract_first()
        # sale launch date
        item['open_quotation_date'] = response.xpath(
            '//*[@id="container"]/div[1]/div[2]/div[1]/dl/dd[2]/span/text()').extract_first()
        # handover date
        item['delivery_date'] = response.xpath('//*[@id="container"]/div[1]/div[2]/div[1]/dl/dd[3]/span/text()').extract_first()

        yield item
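
A likely cause, judging from the traceback rather than the spider code itself: Scrapy's CrawlSpider tags every request a Rule generates with meta['rule'] (the index of the rule that produced it) and later resolves the callback with self._rules[response.meta['rule']] -- the exact line that fails in crawl.py. Both spiders are named 'xf', so if both nodes point at the same Redis they share the scrapy-redis scheduler queue (by default 'xf:requests'). The master serializes rule-generated requests with meta['rule'] = 0; when the slave pops one, its own rules are commented out, self._rules is empty, and indexing it raises exactly this IndexError. One possible fix is to define the same rules on the slave so the index resolves there too -- a sketch, not tested against this exact setup:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider

from xinfang.items import XinfangItem

class XfSpider(RedisCrawlSpider):
    name = 'xf'
    redis_key = 'xinfang_detail'
    # Same rules as the master node, so requests tagged with
    # meta['rule'] by either node resolve on both.
    rules = (
        Rule(LinkExtractor(allow=r'fang\.anjuke\.com\/loupan\/\d+\.html'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = XinfangItem()
        # field extraction unchanged from the original parse_item
        item['title'] = response.xpath('//*[@id="container"]/div[1]/div[2]/div[1]/div/div/div[1]/h1/text()').extract_first()
        yield item

Alternatively, giving the two spiders different names would stop them from sharing the scheduler queue; the slave could then be a plain scrapy_redis RedisSpider whose parse method handles the URLs the master pushes into 'xinfang_detail'.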
