Spider code:

redis_key = 'xinfang_detail'
# rules = (
#     Rule(LinkExtractor(allow=r'fang\.anjuke\.com\/loupan\/\d+\.html'), callback='parse_item', follow=True),
# )

def parse_item(self, response):
    print("*" * 20 + "Start crawling " + response.url)
    item = XinfangItem()
    # Property name
    item['title'] = response.xpath('//*[@id="container"]/div[1]/div[2]/div[1]/div/div/div[1]/h1/text()').extract_first()
    # Price
    item['price'] = response.xpath('//*[@id="container"]/div[1]/div[2]/div[1]/dl/dd[1]/p/em/text()').extract_first()
    # Address
    yield item
Error output:
2021-05-14 10:38:46 [scrapy.core.scraper] ERROR: Spider error processing <GET https://ly.fang.anjuke.com/loupan/448966.html> (referer: https://ly.fang.anjuke.com/loupan/437877.html)
Traceback (most recent call last):
  File "d:\desktop\anjuke1.0\venv\lib\site-packages\twisted\internet\defer.py", line 662, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "d:\desktop\anjuke1.0\venv\lib\site-packages\scrapy\spiders\crawl.py", line 105, in _callback
    rule = self._rules[response.meta['rule']]
IndexError: list index out of range
This IndexError means a list index is out of range, but I don't see any list access in the code you posted. Could you paste the complete code?
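For reference, the list access in the traceback happens inside Scrapy's own CrawlSpider code: _callback in scrapy/spiders/crawl.py looks up self._rules[response.meta['rule']]. CrawlSpider records the index of the Rule that produced a request in that request's meta, so if a spider whose rules tuple is empty (as in the excerpt above, where rules is commented out) ends up handling a response whose request carries such an index, the lookup fails with exactly this IndexError. A minimal sketch of the mechanism, with the index value assumed for illustration:

# Minimal illustration of the lookup from the traceback -- not project code.
rules = ()                    # rules commented out, as in the excerpt above
_rules = list(rules)          # CrawlSpider compiles an empty rule list

meta = {'rule': 0}            # assumed: a request that was generated from a Rule,
                              # so its meta still records the rule's index

rule = _rules[meta['rule']]   # IndexError: list index out of range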
# Master node
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
from xinfang.items import XinfangItem
from xinfang.utils.InsertRedis import inserintota


class XfSpider(RedisCrawlSpider):
    name = 'xf'
    redis_key = 'start_urls'
    rules = (
        Rule(LinkExtractor(allow=r'fang\.anjuke\.com\/loupan\/\d+\.html'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print(response.url)
        url = response.url
        if url:
            inserintota(url, 2)
            print('[success] the detail link ' + url + ' is inserted into the redis queue')
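inserintota is imported from xinfang.utils.InsertRedis but isn't shown in the thread. Judging from the call inserintota(url, 2) and the success message, it presumably pushes the detail-page URL into a Redis list; a purely hypothetical sketch of such a helper (connection settings, the meaning of the second argument, and the key name are all assumptions):

# Hypothetical sketch of xinfang/utils/InsertRedis.py -- NOT the asker's actual helper.
import redis

_pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)  # assumed local Redis

def inserintota(url, num):
    """Push a crawled URL into a Redis list selected by num (assumed mapping)."""
    r = redis.Redis(connection_pool=_pool)
    if num == 2:
        # detail-page links, to be consumed by the slave spider's redis_key queue
        r.lpush('xinfang_detail', url)

One thing to keep in mind with this pattern: a list of plain URL strings is what scrapy-redis expects under a spider's redis_key (its start-URL queue), whereas the scheduler's serialized request queue (the kind of key the commented-out 'xinfang_detail:requests' below suggests) stores pickled Request objects, which is a different format.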
# Slave node
from scrapy_redis.spiders import RedisCrawlSpider
from xinfang.items import XinfangItem


class XfSpider(RedisCrawlSpider):
    name = 'xf'
    # allowed_domains = ['anjuke.com']
    # start_urls = ['http://anjuke.com/',
    #               'https://jiao.fang.anjuke.com/?from=navigation',
    #               'https://zz.fang.anjuke.com/?from=navigation',
    #               'https://kf.fang.anjuke.com/?from=navigation',
    #               'https://ly.fang.anjuke.com/?from=navigation',
    #               'https://pds.fang.anjuke.com/?from=navigation',
    #               'https://ay.fang.anjuke.com/?from=navigation',
    #               'https://hb.fang.anjuke.com/?from=navigation',
    #               'https://xx.fang.anjuke.com/?from=navigation',
    #               'https://py.fang.anjuke.com/?from=navigation',
    #               'https://xc.fang.anjuke.com/?from=navigation',
    #               'https://lh.fang.anjuke.com/?from=navigation',
    #               'https://smx.fang.anjuke.com/?from=navigation',
    #               'https://ny.fang.anjuke.com/?from=navigation',
    #               'https://sq.fang.anjuke.com/?from=navigation',
    #               'https://xiny.fang.anjuke.com/?from=navigation',
    #               'https://zk.fang.anjuke.com/?from=navigation',
    #               'https://zmd.fang.anjuke.com/?from=navigation'
    #               ]
    # Parse the pages downloaded from start_urls.
    # Parsing a page serves two purposes:
    #   1. extract the next-page URL and hand it to the scheduler as the spider's next request
    #   2. extract the detail-page URLs, which are then parsed further
    # redis_key = 'xinfang_detail:requests'
    redis_key = 'xinfang_detail'
    # rules = (
    #     Rule(LinkExtractor(allow=r'fang\.anjuke\.com\/loupan\/\d+\.html'), callback='parse_item', follow=True),
    # )
    def parse_item(self, response):
        print("*" * 20 + "Start crawling " + response.url)
        item = XinfangItem()
        # Property name
        item['title'] = response.xpath('//*[@id="container"]/div[1]/div[2]/div[1]/div/div/div[1]/h1/text()').extract_first()
        # Price
        item['price'] = response.xpath('//*[@id="container"]/div[1]/div[2]/div[1]/dl/dd[1]/p/em/text()').extract_first()
        # Address
        item['address'] = response.xpath('//*[@id="container"]/div[1]/div[2]/div[1]/dl/dd[5]/a[1]/text()').extract_first()
        # City
        item['city'] = response.xpath('//*[@id="header"]/div[2]/div[1]/a[2]/text()').extract_first()
        # Sales phone; virtual numbers expire, so the real number is not considered
        item['phone'] = response.xpath('//*[@id="phone_show_soj"]/p/strong/text()').extract_first()
        # Opening date
        item['open_quotation_date'] = response.xpath(
            '//*[@id="container"]/div[1]/div[2]/div[1]/dl/dd[2]/span/text()').extract_first()
        # Delivery date
        item['delivery_date'] = response.xpath('//*[@id="container"]/div[1]/div[2]/div[1]/dl/dd[3]/span/text()').extract_first()
        yield item
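XinfangItem itself isn't posted; reconstructed from the fields assigned in parse_item above, the items.py would presumably look roughly like this (a sketch, not the asker's actual file):

# Hypothetical xinfang/items.py, reconstructed from the fields used above.
import scrapy

class XinfangItem(scrapy.Item):
    title = scrapy.Field()                # property name
    price = scrapy.Field()                # price
    address = scrapy.Field()              # address
    city = scrapy.Field()                 # city
    phone = scrapy.Field()                # sales phone
    open_quotation_date = scrapy.Field()  # opening date
    delivery_date = scrapy.Field()        # delivery date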