import scrapy
from redis import Redis
from selenium import webdriver
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from fbsPro.items import FbsproItem
class FbsSpider(CrawlSpider):
name = 'fbs'
#allowed_domains = ['www.xxx.com']
start_urls = ['https://www.fanqiehd.com/list/?5.html']
conn=Redis(host='127.0.0.1',port=6379)
rules = (
Rule(LinkExtractor(allow=r'\?5-\d+.html'), callback='parse_item', follow=True),
)
modu_list=[]
def parse_item(self, response):
li_list=response.xpath('/html/body/div[3]/div/div[2]/div/div/ul/li')
for li in li_list:
li_href='https://www.fanqiehd.com'+li.xpath('./div[1]/a/@href').extract_first()
self.modu_list.append(li_href)
ex=self.conn.sadd('urls',li_href)
if ex ==1:
print('该数据没有爬取过,可进行爬取')
yield scrapy.Request(url=li_href,callback=self.parse_item1)
else:
print('该数据已经爬取')
yield scrapy.Request(url=li_href, callback=self.parse_item1)
def parse_item1(self,response):
name=response.xpath('/html/body/div[2]/div[1]/div/div[2]/h1/text()').extract_first()
title=response.xpath('/html/body/div[2]/div[1]/div/div[2]/p[4]/a/text()').extract_first()
print(name,title)
item = FbsproItem()
item['name'] = name
item['title']=title
yield item
pipelines中:
class FbsproPipeline:
conn=None
def openspider(self,spider):
self.conn=spider.conn
def process_item(self, item, spider):
dic={
'name':item['name'],
'title':item['title']
}
self.conn.lpush('moviedata',dic)
return item
代码运行后出现标题中的问题,是啥原因呢?