爬取到的数据没有被 pipelines 文件保存下来,但是管道(ITEM_PIPELINES)已经开启了
pipelines文件
import json
class ItbearPipeline:
    """Persist each scraped item as one JSON line in ./house.csv.

    Bug fixed: in the original paste, ``open_spider``, ``process_item`` and
    ``close_spider`` were all nested inside an extra outer
    ``def process_item(self, item, spider):`` — making them local functions
    that Scrapy never sees (and the outer process_item silently returned
    ``None``). That is why the pipeline appeared to be "skipped" even though
    ITEM_PIPELINES was enabled. The three hooks are now real class methods.
    """

    # Output file handle; opened in open_spider, closed in close_spider.
    f = None

    def open_spider(self, spider):
        # Called once when the spider starts: open the output file.
        print("开始")
        # NOTE(review): the file is named .csv but receives JSON lines —
        # confirm the extension is intentional.
        self.f = open("./house.csv", mode='w', encoding="utf-8")

    def process_item(self, item, spider):
        # Serialize the item to one JSON line; ensure_ascii=False keeps
        # Chinese text readable in the output file.
        item = dict(item)
        item_str = json.dumps(item, ensure_ascii=False) + "\n"
        self.f.write(item_str)
        # Return the item so any lower-priority pipelines still receive it.
        return item

    def close_spider(self, spider):
        # Called once when the spider finishes: release the file handle.
        print("完毕")
        self.f.close()
settings文件
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
ITEM_PIPELINES = {
'itbear.pipelines.ItbearPipeline': 300,
}
.py文件
```python
import scrapy
from itbear.items import ItbearItem
class IbearSpider(scrapy.Spider):
    """Crawl the TechWeb news listing and yield one item per article."""

    name = 'ibear'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.techweb.com.cn/news/']

    def parse(self, response):
        """Collect article links from the listing page and follow each."""
        entries = response.xpath('//div[@class="list_con"]/div[@class="picture_text"]')
        for entry in entries:
            detail_url = entry.xpath('./div[@class="text"]/a/@href').extract_first()
            # Visit the article page; field extraction happens in parse_detail.
            yield scrapy.Request(detail_url, callback=self.parse_detail)

    def parse_detail(self, response):
        """Extract title, publish time and body text into an ItbearItem."""
        item = ItbearItem()
        item['name'] = response.xpath(
            '//div[@class="content"]/div[@class="main_c"]/h1/text()').extract_first()
        item['date'] = response.xpath(
            '//div[@class="content"]/div[@class="main_c"]/div[@class="article_info"]/div[@class="infos"]/span['
            '@class="time"]/text()').extract_first()
        # NOTE(review): extract_first() keeps only the first <p> of the body —
        # confirm whether the full article text was wanted.
        item['content'] = response.xpath(
            '//div[@class="content"]/div[@class="main_c"]/div[@id="content"]/p/text()').extract_first()
        yield item
运行结果 直接略过了pipelines文件
你在 parse 函数的 for 循环里面加一句 print,看看循环是否真的执行到了