from scrapy.pipelines.images import ImagesPipeline
import scrapy
import random
class MvtpPipeline:
f = None
def open_spider(self, spider):
self.f = open("mvtp.txt", "a", encoding='utf8')
def process_item(self, item, spider):
if item.__class__.__name__ == "MvtpItem":
base_title = item['base_title']
detail_url = item['detail_url']
print(base_title, detail_url)
self.f.write(base_title +"; " + detail_url + '\n')
else:
base_in_title = item['base_in_title']
image_url = item['image_url']
print(base_in_title, image_url)
self.f.write(base_in_title +"; " + image_url + '\n')
return item
def close_spider(self, spider):
self.f.close()
class ImagePipeline(ImagesPipeline):
user_agent_list = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
]
headers = random.choice(user_agent_list)
def get_media_requests(self, item, info):
if item.__class__.__name__ == "MvtpItemDetail":
print(item['image_url'])
yield scrapy.Request(item['image_url'], headers=self.headers)
def file_path(self, request, response=None, info=None, *, item=None):
img_name = request.url.split('/')[-1]
return img_name
def item_completed(self, results, item, info):
print(results)
return item
捣鼓了半天也解决不了!!能保存txt文件 , 图片始终不能保存到本地电脑。
下面的是item类
import scrapy
class MvtpItem(scrapy.Item):
# define the fields for your item here like:
base_title = scrapy.Field()
detail_url = scrapy.Field()
pass
class MvtpItemDetail(scrapy.Item):
# define the fields for your item here like:
base_in_title = scrapy.Field()
image_url = scrapy.Field()
pass
下面的是setting设置
BOT_NAME = 'mvtp'
SPIDER_MODULES = ['mvtp.spiders']
NEWSPIDER_MODULE = 'mvtp.spiders'
LOG_LEVEL = "WARNING"
RETRY_ENABLED = False
DOWNLOAD_TIMEOUT = 2
IMAGES_STORE = './sexlady'
ITEM_PIPELINES = {
'mvtp.pipelines.MvtpPipeline': 301,
'scrapy.pipelines.images.ImagesPipeline':300,
}