h1 = scrapy.Field()
Suppose I want to save the downloads into an h1 folder under the current directory.
https://www.cnblogs.com/shuangzikun/p/python_taotao_scrapy_pic_mysql.html
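The spider below imports a DoubanImgsItem whose definition is not shown here; for Scrapy's built-in ImagesPipeline it would minimally need the two standard fields. A sketch of items.py under that assumption:

import scrapy


class DoubanImgsItem(scrapy.Item):
    # ImagesPipeline reads the download URLs from this field
    image_urls = scrapy.Field()
    # and stores the download results (path, url, checksum) here
    images = scrapy.Field()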
# coding=utf-8
from scrapy.spiders import Spider
from scrapy import Request

from ..items import DoubanImgsItem


class download_douban(Spider):
    name = 'download_douban'

    default_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.douban.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def __init__(self, url='1638835355', *args, **kwargs):
        # call the parent class constructor first
        super(download_douban, self).__init__(*args, **kwargs)
        self.allowed_domains = ['douban.com']
        self.start_urls = ['http://www.douban.com/photos/album/%s/' % url]
        self.url = url

    def start_requests(self):
        # request each album page with the browser-like headers above
        for url in self.start_urls:
            yield Request(url=url, headers=self.default_headers, callback=self.parse)

    def parse(self, response):
        # collect every image src on the album page and hand the whole
        # list to the images pipeline through a single item
        list_imgs = response.xpath('//div[@class="photolst clearfix"]//img/@src').extract()
        if list_imgs:
            item = DoubanImgsItem()
            item['image_urls'] = list_imgs
            yield item
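Yielding the item alone downloads nothing: the images pipeline must be enabled in settings.py, and IMAGES_STORE decides where the files land. A minimal sketch, assuming you want the h1 folder from the question (the pipeline puts files in a full/ subdirectory by default):

# settings.py (sketch)
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
# relative to the directory you run scrapy from
IMAGES_STORE = './h1'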
If the response body is a file stream, you can write it straight to disk as a file, for example in this snippet:
import os

import requests

# make sure the target directory exists before writing into it
os.makedirs('./image', exist_ok=True)

# save every image on the current page
# (all_img is the list of <img> tags parsed earlier, e.g. with BeautifulSoup)
for img in all_img:
    img_url = img['src']
    # keep the last path segment (after the final /) as the local file name
    path = './image/' + img_url.split('/')[-1]
    try:
        if not os.path.exists(path):
            r = requests.get('https:' + img_url)
            with open(path, 'wb') as f:
                f.write(r.content)
            print('file saved')
        else:
            print('file already exists')
    except Exception as e:
        print('download failed:', e)

# return the link to the next page
return next_page_url
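For large files it is better not to load r.content into memory in one go; requests can also stream the body chunk by chunk. A minimal sketch of the same save step, assuming img_url is already a full https URL:

import requests


def save_image(img_url, path, chunk_size=8192):
    # stream=True defers downloading the body until we iterate over it
    with requests.get(img_url, stream=True, timeout=10) as r:
        r.raise_for_status()
        with open(path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)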
For the complete code, see: https://blog.csdn.net/wojiushiwo945you/article/details/99415502