爬取不到网页图片的下载地址,但商品的 id 和 name 等其他字段都可以正常获取。
不知道是不是正则表达式写得有问题。
爬取网站链接:https://www.ssense.com/en-cn/women?q=top
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from ssense.items import SsenseItem
import re
class SsensePicSpider(scrapy.Spider):
    """Collect product data from ssense.com search results.

    Crawl flow: ``parse`` requests the search-result pages, ``page`` pulls
    the product paths out of each result page, and ``next`` extracts the
    product fields from each product page into a ``SsenseItem``.

    All extraction is done with regular expressions over the decoded page
    body, matching ``"key": value`` pairs embedded in the page source.
    """

    name = 'ssense_pic'
    allowed_domains = ['ssense.com']
    start_urls = ['http://ssense.com/']

    # Search term and number of result pages to request. Kept as class
    # attributes so they can be overridden (e.g. by a subclass or by Scrapy
    # spider arguments) without editing the crawl logic.
    search_word = 'top'
    page_count = 1

    # Patterns are raw strings so that \s and \d reach the regex engine
    # instead of being treated as (invalid) string escapes, and they are
    # compiled once instead of on every response.
    _PRODUCT_URL_RE = re.compile(r'"url":\s"([/a-z0-9-]*)"')
    _PRODUCT_ID_RE = re.compile(r'"productID":\s(\d{7})')
    _NAME_RE = re.compile(r'"name":\s"([a-zA-Z -]*)"[,]')
    _PRICE_RE = re.compile(r'"price":\s([0-9]*)')
    _SKU_RE = re.compile(r'"sku":\s"([0-9A-Z]*)",')
    # Capture everything up to the closing quote. The previous character
    # class had no ',' and only the uppercase letters A-F, so any image URL
    # containing comma-separated segments (e.g. ".../c_scale,h_480/...")
    # or other uppercase letters never matched and the field stayed empty.
    _IMAGE_RE = re.compile(r'"image":\s*"([^"]+)"')

    def parse(self, response):
        """Yield one request per search-result page for ``search_word``."""
        # int() tolerates page_count arriving as a string.
        for page_number in range(1, int(self.page_count) + 1):
            url = ('http://www.ssense.com/en-cn/women?q=' + str(self.search_word)
                   + '&page=' + str(page_number))
            yield Request(url=url, callback=self.page)

    def page(self, response):
        """Yield a request for every product path found on a result page."""
        body = response.body.decode('utf-8', 'ignore')
        for product_path in self._PRODUCT_URL_RE.findall(body):
            website = 'https://www.ssense.com/en-cn' + product_path
            yield Request(url=website, callback=self.next)

    def next(self, response):
        """Extract the product fields from a product page and yield the item.

        Every field is stored as the list of all regex matches found in the
        page body (an empty list when nothing matched).
        """
        item = SsenseItem()
        body = response.body.decode('utf-8', 'ignore')

        item['productID'] = self._PRODUCT_ID_RE.findall(body)
        item['name'] = self._NAME_RE.findall(body)
        item['price'] = self._PRICE_RE.findall(body)
        item['sku'] = self._SKU_RE.findall(body)
        # JSON permits '/' to be written as '\/'; undo that so each value is
        # a directly usable URL. A no-op when the slashes are not escaped.
        item['image'] = [
            image_url.replace('\\/', '/')
            for image_url in self._IMAGE_RE.findall(body)
        ]
        yield item
图片地址就是 data-srcset="..." 引号里面的内容(见下面的 HTML),不清楚你正则里匹配的 "image": 字段指的是什么。
<picture data-v-60b7d3e3=""><source data-v-60b7d3e3="" data-srcset="https://cldny.ccindex.cn/ssenseweb/image/upload/b_white,c_lpad,g_south,h_1086,w_724/c_scale,h_480/f_auto,dpr_1.0/201071F110010_1.jpg" media="(min-width: 1025px)" srcset="https://cldny.ccindex.cn/ssenseweb/image/upload/b_white,c_lpad,g_south,h_1086,w_724/c_scale,h_480/f_auto,dpr_1.0/201071F110010_1.jpg"><source data-v-60b7d3e3="" data-srcset="https://cldny.ccindex.cn/ssenseweb/image/upload/b_white,c_lpad,g_south,h_706,w_470/c_scale,h_320/f_auto,dpr_1.0/201071F110010_1.jpg" media="(min-width: 768px)" srcset="https://cldny.ccindex.cn/ssenseweb/image/upload/b_white,c_lpad,g_south,h_706,w_470/c_scale,h_320/f_auto,dpr_1.0/201071F110010_1.jpg"><img data-v-60b7d3e3="" data-srcset="https://cldny.ccindex.cn/ssenseweb/image/upload/b_white,c_lpad,g_south,h_706,w_470/c_scale,h_280/f_auto,dpr_1.0/201071F110010_1.jpg" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAXUAAAIwAQMAAABDTmnJAAAAA1BMVEUAAACnej3aAAAAAXRSTlMAQObYZgAAADFJREFUeNrtwTEBAAAAwiD7pzbDfmAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANEBaQAAAZUbkzMAAAAASUVORK5CYII=" alt="Live the Process - Grey Seamless Sport Top" class="product-thumbnail lazyloaded" srcset="https://cldny.ccindex.cn/ssenseweb/image/upload/b_white,c_lpad,g_south,h_706,w_470/c_scale,h_280/f_auto,dpr_1.0/201071F110010_1.jpg"></picture>