def parse(self, response):
    """Read the '1 of N' pagination label and schedule one request per result page.

    On any failure (missing pagination element, bad meta) an empty item is
    yielded so the pipeline still records the keyword/city combination.
    """
    print("我进来了")
    try:
        sim_url = response.meta["sim_url"]
        # Pagination label has the form '1 of N'; stripping the prefix leaves N.
        ye_se = response.xpath('//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/div[1]/div[2]/span/text()').extract_first()
        ye = int(ye_se.replace('1 of ', ''))
        for i in range(ye):
            # Yelp paginates with a 'start' offset in steps of 10.
            next_url = sim_url + str(i * 10)
            province = response.meta['province']
            meta = {'province': province}
            print('我出去了')
            # dont_filter=True matches the later revisions in this thread:
            # without it the paginated requests were never dispatched.
            yield scrapy.Request(url=next_url, callback=self.parse_detail, meta=meta, dont_filter=True)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not swallowed; any parse failure still yields an empty item.
        print('我没有值了')
        item = YelpspiderItem()
        item['keyword'] = None
        item['city'] = None
        item['phone_number'] = None
        item['address'] = None
        item['web'] = None
        item['provinc'] = None
        yield item
# Parse one search-result page and collect the business detail links.
def parse_detail(self,response):
# Debug markers left in by the author.
print(2222222222222)
print("shshhsshshsh"+response.url)
print(response.meta['province'])
# Anchors of each business card in the result list.
detail_url_list = response.xpath('//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/ul/li/div/div/div/div/div/div/div/div/div/div/div/a/@href').extract()
for url in detail_url_list:
detail_url = 'https://www.yelp.com'+url
province = response.meta['province']
# NOTE(review): detail_url and meta are built but never used — no request is
# yielded here, so the detail pages are never fetched.
# TODO: yield scrapy.Request(detail_url, callback=<detail parser>, meta=meta)
meta={'province':province}
1.确定 '我出去了' 这个字符串打印了吗?
2. 打印了之后,最好抓包看一下,确认有没有网页请求真正发出去?
用这个格式化代码一下
或者文件发qq1467288927
def parse(self, response):
    """Read the '1 of N' pagination label and schedule one request per result page."""
    print("我进来了")
    try:
        sim_url = response.meta["sim_url"]
        # Pagination label has the form '1 of N'; stripping the prefix leaves N.
        ye_se = response.xpath('//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/div[1]/div[2]/span/text()').extract_first()
        ye = int(ye_se.replace('1 of ', ''))
        for i in range(ye):
            # Yelp paginates with a 'start' offset in steps of 10.
            next_url = sim_url + str(i * 10)
            province = response.meta['province']
            meta = {'province': province}
            print('我出去了')
            # dont_filter=True added per the fix adopted later in this thread.
            yield scrapy.Request(url=next_url, callback=self.parse_detail, meta=meta, dont_filter=True)
    except Exception:
        # Narrowed from a bare `except:`; any parse failure yields an empty item.
        print('我没有值了')
        item = YelpspiderItem()
        item['keyword'] = None
        item['city'] = None
        item['phone_number'] = None
        item['address'] = None
        item['web'] = None
        item['provinc'] = None
        yield item

def parse_detail(self, response):
    """Collect business links from one result page."""
    print(2222222222222)  # debug marker left in by the author
    print("shshhsshshsh" + response.url)
    print(response.meta['province'])
    detail_url_list = response.xpath('//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/ul/li/div/div/div/div/div/div/div/div/div/div/div/a/@href').extract()
    for url in detail_url_list:
        detail_url = 'https://www.yelp.com' + url
        province = response.meta['province']
        meta = {'province': province}
        # NOTE(review): detail_url/meta are built but never used — no request
        # is yielded, so the detail pages are never fetched.
没看到哪里调了parse_detail 方法
def parse(self, response):
    """Read the '1 of N' pagination label and schedule one request per result page."""
    print("我进来了")
    try:
        sim_url = response.meta["sim_url"]
        # Pagination label has the form '1 of N'; stripping the prefix leaves N.
        ye_se = response.xpath('//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/div[1]/div[2]/span/text()').extract_first()
        ye = int(ye_se.replace('1 of ', ''))
        for i in range(ye):
            # Yelp paginates with a 'start' offset in steps of 10.
            next_url = sim_url + str(i * 10)
            province = response.meta['province']
            meta = {'province': province}
            print('我出去了')
            # dont_filter=True added per the fix adopted later in this thread.
            yield scrapy.Request(url=next_url, callback=self.parse_detail, meta=meta, dont_filter=True)
    except Exception:
        # Narrowed from a bare `except:`; any parse failure yields an empty item.
        print('我没有值了')
        item = YelpspiderItem()
        item['keyword'] = None
        item['city'] = None
        item['phone_number'] = None
        item['address'] = None
        item['web'] = None
        item['provinc'] = None
        yield item

def parse_detail(self, response):
    """Collect business links from one result page."""
    print(2222222222222)  # debug marker left in by the author
    print("shshhsshshsh" + response.url)
    print(response.meta['province'])
    detail_url_list = response.xpath('//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/ul/li/div/div/div/div/div/div/div/div/div/div/div/a/@href').extract()
    for url in detail_url_list:
        detail_url = 'https://www.yelp.com' + url
        province = response.meta['province']
        meta = {'province': province}
        # NOTE(review): detail_url/meta are built but never used — no request
        # is yielded, so the detail pages are never fetched.

# Original poster's note attached to this paste: "我出去了" is printed, but the
# 2222222222222 marker in parse_detail never prints (i.e. the callback never runs).
在try后面回调了
加上:
dont_filter=True参数
yield scrapy.Request(url=next_url, callback=self.parse_detail,meta=meta,dont_filter=True)
parse_detail接收到的meta参数没问题吗?
那请求是post的话,要用Request的子类FormRequest 类
yield scrapy.FormRequest(url="url", formdata={'name': 'zhangsan', 'age': '27'}, callback=self.after_post)
只有两个方法,这边没办法调试哈
不管用
在 allowed_domains 中加入目标url
^_^
还是没有用
是不是url不对
import scrapy
import xlrd
from YelpSpider.items import YelpspiderItem
class YelpSpiderSpider(scrapy.Spider):
    """Crawl Yelp search results for every (keyword, city) pair read from two Excel sheets."""

    name = 'yelp_spider'
    allowed_domains = ['www.yelp.com']

    def start_requests(self):
        """Yield one search request per keyword/city combination.

        Keywords come from key_word.xlsx; cities/provinces from h_z.xlsx.
        """
        base_url = 'https://www.yelp.com/search?find_desc='
        address_file = xlrd.open_workbook(r'D:\工作\YelpSpider\YelpSpider\data\h_z.xlsx')
        k_file = xlrd.open_workbook(r'D:\工作\YelpSpider\YelpSpider\data\key_word.xlsx')
        add_table = address_file.sheets()[0]
        k_table = k_file.sheets()[0]
        # BUG FIX: the keyword row count must come from the keyword sheet;
        # the original read add_table.nrows for both counters.
        k_nrows = k_table.nrows
        add_nrows = add_table.nrows
        for k in range(k_nrows):
            ke = k_table.row_values(k)[0]
            for add in range(add_nrows):
                city = add_table.row_values(add)[0]
                province = add_table.row_values(add)[2]
                province_s = add_table.row_values(add)[1]
                add_url = city + "," + "+" + province_s
                ke_url = ke.replace(' ', '+')
                fin_url = base_url + ke_url + '&find_loc=' + add_url + '&start=00'
                sim_url = base_url + ke_url + '&find_loc=' + add_url + '&start='
                # sim_url is carried in meta so parse() can build paginated URLs.
                meta = {"province": province, 'sim_url': sim_url}
                yield scrapy.Request(url=fin_url, callback=self.parse, meta=meta)

    def parse(self, response):
        """Read the '1 of N' pagination label and schedule one request per result page."""
        print("我进来了")
        try:
            sim_url = response.meta["sim_url"]
            # Pagination label has the form '1 of N'; stripping the prefix leaves N.
            ye_se = response.xpath('//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/div[1]/div[2]/span/text()').extract_first()
            ye = int(ye_se.replace('1 of ', ''))
            for i in range(ye):
                # Yelp paginates with a 'start' offset in steps of 10.
                next_url = sim_url + str(i * 10)
                province = response.meta['province']
                meta = {'province': province}
                print('我出去了')
                yield scrapy.Request(url=next_url, callback=self.parse_detail, meta=meta, dont_filter=True)
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are not swallowed; any parse failure still yields an empty item.
            print('我没有值了')
            item = YelpspiderItem()
            item['keyword'] = None
            item['city'] = None
            item['phone_number'] = None
            item['address'] = None
            item['web'] = None
            item['provinc'] = None
            yield item

    def parse_detail(self, response):
        """Collect business links from one result page."""
        print(2222222222222)  # debug marker left in by the author
        print("shshhsshshsh" + response.url)
        print(response.meta['province'])
        detail_url_list = response.xpath('//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/ul/li/div/div/div/div/div/div/div/div/div/div/div/a/@href').extract()
        for url in detail_url_list:
            detail_url = 'https://www.yelp.com' + url
            province = response.meta['province']
            meta = {'province': province}
            # NOTE(review): detail_url/meta are built but never used — no request
            # is yielded here, so the detail pages are never fetched. TODO:
            # yield scrapy.Request(detail_url, callback=<detail parser>, meta=meta)
参数接收有问题吧?
def parse(self, response):
    """Example (from the Scrapy docs): pass extra data to a callback via cb_kwargs."""
    req = scrapy.Request(
        'http://www.example.com/index.html',
        callback=self.parse_page2,
        cb_kwargs=dict(main_url=response.url),
    )
    # Additional callback arguments can be attached after construction.
    req.cb_kwargs['foo'] = 'bar'
    yield req

def parse_page2(self, response, main_url, foo):
    """Receive the cb_kwargs values as plain keyword parameters."""
    yield dict(
        main_url=main_url,
        other_url=response.url,
        foo=foo,
    )
import scrapy
import xlrd
from YelpSpider.items import YelpspiderItem
class YelpSpiderSpider(scrapy.Spider):
    """Crawl Yelp search results for every (keyword, city) pair read from two Excel sheets."""

    name = 'yelp_spider'
    allowed_domains = ['www.yelp.com']

    def start_requests(self):
        """Yield one search request per keyword/city combination.

        Keywords come from key_word.xlsx; cities/provinces from h_z.xlsx.
        """
        base_url = 'https://www.yelp.com/search?find_desc='
        address_file = xlrd.open_workbook(r'D:\工作\YelpSpider\YelpSpider\data\h_z.xlsx')
        k_file = xlrd.open_workbook(r'D:\工作\YelpSpider\YelpSpider\data\key_word.xlsx')
        add_table = address_file.sheets()[0]
        k_table = k_file.sheets()[0]
        # BUG FIX: the keyword row count must come from the keyword sheet;
        # the original read add_table.nrows for both counters.
        k_nrows = k_table.nrows
        add_nrows = add_table.nrows
        for k in range(k_nrows):
            ke = k_table.row_values(k)[0]
            for add in range(add_nrows):
                city = add_table.row_values(add)[0]
                province = add_table.row_values(add)[2]
                province_s = add_table.row_values(add)[1]
                add_url = city + "," + "+" + province_s
                ke_url = ke.replace(' ', '+')
                fin_url = base_url + ke_url + '&find_loc=' + add_url + '&start=00'
                sim_url = base_url + ke_url + '&find_loc=' + add_url + '&start='
                # sim_url is carried in meta so parse() can build paginated URLs.
                meta = {"province": province, 'sim_url': sim_url}
                yield scrapy.Request(url=fin_url, callback=self.parse, meta=meta)

    def parse(self, response):
        """Read the '1 of N' pagination label and schedule one request per result page."""
        print("我进来了")
        try:
            sim_url = response.meta["sim_url"]
            # Pagination label has the form '1 of N'; stripping the prefix leaves N.
            ye_se = response.xpath('//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/div[1]/div[2]/span/text()').extract_first()
            ye = int(ye_se.replace('1 of ', ''))
            for i in range(ye):
                # Yelp paginates with a 'start' offset in steps of 10.
                next_url = sim_url + str(i * 10)
                province = response.meta['province']
                meta = {'province': province}
                print('我出去了')
                yield scrapy.Request(url=next_url, callback=self.parse_detail, meta=meta, dont_filter=True)
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are not swallowed; any parse failure still yields an empty item.
            print('我没有值了')
            item = YelpspiderItem()
            item['keyword'] = None
            item['city'] = None
            item['phone_number'] = None
            item['address'] = None
            item['web'] = None
            item['provinc'] = None
            yield item

    def parse_detail(self, response):
        """Collect business links from one result page."""
        print(2222222222222)  # debug marker left in by the author
        print("shshhsshshsh" + response.url)
        print(response.meta['province'])
        detail_url_list = response.xpath('//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/ul/li/div/div/div/div/div/div/div/div/div/div/div/a/@href').extract()
        for url in detail_url_list:
            detail_url = 'https://www.yelp.com' + url
            province = response.meta['province']
            meta = {'province': province}
            # NOTE(review): detail_url/meta are built but never used — no request
            # is yielded here, so the detail pages are never fetched. TODO:
            # yield scrapy.Request(detail_url, callback=<detail parser>, meta=meta)
import scrapy
import xlrd
from YelpSpider.items import YelpspiderItem
class YelpSpiderSpider(scrapy.Spider):
    """Crawl Yelp search results for every (keyword, city) pair read from two Excel sheets."""

    name = 'yelp_spider'
    allowed_domains = ['www.yelp.com']

    def start_requests(self):
        """Yield one search request per keyword/city combination.

        Keywords come from key_word.xlsx; cities/provinces from h_z.xlsx.
        """
        base_url = 'https://www.yelp.com/search?find_desc='
        address_file = xlrd.open_workbook(r'D:\工作\YelpSpider\YelpSpider\data\h_z.xlsx')
        k_file = xlrd.open_workbook(r'D:\工作\YelpSpider\YelpSpider\data\key_word.xlsx')
        add_table = address_file.sheets()[0]
        k_table = k_file.sheets()[0]
        # BUG FIX: the keyword row count must come from the keyword sheet;
        # the original read add_table.nrows for both counters.
        k_nrows = k_table.nrows
        add_nrows = add_table.nrows
        for k in range(k_nrows):
            ke = k_table.row_values(k)[0]
            for add in range(add_nrows):
                city = add_table.row_values(add)[0]
                province = add_table.row_values(add)[2]
                province_s = add_table.row_values(add)[1]
                add_url = city + "," + "+" + province_s
                ke_url = ke.replace(' ', '+')
                fin_url = base_url + ke_url + '&find_loc=' + add_url + '&start=00'
                sim_url = base_url + ke_url + '&find_loc=' + add_url + '&start='
                # sim_url is carried in meta so parse() can build paginated URLs.
                meta = {"province": province, 'sim_url': sim_url}
                yield scrapy.Request(url=fin_url, callback=self.parse, meta=meta)

    def parse(self, response):
        """Read the '1 of N' pagination label and schedule one request per result page."""
        print("我进来了")
        try:
            sim_url = response.meta["sim_url"]
            # Pagination label has the form '1 of N'; stripping the prefix leaves N.
            ye_se = response.xpath('//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/div[1]/div[2]/span/text()').extract_first()
            ye = int(ye_se.replace('1 of ', ''))
            for i in range(ye):
                # Yelp paginates with a 'start' offset in steps of 10.
                next_url = sim_url + str(i * 10)
                province = response.meta['province']
                meta = {'province': province}
                print('我出去了')
                yield scrapy.Request(url=next_url, callback=self.parse_detail, meta=meta, dont_filter=True)
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are not swallowed; any parse failure still yields an empty item.
            print('我没有值了')
            item = YelpspiderItem()
            item['keyword'] = None
            item['city'] = None
            item['phone_number'] = None
            item['address'] = None
            item['web'] = None
            item['provinc'] = None
            yield item

    def parse_detail(self, response):
        """Collect business links from one result page."""
        print(2222222222222)  # debug marker left in by the author
        print("shshhsshshsh" + response.url)
        print(response.meta['province'])
        detail_url_list = response.xpath('//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/ul/li/div/div/div/div/div/div/div/div/div/div/div/a/@href').extract()
        for url in detail_url_list:
            detail_url = 'https://www.yelp.com' + url
            province = response.meta['province']
            meta = {'province': province}
            # NOTE(review): detail_url/meta are built but never used — no request
            # is yielded here, so the detail pages are never fetched. TODO:
            # yield scrapy.Request(detail_url, callback=<detail parser>, meta=meta)
import scrapy
import xlrd
from YelpSpider.items import YelpspiderItem
class YelpSpiderSpider(scrapy.Spider):
    """Crawl Yelp search results for every (keyword, city) pair read from two Excel sheets."""

    name = 'yelp_spider'
    allowed_domains = ['www.yelp.com']

    def start_requests(self):
        """Yield one search request per keyword/city combination.

        Keywords come from key_word.xlsx; cities/provinces from h_z.xlsx.
        """
        base_url = 'https://www.yelp.com/search?find_desc='
        address_file = xlrd.open_workbook(r'D:\工作\YelpSpider\YelpSpider\data\h_z.xlsx')
        k_file = xlrd.open_workbook(r'D:\工作\YelpSpider\YelpSpider\data\key_word.xlsx')
        add_table = address_file.sheets()[0]
        k_table = k_file.sheets()[0]
        # BUG FIX: the keyword row count must come from the keyword sheet;
        # the original read add_table.nrows for both counters.
        k_nrows = k_table.nrows
        add_nrows = add_table.nrows
        for k in range(k_nrows):
            ke = k_table.row_values(k)[0]
            for add in range(add_nrows):
                city = add_table.row_values(add)[0]
                province = add_table.row_values(add)[2]
                province_s = add_table.row_values(add)[1]
                add_url = city + "," + "+" + province_s
                ke_url = ke.replace(' ', '+')
                fin_url = base_url + ke_url + '&find_loc=' + add_url + '&start=00'
                sim_url = base_url + ke_url + '&find_loc=' + add_url + '&start='
                # sim_url is carried in meta so parse() can build paginated URLs.
                meta = {"province": province, 'sim_url': sim_url}
                yield scrapy.Request(url=fin_url, callback=self.parse, meta=meta)

    def parse(self, response):
        """Read the '1 of N' pagination label and schedule one request per result page."""
        print("我进来了")
        try:
            sim_url = response.meta["sim_url"]
            # Pagination label has the form '1 of N'; stripping the prefix leaves N.
            ye_se = response.xpath('//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/div[1]/div[2]/span/text()').extract_first()
            ye = int(ye_se.replace('1 of ', ''))
            for i in range(ye):
                # Yelp paginates with a 'start' offset in steps of 10.
                next_url = sim_url + str(i * 10)
                province = response.meta['province']
                meta = {'province': province}
                print('我出去了')
                yield scrapy.Request(url=next_url, callback=self.parse_detail, meta=meta, dont_filter=True)
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are not swallowed; any parse failure still yields an empty item.
            print('我没有值了')
            item = YelpspiderItem()
            item['keyword'] = None
            item['city'] = None
            item['phone_number'] = None
            item['address'] = None
            item['web'] = None
            item['provinc'] = None
            yield item

    def parse_detail(self, response):
        """Collect business links from one result page."""
        print(2222222222222)  # debug marker left in by the author
        print("shshhsshshsh" + response.url)
        print(response.meta['province'])
        detail_url_list = response.xpath('//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/ul/li/div/div/div/div/div/div/div/div/div/div/div/a/@href').extract()
        for url in detail_url_list:
            detail_url = 'https://www.yelp.com' + url
            province = response.meta['province']
            meta = {'province': province}
            # NOTE(review): detail_url/meta are built but never used — no request
            # is yielded here, so the detail pages are never fetched. TODO:
            # yield scrapy.Request(detail_url, callback=<detail parser>, meta=meta)
没法格式化了
全部代码就是这样的
参数有问题会报错的,现在是没有报错,但是第二个请求就是进不去
import scrapy import xlrd from YelpSpider.items import YelpspiderItem class YelpSpiderSpider(scrapy.Spider): name = 'yelp_spider' allowed_domains = ['www.yelp.com'] # start_urls = ['http://www/'] def start_requests(self): base_url = 'https://www.yelp.com/search?find_desc=' address_file = xlrd.open_workbook(r'D:\工作\YelpSpider\YelpSpider\data\h_z.xlsx') k_file = xlrd.open_workbook(r'D:\工作\YelpSpider\YelpSpider\data\key_word.xlsx') add_table = address_file.sheets()[0] k_table = k_file.sheets()[0] k_nrows = add_table.nrows add_nrows = add_table.nrows for k in range(k_nrows): ke = k_table.row_values(k)[0] for add in range(add_nrows): city = add_table.row_values(add)[0] province = add_table.row_values(add)[2] province_s = add_table.row_values(add)[1] add_url = city+","+"+"+province_s ke_url = ke.replace(' ','+') fin_url = base_url+ke_url+'&find_loc='+add_url+'&start=00' sim_url = base_url+ke_url+'&find_loc='+add_url+'&start=' meta = {"province":province,'sim_url':s
import scrapy
import xlrd
from YelpSpider.items import YelpspiderItem
class YelpSpiderSpider(scrapy.Spider):
    """Crawl Yelp search results for every (keyword, city) pair read from two Excel sheets."""

    name = 'yelp_spider'
    allowed_domains = ['www.yelp.com']

    def start_requests(self):
        """Yield one search request per keyword/city combination.

        Keywords come from key_word.xlsx; cities/provinces from h_z.xlsx.
        """
        base_url = 'https://www.yelp.com/search?find_desc='
        address_file = xlrd.open_workbook(r'D:\工作\YelpSpider\YelpSpider\data\h_z.xlsx')
        k_file = xlrd.open_workbook(r'D:\工作\YelpSpider\YelpSpider\data\key_word.xlsx')
        add_table = address_file.sheets()[0]
        k_table = k_file.sheets()[0]
        # BUG FIX: the keyword row count must come from the keyword sheet;
        # the original read add_table.nrows for both counters.
        k_nrows = k_table.nrows
        add_nrows = add_table.nrows
        for k in range(k_nrows):
            ke = k_table.row_values(k)[0]
            for add in range(add_nrows):
                city = add_table.row_values(add)[0]
                province = add_table.row_values(add)[2]
                province_s = add_table.row_values(add)[1]
                add_url = city + "," + "+" + province_s
                ke_url = ke.replace(' ', '+')
                fin_url = base_url + ke_url + '&find_loc=' + add_url + '&start=00'
                sim_url = base_url + ke_url + '&find_loc=' + add_url + '&start='
                # sim_url is carried in meta so parse() can build paginated URLs.
                meta = {"province": province, 'sim_url': sim_url}
                yield scrapy.Request(url=fin_url, callback=self.parse, meta=meta)

    def parse(self, response):
        """Read the '1 of N' pagination label and schedule one request per result page."""
        print("我进来了")
        try:
            sim_url = response.meta["sim_url"]
            # Pagination label has the form '1 of N'; stripping the prefix leaves N.
            ye_se = response.xpath('//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/div[1]/div[2]/span/text()').extract_first()
            ye = int(ye_se.replace('1 of ', ''))
            for i in range(ye):
                # Yelp paginates with a 'start' offset in steps of 10.
                next_url = sim_url + str(i * 10)
                province = response.meta['province']
                meta = {'province': province}
                print('我出去了')
                yield scrapy.Request(url=next_url, callback=self.parse_detail, meta=meta, dont_filter=True)
        except Exception:
            # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
            # are not swallowed; any parse failure still yields an empty item.
            print('我没有值了')
            item = YelpspiderItem()
            item['keyword'] = None
            item['city'] = None
            item['phone_number'] = None
            item['address'] = None
            item['web'] = None
            item['provinc'] = None
            yield item

    def parse_detail(self, response):
        """Collect business links from one result page."""
        print(2222222222222)  # debug marker left in by the author
        print("shshhsshshsh" + response.url)
        print(response.meta['province'])
        detail_url_list = response.xpath('//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/ul/li/div/div/div/div/div/div/div/div/div/div/div/a/@href').extract()
        for url in detail_url_list:
            detail_url = 'https://www.yelp.com' + url
            province = response.meta['province']
            meta = {'province': province}
            # NOTE(review): detail_url/meta are built but never used — no request
            # is yielded here, so the detail pages are never fetched. TODO:
            # yield scrapy.Request(detail_url, callback=<detail parser>, meta=meta)
前面都是没有问题的
请求失败了呗,你把请求链接打印出来,再手动请求一下试试
没事,我已经给解决了,谢谢你了