The parse method itself is fine and extracts the data correctly, but the follow-up request never reaches the parse_detail callback.

def parse(self, response):
    print("我进来了")
    try:
        sim_url = response.meta["sim_url"]
        ye_se = response.xpath('//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/div[1]/div[2]/span/text()').extract_first()
        ye = int(ye_se.replace('1 of ', ''))
        for i in range(ye):
            next_url = sim_url + str(i * 10)
            province = response.meta['province']
            meta = {'province': province}
            print('我出去了')
            yield scrapy.Request(url=next_url, callback=self.parse_detail, meta=meta)
    except:
        print('我没有值了')
        item = YelpspiderItem()
        item['keyword'] = None
        item['city'] = None
        item['phone_number'] = None
        item['address'] = None
        item['web'] = None
        item['provinc'] = None
        yield item

def parse_detail(self, response):
    print(2222222222222)
    print("shshhsshshsh" + response.url)
    print(response.meta['province'])
    detail_url_list = response.xpath('//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/ul/li/div/div/div/div/div/div/div/div/div/div/div/a/@href').extract()
    for url in detail_url_list:
        detail_url = 'https://www.yelp.com' + url
        province = response.meta['province']
        meta = {'province': province}
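(For context: ye_se is the pager text, something like '1 of 24', so ye becomes the page count and the loop requests the search offsets start=0, 10, ..., (ye-1)*10, one request per results page.)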

1. First confirm: does the string '我出去了' actually get printed?

2. If it does, capture the traffic and check whether the page request is really going out.
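Besides packet capture, you can make Scrapy itself report what happens to each request. A minimal sketch, assuming the default scheduler and dupefilter (these are standard Scrapy settings):

    # settings.py
    LOG_LEVEL = 'DEBUG'        # show every Crawled / Filtered line
    DUPEFILTER_DEBUG = True    # log every dropped duplicate request, not only the first one

With DEBUG logging on, a silently dropped request shows up as a 'Filtered duplicate request' or 'Filtered offsite request' line in the log.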

And please post the code properly formatted, or send the file to QQ 1467288927.


I don't see anywhere that actually calls the parse_detail method.

'我出去了' does get printed, but the 2222222222222 after it never shows up.

It is called right there inside the try, via the yield.

Try adding the dont_filter=True parameter:

yield scrapy.Request(url=next_url, callback=self.parse_detail, meta=meta, dont_filter=True)
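dont_filter=True tells the scheduler to skip its duplicate-request fingerprint check; without it, any URL Scrapy thinks it has already seen is dropped silently.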

Is the meta argument that parse_detail receives definitely okay?

If the request is a POST, you need to use FormRequest, the Request subclass:

yield scrapy.FormRequest(url="url",
                         formdata={'name': 'zhangsan', 'age': '27'},
                         callback=self.after_post)
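(Worth noting, though: the pagination URLs in this spider are plain GET query strings, so FormRequest should only matter if the site genuinely demands a POST.)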

With only these two methods posted, there's no way to debug it from here.

That doesn't work.

Add the target URL's domain to allowed_domains.

^_^
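For reference, a small sketch of that (assuming Scrapy's default offsite middleware): entries in allowed_domains match the domain and every subdomain, so the bare registered domain is the safer form:

    allowed_domains = ['yelp.com']   # covers www.yelp.com, m.yelp.com, ...

Requests filtered this way only appear as DEBUG-level 'Filtered offsite request' lines, so they are easy to miss.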


Still doesn't help.

Could the URL itself be wrong?

import scrapy
import xlrd

from YelpSpider.items import YelpspiderItem


class YelpSpiderSpider(scrapy.Spider):
    name = 'yelp_spider'
    allowed_domains = ['www.yelp.com']
    # start_urls = ['http://www/']

    def start_requests(self):
        base_url = 'https://www.yelp.com/search?find_desc='
        address_file = xlrd.open_workbook(r'D:\工作\YelpSpider\YelpSpider\data\h_z.xlsx')
        k_file = xlrd.open_workbook(r'D:\工作\YelpSpider\YelpSpider\data\key_word.xlsx')
        add_table = address_file.sheets()[0]
        k_table = k_file.sheets()[0]
        k_nrows = add_table.nrows
        add_nrows = add_table.nrows
        for k in range(k_nrows):
            ke = k_table.row_values(k)[0]
            for add in range(add_nrows):
                city = add_table.row_values(add)[0]
                province = add_table.row_values(add)[2]
                province_s = add_table.row_values(add)[1]
                add_url = city + "," + "+" + province_s
                ke_url = ke.replace(' ', '+')
                fin_url = base_url + ke_url + '&find_loc=' + add_url + '&start=00'
                sim_url = base_url + ke_url + '&find_loc=' + add_url + '&start='
                meta = {"province": province, 'sim_url': sim_url}
                yield scrapy.Request(url=fin_url, callback=self.parse, meta=meta)

    def parse(self, response):
        print("我进来了")
        try:
            sim_url = response.meta["sim_url"]
            ye_se = response.xpath('//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/div[1]/div[2]/span/text()').extract_first()
            ye = int(ye_se.replace('1 of ', ''))
            for i in range(ye):
                next_url = sim_url + str(i * 10)
                province = response.meta['province']
                meta = {'province': province}
                print('我出去了')
                yield scrapy.Request(url=next_url, callback=self.parse_detail, meta=meta, dont_filter=True)
        except:
            print('我没有值了')
            item = YelpspiderItem()
            item['keyword'] = None
            item['city'] = None
            item['phone_number'] = None
            item['address'] = None
            item['web'] = None
            item['provinc'] = None
            yield item

    def parse_detail(self, response):
        print(2222222222222)
        print("shshhsshshsh" + response.url)
        print(response.meta['province'])
        detail_url_list = response.xpath('//*[@id="wrap"]/div[3]/div[2]/div/div[1]/div[1]/div[2]/div[2]/ul/li/div/div/div/div/div/div/div/div/div/div/div/a/@href').extract()
        for url in detail_url_list:
            detail_url = 'https://www.yelp.com' + url
            province = response.meta['province']
            meta = {'province': province}

Could the problem be how the parameters are received? Try passing them with cb_kwargs:

def parse(self, response):
    request = scrapy.Request('http://www.example.com/index.html',
                             callback=self.parse_page2,
                             cb_kwargs=dict(main_url=response.url))
    request.cb_kwargs['foo'] = 'bar'  # add more arguments for the callback
    yield request

def parse_page2(self, response, main_url, foo):
    yield dict(
        main_url=main_url,
        other_url=response.url,
        foo=foo,
    )
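(Note: cb_kwargs only exists in Scrapy 1.7 and later; on older versions, meta, as already used above, is the way to pass data to a callback.)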




I can't get the formatting to come out right.

That's the entire code, exactly as posted above.

If the parameters were wrong it would raise an error. Right now there is no error at all; the second request just never gets through.




Everything before that point works fine.

Then the request is simply failing. Print the request URL and try fetching it yourself.
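One way to see the failure directly is Request's errback hook. A minimal sketch (the on_error name is just for illustration):

    yield scrapy.Request(url=next_url, callback=self.parse_detail,
                         meta=meta, dont_filter=True,
                         errback=self.on_error)

    def on_error(self, failure):
        # failure is a Twisted Failure; its repr names the exception
        # (DNS lookup error, timeout, HttpError for a non-2xx status, ...)
        self.logger.error('Request failed: %r', failure)

If Yelp answers with a non-2xx status (for example because it blocks the scraper), the HttpError middleware discards the response and the callback never runs; an errback makes that visible.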

No worries, I've already solved it. Thanks anyway!