When crawling the static 青州招聘网 (Qingzhou job-listings site) with the Scrapy framework, the XPath I target returns an empty list. A plain static-scraping test script behaves the same way, yet Selenium can extract the content. Is there a simple way to get the content matched by the XPath without using Selenium? The test code is below:
```python
import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'}
url = 'http://www.0536qz.com/post/zhaopin/pn1/'
page_text = requests.get(url, headers=headers)  # request the URL
page_text.encoding = 'gbk'
tree = etree.HTML(page_text.text)
title = tree.xpath('//*[@id="jobshow7683"]/div[1]/div[1]/span/em/text()')
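# Quick diagnostic sketch (an assumption to verify: the node may be injected
# by JavaScript): dump the raw HTML that requests received and search it for
# the target element by hand.
print(title)  # empty list -> the element is not in the static HTML
with open('page_dump.html', 'w', encoding='utf-8') as f:
    f.write(page_text.text)  # open the dump and search for "jobshow" manually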
```

The Scrapy spider code:

```python
import scrapy
from zhaopin.items import ZhaopinItem
class ZhaopinspiderSpider(scrapy.Spider):
    name = 'zhaopinspider'  # unique (required)
    # allowed_domains = ['zhaopin.com']
    start_urls = ['http://www.0536qz.com/post/zhaopin/pn1/']  # the spider's start page
    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
    def parse(self, response):  # parse the page and extract its content
        # use li (not li[1]) so that every listing on the page is matched
        titles = response.xpath('/html/body/div[5]/div[3]/div[2]/ul/li/div[1]/div/a/p/text()').extract()  # job titles
        prices = response.xpath('/html/body/div[5]/div[3]/div[2]/ul/li/div[1]/p[1]/text()').extract()  # salaries
        detail_links = response.xpath('/html/body/div[5]/div[3]/div[2]/ul/li/div[1]/div/a/@href').extract()  # detail-page links
        base_url = 'http://www.0536qz.com'  # site root of 青州人才网
        for ind, detail_link in enumerate(detail_links):
            title = titles[ind]
            price = prices[ind]
            # pass data from the request to the response via meta;
            # urljoin resolves the relative detail link against the site root
            yield scrapy.Request(response.urljoin(detail_link), callback=self.parse_detail,
                                 meta={'title': title, 'price': price},
                                 headers={'User-Agent': self.ua})
        # locate the "next page" link to build the spider's next URL
        next_page = response.xpath('//*[@id="page_x"]/a[9]/@href').extract_first()
        if next_page:
            # build a Request from the next page's URL and yield it
            yield scrapy.Request(url=base_url + next_page, callback=self.parse, headers={'User-Agent': self.ua})
    def parse_detail(self, response):
        title = response.meta['title']  # job title
        price = response.meta['price']  # salary
        time = response.xpath('/html/body/div[5]/div[2]/div[2]/div[1]/span[1]/text()').extract_first()  # update time
        attention = response.xpath('/html/body/div[5]/div[2]/div[2]/div[1]/span[2]/em/text()').extract_first()  # view count
        job = response.xpath('//*[@id="baseInfo"]/ul[1]/li[1]/span[2]/text()').extract_first()  # position
        job_num = response.xpath('//*[@id="baseInfo"]/ul[1]/li[3]/span[2]/text()').extract_first()  # number of openings
        Education = response.xpath('//*[@id="baseInfo"]/ul[1]/li[2]/span[2]/text()').extract_first()  # education requirement
        contacts = response.xpath('//*[@id="baseInfo"]/ul[1]/li[5]/span[2]/text()').extract_first()  # contact person
        # populate a ZhaopinItem and yield it
        item = ZhaopinItem()
        item['title'] = title
        item['time'] = time
        item['attention'] = attention
        item['price'] = price
        item['job'] = job
        item['job_num'] = job_num
        item['Education'] = Education
        item['contacts'] = contacts
        yield item
```
You are using requests, and part of the data is probably loaded via Ajax, so your XPath cannot find it in the HTML that requests returns. The data requests actually receives has to be analyzed somewhere else in the response, as shown in the figure.
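A minimal sketch of that workflow, assuming the listings come from an XHR endpoint (the URL and parameters below are placeholders, not the site's real API; find the actual request in the browser's DevTools, Network tab, XHR filter, while the listing page loads):

```python
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/96.0.4664.45 Safari/537.36'}

# Placeholder endpoint -- replace with the request URL observed in DevTools.
api_url = 'http://www.0536qz.com/ajax/joblist'
resp = requests.get(api_url, headers=headers, params={'page': 1})

# If the endpoint returns JSON, parse it directly; no XPath is needed.
data = resp.json()
print(data)
```

Once the real endpoint is known, the same URL can be passed to scrapy.Request, and the callback can read response.json() (available in Scrapy 2.2 and later) instead of running XPath against HTML.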