可否详细说说看?我使用的代码是模仿人民邮电出版社《Python 网络爬虫(Scrapy 框架)》一书中 Scrapy+Selenium+Chrome 爬取京东页面的示例代码。
import scrapy
from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
from zhaopin.items import ZhaopinItem
from scrapy.http import Request
from lxml import etree
from selenium import webdriver
import csv
import json
class ZhaopinspiderSpider(scrapy.Spider):
    """Spider for job listings on www.0536qz.com (Qingzhou job board).

    Crawls listing pages, follows each job's detail link, and yields one
    ``ZhaopinItem`` per job. A headless Selenium Chrome driver is created in
    ``__init__`` and released in ``close``.
    """
    name = 'zhaopinspider'  # unique spider name (required by Scrapy)
    # allowed_domains = ['zhaopin.com']
    start_urls = ['http://www.0536qz.com/post/zhaopin/pn1/']  # first listing page
    ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'

    def __init__(self):
        super(ZhaopinspiderSpider, self).__init__()
        options = webdriver.ChromeOptions()
        options.add_argument('headless')  # run Chrome without a visible window
        self.driver = webdriver.Chrome(
            executable_path='D:/python-3.7.0/chromedriver.exe', options=options)

    def close(self, spider):
        # Release the Selenium browser when the spider shuts down.
        self.driver.quit()
        print('closed spider')

    def parse(self, response):
        """Parse one listing page: request each job's detail page, then follow pagination."""
        # NOTE(review): li[1] selects only the FIRST listing on the page; if every
        # listing should be scraped, the path needs .../ul/li/... — TODO confirm.
        titles = response.xpath('/html/body/div[5]/div[3]/div[2]/ul/li[1]/div[1]/div/a/p/text()').extract()        # job title
        prices = response.xpath('/html/body/div[5]/div[3]/div[2]/ul/li[1]/div[1]/p[1]/text()').extract()           # salary
        detail_links = response.xpath('/html/body/div[5]/div[3]/div[2]/ul/li[1]/div[1]/div/a/@href').extract()     # detail-page hrefs

        # BUG FIX: the original first yielded Request(base_url + detail_links, ...)
        # — a str + list TypeError — and routed it back to self.parse. The loop
        # below is the single, correct path to the detail pages.
        for ind, detail_link in enumerate(detail_links):
            # BUG FIX: absolutize the relative href (original passed it raw).
            # Pass listing-page fields to the detail callback via meta.
            yield scrapy.Request(response.urljoin(detail_link),
                                 callback=self.parse_detail,
                                 meta={'title': titles[ind],
                                       'price': prices[ind]},
                                 headers={'User-Agent': self.ua})

        # Follow the "next page" link (9th anchor in the pager).
        # BUG FIX: extract_frist -> extract_first (typo caused AttributeError).
        next_page = response.xpath('//*[@id="page_x"]/a[9]/@href').extract_first()
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page),
                                 callback=self.parse,
                                 headers={'User-Agent': self.ua})

    def parse_detail(self, response):
        """Parse one job detail page and yield a populated ZhaopinItem."""
        # BUG FIX: response.mrta -> response.meta (typo caused AttributeError).
        title = response.meta['title']  # job title, carried over from the listing page
        price = response.meta['price']  # salary, carried over from the listing page
        # BUG FIX: extract_frist -> extract_first on every field below.
        time = response.xpath('/html/body/div[5]/div[2]/div[2]/div[1]/span[1]/text()').extract_first()       # last-updated time
        attention = response.xpath('/html/body/div[5]/div[2]/div[2]/div[1]/span[2]/em/text()').extract_first()  # view/follower count
        job = response.xpath('//*[@id="baseInfo"]/ul[1]/li[1]/span[2]/text()').extract_first()               # position name
        job_num = response.xpath('//*[@id="baseInfo"]/ul[1]/li[3]/span[2]/text()').extract_first()           # number of openings
        Education = response.xpath('//*[@id="baseInfo"]/ul[1]/li[2]/span[2]/text()').extract_first()         # education requirement
        contacts = response.xpath('//*[@id="baseInfo"]/ul[1]/li[5]/span[2]/text()').extract_first()          # contact person

        item = ZhaopinItem()
        item['title'] = title
        item['time'] = time
        item['attention'] = attention
        item['price'] = price
        item['job'] = job
        item['job_num'] = job_num
        item['Education'] = Education
        item['contacts'] = contacts
        yield item
补充说明:settings 等配置文件已经多次修改过,目前程序能够爬出结果。