1. Data collection (to be completed with the Scrapy framework, 40 points)
Visit https://www.51job.com/, enter the keyword "大数据" (big data), select 5 cities as the work location, and search for 大数据 positions. Use Python to collect the matching job postings, including: job title (职位名), company name (公司名), work location (工作地点), salary (薪资) and posting date (发布时间). Save the collected data to a file named "前程无忧.csv" in the format shown in Table 1:
Table 1  File format requirements
职位名 | 公司名 | 工作地点 | 薪资 | 发布时间
22989-大数据人工智能产品项目经理 | 深圳市腾讯计算机系统有限公司 | 深圳 | | 08-07
大数据开发工程师 | 中国国际金融股份有限公司 | 北京 | 1.2-2万/月 | 08-07
大数据分析专员/助理 | 深圳歌华工艺饰品有限公司 | 深圳-南山区 | 4.2-6.5千/月 | 08-07
Selenium, approach 1 (request the pre-built search URL page by page):
import re
import os
import csv
import time
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
# Append one record to the CSV file (write the header row only the first time)
def save_to_csv(a1, a2, a3, a4, a5):
    is_exist = os.path.exists("前程无忧.csv")
    with open('前程无忧.csv', 'a', encoding='GBK', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if not is_exist:
            writer.writerow(['职位名', '公司名', '工作地点', '薪资', '发布时间'])
        writer.writerow([a1, a2, a3, a4, a5])
# Create the WebDriver object
driver = webdriver.Chrome()
# Explicit-wait helper (10-second timeout)
wait = WebDriverWait(driver, 10)
# Crawl the first 10 result pages of the pre-built search URL
for page in range(10):
    # Search URL: five city codes + the double-URL-encoded keyword "大数据" + the page number
    url = f"https://search.51job.com/list/190200%252c010000%252c020000%252c030200%252c040000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,{page+1}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
    driver.get(url)
    # Wait 3 seconds for the page to render
    time.sleep(3)
    # Grab the page source
    html = driver.page_source
    # Parse the page source with a regular expression
    pattern = re.compile(r'class="jname at">(?P<name>.*?)</span>.*?'
                         r'<span class="time">(?P<time>.*?)发布</span>.*?'
                         r'<span class="sal">(?P<money>.*?)</span>.*?'
                         r'<span class="d at">(?P<place>.*?)</span>.*?'
                         r'class="cname at">(?P<company>.*?)</a>', re.S)
    # Match every job card on the page
    result = pattern.finditer(html)
    # Extract the five fields and append them to the CSV
    for i in result:
        a1 = i.group("name")
        a2 = i.group("company")
        a3 = i.group("place").split('|')[0].strip()
        a4 = i.group("money")
        a5 = i.group("time")
        save_to_csv(a1, a2, a3, a4, a5)
print("Done. Data saved to 前程无忧.csv")
# Close the browser
driver.close()
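The long search URL above is easier to read once it is decoded: the keyword segment is the string 大数据 URL-encoded twice. A minimal check with the standard library:

from urllib.parse import unquote

keyword = "%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE"
# Decode twice because the value in the URL is double-encoded
print(unquote(unquote(keyword)))  # prints: 大数据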
Scrapy approach (the framework the assignment asks for):
Spider file job51.py in the spiders directory:
import scrapy
import json
import jsonpath
from scrapy_51job.items import Scrapy51JobItem
class Job51Spider(scrapy.Spider):
    name = 'job51'
    allowed_domains = ['search.51job.com']
    start_urls = ['https://search.51job.com/list/010000%252c020000%252c030200%252c040000%252c180200,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?']
    page = 1
    base_url = 'https://search.51job.com/list/010000%252c020000%252c030200%252c040000%252c180200,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,'

    def parse(self, response):
        # The search endpoint returns JSON, so parse the response body as JSON
        content = json.loads(response.text)
        # Job title, company name, work location, salary and posting date
        job_names = jsonpath.jsonpath(content, '$.engine_jds..job_name')
        company_names = jsonpath.jsonpath(content, '$.engine_jds..company_name')
        job_areas = jsonpath.jsonpath(content, '$.engine_jds..workarea_text')
        job_salarys = jsonpath.jsonpath(content, '$.engine_jds..providesalary_text')
        job_times = jsonpath.jsonpath(content, '$.engine_jds..issuedate')
        for i in range(len(job_names)):
            job_name = job_names[i]
            company_name = company_names[i]
            job_area = job_areas[i]
            job_salary = job_salarys[i]
            job_time = job_times[i]
            job = Scrapy51JobItem(job_name=job_name, company_name=company_name, job_area=job_area,
                                  job_salary=job_salary, job_time=job_time)
            print(job_name, company_name, job_area, job_salary, job_time)
            yield job
        # Request the next result page, up to 100 pages
        if self.page < 100:
            self.page = self.page + 1
            url = self.base_url + str(self.page) + '.html?'
            yield scrapy.Request(url=url, callback=self.parse)
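The jsonpath expressions above assume the search endpoint returns JSON containing an engine_jds list of job records, which is what the spider relies on. A minimal, self-contained sketch of how that extraction behaves (the sample data below is made up for illustration):

import jsonpath

sample = {
    "engine_jds": [
        {"job_name": "大数据开发工程师", "company_name": "公司A", "workarea_text": "北京",
         "providesalary_text": "1.2-2万/月", "issuedate": "2021-12-03 10:00:00"},
        {"job_name": "大数据分析师", "company_name": "公司B", "workarea_text": "上海-浦东新区",
         "providesalary_text": "1-1.5万/月", "issuedate": "2021-12-03 09:30:00"},
    ]
}
# The recursive-descent path collects the field from every record in engine_jds
print(jsonpath.jsonpath(sample, '$.engine_jds..job_name'))
# -> ['大数据开发工程师', '大数据分析师']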
items.py:
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class Scrapy51JobItem(scrapy.Item):
    # Fields: job title, company name, work location, salary, posting date
    job_name = scrapy.Field()
    company_name = scrapy.Field()
    job_area = scrapy.Field()
    job_salary = scrapy.Field()
    job_time = scrapy.Field()
middlewares.py:
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class Scrapy51JobSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class Scrapy51JobDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
pipelines.py:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import csv


class Scrapy51JobPipeline:
    # Open the output file once and write the header row
    fp = open('job.csv', 'w', encoding='utf-8', newline='')
    csv_fp = csv.DictWriter(fp, fieldnames=['职位名', '公司名', '工作地点', '薪资', '发布时间'])
    csv_fp.writeheader()

    # Write one item per row
    def process_item(self, item, spider):
        self.csv_fp.writerow({
            '职位名': item['job_name'],
            '公司名': item['company_name'],
            '工作地点': item['job_area'],
            '薪资': item['job_salary'],
            '发布时间': item['job_time']
        })
        return item

    # Close the file when the spider finishes
    def close_spider(self, spider):
        self.fp.close()
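With the pipeline registered under ITEM_PIPELINES (see settings.py below), the spider is launched from the project root with the standard Scrapy command:

scrapy crawl job51

Note that this pipeline writes to job.csv; rename the output afterwards (or change the file name in the open() call) if the deliverable must be called 前程无忧.csv.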
settings.py:
# Scrapy settings for scrapy_51job project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'scrapy_51job'
SPIDER_MODULES = ['scrapy_51job.spiders']
NEWSPIDER_MODULE = 'scrapy_51job.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapy_51job (+http://www.yourdomain.com)'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    # Session cookie captured from the author's logged-in browser; it is account- and time-specific and will likely need to be refreshed before re-running
    'Cookie': 'BMAP_SECKEY2=e7ccd76a71cca7384bc9d56993ddbed2e19bbff4744b85e39bb3d65be30e7613e76ae0b8689ae7f5bb14207898aef6950e69432a9314fa542a239fa64bfb5b45539d1a80f930c11874cdf0fb5ac9908bd8649d09a624f58f45a25d4be79ebaf17704c8dcaf9fe4319210bbb78a143775dfbd5a68a0209a8af9c89d8cae7dd2fc432f86e81107e5397ab76a101807e41ef808693c3097f9526cce82eff18e4c18017f257eb1a17a4a9a97680080cc917b25af8918656f764560d35c2d2b62b97609ae2b3d91755804906b7deb81f9cdb2ad943f101fe8e424171dd48adb1ca94c94b60631d5170361a4655d86f3d7c2cf; guid=c95c679632d574c264011d8272e38ab6; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; ps=needv%3D0; 51job=cuid%3D199203233%26%7C%26cusername%3D7%252F2uZW9EGI8Si6%252Fb4YzgYg%252FmFzw1olQwhkbL45OIZy8%253D%26%7C%26cpassword%3D%26%7C%26cname%3D%26%7C%26cemail%3D%26%7C%26cemailstatus%3D0%26%7C%26cnickname%3D%26%7C%26ccry%3D.0%252Fs7R%252F1f62jE%26%7C%26cconfirmkey%3D%25241%2524gFJJQBqo%2524KbI6xiDNrh30ErbphPVop0%26%7C%26cautologin%3D1%26%7C%26cenglish%3D0%26%7C%26sex%3D%26%7C%26cnamekey%3D%25241%2524vdH21U3s%252412KBiwKGg0EFcB18pO95V%252F%26%7C%26to%3D01bcce1cbe1ea72ba5915abc8ce41a1b617b85b1%26%7C%26; search=jobarea%7E%60010000%2C020000%2C030200%2C040000%2C180200%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60010000%2C020000%2C030200%2C040000%2C180200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%B4%F3%CA%FD%BE%DD%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%B4%F3%CA%FD%BE%DD%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CE%EF%C1%AA%CD%F8%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch3%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FAjava%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch4%7E%60180200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%B4%F3%CA%FD%BE%DD%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21collapse_expansion%7E%601%7C%21; ssxmod_itna=eqfxgDuDyDniDt3q0d97tD9jQ3W/oQeqDKbQD/3IDnqD=GFDK40Eo7IvTDOYyoYPxPie7QnTaibftUniKqAFRk4GLDmKDyijLReDxrq0rD74irDDxD3Db3dDSDWKD9D048yRvLKGWDbx=Di4D+iTQDmqG0DDtHU4G2D7tnYDYbik0LtQU5sqhx57DnD0UYxBd4QZRPN=abUk0NqPIzD0pxQBF4fcxKQ0NWeqGyeKGuIdtlRS28HdNSlk+17DpEvGv5IGDai7G5QGxb/GxQflhqKXe1YAD=WDwLhveDGfqE2iD===; ssxmod_itna2=eqfxgDuDyDniDt3q0d97tD9jQ3W/oQeqDKbG9tVQDBdwrx7PP+BaFGFQ+yrzR5jqGQm+Ehe0xqQtiArKQ72YKUENY08vtwVIFcY98Bi0ppjX8Rb1HS5Wk8WhnyP5YyotOq/ae=9pBBm1jnybq4YO/04zUon1WgWdmPYNIraK22RDZEvCUhqIEga=ExAGlZFQ4oei2W5bB7ypOKAFU=7a7Kk1OmqqBEe6dlIhR=IF0pKteBPkKvh7h8QWi8WX23F2gBeX6FvCP8WddaHWxV6WPhU4dEHXktav8f2QOtD4m6EZiWsE4=WIQGFRvCGKaUaXeRvYpCRIcYWfYDbSwGDHcmYe5TISuWbq5EPzYB9Ab/UIEDoaSA2IWoZPoEcvWObfIRGTjAPRAPIS9Hiy=ZoyzYpfBYcIK=ncee6fhYwpTSAo1wn9KOCKaghiP4OUp2YbV/x9+xE0RRWcK=46Y5HiGyedjIery48Qo+ApH+d0tk107BeRDG7rQ0qtxDKurmBNxIzeDT5/Nd2DdUKKGxD7=DYKoeD=; partner=sem_pcbaidu5_153412; adv=ad_logid_url%3Dhttps%253A%252F%252Ftrace.51job.com%252Ftrace.php%253Fpartner%253Dsem_pcbaidu5_153412%2526ajp%253DaHR0cHM6Ly9ta3QuNTFqb2IuY29tL3RnL3NlbS9MUF8yMDIwXzEuaHRtbD9mcm9tPWJhaWR1YWQ%253D%2526k%253Dd946ba049bfb67b64f408966cbda3ee9%2526bd_vid%253D8450429807367679908%26%7C%26; slife=lastlogindate%3D20211203%26%7C%26; privacy=1638512010',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'scrapy_51job.middlewares.Scrapy51JobSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'scrapy_51job.middlewares.Scrapy51JobDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapy_51job.pipelines.Scrapy51JobPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Selenium, approach 2 (drive the search page itself: pick the cities, type the keyword, then page through the results):
import re
import os
import csv
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Append one record to the CSV file (write the header row only the first time)
def save_to_csv(a1, a2, a3, a4, a5):
    is_exist = os.path.exists("前程无忧.csv")
    with open('前程无忧.csv', 'a', encoding='GBK', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if not is_exist:
            writer.writerow(['职位名', '公司名', '工作地点', '薪资', '发布时间'])
        writer.writerow([a1, a2, a3, a4, a5])
# Create the WebDriver object
driver = webdriver.Chrome()
# Explicit-wait helper (10-second timeout)
wait = WebDriverWait(driver, 10)
# Open the 51job search page
driver.get('https://search.51job.com/')
# Wait 3 seconds for the page to load
time.sleep(3)
# Locate the "all cities" button and open the city picker
confirm_btn_1 = driver.find_element(By.CLASS_NAME, "allcity")
confirm_btn_1.click()
time.sleep(1)
# Note: the clicks below select four cities (td[1]..td[4]); add another td index for a fifth city if required
# Locate and select Shanghai
confirm_btn_2 = driver.find_element(By.XPATH,
    "//*[@id='popop']/div/div[2]/div[1]/div[2]/div/table/tbody[1]/tr/td[1]/em")
confirm_btn_2.click()
time.sleep(1)
# Locate and select Beijing
confirm_btn_3 = driver.find_element(By.XPATH,
    "//*[@id='popop']/div/div[2]/div[1]/div[2]/div/table/tbody[1]/tr/td[2]/em")
confirm_btn_3.click()
time.sleep(1)
# Locate and select Guangzhou
confirm_btn_4 = driver.find_element(By.XPATH,
    "//*[@id='popop']/div/div[2]/div[1]/div[2]/div/table/tbody[1]/tr/td[3]/em")
confirm_btn_4.click()
time.sleep(1)
# Locate and select Shenzhen
confirm_btn_5 = driver.find_element(By.XPATH,
    "//*[@id='popop']/div/div[2]/div[1]/div[2]/div/table/tbody[1]/tr/td[4]/em")
confirm_btn_5.click()
time.sleep(1)
# Locate and click the confirm button of the city picker
confirm_btn_6 = driver.find_element(By.XPATH, "//*[@id='popop']/div/div[3]/span")
confirm_btn_6.click()
time.sleep(1)
# Locate the search box and type the keyword "大数据"
search_bth = driver.find_element(By.ID, "keywordInput")
search_bth.send_keys("大数据")
time.sleep(1)
# Wait until the search button is clickable, then click it
confirm_bth = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#search_btn')))
confirm_bth.click()
time.sleep(1)
# Scrape the first 5 result pages
for page in range(5):
    # Grab the current result page's source
    html = driver.page_source
    # Parse the page source with a regular expression
    pattern = re.compile(r'class="jname at">(?P<name>.*?)</span>.*?'
                         r'<span class="time">(?P<time>.*?)发布</span>.*?'
                         r'<span class="sal">(?P<money>.*?)</span>.*?'
                         r'<span class="d at">(?P<place>.*?)</span>.*?'
                         r'class="cname at">(?P<company>.*?)</a>', re.S)
    # Match every job card on the page
    result = pattern.finditer(html)
    # Extract the five fields and append them to the CSV
    for i in result:
        a1 = i.group("name")
        a2 = i.group("company")
        a3 = i.group("place").split('|')[0].strip()
        a4 = i.group("money")
        a5 = i.group("time")
        save_to_csv(a1, a2, a3, a4, a5)
    # Wait 3 seconds, then click "next page"
    time.sleep(3)
    confirm_btn_7 = driver.find_element(By.XPATH,
        "/html/body/div[2]/div[3]/div/div[2]/div[4]/div[2]/div/div/div/ul/li[8]/a")
    confirm_btn_7.click()
    # Wait 3 seconds for the next page to load
    time.sleep(3)
print("Done. Data saved to 前程无忧.csv")
# Close the browser when finished
# driver.close()
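A quick sanity check that the CSV written above loads correctly before moving on to part 2 (it reuses the file name and GBK encoding chosen in save_to_csv):

import pandas as pd

check = pd.read_csv('前程无忧.csv', encoding='GBK')
print(check.shape)             # (number of rows, 5)
print(check.columns.tolist())  # ['职位名', '公司名', '工作地点', '薪资', '发布时间']
print(check.head())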
2. Data processing and analysis (30 points)
Use Python's pandas to read the "前程无忧.csv" file and perform the following processing and analysis:
(1) Clean the data: delete the rows whose 工作地点 (work location) is "异地招聘" (recruiting in another location);
(2) Process the 工作地点 column so that only the city is kept (e.g. 上海-浦东新区 becomes 上海), and convert the salary data to a single unit, normalizing everything to "万/月" (10,000 CNY per month). Save the processed data to "前程无忧_NEW.csv".
import pandas as pd
import re

# Read the raw data
file = pd.read_csv('前程无忧.csv', encoding='gbk')
# Drop rows with missing values
file = file.dropna()
# Drop rows whose work location is "异地招聘"
file = file[file['工作地点'] != '异地招聘']
# Keep only the city part of the work location (the first two characters, e.g. 上海-浦东新区 -> 上海)
new_area = []
for i in file['工作地点']:
    new_area.append(i[:2])
file['工作地点'] = new_area
# Reset the index after filtering
file = file.reset_index(drop=True)
# Salary column
salary = file['薪资']
# Cleaned minimum / maximum salaries, both in 万/月 (10,000 CNY per month)
minsalary = []
maxsalary = []
for i in range(len(salary)):
    # e.g. "2.5-3万/月": already in the target unit, just extract the two numbers
    if salary[i].find('-') != -1 and salary[i].find('万/月') != -1:
        sa = re.search(r'(?P<minsa>\d\.?\d?\d?)-(?P<maxsa>\d\.?\d?\d?)', salary[i])
        minsalary.append(float(sa['minsa']))
        maxsalary.append(float(sa['maxsa']))
    # e.g. "2.5-3千/月": divide by 10
    elif salary[i].find('-') != -1 and salary[i].find('千/月') != -1:
        sa = re.search(r'(?P<minsa>\d\.?\d?\d?)-(?P<maxsa>\d\.?\d?\d?)', salary[i])
        # round to two decimal places
        minsalary.append(round(float(sa['minsa']) / 10, 2))
        maxsalary.append(round(float(sa['maxsa']) / 10, 2))
    # e.g. "20-30万/年": divide by 12
    elif salary[i].find('-') != -1 and salary[i].find('万/年') != -1:
        sa = re.search(r'(?P<minsa>\d\.?\d?\d?)-(?P<maxsa>\d\.?\d?\d?)', salary[i])
        minsalary.append(round(float(sa['minsa']) / 12, 2))
        maxsalary.append(round(float(sa['maxsa']) / 12, 2))
    # e.g. "100万以上/年": divide by 12; there is no maximum, record 0
    elif salary[i].find('万以上/年') != -1:
        sa = re.search(r'(?P<minsa>\d\.?\d?\d?)\w+', salary[i])
        minsalary.append(round(float(sa['minsa']) / 12, 2))
        maxsalary.append(0)
    # e.g. "千以下/月": divide by 10; there is no minimum, record 0
    elif salary[i].find('千以下/月') != -1:
        sa = re.search(r'(?P<minsa>\d*\.?\d*)\w+', salary[i])
        minsalary.append(0)
        maxsalary.append(round(float(sa['minsa']) / 10, 2))
    # e.g. "1000元/天": assume 21 working days per month and divide by 10000; minimum recorded as 0
    elif salary[i].find('元/天') != -1:
        sa = re.search(r'(?P<maxsa>\d*)', salary[i])
        minsalary.append(0)
        maxsalary.append(round((float(sa['maxsa']) / 10000) * 21, 2))
if len(file) == len(minsalary) == len(maxsalary):
    print('Salary cleaning finished')
else:
    print('Salary cleaning produced a length mismatch')
# Add the minimum / maximum salary columns
file['最低薪资(万/月)'] = minsalary
file['最高薪资(万/月)'] = maxsalary
# Save the processed data
file.to_csv('前程无忧_new.csv', encoding='utf-8', index=False)
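As a quick check of the conversion rules above (the first two sample strings come from Table 1, the last two are hypothetical), the expected values in 万/月 are:
1.2-2万/月   -> min 1.2,  max 2.0   (already in the target unit)
4.2-6.5千/月 -> min 0.42, max 0.65  (千/月 divided by 10)
24-36万/年   -> min 2.0,  max 3.0   (万/年 divided by 12)
500元/天     -> min 0,    max 1.05  (500 / 10000 × 21 working days)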
3. Data visualization (20 points)
Use Python to read the relevant csv file and complete the following tasks:
(1) Use a line chart to show the 5 job titles with the most postings;
(2) Use a bar chart to show the top 5 cities by number of postings in the current search;
(3) Use histograms to show how the big-data salaries found in the current search are distributed across cities.
import matplotlib.pyplot as plt
import pandas as pd

# Display Chinese labels correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

data = pd.read_csv('前程无忧_new.csv', encoding='utf-8')
# Top 5 job titles by number of postings
count_job = data.groupby('职位名', as_index=False).count().sort_values('公司名', ascending=False).head(5)
# Top 5 cities by number of postings
area_job = data.groupby('工作地点', as_index=False).count().sort_values('职位名', ascending=False).head(5)

# Salary distribution across cities
# Mean minimum salary per city
min_sa = data['最低薪资(万/月)'].groupby(data['工作地点']).mean()
plt.hist(min_sa.values)
plt.title('城市大数据最低岗位薪资分布情况直方图')
plt.xlabel('薪资')
plt.ylabel('城市数量')
plt.show()
# Mean maximum salary per city
max_sa = data['最高薪资(万/月)'].groupby(data['工作地点']).mean()
plt.hist(max_sa.values)
plt.title('城市大数据最高岗位薪资分布情况直方图')
plt.xlabel('薪资')
plt.ylabel('城市数量')
plt.show()

# Data for the top-5-cities bar chart
area_name = area_job['工作地点']
job_sum = area_job['职位名']
# Data for the top-5-job-titles line chart
job_name = count_job['职位名']
job_count = count_job['公司名']

# Line chart: the 5 job titles with the most postings
plt.plot(job_name, job_count, marker='o')
plt.title('职位数量最多前5折线图')
plt.xlabel('职位名')
plt.ylabel('职位数量')
plt.grid()
plt.show()

# Bar chart: top 5 cities by number of postings
plt.bar(area_name, job_sum, color='r')
plt.title('职位发布数量前5城市')
plt.xlabel('城市名')
plt.ylabel('发布职位数量')
plt.show()