Practical Training Tasks for the "Web Crawler Application Technology" (《网络爬虫应用技术》) Course

1. Data collection (to be completed with the Scrapy framework, 40 points)
Visit https://www.51job.com/, enter "大数据" (big data) as the keyword, select 5 cities as the work location, and search for big-data positions. Use Python to collect the related job postings; each record should contain the job title (职位名), company name (公司名), work location (工作地点), salary (薪资) and posting date (发布时间). Save the collected data to a file named "前程无忧.csv" in the format shown in Table 1:
Table 1  Required file format
职位名 | 公司名 | 工作地点 | 薪资 | 发布时间
22989-大数据人工智能产品项目经理 | 深圳市腾讯计算机系统有限公司 | 深圳 |  | 08-07
大数据开发工程师 | 中国国际金融股份有限公司 | 北京 | 1.2-2万/月 | 08-07
大数据分析专员/助理 | 深圳歌华工艺饰品有限公司 | 深圳-南山区 | 4.2-6.5千/月 | 08-07
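
The long search URLs used in the scripts below are easier to read once decoded: the first path segment is the city-code list, and the keyword "大数据" is URL-encoded twice. A minimal sketch of how such a URL can be assembled (the five city codes are copied from the Scrapy spider URL below; which code maps to which city is not asserted here):

from urllib.parse import quote

# five city codes as they appear in the Scrapy spider's start URL below
city_codes = ["010000", "020000", "030200", "040000", "180200"]
keyword = quote(quote("大数据"))   # double URL-encoding: 大数据 -> %E5%A4%A7... -> %25E5%25A4%25A7...
page = 1
url = (f"https://search.51job.com/list/{'%252c'.join(city_codes)},000000,0000,00,9,99,"
       f"{keyword},2,{page}.html?lang=c&postchannel=0000")
print(url)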
Selenium method 1:

import re
import os
import csv
import time
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait

# Save one record to the CSV file (write the header row only when the file does not exist yet)
def save_to_csv(a1,a2,a3,a4,a5):
    is_exist = os.path.exists("前程无忧.csv")
    with open('前程无忧.csv', 'a', encoding='GBK', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if not is_exist:
            writer.writerow(['职位名','公司名','工作地点','薪资','发布时间'])
        writer.writerow([a1,a2,a3,a4,a5])
    # the "with" block already closes the file, no explicit close() is needed

# create the WebDriver object
driver = webdriver.Chrome()
# explicit-wait helper (10-second timeout); not actually used in this script
wait = WebDriverWait(driver, 10)


for page in range(10):
    # search-result URL for page page+1: five city codes plus the double-URL-encoded keyword "大数据"
    url = f"https://search.51job.com/list/190200%252c010000%252c020000%252c030200%252c040000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,{page+1}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
    driver.get(url)
    # wait 3 seconds for the page to render
    time.sleep(3)
    # grab the rendered page source
    html = driver.page_source
    # print(html)  # inspect the page source if needed

    # parse the page source with a regular expression
    pattern = re.compile(r'class="jname at">(?P<name>.*?)</span>.*?'
                         r'<span class="time">(?P<time>.*?)发布</span>.*?'
                         r'<span class="sal">(?P<money>.*?)</span>.*?'
                         r'<span class="d at">(?P<place>.*?)</span>.*?'
                         r'class="cname at">(?P<company>.*?)</a>', re.S)
    # run the pattern over the page source
    result = pattern.finditer(html)

    # extract and save the fields of each match
    for m in result:
        a1 = m.group("name")                          # job title
        a2 = m.group("company")                       # company name
        a3 = m.group("place").split('|')[0].strip()   # work location (city part only)
        a4 = m.group("money")                         # salary
        a5 = m.group("time")                          # posting date
        save_to_csv(a1,a2,a3,a4,a5)

print("over,数据已保存,文件名为:前程无忧.csv")
 #关闭浏览器
driver.close()
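
Note that the wait object created above is never used; the fixed time.sleep(3) calls can be replaced with an explicit wait. A minimal hedged sketch, assuming the job titles are rendered as <span class="jname at"> elements (inferred from the regex above):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get("https://search.51job.com/list/190200%252c010000%252c020000%252c030200%252c040000,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?lang=c&postchannel=0000")
# block until the job-card title spans are present (selector assumed), instead of sleeping a fixed time
wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "span.jname.at")))
html = driver.page_source
driver.quit()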



Scrapy method:
Spider file job51.py in the spiders directory:

import scrapy
import json
import jsonpath
from scrapy_51job.items import Scrapy51JobItem
class Job51Spider(scrapy.Spider):
    name = 'job51'
    allowed_domains = ['search.51job.com']
    start_urls = ['https://search.51job.com/list/010000%252c020000%252c030200%252c040000%252c180200,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,1.html?']

    page = 1
    base_url = 'https://search.51job.com/list/010000%252c020000%252c030200%252c040000%252c180200,000000,0000,00,9,99,%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,'

    def parse(self, response):
        # print('*'*50)
        content = json.loads(response.text)
        # job title, company name, work location, salary and posting date
        job_names = jsonpath.jsonpath(content,'$.engine_jds..job_name')
        company_names = jsonpath.jsonpath(content,'$.engine_jds..company_name')
        job_areas = jsonpath.jsonpath(content,'$.engine_jds..workarea_text')
        job_salarys = jsonpath.jsonpath(content,'$.engine_jds..providesalary_text')
        job_times = jsonpath.jsonpath(content,'$.engine_jds..issuedate')
        for i in range(len(job_names)):
            job_name = job_names[i]
            company_name = company_names[i]
            job_area = job_areas[i]
            job_salary = job_salarys[i]
            job_time = job_times[i]
            job = Scrapy51JobItem(job_name=job_name,company_name=company_name,job_area=job_area,job_salary=job_salary,job_time=job_time)
            print(job_name,company_name,job_area,job_salary,job_time)
            yield job
        # print('*' * 50)
        if self.page < 100:
            self.page = self.page + 1
            url = self.base_url+str(self.page)+'.html?'
            yield scrapy.Request(url=url,callback=self.parse)
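
The spider is normally started from the project root with the command scrapy crawl job51. An optional runner script (a common convenience, not part of the assignment) looks like this:

# run.py - place in the Scrapy project root and start with: python run.py
from scrapy.cmdline import execute

execute(["scrapy", "crawl", "job51"])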


items.py:

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class Scrapy51JobItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # job title, company name, work location, salary and posting date
    job_name = scrapy.Field()
    company_name = scrapy.Field()
    job_area = scrapy.Field()
    job_salary = scrapy.Field()
    job_time = scrapy.Field()


middlewares.py:

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class Scrapy51JobSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class Scrapy51JobDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


pipelines.py:

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

import csv


class Scrapy51JobPipeline:
    # open the output file when the spider starts and write the header row
    # (note: the assignment asks for the file name "前程无忧.csv"; rename here if the Scrapy output is the one submitted)
    def open_spider(self, spider):
        self.fp = open('job.csv', 'w', encoding='utf-8', newline='')
        self.csv_fp = csv.DictWriter(self.fp, fieldnames=['职位名','公司名','工作地点','薪资','发布时间'])
        self.csv_fp.writeheader()

    # write one item per row
    def process_item(self, item, spider):
        self.csv_fp.writerow({
            '职位名': item['job_name'],
            '公司名': item['company_name'],
            '工作地点': item['job_area'],
            '薪资': item['job_salary'],
            '发布时间': item['job_time']
        })
        return item

    # close the file when the spider finishes (Scrapy calls close_spider; the original "colse_file" was never invoked)
    def close_spider(self, spider):
        self.fp.close()
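
As an alternative to the hand-written pipeline above, Scrapy's built-in feed export could write the items to CSV directly from settings.py. A hedged sketch (note the header row would then use the English item field names rather than the Chinese headers required by the assignment):

# in settings.py (FEEDS syntax, Scrapy 2.1+)
FEEDS = {
    "job.csv": {"format": "csv", "encoding": "utf-8-sig"},
}
# column order of the exported CSV
FEED_EXPORT_FIELDS = ["job_name", "company_name", "job_area", "job_salary", "job_time"]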


settings.py:

# Scrapy settings for scrapy_51job project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapy_51job'

SPIDER_MODULES = ['scrapy_51job.spiders']
NEWSPIDER_MODULE = 'scrapy_51job.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapy_51job (+http://www.yourdomain.com)'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    # NOTE: the Cookie value below comes from one recorded browser session and will expire;
    # replace it with a fresh value copied from the browser's developer tools before running.
    'Cookie': 'BMAP_SECKEY2=e7ccd76a71cca7384bc9d56993ddbed2e19bbff4744b85e39bb3d65be30e7613e76ae0b8689ae7f5bb14207898aef6950e69432a9314fa542a239fa64bfb5b45539d1a80f930c11874cdf0fb5ac9908bd8649d09a624f58f45a25d4be79ebaf17704c8dcaf9fe4319210bbb78a143775dfbd5a68a0209a8af9c89d8cae7dd2fc432f86e81107e5397ab76a101807e41ef808693c3097f9526cce82eff18e4c18017f257eb1a17a4a9a97680080cc917b25af8918656f764560d35c2d2b62b97609ae2b3d91755804906b7deb81f9cdb2ad943f101fe8e424171dd48adb1ca94c94b60631d5170361a4655d86f3d7c2cf; guid=c95c679632d574c264011d8272e38ab6; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; ps=needv%3D0; 51job=cuid%3D199203233%26%7C%26cusername%3D7%252F2uZW9EGI8Si6%252Fb4YzgYg%252FmFzw1olQwhkbL45OIZy8%253D%26%7C%26cpassword%3D%26%7C%26cname%3D%26%7C%26cemail%3D%26%7C%26cemailstatus%3D0%26%7C%26cnickname%3D%26%7C%26ccry%3D.0%252Fs7R%252F1f62jE%26%7C%26cconfirmkey%3D%25241%2524gFJJQBqo%2524KbI6xiDNrh30ErbphPVop0%26%7C%26cautologin%3D1%26%7C%26cenglish%3D0%26%7C%26sex%3D%26%7C%26cnamekey%3D%25241%2524vdH21U3s%252412KBiwKGg0EFcB18pO95V%252F%26%7C%26to%3D01bcce1cbe1ea72ba5915abc8ce41a1b617b85b1%26%7C%26; search=jobarea%7E%60010000%2C020000%2C030200%2C040000%2C180200%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60010000%2C020000%2C030200%2C040000%2C180200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%B4%F3%CA%FD%BE%DD%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%B4%F3%CA%FD%BE%DD%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CE%EF%C1%AA%CD%F8%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch3%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FAjava%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch4%7E%60180200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%B4%F3%CA%FD%BE%DD%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21collapse_expansion%7E%601%7C%21; ssxmod_itna=eqfxgDuDyDniDt3q0d97tD9jQ3W/oQeqDKbQD/3IDnqD=GFDK40Eo7IvTDOYyoYPxPie7QnTaibftUniKqAFRk4GLDmKDyijLReDxrq0rD74irDDxD3Db3dDSDWKD9D048yRvLKGWDbx=Di4D+iTQDmqG0DDtHU4G2D7tnYDYbik0LtQU5sqhx57DnD0UYxBd4QZRPN=abUk0NqPIzD0pxQBF4fcxKQ0NWeqGyeKGuIdtlRS28HdNSlk+17DpEvGv5IGDai7G5QGxb/GxQflhqKXe1YAD=WDwLhveDGfqE2iD===; '
              'ssxmod_itna2=eqfxgDuDyDniDt3q0d97tD9jQ3W/oQeqDKbG9tVQDBdwrx7PP+BaFGFQ+yrzR5jqGQm+Ehe0xqQtiArKQ72YKUENY08vtwVIFcY98Bi0ppjX8Rb1HS5Wk8WhnyP5YyotOq/ae=9pBBm1jnybq4YO/04zUon1WgWdmPYNIraK22RDZEvCUhqIEga=ExAGlZFQ4oei2W5bB7ypOKAFU=7a7Kk1OmqqBEe6dlIhR=IF0pKteBPkKvh7h8QWi8WX23F2gBeX6FvCP8WddaHWxV6WPhU4dEHXktav8f2QOtD4m6EZiWsE4=WIQGFRvCGKaUaXeRvYpCRIcYWfYDbSwGDHcmYe5TISuWbq5EPzYB9Ab/UIEDoaSA2IWoZPoEcvWObfIRGTjAPRAPIS9Hiy=ZoyzYpfBYcIK=ncee6fhYwpTSAo1wn9KOCKaghiP4OUp2YbV/x9+xE0RRWcK=46Y5HiGyedjIery48Qo+ApH+d0tk107BeRDG7rQ0qtxDKurmBNxIzeDT5/Nd2DdUKKGxD7=DYKoeD=; partner=sem_pcbaidu5_153412; adv=ad_logid_url%3Dhttps%253A%252F%252Ftrace.51job.com%252Ftrace.php%253Fpartner%253Dsem_pcbaidu5_153412%2526ajp%253DaHR0cHM6Ly9ta3QuNTFqb2IuY29tL3RnL3NlbS9MUF8yMDIwXzEuaHRtbD9mcm9tPWJhaWR1YWQ%253D%2526k%253Dd946ba049bfb67b64f408966cbda3ee9%2526bd_vid%253D8450429807367679908%26%7C%26; slife=lastlogindate%3D20211203%26%7C%26; privacy=1638512010',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapy_51job.middlewares.Scrapy51JobSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'scrapy_51job.middlewares.Scrapy51JobDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'scrapy_51job.pipelines.Scrapy51JobPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


Selenium method 2:

import re
import os
import csv
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Save one record to the CSV file (write the header row only when the file does not exist yet)
def save_to_csv(a1,a2,a3,a4,a5):
    is_exist = os.path.exists("前程无忧.csv")
    with open('前程无忧.csv', 'a', encoding='GBK', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if not is_exist:
            writer.writerow(['职位名','公司名','工作地点','薪资','发布时间'])
        writer.writerow([a1,a2,a3,a4,a5])
    # the "with" block already closes the file, no explicit close() is needed

# create the WebDriver object
driver = webdriver.Chrome()
# explicit-wait helper (10-second timeout)
wait = WebDriverWait(driver, 10)
# open the 51job search page
driver.get('https://search.51job.com/')

# wait 3 seconds for the page to load
time.sleep(3)
# locate the "全部城市" (all cities) button
confirm_btn_1 = driver.find_element(By.CLASS_NAME, "allcity")
# click it to open the city picker
confirm_btn_1.click()

# wait 1 second
time.sleep(1)

# Note: only four cities are clicked below; the assignment asks for five, so a fifth city would be selected the same way.
# locate the 上海 (Shanghai) checkbox
confirm_btn_2 = driver.find_element(By.XPATH,
                                    "//*[@id='popop']/div/div[2]/div[1]/div[2]/div/table/tbody[1]/tr/td[1]/em")
# select 上海
confirm_btn_2.click()

# wait 1 second
time.sleep(1)

# locate the 北京 (Beijing) checkbox
confirm_btn_3 = driver.find_element(By.XPATH,
                                    "//*[@id='popop']/div/div[2]/div[1]/div[2]/div/table/tbody[1]/tr/td[2]/em")
# select 北京
confirm_btn_3.click()

# wait 1 second
time.sleep(1)

# locate the 广州 (Guangzhou) checkbox
confirm_btn_4 = driver.find_element(By.XPATH,
                                    "//*[@id='popop']/div/div[2]/div[1]/div[2]/div/table/tbody[1]/tr/td[3]/em")
# select 广州
confirm_btn_4.click()

# wait 1 second
time.sleep(1)

# locate the 深圳 (Shenzhen) checkbox
confirm_btn_5 = driver.find_element(By.XPATH,
                                    "//*[@id='popop']/div/div[2]/div[1]/div[2]/div/table/tbody[1]/tr/td[4]/em")
# select 深圳
confirm_btn_5.click()

# wait 1 second
time.sleep(1)

# locate the confirm button of the city picker
confirm_btn_6 = driver.find_element(By.XPATH, "//*[@id='popop']/div/div[3]/span")
# click confirm
confirm_btn_6.click()

# wait 1 second
time.sleep(1)

# locate the keyword search box
keyword_input = driver.find_element(By.ID, "keywordInput")
# type the keyword "大数据"
keyword_input.send_keys("大数据")

# wait 1 second
time.sleep(1)

# wait until the search button is clickable, then grab it
search_btn = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#search_btn')))
# click the search button
search_btn.click()

# wait 1 second
time.sleep(1)

for page in range(5):
    # grab the rendered page source
    html = driver.page_source
    # print(html)  # inspect the page source if needed

    # parse the page source with a regular expression
    pattern = re.compile(r'class="jname at">(?P<name>.*?)</span>.*?'
                         r'<span class="time">(?P<time>.*?)发布</span>.*?'
                         r'<span class="sal">(?P<money>.*?)</span>.*?'
                         r'<span class="d at">(?P<place>.*?)</span>.*?'
                         r'class="cname at">(?P<company>.*?)</a>', re.S)
    # run the pattern over the page source
    result = pattern.finditer(html)

    # extract and save the fields of each match
    for m in result:
        a1 = m.group("name")                          # job title
        a2 = m.group("company")                       # company name
        a3 = m.group("place").split('|')[0].strip()   # work location (city part only)
        a4 = m.group("money")                         # salary
        a5 = m.group("time")                          # posting date
        save_to_csv(a1,a2,a3,a4,a5)

    # wait 3 seconds
    time.sleep(3)

    # locate the "next page" link
    confirm_btn_7 = driver.find_element(By.XPATH,"/html/body/div[2]/div[3]/div/div[2]/div[4]/div[2]/div/div/div/ul/li[8]/a")
    # click "next page"
    confirm_btn_7.click()

    # wait 3 seconds for the next page to load
    time.sleep(3)

print("over,数据已保存,文件名为:前程无忧.csv")
# close the browser
# driver.close()
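
The hard-coded XPath for the next-page link is the most fragile part of the loop above. A hedged fragment (reusing the driver from the script above, and assuming the link text is 下一页) that stops cleanly when there is no next page:

# find_elements returns an empty list instead of raising when nothing matches
next_links = driver.find_elements(By.LINK_TEXT, "下一页")
if next_links:
    next_links[0].click()
else:
    print("no next page, stop paging")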


2. Data processing and analysis (30 points)
Use Python's pandas to read the "前程无忧.csv" file and perform the following processing and analysis steps:
(1) Clean the data by dropping the rows whose 工作地点 (work location) is "异地招聘";
(2) Process the 工作地点 column to keep only the city (e.g. 上海-浦东新区 becomes 上海), and convert the 薪资 (salary) data to a common unit of 万/月, e.g. 4.2-6.5千/月 becomes 0.42-0.65万/月. Save the processed data to "前程无忧_NEW.csv". A short vectorized sketch of these two steps follows, before the full script.
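
A minimal sketch of the two cleaning steps using vectorized pandas string methods (independent of the full script below; only the 千/月 salary format is shown, the other formats still need the branches handled in the script):

import pandas as pd

df = pd.read_csv("前程无忧.csv", encoding="gbk")
df = df[df["工作地点"] != "异地招聘"].copy()            # step (1): drop 异地招聘 rows
df["工作地点"] = df["工作地点"].str.split("-").str[0]   # step (2a): 深圳-南山区 -> 深圳
# step (2b), one format only: 4.2-6.5千/月 -> 0.42 / 0.65 (万/月)
rng = df["薪资"].str.extract(r"(\d+\.?\d*)-(\d+\.?\d*)千/月").astype(float) / 10
print(rng.head())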

import pandas as pd
import re

# read the raw data
file = pd.read_csv('前程无忧.csv',encoding='gbk')

# drop empty rows
file = file.dropna()

# drop the rows whose work location is "异地招聘"
file = file[file['工作地点'] != '异地招聘']

# keep only the city part of the work location, e.g. "深圳-南山区" -> "深圳"
new_area = []
for i in file['工作地点']:
    new_area.append(i.split('-')[0])
file['工作地点'] = new_area

# reset the index after dropping rows
file = file.reset_index(drop=True)

# salary column
salary = file['薪资']
# lists for the cleaned minimum / maximum salaries (unit: 万/月)
minsalary = []
maxsalary = []

for i in range(len(salary)):
    # formats like "2.5-3万/月": already in 万/月, take the values directly
    if salary[i].find('-') != -1 and salary[i].find('万/月') != -1:
        sa = re.search(r'(?P<minsa>\d\.?\d?\d?)-(?P<maxsa>\d\.?\d?\d?)',salary[i])
        minsalary.append(float(sa['minsa']))
        maxsalary.append(float(sa['maxsa']))
    # formats like "4.2-6.5千/月": divide by 10 to get 万/月
    elif salary[i].find('-') != -1 and salary[i].find('千/月') != -1:
        sa = re.search(r'(?P<minsa>\d\.?\d?\d?)-(?P<maxsa>\d\.?\d?\d?)',salary[i])
        # round to two decimal places
        minsalary.append(round(float(sa['minsa']) / 10,2))
        maxsalary.append(round(float(sa['maxsa']) / 10,2))
    # formats like "30-50万/年": divide by 12 to get 万/月
    elif salary[i].find('-') != -1 and salary[i].find('万/年') != -1:
        sa = re.search(r'(?P<minsa>\d\.?\d?\d?)-(?P<maxsa>\d\.?\d?\d?)',salary[i])
        # round to two decimal places
        minsalary.append(round(float(sa['minsa']) / 12,2))
        maxsalary.append(round(float(sa['maxsa']) / 12,2))
    # formats like "100万以上/年": divide by 12; there is no upper bound, record 0
    elif salary[i].find('万以上/年') != -1:
        sa = re.search(r'(?P<minsa>\d\.?\d?\d?)\w+',salary[i])
        # round to two decimal places
        minsalary.append(round(float(sa['minsa']) / 12,2))
        maxsalary.append(0)
    # formats like "8千以下/月": divide by 10; there is no lower bound, record 0
    elif salary[i].find('千以下/月') != -1:
        sa = re.search(r'(?P<minsa>\d*\.?\d*)\w+',salary[i])
        # round to two decimal places
        minsalary.append(0)
        maxsalary.append(round(float(sa['minsa']) / 10,2))
    # formats like "150元/天": assume 21 working days per month and divide by 10000 to get 万/月; record 0 as the minimum
    elif salary[i].find('元/天') != -1:
        sa = re.search(r'(?P<maxsa>\d*)',salary[i])
        minsalary.append(0)
        maxsalary.append(round((float(sa['maxsa']) / 10000) * 21,2))

# any salary format not matched above leaves the lists shorter than the DataFrame
if len(file) == len(minsalary) == len(maxsalary):
    print('薪资数据清洗完成')
else:
    print('数据清洗有误')

# add the new minimum / maximum salary columns
file['最低薪资(万/月)'] = minsalary
file['最高薪资(万/月)'] = maxsalary

# save the cleaned data
file.to_csv('前程无忧_new.csv',encoding='utf-8',index=False)
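
A quick optional check (a small sketch) that the saved file has the expected columns and cleaned city values:

import pandas as pd

check = pd.read_csv('前程无忧_new.csv', encoding='utf-8')
print(check[['工作地点', '最低薪资(万/月)', '最高薪资(万/月)']].head())
print(check['工作地点'].unique())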

3. Data visualization (20 points)
Use Python to read the data from the relevant CSV file and complete the following items.
(1) Use a line chart to show the 5 job titles with the most postings;
(2) Use a bar chart to show the top 5 cities by number of postings in the current search;
(3) Use a histogram to show the salary distribution of big-data positions across the different cities in the current search.

import matplotlib.pyplot as plt
import pandas as pd
# make matplotlib display Chinese characters correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
data = pd.read_csv('前程无忧_new.csv',encoding='utf-8')

# top 5 job titles by number of postings
count_job = data.groupby('职位名',as_index=False).count().sort_values('公司名',ascending=False).head(5)

# top 5 cities by number of postings
area_job = data.groupby('工作地点',as_index=False).count().sort_values('职位名',ascending=False).head(5)

# salary distribution of big-data positions across the different cities
# average minimum salary per city
min_sa = data['最低薪资(万/月)'].groupby(data['工作地点']).mean()
plt.hist(min_sa.values)
plt.title('城市大数据最低岗位薪资分布情况直方图')
plt.xlabel('薪资')
plt.ylabel('城市数量')
plt.show()
# average maximum salary per city
max_sa = data['最高薪资(万/月)'].groupby(data['工作地点']).mean()
plt.hist(max_sa.values)
plt.title('城市大数据最高岗位薪资分布情况直方图')
plt.xlabel('薪资')
plt.ylabel('城市数量')
plt.show()

# data for the top 5 cities by number of postings
area_name = area_job['工作地点']
job_sum = area_job['职位名']

# data for the top 5 job titles by number of postings
job_name = count_job['职位名']
job_count = count_job['公司名']

# line chart of the 5 job titles with the most postings
plt.plot(job_name,job_count,marker='o')
plt.title('职位数量最多前5折线图')
plt.xlabel('职位名')
plt.ylabel('职位数量')
plt.grid()
plt.show()

# bar chart of the top 5 cities by number of postings
plt.bar(area_name,job_sum,color='r')
plt.title('职位发布数量前5城市')
plt.xlabel('城市名')
plt.ylabel('发布职位数量')
plt.show()
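
For item (3), an alternative worth considering (a hedged sketch, not the required solution) is a per-city box plot, which shows the salary spread within each city directly instead of a histogram of per-city averages:

import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

data = pd.read_csv('前程无忧_new.csv', encoding='utf-8')
# one box per city, built from the per-posting minimum salaries
data.boxplot(column='最低薪资(万/月)', by='工作地点')
plt.suptitle('')
plt.title('各城市大数据岗位最低薪资分布(箱线图)')
plt.xlabel('城市')
plt.ylabel('薪资(万/月)')
plt.show()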