
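"""Scrape job postings from 51job: fetch a search-results page with requests,
parse the job list embedded as JSON in an inline <script> tag, then open each
job detail page with Selenium and extract fields (job title, company, location,
requirements) via XPath."""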
import json
import time
import requests
from lxml import etree
from selenium import webdriver
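# 51job encodes a search as prefix + keyword + middle + page number + suffix;
# the pieces below are concatenated in run() to build each result-page URL.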
key = '0122'  # job category code used as the search keyword
url_q = "https://search.51job.com/list/010000%252c020000%252c030200%252c040000%252c080200,000000,"
url_z = ",01,9,99,+,2,"
url_h = ".html?lang=c&postchannel=0000&workyear=01&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
    'Connection': 'close',
    'Host': 'search.51job.com',
}
# Fetch one search-results page and scrape each job detail page it lists
def spider(base_url):
    re1 = requests.get(base_url, headers=headers, proxies={'http': 'http://tps131.kdlapi.com'}, timeout=(5, 10))
    html1 = etree.HTML(re1.text)
    # The job list is embedded as JSON in an inline <script>, assigned to window.__SEARCH_RESULT__
    divs = html1.xpath('//script[@type = "text/javascript"]/text()')[0].replace('window.__SEARCH_RESULT__ = ', "")
    js = json.loads(divs)
    url2 = []
    for jd in js['engine_jds']:
        # Keep only links that point at the standard job-detail host
        if jd['job_href'][0:22] == "https://jobs.51job.com":
            url2.append(jd['job_href'])
        else:
            print("Abnormal URL, discarded")
    print("Successfully extracted " + str(len(url2)) + " job listings")
    # Reuse a single browser instance for all detail pages
    driver = webdriver.Chrome()
    for i in url2:
        print(i)
        driver.get(i)
        time.sleep(0.1)  # brief pause for the page to render; increase if pages load slowly
        selector = etree.HTML(driver.page_source)
        # 51job shows this verification prompt when it detects automated access
        temp = selector.xpath("/html/body/div[1]/div[1]/div/div[2]/p/text()")
        if temp and temp[0] == "为保证您的正常访问,请进行如下验证: ":
            print("Page locked by anti-bot verification, skipping")
            continue
        jobname = selector.xpath("/html/body/div[3]/div[2]/div[2]/div/div[1]/h1/text()")[0]  # job title
        print(jobname)
        gsname = selector.xpath("/html/body/div[3]/div[2]/div[2]/div/div[1]/p[1]/a[1]/text()")[0]  # company name
        print(gsname)
        wz = selector.xpath("/html/body/div[3]/div[2]/div[2]/div/div[1]/p[2]/text()")[0]  # location line
        print(wz)
        xuqiu = selector.xpath("/html/body/div[3]/div[2]/div[3]/div[1]/div/p/text()")  # job requirements
        print(xuqiu)
    driver.quit()
# Crawl multiple result pages
def run():
    for page in range(1, 2):
        print("Fetching page " + str(page))
        url = url_q + key + url_z + str(page) + url_h
        spider(url)

if __name__ == '__main__':
    run()
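# Prerequisites: webdriver.Chrome() needs a ChromeDriver matching the installed
# Chrome on PATH, and the proxy host tps131.kdlapi.com must be reachable,
# otherwise requests.get() will raise a connection error.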