My crawler stops working after two or three pages
import requests
import re
import random
import json
import csv
import time
import ast
from selenium import webdriver

csv_file = open("前程无忧.csv", mode='a', encoding='utf-8', newline='')
csvwriter = csv.writer(csv_file)

# User-Agent pool to rotate per request
user_agent = [
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
]

# Load the proxy list once (one dict literal per line) instead of re-reading
# IP.txt on every page; ast.literal_eval is a safer replacement for eval
with open("IP.txt", "r") as ip_file:
    item = [ast.literal_eval(line.strip()) for line in ip_file if line.strip()]

# Create ONE browser and reuse it. The original `driver = webdriver.Chrome`
# never called the constructor, so every later `driver()` call opened a new,
# never-closed Chrome window (five per page), which is the most likely
# reason the crawl died after two or three pages.
driver = webdriver.Chrome()
driver.implicitly_wait(10)

for i in range(105, 2000):
    # note: the original URL had mojibake (`°reefrom`) where `&degreefrom` belongs
    url = f"https://search.51job.com/list/000000,000000,0000,00,9,99,+,2,{i}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
    driver.get(url)
    # resize the window a few times to trigger lazy-loaded content
    driver.set_window_size(1000, 800)
    time.sleep(1)
    driver.set_window_size(1200, 800)
    time.sleep(1)
    driver.set_window_size(1000, 800)
    time.sleep(2)

    proxies = random.choice(item)  # pick a random proxy
    headers = {
        "User-Agent": random.choice(user_agent),  # pick a random User-Agent
    }
    resp = requests.get(url, headers=headers, proxies=proxies)
    print(proxies, headers)
    print(resp.text)

    # Parse the embedded JSON. The non-greedy (.*?) needs a closing anchor
    # (</script>), otherwise it only ever matches an empty string; adjust the
    # marker if the page embeds it under another name (e.g. window.__SEARCH_RESULT__)
    html_data = re.findall(r"window\.SEARCH_RESULT = (.*?)</script>", resp.text, re.S)[0]
    json_data = json.loads(html_data)  # convert to a dict
    for index in json_data['engine_jds']:
        print(index)
        try:
            dit = {
                'id': index['jobid'],
                '职业名称': index['job_name'],
                '薪资水平': index['providesalary_text'],
                '招聘单位': index['company_name'],
                '工作地点': index['workarea_text'],
                '工作经验': index['attribute_text'][1],
                '学历要求': index['attribute_text'][2],
                '福利待遇': index['jobwelf'],
                '子链接': index['job_href'],
                '发布时间': index['issuedate']
            }
        except IndexError:  # some postings omit the education entry in attribute_text
            dit = {
                'id': index['jobid'],
                '职业名称': index['job_name'],
                '薪资水平': index['providesalary_text'],
                '招聘单位': index['company_name'],
                '工作地点': index['workarea_text'],
                '工作经验': index['attribute_text'][1],
                '学历要求': " ",
                '福利待遇': index['jobwelf'],
                '子链接': index['job_href'],
                '发布时间': index['issuedate']
            }
        csvwriter.writerow(dit.values())
    time.sleep(2)
    resp.close()

driver.quit()
csv_file.close()
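Another likely culprit: free proxies from a text file go stale quickly, and a dead proxy will hang or kill the request mid-crawl. A minimal sketch that filters the pool before crawling; `is_alive` and the probe URL are illustrative assumptions (not part of the original code), and `item` is the proxy list loaded above:

import requests

def is_alive(proxy, timeout=5):
    # probe the proxy with a cheap request; treat any network error as dead
    try:
        requests.get("https://www.baidu.com", proxies=proxy, timeout=timeout)
        return True
    except requests.RequestException:
        return False

item = [p for p in item if is_alive(p)]  # keep only responsive proxies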
To crawl all of the pages, try using a session: requests.Session.
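A session reuses the TCP connection and cookies across pages, and you can mount automatic retries with backoff onto it. A minimal sketch, reusing the `url`, `user_agent`, and `item` variables from the question's code; the retry numbers are arbitrary assumptions:

import random
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retry = Retry(total=3, backoff_factor=2, status_forcelist=[429, 500, 502, 503])
session.mount("https://", HTTPAdapter(max_retries=retry))

# rotate the User-Agent and proxy per request, as in the original code
session.headers["User-Agent"] = random.choice(user_agent)
resp = session.get(url, proxies=random.choice(item), timeout=10)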
You import both of the two below; which one do you actually want to crawl with?
import requests
from selenium import webdriver
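If you stick with Selenium, you don't need the second requests.get at all: the browser already holds the rendered page, so parse driver.page_source directly. A minimal sketch; the regex marker is copied from the question and may need adjusting to whatever the page actually embeds:

import re
import json
import time
from selenium import webdriver

driver = webdriver.Chrome()
driver.implicitly_wait(10)
for page in range(1, 4):
    driver.get(f"https://search.51job.com/list/000000,000000,0000,00,9,99,+,2,{page}.html")
    time.sleep(2)  # give the page's JS time to finish rendering
    match = re.search(r"window\.SEARCH_RESULT = (.*?)</script>", driver.page_source, re.S)
    if match:
        for job in json.loads(match.group(1))['engine_jds']:
            print(job['job_name'], job['providesalary_text'])
driver.quit()

This also sidesteps the proxy problem entirely, since only the one browser connection touches the site.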