"""Open the BOSS Zhipin job-listing page in Chrome and print the rendered HTML.

Fixes over the pasted original: the duplicated/interleaved paste is removed
(the two copies were fused at `time.sleep(5from selenium import webcodriver`),
the unterminated URL string (`...%22)` — a URL-encoded quote) is closed
properly, and the driver is quit so the Chrome process is not leaked.
"""
from selenium import webdriver
import time

driver = webdriver.Chrome()
try:
    driver.get("https://www.zhipin.com/web/geek/job?city=101020100")
    # Crude fixed wait for the JS-rendered job list; WebDriverWait on a
    # job-card locator would be more robust.
    time.sleep(5)
    print(driver.page_source)
finally:
    driver.quit()  # always release the browser process, even on error
/Users/alanna/PycharmProjects/pythonProject4/venv/bin/python "/Users/alanna/Library/Application Support/JetBrains/PyCharmCE2022.1/scratches/boss.PY.py"
/Users/alanna/PycharmProjects/pythonProject4/venv/bin/python: can't open file '/Users/alanna/Library/Application Support/JetBrains/PyCharmCE2022.1/scratches/boss.PY.py': [Errno 2] No such file or directory
需求:抓取招聘网站的关键信息。是否可以远程指导,直到成功抓取?
完全指导你能成功抓取招聘数据?还是只需要大概的?
boss.PY.py': [Errno 2] No such file or directory
没找到这个文件,后缀不对吧。重新创建一个文件。
你是要模拟登录直接抓取?
No such file or directory 错误是没找到boss.PY.py这个文件,
应该是你编辑器的问题,编辑器把boss.py文件,错认为了boss.PY.py
你重启下编辑器,重新创建一个py文件试试。
报错告诉你,不能打开boss.PY.py,没有boss.PY.py这个文件。要么你重新创建一个放在代码根目录下,要么重启pycharm试试。而且从你给的信息中也无法判断你是哪里用到了boss.PY.py这个文件,我猜你代码都没写对(你自己看你第五行那是啥),你至少要直观的告诉大家你的具体需求
可以把需求和使用方法说一下吗?
利用requests库的get函数获取招聘信息。需要通过浏览器控制台获取两个东西:一个是接口url,另一个是header格式,最主要的是header里的cookie。登录网站,地址:https://www.zhipin.com/web/geek/job?city=101020100 。最好是登录一下,否则访问几次后就禁止访问了。按F12打开浏览器控制台,切换到Network页。
"""Scrape BOSS Zhipin job listings via its JSON search endpoint, save to CSV.

The site rate-limits aggressively: one cookie/token set is only good for a
few requests, so pages are fetched in small batches (e.g. 1-4, then 5-8,
then 9-10), refreshing the cookie between batches.  Results are accumulated
per column and written to info.csv.

Fixes over the pasted original: loop-body indentation restored (the paste
had none, so it did not parse); '&experience=°ree=' in the URL restored to
'&experience=&degree=' (the '&deg' had been HTML-entity-decoded into '°');
bare `except:` narrowed; per-field append replaced with `dict.get` so every
column list stays the same length (the original could feed unequal-length
lists to `pd.DataFrame`, which raises ValueError).
"""
import requests
import pandas as pd
import time

# Job-record fields to keep from each entry of zpData.jobList.
columns = ['jobName', 'brandName', 'brandIndustry', 'brandStageName',
           'brandScaleName', 'bossName', 'bossTitle', 'salaryDesc',
           'jobDegree', 'jobExperience', 'skills', 'welfareList',
           'cityName', 'areaDistrict', 'businessDistrict']
data = {c: [] for c in columns}

for page in range(1, 5):  # each fresh cookie allows ~4 requests
    time.sleep(1)  # throttle to reduce the chance of being blocked
    url = ("https://www.zhipin.com/wapi/zpgeek/search/joblist.json"
           "?scene=1&query=&city=101020100&experience=&degree=&industry="
           "&scale=&stage=&position=&salary=&multiBusinessDistrict="
           f"&page={page}&pageSize=30")
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
               'accept': 'application/json, text/plain, */*',
               'accept-encoding': 'gzip, deflate, br',
               'accept-language': 'zh-CN,zh;q=0.9',
               # Cookie + token pair must be re-captured from the browser
               # (F12 -> Network) whenever the site starts rejecting requests.
               'cookie': 'wd_guid=f6e3b6f9-6996-4ece-b1fc-3825c87035c0; historyState=state; _bl_uid=pnl8m56zcp72pL7e2tydp2wwsmnC; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1657260677,1657337818; __fid=bc12a2cc2da625e2e578196657bae609; toUrl=/; wt2=DOLRftl7Cw3aft1wC7_WPOpArD0UBrqygNrVJS36QNuQNxFNdrPmQ9sR-VX0twnkNe99dbp5eiPDjMwoAGNMXsg~~; wbg=0; JSESSIONID=3E292C81981F7901A9893A9B8EE312AC; lastCity=101020100; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1657359864; __c=1657337818; __l=l=%2Fwww.zhipin.com%2Fweb%2Fgeek%2Fjob%3Fcity%3D101020100%26page%3D6&r=&g=&s=3&friend_source=0&s=3&friend_source=0; __a=53526745.1657260676.1657260676.1657337818.61.2.57.61; geek_zp_token=V1RN4vGez62FZtVtRvyhobLym36DzRxC8~; __zp_stoken__=4013dUBBzTWZiXEExFDkREU5SWGVxBWgoV3EeaB4hPyJ1KUoiIgkITmgqLjQPLxM9D0FENyoac3h4RVdxen5TAkxtWnpaV2UhPnc2UAh0SGcJSWotagt6bAtaNgtzYn8oOGxoN2MwOlYXDGhJGRo1NwIVOioYbldfJF80VTg6Rg%3D%3D',
               'referer': f'https://www.zhipin.com/web/geek/job?city=101020100&page={page}',
               'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
               'sec-ch-ua-mobile': '?0',
               'sec-ch-ua-platform': 'Windows',
               'sec-fetch-dest': 'empty',
               'sec-fetch-mode': 'cors',
               'sec-fetch-site': 'same-origin',
               'token': '3PKr1RJWchVHl19s',
               'traceid': '9303CC86-B313-4DBD-9420-7F1C7F5ECD7C',
               'x-requested-with': 'XMLHttpRequest',
               'zp_token': 'V1RN4vGez62FZtVtRvyhobLyq17jvSxiQ~'}
    response = requests.get(url=url, headers=headers)
    payload = response.json()
    print(payload)
    try:
        job_list = payload['zpData']['jobList']
    except (KeyError, TypeError):
        # Cookie/token expired or the response shape changed — skip this page.
        print('error')
        continue
    for job in job_list:
        for c in columns:
            # .get yields None for absent fields, keeping all column lists
            # equal length so DataFrame construction cannot fail.
            data[c].append(job.get(c))

df = pd.DataFrame(data, columns=columns)
df.to_csv('info.csv', encoding='gbk')  # GBK suits Excel on Chinese Windows
无头浏览器里面有支持通过xpath定位到元素,然后执行click等命令,也就是说非必要不要用POST/GET,因为不知道是不是带token验证等
鉴于其他答主说了cookie的问题,这里说下怎么拿数据:
首先数据是被get来的,即使网页用到了ajax加密,也仍然会有get把内容请求回来,而且多半是json或frame网页之类的,通过F12可查
其次是API请求头的构造问题:为了保险,请把F12里看到的请求头全部加上,因为这个网址我试过,查得很严。平时其他网站
'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': 'Windows',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin'
这种头基本上其他网站根本不管,基本上只要referer origin host及cookie type之类的
其次,POST是表单 Query是查询 这两种,由于很多人混淆这个概念成一个,导致构造时老是不成功
POST表单就是在post 流上加个dictionary,然后看他需不需要 URL编码
由于我个人是neter,C# 写爬虫的过程很清晰,你应该看得懂
// NOTE(review): fragment pasted from a larger C# method — the enclosing
// method signature is not shown (the bare `return;` below proves this is a
// method body). Flow: drive a PhantomJS/Selenium browser to the login page,
// collect its cookies, prompt the user for credentials + captcha via a
// dialog, fill and submit the login form, then wait for login to complete.
var url = "https://xxxxxxxxxxxxxx";
IWebDriver driver = new PhantomJSDriver(GetPhantomJSDriverService(), new()
{
PageLoadStrategy = PageLoadStrategy.Normal,
AcceptInsecureCertificates = true,
BrowserVersion = "v101.2200.65.2214",
PlatformName = "Microsoft Edge",
UnhandledPromptBehavior = UnhandledPromptBehavior.Ignore
}, new(0, 0, 30));
driver.Navigate().GoToUrl(url);
Cookie jeesitcookie = null;
string allCookies = "";
// Walk every cookie the site set: remember the jeesite session cookie
// separately, and fold all of them into one "name=value;" Cookie header.
var currentCookies = driver.Manage().Cookies;
currentCookies.AllCookies.ToList().ForEach(cookie =>
{
if (cookie.Name.ToLower() == "jeesite.session.id")
{
jeesitcookie = cookie;// extract the jeesite session cookie
}
allCookies += cookie.Name + "=" + cookie.Value + ";";
});
// Without the session cookie the later authenticated requests cannot work.
if (jeesitcookie == null)
{
Console.WriteLine("not found cookie");
return;
}
//Console.WriteLine(allCookies);
//ocr
// Dialog that displays the captcha (hence the "ocr" note above) and
// collects username / password / captcha code from the user.
var forms = new CapCodeEnter
{
Cookies = allCookies
};
forms.ShowDialog();
//Console.WriteLine("输入账户:");
driver.FindElement(By.XPath("/html/body/div[3]/form[2]/div[1]/input[1]")).SendKeys(forms.UserName);
//Console.WriteLine("输入密码:");
driver.FindElement(By.XPath("/html/body/div[3]/form[2]/div[1]/input[3]")).SendKeys(forms.Password);
//Console.WriteLine("code:");
driver.FindElement(By.XPath("/html/body/div[3]/form[2]/div[1]/div[1]/input")).SendKeys(forms.Code);
//MessageBox.Show(forms.Result);
driver.FindElement(By.XPath("/html/body/div[3]/form[2]/div[1]/input[4]")).Click();
Console.WriteLine();
// Poll until the page shows the logout link ("退出登录"), i.e. login
// succeeded. NOTE(review): no timeout — this can spin forever on failure.
while (!driver.PageSource.Contains("退出登录", StringComparison.CurrentCultureIgnoreCase))
{
Thread.Sleep(200);
}
//MessageBox.Show("登陆成功");
Console.WriteLine("开始抓取");
// From here on: reuse the captured cookies for the actual scraping.
// I use the C# RestSharp library; it returns a request object that is then
// configured for the subsequent POST/GET calls.
/// <summary>
/// Build a RestSharp request pre-loaded with the captured session cookie and
/// the browser-like headers (Host / Referer / User-Agent) the site expects.
/// </summary>
/// <param name="jeesiteCookie">Raw Cookie header value harvested from the browser session.</param>
/// <returns>A <see cref="RestRequest"/> ready for further GET/POST configuration.</returns>
public static RestRequest GetRequest(string jeesiteCookie)
{
    var request = new RestRequest();
    request.AddHeader("Cookie", jeesiteCookie);
    request.AddHeader("Host", "cardcenter.bjedu.cn");
    request.AddHeader("Referer", "https://cardcenter.bjedu.cn/becom-sccms-web/a/login");
    request.AddHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.47");
    return request;
}
虽然是C# 的,但是这个网站是锐数加密的,大概逻辑看明白了就行
核心思路是:逻辑太复杂的就上无头浏览器,模拟人类行为;简单的就直接找API,以不变应万变。token的构造是js所为,可以换个思路,比如用无头浏览器click文章链接,然后通过xpath+正则进行数据提取。