My final assignment is to scrape job-listing data from Boss直聘. I put the code below together from what we covered in class plus examples I found online, but the CSV it produces is still empty and I can't find the problem. Could anyone take a look and point out where it goes wrong? Thanks!
```python
import requests
import csv
import pandas as pd
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
cookie="__zp__pub__=; lastCity=101110100; __zp_stoken__=ddfaaC1oxcwcya3oOXUNWPSV5Vn9mfABYVFAhHlNZG1cvMgZMNnVnHHNKSzcYaQtJQSADE3tSZTopfT5ka30GRSlaU3c6ckVVaR4eBiMvDT8aR38lBkcNWAI8UVctTitNAxlGbCBbZz9gTSU%3D; t=Oh8LmQ5pyMOhjqah; wt=Oh8LmQ5pyMOhjqah; sid=sem_pz_bdpc_dasou_title; __c=1591769829; __g=sem_pz_bdpc_dasou_title; __l=l=%2Fwww.zhipin.com%2Fxian%2F%3Fsid%3Dsem_pz_bdpc_dasou_title&r=https%3A%2F%2Fsp0.baidu.com%2F9q9JcDHa2gU2pMbgoY3K%2Fadrc.php%3Ft%3D06KL00c00fDdiHC088qh0KZEgsZ9X8KX00000ZoOx7C00000UkfexZ.THdBULP1doZA80K85yF9pywd0ZnquAu9rjTdnj6snj0YrHc4mfKd5Hmkwbnsn1RzfbmLn1mvfRPArRf4wjnsfRcvf1wAn1bd0ADqI1YhUyPGujY1n1f1PWTsnHckFMKzUvwGujYkP6K-5y9YIZK1rBtEILILQMGCpgKGUB4WUvYE5LPGujd1uydxTZGxmhwsmdqbmgPEINqYpgw_ufKWThnqn1nYrHD%26tpl%3Dtpl_11534_22672_17382%26l%3D1518141306%26attach%3Dlocation%253D%2526linkName%253D%2525E6%2525A0%252587%2525E5%252587%252586%2525E5%2525A4%2525B4%2525E9%252583%2525A8-%2525E6%2525A0%252587%2525E9%2525A2%252598-%2525E4%2525B8%2525BB%2525E6%2525A0%252587%2525E9%2525A2%252598%2526linkText%253DBOSS%2525E7%25259B%2525B4%2525E8%252581%252598%2525E2%252580%252594%2525E2%252580%252594%2525E6%252589%2525BE%2525E5%2525B7%2525A5%2525E4%2525BD%25259C%2525EF%2525BC%25258C%2525E6%252588%252591%2525E8%2525A6%252581%2525E8%2525B7%25259F%2525E8%252580%252581%2525E6%25259D%2525BF%2525E8%2525B0%252588%2525EF%2525BC%252581%2526xp%253Did(%252522m3343670121_canvas%252522)%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FDIV%25255B1%25255D%25252FH2%25255B1%25255D%25252FA%25255B1%25255D%2526linkType%253D%2526checksum%253D140%26ie%3DUTF-8%26f%3D8%26tn%3Dbaidu%26wd%3DBoss%25E7%259B%25B4%25E8%2581%2598%26oq%3DBoss%25E7%259B%25B4%25E8%2581%2598%26rqlang%3Dcn&g=%2Fwww.zhipin.com%2Fxian%2F%3Fsid%3Dsem_pz_bdpc_dasou_title&friend_source=0&friend_source=0; _bl_uid=I3k1mb9y8d8y37ngjsvq4eevzRaj; __zp_seo_uuid__=8792ec29-03f0-439e-86f2-1b86c2c55784; __a=40025213.1591065226.1591691731.1591769829.57.7.3.3"
def get_one_page(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'}
        html = requests.get(url, headers=headers)
        if html.status_code == 200:
            return html.text
        return None
    except RequestException:
        return None
result_all = []  # used to store the scraped rows
def parse_one_page(html):
    soup = BeautifulSoup(html, 'lxml')
    companies = soup.find_all('div', 'job-primary', True)
    for com in companies:
        res = parse_one_company(com)
        result_all.append(res)
def parse_one_company(comp):
    result = []
    company_soup = comp.find('div', class_='info-company')
    com_desc = company_soup.find('p').text
    primary_soup = comp.find('div', class_='info-primary')
    job_name = primary_soup.find('div').text
    salary = primary_soup.find('span').text
    requirement = primary_soup.find('p').text
    result.append(com_desc)
    result.append(job_name)
    result.append(salary)
    result.append(requirement)
    return result
def parse_all_page(num, offset):
    url1 = 'https://www.zhipin.com/c101280100/h_101280100/?query=数据分析师&page='+str(offset)+'&ka=page-'+str(offset)  # Guangzhou
    url2 = 'https://www.zhipin.com/c101280600/h_101280600/?query=数据分析师&page='+str(offset)+'&ka=page-'+str(offset)  # Shenzhen
    url3 = 'https://www.zhipin.com/c101010100/h_101010100/?query=数据分析师&page='+str(offset)+'&ka=page-'+str(offset)  # Beijing
    url4 = 'https://www.zhipin.com/c101020100/h_101020100/?query=数据分析师&page='+str(offset)+'&ka=page-'+str(offset)  # Shanghai
    urldict = {'1': url1, '2': url2, '3': url3, '4': url4}
    html = get_one_page(urldict[str(num)])
    parse_one_page(html)
if __name__ == '__main__':
    for j in range(1, 5):
        for i in range(1, 11):
            parse_all_page(j, i)
    file = pd.DataFrame(result_all, columns=['公司信息', '岗位', '薪水', '其他'])
    # encoding='utf_8_sig' avoids garbled characters when the saved CSV is opened
    file.to_csv('Bosszhiping_four_city.csv', mode='a', index=True, encoding='utf_8_sig')
```
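Given the symptom (the script finishes but the CSV stays empty), it can help to check each intermediate step before digging into the parsing logic. Below is a minimal diagnostic sketch, reusing get_one_page, parse_one_page, and result_all from the script above, that prints what a single request actually returns; the Beijing URL is just an example.

```python
# Fetch one listing page and inspect what actually comes back.
# Assumes get_one_page / parse_one_page / result_all from the script above.
test_url = 'https://www.zhipin.com/c101010100/h_101010100/?query=数据分析师&page=1&ka=page-1'
html = get_one_page(test_url)

if html is None:
    print('Request failed or returned a non-200 status code.')
else:
    print('Length of returned HTML:', len(html))
    # The real listing page contains "job-primary" blocks;
    # the anti-bot "请稍后" placeholder page does not.
    print('Contains job-primary blocks:', 'job-primary' in html)
    parse_one_page(html)
    print('Rows parsed so far:', len(result_all))
```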
Are you sure you're actually getting the listing pages? When I tested your code, every response I got back was the "请稍后" (please wait) placeholder page. Scraping Boss直聘 requires sending your own cookies with the request, and the cookies change every time, so you'll have to work out how to obtain them from the site's JS file yourself. The code below does the scraping and saves the results to a CSV file; before running it, replace the cookie with your own and delete the CSV file already in your directory.
```python
import requests
import csv
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        cookies = "__zp__pub__=; _uab_collina=159178398373799871604155; lastCity=100010000; __c=1591783980; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1591783980; __l=l=%2Fwww.zhipin.com%2Fc101280100%2Fh_101280100%2F%3Fquery%3D%25E6%2595%25B0%25E6%258D%25AE%25E5%2588%2586%25E6%259E%2590%25E5%25B8%2588&r=&friend_source=0&friend_source=0; __a=942990.1591783980..1591783980.8.1.8.8; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1591834512; __zp_stoken__=226aaABFRYnVuOBpEfiQLP1IsSndBHmsQQFh2eBYxWmU7UHwcXAAhW1wdHWB%2BIVo0HmQDGF1EawMDHH0Edz8%2BIXY2XRwMFQNge0YlZzI2CHdFAzBuZVQVbTVzdV4tDzp%2FXF1GZwY%2FSHRLBg0%3D"
        # split on the first '=' only, so cookie values that themselves contain '=' stay intact
        cook_dict = {cookie.split('=', 1)[0]: cookie.split('=', 1)[1] for cookie in cookies.split('; ')}
        html = requests.get(url, headers=headers, cookies=cook_dict)
        if html.status_code == 200:
            return html.content.decode('utf-8')
        return None
    except RequestException:
        return None
result_all = []  # used to store the scraped rows
def parse_one_page(html):
    soup = BeautifulSoup(html, 'lxml')
    companies = soup.find_all('div', 'job-primary', True)
    for com in companies:
        res = parse_one_company(com)
        result_all.append(res)
def parse_one_company(comp):
    result = []
    company_soup = comp.find('div', class_='info-company')
    com_desc = company_soup.find('a').text
    job_soup = comp.find('div', class_='job-title')
    job_name = job_soup.find('a').text
    salary_soup = comp.find('div', class_='job-limit clearfix')
    salary = salary_soup.find('span').text
    requirement = salary_soup.find('p').text
    result.append(com_desc)
    result.append(job_name)
    result.append(salary)
    result.append(requirement)
    return result
def parse_all_page(num, offset):
    url1 = 'https://www.zhipin.com/c101280100/h_101280100/?query=数据分析师&page=' + str(offset) + '&ka=page-' + str(offset)  # Guangzhou
    url2 = 'https://www.zhipin.com/c101280600/h_101280600/?query=数据分析师&page=' + str(offset) + '&ka=page-' + str(offset)  # Shenzhen
    url3 = 'https://www.zhipin.com/c101010100/h_101010100/?query=数据分析师&page=' + str(offset) + '&ka=page-' + str(offset)  # Beijing
    url4 = 'https://www.zhipin.com/c101020100/h_101020100/?query=数据分析师&page=' + str(offset) + '&ka=page-' + str(offset)  # Shanghai
    urldict = {'1': url1, '2': url2, '3': url3, '4': url4}
    html = get_one_page(urldict[str(num)])
    if html:  # skip pages where the request failed or was rejected
        parse_one_page(html)
if __name__ == '__main__':
    for j in range(1, 5):
        for i in range(1, 11):
            parse_all_page(j, i)
    print(result_all)
    file = pd.DataFrame(result_all, columns=['公司信息', '岗位', '薪水', '其他'])
    # encoding='utf_8_sig' avoids garbled characters when the saved CSV is opened
    file.to_csv('Bosszhiping_four_city.csv', mode='a', index=True, encoding='utf_8_sig')
```
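A note on why you should delete the old CSV before re-running: to_csv is called with mode='a', so every run appends to whatever file is already there, header row included. If you would rather have each run overwrite the previous results, a minimal variation (same file name and columns as above) is:

```python
# Write the results fresh on each run instead of appending to an existing file.
file = pd.DataFrame(result_all, columns=['公司信息', '岗位', '薪水', '其他'])
# mode='w' overwrites the file; index=False drops the unnamed row-number column.
file.to_csv('Bosszhiping_four_city.csv', mode='w', index=False, encoding='utf_8_sig')
```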
I scraped this just a few days ago and there's a write-up on my blog. It's not a particularly clever approach, but feel free to take a look.