需要构建请求信息,包括 url、headers、proxies 等,并利用 requests 库或者 selenium 库进行访问;
然后解析网页,将所有信息保存为 csv 文件。
import requests
from lxml import etree
import csv
# Scrapes Java-developer job postings from the 51job mobile search pages and
# appends one CSV row per posting (eight columns, see CSV_HEADER).

# Output file. It is opened in append mode so repeated runs accumulate rows.
OUTPUT_CSV = 'java工程师招聘信息数据表.csv'

# Column titles, in the same order as the row built by parse_job().
CSV_HEADER = [
    '招聘岗位',
    '薪水',
    '工作地点',
    '学历',
    '发布时间',
    '公司名称',
    '公司地址',
    '任职详细信息',
]

# Search-result URL; ``{page}`` is the 1-based result page number.
# NOTE: the original literal ended with a stray space after the page number,
# which would be sent as part of the query string; it is removed here.
LIST_URL_TEMPLATE = (
    'https://msearch.51job.com/job_list.php'
    '?keyword=Java%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88'
    '&keywordtype=2&jobarea=020000&fromapp=&pageno={page}'
)

# Inclusive range of result pages to crawl (the original used range(1, 11)).
FIRST_PAGE = 1
LAST_PAGE = 10

# Seconds to wait for a response before giving up on a request.
REQUEST_TIMEOUT = 10

# Request headers are identical for every request, so they are built once.
# NOTE(review): the Cookie below is a captured browser session hard-coded in
# the source; it will expire and arguably should come from configuration.
# The value is written as adjacent string literals because a single-quoted
# literal cannot span several physical lines.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.41',
    'Referer': 'https://m.51job.com/',
    'Cookie': (
        'guid=9005afff64eb7f1f3f879ec0436f1583; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%229005afff64eb7f1f3f879ec0436f1583%22%2C%22first_id%22%3A%2218181483d7eb65-0b5fde49beaee18-26021b51-1350728-18181483d7f467%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTgxODE0ODNkN2ViNjUtMGI1ZmRlNDliZWFlZTE4LTI2MDIxYjUxLTEzNTA3MjgtMTgxODE0ODNkN2Y0NjciLCIkaWRlbnRpdHlfbG9naW5faWQiOiI5MDA1YWZmZjY0ZWI3ZjFmM2Y4NzllYzA0MzZmMTU4MyJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%229005afff64eb7f1f3f879ec0436f1583%22%7D%2C%22%24device_id%22%3A%2218181483d7eb65-0b5fde49beaee18-26021b51-1350728-18181483d7f467%22%7D; _uab_collina=165573142584412596676913; partner=51jobhtml5; '
        'search=jobarea%7E%60000000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C5%C0%B3%E6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C5%C0%B3%E6%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C5%C0%B3%E6%BF%AA%B7%A2%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch3%7E%60010000%2C020000%2C030200%2C040000%2C090200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C5%C0%B3%E6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch4%7E%60010000%2C020000%2C030200%2C040000%2C090200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21; msearchhistory=020000%2CJava%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88%2C2%2C%2C%2C%7C%7C020000%2C%E7%88%AC%E8%99%AB%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88%2C2%2C%2C%2C%7C%7C020000%2C%E7%88%AC%E8%99%AB%2C2%2C%2C%2C; m_search=areacode%3D020000%26%7C%26keyword%3DJava%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88; acw_tc=76b20ff916558203895284101e604b838909c0a4b9c6f6fbc1969a4d2dbe6d; SECKEY_ABVK=xgzxoSyn2rtXh5MMsu8vphF1pw/gDr23aCF3+p/Ki2c%3D; '
        'BMAP_SECKEY=2mBlqVQnvkTyj7CDYCSepCV66Z1JJFHnlxN1HBYtIwhr14tDfacPvwpRsh-FttecZsXYmsTaSLo4-NkE4QJbdsk64mkhlzL2Ezr_kzAwhhqC0PaFJqgR6OiihpJ14g93nuM-6VNoqCkYBFCDJ7SRQtIOUh9eKYILo6glUsfuGO8B5yK7TgM1g4D6YaaYHOS8; acw_sc__v2=62b1d0733f8741b0e9094a2a23d15f489ac3336d; ssxmod_itna=YqUx0DBD9DnA3AKK0dKYIE=4rqqu0ibOb3qGNdAoDZDiqAPGhDC84Ix27RkD3h0EsDIoweDgC4xhxEGlMeTevQipyeDHxY=DU=CiKbD4+KGwD0eG+DD4DWeqAoDexGpc2pXKGWD4qDODWKDX2akDiPCDmR8pxGClxDCUAPDwx0CLovF=4YpDiyiROhPLxG1F40HiGfxLxOfL8G1RPSAozbO43YDvxDkDUKDo2PpDB+kBpYNQCRDWjuDYWb3qOx45xRi=Q2isx2rqZxxLQexQWUPKYaKTjqKKDDWilty4D===; ssxmod_itna2=YqUx0DBD9DnA3AKK0dKYIE=4rqqu0ibOb4A=TGOj7DBTgq7pxLhBaHGFj+g6fk6LHNYq8O7APDgemv=GY6hBBj7QrIR3cj3A31LFj2YDzf7u27eq1F9LxsZ97BWggIlgSXwUsY8MpcmsSh9HY2wbmoebQAANZxBhum+GKtWCF4AirtAmzA5za=Grz8D7qGIb3PqeenpvFW5HW+ee2AGoDQIEDjKD+OfxEnDTV7hx4D=='
    ),
}

# XPath of the detail-page links on a search-result page.
LIST_LINKS_XPATH = '//div[@class="list"]/a/@rel'

# XPaths of the single-valued fields on a detail page, in CSV column order.
FIELD_XPATHS = [
    '//div[@id="pageContent"]/div[1]/div/div[1]/p[@class="jname"]/text()',  # job title
    '//div[@id="pageContent"]/div[1]/div/div[1]/p[@class="sal"]/text()',  # salary
    '//div[@id="pageContent"]/div[1]/div/div[2]/div/span[1]/text()',  # work location
    '//div[@id="pageContent"]/div[1]/div/div[2]/div/span[2]/text()',  # education
    '//div[@id="pageContent"]/div[1]/div/div[2]/span/text()',  # publish time
    '//div[@id="pageContent"]/div[2]/div/div[2]/h3/text()',  # company name
    '//div[@id="pageContent"]/div[1]/a/span/text()',  # company address
]

# XPath of every text node of the job description article.
DESCRIPTION_XPATH = '//div[@id="pageContent"]/div[3]/div[2]/article//text()'


def build_list_url(page):
    """Return the search-result URL for result page number *page*."""
    return LIST_URL_TEMPLATE.format(page=page)


def first_text(nodes, default=''):
    """Return the first item of *nodes*, or *default* when it is empty.

    The original code indexed ``[0]`` directly, so one posting that lacked a
    field raised IndexError and aborted the whole crawl.
    """
    return nodes[0] if nodes else default


def clean_description(fragments):
    """Join description text fragments with commas and drop non-breaking spaces."""
    return ','.join(fragments).replace('\xa0', '')


def fetch_tree(url):
    """Download *url* and return it parsed by ``etree.HTML``.

    Raises:
        requests.RequestException: on connection errors, timeouts, or an
            HTTP error status.
    """
    response = requests.get(url=url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = 'utf-8'
    return etree.HTML(response.text)


def parse_job(tree):
    """Extract the eight CSV fields of one posting from its detail-page tree."""
    row = [first_text(tree.xpath(path)) for path in FIELD_XPATHS]
    row.append(clean_description(tree.xpath(DESCRIPTION_XPATH)))
    return row


def main():
    """Crawl every result page and append each posting to OUTPUT_CSV."""
    scraped = 0
    # ``with`` guarantees the file is closed even if the crawl raises.
    with open(OUTPUT_CSV, 'a', encoding='utf-8', newline='') as out:
        writer = csv.writer(out)
        # In append mode the position starts at end-of-file, so 0 means the
        # file is new or empty; only then is the header row needed.
        if out.tell() == 0:
            writer.writerow(CSV_HEADER)

        for page in range(FIRST_PAGE, LAST_PAGE + 1):
            try:
                listing = fetch_tree(build_list_url(page))
            except requests.RequestException as exc:
                print(f'第{page}页请求失败,已跳过: {exc}')
                continue

            # Detail-page links, used to fetch the full posting.
            rel_list = listing.xpath(LIST_LINKS_XPATH) if listing is not None else []
            for rel in rel_list:
                try:
                    detail = fetch_tree(rel)
                except requests.RequestException as exc:
                    print(f'招聘详情页请求失败,已跳过: {rel} ({exc})')
                    continue
                if detail is None:
                    # Nothing could be parsed from the response body.
                    continue

                row = parse_job(detail)
                scraped += 1
                print(f'正在爬取第{scraped}条招聘信息')
                print(row)
                writer.writerow(row)
            print(f'第{page}页爬取结束!!!')


if __name__ == '__main__':
    main()
在编写爬虫的过程中,有些网站会设置反爬机制,对于非浏览器的访问拒绝响应;短时间内频繁爬取也会触发网站的反爬机制,导致 IP 被封、无法爬取网页。这就需要在爬虫程序中修改请求的 headers 来伪装成浏览器访问,或者使用代理发起请求,从而绕开网站的反爬机制,获取正确的页面。
本文使用 Python 3.6、常用的请求库 requests,以及通过浏览器访问页面的自动化测试库 selenium。
关于这两个库的使用请参考官方文档或本人另一篇博客: python 爬虫获取网页 html 内容以及下载附件的方法。