Scrape the postings for a given job title from a website
1. Build the request information, including the url, headers, proxies, etc., and fetch the pages with the requests library or the selenium library (a proxy sketch follows this list).
2. Parse the page source:
(1) If the detail page cannot be reached, parse out at least: job title, salary, education requirement,
work location, and company name.
(2) If the detail page can be reached, parse out all of the information.
3. Save all of the information to a CSV file.
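Point 1 above mentions proxies, which the solution below does not actually use; here is a minimal sketch of how a proxy could be passed to requests, where the proxy address is a placeholder assumption:
import requests

# Placeholder proxy address; replace it with a working proxy if one is required.
proxies = {
    'http': 'http://127.0.0.1:8888',
    'https': 'http://127.0.0.1:8888',
}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
response = requests.get('https://msearch.51job.com/', headers=headers,
                        proxies=proxies, timeout=10)
print(response.status_code)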
import requests
from lxml import etree
import csv
f = open('java工程师招聘信息数据表.csv', 'a', encoding='utf-8', newline='')
xiaoluo = [
    '招聘岗位',
    '薪水',
    '工作地点',
    '学历',
    '发布时间',
    '公司名称',
    '公司地址',
    '任职详细信息']
ww = csv.writer(f)
ww.writerow(xiaoluo)
a = 0
for i in range(1, 11):
    url = f'https://msearch.51job.com/job_list.php?keyword=Java%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88&keywordtype=2&jobarea=020000&fromapp=&pageno={i}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.41',
        'Referer': 'https://m.51job.com/',
        'Cookie': 'guid=9005afff64eb7f1f3f879ec0436f1583; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%229005afff64eb7f1f3f879ec0436f1583%22%2C%22first_id%22%3A%2218181483d7eb65-0b5fde49beaee18-26021b51-1350728-18181483d7f467%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTgxODE0ODNkN2ViNjUtMGI1ZmRlNDliZWFlZTE4LTI2MDIxYjUxLTEzNTA3MjgtMTgxODE0ODNkN2Y0NjciLCIkaWRlbnRpdHlfbG9naW5faWQiOiI5MDA1YWZmZjY0ZWI3ZjFmM2Y4NzllYzA0MzZmMTU4MyJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%229005afff64eb7f1f3f879ec0436f1583%22%7D%2C%22%24device_id%22%3A%2218181483d7eb65-0b5fde49beaee18-26021b51-1350728-18181483d7f467%22%7D; _uab_collina=165573142584412596676913; partner=51jobhtml5; search=jobarea%7E%60000000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C5%C0%B3%E6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C5%C0%B3%E6%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C5%C0%B3%E6%BF%AA%B7%A2%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch3%7E%60010000%2C020000%2C030200%2C040000%2C090200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C5%C0%B3%E6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch4%7E%60010000%2C020000%2C030200%2C040000%2C090200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21; msearchhistory=020000%2CJava%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88%2C2%2C%2C%2C%7C%7C020000%2C%E7%88%AC%E8%99%AB%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88%2C2%2C%2C%2C%7C%7C020000%2C%E7%88%AC%E8%99%AB%2C2%2C%2C%2C; m_search=areacode%3D020000%26%7C%26keyword%3DJava%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88; acw_tc=76b20ff916558203895284101e604b838909c0a4b9c6f6fbc1969a4d2dbe6d; SECKEY_ABVK=xgzxoSyn2rtXh5MMsu8vphF1pw/gDr23aCF3+p/Ki2c%3D; BMAP_SECKEY=2mBlqVQnvkTyj7CDYCSepCV66Z1JJFHnlxN1HBYtIwhr14tDfacPvwpRsh-FttecZsXYmsTaSLo4-NkE4QJbdsk64mkhlzL2Ezr_kzAwhhqC0PaFJqgR6OiihpJ14g93nuM-6VNoqCkYBFCDJ7SRQtIOUh9eKYILo6glUsfuGO8B5yK7TgM1g4D6YaaYHOS8; acw_sc__v2=62b1d0733f8741b0e9094a2a23d15f489ac3336d; '
                  'ssxmod_itna=YqUx0DBD9DnA3AKK0dKYIE=4rqqu0ibOb3qGNdAoDZDiqAPGhDC84Ix27RkD3h0EsDIoweDgC4xhxEGlMeTevQipyeDHxY=DU=CiKbD4+KGwD0eG+DD4DWeqAoDexGpc2pXKGWD4qDODWKDX2akDiPCDmR8pxGClxDCUAPDwx0CLovF=4YpDiyiROhPLxG1F40HiGfxLxOfL8G1RPSAozbO43YDvxDkDUKDo2PpDB+kBpYNQCRDWjuDYWb3qOx45xRi=Q2isx2rqZxxLQexQWUPKYaKTjqKKDDWilty4D===; ssxmod_itna2=YqUx0DBD9DnA3AKK0dKYIE=4rqqu0ibOb4A=TGOj7DBTgq7pxLhBaHGFj+g6fk6LHNYq8O7APDgemv=GY6hBBj7QrIR3cj3A31LFj2YDzf7u27eq1F9LxsZ97BWggIlgSXwUsY8MpcmsSh9HY2wbmoebQAANZxBhum+GKtWCF4AirtAmzA5za=Grz8D7qGIb3PqeenpvFW5HW+ee2AGoDQIEDjKD+OfxEnDTV7hx4D=='
    }
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text
    html = etree.HTML(page_text)
    rel_list = html.xpath('//div[@class="list"]/a/@rel')  # links to each posting's detail page
    for rel in rel_list:
        response = requests.get(url=rel, headers=headers)
        response.encoding = 'utf-8'
        page_text = response.text
        html = etree.HTML(page_text)
        jie1 = html.xpath('//div[@id="pageContent"]/div[1]/div/div[1]/p[@class="jname"]/text()')[0]  # job title
        jie2 = html.xpath('//div[@id="pageContent"]/div[1]/div/div[1]/p[@class="sal"]/text()')[0]  # salary
        jie3 = html.xpath('//div[@id="pageContent"]/div[1]/div/div[2]/div/span[1]/text()')[0]  # work location
        jie4 = html.xpath('//div[@id="pageContent"]/div[1]/div/div[2]/div/span[2]/text()')[0]  # education requirement
        jie5 = html.xpath('//div[@id="pageContent"]/div[1]/div/div[2]/span/text()')[0]  # posting date
        jie6 = html.xpath('//div[@id="pageContent"]/div[2]/div/div[2]/h3/text()')[0]  # company name
        jie7 = html.xpath('//div[@id="pageContent"]/div[1]/a/span/text()')[0]  # company address
        jie8 = html.xpath('//div[@id="pageContent"]/div[3]/div[2]/article//text()')  # job details
        jie8 = ",".join(jie8).replace('\xa0', '')
        a += 1
        print(f'正在爬取第{a}条招聘信息')
        xiaojie = [jie1, jie2, jie3, jie4, jie5, jie6, jie7, jie8]
        print(xiaojie)
        ww.writerow(xiaojie)
    print(f'第{i}页爬取结束!!!')
f.close()
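To check that step 3 produced the expected file, the CSV written above can be read back with the same csv module; the file name matches the one opened at the top of the script:
import csv

with open('java工程师招聘信息数据表.csv', encoding='utf-8', newline='') as check:
    for row in csv.reader(check):
        print(row[:3])  # show the first three columns of every saved record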
Suppose the site to be scraped is https://www.example.com/ and the goal is to collect the postings for one job title from it.
First, install the requests and beautifulsoup4 libraries, which handle fetching and parsing the page source.
Both can be installed directly from Jupyter by running the following command:
!pip install requests beautifulsoup4
Then switch to edit mode and run the following code in Jupyter:
import requests
from bs4 import BeautifulSoup

url = 'https://www.example.com/job'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
job_list = soup.find_all('div', class_='job-list')
for job in job_list:
    job_title = job.find('h2').text
    job_location = job.find('p', class_='location').text
    job_description = job.find('p', class_='description').text
    print(f'Title: {job_title}\nLocation: {job_location}\nDescription: {job_description}\n')
The logic of this code is quite simple: request the page with the requests library, parse the page source with beautifulsoup4, find every div tag that holds a job posting, then loop over the postings, pull out each one's title, location, and description, and finally print that information.
Note that when scraping you should not simply hammer the site. Try to mimic normal browser behaviour as far as possible, and in particular avoid requesting the same page too frequently, or the site may mistake the crawler for a malicious attack. In practice this means setting suitable request headers and leaving an interval between requests, so that your IP does not get blocked.
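A minimal sketch of that throttling; the list of page URLs and the delay range are assumptions made for illustration:
import random
import time

import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
# Hypothetical list of pages to visit.
page_urls = [f'https://www.example.com/job?page={n}' for n in range(1, 6)]
for page_url in page_urls:
    response = requests.get(page_url, headers=headers, timeout=10)
    print(page_url, response.status_code)
    # Pause between requests so the traffic looks less like an attack.
    time.sleep(random.uniform(1, 3))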
Scraping also has to handle exceptional situations such as failed network connections or page-parsing errors. These can be caught with try-except statements, logged, and the program allowed to continue.
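A sketch of that error handling around the example URL used earlier; exactly which exceptions to catch depends on the site and the parser:
import requests
from bs4 import BeautifulSoup

url = 'https://www.example.com/job'
try:
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # turn 4xx/5xx responses into exceptions
    soup = BeautifulSoup(response.content, 'html.parser')
    title_tag = soup.find('h2')
    if title_tag is None:
        # The page structure changed or the element is missing: record it and move on.
        print(f'No job title found on {url}')
    else:
        print(title_tag.text)
except requests.RequestException as error:
    # Connection failure, timeout, or HTTP error: log it and keep the program running.
    print(f'Request failed for {url}: {error}')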
Finally, follow the ethical norms of web crawling: respect the site's intellectual property and interests, do not misappropriate or plagiarize its content, and avoid causing the site unnecessary losses.