How do I use Jupyter in Python?

Scrape the postings for a particular job from a website:
1. Build the request information (URL, headers, proxies, etc.) and fetch the pages with the requests library or selenium.
2. Parse the page source:
   (1) If you cannot reach the detail page, parse out at least: job title, salary, education requirement, work location, and company name.
   (2) If you can reach the detail page, parse out all of the information.
3. Save all of the information to a CSV file.


import requests
from lxml import etree
import csv
f = open('java工程师招聘信息数据表.csv', 'a', encoding='utf-8', newline='')  # output CSV ("Java engineer job postings data table"), opened in append mode
# CSV header row: job title, salary, work location, education, posting date, company name, company address, job details
xiaoluo = [
    '招聘岗位',
    '薪水',
    '工作地点',
    '学历',
    '发布时间',
    '公司名称',
    '公司地址',
    '任职详细信息']

ww = csv.writer(f)
ww.writerow(xiaoluo)
a = 0  # running count of postings scraped so far
for i in range(1, 11):
    url = f'https://msearch.51job.com/job_list.php?keyword=Java%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88&keywordtype=2&jobarea=020000&fromapp=&pageno={i}'  # i-th page of 51job mobile search results for "Java开发工程师"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.41',
        'Referer': 'https://m.51job.com/',
        'Cookie': 'guid=9005afff64eb7f1f3f879ec0436f1583; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%229005afff64eb7f1f3f879ec0436f1583%22%2C%22first_id%22%3A%2218181483d7eb65-0b5fde49beaee18-26021b51-1350728-18181483d7f467%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTgxODE0ODNkN2ViNjUtMGI1ZmRlNDliZWFlZTE4LTI2MDIxYjUxLTEzNTA3MjgtMTgxODE0ODNkN2Y0NjciLCIkaWRlbnRpdHlfbG9naW5faWQiOiI5MDA1YWZmZjY0ZWI3ZjFmM2Y4NzllYzA0MzZmMTU4MyJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%24identity_login_id%22%2C%22value%22%3A%229005afff64eb7f1f3f879ec0436f1583%22%7D%2C%22%24device_id%22%3A%2218181483d7eb65-0b5fde49beaee18-26021b51-1350728-18181483d7f467%22%7D; _uab_collina=165573142584412596676913; partner=51jobhtml5; search=jobarea%7E%60000000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C5%C0%B3%E6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C5%C0%B3%E6%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch2%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C5%C0%B3%E6%BF%AA%B7%A2%B9%A4%B3%CC%CA%A6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch3%7E%60010000%2C020000%2C030200%2C040000%2C090200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%C5%C0%B3%E6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch4%7E%60010000%2C020000%2C030200%2C040000%2C090200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FApython%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21; msearchhistory=020000%2CJava%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88%2C2%2C%2C%2C%7C%7C020000%2C%E7%88%AC%E8%99%AB%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88%2C2%2C%2C%2C%7C%7C020000%2C%E7%88%AC%E8%99%AB%2C2%2C%2C%2C; m_search=areacode%3D020000%26%7C%26keyword%3DJava%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88; acw_tc=76b20ff916558203895284101e604b838909c0a4b9c6f6fbc1969a4d2dbe6d; SECKEY_ABVK=xgzxoSyn2rtXh5MMsu8vphF1pw/gDr23aCF3+p/Ki2c%3D; BMAP_SECKEY=2mBlqVQnvkTyj7CDYCSepCV66Z1JJFHnlxN1HBYtIwhr14tDfacPvwpRsh-FttecZsXYmsTaSLo4-NkE4QJbdsk64mkhlzL2Ezr_kzAwhhqC0PaFJqgR6OiihpJ14g93nuM-6VNoqCkYBFCDJ7SRQtIOUh9eKYILo6glUsfuGO8B5yK7TgM1g4D6YaaYHOS8; acw_sc__v2=62b1d0733f8741b0e9094a2a23d15f489ac3336d; '
                  'ssxmod_itna=YqUx0DBD9DnA3AKK0dKYIE=4rqqu0ibOb3qGNdAoDZDiqAPGhDC84Ix27RkD3h0EsDIoweDgC4xhxEGlMeTevQipyeDHxY=DU=CiKbD4+KGwD0eG+DD4DWeqAoDexGpc2pXKGWD4qDODWKDX2akDiPCDmR8pxGClxDCUAPDwx0CLovF=4YpDiyiROhPLxG1F40HiGfxLxOfL8G1RPSAozbO43YDvxDkDUKDo2PpDB+kBpYNQCRDWjuDYWb3qOx45xRi=Q2isx2rqZxxLQexQWUPKYaKTjqKKDDWilty4D===; ssxmod_itna2=YqUx0DBD9DnA3AKK0dKYIE=4rqqu0ibOb4A=TGOj7DBTgq7pxLhBaHGFj+g6fk6LHNYq8O7APDgemv=GY6hBBj7QrIR3cj3A31LFj2YDzf7u27eq1F9LxsZ97BWggIlgSXwUsY8MpcmsSh9HY2wbmoebQAANZxBhum+GKtWCF4AirtAmzA5za=Grz8D7qGIb3PqeenpvFW5HW+ee2AGoDQIEDjKD+OfxEnDTV7hx4D=='
    } 
    response = requests.get(url=url, headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text
    html = etree.HTML(page_text)
    rel_list = html.xpath('//div[@class="list"]/a/@rel')  # collect the detail-page links so the full details can be scraped
    for rel in rel_list:
        response = requests.get(url=rel, headers=headers)  # fetch the detail page
        response.encoding = 'utf-8'
        page_text = response.text
        html = etree.HTML(page_text)
        jie1 = html.xpath('//div[@id="pageContent"]/div[1]/div/div[1]/p[@class="jname"]/text()')[0]  # job title
        jie2 = html.xpath('//div[@id="pageContent"]/div[1]/div/div[1]/p[@class="sal"]/text()')[0]  # salary
        jie3 = html.xpath('//div[@id="pageContent"]/div[1]/div/div[2]/div/span[1]/text()')[0]  # work location
        jie4 = html.xpath('//div[@id="pageContent"]/div[1]/div/div[2]/div/span[2]/text()')[0]  # education requirement
        jie5 = html.xpath('//div[@id="pageContent"]/div[1]/div/div[2]/span/text()')[0]  # posting date
        jie6 = html.xpath('//div[@id="pageContent"]/div[2]/div/div[2]/h3/text()')[0]  # company name
        jie7 = html.xpath('//div[@id="pageContent"]/div[1]/a/span/text()')[0]  # company address
        jie8 = html.xpath('//div[@id="pageContent"]/div[3]/div[2]/article//text()')  # full job description
        jie8 = ",".join(jie8).replace('\xa0', '')  # flatten the description text and strip non-breaking spaces
        a += 1
        print(f'Scraping job posting #{a}')
        xiaojie = [jie1, jie2, jie3, jie4, jie5, jie6, jie7, jie8]  # one CSV row
        print(xiaojie)
        ww.writerow(xiaojie)

    print(f'Finished scraping page {i}!')
f.close()
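
The task statement also mentions proxies, which the script above does not use. As a rough sketch only, here is how a proxy could be passed to the same kind of requests.get call; the proxy address below is a placeholder, not a working proxy:

import requests

# Placeholder proxy address -- substitute a proxy you actually have access to.
proxies = {
    'http': 'http://127.0.0.1:8888',
    'https': 'http://127.0.0.1:8888',
}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
url = 'https://msearch.51job.com/job_list.php?keyword=Java%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88&keywordtype=2&jobarea=020000&fromapp=&pageno=1'

# Same call pattern as in the script above, with the proxies argument added.
response = requests.get(url=url, headers=headers, proxies=proxies, timeout=10)
print(response.status_code)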


Not sure whether you have solved this problem yet; if not:
  • The following answer comes from ChatGPT:

    Answer:

    Suppose the site to scrape is https://www.example.com/ and we want the postings for a particular job.

    First, install the requests and beautifulsoup4 libraries, which handle fetching and parsing the page source.

    You can install requests and beautifulsoup4 directly from Jupyter by running the following command in a cell:

    !pip install requests beautifulsoup4
    

    Then, in a new cell, run the following code in Jupyter:

    import requests
    from bs4 import BeautifulSoup
    
    url = 'https://www.example.com/job'
    response = requests.get(url)
    
    soup = BeautifulSoup(response.content, 'html.parser')
    job_list = soup.find_all('div', class_='job-list')
    
    for job in job_list:
        job_title = job.find('h2').text
        job_location = job.find('p', class_='location').text
        job_description = job.find('p', class_='description').text
        print(f'Title: {job_title}\nLocation: {job_location}\nDescription: {job_description}\n')
    

    The logic here is straightforward: request the page with the requests library, parse the source with beautifulsoup4, find all of the div tags that hold job listings, then loop over each listing, pull out its title, location, and description, and print them.
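
    The original task also asks for the results to be saved to a CSV file, which the snippet above does not do. Below is a minimal sketch of the same loop writing rows with the standard csv module; the URL, tag names, and class names are the same hypothetical ones as in the example above:

    import csv

    import requests
    from bs4 import BeautifulSoup

    url = 'https://www.example.com/job'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Write one row per job listing, with a header row first.
    with open('jobs.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'location', 'description'])
        for job in soup.find_all('div', class_='job-list'):
            writer.writerow([
                job.find('h2').text,
                job.find('p', class_='location').text,
                job.find('p', class_='description').text,
            ])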

    Note that when scraping you should not hammer the site with requests; try to behave like a normal browser, and in particular avoid hitting the same page too frequently, or the site may treat the traffic as an attack. In practice this means setting reasonable request headers and pausing between requests so that your IP does not get banned.
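
    For instance, here is a minimal sketch of setting a User-Agent header and pausing between requests with time.sleep; the two-second delay is an arbitrary choice and the URL is a placeholder:

    import time

    import requests

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

    for page in range(1, 4):
        url = f'https://www.example.com/job?page={page}'
        response = requests.get(url, headers=headers, timeout=10)
        print(url, response.status_code)
        time.sleep(2)  # pause before the next request to avoid hammering the site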

    You also need to handle failure cases while scraping, such as network errors or pages that do not parse as expected. Wrap the risky calls in try-except blocks, record the failures, and let the program keep going.
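
    A minimal sketch of that pattern, handling network failures from requests and a missing element during parsing separately; the URL and tag names are placeholders:

    import requests
    from bs4 import BeautifulSoup

    url = 'https://www.example.com/job'
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # turn HTTP error status codes into exceptions
    except requests.exceptions.RequestException as e:
        print(f'Request failed: {e}')  # record the failure and move on
    else:
        soup = BeautifulSoup(response.content, 'html.parser')
        title_tag = soup.find('h2')
        if title_tag is None:
            print('Page did not have the expected structure')
        else:
            print(title_tag.text)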

    Finally, follow the usual etiquette for web crawlers: respect the site's intellectual property and interests, do not copy or plagiarize its content, and avoid causing the site unnecessary harm.


If you have already solved this problem, please consider sharing your solution as a blog post and leaving a link in the comments to help more people ^-^