#导入模块
import requests
import pandas
from fake_useragent import UserAgent
from lxml import etree
# Shared accumulator: getpath() appends one dict per parsed job entry and
# returns this same list, so it grows across pages.
data_list=[]
# Value substituted into the first placeholder of the search URL
# (presumably a salary-range filter code; empty string here).
salary=""
# Value substituted into the second placeholder of the search URL
# (presumably the search keyword).
position='分析'
# Last page number to request; next_page() builds URLs for pages 1..curPage.
curPage=1
# Fetch page content
def gethtml(url, timeout=10):
    """Download ``url`` and return the response body decoded as GBK text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as a ``str``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    # A freshly randomised User-Agent header is sent with every request.
    headers = {'User-Agent': str(UserAgent().random)}
    # Without a timeout, requests.get() can block forever on a stalled server.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    response.raise_for_status()
    # errors='replace' keeps one undecodable byte from aborting the whole page.
    return response.content.decode('gbk', errors='replace')
# Extract job data from the page
def getpath(r):
    """Parse a result page and append one record per job to ``data_list``.

    Args:
        r: HTML source of a search-result page.

    Returns:
        The module-level ``data_list``, which accumulates records across
        calls (one ``{'职位': title}`` dict per job entry found).
    """
    html = etree.HTML(r)
    # NOTE(review): if the HTML returned by the server contains no
    # div.j_joblist children (for example when the listing is filled in by
    # JavaScript after load), this loop matches nothing and adds no rows.
    for node in html.xpath('//div[@class="j_joblist"]/div'):
        # "./a" selects the child <a>; the previous ".a" was not valid XPath.
        titles = node.xpath("./a/p[1]/span[1]/text()")
        if not titles:
            # Skip entries without a title rather than raising IndexError.
            continue
        data_list.append({'职位': titles[0]})
    return data_list
# Pagination
def next_page(salary_code=None, keyword=None, pages=None):
    """Build the search-result URLs for pages 1 through ``pages``.

    Args:
        salary_code: Value for the first URL placeholder; defaults to the
            module-level ``salary``.
        keyword: Value for the second URL placeholder; defaults to the
            module-level ``position``.
        pages: Last page number to include; defaults to the module-level
            ``curPage``.

    Returns:
        A list of URL strings, one per page, in ascending page order.
        Empty when ``pages`` is less than 1.
    """
    if salary_code is None:
        salary_code = salary
    if keyword is None:
        keyword = position
    if pages is None:
        pages = curPage
    # "&degreefrom" had been corrupted into the degree sign followed by
    # "reefrom" (the "&deg" prefix was rendered as an HTML entity), which
    # removed the "&" separator and broke the query string.
    url_np = (
        'https://search.51job.com/list/000000,000000,0000,'
        '22%252c05%252c06%252c44%252c60,9,{}-,{},2,{}.html'
        '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
        '&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    )
    return [url_np.format(salary_code, keyword, page)
            for page in range(1, pages + 1)]
# Main routine
def run_51job():
    """Scrape every result page and save the collected rows to an Excel file.

    Returns:
        The ``pandas.DataFrame`` that was written to ``./51job.xlsx``.
    """
    # Start from an empty result so that zero pages yields an empty frame
    # instead of an unbound-variable error.
    rows = []
    for url in next_page():
        # getpath() returns its running total, so the value from the last
        # call holds the rows of every page processed so far.
        rows = getpath(gethtml(url))
    frame = pandas.DataFrame(rows)
    frame.to_excel('./51job.xlsx', index=False)
    return frame
if __name__ == '__main__':
    # Run the scraper when executed as a script and show the resulting table.
    result = run_51job()
    print(result)
改了好多次 XPath 路径,但得到的结果仍为 [](用 XPath Helper 在浏览器里查看没有问题),求大佬指点一二:
你用 requests 请求返回的 HTML 里本来就没有这些节点,应该是职位列表由 JavaScript 动态加载、请求时还没有渲染出来;看一下页面实际调用的数据接口吧,或者用模拟浏览器(如 Selenium)访问。