想爬取招聘网站的数据,已经写好了获取每页职位信息的代码,以及获取每条职位对应详情页信息的代码。但如何把这两部分数据合并到同一张表中并保存下来,得到如下格式的表格?
#导入模块
import requests
import pandas
from fake_useragent import UserAgent
from lxml import etree
# Shared accumulator: getpath() and detail_list() append their results here.
data_list = []

# Search filters; next_page() substitutes them into the query URL.
dqs = ""             # goes into the "dqs" query field
pubtime = "3"        # goes into the "pubTime" query field
salary = "30"        # goes into the "salary" query field
industries = "200"   # goes into the "industries" query field
position = "分析"    # goes into the "key" (search keyword) query field

# Upper bound of the page range: next_page() builds URLs for range(0, curPage).
curPage = 1
# Fetch page content
def gethtml(url, timeout=10):
    """Download *url* and return its body as text with the site prefix removed.

    A fresh random User-Agent header is generated for each request. The
    response bytes are decoded with the default codec (UTF-8) and every
    occurrence of ``https://www.liepin.com`` is stripped from the text, so
    links that carried that prefix become site-relative.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded page text.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    headers = {'User-Agent': str(UserAgent().random)}
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.content.decode().replace("https://www.liepin.com", "")
# Extract listing data
def getpath(r):
    """Parse one listing page and append a row per job to ``data_list``.

    Each ``<li>`` under ``ul.sojob-list`` becomes a dict with the keys
    职位, 招聘企业, 工作地点, 月薪 and 发布时间. A field whose node is absent
    from the item is stored as ``None`` instead of raising ``IndexError``, so
    a single irregular item no longer aborts the whole scrape.

    Args:
        r: HTML text of a listing page.

    Returns:
        The module-level ``data_list``, which holds the rows from this call
        together with everything appended to it earlier.
    """
    def first_or_none(node, expr):
        # XPath returns a (possibly empty) list; take the first match only.
        matches = node.xpath(expr)
        return matches[0] if matches else None

    html = etree.HTML(r)
    for item in html.xpath('//div//ul[@class="sojob-list"]/li'):
        # Named "row" rather than "list" so the builtin is not shadowed.
        row = {
            '职位': first_or_none(item, "./div/div/h3/a/text()"),
            '招聘企业': first_or_none(item, './div/div[2]/p[1]/a/text()'),
            '工作地点': first_or_none(item, './/*[@class="area"]/text()'),
            '月薪': first_or_none(item, './div/div[1]/p[1]/span[1]/text()'),
            '发布时间': first_or_none(item, './div/div[1]/p[2]/time/text()'),
        }
        data_list.append(row)
    return data_list
# Pagination
def next_page():
    """Build the search-result URLs for pages ``0`` to ``curPage - 1``.

    The module-level filters ``dqs``, ``pubtime``, ``salary``, ``industries``
    and ``position`` are substituted into the query string, followed by the
    page index.

    Returns:
        A list of URL strings, one per page; empty when ``curPage`` is 0.
    """
    # Two fixes relative to the earlier template:
    #   * "sortFlag=15°radeFlag=0" was a corrupted "&degradeFlag=0"
    #     ("&deg" had been decoded as an HTML entity).
    #   * "dqs{}=" put the value into the parameter name; it is now "dqs={}".
    url_np = (
        'https://www.liepin.com/zhaopin/?compkind=&dqs={}&pubTime={}'
        '&pageSize=40&salary={}%24&compTag=&sortFlag=15&degradeFlag=0'
        '&compIds=&subIndustry=&jobKind=&industries={}&compscale=&key={}'
        '&curPage={}'
    )
    return [
        url_np.format(dqs, pubtime, salary, industries, position, page_index)
        for page_index in range(curPage)
    ]
# Detail-page links
def href_list():
    """Collect the detail-page URL of every job on every listing page.

    Each URL from ``next_page()`` is downloaded with ``gethtml()``; for every
    ``<li>`` under ``ul.sojob-list`` the first ``href`` of the title link is
    taken and prefixed with the site address.

    Returns:
        A list of URL strings in page order.

    Raises:
        IndexError: If a list item has no title link.
    """
    site = "https://www.liepin.com"
    links = []
    for listing_url in next_page():
        tree = etree.HTML(gethtml(listing_url))
        links.extend(
            site + item.xpath('./div/div/h3/a/@href')[0]
            for item in tree.xpath('//div//ul[@class="sojob-list"]/li')
        )
    return links
# Detail-page content
def detail_list():
    """Download every detail page and append its text nodes to ``data_list``.

    For each URL from ``href_list()`` the page is fetched and the text nodes
    matched by a fixed XPath are appended, as one list per page, to the
    module-level ``data_list``.

    Returns:
        The module-level ``data_list``.
    """
    for detail_url in href_list():
        tree = etree.HTML(gethtml(detail_url))
        texts = tree.xpath('//div/div[1]/div[1]/div[1]/div[3]/div/text()')
        # NOTE(review): this appends a list of strings to the same list that
        # getpath() fills with dict rows; confirm that mixing is intended
        # before building a DataFrame from data_list.
        data_list.append(texts)
    return data_list
# Main routine
def run_liep():
    """Scrape all listing pages, save the rows to Excel and return them.

    Every URL from ``next_page()`` is downloaded and parsed with
    ``getpath()``. The value kept after the loop is the last ``getpath()``
    result, which is the accumulated row list. The rows are written to
    ``./liepin.xlsx`` without the index column.

    Returns:
        A ``pandas.DataFrame`` of the scraped rows; empty when there are no
        pages to fetch.
    """
    # Start from an empty list so that an empty page range produces an empty
    # table instead of an UnboundLocalError.
    rows = []
    for url in next_page():
        rows = getpath(gethtml(url))
    frame = pandas.DataFrame(rows)
    frame.to_excel('./liepin.xlsx', index=False)
    return frame
# Script entry point: run the scraper and print the resulting table.
if __name__ == '__main__':
    result = run_liep()
    print(result)
可以用 pandas:把每条数据放入数据框,再用 pandas.concat(旧版 pandas 中可用 DataFrame.append,该方法已在 pandas 2.0 中移除)合并成一个整体的数据框,最后写入 Excel。