本人现在在学 Python 爬虫,学习的一个案例是爬取某网站的免费简历模版,自己写的代码如下:
```python
import requests
from lxml import etree
import os
# Scrape the free resume-template listing page and download each template
# into the local ``resume`` directory.
url = 'https://sc.chinaz.com/jianli/free.html'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
}

# Fetch the listing page and parse it into an element tree.
listing_response = requests.get(url=url, headers=headers)
page_text = listing_response.text
tree = etree.HTML(page_text)

# Every matched <a> element is one template entry on the listing page.
a_list = tree.xpath('//div[@id="container"]/div/a')

# Create the output directory the first time the script runs.
if not os.path.exists('./resume'):
    os.mkdir('resume')

resume_list = []
for a in a_list:
    # 'http:' is prepended to the href -- presumably the listing uses
    # protocol-relative links; confirm against the live page.
    href = a.xpath('./@href')[0]
    resume_src = 'http:' + href

    # The template title comes from the thumbnail's alt text. It is
    # re-encoded as ISO-8859-1 and decoded as UTF-8 -- presumably because
    # the page text was decoded with the wrong charset; confirm.
    raw_name = a.xpath('./img/@alt')[0]
    resume_name = raw_name.encode('iso-8859-1').decode('utf-8')

    # Detail-page URL with the '#down' fragment appended.
    download_resume_src = resume_src + '#down'
    resume_list.append(download_resume_src)

    # Fetch the detail page and take the first download link it lists.
    detail_response = requests.get(url=download_resume_src, headers=headers)
    download_text = detail_response.text
    tree = etree.HTML(download_text)
    download_links = tree.xpath('.//div[@id="down"]/div[2]/ul/li/a/@href')
    download_list = download_links[0]
    print(download_list)

    # Download the file itself as raw bytes.
    download_data = requests.get(url=download_list, headers=headers).content

    # NOTE: the path is only the template name, with no file extension, so
    # the downloaded file is written without a ``.rar`` suffix.
    resume_path = 'resume/' + resume_name
    with open(resume_path, 'wb') as fp:
        fp.write(download_data)
    print(resume_name, '下载成功!')
```
运行结果显示:


原网站简历下载格式为.rar,但我执行后文件夹下保存的下载文件不是rar格式。
请教大家给予指点!感谢!
resume_path = f'resume/{resume_name}.rar'  # append the .rar extension to the file name
import requests
from lxml import etree
import os
# Scrape the first page of free PPT templates and download each one into
# the local ``ppt模板`` directory as a ``.rar`` file.
url = 'https://sc.chinaz.com/ppt/free_1.html'
headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edg/88.0.705.81'
}

# Fetch the listing page and parse it into an element tree.
listing_response = requests.get(url=url, headers=headers)
page_text = listing_response.text
tree = etree.HTML(page_text)

# Site-relative links to each template's detail page.
lian_list = tree.xpath('//div[@class="bot-div"]/a/@href')

# Create the output directory the first time the script runs.
if not os.path.exists('ppt模板'):
    os.mkdir('ppt模板')

for lian in lian_list:
    # Build the absolute detail-page URL from the site-relative href.
    lian_url = 'https://sc.chinaz.com' + lian
    detail_response = requests.get(url=lian_url, headers=headers)
    detail_text = detail_response.text
    tree = etree.HTML(detail_text)

    # Template title and the first download link on the detail page.
    title_nodes = tree.xpath('//div[@class="title-box clearfix"]/h1/text()')
    ppt_name = title_nodes[0]
    link_nodes = tree.xpath('//div[@class="download-url"]/a[1]/@href')
    ppt_url = link_nodes[0]

    # The '.rar' suffix is added explicitly to the saved file name.
    ppt_path = 'ppt模板/' + ppt_name + '.rar'

    # Download the file as raw bytes and write it to disk.
    ppt_data = requests.get(url=ppt_url, headers=headers).content
    with open(ppt_path, 'wb') as fp:
        fp.write(ppt_data)
    print(ppt_name, '下载成功!')
注意文件名要加 .rar 后缀。这个是我之前写的,可以参考一下,有帮助的话采纳一下哦,谢谢!🙈🙈🙈