import requests
import re
import time
from lxml import etree
allurl = []
query = input("请输入要查找的景点:")
for mun in range(1, 9, 1):
    # Build the search-result URL for each page of travel notes
    url = f'https://you.ctrip.com/searchsite/travels/?query={query}&isAnswered=&isRecommended=&publishDate=&PageNo={mun}'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56'
    }
    resp = requests.get(url, headers=headers).text
    # Pull the link and title of every travel note on the result page
    obj = re.compile(r'<li class="cf">.*?<dt><a href="(?P<href>.*?)" target="_blank">(?P<title>.*?)</a>', re.S)
    result = obj.finditer(resp)
    for it in result:
        href = 'https://you.ctrip.com' + it.group("href")
        title = it.group("title")
        # print(title)
        allurl.append(href)

for allurl1 in allurl:
    resp1 = requests.get(allurl1, headers=headers).text
    txt = etree.HTML(resp1)
    # The travel-note body sits in the div with class "ctd_content"
    file = txt.xpath('//div[@class="ctd_content"]')
    try:
        file = file[0].xpath('string(.)').strip().replace('\n', '')
    except Exception:
        file = ''
    # Keep only the Chinese characters of the article text
    pattern = "[\u4e00-\u9fa5]+"
    regex = re.compile(pattern)
    results = ','.join(regex.findall(file))
    print(results)
    time.sleep(1)  # short pause between requests to go easy on the server
print("完成!")

import requests
import re
from lxml import etree
import os

class Spider:
    def __init__(self, province: str):
        """
        Take a target province, create a folder named after it
        (if it does not exist yet) and switch the working directory into it.
        :param province: target province to search for
        """
        dir_list = os.listdir()
        if province not in dir_list:
            os.mkdir(province)
        os.chdir(province)
        self.province = province
        self.session = requests.session()

    def get_html(self, url: str) -> str:
        """
        Fetch the HTML of the target page.
        :param url: page to request
        :return: decoded page source
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36"
        }
        req = self.session.get(url=url, headers=headers)
        req.encoding = req.apparent_encoding
        html = req.text
        req.close()
        return html

    def get_html_content(self, html: str) -> dict:
        """
        Extract the travel-note titles and links from a search-result page.
        :param html: search-result page source
        :return: mapping of title -> absolute article URL
        """
        title_obj = re.compile(r'<dt><a href="(?P<url>.*?)" target="_blank">(?P<title>.*?)</a> </dt>', re.S)
        json = {}
        base_url = "https://you.ctrip.com"
        for i in title_obj.finditer(html):
            # keep the first URL seen for each title
            json[i.group('title')] = json.get(i.group('title'), base_url + i.group('url'))
        return json

    def operate_html(self, num=4) -> None:
        """
        Crawl pages 1..num of the search results and download every article found.
        :param num: number of result pages to crawl (4 by default)
        """
        for i in range(1, num + 1):
            url = f"https://you.ctrip.com/searchsite/travels/?query={self.province}&isAnswered=&isRecommended=&publishDate=&PageNo={i}"
            html = self.get_html(url=url)
            the_json = self.get_html_content(html=html)
            for k, v in the_json.items():
                try:
                    self.get_one_url(name=k, url=v)
                except Exception:
                    print(k, "下载失误(命名有问题)")
                    continue

    def get_one_url(self, name: str, url: str) -> None:
        """
        Fetch a single article and save its text to <name>.txt.
        :param name: article title, used as the file name
        :param url: article URL
        :return:
        """
        print(name)
        # strip characters that are not allowed in file names
        name = name.replace("|", '').replace("\\", '')
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36"
        }
        req = self.session.get(url=url, headers=headers)
        req.encoding = 'utf-8'
        html = req.text
        req.close()
        html = etree.HTML(html)
        # the article paragraphs sit at this fixed position in the page layout
        child = html.xpath(r"./body/div[2]/div[4]/div[1]/div[1]/div[2]/p")
        with open(f"{name}.txt", 'w', encoding='utf-8') as f:
            for i in child:
                for j in i.xpath('./text()'):
                    f.write(j)
        print(f"{name}.txt 完成")

if __name__ == '__main__':
    p = "贵州"
    User = Spider(p)
    User.operate_html()
After it runs, a folder named after the province you passed in is created, containing the downloaded txt files. It crawls 4 pages by default; don't set this much higher, as it isn't friendly to the server. Writing code isn't easy, so if this helped, please click accept.
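If you want to crawl a different number of result pages or slow the spider down further, here is a minimal usage sketch building on the Spider class above; the PoliteSpider name and the 1-second pause are illustrative choices, not part of the original code.

import time

class PoliteSpider(Spider):
    def get_one_url(self, name: str, url: str) -> None:
        # download the article as before, then pause briefly between requests
        super().get_one_url(name=name, url=url)
        time.sleep(1)

if __name__ == '__main__':
    User = PoliteSpider("贵州")
    User.operate_html(num=2)  # num controls how many result pages are crawled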
Use a loop with a counter variable and include that number when naming the files.
Grab the article title and use it as the file name.
Loop over the article to fetch the content inside, as in the sketch below.
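A small sketch of that naming idea; the example titles and the zero-padded prefix are just one possible convention, not taken from the code above.

titles = ["黄果树瀑布游记", "梵净山两日游"]  # article titles grabbed from the page
for index, title in enumerate(titles, start=1):
    filename = f"{index:02d}_{title}.txt"  # counter variable + title as the file name
    with open(filename, 'w', encoding='utf-8') as f:
        f.write("文章内容")  # write the fetched content here, e.g. paragraph by paragraph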