How can I save the travelogues so that each one goes into its own txt file?

import requests
import re
import time
from lxml import etree

allurl = []
query = input("Enter the attraction to search for: ")
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56'
}

# Collect the travelogue links from the first 8 pages of search results.
obj = re.compile(r'<li class="cf">.*?<dt><a href="(?P<href>.*?)" target="_blank">(?P<title>.*?)</a>', re.S)
for mun in range(1, 9):
    url = (f'https://you.ctrip.com/searchsite/travels/'
           f'?query={query}&isAnswered=&isRecommended=&publishDate=&PageNo={mun}')
    resp = requests.get(url, headers=headers).text
    for it in obj.finditer(resp):
        href = 'https://you.ctrip.com' + it.group("href")
        title = it.group("title")
        allurl.append(href)

# Visit each travelogue and extract the body text.
for allurl1 in allurl:
    resp1 = requests.get(allurl1, headers=headers).text
    txt = etree.HTML(resp1)
    file = txt.xpath('//div[@class="ctd_content"]')
    try:
        file = file[0].xpath('string(.)').strip().replace('\n', '')
    except Exception:
        file = ''
    # Keep only the Chinese characters of the body.
    regex = re.compile("[\u4e00-\u9fa5]+")
    results = ','.join(regex.findall(file))
    print(results)
    time.sleep(1)  # pause so the requests do not hammer the server

print("Done!")


import requests
import re
from lxml import etree
import os


class Spider:
    def __init__(self, province: str):
        """
        Take a target province and prepare a folder for its downloads.
        :param province:
        """
        # Create the province folder if it does not exist yet, then enter it.
        if province not in os.listdir():
            os.mkdir(province)
        os.chdir(province)
        self.province = province
        self.session = requests.session()

    def get_html(self, url: str) -> str:
        """
        获取目标网站的html
        :param url:
        :return:
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36"
        }
        req = self.session.get(url=url, headers=headers)
        req.encoding = req.apparent_encoding
        html = req.text
        req.close()
        return html

    def get_html_content(self, html: str) -> dict:
        """
        提取html里面的数据
        :param html:
        :return:
        """
        title_obj = re.compile(r'<dt><a href="(?P<url>.*?)" target="_blank">(?P<title>.*?)</a> </dt>', re.S)
        json = {}
        base_url = "https://you.ctrip.com"
        for i in title_obj.finditer(html):
            json[i.group('title')] = json.get(i.group('title'), base_url + i.group('url'))
        return json

    def operate_html(self, num: int = 4) -> None:
        # Crawl the first `num` search-result pages (4 by default).
        for i in range(1, num + 1):
            url = f"https://you.ctrip.com/searchsite/travels/?query={self.province}&isAnswered=&isRecommended=&publishDate=&PageNo={i}"
            html = self.get_html(url=url)
            links = self.get_html_content(html=html)
            for k, v in links.items():
                try:
                    self.get_one_url(name=k, url=v)
                except Exception:
                    print(k, "download failed (problematic file name)")
                    continue

    def get_one_url(self, name: str, url: str) -> None:
        """
        获取单个url里面的文本信息
        :param name:
        :param url:
        :return:
        """
        print(name)
        name = name.replace("|", '').replace("\\", '')
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36"
        }
        req = self.session.get(url=url, headers=headers)
        req.encoding = 'utf-8'
        html = req.text
        req.close()
        html = etree.HTML(html)
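        # Position-based path to the article paragraphs; brittle if the page layout changes.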
        child = html.xpath(r"./body/div[2]/div[4]/div[1]/div[1]/div[2]/p")
        with open(f"{name}.txt", 'w', encoding='utf-8') as f:
            for i in child:
                for j in i.xpath('./text()'):
                    f.write(j)
        print(f"{name}.txt 完成")


if __name__ == '__main__':
    p = "贵州"
    User = Spider(p)
    User.operate_html()

After it runs, the script creates a folder named after the province you entered, containing the downloaded txt files. It crawls 4 pages by default; don't raise that too much, as it's unfriendly to the server. Writing code takes effort, so if this helped, please click "accept".
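If you do raise the page count, one way to stay gentle on the server is to pause between downloads. A minimal sketch of a `Spider.operate_html` variant with a delay added (the 1-second value is an arbitrary choice, not from the original answer):

import time

def operate_html(self, num: int = 4) -> None:
    for i in range(1, num + 1):
        url = (f"https://you.ctrip.com/searchsite/travels/"
               f"?query={self.province}&isAnswered=&isRecommended=&publishDate=&PageNo={i}")
        links = self.get_html_content(html=self.get_html(url=url))
        for k, v in links.items():
            try:
                self.get_one_url(name=k, url=v)
            except Exception:
                print(k, "download failed (problematic file name)")
            time.sleep(1)  # wait one second between article downloads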

Reference: "Python: crawling web fiction and saving it to a txt file" (CSDN blog; walks through requests-based crawling of 水浒传 from 诗词名句网 in four steps: importing libraries, requesting data, parsing, and storage): https://blog.csdn.net/m0_46417197/article/details/121301348

Use a loop: set up a counter variable and include its number in each file name, as in the sketch below.
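A minimal sketch of that idea (the `articles` list is a placeholder for whatever texts you collected):

articles = ["text of the first travelogue...", "text of the second..."]  # placeholder data

for i, text in enumerate(articles, start=1):
    with open(f"travelogue_{i}.txt", "w", encoding="utf-8") as f:
        f.write(text)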

Grab the article title and use it as the file name.

Then loop over the articles to fetch the content of each one; a sketch follows.
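A hedged sketch of those two steps together, pulling the title from each article page itself. The `h1` xpath is a guess, not verified against Ctrip's actual markup, and `article_links` stands in for your collected urls:

import re
import requests
from lxml import etree

def save_article(url: str) -> None:
    page = etree.HTML(requests.get(url).text)
    titles = page.xpath('//h1//text()')  # guessed location of the title
    title = titles[0].strip() if titles else 'untitled'
    title = re.sub(r'[\\/:*?"<>|]', '', title)  # make it a legal file name
    paragraphs = page.xpath('//div[@class="ctd_content"]//text()')
    with open(f'{title}.txt', 'w', encoding='utf-8') as f:
        f.write(''.join(paragraphs))

article_links = ["https://you.ctrip.com/travels/..."]  # placeholder: your collected urls
for link in article_links:
    save_article(link)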