# NOTE: the text content may be loaded dynamically.
import requests
from lxml import etree
# Request headers sent with every page fetch. Supplying a browser-like
# User-Agent is the simplest way to get past basic anti-scraping checks.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Host': 'www.yodu.org',
}
# Chapter page to download and the text file the chapter is appended to.
url = 'https://www.yodu.org/book/3737/266795.html'
filename = 'wbfx.txt'


def format_chapter(title, paragraphs):
    """Return the text block written to the output file for one chapter.

    Args:
        title: Chapter title string.
        paragraphs: Iterable of paragraph strings; they are joined with
            newlines.

    Returns:
        A string made of a leading newline, the title, a newline, and the
        newline-joined paragraphs.
    """
    return '\n{}\n{}'.format(title, '\n'.join(paragraphs))


def fetch_chapter(page_url, request_headers, timeout=10):
    """Download a chapter page and extract its title and paragraph texts.

    Args:
        page_url: URL of the chapter page.
        request_headers: Mapping of HTTP headers to send with the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(title, paragraphs)`` tuple: the first ``<h1>`` text node and the
        list of text nodes of ``<p>`` elements under
        ``<div id="TextContent">``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        IndexError: If the page contains no ``<h1>`` text.
    """
    # A timeout keeps the script from hanging forever on a stalled server.
    response = requests.get(url=page_url, headers=request_headers, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page.
    response.raise_for_status()
    # Parse the page with lxml.
    page = etree.HTML(response.content.decode('utf-8'))
    titles = page.xpath('//h1/text()')
    if not titles:
        raise IndexError('no <h1> title found in page: {}'.format(page_url))
    # The paragraph list may be empty if the page does not carry the text in
    # its static HTML; the title is still returned in that case.
    paragraphs = page.xpath('//div[@id="TextContent"]//p/text()')
    return titles[0], paragraphs


def main():
    """Fetch the configured chapter and append it to the output file."""
    title, paragraphs = fetch_chapter(url, headers)
    # Append so that successive runs accumulate chapters in one file.
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(format_chapter(title, paragraphs))
    print('下载完成')


if __name__ == '__main__':
    main()