import requests
from bs4 import BeautifulSoup
if __name__ == "__main__":
    # Scrape every chapter linked from the book's chapter-index page and
    # write "<title>:<content>:" lines to ./文本.text.
    #
    # Imported here so this block is self-contained; urljoin resolves each
    # chapter href against the index URL (plain string concatenation would
    # append the href after the index page's "?startup=..." query string).
    from urllib.parse import urljoin

    # Chapter-index page of the book.
    url = 'https://zhijian.hnquxing.com/book/7yq6yzy90dz5/chapter?startup=so_search'
    # Send a desktop-browser User-Agent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    # A timeout keeps a stalled server from hanging the script forever.
    request_timeout = 10

    # Fetch and parse the index page.
    home_page = requests.get(url=url, headers=headers, timeout=request_timeout).text
    home_page_soup = BeautifulSoup(home_page, 'lxml')

    # select() returns a list-like ResultSet of <a> tags; it has no `.a`
    # attribute itself, so each link must be read inside the loop below.
    catalog_list = home_page_soup.select(".catalog-list.list > a")
    if not catalog_list:
        print("No chapter links matched '.catalog-list.list > a'")

    # `with` guarantees the output file is closed even if a request fails.
    with open('./文本.text', 'w', encoding='utf-8') as fp:
        for a in catalog_list:
            title = a.text
            # Relative (or absolute) address of the chapter detail page.
            detail_url = a.get('href')
            if not detail_url:
                continue
            # Resolve the href against the index URL to get a full address.
            complite_url = urljoin(url, detail_url)

            # Fetch and parse the chapter page.
            detail_page = requests.get(
                url=complite_url, headers=headers, timeout=request_timeout
            ).text
            detail_page_soup = BeautifulSoup(detail_page, 'lxml')

            # Extract the chapter text container.
            div_tag = detail_page_soup.find('div', class_="scrollcontent")
            if div_tag is None:
                # NOTE(review): the container is absent from the returned
                # HTML -- possibly the page fills it in with JavaScript;
                # confirm by inspecting the raw response.
                print("No div.scrollcontent found at " + complite_url)
                continue
            content = div_tag.get_text()

            # Persist one chapter per line.
            fp.write(title + ":" + content + ":" + '\n')
1、爬取的是三国演义小说的内容
2、在提取文本内容时，div_tag 无法返回标签里的内容
3、请大佬帮忙解答一下,不胜感激
catalog_list = home_page_soup.select(".catalog-list.list > a")
这种方法能用吗？
我还没试过。