import requests
import re
# First attempt at the scraper, kept exactly as posted.
# NOTE(review): several string literals below are visibly damaged -- quotes are
# left unterminated and literals are split across lines -- so this section does
# not parse as Python as written. The HTML markup inside the patterns was
# presumably stripped when the code was pasted; restore the original patterns
# before running. The loop body has also lost its indentation.
main_url = 'https://www.qb5.tw/book_116663/'
# Download the index page of the book as text.
main_html = requests.get(main_url).text
# NOTE(review): the pattern literal is truncated. inf[0] and inf[1] are used
# below, so it presumably had two capture groups -- confirm.
info_list = re.findall('
(.*?), main_html)
for inf in info_list:
# Chapter URL = book base URL + first captured group.
url = 'https://www.qb5.tw/book_116663/'+inf[0]
response = requests.get(url)
html_date = response.text
# NOTE(review): as shown, '(.*?)' matches the empty string at position 0, so
# [0] would be ''. The tags that delimited the chapter body are presumably
# missing from this literal -- confirm.
text = re.findall('(.*?)', html_date)[0]
# Strip the site's banner line from the chapter text (literal is damaged).
text = text.replace(' 全本小说 www.qb5.tw,最快更新星门最新章节!
', '')
# NOTE(review): as shown this replaces one blank with another; the first
# argument was presumably an HTML entity or tag -- confirm.
text = text.replace(' ', ' ')
# Prefix the text with the second captured group and normalise line breaks
# (the first replace argument is damaged).
text = inf[1]+'\n\n'+text.replace('
', '\n')
print(text)
# Append to 星门.txt encoded as gbk. The file object is never explicitly
# closed, and characters that gbk cannot encode would raise UnicodeEncodeError.
open('星门.txt', mode='a', encoding='gbk').write(text)
帮我看看这段代码有什么问题,为什么得不到我想要的结果?
剩下的你自己慢慢处理吧
import requests
import re
from lxml import etree
# Demo scraper: list the chapter links found on the book's index page, then
# fetch the first matching chapter and print its text fragments.
# NOTE(review): the pasted original had lost all indentation. The nesting
# below (inner loop prints every fragment, `break` ends the outer loop after
# the first chapter) is the most plausible reading -- confirm with the author.

# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
main_url = 'https://www.qb5.tw/book_116663/'
# Seconds to wait for the server; without a timeout requests.get() can block
# indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10

main_html = requests.get(main_url, headers=headers, timeout=REQUEST_TIMEOUT).text
# Capture the href of every <dd> link whose link text starts with "第".
info_list = re.findall('<dd><a href="(.*?)">第', main_html)
for inf in info_list:
    print(inf)
    # Only hrefs exactly 13 characters long are followed (presumably the
    # per-chapter page names -- TODO confirm against the live index page).
    if len(inf) != 13:
        continue
    # Chapter pages live directly under the book's base URL.
    url = f'{main_url}{inf}'
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    html_date = response.text
    html = etree.HTML(html_date)
    # Every text node beneath the element with id="content".
    sen_list = html.xpath('//*[@id="content"]//text()')
    for i in sen_list:
        print(i)
    # Stop after the first chapter; handling the remaining chapters and saving
    # the text is left to the reader.
    break