如何解决python爬取小说出现的异常?

python爬取小说出现异常,该怎么解决?

import requests
import parsel
import re

# Asker's original script, restored to one statement per line (the paste had
# collapsed most of it onto a single line). The logic is intentionally left as
# asked; the NOTE comments mark the spots that make it fail.
list_url='http://huayu.zongheng.com/showchapter/1222064.html'
response=requests.get(list_url)
print(response.text)
selectors=parsel.Selector(response.text)
# Collect the href of every chapter link on the chapter-list page.
href = selectors.css('div.container div div div ul li a::attr(href)').getall()
# NOTE: this glues all chapter links into one long string; that string is what
# gets passed to requests.get() further down.
href=''.join(href)
print(href)
# NOTE(review): the pattern appears to have lost its surrounding markup in the
# paste (presumably <h1>...</h1> -- confirm). As written, the first match of
# '(.*?)' is the empty string, so name ends up empty.
name=re.findall('(.*?)',response.text)[0]
print(name)
response=requests.get(href)
print(response.text)
selectors=parsel.Selector(response.text)
title=selectors.css('#readerFt > div > div.title > div.title_txtbox::text').get()
print(title)
content=selectors.css('p::text').getall()
print(content)
content=''.join(content)
# NOTE: the target path has no file extension.
with open(f'D:/python练习/{name}',mode='a',encoding='utf-8')as f:
    f.write(title)
    f.write('\n')
    f.write(content)
    f.write('\n')

几个问题

  1. 你用 ''.join 把各章节的 href 拼接成了一个长字符串再传入 requests.get,这个字符串不是一个有效的 URL,所以请求会失败;应该对每个章节链接分别请求
  2. 用正则 (.*?) 取书名,第一个匹配结果是空字符串,所以取到的书名为空
  3. 打开文件时,文件名没有加扩展名(例如 .txt)

修改后的代码如下,用循环逐个请求章节链接,取出每一章的标题和内容并写入文件:

import requests
import parsel
import re
 
LIST_URL = 'http://huayu.zongheng.com/showchapter/1222064.html'
OUTPUT_DIR = 'D:/python练习'
# requests waits indefinitely unless a timeout is given.
REQUEST_TIMEOUT = 10


def extract_book_name(html):
    """Return the stripped text of the first ``<h1>`` element in *html*.

    Args:
        html: Raw HTML of the chapter-list page.

    Returns:
        The heading text with surrounding whitespace removed.

    Raises:
        ValueError: If the page has no ``<h1>`` or the heading is empty.
    """
    # Raw string: no escape-sequence surprises; non-greedy so a second <h1>
    # on the same line is not swallowed into the match.
    match = re.search(r'<h1>(.*?)</h1>', html)
    name = match.group(1).strip() if match else ''
    if not name:
        raise ValueError('no <h1> book title found in page')
    return name


def safe_filename(name):
    """Replace characters that Windows forbids in file names with ``_``."""
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()


def main():
    """Download every chapter linked from LIST_URL into one text file.

    The file is ``<OUTPUT_DIR>/<book name>.txt``; chapters are appended in
    the order their links appear on the chapter-list page.

    Raises:
        requests.HTTPError: If any page answers with an HTTP error status.
        ValueError: If the book title cannot be found on the list page.
    """
    from urllib.parse import urljoin

    index_page = requests.get(LIST_URL, timeout=REQUEST_TIMEOUT)
    index_page.raise_for_status()
    index = parsel.Selector(index_page.text)
    chapter_links = index.css('div.container div div div ul li a::attr(href)').getall()
    book_name = safe_filename(extract_book_name(index_page.text))

    # Open the output once instead of re-opening it for every chapter.
    with open(f'{OUTPUT_DIR}/{book_name}.txt', mode='a', encoding='utf-8') as out:
        for link in chapter_links:
            # urljoin leaves absolute links untouched and resolves relative
            # or protocol-relative ones against the list page.
            chapter_page = requests.get(urljoin(LIST_URL, link), timeout=REQUEST_TIMEOUT)
            chapter_page.raise_for_status()
            chapter = parsel.Selector(chapter_page.text)

            # .get() returns None when the selector matches nothing; fall
            # back to an empty title so write() does not raise TypeError.
            title = chapter.css('#readerFt > div > div.title > div.title_txtbox::text').get() or ''
            print(title)
            paragraphs = chapter.css('p::text').getall()
            print(paragraphs)

            out.write(title)
            out.write('\n')
            out.write(''.join(paragraphs))
            out.write('\n')


if __name__ == '__main__':
    main()