Source code:
# -*- coding:utf-8 -*-
import requests,sys
from bs4 import BeautifulSoup
class downloader(object):
    def __init__(self):
        self.names = []    # chapter names
        self.urls = []     # chapter links
        self.nums = ()     # number of chapters
        self.target = 'https://www.biquge5200.cc/84_84888/'
    def get_url(self):
        req = requests.get(url=self.target)
        html = req.text
        div_bf = BeautifulSoup(html)
        div = div_bf.find_all('div', id='list')
        a_bf = BeautifulSoup(str(div[0]))
        a = a_bf.find_all('a')
        self.nums = len(a)
        for each in a:
            # print(each.string, each.get('href'))
            self.names.append(each.string)
            self.urls.append(each.string, each.get('href'))
    def get_content(self, targer):
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/61.0.3163.79 Safari/537.36'
        }
        content = requests.get(url=targer, headers=header)
        # content = content.encode('iso-8859-1')
        html = content.text
        bf = BeautifulSoup(html)
        text = bf.find_all(id='content')
        texts = text[0].text.replace('</p><p>' * 2, '\n')
        return texts
    def write_in(self, name, path, texts):
        write_flag = True
        with open('超品巫师' + '.txt', encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(texts)
            f.write('\n\n')
if __name__ == "__main__":
    dl = downloader
    downloader.get_url()
    print('The page is loading, please wait...')
    for i in range(dl.nums):
        dl.write_in(dl.names[i], '超品巫师.txt', dl.get_content(dl.urls[i]))
        sys.stdout.write(" 已下载:%.3f%%" % float(i / dl.nums) + '\r')
        sys.stdout.flush()
        # print(flush=True)
    print('Mission completed')
The errors found are listed below.
The code has quite a few errors; a corrected sketch follows the list.
1. Every BeautifulSoup() call needs an explicit parser argument, e.g. features='lxml' or features='html.parser'.
2. The last line of get_url, self.urls.append(each.string, each.get('href')), is wrong: append() takes a single argument. Change it to self.urls.append(each.string + ',' + each.get('href')).
3. In the main block the class is instantiated incorrectly; it should be dl = downloader() followed by dl.get_url(). In addition, the first statement of the for loop passes the wrong argument to dl.get_content(): with self.urls stored as in point 2, use dl.urls[i].split(',')[-1].
4. Don't request the site too frequently; add a sleep inside the loop.
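A minimal corrected sketch applying the four fixes above (the features='html.parser' option and the one-second time.sleep() are choices, not requirements). It also opens the output file in append mode 'a', which the list does not mention but which the original open() call needs before f.write() can succeed, and drops the no-op replace('</p><p>' * 2, ...) call, since .text has already stripped the HTML tags:

# -*- coding:utf-8 -*-
import requests, sys, time
from bs4 import BeautifulSoup


class downloader(object):
    def __init__(self):
        self.names = []    # chapter names
        self.urls = []     # chapter links, stored as "name,href" strings (fix 2)
        self.nums = 0      # number of chapters
        self.target = 'https://www.biquge5200.cc/84_84888/'

    def get_url(self):
        req = requests.get(url=self.target)
        div_bf = BeautifulSoup(req.text, features='html.parser')   # fix 1: explicit parser
        div = div_bf.find_all('div', id='list')
        a_bf = BeautifulSoup(str(div[0]), features='html.parser')  # fix 1
        a = a_bf.find_all('a')
        self.nums = len(a)
        for each in a:
            self.names.append(each.string)
            self.urls.append(each.string + ',' + each.get('href'))  # fix 2: one argument

    def get_content(self, target):
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/61.0.3163.79 Safari/537.36'
        }
        content = requests.get(url=target, headers=header)
        bf = BeautifulSoup(content.text, features='html.parser')    # fix 1
        text = bf.find_all(id='content')
        return text[0].text

    def write_in(self, name, path, texts):
        # append mode; the default read-only mode would fail on f.write()
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(texts)
            f.write('\n\n')


if __name__ == "__main__":
    dl = downloader()               # fix 3: instantiate the class
    dl.get_url()
    print('The page is loading, please wait...')
    for i in range(dl.nums):
        url = dl.urls[i].split(',')[-1]                              # fix 3: take the href part
        dl.write_in(dl.names[i], '超品巫师.txt', dl.get_content(url))
        sys.stdout.write(" 已下载:%.3f%%" % float(i / dl.nums) + '\r')
        sys.stdout.flush()
        time.sleep(1)               # fix 4: throttle the requests
    print('Mission completed')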