It reports AttributeError: 'downloader' object has no attribute 'get_download_url'. As far as the interpreter is concerned I really don't have that attribute. I copied the code from an article online to learn from, and the article doesn't cover this either. Could some patient person take a look at where it goes wrong?
# -*- coding: gbk -*-
from bs4 import BeautifulSoup
import requests

"""
Class description: downloads the novel 《虚假之月》 from Zongheng (book.zongheng.com)
Parameters:
    None
Returns:
    None
Modify:
    2021-08-17
"""
class downloader(object):

    def __init__(self):
        self.server = 'http://book.zongheng.com/showchapter//'
        self.target = 'http://book.zongheng.com/showchapter/1122915.html'
        self.names = []  # chapter titles
        self.urls = []   # chapter links
        self.nums = 0    # number of chapters

    """
    Function description: collect the chapter download links
    Parameters:
        None
    Returns:
        None
    Modify:
        2021-08-17
    """
    def get_download_url(self):
        req = requests.get(url=self.target)
        html = req.text
        div_bf = BeautifulSoup(html)
        div = div_bf.find_all('div', class_='volume-list')
        a_bf = BeautifulSoup(str(div[0]))
        a = a_bf.find_all('a')
        self.nums = len(a[15:])  # skip the unwanted leading entries and count the chapters
        for each in a[15:]:
            self.names.append(each.string)
            self.urls.append(self.server + each.get('href'))

    """
    Function description: fetch the content of one chapter
    Parameters:
        target - download link (string)
    Returns:
        texts - chapter content (string)
    Modify:
        2021-08-17
    """
    def get_contents(self, target):
        req = requests.get(url=target)
        html = req.text
        bf = BeautifulSoup(html)
        texts = bf.find_all('div', class_='volume-list')
        texts = texts[0].text.replace('\xa0' * 8, '\n\n')
        return texts

    """
    Function description: write a scraped chapter into a file
    Parameters:
        name - chapter title (string)
        path - file name the novel is saved under, in the current directory (string)
        text - chapter content (string)
    Returns:
        None
    Modify:
        2021-08-17
    """
    def writer(self, name, path, text):
        write_flag = True
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
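For what it's worth, this particular AttributeError usually means that get_download_url ended up defined at module level instead of inside the class, which happens easily when indentation is lost while copying code from a web page. A minimal sketch of that failure mode (the class name here is only for illustration):

class Downloader:
    def __init__(self):
        self.target = 'http://book.zongheng.com/showchapter/1122915.html'

# This def is not indented under the class, so it becomes an ordinary
# module-level function rather than a method of Downloader.
def get_download_url(self):
    print(self.target)

d = Downloader()
d.get_download_url()  # AttributeError: 'Downloader' object has no attribute 'get_download_url'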
Let's see the code.
Screenshot 2 is too blurry to read; I suggest using the paste-code feature to put the source code directly into the question or an answer.
# -*- coding: gbk -*-
import os

from bs4 import BeautifulSoup
import requests

"""
Class description: downloads the novel 《虚假之月》 from Zongheng (book.zongheng.com)
Parameters:
    None
Returns:
    None
Modify:
    2021-08-17
"""
aim_dir = 'pastebin/'
if not os.path.exists(aim_dir):
    os.mkdir(aim_dir)

class downloader(object):

    def __init__(self):
        self.server = 'http://book.zongheng.com/showchapter//'
        self.target = 'http://book.zongheng.com/showchapter/1122915.html'
        self.names = []  # chapter titles
        self.urls = []   # chapter links
        self.nums = 0    # number of chapters

    """
    Function description: collect the chapter download links
    Parameters:
        None
    Returns:
        None
    Modify:
        2021-08-17
    """
    def get_download_url(self):
        req = requests.get(url=self.target)
        html = req.text
        div_bf = BeautifulSoup(html, features='lxml')
        div = div_bf.find_all('div', class_='volume-list')
        a_bf = BeautifulSoup(str(div[0]), features='lxml')
        a = a_bf.find_all('a')
        self.nums = len(a[15:])  # skip the unwanted leading entries and count the chapters
        for each in a[15:]:
            self.names.append(each.string)
            print(self.names[-1])
            self.urls.append(each.get('href'))  # the href attributes here are already full URLs
            print(self.urls[-1])

    """
    Function description: fetch the content of one chapter
    Parameters:
        target - download link (string)
    Returns:
        texts - chapter content (string)
    Modify:
        2021-08-17
    """
    def get_contents(self, target):
        req = requests.get(url=target)
        html = req.text
        bf = BeautifulSoup(html, features='lxml')
        texts = bf.find_all('div', class_='content')
        try:
            texts = texts[0].text.replace('\xa0' * 8, '\n\n')
            return texts
        except IndexError:
            return None

    """
    Function description: write a scraped chapter into a file
    Parameters:
        name - chapter title (string)
        path - file name the novel is saved under, in the current directory (string)
        text - chapter content (string)
    Returns:
        None
    Modify:
        2021-08-17
    """
    def writer(self, name, path, text):
        write_flag = True
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.write(text)

downloader1 = downloader()
downloader1.get_download_url()
for each in range(len(downloader1.names)):
    print(downloader1.urls[each])
    text = downloader1.get_contents(downloader1.urls[each])  # fetch each chapter only once
    if not text:
        print(f'No text found for chapter {downloader1.names[each]}')
        continue
    downloader1.writer(name=downloader1.names[each],
                       path=aim_dir + downloader1.names[each] + '.txt',
                       text=text)
(Why do I get the feeling I just scored a free scraper...)
The txt output is still formatted a bit messily at the moment. You can re-lay-out the text in get_contents yourself according to the content div, substituting '\n' at the corresponding spots.
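A minimal sketch of that tweak, assuming the chapter paragraphs sit in <p> tags inside the div with class 'content' (which matches the xpath used in the other answer below):

    def get_contents(self, target):
        req = requests.get(url=target)
        bf = BeautifulSoup(req.text, features='lxml')
        content = bf.find('div', class_='content')
        if content is None:
            return None
        # one line per <p> paragraph instead of relying on runs of '\xa0'
        return '\n'.join(p.text.strip() for p in content.find_all('p')) + '\n'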
If this helped, please rate it; if it helped a lot, tips are welcome ()
I scrape novels to read fairly often, so I put together a simple scraper along my own lines. I did not write the final step that merges the scraped chapters into one file (a rough sketch of that step is added after the code below).
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import random
from time import sleep

import requests
from lxml import etree

start_url = 'http://book.zongheng.com/showchapter/1122915.html'
USER_AGENT = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
]
headers = {
    "User-Agent": random.choice(USER_AGENT),
    "Referer": start_url,
    "Host": "book.zongheng.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
}

class novelScrapy(object):
    def __init__(self, start_url):
        self.start_url = start_url
        self.urls = {}

    def getNovelUrls(self):
        '''Collect the chapter titles and their link addresses.'''
        html = etree.HTML(session.get(self.start_url).content.decode('utf-8'))
        aList = html.xpath('//li[@class=" col-4"]/a')
        for i in aList:
            # strip characters that are awkward in file names
            name = i.xpath('./text()')[0].replace('’', '').replace('!', '').replace('?', '').replace(' ', '')
            url = i.xpath('./@href')[0]
            # print(name, url)
            self.urls[name] = url
        # print(self.urls)

    def getSingleNovelContent(self):
        '''Fetch each chapter url and write the scraped content to a file.'''
        if self.urls:
            for name, url in self.urls.items():
                if not os.path.exists('{}.txt'.format(name)):  # skip chapters that were already saved
                    novelContentHtml = etree.HTML(session.get(url).content.decode('utf-8'))
                    novelInfo = novelContentHtml.xpath('//div[@class="content"]/p/text()')
                    # write the scraped content to a file
                    with open('{}.txt'.format(name), 'w', encoding='utf-8') as wf:
                        wf.write(name + '\n')
                        for i in novelInfo:
                            if i.strip():
                                wf.write(i.strip() + '\n')
                    sleep(2)
        else:
            print('No usable links')

if __name__ == '__main__':
    session = requests.session()
    session.headers = headers
    obj = novelScrapy(start_url=start_url)
    obj.getNovelUrls()
    obj.getSingleNovelContent()
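As for the merging step the answer leaves out, here is a rough sketch that could be appended to the __main__ block. It assumes every chapter was saved as <name>.txt in the working directory, relies on the insertion order of obj.urls for chapter order, and uses a made-up output name novel.txt:

    # Sketch only: stitch the per-chapter files into a single novel file.
    with open('novel.txt', 'w', encoding='utf-8') as merged:
        for name in obj.urls:
            chapter_file = '{}.txt'.format(name)
            if not os.path.exists(chapter_file):
                continue  # chapter was skipped or failed to download
            with open(chapter_file, 'r', encoding='utf-8') as cf:
                merged.write(cf.read() + '\n')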