This is whole-site novel scraper code I found online. I've never studied Python and can't fix it myself. I'm using PyCharm 2021.2.2. I think the problem is ultimately the data storage path.
Error:
C:\Users\Jason\PycharmProjects\pythonProject1\venv\Scripts\python.exe C:/Users/Jason/PycharmProjects/pythonProject1/venv/01.py
Traceback (most recent call last):
File "C:\Users\Jason\PycharmProjects\pythonProject1\venv\01.py", line 234, in <module>
f = open("./config.ini", encoding='utf-8')
FileNotFoundError: [Errno 2] No such file or directory: './config.ini'
Process finished with exit code 1
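(For context: Python resolves a relative path such as "./config.ini" against the current working directory, not against the script's own folder, so the file must sit wherever the interpreter was started from. A one-line check, using only the standard library, shows which directory that is:

import os
print(os.getcwd())  # "./config.ini" is resolved relative to this directory
)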
Code:
f = open("./config.ini", encoding='utf-8')
print(f.read())
category = input("Enter the category ID to scrape: ")
pages = input("\nEnter the start page: ")
if len(category) > 0 and len(pages) > 0:
    print("\nStarting the spider...")
    time.sleep(2)
    print("Starting to scrape...", time.strftime("%Y/%m/%d %H:%M:%S"))
    time.sleep(3)
    runSpider(pages, category)  # runSpider(start page, category ID)
    # for p in c:
    print("Category", category, "finished!")
Full code:
import requests
import random
from lxml import etree
from html.parser import HTMLParser
import os
import time
import re

base_url = 'http://www.zxcs.me/sort/{}'
page = "/page/{}"
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
]
UA = random.choice(user_agent_list)
HEADERS = {'User-Agent': UA}
# Simple wrapper around the network request
def get_requests(url):
    try:
        req = requests.get(url, headers=HEADERS, timeout=20)
        if req.status_code == 200:
            return req.text
    except requests.exceptions.Timeout:
        # Request timed out: retry up to 9 more times
        for i in range(1, 10):
            print('Request timed out, retry attempt %s' % i)
            try:
                req = requests.get(url, headers=HEADERS, timeout=5)
                if req.status_code == 200:
                    return req.text
            except requests.exceptions.Timeout:
                continue
    return -1
# Get the total number of pages in a category listing
def getPageTotal(cates):
    # base_url = "http://www.zxcs.me/sort/25/"
    req = get_requests(base_url.format(cates))
    selector = etree.HTML(req)
    total = selector.xpath("//*[@id='pagenavi']/a/@href")[-1].split('/')[-1]  # last page number for this category
    return total
# Get one listing page of a category
def getPage(category, pageIndex):
    req = get_requests(base_url.format(category) + page.format(str(pageIndex)))
    return req
# Get the article urls from a listing page
def getList(cates, pageIndex):
    req = getPage(cates, pageIndex)
    selector = etree.HTML(req)
    url_list = selector.xpath("//*[@id='plist']/dt/a/@href")
    return url_list
# Get the download link for an article
def get_downUrl(url):
    link_id = url.split('/')[-1]  # article id taken from the article url
    req = get_requests("http://www.zxcs.me/download.php?id={}".format(link_id))
    selector = etree.HTML(req)
    d_url = selector.xpath("//span[@class='downfile']/a/@href")[0]
    return d_url
# Download the novel archive with a simple progress bar
def saveRAR(url, fileName):
    print("Starting download of", fileName, "...\n")
    rar_name = url.split('/')[-1]  # archive file name taken from the url
    names = fileName + '/' + str(rar_name)  # local save path
    start = time.time()  # start time
    size = 0
    req = requests.get(url, stream=True, headers=HEADERS)
    chunk_size = 1024  # bytes per chunk
    content_size = int(req.headers['content-length'])  # total size in bytes
    if req.status_code == 200:
        print('[File size]: %0.2f MB' % (content_size / chunk_size / 1024))
        with open(names, 'wb') as f:
            for d in req.iter_content(chunk_size=chunk_size):  # stream the raw response
                f.write(d)
                size += len(d)  # bytes downloaded so far
                print(
                    '\r' + '[Progress]: %s %.2f%%' % ('>' * int(size * 50 / content_size), float(size / content_size * 100)),
                    end='')
    end = time.time()
    print('\n', "Download finished! Took %.2f seconds" % (end - start))
# Save the novel's synopsis to a text file
def saveBrief(content, name):
    fileName = name + "/" + name + ".txt"
    print("Saving novel synopsis", fileName)
    with open(fileName, "w+", encoding='utf-8') as f:  # explicit encoding; the original never closed this file
        f.write(content)
# Save the cover image from the detail page
def saveImg(imgUrl, fileName):
    name = imgUrl.split('/')[-1]
    req = requests.get(imgUrl).content
    if name.endswith(".jpg"):
        names = fileName + "/" + str(name)
    else:
        names = fileName + "/" + str(name.split('?')[0])  # drop any query string from the file name
    with open(names, 'wb') as f:
        f.write(req)
    print("Saving novel cover image", name)
# Create a new directory if it does not exist
def mkdir(path):
    path = path.strip()
    # os.path.exists returns True if the path already exists, False otherwise
    isExists = os.path.exists(path)
    if not isExists:
        # Create the directory
        print("Creating a folder named", path)
        os.makedirs(path)
        return True
    else:
        # Directory already exists, just report it
        print("Folder", path, "already exists!")
        return False
# Fetch a detail page and return its fields as a list
def getContent(url):
    cont = []
    req = get_requests(url)
    selector = etree.HTML(req)
    title = selector.xpath("//*[@id='content']/h1/text()")[0]  # title
    category = selector.xpath("//*[@class='date']//a/text()")[1]  # category
    if len(selector.xpath("//*[@class='date']//a/text()")) >= 3:
        tags = selector.xpath("//*[@class='date']//a/text()")[2]  # tags
    else:
        tags = "N/A"
    if len(selector.xpath("//*[@id='content']/a/img/@src")) > 0:
        image = selector.xpath("//*[@id='content']/a/img/@src")[0]  # cover image
    elif len(selector.xpath("//*[@id='content']/img/@src")) > 0:
        image = selector.xpath("//*[@id='content']/img/@src")[0]
    elif len(selector.xpath("//*[@id='content']/p/img/@src")) > 0:
        image = selector.xpath("//*[@id='content']/p/img/@src")[0]
    else:
        image = selector.xpath("//*[@id='content']/p/a/img/@src")[0]
    # print(image)
    text_info = selector.xpath("//*[@id='content']//p")[2]  # synopsis paragraph node
    div_str = etree.tostring(text_info)  # raw html of that node
    text = html_toText(div_str).strip().replace(' ', '')  # external helper (not shown in this post) that strips html tags
    text = "".join(text.split("\xa0"))  # drop non-breaking spaces (\xa0)
    cont.append(title)
    cont.append(image)
    cont.append(text)
    cont.append(category)
    cont.append(tags)
    cont.append(get_downUrl(url))
    return cont
# Save all info for one article
'''
contents[0] novel title
contents[1] cover image url
contents[2] synopsis
contents[3] category
contents[4] tags
contents[5] download url
'''
def saveInfo(category, pageIndex):
    number = 1
    for ls in getList(category, pageIndex):
        contents = getContent(ls)
        mkdir(contents[0])  # create a folder named after the novel
        saveImg(contents[1], contents[0])  # save the cover image
        saveBrief(contents[4] + "\n\n" + contents[2], contents[0])  # save tags + synopsis
        saveRAR(contents[5], contents[0])  # download the novel archive
        print("\n------ Saved novel", str(number), "of category", category, "------\n")
        number += 1
# Start the spider
def runSpider(pageId, category):
    num = getPageTotal(category)  # total pages in this category
    for i in range(int(pageId), int(num) + 1):
        saveInfo(category, i)
        print("\n[--------- Page", i, "done! ---------]\n")
if __name__ == '__main__':
    c = [23, 25, 26, 27, 28, 29, 55]
    # ss = get_page_total(base_url,23)
    # u = getList(23, 2)  # get the article urls from a listing page
    # t = getContent("http://www.zxcs.me/post/1568")
    # u = get_downUrl("http://www.zxcs.me/post/11610")
    # print(t)
    '''
    都市生活 23    精校奇幻 38    精校灵异 41    精校竞技 44
    精校武侠 36    精校玄幻 39    精校历史 42    精校游戏 45
    精校仙侠 37    精校科幻 40    精校军事 43    二次元 55
    '''
    f = open("./config.ini", encoding='utf-8')
    print(f.read())
    category = input("Enter the category ID to scrape: ")
    pages = input("\nEnter the start page: ")
    if len(category) > 0 and len(pages) > 0:
        print("\nStarting the spider...")
        time.sleep(2)
        print("Starting to scrape...", time.strftime("%Y/%m/%d %H:%M:%S"))
        time.sleep(3)
        runSpider(pages, category)  # runSpider(start page, category ID)
        # for p in c:
        print("Category", category, "finished!")
How should I change this to make it work? Please help.
In the original code there is a config.ini file in the same directory; the error occurs because that file does not exist when you run the script. Two things to try: first, comment out the f = open(...) and print(f.read()) lines and see whether it runs. Second, find config.ini in the original project's source, put it in the directory, and open it via an absolute path.
f = open(r".\config.ini", encoding='utf-8')
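For example (the path below assumes config.ini sits in the project root shown in your traceback; adjust it to wherever the file actually lives):

f = open(r"C:\Users\Jason\PycharmProjects\pythonProject1\config.ini", encoding='utf-8')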
DOS/Windows paths use backslashes; for a file in the same directory you can simply drop the prefix altogether and write:
f = open("config.ini", encoding='utf-8')
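A third, more robust option is to build the path from the script's own location, so it keeps working no matter which working directory PyCharm starts the process in; a minimal sketch:

import os

config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.ini")
f = open(config_path, encoding='utf-8')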