How do I fix this Python crawler's data storage problem?

This is novel-crawling code I found online; I have never studied Python, so I don't know how to change it myself. I'm using PyCharm 2021.2.2. The problem seems to be with the data storage path at the end.

Error message:

C:\Users\Jason\PycharmProjects\pythonProject1\venv\Scripts\python.exe C:/Users/Jason/PycharmProjects/pythonProject1/venv/01.py
Traceback (most recent call last):
  File "C:\Users\Jason\PycharmProjects\pythonProject1\venv\01.py", line 234, in <module>
    f = open("./config.ini", encoding='utf-8')
FileNotFoundError: [Errno 2] No such file or directory: './config.ini'

Process finished with exit code 1


Code (the part that fails):

    f = open("./config.ini", encoding='utf-8')
    print(f.read())
    category = input("请输入您要爬取的分类:")

    pages = input("\n请输入您要爬取的开始页:")
    if len(category) > 0 and len(pages) > 0:
        print("\n爬虫启动中...")
        time.sleep(2)
        print("开始爬取数据...", time.strftime("%Y/%m/%d %H:%M:%S"))
        time.sleep(3)
        runSpider(pages, category)  # runSpider(page number, category id)
        # for p in c:
        print("分类", category, "爬取完毕!")

Full code:

import requests
import random
from lxml import etree
from html.parser import HTMLParser
import os
import time
import re

base_url = 'http://www.zxcs.me/sort/{}'

page = "/page/{}"
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
]
UA = random.choice(user_agent_list)
HEADERS = {'User-Agent': UA}


# Simple wrapper around the HTTP request

def get_requests(url):
    global NETWORK_STATUS
    NETWORK_STATUS = True  # status flag
    try:
        req = requests.get(url, headers=HEADERS, timeout=20)
        if req.status_code == 200:
            html = req.text
            return html
    except requests.exceptions.Timeout:
        NETWORK_STATUS = False  # mark the request as timed out
        if not NETWORK_STATUS:
            '''retry after a timeout'''
            for i in range(1, 10):
                print('请求超时,第%s次重复请求' % i)
                req = requests.get(url, headers=HEADERS, timeout=5)
                if req.status_code == 200:
                    html = req.text
                    return html
    return -1


# Get the total number of pages in a category's listing

def getPageTotal(cates):
    # base_url = "http://www.zxcs.me/sort/25/"
    req = get_requests(base_url.format(cates))
    selector = etree.HTML(req)
    page = selector.xpath("//*[@id='pagenavi']/a/@href")[-1].split('/')[-1]  # total number of pages in this category

    return page


# Get one listing page of a category

def getPage(category, pageIdex):
    req = get_requests(base_url.format(category) + page.format(str(pageIdex)))
    return req


# Get the article URLs from a listing page

def getList(cates, pageIdex):
    req = getPage(cates, pageIdex)
    selector = etree.HTML(req)
    url_list = selector.xpath("//*[@id='plist']/dt/a/@href")

    return url_list


# Get the download link

def get_downUrl(url):
    link_id = url.split('/')[-1]  # take the article id from the end of the URL
    req = get_requests("http://www.zxcs.me/download.php?id={}".format(link_id))
    selector = etree.HTML(req)
    d_url = selector.xpath("//span[@class='downfile']/a/@href")[0]
    return d_url


# Download the novel archive

def saveRAR(url, fileName):
    print("开始下载", fileName, "小说...\n")
    rar_name = url.split('/')[-1]  # 截取网址中小说名称
    names = fileName + '/' + str(rar_name)  # 组合下载路径
    start = time.time()  # 开始时间
    size = 0
    req = requests.get(url, stream=True, headers=HEADERS)
    chunk_size = 1024  # 每次下载的数据大小
    content_size = int(req.headers['content-length'])  # 总大小

    if req.status_code == 200:
        print('[文件大小]:%0.2f MB' % (content_size / chunk_size / 1024))  # convert to MB before printing
        with open(names, 'wb') as f:
            for d in req.iter_content(chunk_size=chunk_size):  # stream the raw response
                f.write(d)
                size += len(d)  # bytes downloaded so far
                print(
                    '\r' + '[下载进度]:%s %.2f%%' % ('>' * int(size * 50 / content_size), float(size / content_size * 100)),
                    end='')

        end = time.time()
        print('\n', "小说下载完成!用时%.2f秒" % (end - start))


# Save the novel synopsis

def saveBrief(content, name):
    fileName = name + "/" + name + ".txt"
    print("正在偷偷保存小说简介", fileName)
    # write with UTF-8 and a with-block so the file is flushed and closed properly
    with open(fileName, "w+", encoding='utf-8') as f:
        f.write(content)


# Save the cover image from the detail page to disk

def saveImg(imgUrl, fileName):
    name = imgUrl.split('/')[-1]
    req = requests.get(imgUrl).content
    if name.endswith(".jpg"):
        names = fileName + "/" + str(name)
    else:
        names = fileName + "/" + str(name.split('?')[0])

    f = open(names, 'wb')
    f.write(req)
    print("正在悄悄的保存小说图片", name)
    f.close()


# Create a new directory

def mkdir(path):
    path = path.strip()
    # check whether the path already exists
    isExists = os.path.exists(path)
    if not isExists:
        # create the directory if it does not exist yet
        print("正在新建名为", path, '的文件夹')
        os.makedirs(path)
        return True
    else:
        # the directory already exists, so skip creating it
        print("名为", path, '的文件夹已经创建!')
        return False


# Get the detail page content and return it as a list

def getContent(url):
    cont = []
    req = get_requests(url)
    selector = etree.HTML(req)
    title = selector.xpath("//*[@id='content']/h1/text()")[0]  # title
    category = selector.xpath("//*[@class='date']//a/text()")[1]  # category
    if len(selector.xpath("//*[@class='date']//a/text()")) >= 3:
        tags = selector.xpath("//*[@class='date']//a/text()")[2]  # tags
    else:
        tags = "暂无"
    if len(selector.xpath("//*[@id='content']/a/img/@src")) > 0:
        image = selector.xpath("//*[@id='content']/a/img/@src")[0]  # cover image
    elif len(selector.xpath("//*[@id='content']/img/@src")) > 0:
        image = selector.xpath("//*[@id='content']/img/@src")[0]  # cover image
    elif len(selector.xpath("//*[@id='content']/p/img/@src")) > 0:
        image = selector.xpath("//*[@id='content']/p/img/@src")[0]
    else:
        image = selector.xpath("//*[@id='content']/p/a/img/@src")[0]  # cover image

    # print(image)
    text_info = selector.xpath("//*[@id='content']//p")[2]  # synopsis paragraph of the detail page
    div_str = etree.tostring(text_info)  # raw HTML of that paragraph
    text = html_toText(div_str).strip().replace(' ', '')  # external helper that strips the HTML tags (see the note after the listing)
    text = "".join(text.split("\xa0"))  # remove the \xa0 non-breaking spaces

    cont.append(title)
    cont.append(image)
    cont.append(text)
    cont.append(category)
    cont.append(tags)
    cont.append(get_downUrl(url))
    return cont


# Save all the information for one article
'''
contents[0] novel title
contents[1] novel cover image
contents[2] novel synopsis
contents[3] novel category
contents[4] novel tags
contents[5] novel download link
'''


def saveInfo(category, pageIndex):
    number = 1
    for ls in getList(category, pageIndex):
        contents = getContent(ls)
        mkdir(contents[0])  # create a folder named after the novel
        saveImg(contents[1], contents[0])  # save the cover image
        saveBrief(contents[4] + "\n\n" + contents[2], contents[0])  # save the tags and synopsis
        saveRAR(contents[5], contents[0])  # download the novel attachment
        print("\n------ 分类ID为", category, "的第", str(number), "部小说信息保存完毕 ------\n")
        number += 1


# Start the crawler and begin scraping

def runSpider(pageId, categroy):
    num = getPageTotal(categroy)  # total number of pages in this category
    for i in range(int(pageId), int(num) + 1):
        saveInfo(categroy, i)
        print("\n[--------- 第", i, "页爬取完毕!---------]\n")


if __name__ == '__main__':
    c = [23, 25, 26, 27, 28, 29, 55]
    # ss = get_page_total(base_url,23)
    # u = getList(23, 2)  # 获取页面中文章url
    # t = getContent("http://www.zxcs.me/post/1568")
    # u = get_downUrl("http://www.zxcs.me/post/11610")
    # print(t)
    '''
    都市生活  23    精校奇幻   38   精校灵异    41  精校竞技    44
    精校武侠  36    精校玄幻   39   精校历史    42  精校游戏    45
    精校仙侠  37    精校科幻   40   精校军事    43  二次元      55

    '''
    f = open("./config.ini", encoding='utf-8')
    print(f.read())
    category = input("请输入您要爬取的分类:")

    pages = input("\n请输入您要爬取的开始页:")
    if len(category) > 0 and len(pages) > 0:
        print("\n爬虫启动中...")
        time.sleep(2)
        print("开始爬取数据...", time.strftime("%Y/%m/%d %H:%M:%S"))
        time.sleep(3)
        runSpider(pages, category)  # runSpider(page number, category id)
        # for p in c:
        print("分类", category, "爬取完毕!")


How should I change this to make it work? Please help.

In the original project there is a config.ini file in the same directory as the script, and the error occurs because that file does not exist when you run the code. Two things to try: first, comment out the two lines `f = open(...)` and `print(f.read())` and see whether the script runs. Second, find config.ini in the original project's source, put it into the directory, and open it with the file's absolute path.
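A minimal sketch of the second (absolute path) suggestion: build the path from the script's own location instead of relying on the current working directory. This assumes config.ini has been copied next to 01.py; since the script only prints the file, it can also simply be skipped when missing.

import os

config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.ini")
if os.path.exists(config_path):
    # resolves relative to the script file, so it works no matter where the script is launched from
    with open(config_path, encoding='utf-8') as f:
        print(f.read())
else:
    # the original code only prints the file, so it is safe to continue without it
    print("config.ini not found, skipping the category overview")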

f = open(r".\config.ini", encoding='utf-8')
dos/windows系统的路径用的反斜杠,同路径的索性不要加.
直接: f = open("config.ini", encoding='utf-8')
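If the original config.ini cannot be found at all, a stand-in can be generated; judging from the script it only holds the category overview that is printed before the prompts. The exact contents of the real file are unknown, so the text below is just the category table from the script's docstring:

# hypothetical replacement for the missing config.ini; the real file may look different
with open("config.ini", "w", encoding='utf-8') as f:
    f.write("都市生活 23    精校奇幻 38    精校灵异 41    精校竞技 44\n"
            "精校武侠 36    精校玄幻 39    精校历史 42    精校游戏 45\n"
            "精校仙侠 37    精校科幻 40    精校军事 43    二次元   55\n")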