小说爬取如何多线程?

下面是完整的程序:通过分析笔趣阁的网址规律拼接出每本书的目录页地址,然后逐本爬取全站小说。

问题:如何利用多线程解决速度过慢的问题?

作者大致思路:改进书籍爬取,可以一次性爬取多本书籍。

进一步的提升思路:改进章节爬取,让同一本书的多个章节可以同时爬取(作者实力不足,尚未实现)。

目前已实现的功能:章节增量更新(中途中断后可以从上次的位置继续爬取)、记录程序运行时间。


import os
import re
import time
from types import TracebackType
import requests
from lxml import etree




# Wall-clock start of the run; the closing summary subtracts it from the
# end timestamp to report the total running time.
yxstart = time.time()


# Request headers passed to requests.get(); the user-agent mimics a desktop
# Chrome browser.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.132 Safari/537.36'
    ),
}




# Pattern applied to every raw text node of a chapter body: the first match's
# capture group is kept, nodes without a match are dropped.
_CONTENT_LINE_RE = re.compile(r'.+?(.\w+.+)')

# Characters replaced in a book title before it is used as a directory name
# (path separators and characters Windows rejects in file names).
_UNSAFE_PATH_CHARS_RE = re.compile(r'[\\/:*?"<>|]')


def _fetch_tree(url, timeout=3, retries=10, delay=0.5):
    """Download *url* and return it parsed as an lxml HTML tree.

    Only network-level failures (``requests.RequestException``) and empty
    documents are retried, so Ctrl-C still interrupts the crawl. Returns
    ``None`` when all *retries* attempts fail instead of looping forever.
    """
    for _ in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            time.sleep(delay)
            continue
        resp.encoding = 'utf-8'
        tree = etree.HTML(resp.text)
        if tree is not None:
            return tree
        time.sleep(delay)
    return None


def _clean_paragraphs(raw_lines):
    """Return the printable paragraphs extracted from raw chapter text nodes."""
    paragraphs = []
    for raw in raw_lines:
        found = _CONTENT_LINE_RE.findall(raw)
        if found:
            # Strip non-breaking spaces and carriage returns from both ends.
            # (The original stripped the letter 'r', discarding the nbsp strip.)
            paragraphs.append(found[0].strip('\xa0\r'))
    return paragraphs


def _read_last_done(progress_path):
    """Return the index of the last saved chapter, or -1 if none was saved.

    Raises ValueError when the progress file exists but is not an integer.
    """
    try:
        with open(progress_path, 'r', encoding='utf-8') as f:
            return int(f.read().strip())
    except FileNotFoundError:
        return -1


def _write_last_done(progress_path, index):
    """Record *index* as the last saved chapter.

    The value is written to a temporary file and then moved over the real one,
    so an interruption cannot leave a truncated progress file behind.
    """
    tmp_path = progress_path + '.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        f.write(str(index))
    os.replace(tmp_path, progress_path)


def pq(wz):
    """Download the chapters of one book that have not been saved yet.

    *wz* is the URL of the book's index page. Chapters are appended to
    ``<cwd>/<book title>/文章.txt``; the index of the last chapter saved is
    kept in ``zjs.txt`` next to it, so a later call resumes after it.

    Returns 0 when the book is skipped (index page unavailable, no title
    found, unreadable progress file) and None otherwise.
    """
    from urllib.parse import urljoin

    index_tree = _fetch_tree(wz, timeout=10)
    if index_tree is None:
        print(wz + "目录页获取失败,跳过爬取\n")
        return 0

    title_nodes = index_tree.xpath('//*[@id="info"]/h1/text()')
    if not title_nodes:
        print(wz+"书籍无名,跳过爬取\n")
        return 0
    bookname = title_nodes[0]

    print("-------------------"+str(bookname)+"------------------")

    # The first 12 links are skipped, as in the original code
    # (presumably a "latest chapters" preview -- TODO confirm against the site).
    # Link and title are read from the same <a> element so the two can never
    # drift out of step.
    anchors = index_tree.xpath('//*[@id="list"]/dl/dd/a')[12:]
    chapters = [
        (a.get('href'), ''.join(a.xpath('text()')))
        for a in anchors
        if a.get('href')
    ]
    zjs = len(chapters)

    book_dir = os.path.join(os.getcwd(),
                            _UNSAFE_PATH_CHARS_RE.sub('_', str(bookname)))
    os.makedirs(book_dir, exist_ok=True)
    progress_path = os.path.join(book_dir, 'zjs.txt')
    text_path = os.path.join(book_dir, '文章.txt')

    try:
        # zjs.txt stores the index of the last chapter saved; resume after it.
        ks = _read_last_done(progress_path) + 1
    except ValueError:
        print(wz + "进度文件损坏,跳过爬取\n")
        return 0

    if ks >= zjs:
        print('---------------------没有章节更新-----------------------')
        return None

    print('------------------发现新的章节,开始更新-----------------')
    print('------------------共' + str(zjs - ks) + '个章节-----------------')

    for i in range(ks, zjs):
        href, chapter_title = chapters[i]
        page = _fetch_tree(urljoin(wz, href))
        if page is None:
            # Leave the progress file untouched so the next run retries here.
            print('章节获取失败,停止更新本书: ' + chapter_title)
            return None

        # The first two text nodes of the content block are skipped, as in the
        # original code.
        paragraphs = _clean_paragraphs(
            page.xpath('//*[@id="content"]/text()')[2:])

        # Download first, then write the whole chapter in one call, so an
        # interrupted download never leaves a header without its body.
        pieces = ['   ---------------------' + chapter_title
                  + '---------------------\n']
        pieces.extend('     ' + p + '\n' + '\n' for p in paragraphs)
        with open(text_path, 'a', encoding='utf-8') as file:
            file.write(''.join(pieces))

        _write_last_done(progress_path, i)
        print(str(i)+"  "+chapter_title)
    return None

# with open('J:\py文件夹\zjs.txt', 'w')as f:
#     ks = f.write(str(zjs))



# Crawl every candidate book id in ascending order, one book at a time.
# NOTE(review): pq() writes only inside the directory named after the book, so
# this loop looks like the natural place for a thread pool -- confirm that no
# two ids share a title and that the site tolerates parallel requests.
FIRST_BOOK_ID = 1
LAST_BOOK_ID = 137719
for book_id in range(FIRST_BOOK_ID, LAST_BOOK_ID + 1):
    pq('https://www.biqugeu.net/01_' + str(book_id) + "/")


# Closing summary: finish banner, wall-clock finish time, elapsed seconds.
print('--------------------------结束--------------------------')
sj = time.ctime()
yxend = time.time()
yxsj = yxend - yxstart

print('--------------结束时间:{}--------------'.format(sj))
print("--------------运行时间:{}-------------".format(str(yxsj)))

多线程使用的是threading库