下面是完整的程序:通过分析笔趣阁的网址规律,拼接出每本书的目录页地址,然后逐本爬取全站小说
问题:如何利用多线程解决速度过慢的问题?
作者大致思路:改进书籍爬取,可以一次性爬取多本书籍。
提升思路:改进章节爬取(作者目前能力不足,尚未实现)
目前已实现的功能:章节增量更新(爬取中途中断后,可从上次的进度继续),记录程序运行时间
import os
import re
import time
from types import TracebackType
import requests
from lxml import etree
# Wall-clock start of the run; the summary at the end of the script
# subtracts it from the finish time to report the total running time.
yxstart = time.time()

# HTTP headers sent with every page request: a desktop Chrome user-agent.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.132 Safari/537.36'
    ),
}
def _fetch_html(url, timeout=3, retry_delay=0.5):
    """Download *url* and return it parsed as an lxml HTML tree.

    Network-level failures (``requests.RequestException``, which covers
    timeouts and connection errors) are retried indefinitely after a short
    pause. Any other exception, including ``KeyboardInterrupt``, propagates,
    so the crawl can still be interrupted while a request is being retried.
    """
    while True:
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            time.sleep(retry_delay)
            continue
        resp.encoding = 'utf-8'
        return etree.HTML(resp.text)


def _safe_dirname(name):
    """Return *name* with characters that Windows forbids in file names replaced by '_'."""
    return ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in name)


def _last_saved_chapter(progress_path):
    """Return the chapter index stored in *progress_path*, or -1 if the file is missing."""
    try:
        with open(progress_path, 'r') as f:
            return int(f.read())
    except FileNotFoundError:
        return -1


def pq(wz):
    """Download one book, or append its new chapters, from its index page.

    The book is stored in a directory named after its title inside the
    current working directory:

    * ``文章.txt`` - the chapters, appended in order;
    * ``zjs.txt``  - the 0-based index of the last chapter fully saved, which
      lets an interrupted run resume where it stopped.

    Args:
        wz: URL of the book's index page, e.g.
            ``https://www.biqugeu.net/137_137192/``.

    Returns:
        ``0`` when the page has no book title (the book is skipped),
        otherwise ``None``.
    """
    # Imported locally so the block does not depend on a new file-level import.
    from urllib.parse import urljoin

    index = _fetch_html(wz)

    title_nodes = index.xpath('//*[@id="info"]/h1/text()')
    if not title_nodes:
        print(wz+"书籍无名,跳过爬取\n")
        return 0
    bookname = title_nodes[0]
    print("-------------------"+str(bookname)+"------------------")

    # Take href and title from the same <a> element so the two can never get
    # out of step. The first 12 links are skipped - presumably a "latest
    # chapters" box that repeats entries of the full list; TODO confirm.
    anchors = index.xpath('//*[@id="list"]/dl/dd/a[@href]')[12:]
    chapters = [(a.get('href'), ''.join(a.xpath('text()'))) for a in anchors]
    zjs = len(chapters)

    ml = os.path.join(os.getcwd(), _safe_dirname(bookname))
    os.makedirs(ml, exist_ok=True)
    progress_path = os.path.join(ml, 'zjs.txt')
    book_path = os.path.join(ml, '文章.txt')

    # A missing progress file means nothing has been saved yet, so start at
    # chapter 0; otherwise resume right after the last saved chapter.
    ks = _last_saved_chapter(progress_path) + 1

    if ks >= zjs:
        print('---------------------没有章节更新-----------------------')
        return None

    print('------------------发现新的章节,开始更新-----------------')
    print('------------------共' + str(zjs - ks) + '个章节-----------------')

    for i in range(ks, zjs):
        href, chapter_title = chapters[i]
        page = _fetch_html(urljoin(wz, href))

        # The first two text nodes of the content box are dropped -
        # presumably site boilerplate; TODO confirm.
        raw_lines = page.xpath('//*[@id="content"]/text()')[2:]

        parts = [' ---------------------' + chapter_title + '---------------------\n']
        for raw in raw_lines:
            # str.strip() removes the non-breaking-space padding as well as
            # stray '\r'/'\n'; lines that end up empty are skipped.
            text = raw.strip()
            if text:
                parts.append(' ' + text + '\n' + '\n')

        # Write the whole chapter in one go, only after it was fetched, so an
        # interruption during the download leaves no half-written chapter.
        with open(book_path, 'a', encoding='utf-8') as out:
            out.write(''.join(parts))
        with open(progress_path, 'w') as f:
            f.write(str(i))
        print(str(i)+" "+chapter_title)
    return None
# Crawl every candidate book id in turn. A failure in one book is reported
# and skipped so that it cannot abort the whole crawl; KeyboardInterrupt is
# not an Exception subclass and therefore still stops the script.
# NOTE(review): every id is requested under the fixed '01_' prefix; confirm
# that this matches the site's URL scheme for all ids.
for i in range(1, 137720):
    book_url = 'https://www.biqugeu.net/01_'+str(i)+"/"
    try:
        pq(book_url)
    except Exception as exc:
        print(book_url + "爬取失败,跳过: " + repr(exc))
print('--------------------------结束--------------------------')

# Report the finish time and the total running time in seconds.
sj = time.ctime()
yxend = time.time()
yxsj = yxend - yxstart
print('--------------结束时间:'+sj+'--------------')
print("--------------运行时间:"+str(yxsj)+"-------------")
多线程使用的是threading库