import re
import time
import os
import requests
from lxml import etree
import threading
from queue import Queue
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Mobile Safari/537.36'}

# URL of the novel to crawl (its table-of-contents page)
url = 'https://www.biquwx.la/1_1760/'

# shared queues for chapter titles and chapter URLs
titles_queue = Queue()
text_urls = Queue()

# request the table-of-contents page once; producer and consumer both read from it
contentpage = requests.get(url=url, headers=headers).content.decode('utf-8')
html = etree.HTML(contentpage)

'''define the producer'''
def producer():
    """Collect the data:
    1. the title of every chapter
    2. the URL of every chapter
    (the novel's name is parsed later, in the consumer)
    """
    title1 = html.xpath('//div[@id="list"]/dl/dd/a/text()')
    for k in title1:
        titles_queue.put(k)  # chapter title
    text_newurl = []
    text_url2 = html.xpath('//div[@id="list"]/dl/dd/a/@href')
    for i in text_url2:
        j = 'https://www.biquwx.la/1_1760/' + i
        text_newurl.append(j)
    for j in text_newurl:
        text_urls.put(j)
'''define the consumer'''
def consumer():
    # novel name, parsed from the table-of-contents page
    name = re.findall('<h1>(.+?)</h1>', contentpage, re.DOTALL)[0]
    print(name)
    # each thread keeps consuming chapters until the queues are empty
    while not titles_queue.empty():
        # chapter title
        title = titles_queue.get()
        # chapter URL: request it and pull out the text
        text_url = text_urls.get()
        resp = requests.get(url=text_url, headers=headers)
        content = resp.content.decode('utf-8')
        html1 = etree.HTML(content)
        text = "".join(html1.xpath('//div[@id="content"]/text()'))
        while len(text) == 0:
            # no content came back, so request again (this is where it can hang)
            resp2 = requests.get(url=text_url, headers=headers)
            content2 = resp2.content.decode('utf-8')
            html2 = etree.HTML(content2)
            text = "".join(html2.xpath('//div[@id="content"]/text()'))
        if len(text) != 0:
            # the novel gets its own folder; create it if it does not exist yet
            folder = os.path.join(r'D:\AAAA桃花青帝\Python文件\爬虫\爬取小说', name)
            os.makedirs(folder, exist_ok=True)
            with open(os.path.join(folder, title + '.txt'), 'a', encoding='utf-8') as f:
                f.write(text)
            print(title + ' downloaded!')
def multi():
    # one producer is enough: it only parses the table-of-contents page
    p = threading.Thread(target=producer)
    p.start()
    p.join()
    # start the consumer threads
    for j in range(50):
        t = threading.Thread(target=consumer)
        t.start()

multi()
The chapter links can't be fetched; it keeps hanging at resp.status.
The flow is as the code shows, but the 笔趣阁 server isn't great, and it doesn't have much of an anti-crawling setup either, so opening 50 threads basically just gets you 503 responses back. If you have proxy IPs you can add them in; beyond that, start fewer threads and throttle the crawl speed, for example by sleeping inside each chapter fetch, as in the sketch below. If this works for you, I'd appreciate it if you accept the answer!
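A minimal sketch of that throttled fetch, assuming the same `requests` + `headers` setup as above; the `fetch_chapter` helper, the retry count, and the `PROXIES` pool are illustrative assumptions, not part of the original code:

import time
import random
import requests

# hypothetical proxy pool; fill in your own proxies if you have any
PROXIES = []  # e.g. [{'http': 'http://1.2.3.4:8080'}]

def fetch_chapter(url, headers, retries=3):
    """Illustrative helper: fetch one chapter, sleeping between attempts
    so the server isn't hammered, and retrying when it answers 503."""
    for attempt in range(retries):
        proxy = random.choice(PROXIES) if PROXIES else None
        resp = requests.get(url, headers=headers, proxies=proxy, timeout=10)
        if resp.status_code == 200:
            return resp.content.decode('utf-8')
        # back off before the next attempt (503 usually means "slow down")
        time.sleep(1 + random.random() * 2)
    return None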
import os
from queue import Queue
import threading
import requests
from lxml import etree

class BQG:
    def __init__(self):
        self.url_queue = Queue()
        self.prefix_url = 'https://www.biquwx.la/1_1760/'
        self.dir_path = os.path.join(os.getcwd(), '武炼巅峰')
        self.create_dir()
        self.get_links()
        self.get_consumers()

    def create_dir(self):
        """Create the output directory."""
        if not os.path.exists(self.dir_path):
            os.mkdir(self.dir_path)

    def get_links(self):
        """Collect every chapter URL from the table of contents."""
        top_res = requests.get(self.prefix_url)
        top_html = etree.HTML(top_res.content.decode())
        suffix_url = top_html.xpath('//div[@id="list"]/dl/dd/a/@href')
        for i in suffix_url:
            url = self.prefix_url + i
            print(url)
            self.url_queue.put(url)

    def get_consumers(self):
        """Start the crawler threads."""
        for i in range(50):
            t = threading.Thread(target=self.get_content)
            t.start()

    def get_content(self):
        """Fetch chapter pages; each thread keeps pulling URLs until the queue is empty."""
        while not self.url_queue.empty():
            url = self.url_queue.get()
            res = requests.get(url)
            html = etree.HTML(res.content.decode())
            title = ''.join(html.xpath('//h1/text()'))
            content = '\n'.join(html.xpath('//div[@id="content"]//text()'))
            file_path = os.path.join(self.dir_path, '{}.txt'.format(title))
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(title)
                f.write(content)

if __name__ == '__main__':
    bqg = BQG()
Some chapters won't load; access is denied.
When crawling a novel with multiple threads, multiple processes, coroutines, or async I/O, you have to watch out for anti-crawling mechanisms and keep the crawl slow enough that your IP doesn't get banned. One more point: don't iterate over chapter numbers with a for loop, because some chapters simply don't exist, and a bare for loop will leave those pages failing to load. Instead, match every chapter link on the table-of-contents page first and then crawl from that list; that's what I usually do, as in the sketch below.
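A minimal sketch of that TOC-first approach, using `re` as the code below does; the link pattern is an illustrative assumption about the page markup, so adjust it to the actual HTML:

import re
import requests

toc_url = 'https://www.ctgf.net/xiaoshuo/84190.html'  # table-of-contents page
res = requests.get(toc_url)
res.encoding = res.apparent_encoding

# hypothetical link pattern; only chapters that actually exist are matched
chapters = re.findall(r'<a href="(/xiaoshuo/[^"]+)">(.+?)</a>', res.text)
for href, title in chapters:
    print(title, 'https://www.ctgf.net' + href)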
The code below is the anti-crawl setup I wrote while scraping the novel 庆余年第二部; you can use it as a reference.
import requests, re, time
import random, os

start_url = 'https://www.ctgf.net/xiaoshuo/84190.html'

# pool of User-Agent strings; one is picked at random
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
]

headers = {
    "User-Agent": random.choice(USER_AGENTS),
    "Referer": start_url,
    "Host": "www.ctgf.net",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
}
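A short usage sketch for those headers; rotating the User-Agent and sleeping a random interval on every request is my assumption about how the script continues, since the answer only shows the header setup:

def polite_get(url):
    """Illustrative: pick a fresh User-Agent per request and pause
    a random 1-3 seconds so the crawl stays slow enough to avoid a ban."""
    headers["User-Agent"] = random.choice(USER_AGENTS)
    time.sleep(1 + random.random() * 2)
    return requests.get(url, headers=headers, timeout=10)

res = polite_get(start_url)
print(res.status_code)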
I've also written a 笔趣阁 crawler; you can use it as a reference.
Which novel do you want to crawl? Post the link and I'll take a shot at writing it.