# 不知道为什么 很容易中断。 (Note: the crawl breaks off easily for unknown reasons.)
import re
import requests
from bs4 import BeautifulSoup
from lxml import etree
import os
import shutil
# Browser-like User-Agent header so the novel site serves normal pages
# instead of rejecting the bot.
head={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"}
# Running chapter counter; incremented once per fetched page in get_and_find
# and used as a prefix in the saved filename (re-initialised to 62 in __main__).
a=0
def get_and_find(url):
    """Fetch one chapter page, clean its text, save it to disk, return results.

    Returns a list ``d`` where
        d[0] - chapter title fragment matched by ' 第.*?章 .*?_'
        d[1] - list of cleaned paragraph strings
        d[2] - site-relative URL of the next chapter ('/26/26128/...html')

    Raises IndexError when the title or next-chapter link is not found
    (which is also how the crawl terminates on the last page).
    """
    global a
    a = a + 1
    d = []
    rew = requests.get(url=url, headers=head)
    # Let requests sniff the real charset; the old round trip of
    # rew.text.encode(declared_charset) produced mojibake whenever the
    # server header lied about the encoding.
    rew.encoding = rew.apparent_encoding or rew.encoding
    soup = BeautifulSoup(rew.text, "lxml")
    title = soup.find_all(name='title')[0]
    title_ = re.findall(string=str(title), pattern=r' 第.*?章 .*?_')[0]
    d.append(title_)
    text = etree.HTML(str(soup))
    word = text.xpath('//div[@id="content"]/text()')
    # Strip all junk characters in a single pass. NOTE: the original chain
    # of clear/append loops silently lost the '\u2200' removal (it was
    # stored in a list that was then cleared and rebuilt from pre-removal
    # data); str.translate fixes that and is one C-level pass.
    junk = str.maketrans('', '', '\xa0\ufeff\u2200\u203f\xbe\u2022')
    words = [str(i).translate(junk) for i in word]
    # Locate the "next chapter" anchor, then extract its relative URL.
    pattern = r'→ <a href=".*?\.html">.*</a>'
    next_link = re.findall(string=str(soup), pattern=pattern)[0]
    pattern2 = r'/26/26128/.*?html'
    next_url = re.findall(string=next_link, pattern=pattern2)[0]
    d.append(words)
    d.append(str(next_url))
    print(str(a) + str(title) + "爬取成功")
    # Explicit utf-8: the platform default codec (gbk on this Windows box)
    # raised UnicodeEncodeError on some chapters and killed the crawl.
    with open(path + "/" + str(a) + str(d[0]) + ".txt", "w", encoding="utf-8") as n:
        n.write(str(d[0]))
        for i in words:
            n.write(" " + i)
        n.flush()
    return d
def next_get(f):
    """Build the absolute URL of the next chapter and continue the crawl."""
    # f[2] holds the next chapter's site-relative path; main is the site root.
    moiddle(str(main) + str(f[2]))
def moiddle(url):
    """Crawl chapter pages starting at ``url``, following next-chapter links.

    Implemented as a loop instead of the original mutual recursion
    (moiddle -> get_and_find -> next_get -> moiddle), which grew the call
    stack by one frame per chapter and crashed with RecursionError after
    roughly 1000 chapters — the "easily interrupted" behaviour noted at
    the top of the file. The loop ends when get_and_find raises (no next
    link found), exactly as the recursive version did.
    """
    global a
    while True:
        f = get_and_find(url)
        # The original trailing `a = a + 1` only ran while unwinding the
        # recursion (i.e. never during a normal crawl), so it is dropped;
        # get_and_find already advances the counter once per chapter.
        url = str(main) + str(f[2])
if __name__ == "__main__":
    url_ = input("请输入爬取网址:")
    main = input("请输入网址主站网址:")
    name = input("请输入书籍名称:")
    # Output directory on the desktop, named after the book.
    path = 'C:/Users/Administrator/Desktop/' + str(name)
    # Start from a clean directory: wipe any previous run, then recreate.
    # (Truthiness instead of `== True`; single mkdir instead of one per branch.)
    if os.path.exists(path):
        shutil.rmtree(path)
    os.mkdir(path)
    # Chapter numbering starts at 63: get_and_find increments `a` before
    # each save, so the first file is prefixed 63.
    a = 62
    moiddle(url_)
# 是针对笔趣阁的爬虫 (A crawler targeting the Biquge novel site.)