import requests
from bs4 import BeautifulSoup
# Fetch a page
def getHTMLText(url, head):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        head: Mapping of HTTP request headers passed to ``requests.get``.

    Returns:
        The decoded page text, or ``None`` when the request fails (a
        failure message is printed in that case).
    """
    try:
        # A timeout keeps an unresponsive server from blocking forever.
        r = requests.get(url, headers=head, timeout=10)
        r.raise_for_status()
        # Decode with the encoding detected from the body content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures are treated as "fetch failed";
        # programming errors and KeyboardInterrupt still propagate.
        print("获取失败")
        return None
# Parse the listing page and store every linked article
def saveText(html, filePath):
    """Save the text of every article linked from a listing page.

    The listing page is searched for the ``<table class="border1">``
    element; each ``<a>`` inside it is treated as one article.  Every
    article page is downloaded and the text of its own ``border1`` table
    is written to ``filePath + <link text> + ".txt"`` (UTF-8).

    Args:
        html: Markup of the listing page.  ``None`` or an empty string is
            treated as a page without links.
        filePath: Prefix (normally a directory path ending in a
            separator) prepended to each output file name.
    """
    base_url = "http://www.jgjy.gov.cn/newss/"

    listing = BeautifulSoup(html or "", "html.parser")
    index_table = listing.find("table", {"class": "border1"})
    anchors = index_table.find_all("a") if index_table is not None else []

    # Keep each title together with its link so that every article is
    # written to its own file rather than all of them to the last title.
    articles = []
    for anchor in anchors:
        href = anchor.get("href")
        if not href:
            # An anchor without a target cannot be downloaded.
            continue
        articles.append((anchor.get_text(), base_url + href))

    print([link for _, link in articles])

    for title, link in articles:
        # A timeout keeps one unresponsive page from stalling the run.
        r = requests.get(link, timeout=10)
        r.encoding = r.apparent_encoding
        page = BeautifulSoup(r.text, "html.parser")
        content = page.find("table", {"class": "border1"})
        if content is None:
            # The page has no content table; nothing to store for it.
            continue
        # The context manager closes the file even if the write fails.
        with open(filePath + title + ".txt", "w", encoding="utf-8") as out:
            out.write(content.get_text())

    print("完成")
# Entry point
def main():
    """Download the article listing page and save every linked article.

    The output directory is created when missing.  If the listing page
    cannot be downloaded, nothing is saved and the completion message is
    not printed.
    """
    import os

    filePath = "E://text//连云港市机关工委-2018/"  # output directory
    url = "http://www.jgjy.gov.cn/newss/nlist.asp?classid=94"
    head = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0'}

    html = getHTMLText(url, head)
    if html is None:
        # getHTMLText has already reported the failure.
        return

    # Opening the output files fails if the target directory is missing.
    os.makedirs(filePath, exist_ok=True)
    saveText(html, filePath)
    print("存储完成")


# Run the scraper only when executed as a script, not on import.
if __name__ == "__main__":
    main()
你是不是说这种?可以用编辑工具,新手的话建议用 Sublime Text。