import requests
from bs4 import BeautifulSoup
import requests as requets
def GetHTMLText(url):
    """Download ``url`` and return the response body as UTF-8 text.

    The request uses a 30-second timeout. ``raise_for_status`` is called on
    the response, so an HTTP error status raises an exception instead of
    the error page being returned as text.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Decode the body as UTF-8 instead of the encoding requests guessed.
    response.encoding = "utf-8"
    return response.text
def maskSoup(url):
    """Scrape the news page at ``url`` and append it to ``guiyuan_news.txt``.

    The page is fetched with ``GetHTMLText`` and parsed with BeautifulSoup.
    Title elements and content elements are paired in document order; each
    pair is printed and appended to the output file as ``key:value`` lines.

    Args:
        url: Address of the news page to scrape.

    Returns:
        None. Results are written to ``guiyuan_news.txt`` and to stdout.
    """
    page_html = GetHTMLText(url)
    soup = BeautifulSoup(page_html, "html.parser")

    # An element carrying several classes is matched by chaining the class
    # names with dots. Separating them with spaces would instead look for
    # descendant tags named ``pt20`` / ``mt5`` and match nothing.
    title_nodes = soup.select(".c_title.pt20.mt5")
    content_nodes = soup.select("div.v_news_content")

    # The context manager guarantees the file is flushed and closed.
    with open("guiyuan_news.txt", "a+", encoding="utf-8") as fp:
        for title_node, content_node in zip(title_nodes, content_nodes):
            data = {
                "题目": title_node.get_text().strip(),
                "内容": content_node.get_text().strip(),
            }
            # Iterating a dict yields only its keys, so each line is
            # formatted explicitly from the key/value pairs.
            fp.writelines(
                "{}:{}\n".format(key, value) for key, value in data.items()
            )
            print(data)
# Run the scraper only when this file is executed as a script. The module
# name lives in the dunder variable ``__name__``; a bare ``name`` is undefined.
if __name__ == '__main__':
    url = "https://www.gxljcollege.cn/info/1012/33907.htm"
    maskSoup(url)
问题有些多,已经改好了,修正后的代码如下:
import requests
from bs4 import BeautifulSoup
def GetHTMLText(url, timeout=30):
    """Download ``url`` with a browser-like User-Agent and return its text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 '
            'SLBrowser/7.0.0.12151 SLBChan/30'
        ),
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # Decode the body as UTF-8 instead of the encoding requests guessed.
    response.encoding = "utf-8"
    return response.text
def maskSoup(url):
    """Scrape the news page at ``url`` and append it to ``guiyuan_news.txt``.

    The page is fetched with ``GetHTMLText`` and parsed with BeautifulSoup.
    The text of the first title element (if any) is printed, then title and
    content elements are paired in document order; each pair is printed and
    appended to the output file as ``key:value`` lines.

    Args:
        url: Address of the news page to scrape.

    Returns:
        None. Results are written to ``guiyuan_news.txt`` and to stdout.
    """
    page_html = GetHTMLText(url)
    soup = BeautifulSoup(page_html, "html.parser")

    title_nodes = soup.select('[class="c_title pt20 mt5"]')
    content_nodes = soup.select("div.v_news_content")

    # Guard the preview print: indexing an empty result list would raise
    # IndexError when the selector matches nothing on the page.
    if title_nodes:
        print(title_nodes[0].get_text())

    # The context manager guarantees the file is flushed and closed.
    with open('guiyuan_news.txt', "a+", encoding="utf-8") as fp:
        for content_node, title_node in zip(content_nodes, title_nodes):
            data = {
                "题目": title_node.get_text().strip(),
                "内容": content_node.get_text().strip(),
            }
            print(data)
            fp.writelines(
                "{}:{}\n".format(key, value) for key, value in data.items()
            )
# Script entry point: scrape the article only when run directly, not on import.
if __name__ == "__main__":
    url = "https://www.gxljcollege.cn/info/1012/33907.htm"
    maskSoup(url)