# Goal: scrape a web page's text and save it to a .txt file.
# First attempt (does not work as written):
import requests
from bs4 import BeautifulSoup
kv={"Cookie: ad_t_3=2; ad_t_2=1; ad_t_side_fix_2=2; BAIDU_SSP_lcr=https://www.so.com/link?m=b2XYq2CTbCkKCsUJXqlQuOqJ%2F7bmqqkFyYjj6I1OaJj3QzusWcTMg5g2uslG%2Fk2%2FqNagu%2F5%2Fytg5cqFQxqMKNskdDeE2dCrBm4ZpYogB4Jn1KFU90K4VQgEQisAS%2FQfXn%2FwpFnSeMwrE%2BXV4pl38rOEmSAYF%2Bgsj51UgsxVKbAYZ1PAq4anrhQm2h5fdahVDzWGK5RdCdJHfNH00p4uw0LIdJbzcpVP%2FobLyzmCtpYpBu9srnRZ9yh0dlFmtGMI28qYFj6uzZdDH3yiXrYdQfmXqCaJOThcPr%2Ffml2zh08Vv35sEq; SUV=2006101439533660; gidinf=x099980109ee11a705c58d40e0009c30eba77d0f8a96; IPLOC=CN; t=1640072288098; reqtype=pc; beans_new_turn=%7B%22sohu-article%22%3A11%7D"}
url="http://news.sohu.com/a/503136672_121106991"
r = requests.request("GET", url, headers=kv)
r.encoding = r.apparent_encoding
soup = BeautifulSoup(r.text)  # no parser given: bs4 warns and has to guess one
bf = soup.find('div', class_='word')  # likely None: the article body is not in a div with class "word"
print(bf)
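The immediate failure is the header literal: in Python, braces around bare strings build a set, not a dict, so requests has no header names and values to work with. A minimal standalone demonstration (the short cookie value here is made up for illustration):

kv = {"Cookie: a=1"}              # one-element set -- requests rejects this as headers
headers = {"Cookie": "a=1"}       # dict: header name as key, cookie string as value
print(type(kv), type(headers))    # <class 'set'> <class 'dict'>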
Beyond that, BeautifulSoup should be told which parser to use, and on this page the article paragraphs sit inside the element with id "mp-editor", not in a div with class "word". A working version:

import requests
from bs4 import BeautifulSoup
headers = {
"Cookie": "ad_t_3=2; ad_t_2=1; ad_t_side_fix_2=2; BAIDU_SSP_lcr=https://www.so.com/link?m=b2XYq2CTbCkKCsUJXqlQuOqJ%2F7bmqqkFyYjj6I1OaJj3QzusWcTMg5g2uslG%2Fk2%2FqNagu%2F5%2Fytg5cqFQxqMKNskdDeE2dCrBm4ZpYogB4Jn1KFU90K4VQgEQisAS%2FQfXn%2FwpFnSeMwrE%2BXV4pl38rOEmSAYF%2Bgsj51UgsxVKbAYZ1PAq4anrhQm2h5fdahVDzWGK5RdCdJHfNH00p4uw0LIdJbzcpVP%2FobLyzmCtpYpBu9srnRZ9yh0dlFmtGMI28qYFj6uzZdDH3yiXrYdQfmXqCaJOThcPr%2Ffml2zh08Vv35sEq; SUV=2006101439533660; gidinf=x099980109ee11a705c58d40e0009c30eba77d0f8a96; IPLOC=CN; t=1640072288098; reqtype=pc; beans_new_turn=%7B%22sohu-article%22%3A11%7D"}
url = "http://news.sohu.com/a/503136672_121106991"
res = requests.get(url=url, headers=headers)
soup = BeautifulSoup(res.text, 'lxml')  # 'lxml' must be installed; 'html.parser' also works
tags = soup.find(id="mp-editor").find_all('p')  # every <p> inside the article container
with open('./content.txt', 'w', encoding='utf-8') as fp:
    for tag in tags:
        fp.write(tag.text + '\n')
# no explicit fp.close() needed: the with block closes the file automatically
That should do it.
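For anything beyond a one-off run it also helps to fail loudly instead of silently writing an empty file. A hardened sketch under the same assumptions (same URL and the same mp-editor selector; the timeout, the status check, and the None guard are my additions, and the stdlib html.parser is used so lxml isn't required):

import requests
from bs4 import BeautifulSoup

url = "http://news.sohu.com/a/503136672_121106991"
headers = {"Cookie": "..."}  # reuse the cookie string from above (elided here)

res = requests.get(url, headers=headers, timeout=10)
res.raise_for_status()                 # stop on 4xx/5xx instead of parsing an error page
res.encoding = res.apparent_encoding   # guard against mojibake if the declared charset is off

soup = BeautifulSoup(res.text, "html.parser")  # stdlib parser, no extra install
body = soup.find(id="mp-editor")
if body is None:
    raise SystemExit("container id='mp-editor' not found; the page layout may have changed")

with open("content.txt", "w", encoding="utf-8") as fp:
    for p in body.find_all("p"):
        text = p.get_text(strip=True)
        if text:                       # skip empty paragraphs
            fp.write(text + "\n")

If the cookie turns out to be unnecessary for this article page, the headers argument can simply be dropped.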