# noinspection PyUnresolvedReferences
from bs4 import BeautifulSoup
# noinspection PyUnresolvedReferences
from lxml import etree
import requests
# Fetch one Baidu Zhidao question page and dump its raw HTML to stdout.
url = "https://zhidao.baidu.com/question/2207469534762529468.html"
request_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.67'
}
response = requests.get(url, headers=request_headers)
response.encoding = 'gbk'  # assumes the page is GBK-encoded — TODO confirm against the page's charset
print(response.text)
提取这个内容
要看你的网页的源代码的结构是什么
有用的话点一下采纳
用xpath吧,正则不容易写
# -*- coding:utf-8 -*- #
import re
from lxml import etree
import requests
# Unified request/response helper.
def unify_requests(method="GET", url="", headers=None, proxies=None, data=None, verify=False, cookies=None):
    """Send an HTTP request and return the ``requests`` Response.

    Args:
        method: "GET" issues a GET; any other value issues a POST.
        url: target URL.
        headers, proxies, data, cookies: passed straight through to
            ``requests``. Defaults are ``None`` instead of ``{}`` — the
            original mutable-dict defaults are shared across every call
            and can silently accumulate state.
        verify: TLS certificate verification (off by default, matching
            the original behaviour — NOTE(review): consider enabling).

    Returns:
        requests.Response: the raw response (5-second timeout).
    """
    # Materialize fresh dicts per call to keep the original call semantics.
    headers = {} if headers is None else headers
    proxies = {} if proxies is None else proxies
    data = {} if data is None else data
    cookies = {} if cookies is None else cookies
    if method == "GET":
        return requests.get(url, headers=headers, proxies=proxies, data=data,
                            cookies=cookies, timeout=5, verify=verify)
    return requests.post(url, headers=headers, proxies=proxies, data=data,
                         verify=verify, cookies=cookies, timeout=5)
class BaiDuZhiDao():
    """Scrape a Baidu Zhidao (百度知道) question page for its best answer.

    Extracts the question title, the answering user, the publish time,
    the answer body text and any embedded image URLs.
    """

    def __init__(self, proxy=None):
        """Store the browser-like headers and an optional proxies mapping."""
        self.single_url = ""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.67'
        }
        self.proxy = proxy

    def parms_url(self, response):
        """Parse a fetched question page into a result dict.

        Returns {} when the HTML cannot be parsed at all. Fields missing
        from the page are left at their empty defaults instead of raising:
        the original indexed xpath results with [0], which crashed with an
        uncaught IndexError on pages lacking the element.
        """
        try:
            response_data = etree.HTML(response.text)
        except Exception:  # narrowed from a bare except; parsing is best-effort
            return {}
        if response_data is None:  # etree.HTML returns None for empty input
            return {}
        result_dic = {
            "grp_title": "",      # article title
            "grp_author": "",     # answer author
            "grp_pubtime": "",    # publish time
            "grp_text": "",       # answer body text
            "grp_img_list": [],   # embedded image URLs
        }
        result_dic["grp_title"] = "".join(response_data.xpath('//span[@class="ask-title"]/text()'))
        # Guard the [0] index: the author node may be absent on some pages.
        author_nodes = response_data.xpath('//div[@class="wgt-replyer-all"]/a/span[@class="wgt-replyer-all-uname "]/text()')
        if author_nodes:
            result_dic["grp_author"] = author_nodes[0].replace('\n', '')
        time_nodes = response_data.xpath('//span[@class="wgt-replyer-all-time"]/text()')
        if time_nodes:
            # Edited answers show a "推荐于<date>" prefix; strip it when present.
            stamped = re.findall('推荐于(.*)', time_nodes[0].replace('\n', ''))
            if stamped:
                result_dic["grp_pubtime"] = stamped[0]
            else:
                result_dic["grp_pubtime"] = ''.join(time_nodes).replace('\n', '')
        # Fixed the original '////div[...]' (malformed XPath) to '//div[...]'.
        result_dic["grp_text"] = "".join(response_data.xpath('//div[@class="best-text mb-10"]//text()')).replace('\n', '')
        result_dic["grp_img_list"] = response_data.xpath('//div[@class="best-text mb-10"]/p//img/@src')
        return result_dic

    def get_text_detail(self, url=""):
        """Fetch *url* and return the parsed result dict."""
        info = self.qin_quan_info(qin_quan_url=url)
        # Let requests sniff the real charset rather than assuming one.
        info.encoding = info.apparent_encoding
        return self.parms_url(response=info)

    # fetch the page details
    def qin_quan_info(self, qin_quan_url):
        """Issue the GET request through the module's shared helper."""
        return unify_requests(url=qin_quan_url, headers=self.headers, proxies=self.proxy)
# Module-level convenience handle: a bound method that fetches and parses one page.
_scraper = BaiDuZhiDao(proxy={})
down_load_songs = _scraper.get_text_detail

if __name__ == '__main__':
    result = down_load_songs(url="https://zhidao.baidu.com/question/84758732.html")
    print(result)
望采纳,
楼上大哥真够辛苦了。。
# NOTE(review): this fragment is garbled/truncated (unterminated string
# literals and unbalanced parentheses — likely a broken copy-paste from the
# forum) and will NOT run as-is. Preserved verbatim below for reference.
content = ''.join(re.findall(
r'<div class="line content"(.|\n)?(
(.|\n)+
)(.|\n)?', '').replace('
如有解决,麻烦点个采纳