python re提取网页内容,代码怎么写


# noinspection PyUnresolvedReferences
from bs4 import BeautifulSoup
# noinspection PyUnresolvedReferences
from lxml import etree
import requests


# URL of the Baidu Zhidao question page to scrape.
html = "https://zhidao.baidu.com/question/2207469534762529468.html"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.67'
}
# Fetch the page; a timeout stops the script from hanging forever on a dead connection.
h = requests.get(html, headers=headers, timeout=10)
h.raise_for_status()  # fail loudly on HTTP 4xx/5xx instead of printing an error page
h.encoding = 'gbk'  # page is served GBK-encoded; override requests' guess before decoding
print(h.text)

img

提取这个内容

要看你的网页的源代码的结构是什么

img

img
有用的话点一下采纳

用xpath吧,正则不容易写


# -*- coding:utf-8 -*- #
import re
from lxml import etree
import requests
# 统一请求响应函数
def unify_requests(method="GET", url="", headers=None, proxies=None, data=None, verify=False, cookies=None):
    """Issue an HTTP request and return the ``requests.Response``.

    Args:
        method: "GET" for a GET request; anything else sends a POST.
        url: target URL.
        headers: optional request headers dict.
        proxies: optional requests-style proxies dict.
        data: request payload (sent as the body even for GET, matching the
            original behavior — callers in this file pass {}).
        verify: TLS certificate verification flag. NOTE(review): the default
            ``False`` disables certificate checks — acceptable for scraping,
            dangerous for anything sensitive.
        cookies: optional cookies dict.

    Returns:
        requests.Response: the raw response (5-second timeout on both verbs).

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    # BUG FIX: the original used mutable ``{}`` defaults, which are shared
    # across calls and can leak state; normalize None to a fresh dict instead.
    headers = {} if headers is None else headers
    proxies = {} if proxies is None else proxies
    data = {} if data is None else data
    cookies = {} if cookies is None else cookies
    if method == "GET":
        return requests.get(url, headers=headers, proxies=proxies, data=data,
                            cookies=cookies, timeout=5, verify=verify)
    return requests.post(url, headers=headers, proxies=proxies, data=data,
                         verify=verify, cookies=cookies, timeout=5)

class BaiDuZhiDao:
    """Scraper for a single Baidu Zhidao question page.

    ``get_text_detail`` fetches the page (via the module-level
    ``unify_requests`` helper) and parses it into a flat result dict.
    """

    def __init__(self, proxy=None):
        """Store request headers and an optional requests-style proxies dict."""
        self.single_url = ""  # kept for interface parity; URL is passed per call
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.67'
        }
        self.proxy = proxy

    # Parse a question-page response into the result dict.
    def parms_url(self, response):
        """Extract title, author, publish time, body text and image URLs.

        Args:
            response: a requests.Response whose ``.text`` is the page HTML.

        Returns:
            dict: parsed fields, or {} if the HTML cannot be parsed at all.
        """
        try:
            response_data = etree.HTML(response.text)
        except Exception:  # unparsable / empty body — signal failure with {}
            return {}
        result_dic = {
            "grp_title": "",      # article title
            "grp_author": "",     # article author
            "grp_pubtime": "",    # article publish time
            "grp_text": "",       # article body text
            "grp_img_list": [],   # image URLs found in the article body
        }
        result_dic["grp_title"] = "".join(
            response_data.xpath('//span[@class="ask-title"]/text()'))
        # BUG FIX: the original class selector contained trailing spaces
        # ("wgt-replyer-all-uname   ...") so it could never match, and the
        # unguarded [0] raised IndexError on the resulting empty list.
        # Joining the whole node list is safe when nothing matches.
        result_dic["grp_author"] = "".join(response_data.xpath(
            '//div[@class="wgt-replyer-all"]/a/span[@class="wgt-replyer-all-uname"]/text()'
        )).replace('\n', '')
        pubtime_raw = "".join(response_data.xpath(
            '//span[@class="wgt-replyer-all-time"]/text()')).replace('\n', '')
        # Strip the "推荐于" ("recommended on") prefix when present; otherwise
        # keep the raw timestamp text (same fallback as the original except-path).
        pubtime_match = re.findall('推荐于(.*)', pubtime_raw)
        result_dic["grp_pubtime"] = pubtime_match[0] if pubtime_match else pubtime_raw
        # BUG FIX: the original used '////div[...]', which is invalid XPath and
        # makes lxml raise XPathEvalError; '//div[...]' is the intended form.
        result_dic["grp_text"] = "".join(response_data.xpath(
            '//div[@class="best-text mb-10"]//text()')).replace('\n', '')
        result_dic["grp_img_list"] = response_data.xpath(
            '//div[@class="best-text mb-10"]/p//img/@src')
        return result_dic

    # Fetch the page at ``url`` and return the parsed result dict.
    def get_text_detail(self, url=""):
        info = self.qin_quan_info(qin_quan_url=url)
        # Let requests guess the real charset from the body before decoding.
        info.encoding = info.apparent_encoding
        return self.parms_url(response=info)

    # Issue the GET request for the target page (uses instance headers/proxy).
    def qin_quan_info(self, qin_quan_url):
        return unify_requests(url=qin_quan_url, headers=self.headers, proxies=self.proxy)

# Public entry point: a bound fetch-and-parse method on a default-configured
# scraper instance. (The name looks copied from a song-downloader template.)
down_load_songs = BaiDuZhiDao(proxy={}).get_text_detail

if __name__ == '__main__':
    parsed = down_load_songs(url="https://zhidao.baidu.com/question/84758732.html")
    print(parsed)

望采纳,

楼上大哥真够辛苦了。。
content = ''.join(re.findall(
    r'<div class="line content">([\s\S]+?)</div>',
    h.text)).replace('<br>', '').replace('&nbsp;', '').replace('>', '')

      如有解决,麻烦点个采纳