# -*- coding: utf-8 -*-
import re
import requests
from bs4 import BeautifulSoup
import lxml
# Fetch one announcement page and dump the raw HTML of every <span> element.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    ),
}
url = "http://www.crpsz.com/zbxx/006001/006001001/20230725/a025dabc-b9ce-40e5-8d14-34fcb0cf1bc3.html"

# Redirects are not followed, so `page` is whatever the first response was.
page = requests.get(
    url,
    headers=header,
    timeout=5,
    allow_redirects=False,
)
# Force UTF-8 decoding of the body before reading `page.text`.
page.encoding = "utf-8"

soup = BeautifulSoup(page.text, 'html.parser')
contents = soup.find_all('span')

# Print each span serialized back to markup (tag, attributes and children).
for content in map(str, contents):
    print(content)
import requests
from bs4 import BeautifulSoup
# Fetch the announcement page, print the text node that follows each <span>,
# then print the text fragments contained in every <span class="count">.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
url = "http://www.crpsz.com/zbxx/006001/006001001/20230725/a025dabc-b9ce-40e5-8d14-34fcb0cf1bc3.html"
page = requests.get(url, headers=header, timeout=5, allow_redirects=False)
# Force UTF-8 decoding of the body before reading `page.text`.
page.encoding = "utf-8"
soup = BeautifulSoup(page.text, 'html.parser')

# Find every <span> tag.
contents = soup.find_all('span')
for content in contents:
    # First text-node sibling that comes after this span. `string=True` is the
    # current spelling of the legacy `text=True` keyword.
    full_text = content.find_next_sibling(string=True)
    # No text sibling follows this span: skip it instead of printing "None".
    if full_text is None:
        continue
    print(full_text)

contents = soup.find_all('span', class_='count')
for content in contents:
    # Join the span's text fragments with the literal marker '<br>' and split
    # on that same marker, yielding the fragments as a list.
    content = content.get_text(separator='<br>').split('<br>')
    print(content)
不知道你这个问题是否已经解决,如果还没有解决的话:问题的关键是如何提取被 `<br>` 标签分割开的信息。目前的代码使用了 BeautifulSoup 库的 `find_all` 方法来寻找所有的 `<span>` 标签,但无法提取到被 `<br>` 分割开的内容。要解决这个问题,可以使用 BeautifulSoup 库提供的 `next_siblings` 属性来获取 `<br>` 之后的标签,并根据需要进行处理。
修改后的代码如下:
# -*- coding: utf-8 -*-
import re
import requests
from bs4 import BeautifulSoup
import lxml
# Fetch the announcement page and, for every <span>, print the span's markup
# merged with the sibling tags that follow it, stopping at the first <br>.

# Only `BeautifulSoup` is imported from bs4 by this script, so the bare name
# `bs4` is undefined; import the Tag class directly for the isinstance check.
from bs4.element import Tag

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
url = "http://www.crpsz.com/zbxx/006001/006001001/20230725/a025dabc-b9ce-40e5-8d14-34fcb0cf1bc3.html"
page = requests.get(url, headers=header, timeout=5, allow_redirects=False)
# Force UTF-8 decoding of the body before reading `page.text`.
page.encoding = "utf-8"
soup = BeautifulSoup(page.text, 'html.parser')
contents = soup.find_all('span')
for span in contents:
    # Keep the Tag object for navigation: a plain str has no `.next_siblings`,
    # so the span must not be converted to text before walking its siblings.
    pieces = [str(span)]
    # Walk every sibling node that comes after the current span.
    for sibling in span.next_siblings:
        # Only element nodes are considered.
        # NOTE(review): bare text nodes between tags are skipped here, which
        # matches the nesting the original comments suggest — confirm that
        # dropping them is intended.
        if not isinstance(sibling, Tag):
            continue
        # A <br> is the separator: stop merging at the first one.
        if sibling.name == 'br':
            break
        # Any other tag belongs to the current entry; keep its markup.
        pieces.append(str(sibling))
    content = "".join(pieces)
    # Further post-processing (stripping HTML tags, handling special
    # characters, ...) can be applied to `content` here.
    # ...
    print(content)
这样修改后的代码会在提取 `<span>` 标签的同时,根据分隔符 `<br>` 将内容进行合并。可以根据需求对合并后的内容进行进一步的处理,如去除 HTML 标签、处理特殊字符等。