<div id="leftContent">
<div><p id="test0">news #1</p></div>
<div><p id="test1">news #2</p></div>
<div><p id="test2">news #3</p></div>
<div><p id="test3">news #4</p></div>
<div><p id="test4">news #5</p></div>
</div>
#上面是本地html代码
#需要将p标签内容修改成(以下是py列表):
['1.体育总局暂停山地越野等赛事\u3000\u3000\u3000\u3000\u3000\u3000\u3000',
'2.广州首例确诊病例郭阿婆将出院\u3000\u3000\u3000\u3000\u3000\u3000',
'3.全国高考报名1078万人\u3000\u3000\u3000\u3000\u3000\u3000\u3000\u3000',
'4.南京胖哥等10人被认定见义勇为\u3000\u3000\u3000\u3000\u3000',
'5.华为正式发布鸿蒙手机操作系统\u3000\u3000\u3000\u3000\u3000\u3000']
#这是py程序代码
# NOTE: os.path.join(base, r'C:\...') discards `base` because the second
# argument is an absolute path — open the target file directly instead.
with open(r'C:\Users\Mr.Blue\Desktop\index.html', encoding='utf-8') as html:
    soup = BeautifulSoup(html, 'html.parser')
# Tag.text is a READ-ONLY property in BeautifulSoup — assigning to it raises
# "can't set attribute".  Replace a tag's content by assigning to .string
# (works when the tag has a single text child, as in the HTML above).
for k in range(5):
    soup.find(id='test{0}'.format(k)).string = lst3[k]
# The soup is only an in-memory parse tree: to persist the change, serialize
# it with str(soup) and write that back to the .html file.
测试了好多方法,都无法实现直接更改html
感觉上面这个是最接近的,但它返回了 can't set attribute
希望大佬们能帮帮忙
你那个应该是获取解析内容,不是更改吧,你看看这个:
获取网页内容生成html,并将某些标签属性进行修改 (基于python3.6)
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import urllib.request
import os
from bs4 import BeautifulSoup
# Target site URL (the value was redacted when this snippet was posted; the
# bare `url =` was a syntax error — fill in the real site before running).
url = ''  # TODO: e.g. 'https://example.com/menu.html'
# Prefix spliced onto every relative link/src/action found in the pages.
Splicing = ''  # TODO: e.g. 'https://example.com'
def get_web(get_url):
    """Download the index page at *get_url* and save each linked sub-page.

    Collects relative hrefs from every ``<a>`` inside ``div.col_menu_con``,
    prefixes them with the module-level ``Splicing`` base, and writes each
    fetched page to ``<second path segment>.html`` in the current directory.

    :param get_url: URL of the index/menu page to scan.
    """
    page = urllib.request.urlopen(get_url)
    html = page.read().decode('utf-8')
    # `from_encoding` is ignored (with a warning) when bs4 is given an
    # already-decoded str, so it is omitted here.
    url_list = BeautifulSoup(html, 'html.parser')
    all_url = []
    for list_div in url_list.find_all('div', class_='col_menu_con'):
        for a in list_div.find_all('a', href=True):
            # Keep only non-empty, site-relative links.  startswith() is used
            # instead of the original substring test so a relative link that
            # merely *contains* "http" is not skipped by accident.
            if a.get_text(strip=True) and not a['href'].startswith(('http://', 'https://')):
                all_url.append(a['href'])
    for want_url in all_url:
        jump_url = Splicing + want_url
        # File name comes from the second path segment, e.g. '/a/b.htm' -> 'a'.
        # NOTE(review): assumes every href has at least two '/'-separated
        # segments — confirm against the real site's link structure.
        file_name = want_url.split('/')[1] + '.html'
        down_html = urllib.request.urlopen(jump_url).read()
        # `with` guarantees the handle is closed even if the write fails
        # (the original left files open on error).
        with open(file_name, 'wb') as write_html:
            write_html.write(down_html)
        print(file_name + ' ' + 'done!')
def change_web(html_file):
    """Rewrite relative resource references in *html_file* in place.

    Every relative ``a[href]``, ``link[href]``, ``script[src]``,
    ``form[action]``, ``img[src]`` and ``img[original_src]`` value is
    prefixed with the module-level ``Splicing`` base; values containing
    ``http``/``https`` are left untouched.  The file is overwritten with
    the modified markup, UTF-8 encoded.

    :param html_file: path of the HTML file to rewrite.
    """
    with open(html_file, 'r', encoding='utf-8') as content:
        html_cont = content.read()
    find_content = BeautifulSoup(html_cont, 'lxml')
    # (tag name, attribute) pairs to rewrite — replaces six copy-pasted
    # loops in the original with one data-driven pass.
    targets = [
        ('a', 'href'),
        ('link', 'href'),
        ('script', 'src'),
        ('form', 'action'),
        ('img', 'src'),
        ('img', 'original_src'),
    ]
    for tag_name, attr in targets:
        for tag in find_content.find_all(tag_name, **{attr: True}):
            # Substring test kept from the original: any value containing
            # "http" (which also covers "https") is treated as absolute.
            if 'http' in tag[attr]:
                continue
            tag[attr] = Splicing + tag[attr]
    # The soup is a parse tree; it must be serialized back to str and
    # encoded to a fixed utf-8 byte stream before being written out.
    change_content = str(find_content).encode(encoding='utf-8')
    with open(html_file, 'wb') as change_html:
        change_html.write(change_content)
    print(html_file + ' ' + 'changed!')
# Entry point: download every linked page, then rewrite the references in
# each .html file found in the current working directory.
get_web(url)
for file_name in os.listdir(os.getcwd()):
    # Direct iteration replaces the original range(len(...)) index loop
    # and the intermediate `filearray` list.
    if os.path.splitext(file_name)[1] == '.html':
        change_web(file_name)