请使用lxml库分别按照如下要求查找hello.html文件中的指定节点
lxml底层基于C语言库libxml2和libxslt实现,因此解析速度很快
1. 解析HTML字符串:使用lxml.etree.HTML进行解析
from lxml import etree
# Parse the HTML string into an element tree; `text` is expected to hold the
# HTML source and is defined outside this snippet.
htmlElement = etree.HTML(text)
# Serialize the tree to UTF-8 bytes first, then decode them to str for printing.
serialized = etree.tostring(htmlElement, encoding='utf-8')
print(serialized.decode('utf-8'))
2. 解析HTML文件:使用lxml.etree.parse进行解析
from lxml import etree
# Parse the file without passing a parser, i.e. with etree.parse's default one.
htmlElement = etree.parse("tencent.html")
# Serialize the tree to UTF-8 bytes first, then decode them to str for printing.
markup_bytes = etree.tostring(htmlElement, encoding='utf-8')
print(markup_bytes.decode('utf-8'))
parse函数默认使用XML解析器,碰到不规范的HTML代码时会解析出错;此时需要自己创建HTML解析器(etree.HTMLParser),并通过parser参数传给parse函数
from lxml import etree
# Create an HTML parser explicitly and hand it to etree.parse via `parser=`,
# so the file is not parsed with the default parser.
parser = etree.HTMLParser(encoding='utf-8')
htmlElement = etree.parse("tencent.html", parser=parser)
# Serialize the tree to UTF-8 bytes first, then decode them to str for printing.
html_bytes = etree.tostring(htmlElement, encoding='utf-8')
print(html_bytes.decode('utf-8'))
# -*- coding: utf-8 -*-
from lxml import etree
def _print_nodes(title, nodes):
    """Print a separator line, a title line, and each node serialized as markup.

    Args:
        title: Heading text printed above the nodes.
        nodes: Iterable of lxml elements, e.g. the list returned by ``xpath``.
    """
    print("##########################################")
    print(title)
    for node in nodes:
        # encoding="unicode" makes tostring return a str directly and keeps
        # non-ASCII text readable; the default ASCII serialization would emit
        # such characters as numeric character references instead.
        print(etree.tostring(node, encoding="unicode"))


# Read the HTML file; the with-block closes the handle once the text is read.
with open("hello.html", encoding="utf-8") as file:
    content = file.read()

# Build an element tree from the HTML text.
html = etree.HTML(content)

# Every <li> element anywhere in the document.
li_list = html.xpath("//li")
_print_nodes("所有li元素:", li_list)

# <li> elements whose class attribute is exactly "item-0".
item0_li_list = html.xpath("//li[@class='item-0']")
_print_nodes("所有class等于item-0的li元素:", item0_li_list)

# <a> elements that are direct children of an <li> and whose href attribute
# is exactly "link1.html".
link1_list = html.xpath("//li/a[@href='link1.html']")
_print_nodes("所有li元素下名称为A且href属性等于link1.html的节点:", link1_list)