python爬虫中etree问题

问题遇到的现象和发生背景 python爬虫
问题相关代码,请勿粘贴截图
from lxml import etree

# Sample HTML fragment for the XPath query. The closing tags are written out
# in full (</li>, </ul>, </div>, </body>); the last <li> is left unclosed, as
# in the other copies of this snippet, so the parser has to repair it.
text = '''
    <body>
        <div>
            <ul>
                <li>01</li>
                <li>02</li>
                <li>03</li>
                <li>04</li>
                <li>05</li>
                <li>06</li>
                <li>07
            </ul>
        </div>
    </body>
    '''

```html
    <body>
        <div>
            <ul>
                <li>01</li>
                <li>02</li>
                <li>03</li>
                <li>04</li>
                <li>05</li>
                <li>06</li>
                <li>07
            </ul>
        </div>
    </body>

htmla = etree.HTML(text) # Parse the HTML string into an element tree
# NOTE(review): this is an absolute path starting at the document root. lxml's
# HTML parser presumably wraps the fragment in an <html> root element, so
# "/body/..." would match nothing and bc would be an empty list -- confirm
# against the lxml documentation.
bc=htmla.xpath("/body/div/ul/li[1]/text()")

# Show whatever the query returned.
print(bc)

```

运行结果及报错内容

img


from lxml import etree
 
# Sample HTML fragment; the last <li> is deliberately left unclosed.
text = '''
    <body>
        <div id="content">
            <ul>
                <li>01</li>
                <li>02</li>
                <li>03</li>
                <li>04</li>
                <li>05</li>
                <li>06</li>
                <li>07
            </ul>
        </div>
    </body>
    '''

# Parse the HTML string into an element tree.
htmla = etree.HTML(text)

# "//li" selects every <li> element anywhere in the document.
bc = htmla.xpath('//li')
for c in bc:
    # .text is None for an empty element, and the unclosed last <li> keeps the
    # newline and indentation that follow "07", so normalise before printing.
    print((c.text or '').strip())

将 /body/div/ul/li[1]/text() 中 body 前的斜杠(/)去掉,即写成 body/div/ul/li[1]/text()。原因是 etree.HTML() 返回的根节点是 <html>,以 /body 开头的绝对路径匹配不到任何节点。

img

#!/usr/bin/env python
# -*- coding:utf-8 -*-

from lxml import etree

# Sample HTML fragment; the last <li> is deliberately left unclosed.
text = '''
    <body>
        <div id="content">
            <ul>
                <li>01</li>
                <li>02</li>
                <li>03</li>
                <li>04</li>
                <li>05</li>
                <li>06</li>
                <li>07
            </ul>
        </div>
    </body>
    '''

# Parse the HTML string into an element tree.
htmla = etree.HTML(text)

# When cleaning scraped data, anchor the query on an element with a
# distinctive attribute (such as id or class) and walk down from there.
bc = htmla.xpath('//div[@id="content"]/ul/li')
for c in bc:
    # text() yields an empty list for an <li> without text, so indexing [0]
    # unguarded would raise IndexError. The unclosed last <li> also carries
    # trailing whitespace, hence the strip().
    texts = c.xpath('./text()')
    print(texts[0].strip() if texts else '')

代码如下,望采纳:

from lxml import etree

# Sample HTML fragment that the query below runs against.
text = '''
    <body>
        <div>
            <ul>
                <li>01</li>
                <li>02</li>
                <li>03</li>
                <li>04</li>
                <li>05</li>
                <li>06</li>
                <li>07
            </ul>
        </div>
    </body>
    '''

# Build the element tree from the markup string.
htmla = etree.HTML(text)

# Text node(s) of the first <li> under body/div/ul; the leading "//" lets
# <body> be found at any depth instead of only at the document root.
first_li_text_path = "//body/div/ul/li[1]/text()"
bc = htmla.xpath(first_li_text_path)

print(bc)