from lxml import etree
import requests
import re
# Download the listing page and print the text nodes that are direct children
# of each <li> inside div.TypeList.
# NOTE: resp.encoding is not set explicitly here, so resp.text is decoded with
# whatever encoding requests selected for the response.
url = "https://www.umei.cc/meinvtupian/index.htm"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
}
resp = requests.get(url=url, headers=header)
html_text = resp.text
lx = etree.HTML(html_text)
# Selects only text directly under <li>, not text nested in child elements.
li_text_xpath = '//div[@class="TypeList"]/ul/li/text()'
page = lx.xpath(li_text_xpath)
print(page)
这个页面的编码是 "utf-8"，
所以在读取 resp.text 之前需要先手动设置编码：
resp.encoding = "utf-8"
然后再获取 resp.text；否则 resp.text 可能按错误的编码解码，中文会显示成乱码。
页面上的文字位于 <a> 下面的 <span> 标签里，而不是 <li> 的直接子文本节点。
所以如果要获取页面中的这些文字，XPath 应该写成：
//div[@class="TypeList"]/ul/li/a/span/text()
from lxml import etree
import requests
import re
# Download the listing page and print three lists taken from the <a> elements
# under div.TypeList: the <span> text, the link href, and the <img> src.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
}
url = "https://www.umei.cc/meinvtupian/index.htm"
# Without a timeout requests waits indefinitely on an unresponsive server.
resp = requests.get(url, headers=header, timeout=10)
# Fail loudly on an HTTP error status instead of silently parsing an error page.
resp.raise_for_status()
# Must be assigned before resp.text is read so the body is decoded as UTF-8.
resp.encoding = "utf-8"
lx = etree.HTML(resp.text)

# Text of the <span> nested inside each entry's <a>.
page = lx.xpath('//div[@class="TypeList"]/ul/li/a/span/text()')
print(page)

# Link target of each entry (printed exactly as it appears in the markup).
href = lx.xpath('//div[@class="TypeList"]/ul/li/a/@href')
print(href)

# Source attribute of the <img> nested inside each entry's <a>.
img = lx.xpath('//div[@class="TypeList"]/ul/li/a/img/@src')
print(img)