最近看了理想之城连续刷,挺狗血的,准备想看品味下文字版,去网上找了一个网站,试着去抓取数据,原码如下,一切都还顺利。
import requests
from fake_useragent import UserAgent
from lxml import etree
def get_html(x):
ua = {"User-Agent":UserAgent().random}
r = requests.get(url=x,headers=ua).text
return r
def get_urls(x):
html = etree.HTML(x)
data = html.xpath('//ul/li//@href')[1:-1]
return data
def get_content(x):
html = etree.HTML(x)
data = html.xpath('//p/text()')
return data
def main():
url = 'http://m.jiewuxian.com/10/10910/'
s = url.split('/1')[0]
urls = get_urls(get_html(url))
#print(urls)
for i in urls:
sr = s+i
text = get_content(get_html(sr))
break
if __name__ == '__main__':
main()
#但是我发现get_urls与get_content函数功能相同,
#于是做了如下优化。
#但代码就运行不了,
#请各位同仁看下问题出哪里了。
#不胜感激!!
import requests
from fake_useragent import UserAgent
from lxml import etree
def get_html(x):
ua = {"User-Agent":UserAgent().random}
r = requests.get(url=x,headers=ua).text
return r
def get_urls(x,y):
html = etree.HTML(x)
data = html.xpath(y)
return data
def main():
url = 'http://m.jiewuxian.com/10/10910/'
s = url.split('/1')[0]
url_xpath = '//ul/li/@href'
text_xpath = '//p/text()'
urls = get_urls(get_html(url),url_xpath)[1:-1]
#print(urls)
for i in urls:
sr = s+i
text = get_urls(get_html(sr),text_xpath)
break
if __name__ == '__main__':
main()
报错是什么