跟着视频敲的,我的就是运行不了,大家帮忙看看是哪里的问题啊
汽车之家二级页面数据抓取案例
思路:
1.一级页面提取数据:汽车详情页连接
2.二级页面提取数据:具体汽车的数据
import requests
import re
import time
import random
class CarSpider:
def __init__(self):
self.url = 'https://www.che168.com/beijing/a0_0msdgscncgpi1lto1csp{}exx0/'
self.headers = {'User-Agent': 'User-Agent:Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;'}
def get_html(self, url):
html = requests.get(url=url, headers=self.headers).content.decode('gb2312', 'ignore')
return html
def re_func(self, regex, html):
pattern = re.compile(regex, re.S)
r_list = pattern.findall(html)
return r_list
def parse_html(self, one_url):
"""程序逻辑函数"""
one_html = self.get_html(url=one_url)
one_regex = '<li class="cards-li list-photo-li ".*?<a href="(.*?)"'
# href_list: ['/declear/xxx','','',...]
href_list = self.re_func(regex=one_regex, html=one_html)
for href in href_list:
two_url = 'https://www.che168.com' + href
# 提取一辆汽车的具体信息
self.get_car_info(two_url)
# 控制抓取频率,每抓取一辆汽车随机休眠0-1秒钟
time.sleep(random.uniform(0, 1))
def get_car_info(self, two_url):
"""提取一辆汽车的具体信息"""
two_html = self.get_html(url=two_url)
two_regex = '<div class="car-box">.*?<h3 class="car-brand-name">(.*?)</h3>.*?<h4>(.*?)</h4>.*?<h4>(.*?)</h4>.*?<h4>自动 / (.*?)</h4>.*?<h4>(.*?)</h4>.*?<span class="price" id="overlayPrice">¥(.*?)<b>'
# r_list: [('比亚迪汉','2万公里','2000年','自动/1.6L','廊坊','26万')]
r_list = self.re_func(regex=two_regex, html=two_html)
item = {}
item['name'] = r_list[0][0].strip()
item['km'] = r_list[0][1].strip()
item['time'] = r_list[0][2].strip()
item['type'] = r_list[0][3].split('/')[0].strip()
item['displacement'] = r_list[0][3].split('/')[1].strip()
item['address'] = r_list[0][4].strip()
item['price'] = r_list[0][5].strip()
print(item)
def run(self):
for o in range(1, 3):
page_url = self.url.format(o)
self.parse_html(one_url=page_url)
if __name__ == '__main__':
spider = CarSpider()
spider.run()
Traceback (most recent call last):
File "F:\pythonProject2\day2\demo08.py", line 69, in <module>
spider.run()
File "F:\pythonProject2\day2\demo08.py", line 64, in run
self.parse_html(one_url=page_url)
File "F:\pythonProject2\day2\demo08.py", line 40, in parse_html
self.get_car_info(two_url)
File "F:\pythonProject2\day2\demo08.py", line 52, in get_car_info
item['name'] = r_list[0][0].strip()
IndexError: list index out of range
看下r_list是什么