求帮忙,因为需要爬取酷狗top500的历史数据,在网上找到了一个爬虫代码,运行时出现下列问题。
经过查询,感觉是因为某一行数据中没有“-”符号导致的错误,请教大家要怎么修改代码,万谢!
import requests
import agent # 调用agent模块(自己创建的模块)
from bs4 import BeautifulSoup # 引用BeautifulSoup库中的bs4函数
import time
import xlwt
def spider(url, data, count): # 文件爬取
try:
r = requests.get(url, headers=agent.getheaders(), timeout=10) # 发出请求,设置代理,与超时时间
soup = BeautifulSoup(r.text, "html.parser")
ranks = soup.select("span.pc_temp_num")
titles = soup.select("div.pc_temp_songlist>ul>li>a") # 注意获取到的内容是以列表的形式
times = soup.select("span.pc_temp_tips_r>span")
for rank_s, title_s, time_s in zip(ranks, titles, times):
data.append([])
count += 1
data[count].append(rank_s.get_text().strip())
data[count].append(title_s.get_text().split("-")[0])
data[count].append(title_s.get_text().split("-")[1])
data[count].append(time_s.get_text().strip())
print("排名:{}\t歌手:{}\t歌曲:{}\t时长:{}\t".format(data[count][0], data[count][1], data[count][2], data[count][3]))
#time.sleep(1) # 设置睡眠时间,限制爬取网站数据速率,休眠一秒
except():
print("爬取失败")
def save(data): # 文件存储
try:
book = xlwt.Workbook(encoding="utf-8") # 设置格式为utf-8,创建工作薄
sheet = book.add_sheet("Sheet1") # 创建工作表
titles = ['排名', '歌手', '歌曲', '时长']
for t in range(len(titles)):
sheet.write(0, t, titles[t])
i = 1
for x in data:
j = 0
for y in x:
sheet.write(i, j, y)
j += 1
i += 1
book.save("text.xls") # 保存文件
except():
print("Data Error")
def main():
urls = ["https://www.kugou.com/yy/rank/home/{}-8888-40200.html?from=rank".format(str(i)) for i in range(1, 24)] #循环创建地址
datas = []
count = -1
for url in urls:
spider(url, datas, count)
count += 22
save(datas)
if __name__ == "__main__":
main()
Traceback (most recent call last):
File "/Users/arlen/PycharmProjects/pythonProject3/main.py", line 67, in <module>
main()
File "/Users/arlen/PycharmProjects/pythonProject3/main.py", line 61, in main
spider(url, datas, count)
File "/Users/arlen/PycharmProjects/pythonProject3/main.py", line 21, in spider
data[count].append(title_s.get_text().split("-")[1])
IndexError: list index out of range
进程已结束,退出代码为 1
21、22行代码处重复了吧
打印下看title_s.get_text().split(''-")只得到数组长度为1呢
加个判断或者捕获这个异常