Code:
# _*_ coding=utf-8 _*_
import requests as rq
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import concurrent.futures
from multiprocessing.dummy import Pool as pool

limit = 10000  # intended lower bound on play count for filtering hot playlists

headers = {
    'Referer': 'http://music.163.com/',
    'Host': 'music.163.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
}
headers1 = {
    'Referer': 'http://music.163.com/',
    'Host': 'music.163.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
}

finallist = []
url_list = []
count = 0
LABEL = '网易云音乐-爬取歌单数据'

def getHTMLText(url, headers):  # generic helper for fetching a page
    try:
        r = rq.get(url, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "network request failed"

def get_url(cat, depth):  # collect the playlist URLs for this category into url_list
    depth = 1
    start_url = 'https://music.163.com/discover/playlist/?order=hot&cat=' + cat
    for i in range(depth):
        try:
            url = start_url + '&limit=35' + '&offset=' + str(35 * (i + 1))
            html = getHTMLText(url, headers)
            parse_main(html)
        except:
            print('failed')
            continue

def parse_main(html):  # parse one listing page
    soup = BeautifulSoup(html, 'html.parser')
    c = soup.find_all('li')
    for unit in c:
        try:
            name_url = unit.find('a', {'class': "tit f-thide s-fc0"})  # carries the playlist URL and name
            number = eval(unit.find('span', {'class': 'nb'}).text.replace('万', '0000'))  # play count, used for a first-pass filter
            list1 = [name_url['title'].replace(u'\xa0', u' '), number, name_url['href']]
            url_list.append(list1)
        except:
            continue

def parse_single(listid):  # open one playlist page; extract plays, favorites, tags, etc.
    global count
    count += 1
    singleurl = 'https://music.163.com' + listid
    singletext = getHTMLText(singleurl, headers=headers1)
    soup = BeautifulSoup(singletext, 'html.parser')
    try:
        play_count = eval(soup.find('strong', {'class': 's-fc6'}).text)
        fav = soup.find('a', {'class': 'u-btni u-btni-fav'}).i.text.strip('(').strip(')')
        if '万' in fav:
            fav = eval(fav.replace('万', '0000'))
        share = eval(soup.find('a', {'class': 'u-btni u-btni-share'}).i.text.strip('(').strip(')'))
        comment = eval(soup.find('a', {'data-res-action': 'comment'}).i.span.text)
        length = eval(soup.find('span', {'id': 'playlist-track-count'}).text)
        date = soup.find('span', {'class': 'time s-fc4'}).text[:10]
        name = soup.find('h2', {"class": 'f-ff2 f-brk'}).text.replace(u'\xa0', u' ')
        tags = soup.find_all('a', {'class': 'u-tag'})
        p = len(tags)
        tag1 = 'nan'
        tag2 = 'nan'
        tag3 = 'nan'
        if p >= 1:
            tag1 = tags[0].text.replace(u'\xa0', u' ')
        if p >= 2:
            tag2 = tags[1].text.replace(u'\xa0', u' ')
        if p == 3:
            tag3 = tags[2].text.replace(u'\xa0', u' ')
        list1 = [name, date, play_count, fav, share, comment, length, tag1, tag2, tag3]
        finallist.append(list1)
        print('parsed playlist {} successfully'.format(count))
    except:
        print('failed to parse playlist {}'.format(count))
        return

def main(type, depth=38):
    get_url(type, depth=depth)
    print("playlist URL list collected")
    print(url_list)
    a = pd.DataFrame(url_list)
    b = list(a[2])
    with concurrent.futures.ThreadPoolExecutor() as executor:  # crawl the individual playlists in multiple threads
        executor.map(parse_single, b)
    print(finallist)
    a = pd.DataFrame(finallist)
    b = pd.DataFrame(url_list)
    title_list = ['name', 'created', 'plays', 'favorites', 'shares', 'comments', 'track count', 'tag1', 'tag2', 'tag3']
    c = pd.Series(title_list)
    a.columns = c
    a.to_excel(r'C:\Users\YTRoski2\Desktop\毕业论文\{}.xlsx'.format(type))  # write the results to Excel

main(LABEL, depth=5)
# depth is the number of listing pages to crawl: each page shows about 35 playlists, and there are 38 pages in total.
Error output (the program's own prints were interleaved with the traceback on the console; they are untangled here):

playlist URL list collected
[]
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 2895, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas\_libs\index.pyx", line 70, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\index.pyx", line 101, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\hashtable_class_helper.pxi", line 1675, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\_libs\hashtable_class_helper.pxi", line 1683, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 2

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:/Users/YTRoski2/PycharmProjects/Scrapy/scrapy_first/download_playlist/music163.py", line 124, in <module>
    main(LABEL, depth=5)
  File "C:/Users/YTRoski2/PycharmProjects/Scrapy/scrapy_first/download_playlist/music163.py", line 110, in main
    b = list(a[2])
  File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py", line 2902, in __getitem__
    indexer = self.columns.get_loc(key)
  File "C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 2897, in get_loc
    raise KeyError(key) from err
KeyError: 2
This is not really a problem with the a[2] syntax: when a DataFrame is built from a list of rows, a[2] is valid and selects the column labeled 2 (a.iloc[:, 2] is the explicitly positional spelling). It fails here because url_list is empty, which the output confirms: the program prints [] right before the traceback, so pd.DataFrame(url_list) has no columns at all and the label 2 cannot be found, hence KeyError: 2. The scrape most likely comes back empty because main(LABEL, depth=5) passes the script's label string '网易云音乐-爬取歌单数据' as the cat parameter, and that is a page title, not a playlist category, so the listing page matches nothing. Two further bugs compound this: get_url immediately overwrites its depth argument with depth = 1, and offset = 35 * (i + 1) starts at the second page, skipping the first.
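
A minimal reproduction of the failure mode, independent of the scraper:

import pandas as pd

url_list = []               # what the scraper actually produced (the [] in the output above)
a = pd.DataFrame(url_list)  # an empty frame: no rows and, crucially, no columns
b = list(a[2])              # raises KeyError: 2, exactly as in the traceback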
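
A sketch of a fix along these lines, reusing the names from the code above; '华语' is only an example category (any category name shown on the discover page should work), and whether a given category returns results still depends on the site:

from urllib.parse import quote

def get_url(cat, depth):
    # honor the depth argument instead of hard-coding depth = 1,
    # and URL-encode the Chinese category name
    start_url = 'https://music.163.com/discover/playlist/?order=hot&cat=' + quote(cat)
    for i in range(depth):
        try:
            url = start_url + '&limit=35' + '&offset=' + str(35 * i)  # offset 0 is the first page
            html = getHTMLText(url, headers)
            parse_main(html)
        except Exception as e:  # name the exception instead of a bare except
            print('page {} failed: {}'.format(i, e))

def main(cat, depth=38):
    get_url(cat, depth=depth)
    if not url_list:
        print('no playlists found for category {!r}; nothing to export'.format(cat))
        return
    a = pd.DataFrame(url_list, columns=['title', 'plays', 'href'])
    ids = list(a['href'])  # named access; a.iloc[:, 2] would also work
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(parse_single, ids)
    # ... build the finallist DataFrame and export to Excel as before ...

main('华语', depth=5)  # a real playlist category, not the LABEL string

Guarding the empty case turns the cryptic KeyError into a clear message, and naming the columns up front removes the dependence on positional integer labels.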