import os

import requests
class KSSpider(object):
    """Scraper that downloads trending Kuaishou videos via the site's GraphQL endpoint.

    Pipeline: prepare request data (``__init__``) -> send requests
    (``parse_start_url``) -> parse/extract (``parse_response_data``) ->
    save to disk (``parse_save_data``).
    """

    # Download directory, created once at class-definition time.
    os_path = os.getcwd() + '/快手视频/'
    if not os.path.exists(os_path):
        os.mkdir(os_path)

    def __init__(self):
        """Crawler step 1: prepare the URL, headers and GraphQL payload."""
        self.start_url = 'https://www.kuaishou.com/graphql'
        self.header = {
            'content-type': 'application/json',
            'Cookie': 'clientid=3; did=web_d1293a3fc875d20683a3a838188cba87; client_key=65890b29; kpf=PC_WEB; kpn=KUAISHOU_VISION',
            'Host': 'www.kuaishou.com',
            'Origin': 'https://www.kuaishou.com',
            'Referer': 'https://www.kuaishou.com/brilliant',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36 Edg/98.0.1108.50'
        }
        # Pre-serialized GraphQL request body (already a JSON *string*).
        self.data = '{"operationName":"brilliantTypeDataQuery","variables":{"hotChannelId":"00","page":"brilliant"},"query":"fragment photoContent on PhotoEntity {\n id\n duration\n caption\n likeCount\n viewCount\n realLikeCount\n coverUrl\n photoUrl\n photoH265Url\n manifest\n manifestH265\n videoResource\n coverUrls {\n url\n __typename\n }\n timestamp\n expTag\n animatedCoverUrl\n distance\n videoRatio\n liked\n stereoType\n profileUserTopPhoto\n __typename\n}\n\nfragment feedContent on Feed {\n type\n author {\n id\n name\n headerUrl\n following\n headerUrls {\n url\n __typename\n }\n __typename\n }\n photo {\n ...photoContent\n __typename\n }\n canAddComment\n llsid\n status\n currentPcursor\n __typename\n}\n\nfragment photoResult on PhotoResult {\n result\n llsid\n expTag\n serverExpTag\n pcursor\n feeds {\n ...feedContent\n __typename\n }\n webPageArea\n __typename\n}\n\nquery brilliantTypeDataQuery($pcursor: String, $hotChannelId: String, $page: String, $webPageArea: String) {\n brilliantTypeData(pcursor: $pcursor, hotChannelId: $hotChannelId, page: $page, webPageArea: $webPageArea) {\n ...photoResult\n __typename\n }\n}\n"}'

    def parse_start_url(self):
        """Crawler step 2: send the POST request and forward the response.

        NOTE(review): the loop repeats the identical request three times — no
        pcursor/page token is advanced between iterations, so every request
        fetches the same page. Confirm the API's paging contract before
        relying on this for more data.
        """
        for _page in range(3):
            # BUGFIX: self.data is already a JSON string. Passing it via
            # ``json=`` serializes it a *second* time (the server receives a
            # quoted string, rejects it, and returns a body with no 'data'
            # key — the reported KeyError). Send the raw body with ``data=``.
            response = requests.post(url=self.start_url, headers=self.header, data=self.data)
            self.parse_response_data(response)

    def parse_response_data(self, response):
        """Crawler step 3: parse the JSON response and extract video entries.

        Downloads every video with more than 50 000 likes.
        """
        json_data = response.json()
        # Guard against error responses: a rejected request carries no 'data'
        # key (previously this raised ``KeyError: 'data'``).
        brilliant = (json_data.get('data') or {}).get('brilliantTypeData') or {}
        data_list = brilliant.get('feeds') or []
        for data_dict in data_list:
            like_count = data_dict['photo']['likeCount']
            # NOTE(review): assumes likeCount is numeric — confirm the API
            # does not return formatted strings here.
            if like_count > 50000:
                # Video caption, used as the file name.
                data_name = data_dict['photo']['caption']
                # Direct URL of the video stream.
                data_url = data_dict['photo']['photoUrl']
                cm4_data = requests.get(url=data_url).content
                self.parse_save_data(data_name, cm4_data)

    def parse_save_data(self, data_name, cme_data):
        """Crawler step 4: write the video bytes to ``<os_path>/<title>.mp4``."""
        # Strip characters that break file paths. '/' added to the original
        # set — a caption containing a path separator would make open() fail.
        title = data_name.replace('\\', '').replace('\n', '').replace('@', '').replace('/', '')
        with open(self.os_path + title + '.mp4', 'wb') as f:
            f.write(cme_data)
        print(f'{title}--------保存完成!!!')
# Script entry point.
# BUGFIX: the original read ``if name == 'main':`` which raises NameError at
# module level — the guard must compare the dunder ``__name__`` against the
# literal ``'__main__'``.
if __name__ == '__main__':
    ks = KSSpider()
    ks.parse_start_url()
# --- Pasted debugging notes (not code); kept as comments so the file parses ---
# Traceback (most recent call last):
#   File "D:/桌面/python/爬虫/快手爬虫.py", line 68, in <module>
#     ks.parse_start_url()
#   File "D:/桌面/python/爬虫/快手爬虫.py", line 33, in parse_start_url
#     self.parse_response_data(response)
#   File "D:/桌面/python/爬虫/快手爬虫.py", line 42, in parse_response_data
#     data_list=json_data['data']['brilliantTypeData']['feeds']
# KeyError: 'data'
#
# Reviewer comment: Are you sure this URL actually returns the data you expect?
# Inspect the response in Postman or a browser first to see the real payload
# format, then write the parsing code against that.