我在用Python的pyquery爬取QQ音乐的音乐MV。这个网页的网址就是url,用了find方法找这个视频的地址,先找video节点,然后获取它的src属性,就是它的地址,但是视频地址打印出来是None。
我的部分代码:
from pyquery import PyQuery as pq
# Fetch the MV page over plain HTTP and parse the HTML that the server returned.
url_mv = pq(url='https://y.qq.com/n/ryqq/mv/d0023bpqirq')
# Look up the element with id "video_player__source" and read its "src" attribute.
# NOTE(review): the surrounding text reports that this element is absent from the
# fetched HTML, which is why the value printed below is None.
url_mp4 = url_mv.find('#video_player__source').attr('src')
print(url_mp4)
我试过打印出这个网页的HTML内容,但是在其中找不到这个节点,难道说我获取到的网页内容并不完整?我想知道这是为什么,怎么解决。
这个节点是由JavaScript动态创建的,所以直接请求得到的HTML里没有它。可以配合selenium.webdriver的无头浏览器来抓。
输入mv ID 获取播放地址
# -*- coding:utf-8 -*-
import json
import random
import re
import requests
class QQYingYue():
    """Look up the playable video URL of a QQ Music MV from its id.

    The lookup sends one GET request to ``https://u.y.qq.com/cgi-bin/musicu.fcg``
    and extracts the ``freeflow_url`` lists from the response text.
    """

    def __init__(self, proxy):
        """Remember the proxy mapping that is handed to ``requests`` on each call."""
        self.proxy = proxy

    # Helper that performs the HTTP request and returns the response.
    def unify_requests(self, method="GET", url="", headers=None, proxies=None, data=None, params=()):
        """Send an HTTP request and return the ``requests`` response object.

        Args:
            method: HTTP verb; defaults to ``"GET"``.
            url: Target URL.
            headers: Optional request headers.
            proxies: Optional proxy mapping.
            data: Optional request body; omitted from the request when empty.
            params: Query-string parameters.

        Returns:
            The response returned by ``requests`` (5 second timeout).
        """
        # None defaults replace the shared mutable ``{}`` defaults, and
        # ``requests.request`` lets non-GET verbs produce a response too.
        return requests.request(
            method,
            url,
            headers=headers or {},
            proxies=proxies or {},
            params=params,
            data=data or None,
            timeout=5,
        )

    @staticmethod
    def _build_mv_payload(song_id):
        """Return the compact JSON string sent as the ``data`` query parameter."""
        payload = {
            "getMVInfo": {
                "module": "video.VideoDataServer",
                "method": "get_video_info_batch",
                "param": {
                    "vidlist": [song_id],
                    "required": ["vid", "sid", "gmid", "type", "name", "cover_pic", "video_switch", "msg"],
                    "from": "h5.mvplay",
                },
            },
            "getMVUrl": {
                "module": "gosrf.Stream.MvUrlProxy",
                "method": "GetMvUrls",
                "param": {"vids": [song_id], "from": "h5.mvplay"},
                "request_typet": 10001,
            },
        }
        # Compact separators keep the text identical to the previous hand-written
        # template, while json.dumps escapes ids that contain quotes or backslashes.
        return json.dumps(payload, separators=(",", ":"))

    def down_load_single_song(self, song_id, song_uid=''):
        """Request the MV info for ``song_id`` and return the extracted URL.

        Args:
            song_id: MV id placed in both the ``vidlist`` and ``vids`` fields.
            song_uid: Unused; kept so existing callers keep working.

        Returns:
            Whatever ``get_url_re`` returns for the response text: a URL string,
            ``-1`` or ``-2``.
        """
        headers = {
            "Proxy-Tunnel": str(random.randint(1, 10000)),
            'authority': 'u.y.qq.com',
            'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Mobile Safari/537.36',
            'accept': '*/*',
            'sec-fetch-site': 'same-site',
            'sec-fetch-mode': 'no-cors',
            'sec-fetch-dest': 'script',
            'referer': 'https://i.y.qq.com/n2/m/share/details/mv.html?ADTAG=newyqq.mv&vid=r0033wmeu4q',
            'accept-language': 'zh-CN,zh;q=0.9',
        }
        params = (
            ('ct', '23'),
            ('cv', '0'),
            ('format', 'json'),
            ('callback', 'qmv_jsonp_2'),
            ('data', self._build_mv_payload(song_id)),
            ('platform', 'h5'),
        )
        response = self.unify_requests(
            url='https://u.y.qq.com/cgi-bin/musicu.fcg',
            headers=headers,
            params=params,
            proxies=self.proxy,
        )
        return self.get_url_re(response.text)

    def get_url_re(self, text):
        """Extract a video URL from the raw response text.

        Every ``"freeflow_url":[...]`` list in ``text`` is parsed as JSON; the
        second entry of each list that has more than one entry is collected.

        Returns:
            The last collected URL, ``-1`` when ``freeflow_url`` lists exist but
            none had more than one entry, or ``-2`` when no ``freeflow_url`` list
            is present at all.
        """
        # Raw string: "\[" is an invalid escape sequence in a normal literal.
        info = re.findall(r'"freeflow_url":(\[.*?\])', text)
        if not info:
            return -2
        # NOTE: this may be wrong -- there are many URLs, possibly split into
        # high-definition and standard-definition variants.
        url_list = []
        for raw_list in info:
            candidates = json.loads(raw_list)
            if isinstance(candidates, list) and len(candidates) > 1:
                url_list.append(candidates[1])
        if url_list:
            return url_list[-1]
        print("请注意 url_list为空但是找到了 freeflow_url,请检查freeflow_url是否都是空 会员???")
        return -1

    # Batch download of songs.
    def down_load_songs(self):
        """Placeholder for batch downloading; not implemented."""
        pass
if __name__ == '__main__':
    # Demo run: resolve one MV id without a proxy and print the result
    # (a URL string, or the -1 / -2 codes returned by get_url_re).
    qqyy = QQYingYue(proxy={})
    # Original note on this id: "takedown rule 2" (meaning not evident from this file).
    link = qqyy.down_load_single_song(song_id="d0023bpqirq")
    print(link)
https://www.jianshu.com/p/1b63c5f3c98e
https://blog.csdn.net/bizcatt/article/details/88693982
使用selenium获取源码,然后再用pyquery进行解析
# coding=utf-8
from pyquery import PyQuery as pq
from selenium import webdriver
from pyquery import PyQuery as pq
# Drive a real Chrome instance and read the page source from the browser
# instead of parsing the raw HTTP response.
driver = webdriver.Chrome()
try:
    url_mv = 'https://y.qq.com/n/ryqq/mv/d0023bpqirq'
    driver.get(url_mv)
    source = driver.page_source
    # Parse the browser's page source with pyquery and read the video's src.
    url_mp4 = pq(source).find('#video_player__source').attr('src')
    print(url_mp4)
finally:
    # Always close the browser, even when loading or parsing raises,
    # so no Chrome/driver process is left running.
    driver.quit()
至于selenium怎么用,可以看我之前的一些文章,或者上网查询;以上代码已经亲测可用。
已解决:https://blog.csdn.net/weixin_52132159/article/details/119055557