# The listing page keeps its video entries in two kinds of containers,
# div.card and div.content, and each needs its own XPath.
# Fixed: the scheme separator was "https:/" (single slash), which produced
# malformed detail-page URLs.
page_url = 'https://haokan.baidu.com'
video_lists = []

# Parse the entries under the div whose class is "card".
card_lists = dom.xpath('//div[@class="card"]/ul/li')
for card_li in card_lists:
    titles = card_li.xpath('./@title')
    hrefs = card_li.xpath('./a[@class="card-item-link"]/@href')
    # Skip items lacking a title or a link instead of raising IndexError.
    if not titles or not hrefs:
        continue
    video_dict = {
        'video_name': titles[0],
        'video_page': page_url + hrefs[0],
    }
    video_lists.append(video_dict)

# Parse the entries under the div whose class is "content".
content_lists = dom.xpath('//div[@class="content"]/div[@class="cardinner card"]')
for content_div in content_lists:
    content_lis = content_div.xpath('./ul[@class="card-list clear"]/li[@class="card-item"]')
    for content_li in content_lis:
        titles = content_li.xpath('./@title')
        hrefs = content_li.xpath('./a[@class="card-item-link"]/@href')
        if not titles or not hrefs:
            continue
        video_dict = {
            'video_name': titles[0],
            'video_page': page_url + hrefs[0],
        }
        video_lists.append(video_dict)

# First-level (listing) page is fully parsed; now visit every detail page.
for video in video_lists:
    # Fixed: use the loop variable `video`. The original read
    # `video_dict`, which still pointed at the last entry built above, so
    # every iteration opened the same page and only one link was fetched.
    broswer.get(video['video_page'])
    sleep(2)  # give the page time to render before reading its source
    html_detall = broswer.page_source
    dom_detail = etree.HTML(html_detall)
    sources = dom_detail.xpath('//div[@class="videos"]/div/video/@src')
    # A detail page without a <video src> should not abort the whole run.
    if not sources:
        continue
    download_url = sources[0]
    response = requests.get(url=download_url).content
    with open('{}.mp4'.format(video['video_name']), 'wb') as f:
        f.write(response)
为什么这个循环不起作用?只抓到了一个链接。
输出如下:
{'video_name': '成人礼:在少男少女达到成人年龄时举行的象征迈向成人阶段的仪式', 'video_page': 'https:/haokan.baidu.com/v?vid=14031486828386410064&tab=recommend'}
有没有大佬帮忙看看谢谢谢谢
你可以查看网页源文件,它的所有内容都放在了 JS 里。
你可以在控制台输入 window.__PRELOADED_STATE__,查看它的内容。
所以,采集之前也要先做页面分析。