from selenium import webdriver
import time
import copy
import json
def closeLoginDiv(driver, wait_seconds=3):
    """Close the Douyin login pop-up if the page is showing one.

    Sleeps for ``wait_seconds``, looks for the ``div`` whose id is
    ``login-pannel`` and, when it exists, clicks the ``dy-account-close``
    element through JavaScript. Does nothing when either the pop-up or its
    close button cannot be found.

    :param driver: Chrome driver object with the target page loaded
    :param wait_seconds: seconds to sleep before looking for the pop-up
    :return: None
    """
    time.sleep(wait_seconds)
    login_windows = driver.find_elements_by_xpath("//div[@id='login-pannel']")
    # No login pop-up on this page: nothing to close.
    if not login_windows:
        return
    login_window = login_windows[0]
    # NOTE(review): an XPath that starts with "//" is evaluated from the
    # document root even when called on an element, so this is a page-wide
    # lookup; switch to ".//" if the button must be inside the panel.
    close_links = login_window.find_elements_by_xpath("//div[@class='dy-account-close']")
    # The pop-up may be present without a close button; skip instead of
    # failing on an empty list.
    if not close_links:
        return
    # Click through JavaScript rather than through the element itself.
    driver.execute_script("arguments[0].click();", close_links[0])
def scrollDown(driver, begin, step, times, pause=2):
    """Scroll the browser window down in fixed steps.

    Before each scroll the function sleeps for ``pause`` seconds, then runs
    ``window.scrollTo(0, y)`` in the page, where ``y`` starts at ``begin`` and
    grows by ``step`` after every scroll.

    :param driver: browser driver object
    :param begin: vertical offset of the first scroll
    :param step: amount added to the offset after each scroll
    :param times: number of scrolls to perform
    :param pause: seconds to sleep before each scroll
    :return: None
    """
    scroll_y = begin
    for _ in range(times):
        time.sleep(pause)
        # Scrolling is done by running JavaScript in the page.
        driver.execute_script("window.scrollTo(0,{})".format(scroll_y))
        scroll_y += step
def getALLVidelInfo(driver, section_xpath="//ul[@class='ARNw21RN']"):
    """Collect the link and title of every video listed on a Douyin user page.

    The first element matching ``section_xpath`` is taken as the video
    section. For each ``li`` inside it, the ``href`` of its first ``a`` is the
    video link and the ``alt`` of its first ``img`` is the video title.

    :param driver: Chrome driver object with the user page loaded
    :param section_xpath: XPath of the element that holds the video ``li``
        items; pass a different locator when the default class-based one no
        longer matches the page
    :return: tuple ``(video_links, video_titles)`` of two equally long lists;
        both are empty when no element matches ``section_xpath``
    """
    video_links = []
    video_titles = []
    video_sections = driver.find_elements_by_xpath(section_xpath)
    # Section not found: report "no videos" instead of failing on [0].
    if not video_sections:
        return video_links, video_titles
    for video_info in video_sections[0].find_elements_by_tag_name("li"):
        anchors = video_info.find_elements_by_tag_name("a")
        images = video_info.find_elements_by_tag_name("img")
        # Skip list items that lack a link or an image so the two result
        # lists always stay aligned.
        if not anchors or not images:
            continue
        video_links.append(anchors[0].get_attribute("href"))
        video_titles.append(images[0].get_attribute("alt"))
    return video_links, video_titles
def getAllComments(drive):
    """Collect the user names and comment texts shown on a video page.

    User names are read from the ``span`` elements whose class is
    ``Nu66P_ba NCRZnxVF`` and comments from those whose class is
    ``Nu66P_ba``; in both cases the element's ``innerHTML`` is returned.

    :param drive: browser driver object with the video page loaded
    :return: tuple ``(username_list, comment_list)``; the two lists come from
        separate lookups and are not guaranteed to have the same length
    """
    # Use the driver that was passed in, not a module-level global.
    username_areas = drive.find_elements_by_xpath("//span[@class='Nu66P_ba NCRZnxVF']")
    # NOTE(review): innerHTML is returned unmodified; strip markup here if
    # the spans turn out to contain nested tags.
    username_list = [area.get_attribute('innerHTML') for area in username_areas]

    comment_areas = drive.find_elements_by_xpath("//span[@class='Nu66P_ba']")
    comment_list = [area.get_attribute('innerHTML') for area in comment_areas]
    return username_list, comment_list
main_url = "https://www.douyin.com/user/MS4wLjABAAAAZdRDXLm_f1yTuZfnvtYWUh0rWBtzAtrkTb0uDDBj5MW_BN3ZOyJp2Q7FHXKAZNH6"


def _save_comments(data, fp="./comment1.json"):
    """Overwrite ``fp`` with ``data`` serialized as JSON."""
    print("数据获取完毕,正在保存数据。。。")
    # The context manager closes the file even if serialization fails.
    with open(fp, "w", encoding="utf-8") as f:
        json.dump(data, f)
    print("数据保存完成")


# Start Chrome with the chromedriver binary in the current directory.
# NOTE(review): the positional driver path and the find_elements_by_* calls
# used in this file are the older Selenium API; confirm the pinned Selenium
# version still provides them.
driver = webdriver.Chrome("./chromedriver.exe")
try:
    # Open the user page, dismiss the login pop-up and scroll so that more
    # videos get loaded.
    driver.get(main_url)
    closeLoginDiv(driver)
    scrollDown(driver,1000,1000,10)

    all_comment_data = []
    # Number of videos contained in the last file written.
    saved_count = 0

    video_links,video_titles = getALLVidelInfo(driver)
    for i, (video_link, video_title) in enumerate(zip(video_links, video_titles)):
        # A fresh dict per video, so earlier entries are never shared or
        # overwritten.
        video_dict = {"链接": video_link, "标题": video_title, "评论": []}
        print("正在爬取第{}个视频的评论".format(i+1))

        # Open the video page, dismiss the login pop-up if it shows up again
        # and scroll to load more comments.
        driver.get(video_link)
        closeLoginDiv(driver)
        scrollDown(driver,1000,1000,10)

        username_list,comment_list = getAllComments(driver)
        # zip stops at the shorter list, so a mismatch between the number of
        # user names and comments cannot raise IndexError.
        for username, comment in zip(username_list, comment_list):
            video_dict["评论"].append({"用户": username, "评论": comment})

        all_comment_data.append(video_dict)

        # Checkpoint every third video to limit data loss on a crash.
        if i % 3 == 0:
            _save_comments(all_comment_data)
            saved_count = len(all_comment_data)

    # Videos scraped after the last checkpoint would otherwise never reach
    # the file.
    if saved_count != len(all_comment_data):
        _save_comments(all_comment_data)
finally:
    # Always shut the browser and the driver process down.
    driver.quit()
如果 class 属性值会变化,就改用其他能唯一确定元素的方式来定位,比如 id;如果没有 id,还可以根据文本、子元素、父元素等来定位,总有一种方式能唯一确定该元素。XPath 的用法可参考:
https://blog.csdn.net/weixin_45827692/article/details/121417000
那就不要用 class 值来定位。
先看看该元素有没有其他稳定的属性可用。
或者先定位它的父标签,再从父标签往下获取其中的 ul,
比如 //div/ul。