import csv
import bs4,threading
import selenium,selenium.webdriver
import time
import re
import requests
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
BASE_URL="https://movie.douban.com/"
CHR_URL=BASE_URL+"/chart"
HEADES={"User-Agent":"Mozilla/5.0(Windows NT 10.0;WOW64) AppleeWebKit/537.36(KHTML,like Gecko chrome 157.0.2987.133 Safari 1537.36"}#头文件
driver = webdriver.Chrome()#浏览器驱动
driver.get(url=CHR_URL)#发送访问请求
SAVE_DIR="d:"+os.sep+"douban"#保存目录
IMAGE_PATH=SAVE_DIR+"images"#图片保存路径
CSV_heards=["type","rank","name","rating","comment","img","crew","url"]
if not os.path.exists(IMAGE_PATH):
os.makedirs(IMAGE_PATH)
class Movie:
def __init__(self,type):
self.img=None
self.name=None
self.type=type
self.rank=None
self.crew=None
self.rating=None
self.comment=None
self.url=None #影片路径
def __repr__(self):
return "[电影]分类:%s、名称:%s、名次:%d、评分:%f、评论人数:%d、演员列表:%s、图片:%s、影片路径:%s"% \
(self.type,self.name,self.rank,self.rating,self.comment,self.crew,self.img,self.url)
def get(self):
return [self.type,self.name,self.rank,self.rating,self.comment,self.crew,self.img,self.url]#返回数据到csv文件里
def main():
request=requests.get(url=CHR_URL,headers=HEADES)#发送GET请求
request.encoding="UTF-8"#设置获取页面编码
soup = bs4.BeautifulSoup(markup=request.text, features="lxml") # 使用lxml库解析
typerank_list = soup.find_all("a", href=re.compile("^/typerank")) # 获得所有分类信息
for type in typerank_list: # 进行数据迭代
type_title = type.contents[0] # 获取类型标题
download_type(type_title, BASE_URL+type["href"])#进行数据爬取
def downloda_movie_image(url,image_name):#url下载路径,imge_name保
image_path=IMAGE_PATH+os.sep+image_name
response=requests.get(url)
with open(file=image_path,mode="bw") as filel:#二进制信息写入
filel.write(response.content)
def download_type(type,url):#定义图片下载函数
driver.get(url=url)
for item in range(5):
target=driver.find_element(By.ID,'footer')#滑轮指定元素footer
driver.execute_script("arguments[0].scrollIntoView();",target)
time.sleep(1)
time.sleep(1)
count = 0
save_path = SAVE_DIR + os.sep + type + ".csv"
with open(file=save_path,mode="w",newline="",encoding="UTF-8") as file:
csv_file=csv.writer(file)
csv_file.writerow(CSV_heards)
try:
for content in driver.find_elements(By.XPATH,"//div[@class='movie-content']"):
time.sleep(0.5)
movie = Movie(type)[]()
movie.url = content.find_element(By.TAG_NAME,"a").get_property("href")#获取影片路径
image_url = content.find_element(By.CLASS_NAME,"movie-img").get_property("src")#获取图片路径
print(image_url)
if image_url:
movie.img = image_url[image_url.rfind("/") + 1:]#获取图片名称
movie.name = content.find_element(By.CLASS_NAME,"movie-name-text").text#电影名称
thread=threading.Thread(target=downloda_movie_image,args=(image_url,movie.name,)).start()#多线程启动图片下载
movie.rank = int(content.find_element(By.CLASS_NAME,"rank-num").text)#
movie.crew = content.find_element(By.CLASS_NAME,"movie-crew").text.split("/")
movie.rating = float(content.find_element(By.CLASS_NAME,"rating_num").text)
movie.comment = int(re.sub("\D", "", content.find_element(By.CLASS_NAME,"comment-num").text))
csv_file.writerow(movie.get())
count += 1
if count >= 10:
raise Exception("爬够了,休息")
except Exception as exp:
print(exp)
if __name__=="__main__":
main()
```python
在爬取豆瓣网时
可以执行csv文件保存但是图片无法保存,图片名称和路径也可以获取通过thread线程下载
有大佬帮忙看一下吗