I've tried this many times: it gets through a bit more than 2 pages, then prints the message from the except block and stops before finishing the pages I asked for. No actual error seems to be raised, so I have no idea how to debug it. (There are a lot of files to download; do I need to make this multithreaded? I tried methods I found on Baidu with no luck.) I also need to add a piece of info from the detail page, plus the page URL, into the image file's properties, and I have no clue where to start. I'm a complete amateur learning bit by bit from Baidu searches; please don't look down on me, any pointers are appreciated.
import os
import json
import re
import time
import traceback

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
# Download the full image set from a gallery-thumbnail page
def getPic(url):
    print("download pic url +===" + url)
    result = requests.get(url, headers=headers, timeout=30)
    result.encoding = 'utf-8'
    soup = BeautifulSoup(result.content, 'lxml')
    # The image list is stored as JSON in the div's data-images attribute
    json_data = soup.find('div', attrs={'id': 'gallery-items'})
    name = soup.find('a').string
    # Strip characters that are illegal in Windows file names
    simpleName = re.sub(r'[/:*?"<>|\\]+', '-', name)
    print(simpleName)
    path = 'f:/CodeWar/spider/Archdaily/'
    newPath = os.path.join(path, simpleName)
    os.makedirs(newPath, exist_ok=True)
    figures = json.loads(json_data.get('data-images'))
    i = 1
    for figure in figures:
        try:
            print('downloading number:' + str(i) + "====>>" + figure['url_large'])
            image = requests.get(url=figure['url_large'], headers=headers, timeout=30)
            if image.status_code == 200:
                # Write with the full path instead of os.chdir(), which
                # silently changes the working directory for the whole program
                with open(os.path.join(newPath, str(i) + '.jpg'), 'wb') as f:
                    f.write(image.content)
            i += 1
        except Exception:
            # A bare except: hides the real reason the crawl stops;
            # print the traceback so it can actually be debugged
            traceback.print_exc()
            time.sleep(5)
            continue
# Collect the project links from one listing page and crawl each one
def get_url(page):
    # Use the parameter, not the global sourceWeb, so the function
    # actually fetches the page it was given
    pageResult = requests.get(page, headers=headers, timeout=30)
    pageSoup = BeautifulSoup(pageResult.content, 'lxml')
    for collection in pageSoup.find_all('a', class_='afd-title--black-link'):
        if 'href' in collection.attrs:
            sonLink = 'https://www.archdaily.com' + collection.attrs['href']
            sonResponde = requests.get(sonLink, headers=headers, timeout=30)
            sonResponde.encoding = 'utf-8'
            sonSoup = BeautifulSoup(sonResponde.content, 'lxml')
            thumb = sonSoup.find('a', class_='gallery-thumbs-link')
            if thumb:
                thumbLink = 'https://www.archdaily.com' + thumb.attrs['href']
                try:
                    getPic(thumbLink)
                except Exception:
                    traceback.print_exc()
                    time.sleep(5)
                    continue
    print('--------- create next folder ---------')
motherWeb = 'https://www.archdaily.com/page/'
# Number of listing pages to crawl; the old "n = 0; while n <= wanna_page"
# loop pre-incremented n and so fetched one page too many
wanna_page = 10
for n in range(1, wanna_page + 1):
    sourceWeb = motherWeb + str(n)
    try:
        get_url(sourceWeb)
        print('this is page ' + str(n))
    except Exception:
        # Show the real error before sleeping; "connection refused" was
        # only a guess, and the bare except caught everything else too
        traceback.print_exc()
        print("Let me sleep for 5 seconds")
        time.sleep(5)
        print("Was a nice sleep, now let me continue...")
        continue
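
On the multithreading part of the question: the per-image downloads inside getPic are independent of one another, so a thread pool is the usual fit. Below is a minimal sketch using concurrent.futures that drops into the script above (it reuses requests, os, traceback, and headers from there); download_one, download_all, and the worker count are illustrative names, not from the original code. Keep workers small: a site may throttle bursts of requests, and being throttled mid-crawl would look exactly like the "stops after 2 pages" symptom, so the traceback printing above matters more than the threading.

from concurrent.futures import ThreadPoolExecutor, as_completed

def download_one(index, figure, folder):
    # Fetch one image and save it under its sequence number
    image = requests.get(figure['url_large'], headers=headers, timeout=30)
    if image.status_code == 200:
        with open(os.path.join(folder, str(index) + '.jpg'), 'wb') as f:
            f.write(image.content)
    return index

def download_all(figures, folder, workers=4):
    # Submit every image as a task; as_completed yields each one as it finishes
    with ThreadPoolExecutor(max_workers=workers) as pool:
        tasks = [pool.submit(download_one, i, fig, folder)
                 for i, fig in enumerate(figures, start=1)]
        for task in as_completed(tasks):
            try:
                print('finished number: ' + str(task.result()))
            except Exception:
                traceback.print_exc()

With this in place, the whole "for figure in figures" loop in getPic becomes a single call: download_all(figures, newPath).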
You mean the Details tab of the file's Properties, right? That is the image's EXIF data; you can edit it with piexif. As a demo, this puts your user-agent string into the Artist field:
from PIL import Image
import piexif

im = Image.open('4.jpg')
# Freshly downloaded JPEGs may have no EXIF block at all, so fall back
# to an empty structure instead of crashing on a missing "exif" key
if "exif" in im.info:
    exif_dict = piexif.load(im.info["exif"])
else:
    exif_dict = {"0th": {}, "Exif": {}, "GPS": {}, "1st": {}, "thumbnail": None}
# Artist shows up as "Authors" in the Windows Details tab
exif_dict["0th"][piexif.ImageIFD.Artist] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36".encode()
exif_bytes = piexif.dump(exif_dict)
im.save("4.jpg", exif=exif_bytes)
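
For what you actually asked (a snippet from the detail page plus the page URL), the same pattern works with different fields. A minimal sketch, assuming you pass both strings in from the crawler; tag_image is a made-up helper name, and ImageDescription/XPComment are one reasonable field choice, not the only one. The XP* fields are what the Windows Details tab shows as Title/Comments, and they must be UTF-16LE encoded.

from PIL import Image
import piexif

def tag_image(filename, description, page_url):
    im = Image.open(filename)
    # Same fallback as above for images with no EXIF block
    if "exif" in im.info:
        exif_dict = piexif.load(im.info["exif"])
    else:
        exif_dict = {"0th": {}, "Exif": {}, "GPS": {}, "1st": {}, "thumbnail": None}
    # ImageDescription holds the text snippet from the detail page
    exif_dict["0th"][piexif.ImageIFD.ImageDescription] = description.encode('utf-8')
    # XPComment appears as "Comments" in the Details tab; XP* fields are UTF-16LE
    exif_dict["0th"][piexif.ImageIFD.XPComment] = page_url.encode('utf-16le')
    piexif.insert(piexif.dump(exif_dict), filename)

tag_image('4.jpg', 'text scraped from the detail page', 'https://www.archdaily.com/...')

Note that piexif.insert() rewrites only the EXIF segment of the file, so unlike im.save() it does not re-encode the image data.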