The cause is that some nodes contain no such image element, so the lookup returns nothing, and calling the find method on a NoneType object raises an error. Two fixes:
You can handle it with try/except, e.g. try: img = ku_part.find('img') ... except: pass. Or you can filter with if/else: when the result is not empty, use the selected value; when it is empty, assign "" instead.
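For example, here is a minimal sketch of both approaches, assuming BeautifulSoup; the HTML, the div class, and the variable names are placeholders for illustration, not from the original page:

from bs4 import BeautifulSoup

html = '<div class="ku"><p>no image here</p></div>'
soup = BeautifulSoup(html, 'html.parser')
ku_part = soup.find('div', class_='ku')  # may be None for some nodes

# Option 1: try/except -- let the failed lookup raise, then fall back to ""
try:
    img = ku_part.find('img')['src']
except (AttributeError, TypeError):  # ku_part or the <img> lookup was None
    img = ''

# Option 2: if/else -- test for None before using the value
img_tag = ku_part.find('img') if ku_part is not None else None
img = img_tag['src'] if img_tag is not None else ''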
LOL, I've written this exact thing before:
import os, sys, requests, re
import multiprocessing as mp

os.chdir(sys.path[0])  # reset the working directory to the script's location
url = r'https://pic.netbian.com/4kmeinv/'  # root of the target listing pages
img_root = r'https://pic.netbian.com'  # prefix for the site's relative image URLs
img_src = re.compile(
    '<div class="photo-pic"><a href="" id="img"><img src="(.+?)" data-pic="(.+?)" alt="(.+?)" title="(.+?)"></a></div>',
    re.S)
page_suffix = r'index_'  # filename prefix used for pagination
html_suffix = '.html'  # html extension
jpg_suffix = '.jpg'  # jpg extension
url_list_pattern = re.compile('<div class="slist">(.*?)</div>', re.S)
img_url_pattern = re.compile('<li><a href="(.+?)" target="_blank"><img src="(.+?)" alt="(.+?)" />(.+?)</b></a></li>',
                             re.S)
aim_dir = '4Kmeinv/'  # output directory, relative path
max_page = 146  # page count, calibrated by hand
coding = 'GBK'  # encoding used to decode the pages
# ---------- regex-scraper initialization ----------
headers = {}
kw = {}
proxies = {}
forbid_character = re.compile('[\\/:*?\"<>|]')  # characters forbidden in file names
root = r'https?://'
root1 = r'https://'
user_agent = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
headers['User-Agent'] = user_agent
first_use = 1
if not re.match(root, url):
    url = root1 + url  # prepend the scheme if the URL lacks one
timeout = 15
r = requests.get(url, headers=headers, params=kw, proxies=proxies, timeout=timeout)
if not os.path.exists(aim_dir):
    os.mkdir(aim_dir)
if first_use:
    # dump the first page so the regexes can be checked against the real markup
    with open(aim_dir + 'test.html', 'wb') as f:
        f.write(r.content)
# ---------- regex-scraper initialization ----------
def get_and_download(page_index):
    # fetch listing page page_index (page 1 has no index_ suffix)
    if page_index == 1:
        r = requests.get(url, headers=headers, params=kw, proxies=proxies, timeout=timeout)
    else:
        r = requests.get(url + page_suffix + str(page_index) + html_suffix, headers=headers, params=kw,
                         proxies=proxies, timeout=timeout)
    # all detail-page URLs found on this listing page
    url_list = url_list_pattern.findall(r.content.decode(coding))[0]
    img_list = img_url_pattern.findall(url_list)
    for img_url in img_list:
        r = requests.get(img_root + img_url[0], headers=headers, params=kw, proxies=proxies,
                         timeout=timeout)  # this fetches the detail page
        if r.status_code != 200:
            print(f'Failed to fetch the image at {img_url[0]} on page {page_index}: {r.status_code}')
            continue
        r1 = img_src.findall(r.content.decode(coding))[0]  # this yields the actual image URL
        r2 = requests.get(img_root + r1[0], headers=headers, params=kw, proxies=proxies, timeout=timeout)
        img_id = img_url[0].split('/')[2].split('.')[0]  # strip the .html extension from the detail-page name
        print(f'Downloading image {img_id}')
        with open(aim_dir + img_id + '_' + forbid_character.sub('+', r1[3]) + jpg_suffix, 'wb') as g:
            g.write(r2.content)
    print(url_list)
if __name__ == '__main__':
    pool = mp.Pool(5)  # process pool: 5 workers, one listing page per task
    for i in range(140, max_page + 1):  # starts at page 140 here; use range(1, max_page + 1) for a full crawl
        # get_and_download(i)  # single-process variant, handy for debugging
        pool.apply_async(get_and_download, (i,))
    pool.close()
    pool.join()
Just filter out the entries that come back empty.
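For instance, a minimal sketch of that filtering step, assuming BeautifulSoup lookups; the HTML and the names parts/srcs are hypothetical:

from bs4 import BeautifulSoup

html = '<li><img src="a.jpg"></li><li>no image</li>'
soup = BeautifulSoup(html, 'html.parser')
parts = soup.find_all('li')

# keep only nodes that actually contain an <img>, dropping the None lookups
srcs = [tag['src'] for tag in (part.find('img') for part in parts) if tag is not None]
print(srcs)  # ['a.jpg']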
Page one of the sword manual: forget the wallpaper.