PyCharm crawler: it runs unstably, the amount scraped varies from run to run, and it can't re-enter the main loop. How should this be handled?


The cause is that some nodes do not contain the image element, so it can't be retrieved, and calling find on the resulting NoneType object raises an error. Two fixes:
Use try/except exception handling: try: img = ku_part.find('img') ... except: pass. Or filter with if/else: if the result is not None, use the selected value; if it is None, assign "" instead.
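A minimal sketch of both options, assuming ku_part is a BeautifulSoup node from the asker's loop (the surrounding names are hypothetical):

# Option 1: try/except -- fall back to '' when the node has no <img>
try:
    img = ku_part.find('img')
    src = img.get('src')  # raises AttributeError if find() returned None
except AttributeError:
    src = ''

# Option 2: if/else -- test for None before reading the attribute
img = ku_part.find('img')
src = img.get('src') if img is not None else ''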

LOL, I've actually written this exact thing before:


import os, sys, requests, re
import multiprocessing as mp

os.chdir(sys.path[0])  # switch to the script's own directory

url = r'https://pic.netbian.com/4kmeinv/'  # root URL of the target listing page
img_root = r'https://pic.netbian.com'  # prefix for the relative image URLs
img_src = re.compile(
    '<div class="photo-pic"><a href="" id="img"><img src="(.+?)" data-pic="(.+?)" alt="(.+?)" title="(.+?)"></a></div>',
    re.S)
page_suffix = r'index_'  # prefix used for pagination
html_suffix = '.html'  # html suffix
jpg_suffix = '.jpg'  # jpg suffix
url_list_pattern = re.compile('<div class="slist">(.*?)</div>', re.S)
img_url_pattern = re.compile('<li><a href="(.+?)" target="_blank"><img src="(.+?)" alt="(.+?)" />(.+?)</b></a></li>',
                             re.S)

aim_dir = '4Kmeinv/'  # relative path of the output folder
max_page = 146  # page count, calibrated by hand
coding = 'GBK'  # encoding used to decode responses

# ---------- request setup ----------
headers = {}
kw = {}
proxies = {}
forbid_character = re.compile('[\\/:*?\"<>|]')  # characters that are illegal in file names
root = r'https?://'
root1 = r'https://'
user_agent = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
headers['User-Agent'] = user_agent
first_use = 1
if not re.match(root, url):
    url = root1 + url
timeout = 15
r = requests.get(url, headers=headers, params=kw, proxies=proxies, timeout=timeout)
if not os.path.exists(aim_dir):
    os.mkdir(aim_dir)

if first_use:
    with open(aim_dir + 'test.html', 'wb') as f:
        f.write(r.content)


# ---------- request setup ----------

def get_and_download(page_index):
    # fetch listing page number page_index
    if page_index == 1:
        r = requests.get(url, headers=headers, params=kw, proxies=proxies, timeout=timeout)
    else:
        r = requests.get(url + page_suffix + str(page_index) + html_suffix, headers=headers, params=kw, proxies=proxies,
                         timeout=timeout)

    # all detail-page URLs found on this listing page
    url_list = url_list_pattern.findall(r.content.decode(coding))[0]
    img_list = img_url_pattern.findall(url_list)
    for img_url in img_list:
        r = requests.get(img_root + img_url[0], headers=headers, params=kw, proxies=proxies,
                         timeout=timeout)  # this fetches the detail page
        if not r.status_code == 200:
            print(f'Failed to fetch the image at {img_url[0]} on page {page_index}: {r.status_code}')
            continue
        r1 = img_src.findall(r.content.decode(coding))[0]  # this extracts the actual image URL
        r2 = requests.get(img_root + r1[0], headers=headers, params=kw, proxies=proxies, timeout=timeout)
        img_id = img_url[0].split('/')[2]  # detail href looks like '/tupian/12345.html'
        print(f'Downloading image {img_id}')
        with open(aim_dir + img_id + '_' + forbid_character.sub('+', r1[3]) + jpg_suffix, 'wb') as g:
            g.write(r2.content)
    print(url_list)  # debug: dump the listing block that was scanned


if __name__ == '__main__':
    pool = mp.Pool(5)  # pool of 5 worker processes
    for i in range(140, max_page + 1):  # note: starts at page 140, not page 1
        # get_and_download(i)
        pool.apply_async(get_and_download, (i,))

    pool.close()
    pool.join()
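A side note on the "sometimes more, sometimes less" symptom: pool.apply_async silently discards any exception raised inside a worker unless you keep the AsyncResult and call get() (or pass an error_callback), so a page that fails halfway simply disappears from the output. A minimal sketch of one way to surface those errors, reusing the names from the script above:

if __name__ == '__main__':
    pool = mp.Pool(5)
    # keep the AsyncResult objects so worker exceptions are not lost
    results = [(i, pool.apply_async(get_and_download, (i,))) for i in range(1, max_page + 1)]
    pool.close()
    for i, res in results:
        try:
            res.get()  # re-raises IndexError / requests errors from the worker
        except Exception as e:
            print(f'Page {i} failed: {e}')
    pool.join()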

Just filter out the empty data.
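For the script above, "filter out the empty data" mainly means guarding the findall results: on a page where a pattern matches nothing, findall returns an empty list, indexing it with [0] raises IndexError, and that worker dies partway through its page, which is exactly why the number of downloaded images varies. A rough sketch of get_and_download reworked with those guards (the rest of the script unchanged):

def get_and_download(page_index):
    page = url if page_index == 1 else url + page_suffix + str(page_index) + html_suffix
    r = requests.get(page, headers=headers, timeout=timeout)
    blocks = url_list_pattern.findall(r.content.decode(coding))
    if not blocks:  # listing block missing: skip the page instead of crashing
        print(f'Page {page_index}: no image list found, skipping')
        return
    for img_url in img_url_pattern.findall(blocks[0]):
        detail = requests.get(img_root + img_url[0], headers=headers, timeout=timeout)
        matches = img_src.findall(detail.content.decode(coding))
        if not matches:  # detail page without the expected <img>: skip this entry
            continue
        r1 = matches[0]
        img_id = img_url[0].split('/')[2]
        with open(aim_dir + img_id + '_' + forbid_character.sub('+', r1[3]) + jpg_suffix, 'wb') as g:
            g.write(requests.get(img_root + r1[0], headers=headers, timeout=timeout).content)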

Page one of the sword manual: forget the background image.