Saving a Keras image-segmentation model fails with TypeError: cannot pickle '_thread.RLock' object

Problem description and background

Today I was doing image segmentation with TensorFlow and Keras. I found this code online and modified it, and now the model refuses to save: model.save_weights works fine, but model.save always fails. The relevant code is below:
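
The symptom in isolation looks like this (a sketch, not my original code; model is the FCN model built further below, and the file names are placeholders):

# Sketch of the symptom only: weights-only saving succeeds, full saving does not.
model.save_weights('fcn32s_weights.h5')  # succeeds
model.save('fcn32s.h5')                  # TypeError: cannot pickle '_thread.RLock' object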

Relevant code

Image-processing / data-generator code:

import numpy as np
import random
import os
from PIL import Image
from keras.utils.np_utils import to_categorical


# Horse dataset, download from http://www.msri.org/people/members/eranb/
def get_horse_generator(horse_path, batch_size=2, input_hw=(256, 256, 3), mask_hw=(256, 256, 20),
                        preprocess=True, shuffle=True):
    def norm(x):
        x = x / 128
        x -= 1
        return x

    image_list = ['{:02d}'.format(i) for i in range(49)]  # file names '00' to '48'

    batch_images = np.empty((batch_size, input_hw[0], input_hw[1], input_hw[2]))
    batch_masks = np.empty((batch_size, mask_hw[0], mask_hw[1], mask_hw[2]))
    batch_id = 0

    while True:
        if shuffle:
            random.shuffle(image_list)

        for image_name in image_list:
            try:
                image_path = os.path.join(horse_path, 'train', 'image', image_name + '.jpg')
                image = Image.open(image_path)
                image = image.resize(input_hw[0:2], Image.NEAREST)
                image_np = np.asarray(image, dtype=np.uint8)
                if preprocess:
                    image_np = norm(image_np)
                batch_images[batch_id] = image_np

                mask_path = os.path.join(horse_path, 'train', 'label', image_name + '.png')
                mask = Image.open(mask_path)
                mask = mask.resize(mask_hw[0:2], Image.NEAREST)
                mask_np = np.asarray(mask, dtype=np.uint8).copy()

                mask_np[mask_np != 0] = 1  # collapse all non-zero labels to 1 (horse); 0 stays background
                mask_np = to_categorical(mask_np, num_classes=mask_hw[2])

                batch_masks[batch_id] = mask_np

                batch_id += 1
                if batch_id == batch_size:
                    batch_id = 0
                    yield batch_images, batch_masks
            except FileNotFoundError:
                print('Image not found, ignoring', image_name)

# np.set_printoptions(threshold=np.nan)
# horse_path = 'C:/Users/AlphaCat/Desktop/image_segmentation/weizmann_horse_db'
# horse_gen = get_horse_generator(horse_path, batch_size=2, input_hw=(32, 32, 3),
#                                 mask_hw=(32, 32, 2),
#                                 preprocess=True, shuffle=True)
#
# print(next(horse_gen))
# print(next(horse_gen)[0].shape)
# print(next(horse_gen)[1].shape)
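
One side note on this generator (my observation, not part of the original question): it yields the same two preallocated numpy buffers for every batch, so anything that keeps a reference to an earlier batch will see it silently overwritten. If that ever matters, the yield line could hand out copies instead, at the cost of one allocation per batch:

                if batch_id == batch_size:
                    batch_id = 0
                    # Yield copies so a later batch doesn't overwrite data a consumer still holds:
                    yield batch_images.copy(), batch_masks.copy()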


FCN model code:

# Imports assumed from the rest of the post (standalone keras with a TensorFlow backend):
import tensorflow as tf
from keras import backend as K
from keras import layers, models
from keras.applications import vgg16


def get_fcn32s_model(input_shape=(224, 224, 3), class_no=2):
    """
    FCN-32s model.
    :param input_shape: (image height, image width, RGB channel count); height and width should ideally be multiples of 32
    :param class_no: number of classes
    :return: Keras model
    """
    input_tensor = layers.Input(shape=input_shape)
    x = layers.ZeroPadding2D(padding=(99, 99))(input_tensor)  # Pad 100, 99 + 1 in first layer of vgg
    with tf.compat.v1.variable_scope("vgg_encoder"):
        encoder = vgg16.VGG16(input_tensor=x, include_top=False, weights='imagenet')

    with tf.compat.v1.variable_scope("vgg_decoder"):
        x = encoder.get_layer('block5_pool').output
        # Downsample further with convolutions
        x = layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu', padding='same', name='fc6')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu', padding='same', name='fc7')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.MaxPooling2D(pool_size=(2, 2), strides=2, padding='same')(x)
        x = layers.Dropout(0.2)(x)

        x = layers.Conv2D(filters=128, kernel_size=(3, 3), activation='relu', padding='same', name='fc8')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(filters=128, kernel_size=(3, 3), activation='relu', padding='same', name='fc9')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.MaxPooling2D(pool_size=(2, 2), strides=2, padding='same')(x)
        x = layers.Dropout(0.2)(x)

        x = layers.Conv2D(filters=256, kernel_size=(3, 3), activation='relu', padding='same', name='fc10')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(filters=256, kernel_size=(3, 3), activation='relu', padding='same', name='fc11')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(filters=256, kernel_size=(3, 3), activation='relu', padding='same', name='fc12')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.MaxPooling2D(pool_size=(2, 2), strides=2, padding='same')(x)
        x = layers.Dropout(0.2)(x)

        x = layers.Conv2D(filters=512, kernel_size=(3, 3), activation='relu', padding='same', name='fc13')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(filters=512, kernel_size=(3, 3), activation='relu', padding='same', name='fc14')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(filters=512, kernel_size=(3, 3), activation='relu', padding='same', name='fc15')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.MaxPooling2D(pool_size=(2, 2), strides=2, padding='same')(x)
        x = layers.Dropout(0.2)(x)

        x = layers.Conv2D(filters=512, kernel_size=(3, 3), activation='relu', padding='same', name='fc16')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(filters=512, kernel_size=(3, 3), activation='relu', padding='same', name='fc17')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.Conv2D(filters=512, kernel_size=(3, 3), activation='relu', padding='same', name='fc18')(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation('relu')(x)
        x = layers.MaxPooling2D(pool_size=(2, 2), strides=2, padding='same')(x)
        x = layers.Dropout(0.2)(x)

        # 1x1 convolution standing in for a fully connected layer
        x = layers.Conv2D(filters=class_no, kernel_size=(1, 1), padding='valid')(x)

        # Upsample with a transposed convolution
        x = layers.Conv2DTranspose(filters=class_no, kernel_size=(64, 64), strides=(32, 32), padding='same',
                                   use_bias=False, name='Upsampling1')(x)

    # If the size still does not match, add a bilinear upsampling step
    # (usually needed when the image size is not a multiple of 32)
    if K.int_shape(x)[1:3] != K.int_shape(input_tensor)[1:3]:
        print('Size different, do Bilinear Upsampling')
        x = layers.Lambda(lambda t: tf.compat.v1.image.resize_bilinear(t, size=K.int_shape(input_tensor)[1:3]))(x)

    # Apply softmax over the class channels of each output pixel
    x = layers.Activation('softmax', name='output')(x)

    model = models.Model(inputs=input_tensor, outputs=x)

    return model
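
The model is then compiled and trained roughly as follows. This is a hypothetical reconstruction of main.py pieced together from the traceback and the log below, not the original file: only the file name fcn32s.h5, steps_per_epoch=250, epochs=30, batch size 2, and the 0.001 learning rate can be read off the log; the optimizer, loss, and checkpoint arguments are my assumptions, and the custom mean_iou metric the log shows is omitted here.

# Hypothetical reconstruction, not the original main.py; the dataset path is
# a placeholder taken from the commented-out demo code above.
from keras.callbacks import ModelCheckpoint

model = get_fcn32s_model(input_shape=(256, 256, 3), class_no=2)
model.compile(optimizer='adam',  # Adam's default lr of 0.001 matches the "lr: 0.0010" in the log
              loss='categorical_crossentropy',
              metrics=['acc'])

horse_gen = get_horse_generator('weizmann_horse_db', batch_size=2,
                                input_hw=(256, 256, 3), mask_hw=(256, 256, 2))

# The traceback shows the failure starting from this callback's call to
# model.save() at the end of an epoch ("Epoch 00003: saving model to fcn32s.h5").
model.fit(horse_gen,
          steps_per_epoch=250,
          epochs=30,
          callbacks=[ModelCheckpoint('fcn32s.h5', verbose=1)])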


Run output and error

The error output is as follows:

Epoch 1/30
2022-04-15 16:51:37.163568: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8101
2022-04-15 16:51:38.652509: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.58GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-04-15 16:51:41.660889: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.16GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-04-15 16:51:42.470852: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.14GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-04-15 16:51:42.496813: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.16GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-04-15 16:51:43.549107: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.80GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-04-15 16:51:43.975971: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.58GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
  1/250 [..............................] - ETA: 44:28 - batch: 0.0000e+00 - size: 2.0000 - loss: 1411838.7500 - mean_iou: 0.0000e+00 - acc: 0.53442022-04-15 16:51:44.198713: I tensorflow/core/profiler/lib/profiler_session.cc:110] Profiler session initializing.
2022-04-15 16:51:44.198963: I tensorflow/core/profiler/lib/profiler_session.cc:125] Profiler session started.
2022-04-15 16:51:44.199151: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1630] Profiler found 1 GPUs
2022-04-15 16:51:44.202350: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cupti64_112.dll'; dlerror: cupti64_112.dll not found
2022-04-15 16:51:44.205388: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cupti.dll'; dlerror: cupti.dll not found
2022-04-15 16:51:44.205686: E tensorflow/core/profiler/internal/gpu/cupti_error_manager.cc:135] cuptiGetTimestamp: error 999: 
2022-04-15 16:51:44.205951: E tensorflow/core/profiler/internal/gpu/cupti_error_manager.cc:184] cuptiSubscribe: ignored due to a previous error.
2022-04-15 16:51:44.206173: E tensorflow/core/profiler/internal/gpu/cupti_error_manager.cc:457] cuptiGetResultString: ignored due to a previous error.
2022-04-15 16:51:44.206385: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1682] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with error 
2022-04-15 16:51:44.565697: I tensorflow/core/profiler/lib/profiler_session.cc:67] Profiler session collecting data.
2022-04-15 16:51:44.566253: E tensorflow/core/profiler/internal/gpu/cupti_error_manager.cc:140] cuptiFinalize: ignored due to a previous error.
2022-04-15 16:51:44.566520: E tensorflow/core/profiler/internal/gpu/cupti_error_manager.cc:457] cuptiGetResultString: ignored due to a previous error.
2022-04-15 16:51:44.566736: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1773] function cupti_interface_->Finalize()failed with error 
2022-04-15 16:51:44.596698: E tensorflow/core/profiler/internal/gpu/cupti_error_manager.cc:133] cuptiGetTimestamp: ignored due to a previous error.
2022-04-15 16:51:44.596994: E tensorflow/core/profiler/internal/gpu/cupti_error_manager.cc:133] cuptiGetTimestamp: ignored due to a previous error.
2022-04-15 16:51:44.597213: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:526]  GpuTracer has collected 0 callback api events and 0 activity events. 
2022-04-15 16:51:44.606245: I tensorflow/core/profiler/lib/profiler_session.cc:143] Profiler session tear down.
2022-04-15 16:51:44.618307: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: log\plugins\profile\2022_04_15_08_51_44
2022-04-15 16:51:44.750214: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for trace.json.gz to log\plugins\profile\2022_04_15_08_51_44\LAPTOP-4BUVNCF9.trace.json.gz
2022-04-15 16:51:44.847338: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: log\plugins\profile\2022_04_15_08_51_44
2022-04-15 16:51:44.888927: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for memory_profile.json.gz to log\plugins\profile\2022_04_15_08_51_44\LAPTOP-4BUVNCF9.memory_profile.json.gz
2022-04-15 16:51:45.035196: I tensorflow/core/profiler/rpc/client/capture_profile.cc:251] Creating directory: log\plugins\profile\2022_04_15_08_51_44
Dumped tool data for xplane.pb to log\plugins\profile\2022_04_15_08_51_44\LAPTOP-4BUVNCF9.xplane.pb
Dumped tool data for overview_page.pb to log\plugins\profile\2022_04_15_08_51_44\LAPTOP-4BUVNCF9.overview_page.pb
Dumped tool data for input_pipeline.pb to log\plugins\profile\2022_04_15_08_51_44\LAPTOP-4BUVNCF9.input_pipeline.pb
Dumped tool data for tensorflow_stats.pb to log\plugins\profile\2022_04_15_08_51_44\LAPTOP-4BUVNCF9.tensorflow_stats.pb
Dumped tool data for kernel_stats.pb to log\plugins\profile\2022_04_15_08_51_44\LAPTOP-4BUVNCF9.kernel_stats.pb
250/250 [==============================] - 102s 367ms/step - batch: 124.5000 - size: 2.0000 - loss: 79747.2732 - mean_iou: 0.4251 - acc: 0.6109 - lr: 0.0010
Epoch 2/30
250/250 [==============================] - 91s 365ms/step - batch: 124.5000 - size: 2.0000 - loss: 1044.2393 - mean_iou: 0.4415 - acc: 0.6177 - lr: 0.0010
Epoch 3/30
250/250 [==============================] - ETA: 0s - batch: 124.5000 - size: 2.0000 - loss: 269.6952 - mean_iou: 0.4431 - acc: 0.6153
Epoch 00003: saving model to fcn32s.h5
Traceback (most recent call last):
  File "E:\anaconda3\envs\hr\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-2-2c9bf53164ed>", line 1, in <module>
    runfile('E:/pycharm数据/AI/image_segmentation-master/main.py', wdir='E:/pycharm数据/AI/image_segmentation-master')
  File "E:\PyCharm 2021.3\plugins\python\helpers\pydev\_pydev_bundle\pydev_umd.py", line 198, in runfile
    pydev_imports.execfile(filename, global_vars, local_vars)  # execute the script
  File "E:\PyCharm 2021.3\plugins\python\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "E:/pycharm数据/AI/image_segmentation-master/main.py", line 39, in <module>
    model.fit(
  File "E:\anaconda3\envs\hr\lib\site-packages\keras\engine\training_v1.py", line 777, in fit
    return func.fit(
  File "E:\anaconda3\envs\hr\lib\site-packages\keras\engine\training_generator_v1.py", line 570, in fit
    return fit_generator(
  File "E:\anaconda3\envs\hr\lib\site-packages\keras\engine\training_generator_v1.py", line 315, in model_iteration
    callbacks.on_epoch_end(epoch, epoch_logs)
  File "E:\anaconda3\envs\hr\lib\site-packages\keras\callbacks.py", line 414, in on_epoch_end
    callback.on_epoch_end(epoch, logs)
  File "E:\anaconda3\envs\hr\lib\site-packages\keras\callbacks.py", line 1376, in on_epoch_end
    self._save_model(epoch=epoch, batch=None, logs=logs)
  File "E:\anaconda3\envs\hr\lib\site-packages\keras\callbacks.py", line 1442, in _save_model
    self.model.save(filepath, overwrite=True, options=self._options)
  File "E:\anaconda3\envs\hr\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "E:\anaconda3\envs\hr\lib\copy.py", line 146, in deepcopy
    y = copier(x, memo)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 230, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 146, in deepcopy
    y = copier(x, memo)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 205, in _deepcopy_list
    append(deepcopy(a, memo))
  File "E:\anaconda3\envs\hr\lib\copy.py", line 172, in deepcopy
    y = _reconstruct(x, memo, *rv)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 296, in _reconstruct
    value = deepcopy(value, memo)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 146, in deepcopy
    y = copier(x, memo)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 230, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 146, in deepcopy
    y = copier(x, memo)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 210, in _deepcopy_tuple
    y = [deepcopy(a, memo) for a in x]
  File "E:\anaconda3\envs\hr\lib\copy.py", line 210, in <listcomp>
    y = [deepcopy(a, memo) for a in x]
  File "E:\anaconda3\envs\hr\lib\copy.py", line 146, in deepcopy
    y = copier(x, memo)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 210, in _deepcopy_tuple
    y = [deepcopy(a, memo) for a in x]
  File "E:\anaconda3\envs\hr\lib\copy.py", line 210, in <listcomp>
    y = [deepcopy(a, memo) for a in x]
  File "E:\anaconda3\envs\hr\lib\copy.py", line 172, in deepcopy
    y = _reconstruct(x, memo, *rv)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 270, in _reconstruct
    state = deepcopy(state, memo)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 146, in deepcopy
    y = copier(x, memo)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 230, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 172, in deepcopy
    y = _reconstruct(x, memo, *rv)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 270, in _reconstruct
    state = deepcopy(state, memo)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 146, in deepcopy
    y = copier(x, memo)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 230, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 172, in deepcopy
    y = _reconstruct(x, memo, *rv)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 270, in _reconstruct
    state = deepcopy(state, memo)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 146, in deepcopy
    y = copier(x, memo)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 230, in _deepcopy_dict
    y[deepcopy(key, memo)] = deepcopy(value, memo)
  File "E:\anaconda3\envs\hr\lib\copy.py", line 161, in deepcopy
    rv = reductor(4)
TypeError: cannot pickle '_thread.RLock' object

My troubleshooting and what I've tried

When I looked for answers elsewhere, the suggestion was that it might be an input-format problem, but I couldn't find the corresponding code in my project. I'd be grateful for any pointers. Thanks, everyone!
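
For completeness, the weights-only saving mentioned at the top does work. A minimal sketch of that workaround (file names are placeholders; rebuilding the architecture in code before loading is how a weights-only checkpoint has to be restored):

# Workaround sketch: save weights only, then rebuild the architecture and
# load the weights back. File names here are placeholders.
model.save_weights('fcn32s_weights.h5')

restored = get_fcn32s_model(input_shape=(256, 256, 3), class_no=2)
restored.load_weights('fcn32s_weights.h5')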