TensorFlow image segmentation fails with what looks like a GPU out-of-memory error

While training an image-segmentation model with TensorFlow I get: "Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode."

Symptoms and background

For my graduation project I found FCN code online to use for image segmentation. I had previously trained a U-Net successfully, but this code fails during training. The code is below.

Relevant code
import sys

from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from keras.models import load_model
# from keras.optimizers import Adam
from keras.optimizer_v2.adam import Adam

from custom_loss import *
from custom_metrics import *
from data_gens import *
from models import *


sys.setrecursionlimit(1000)

if __name__ == "__main__":
    # Use the Weizmann Horse dataset
    horse_path = 'weizmann_horse_db'
    batch_size = 2

    train_gen = horse_gen.get_horse_generator(horse_path, train_or_val='train', batch_size=batch_size,
                                              input_hw=(299, 299, 3), mask_hw=(299, 299, 2))
    val_gen = horse_gen.get_horse_generator(horse_path, 'val', batch_size=batch_size * 1, input_hw=(299, 299, 3), mask_hw=(299, 299, 2))

    # model = FCN.get_fcn8s_model(input_shape=(299, 299, 3), class_no=2)
    # model = FCN.get_fcn16s_model(input_shape=(299, 299, 3), class_no=2)
    model = FCN.get_fcn32s_model(input_shape=(299, 299, 3), class_no=2)
    # model = Unet.get_unet_model(input_shape=(299, 299, 3), class_no=2)
    # model = DeepLabV3Plus.get_model(input_shape=(299, 299, 3), atrous_rate=(4, 8, 12), class_no=2)

    # model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[mean_iou, 'acc'])
    model.compile(loss=categorical_focal_loss(alpha=None, gamma=2.), optimizer='adam', metrics=[mean_iou, 'acc'])
    model.summary()

    checkpoint = ModelCheckpoint('fcn32s.h5', verbose=1, save_best_only=False, period=3)  # every 3 epoch
    tensor_board = TensorBoard(log_dir='log', histogram_freq=0, write_graph=True, write_grads=True, write_images=True)
    learning_rate_reduction = ReduceLROnPlateau(monitor='loss', patience=2, verbose=1, factor=0.5, min_lr=0.000001)

    model.fit(
        train_gen,
        steps_per_epoch=250,
        epochs=50,
        validation_data=val_gen,
        validation_steps=3,
        callbacks=[checkpoint, tensor_board, learning_rate_reduction]
    )

    print('Start saving model into h5 file')
    model.save('fcn32s.h5')

    # tf.contrib.saved_model.save_keras_model(model, 'output') # available on tensorflow 1.12

    print('======== Start Test ===========')
    model = load_model('fcn32s.h5', compile=False, custom_objects={'BilinearResizeLayer2D': BilinearResizeLayer2D})

    # Take 100 images from the val set and check the predictions
    val_gen2 = horse_gen.get_horse_generator(horse_path, 'val', batch_size=1, input_hw=(299, 299, 3),
                                            mask_hw=(299, 299, 2))
    i = 0
    for val_images, mask in val_gen2:
        img_np = val_images[0]
        img_np = (img_np + 1.) * 127.5
        im0 = Image.fromarray(np.uint8(img_np))
        im0.save('output/{}_img.jpg'.format(i))

        res = model.predict(val_images)[0]
        pred_label = res.argmax(axis=2)
        pred_label[pred_label == 1] = 255
        im1 = Image.fromarray(np.uint8(pred_label))
        im1.save('output/{}_pred.png'.format(i))

        true_label = mask[0].argmax(axis=2)
        true_label[true_label == 1] = 255
        im2 = Image.fromarray(np.uint8(true_label))
        im2.save('output/{}_true.png'.format(i))

        i += 1
        if i == 100:
            print('End test')
            exit(1)


Run output and error message
2022-04-13 19:45:26.764431: W tensorflow/core/common_runtime/bfc_allocator.cc:275] Allocator (GPU_0_bfc) ran out of memory trying to allocate 572.75MiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
Traceback (most recent call last):
  File "E:\anaconda3\envs\hr\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-2-2c9bf53164ed>", line 1, in <module>
    runfile('E:/pycharm数据/AI/image_segmentation-master/main.py', wdir='E:/pycharm数据/AI/image_segmentation-master')
  File "E:\PyCharm 2021.3\plugins\python\helpers\pydev\_pydev_bundle\pydev_umd.py", line 198, in runfile
    pydev_imports.execfile(filename, global_vars, local_vars)  # execute the script
  File "E:\PyCharm 2021.3\plugins\python\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "E:/pycharm数据/AI/image_segmentation-master/main.py", line 39, in <module>
    model.fit(
  File "E:\anaconda3\envs\hr\lib\site-packages\keras\engine\training_v1.py", line 777, in fit
    return func.fit(
  File "E:\anaconda3\envs\hr\lib\site-packages\keras\engine\training_generator_v1.py", line 570, in fit
    return fit_generator(
  File "E:\anaconda3\envs\hr\lib\site-packages\keras\engine\training_generator_v1.py", line 252, in model_iteration
    batch_outs = batch_function(*batch_data)
  File "E:\anaconda3\envs\hr\lib\site-packages\keras\engine\training_v1.py", line 1076, in train_on_batch
    outputs = self.train_function(ins)  # pylint: disable=not-callable
  File "E:\anaconda3\envs\hr\lib\site-packages\keras\backend.py", line 4186, in __call__
    fetched = self._callable_fn(*array_vals,
  File "E:\anaconda3\envs\hr\lib\site-packages\tensorflow\python\client\session.py", line 1483, in __call__
    ret = tf_session.TF_SessionRunCallable(self._session._session,
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[7,7,512,4096] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
     [[{{node training/Adam/Adam/update_vgg_decoder/fc6/kernel/ResourceApplyAdam}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
Process finished with exit code 0
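
If I read the traceback correctly, the allocation that fails is the vgg_decoder/fc6 kernel with shape [7,7,512,4096] inside the Adam update op. A rough size check (assuming float32 weights, and using the fact that Adam keeps two extra slot variables per parameter) shows this single layer already needs on the order of a gigabyte during training:

# Rough size check for the fc6 kernel reported in the OOM message (float32 = 4 bytes).
params = 7 * 7 * 512 * 4096            # 102,760,448 parameters
kernel_mib = params * 4 / 2**20        # ~392 MiB for the kernel alone
adam_mib = kernel_mib * 3              # kernel + Adam's m and v slots ~ 1176 MiB
print(f'fc6 kernel: {kernel_mib:.0f} MiB, with Adam slots: {adam_mib:.0f} MiB')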


What I have tried

I searched online for similar issues; the answers say this is GPU memory exhaustion. I reduced the batch size as suggested, but the error still occurs. Could anyone suggest other solutions?
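
For reference, the other suggestions I have come across (but have not yet verified on this code) are enabling GPU memory growth so TensorFlow allocates memory on demand instead of reserving the whole card, and lowering the input resolution passed to the generators and the model. A minimal sketch, assuming TensorFlow 2.x, placed before the model is built:

import tensorflow as tf

# Let TensorFlow grow GPU memory on demand instead of grabbing it all at start-up.
for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)

# Lowering the resolution (e.g. input_hw=(224, 224, 3)) together with a small
# batch_size also shrinks the activations; the 224x224 value is only an illustration.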