I am working on image captioning without an attention mechanism. I reproduced some code I found online, but my final outputs are all the same repeated sentence. I suspect something is wrong with my decoder structure, but I cannot find the cause myself. Could someone tell me what might be going on?

Observed behavior and background
Relevant code (no screenshots, please)

# This is the decoder part
import tensorflow as tf

class Rnn_Local_Decoder1(tf.keras.Model):
    def __init__(self, embedding_dim, units, vocab_size):
        super(Rnn_Local_Decoder1, self).__init__()
        self.units = units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc1 = tf.keras.layers.Dense(self.units)
        self.dropout = tf.keras.layers.Dropout(0.5)
        # All BatchNormalization arguments were the Keras defaults, so they are omitted here
        self.batchnormalization = tf.keras.layers.BatchNormalization()

        self.fc2 = tf.keras.layers.Dense(vocab_size)
        self.fc3 = tf.keras.layers.Dense(embedding_dim)
        self.fc4 = tf.keras.layers.Dense(self.units)

    def call(self, x, features, hidden, i):  # x: (64, 1)
        # Flatten the encoder features and project them down to embedding_dim
        features = tf.keras.layers.Flatten()(features)  # (64, 49*256=12544) or (64, 64*256=16384)
        features = self.dropout(features)
        features = self.fc3(features)           # (64, 256)
        features = tf.expand_dims(features, 1)  # (64, 1, 256)

        hidden = tf.expand_dims(hidden, 1)      # (64, 1, 512)
        # Pass the input through the embedding layer: (batch_size, 1, embedding_dim)
        x = self.embedding(x)                   # (64, 1, 256)
        if i == 0:
            # First step: feed the image features instead of a word embedding
            x = tf.concat([features, hidden], axis=-1)
            x = self.fc4(x)
            output, state = self.gru(x)         # (64, 1, 512), (64, 512)
        else:
            x = tf.concat([x, hidden], axis=-1)
            x = self.fc4(x)
            output, state = self.gru(x)
        x = self.fc1(output)                    # (64, 1, 512)
        x = tf.reshape(x, (-1, x.shape[2]))     # (64, 512)
        # Dropout and BatchNorm before the output projection
        x = self.dropout(x)                     # (64, 512)
        x = self.batchnormalization(x)          # (64, 512)
        x = self.fc2(x)
        return x, state                         # (64, 8329), (64, 512)

    def reset_state(self, batch_size):
        return tf.zeros((batch_size, self.units))
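As a sanity check on the shapes annotated in the comments, one forward step of this decoder can be exercised like this (a minimal sketch; the sizes 256/512/8329 and batch 64 are taken from the shape comments above, not from my actual config):

# Quick shape check of a single decoder step (sizes from the comments above)
decoder = Rnn_Local_Decoder1(embedding_dim=256, units=512, vocab_size=8329)
hidden = decoder.reset_state(batch_size=64)    # (64, 512)
features = tf.random.normal((64, 49, 256))     # stand-in for the encoder output
dec_input = tf.ones((64, 1), dtype=tf.int32)   # a batch of dummy token ids
logits, hidden = decoder(dec_input, features, hidden, 0)
print(logits.shape, hidden.shape)              # (64, 8329) (64, 512)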

# This is the overall model / training part
def train_step(img_tensor, target, decoder, encoder, tokenizer, BATCH_SIZE, decode_fun):
    # Note: decode_fun is passed in but never used inside this function
    loss = 0
    # Initialize the hidden state for each batch,
    # because captions are unrelated from one image to the next
    hidden = decoder.reset_state(batch_size=target.shape[0])  # (64, 512)
    # First decoder input: the tensor for the start token <start>
    dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * BATCH_SIZE, 1)  # (64, 1)
    # Open a context manager that records operations for gradient computation
    with tf.GradientTape() as tape:
        # Run the encoder on the input image tensor
        features = encoder(img_tensor)  # (64, 49, 256)
        for i in range(0, target.shape[1]):
            predictions, hidden = decoder(dec_input, features, hidden, i)
            if i == 0:
                # Step 0 consumed the image features, so feed <start> next
                dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * BATCH_SIZE, 1)
            else:
                # Loss for this decoding step
                loss += loss_function(target[:, i], predictions)
                # Teacher forcing: the ground-truth token becomes the next input
                dec_input = tf.expand_dims(target[:, i], 1)
    # After decoding the whole sequence, compute the sentence-level average loss
    total_loss = (loss / int(target.shape[1]))
    # Gather all trainable variables of the model
    trainable_variables = encoder.trainable_variables + decoder.trainable_variables
    # Compute the gradients with the tape
    gradients = tape.gradient(loss, trainable_variables)
    # Update the parameters from the gradients
    optimizer.apply_gradients(zip(gradients, trainable_variables))
    # Return the summed loss and the sentence-level average loss
    return loss, total_loss
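One note: train_step calls loss_function and uses a global optimizer whose definitions are not shown above. The typical masked-crossentropy setup for this kind of decoder, as in the TensorFlow captioning tutorials this code follows, looks roughly like the sketch below (assumptions on my part: padding token id 0 and an Adam optimizer; from_logits=True matches fc2 returning raw logits):

# Sketch of the usual masked loss + optimizer (assumed, not copied from my script)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    # Mask out padding positions (assumed token id 0) so they add nothing to the loss
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)            # per-example loss, shape (64,)
    loss_ *= tf.cast(mask, dtype=loss_.dtype)
    return tf.reduce_mean(loss_)

optimizer = tf.keras.optimizers.Adam()  # assumed; any Keras optimizer fits here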
Run results and error messages

The results are all the same repeated content:
Real Caption: person trails behind some ducks on the water to lake
Prediction Caption: dog is running through the snow
7550
Real Caption: black dog carries green toy in his mouth as he walks through the grass
Prediction Caption: dog is running through the snow
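For reference, the predictions above are produced by a greedy decoding loop along these lines (a minimal sketch mirroring the training scheme, not my exact evaluation code; greedy_decode and max_length are placeholder names):

# Sketch of greedy decoding for one image (placeholder names, mirrors train_step)
def greedy_decode(img_tensor, encoder, decoder, tokenizer, max_length):
    hidden = decoder.reset_state(batch_size=1)            # (1, 512)
    features = encoder(img_tensor)                        # (1, 49, 256)
    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)  # (1, 1)
    result = []
    for i in range(max_length):
        predictions, hidden = decoder(dec_input, features, hidden, i)
        if i == 0:
            # Step 0 consumed the image features; feed <start> next, as in training
            dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)
            continue
        predicted_id = int(tf.argmax(predictions[0]))     # greedy: most probable token
        word = tokenizer.index_word[predicted_id]
        if word == '<end>':
            break
        result.append(word)
        # The prediction is fed back as the next input (no teacher forcing here)
        dec_input = tf.expand_dims([predicted_id], 0)
    return ' '.join(result)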

My ideas and the approaches I have tried
The result I want to achieve