# Decoder section
class Rnn_Local_Decoder1(tf.keras.Model):
    def __init__(self, embedding_dim, units, vocab_size):
        super(Rnn_Local_Decoder1, self).__init__()
        self.units = units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc1 = tf.keras.layers.Dense(self.units)
        self.dropout = tf.keras.layers.Dropout(0.5, noise_shape=None, seed=None)
        self.batchnormalization = tf.keras.layers.BatchNormalization(
            axis=-1, momentum=0.99, epsilon=0.001, center=True, scale=True,
            beta_initializer='zeros', gamma_initializer='ones',
            moving_mean_initializer='zeros', moving_variance_initializer='ones',
            beta_regularizer=None, gamma_regularizer=None,
            beta_constraint=None, gamma_constraint=None)
        self.fc2 = tf.keras.layers.Dense(vocab_size)
        self.fc3 = tf.keras.layers.Dense(embedding_dim)
        self.fc4 = tf.keras.layers.Dense(self.units)

    def call(self, x, features, hidden, i):  # x: (64, 1)
        # The input x goes through the embedding layer; output shape: (batch_size, 1, embedding_dim)
        # Flatten the encoder features, then project them back to embedding_dim
        features = tf.keras.layers.Flatten()(features)  # (64, 49*256=12544) or (64, 64*256=16384)
        features = self.dropout(features)
        features = self.fc3(features)            # (64, 256)
        features = tf.expand_dims(features, 1)   # (64, 1, 256)
        hidden = tf.expand_dims(hidden, 1)       # (64, 1, 512)
        x = self.embedding(x)                    # (64, 1, 256)
        if i == 0:
            # First step: feed the image features together with the hidden state
            x = tf.concat([features, hidden], axis=-1)
            x = self.fc4(x)
            output, state = self.gru(x)          # (64, 1, 512), (64, 512)
        else:
            # Later steps: feed the word embedding together with the hidden state
            x = tf.concat([x, hidden], axis=-1)
            x = self.fc4(x)
            output, state = self.gru(x)
        x = self.fc1(output)                     # (64, 1, 512)
        x = tf.reshape(x, (-1, x.shape[2]))      # (64, 512)
        # Dropout and BatchNorm layers
        x = self.dropout(x)                      # (64, 512)
        x = self.batchnormalization(x)           # (64, 512)
        x = self.fc2(x)
        return x, state                          # (64, 8329), (64, 512)

    def reset_state(self, batch_size):
        return tf.zeros((batch_size, self.units))
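A minimal shape sanity check for this decoder, assuming hypothetical values for vocab_size, embedding_dim, units and batch_size (the real values come from the tokenizer and training configuration, which are not shown in this snippet):

import tensorflow as tf

# Hypothetical sizes, only for a shape check; not the original training values.
vocab_size, embedding_dim, units, batch_size = 8329, 256, 512, 64

decoder = Rnn_Local_Decoder1(embedding_dim, units, vocab_size)
features = tf.random.uniform((batch_size, 49, 256))     # stand-in for encoder output
hidden = decoder.reset_state(batch_size=batch_size)     # (64, 512)
dec_input = tf.zeros((batch_size, 1), dtype=tf.int32)   # stand-in for <start> token ids

predictions, hidden = decoder(dec_input, features, hidden, 0)
print(predictions.shape, hidden.shape)                  # (64, 8329) (64, 512)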
# Training-step section of the full model
def train_step(img_tensor, target, decoder, encoder, tokenizer, BATCH_SIZE, decode_fun):
    loss = 0
    # Initialize the hidden state for each batch,
    # because captions of different images are unrelated.
    # Initialize the decoder's hidden-state tensor
    hidden = decoder.reset_state(batch_size=target.shape[0])  # (64, 512)
    # Define the decoder's first text input (the tensor for the <start> token)
    dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * BATCH_SIZE, 1)  # (64, 1)
    # Open a context manager that records gradients
    with tf.GradientTape() as tape:
        # Run the input image tensor through the encoder
        features = encoder(img_tensor)  # (64, 49, 256)
        for i in range(0, target.shape[1]):
            predictions, hidden = decoder(dec_input, features, hidden, i)
            if i == 0:
                dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * BATCH_SIZE, 1)
            else:
                # Accumulate the loss for this decoding step
                loss += loss_function(target[:, i], predictions)
                # Use teacher forcing to define the input for the next decoding step
                dec_input = tf.expand_dims(target[:, i], 1)
    # After the decoding loop, compute the sentence-level average loss
    total_loss = (loss / int(target.shape[1]))
    # Collect all trainable variables of the model
    trainable_variables = encoder.trainable_variables + decoder.trainable_variables
    # Compute gradients with the gradient tape
    gradients = tape.gradient(loss, trainable_variables)
    # Apply the gradients to update the parameters
    optimizer.apply_gradients(zip(gradients, trainable_variables))
    # Return the total loss and the sentence-level average loss
    return loss, total_loss
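For context, train_step relies on a loss_function and an optimizer that are not shown above. A typical setup is the masked sparse-categorical cross-entropy and Adam from the standard TF image-captioning tutorial, driven by an outer epoch loop; this is a sketch under that assumption (dataset, EPOCHS, num_steps are placeholders), not necessarily the original code:

optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    # Mask out padding positions (token id 0) before averaging the loss
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

# Outer training loop (dataset, EPOCHS, num_steps are placeholders here)
for epoch in range(EPOCHS):
    total_loss = 0
    for (batch, (img_tensor, target)) in enumerate(dataset):
        batch_loss, t_loss = train_step(img_tensor, target, decoder, encoder,
                                        tokenizer, BATCH_SIZE, decode_fun)
        total_loss += t_loss
    print('Epoch {} Loss {:.6f}'.format(epoch + 1, total_loss / num_steps))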
The resulting predictions are all identical, repeating the same caption:

Real Caption: person trails behind some ducks on the water to lake
Prediction Caption: dog is running through the snow
7550
Real Caption: black dog carries green toy in his mouth as he walks through the grass
Prediction Caption: dog is running through the snow