I modified a Transformer for text matching (premise/hypothesis), but after training the accuracy is extremely low and the model does not converge.
Checking step by step, something already seems wrong at the encoder layer that runs self-attention over the premise and the hypothesis, but I'm not experienced enough to see where the problem is.
The code for that part is as follows:
import math
import torch
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.5, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        # standard sinusoidal position encodings, stored as a buffer of shape [max_len, 1, d_model]
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: [seq_len, batch_size, d_model]
        y = self.pe[:x.size(0), :]
        x = x + y
        return self.dropout(x)
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self, embedding_dim):
        super(PoswiseFeedForwardNet, self).__init__()
        self.embedding_dim = embedding_dim
        self.fc = nn.Sequential(
            nn.Linear(self.embedding_dim, d_ff, bias=False),
            nn.ReLU(),
            nn.Linear(d_ff, self.embedding_dim, bias=False)
        )

    def forward(self, inputs):
        # inputs: [batch_size, seq_len, embedding_dim]
        residual = inputs
        output = self.fc(inputs)
        # residual connection followed by layer normalization
        return nn.LayerNorm(self.embedding_dim).to(device)(output + residual)
class MultiHeadAttention(nn.Module):
    def __init__(self, embedding_dim):
        super(MultiHeadAttention, self).__init__()
        self.embedding_dim = embedding_dim
        self.W_Q = nn.Linear(self.embedding_dim, d_k * n_heads, bias=False)
        self.W_K = nn.Linear(self.embedding_dim, d_k * n_heads, bias=False)
        self.W_V = nn.Linear(self.embedding_dim, d_v * n_heads, bias=False)
        self.fc = nn.Linear(n_heads * d_v, self.embedding_dim, bias=False)

    def forward(self, input_Q, input_K, input_V, attn_mask):
        # input_Q/K/V: [batch_size, seq_len, embedding_dim]
        residual, batch_size = input_Q, input_Q.size(0)
        # project and split into heads: [batch_size, n_heads, seq_len, d_k or d_v]
        Q = self.W_Q(input_Q).view(batch_size, -1, n_heads, d_k).transpose(1, 2)
        K = self.W_K(input_K).view(batch_size, -1, n_heads, d_k).transpose(1, 2)
        V = self.W_V(input_V).view(batch_size, -1, n_heads, d_v).transpose(1, 2)
        # expand the padding mask to every head
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1)
        context, attn = ScaledDotProductAttention()(Q, K, V, attn_mask)
        # merge heads back: [batch_size, seq_len, n_heads * d_v]
        context = context.transpose(1, 2).reshape(batch_size, -1, n_heads * d_v)
        output = self.fc(context)
        res_out = output + residual
        return nn.LayerNorm(self.embedding_dim).to(device)(res_out), attn
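# ScaledDotProductAttention is referenced above but not included in this snippet.
# For completeness, a standard reference implementation is sketched below; this is
# an assumption and may not be identical to the version actually used.
class ScaledDotProductAttention(nn.Module):
    def forward(self, Q, K, V, attn_mask):
        # scores: [batch_size, n_heads, len_q, len_k]
        scores = torch.matmul(Q, K.transpose(-1, -2)) / math.sqrt(d_k)
        scores.masked_fill_(attn_mask, -1e9)      # padding positions get ~zero attention weight
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V)           # [batch_size, n_heads, len_q, d_v]
        return context, attn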
class EncoderLayer(nn.Module):
    def __init__(self, embedding_dim):
        super(EncoderLayer, self).__init__()
        self.embedding_dim = embedding_dim
        self.enc_self_attn = MultiHeadAttention(self.embedding_dim)
        self.pos_ffn = PoswiseFeedForwardNet(self.embedding_dim)

    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)
        enc_outputs = self.pos_ffn(enc_outputs)
        return enc_outputs, attn
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Encoder, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.layers = nn.ModuleList([EncoderLayer(self.embedding_dim) for _ in range(n_layers)])

    def forward(self, embedded_inputs, inputs):
        # build the padding mask from the raw token ids, then run the layer stack
        enc_self_attn_mask = get_attn_pad_mask(inputs, inputs)
        enc_self_attns = []
        for layer in self.layers:
            embedded_inputs, enc_self_attn = layer(embedded_inputs, enc_self_attn_mask)
            enc_self_attns.append(enc_self_attn)
        return embedded_inputs, enc_self_attns
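# get_attn_pad_mask is also not shown above. The usual version builds a boolean padding
# mask from the raw token ids (assuming pad id 0, which matches padding_idx=0 below):
def get_attn_pad_mask(seq_q, seq_k):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # True wherever seq_k is the pad token; broadcast across the query length
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)   # [batch_size, 1, len_k]
    return pad_attn_mask.expand(batch_size, len_q, len_k)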
class Transformer_01(nn.Module):
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 hidden_size,
                 embeddings=512,
                 padding_idx=0,
                 dropout=0.5,
                 num_classes=3,
                 device="cuda"):
        super(Transformer_01, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        self.dropout = dropout
        self.device = device
        self.src_emb = nn.Embedding(self.vocab_size,
                                    self.embedding_dim,
                                    padding_idx=padding_idx,
                                    _weight=embeddings)
        self.pos_emb = PositionalEncoding(self.embedding_dim)
        self.encoder = Encoder(self.vocab_size, self.embedding_dim).to(device)
        ...
    def forward(self, premises, premises_lengths, hypotheses, hypotheses_lengths):
        # word embedding
        embedded_premises = self.src_emb(premises)
        embedded_hypotheses = self.src_emb(hypotheses)
        # positional encoding (expects [seq_len, batch, d_model], hence the transposes)
        embedded_premises = self.pos_emb(embedded_premises.transpose(0, 1)).transpose(0, 1)
        embedded_hypotheses = self.pos_emb(embedded_hypotheses.transpose(0, 1)).transpose(0, 1)
        # encoder self-attention over premise and hypothesis separately
        pre_enc_outputs, pre_enc_self_attns = self.encoder(embedded_premises, premises)
        hyp_enc_outputs, hyp_enc_self_attns = self.encoder(embedded_hypotheses, hypotheses)
        ...
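For reference, a minimal shape check of the encoder path looks roughly like the snippet below. The hyperparameter values (d_k, d_v, n_heads, d_ff, n_layers) and the dummy inputs are placeholders for the sketch, not the settings used in training.

# minimal shape check for the encoder path only (placeholder hyperparameters)
d_k = d_v = 64
n_heads = 8
d_ff = 2048
n_layers = 2
device = "cuda" if torch.cuda.is_available() else "cpu"

vocab_size, embedding_dim = 1000, 512
emb = nn.Embedding(vocab_size, embedding_dim, padding_idx=0).to(device)
pos = PositionalEncoding(embedding_dim).to(device)
enc = Encoder(vocab_size, embedding_dim).to(device)

premises = torch.randint(1, vocab_size, (4, 20)).to(device)    # [batch_size, seq_len]
x = pos(emb(premises).transpose(0, 1)).transpose(0, 1)         # same transposes as in forward()
out, attns = enc(x, premises)
print(out.shape)    # expected: torch.Size([4, 20, 512])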