I now know that dense attention is one simple scoring function for the attention mechanism. Are there other ideas for scoring functions, along with code implementations?
Based on GPT's answer and my own thinking:
When it comes to attention scoring functions, there are many different approaches. Below are a few popular scoring functions and their code implementations:
import math
import torch

# Additive (MLP / Bahdanau-style) attention: a small feed-forward network produces the scores.
class MLPAttention(torch.nn.Module):
    def __init__(self, input_size, fc_size):
        super(MLPAttention, self).__init__()
        self.fc = torch.nn.Linear(input_size, fc_size)
        self.attn = torch.nn.Linear(fc_size, 1)

    def forward(self, encoder_states):
        # encoder_states: (batch, seq_len, input_size)
        energy = torch.tanh(self.fc(encoder_states))                 # (batch, seq_len, fc_size)
        attention_scores = self.attn(energy)                         # (batch, seq_len, 1)
        attention_weights = torch.softmax(attention_scores, dim=1)   # normalize over time steps
        # weighted sum over the time dimension gives the context vector: (batch, input_size)
        context_vector = torch.sum(attention_weights * encoder_states, dim=1)
        return context_vector, attention_weights
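As a quick smoke test of the MLP attention module, something like the following should work (the batch size, sequence length, and hidden sizes below are arbitrary assumptions, not values from the original):

encoder_states = torch.randn(2, 5, 16)            # assumed shape: (batch=2, seq_len=5, input_size=16)
mlp_attn = MLPAttention(input_size=16, fc_size=8)
context, weights = mlp_attn(encoder_states)
print(context.shape, weights.shape)               # expected: (2, 16) and (2, 5, 1)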
# Scaled dot-product attention (as in the Transformer): score = Q·K^T / sqrt(d_k).
class ScaledDotProductAttention(torch.nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, query, key, value, mask=None):
        # query: (..., len_q, d_k), key/value: (..., len_k, d_k)
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(query.size(-1))
        if mask is not None:
            # masked positions get a large negative score so their softmax weight is ~0
            scores = scores.masked_fill(mask == 0, -1e9)
        softmax_scores = torch.softmax(scores, dim=-1)               # (..., len_q, len_k)
        output = torch.matmul(softmax_scores, value)                 # (..., len_q, d_k)
        return output, softmax_scores
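A minimal usage sketch (the shapes below are illustrative assumptions; the last dimension of query and key must match):

q = torch.randn(2, 4, 8)                          # assumed shape: (batch, len_q, d_k)
k = torch.randn(2, 6, 8)                          # assumed shape: (batch, len_k, d_k)
v = torch.randn(2, 6, 8)                          # assumed shape: (batch, len_k, d_v)
sdpa = ScaledDotProductAttention()
out, attn = sdpa(q, k, v)
print(out.shape, attn.shape)                      # expected: (2, 4, 8) and (2, 4, 6)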
# Bidirectional (co-)attention: two sequences attend over each other using additive scores.
class BidirectionalAttention(torch.nn.Module):
    def __init__(self, input_size, att_size):
        super(BidirectionalAttention, self).__init__()
        self.W_s = torch.nn.Linear(input_size, att_size)
        self.W_t = torch.nn.Linear(input_size, att_size)
        self.W_a = torch.nn.Linear(att_size, 1)

    def forward(self, s, t):
        # s: (batch, len_s, input_size), t: (batch, len_t, input_size)
        s_proj = self.W_s(s)                                         # (batch, len_s, att_size)
        t_proj = self.W_t(t)                                         # (batch, len_t, att_size)
        # pairwise additive scores between every s position and every t position: (batch, len_s, len_t)
        scores = self.W_a(torch.tanh(s_proj.unsqueeze(2) + t_proj.unsqueeze(1))).squeeze(-1)
        alpha = torch.softmax(scores, dim=-1)                        # normalize over t positions
        beta = torch.softmax(scores, dim=1)                          # normalize over s positions
        t_attend = torch.bmm(alpha, t)                               # s attends to t: (batch, len_s, input_size)
        s_attend = torch.bmm(beta.transpose(1, 2), s)                # t attends to s: (batch, len_t, input_size)
        return s_attend, t_attend
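And a corresponding sketch for the bidirectional module (again, the shapes are assumptions chosen for illustration):

s = torch.randn(2, 5, 16)                         # assumed shape: (batch, len_s, input_size)
t = torch.randn(2, 7, 16)                         # assumed shape: (batch, len_t, input_size)
bi_attn = BidirectionalAttention(input_size=16, att_size=8)
s_attend, t_attend = bi_attn(s, t)
print(s_attend.shape, t_attend.shape)             # expected: (2, 7, 16) and (2, 5, 16)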
These are only a few examples of scoring functions; there are of course many others. Which scoring function is appropriate depends on the specifics of the problem.
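One more common choice is bilinear ("general" / Luong-style) attention, where the score is q^T W k. The following is only a minimal sketch under my own assumptions about shapes (the class name and signature are not from the original):

class BilinearAttention(torch.nn.Module):
    def __init__(self, query_size, key_size):
        super(BilinearAttention, self).__init__()
        # learnable bilinear map between key space and query space
        self.W = torch.nn.Linear(key_size, query_size, bias=False)

    def forward(self, query, keys, values):
        # query: (batch, query_size), keys/values: (batch, seq_len, key_size)
        scores = torch.bmm(self.W(keys), query.unsqueeze(2)).squeeze(2)    # (batch, seq_len)
        weights = torch.softmax(scores, dim=-1)
        context = torch.bmm(weights.unsqueeze(1), values).squeeze(1)       # (batch, key_size)
        return context, weights

Compared with additive attention, the bilinear form has fewer parameters and is cheaper to compute, while still letting the model learn an interaction between query and key spaces.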