卷积神经网络加入注意力机制后矩阵纬度不匹配

最近在改进一个残差网络想往里面加入注意力机制,结果调了半天这个矩阵纬度还是不匹配,没办法来问问大伙了。
代码如下:

import os

import numpy as np
import torch.nn as nn
import torchvision
from PIL import Image
import torch
import torch.nn.functional as F
from matplotlib import pyplot as plt
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torchvision import transforms
from center_loss import CenterLoss

transformtrain = transforms.Compose([
    transforms.Resize((256, 256)),  # increase image size for better attention mechanism

    # data augmentation
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomRotation(degrees=(-30, 30)),

    transforms.ToTensor(),
])


class MyData_train(Dataset):
    def __init__(self, img_path, transform=None):
        super(MyData_train, self).__init__()
        self.root = img_path
        self.txt_root = 'train.txt'
        f = open(self.txt_root, 'r')
        data = f.readlines()
        imgs = []
        labels = []
        for line in data:
            line = line.rstrip()
            word = line.split()
            imgs.append(os.path.join(self.root, word[0], word[1]))
            labels.append(word[2])
        self.img = imgs
        self.label = labels
        self.transform = transform

    def __len__(self):
        return len(self.label)

    def __getitem__(self, item):
        img = self.img[item]
        label = self.label[item]
        img = Image.open(img).convert('RGB')
        if self.transform is not None:
            img = self.transform(img)
        label = np.array(label).astype(np.int64)
        label = torch.from_numpy(label)
        return img, label


path = 'Train'
dataset = MyData_train(path, transformtrain)
train_loader = DataLoader(dataset=dataset, batch_size=16, shuffle=True, drop_last=True)


# define attention module
class Attention(nn.Module):
    def __init__(self, in_channels=512, out_channels=512, kernel_size=3):
        super(Attention, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.conv1 = nn.Conv2d(in_channels, in_channels, kernel_size, padding=(kernel_size - 1) // 2)
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.conv2 = nn.Conv2d(in_channels, in_channels, kernel_size, padding=(kernel_size - 1) // 2)
        self.bn2 = nn.BatchNorm2d(in_channels)
        self.conv3 = nn.Conv2d(in_channels, in_channels, kernel_size, padding=(kernel_size - 1) // 2)
        self.bn3 = nn.BatchNorm2d(in_channels)
        self.conv4 = nn.Conv2d(in_channels, in_channels, kernel_size, padding=(kernel_size - 1) // 2)
        self.bn4 = nn.BatchNorm2d(in_channels)
        self.fc = nn.Linear(in_channels, out_channels)

    def forward(self, x):
        h = F.relu(self.bn1(self.conv1(x)))
        h = F.relu(self.bn2(self.conv2(h)))
        h = F.relu(self.bn3(self.conv3(h)))
        h = F.relu(self.bn4(self.conv4(h)))
        h = h.view(h.size(0), h.size(1))
        h = self.fc(h)
        r = F.softmax(h, dim=1)
        r = r.view(r.size(0), r.size(1), 1, 1)
        a = (r * x).sum(dim=1)
        return a


transformtest = transforms.Compose([
    transforms.Resize((256, 256)),  # increase image size for better attention mechanism
    transforms.ToTensor()
])


class MyData_test(Dataset):
    def __init__(self, img_path, transform=None):
        super(MyData_test, self).__init__()
        self.root = img_path
        self.txt_root = 'test.txt'
        f = open(self.txt_root, 'r')
        data = f.readlines()
        imgs = []
        labels = []
        for line in data:
            line = line.rstrip()
            word = line.split()
            imgs.append(os.path.join(self.root, word[0], word[1]))
            labels.append(word[2])
        self.img = imgs
        self.label = labels
        self.transform = transform

    def __len__(self):
        return len(self.label)

    def __getitem__(self, item):
        img = self.img[item]
        label = self.label[item]
        img = Image.open(img).convert('RGB')
        if self.transform is not None:
            img = self.transform(img)
        label = np.array(label).astype(np.int64)
        label = torch.from_numpy(label)
        return img, label


path = 'Test'
dataset_test = MyData_test(path, transformtest)
test_loader = DataLoader(dataset=dataset_test, batch_size=1, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torchvision.models.resnet50(pretrained=True)


# define custom classifier head with attention mechanism
class my_Module(nn.Module):
    def __init__(self):
        super(my_Module, self).__init__()
        self.backbone = model
        self.fc1 = nn.Linear(2048, 512)
        self.attention = Attention(in_channels=512)
        self.fc2 = nn.Linear(512, 23, bias=True)

    def forward(self, x):
        x = self.backbone.conv1(x)
        x = self.backbone.bn1(x)
        x = self.backbone.relu(x)
        x = self.backbone.maxpool(x)

        x = self.backbone.layer1(x)
        x = self.backbone.layer2(x)
        x = self.backbone.layer3(x)
        x = self.backbone.layer4(x)

        x = self.backbone.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        #print("x shape before attention:", x.shape)

        # apply attention module to x instead of whole model output
        x = x.view(x.size(0), x.size(1), 1, 1)
        x = self.attention(x)

        x = self.fc2(x)
        return x
model1 = my_Module()
print(model1)
model1.to(device)

criterion_xent = torch.nn.CrossEntropyLoss()
criterion_cent = CenterLoss(num_classes=23, feat_dim=2048, use_gpu=True)
optimizer_model = torch.optim.SGD(model.parameters(), lr=0.001)
optimizer_centloss = torch.optim.SGD(criterion_cent.parameters(), lr=0.0001)


def train(epoch):
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        input, labels = data
        input, labels = input.to(device), labels.to(device)
        features, outputs = model1(input)
        loss_xent = criterion_xent(outputs, labels)
        loss_cent = criterion_cent(features, labels)
        loss_cent *= 1
        loss = loss_xent + 0.01 * loss_cent
        optimizer_model.zero_grad()
        optimizer_centloss.zero_grad()
        loss.backward()
        optimizer_model.step()
        for param in criterion_cent.parameters():
            param.grad.data *= (1. / 0.01)
        optimizer_centloss.step()
        running_loss += loss.data
        if (i % 20 == 19):
            print("epoch:{0} {1}  loss:{2}".format(epoch + 1, i + 1, running_loss / 20))
            running_loss = 0.0


def test(epoch):
    correct = 0
    total = 0
    with torch.no_grad():
        for _, data in enumerate(test_loader, 0):
            input, labels = data
            input, labels = input.to(device), labels.to(device)
            _, y_pred = model1(input)
            _, max_pos = torch.max(y_pred.data, dim=1)
            total += labels.size(0)
            correct += (max_pos == labels).sum().item()
        print('Accuracy on test set:{}%'.format(100 * correct / total))
        yy[epoch] = 100 * correct / total


sum_epoch = 250
yy = np.zeros(sum_epoch, )
for epoch in range(sum_epoch):
    train(epoch)
    model1.eval()
    test(epoch)
    print()

torch.save(model1, 'resnet50_250_0001_centerloss_attention.pkl')
np.save('center_loss_001_attention.npy', yy)

运行结果如下:

Traceback (most recent call last):
  File "D:\guaduatepaper\Remote_sensing_image_classification\pytorch_classification\Test5_resnet\improvedexperiment\addCenterloss\atmain.py", line 221, in <module>
    train(epoch)
  File "D:\guaduatepaper\Remote_sensing_image_classification\pytorch_classification\Test5_resnet\improvedexperiment\addCenterloss\atmain.py", line 185, in train
    features, outputs = model1(input)
  File "C:\Users\60234\anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "D:\guaduatepaper\Remote_sensing_image_classification\pytorch_classification\Test5_resnet\improvedexperiment\addCenterloss\atmain.py", line 168, in forward
    x = self.fc2(x)
  File "C:\Users\60234\anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "C:\Users\60234\anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\linear.py", line 114, in forward
    return F.linear(input, self.weight, self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (16x1 and 512x23)

请各路好汉帮忙修正我的代码,让这个模型能正常跑通。

以下答案由GPT-3.5大模型与博主波罗歌共同编写:
根据代码和错误提示,矩阵乘法的问题出在了self.fc2(x)这一层,具体来说是由于x的纬度不匹配导致无法进行矩阵乘法。根据代码可知,x的形状是(batch_size, 512),而self.fc2的形状是(512, 23),即需要将x的纬度从2变为3,使其能够与self.fc2进行矩阵乘法。

根据上面的问题,需要在Attention模块里将输出的纬度改变从而与self.fc1连接的输出通道数相匹配。具体就是将Attention模块的代码修改如下:

class Attention(nn.Module):
    def __init__(self, in_channels=512, out_channels=512, kernel_size=3):
        super(Attention, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.conv1 = nn.Conv2d(in_channels, in_channels, kernel_size, padding=(kernel_size - 1) // 2)
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.conv2 = nn.Conv2d(in_channels, in_channels, kernel_size, padding=(kernel_size - 1) // 2)
        self.bn2 = nn.BatchNorm2d(in_channels)
        self.conv3 = nn.Conv2d(in_channels, in_channels, kernel_size, padding=(kernel_size - 1) // 2)
        self.bn3 = nn.BatchNorm2d(in_channels)
        self.conv4 = nn.Conv2d(in_channels, out_channels, kernel_size, padding=(kernel_size - 1) // 2)  # 这里进行了修改
        self.bn4 = nn.BatchNorm2d(out_channels)
        self.fc = nn.Linear(out_channels, out_channels)

    def forward(self, x):
        h = F.relu(self.bn1(self.conv1(x)))
        h = F.relu(self.bn2(self.conv2(h)))
        h = F.relu(self.bn3(self.conv3(h)))
        h = F.relu(self.bn4(self.conv4(h)))  # 修改后的代码
        h = h.view(h.size(0), h.size(1))
        h = self.fc(h)
        r = F.softmax(h, dim=1)
        r = r.view(r.size(0), r.size(1), 1, 1)
        a = (r * x).sum(dim=1)
        return a

这样就可以保证self.fc2输出的纬度与x匹配了。
如果我的回答解决了您的问题,请采纳!

个人理解,供你参考:
首先,你需要检查下,下面这三个维度:
1、检查输入层的维度:确保在创建输入层时使用了相同的维度(通常为 N*d)。
2、检查权重的维度:确保在创建注意力机制时使用了相同的维度。
3、检查注意力机制的维度:确保在创建注意力机制时使用了相同的维度。
其次,通过下面的思路去检查:
1、数据集不合适:确保数据集包含足够的输入和输出样本,如果没有足够的输入和输出样本,那么通过随机过采样或欠采样来增加数据量
2、网络结构不合适:网络结构与任务不匹配,缺乏残差项,需要重新设计网络以包含更多的残差项。
3、模型不正确:模型在训练时使用了不正确的权重,或者没有正确定义损失函数,那么需要对模型进行调整。
4、超参数设置不正确:例如:学习率、批次大小、正则化系数。如果超参数设置不正确,那么模型无法收敛或达到最佳性能。

引用chatGPT作答,在 my_Module 类中,需要修改 self.fc1 的输入维度。根据代码中的 ResNet50 的输出特征维度为 2048,加上注意力机制后的特征维度为 512,所以在 my_Module 类中,需要将 self.fc1 的输入维度设置为 2048 + 512 = 2560。
具体来说,将以下行

self.fc1 = nn.Linear(2048, 6)

修改为

self.fc1 = nn.Linear(2560, 6)