用 ResNet-18 模型训练后验证集准确率一直上不去,尝试了多种降低过拟合的办法都不见效,希望有人能指教一下。所用数据集是某种疾病的玻璃涂片图像,阴阳性比例原为 1:2,先扩充到 1:1,再整体扩充到约 2 万张。
以下是代码:
train
import os
import sys
import json
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
# NOTE(review): this TensorFlow import is never used and the
# `tensorflow.python.estimator` module no longer exists in TF2 —
# it will crash the script if TF is missing or modern; safe to remove.
from tensorflow.python.estimator.inputs import inputs
# NOTE(review): unused private-API import; `torch.autograd._functions.tensor`
# is not available in current PyTorch releases; safe to remove.
from torch.autograd._functions import tensor
from torchvision import transforms, datasets
from tqdm import tqdm
# NOTE(review): duplicate of the `import torch` above; harmless but redundant.
import torch
# Train resnet18
from model import resnet18
def main():
    """Train ResNet-18 on an ImageFolder dataset and plot the training-loss
    and validation-accuracy curves.

    Side effects: writes `class_indices.json` (index -> class-name mapping),
    saves the best checkpoint to `./resNet18.pth`, and shows a matplotlib
    figure at the end.
    """
    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("using {} device.".format(device))

    # Per-split preprocessing: resize to the 224x224 ResNet input size,
    # convert to tensor, normalize each channel to roughly [-1, 1].
    data_transform = {
        "train": transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]),
        "val": transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])}

    # Fix: the original joined os.getcwd() with an absolute Windows path —
    # os.path.join simply returns the absolute component, so use it directly.
    # A raw string avoids accidental backslash escape sequences.
    image_path = r"E:\pythonProject6\output5"
    # Fail fast with a clear message if the dataset root is missing.
    assert os.path.exists(image_path), "{} path does not exist.".format(image_path)

    train_dataset = datasets.ImageFolder(root=os.path.join(image_path, "train"),
                                         transform=data_transform["train"])
    train_num = len(train_dataset)

    # class_to_idx maps class-name -> index; invert it so model predictions
    # map back to class names, and persist it for inference scripts.
    flower_list = train_dataset.class_to_idx
    cla_dict = dict((val, key) for key, val in flower_list.items())
    print(cla_dict)
    json_str = json.dumps(cla_dict, indent=4)
    with open('class_indices.json', 'w') as json_file:
        json_file.write(json_str)

    # Images loaded per optimization step.
    batch_size = 64
    # Number of dataloader worker processes, bounded by the CPU count.
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0])
    print('Using {} dataloader workers every process'.format(nw))
    # Fix: the original computed `nw` but then hard-coded num_workers=0,
    # leaving all data loading in the main process.
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size, shuffle=True,
                                               num_workers=nw)
    validate_dataset = datasets.ImageFolder(root=os.path.join(image_path, "val"),
                                            transform=data_transform["val"])
    val_num = len(validate_dataset)
    validate_loader = torch.utils.data.DataLoader(validate_dataset,
                                                  batch_size=batch_size, shuffle=False,
                                                  num_workers=nw)
    print("using {} images for training, {} images for validation.".format(train_num,
                                                                           val_num))

    # Instantiate the model on the target device.
    net = resnet18().to(device)
    # Optional: load pretrained weights and replace the head for fine-tuning.
    # model_weight_path = "./resNet18.pth"
    # assert os.path.exists(model_weight_path), "file {} does not exist.".format(model_weight_path)
    # net.load_state_dict(torch.load(model_weight_path))
    # in_channel = net.fc.in_features
    # net.fc = nn.Linear(in_channel, 5).to(device)

    # Cross-entropy loss for multi-class classification.
    loss_function = nn.CrossEntropyLoss()
    # Only optimize parameters that require gradients (matters when layers
    # are frozen during fine-tuning).
    params = [p for p in net.parameters() if p.requires_grad]
    optimizer = optim.Adam(params, lr=0.0005, betas=(0.9, 0.999), weight_decay=0.001)

    epochs = 50
    # Track the best validation accuracy so only the best checkpoint is kept.
    best_acc = 0.0
    save_path = './resNet18.pth'
    train_steps = len(train_loader)
    # Fix: size the history buffers from `epochs` instead of a hard-coded 50,
    # so changing `epochs` cannot cause an IndexError or a truncated plot.
    loss_list = [0.0] * epochs
    accuracy_list = [0.0] * epochs

    for epoch in range(epochs):
        # ---- training phase ----
        net.train()
        running_loss = 0.0
        train_bar = tqdm(train_loader, file=sys.stdout)
        for step, data in enumerate(train_bar):
            images, labels = data
            # Fix: the network output is already on `device`; the original
            # applied a redundant extra .to(device) to the logits.
            logits = net(images.to(device))
            loss = loss_function(logits, labels.to(device))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # .item() extracts the Python float from the 0-d loss tensor.
            running_loss += loss.item()
            train_bar.desc = "train epoch[{}/{}] loss:{:.3f}".format(epoch + 1,
                                                                     epochs,
                                                                     loss)

        # ---- validation phase: eval mode disables BN updates and dropout;
        # no_grad skips gradient bookkeeping ----
        net.eval()
        acc = 0.0
        with torch.no_grad():
            val_bar = tqdm(validate_loader, file=sys.stdout)
            for val_data in val_bar:
                val_images, val_labels = val_data
                outputs = net(val_images.to(device))
                # argmax over the class dimension -> predicted class indices.
                predict_y = torch.max(outputs, dim=1)[1]
                # Count correct predictions in this batch.
                acc += torch.eq(predict_y, val_labels.to(device)).sum().item()
                val_bar.desc = "valid epoch[{}/{}]".format(epoch + 1, epochs)

        loss_list[epoch] = running_loss / train_steps
        accuracy_list[epoch] = acc / val_num
        print('[epoch %d] train_loss: %.3f val_accuracy: %.3f' %
              (epoch + 1, running_loss / train_steps, acc / val_num))
        # Save a checkpoint only when validation accuracy improves.
        if (acc / val_num) > best_acc:
            best_acc = acc / val_num
            torch.save(net.state_dict(), save_path)

    print('Finished Training')

    # Fix: derive the x-axis from `epochs` instead of a hard-coded range(50),
    # and label the top curve as validation accuracy (it was mislabeled
    # "Train_Accuracy" although it plots acc on the validation set).
    x_axis = range(epochs)
    plt.subplot(2, 1, 1)
    plt.plot(x_axis, accuracy_list, 'o-', label="Val_Accuracy")
    plt.title(' epoch ')
    plt.ylabel(' accuracy')
    plt.legend(loc='best')
    plt.subplot(2, 1, 2)
    plt.plot(x_axis, loss_list, '.-', label="Train_Loss")
    plt.xlabel(' epoch')
    plt.ylabel(' loss')
    plt.legend(loc='best')
    plt.show()


if __name__ == '__main__':
    main()
model
import torch.nn as nn
import torch
class BasicBlock(nn.Module):
    """Residual block for 18/34-layer ResNets: two 3x3 convolutions plus an
    identity (or 1x1-projected) shortcut.

    Args:
        in_channel: number of input feature-map channels.
        out_channel: number of output channels of both convolutions.
        stride: stride of the first convolution (2 halves the spatial size).
        downsample: optional module projecting the shortcut branch to the
            output shape; built automatically when omitted but required.
    """
    # Ratio of output channels to the block's base width (1 for BasicBlock,
    # 4 for Bottleneck).
    expansion = 1

    def __init__(self, in_channel, out_channel, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=in_channel, out_channels=out_channel,
                               kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channel)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel,
                               kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channel)
        # Fix: the original unconditionally rebuilt the shortcut projection
        # whenever the shape changed, silently discarding the `downsample`
        # module ResNet._make_layer had already constructed and passed in.
        # Honor the caller's module; only build one when none was given and
        # the shapes actually differ.
        self.downsample = downsample
        if downsample is None and (stride != 1 or in_channel != out_channel):
            self.downsample = nn.Sequential(
                nn.Conv2d(in_channel, out_channel, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channel)
            )
        # NOTE(review): dropout after the residual sum is unconventional for
        # ResNets (torchvision uses none here); kept to preserve the author's
        # anti-overfitting experiment.
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        # Shortcut branch: identity, or projection when shapes differ.
        identity = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample is not None:
            identity = self.downsample(identity)
        out += identity
        out = self.relu(out)
        out = self.dropout(out)
        return out
class ResNet(nn.Module):
    """Generic ResNet backbone.

    Args:
        block: residual block class — BasicBlock for the 18/34-layer nets,
            Bottleneck for the 50/101/152-layer nets.
        blocks_num: number of residual blocks per stage, e.g. [2, 2, 2, 2]
            for ResNet-18 or [3, 4, 6, 3] for ResNet-34.
        num_classes: output size of the final fully-connected layer.
        include_top: when False the pooling/fc head is omitted so the
            backbone can be embedded in a larger network.
    """

    def __init__(self, block, blocks_num, num_classes=1000, include_top=True):
        super(ResNet, self).__init__()
        self.include_top = include_top
        # Channels flowing into the next stage; updated inside _make_layer.
        self.in_channel = 64
        # Stem: 7x7 conv + 3x3 max-pool ("conv1" in the paper).
        self.conv1 = nn.Conv2d(3, self.in_channel, kernel_size=7, stride=2,
                               padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channel)
        self.relu = nn.ReLU(inplace=True)  # in-place saves activation memory
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Four residual stages ("conv2_x" .. "conv5_x" in the paper); stages
        # 2-4 halve the spatial resolution with stride 2.
        self.layer1 = self._make_layer(block, 64, blocks_num[0])
        self.layer2 = self._make_layer(block, 128, blocks_num[1], stride=2)
        self.layer3 = self._make_layer(block, 256, blocks_num[2], stride=2)
        self.layer4 = self._make_layer(block, 512, blocks_num[3], stride=2)
        # Classification head: global average pool + linear layer.
        if self.include_top:
            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
            self.fc = nn.Linear(512 * block.expansion, num_classes)
        # He (Kaiming) initialization for every convolution.
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')

    def _make_layer(self, block, channel, block_num, stride=1):
        """Assemble one residual stage consisting of `block_num` blocks."""
        shortcut = None
        # First block needs a 1x1 projection when the spatial size or the
        # channel count changes (the "dashed" shortcut in the paper).
        if stride != 1 or self.in_channel != channel * block.expansion:
            shortcut = nn.Sequential(
                nn.Conv2d(self.in_channel, channel * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(channel * block.expansion))
        stage = [block(self.in_channel, channel, downsample=shortcut, stride=stride)]
        self.in_channel = channel * block.expansion
        # Remaining blocks keep stride 1 and need no projection.
        stage.extend(block(self.in_channel, channel) for _ in range(1, block_num))
        return nn.Sequential(*stage)

    def forward(self, x):
        # Stem.
        out = self.maxpool(self.relu(self.bn1(self.conv1(x))))
        # Residual stages.
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = stage(out)
        # Head (skipped when the net is used as a feature extractor).
        if self.include_top:
            out = self.avgpool(out)
            out = torch.flatten(out, 1)
            out = self.fc(out)
        return out
def resnet18(num_classes=1000, include_top=True):
    """Build a ResNet-18: BasicBlock with two residual blocks per stage."""
    stage_sizes = [2, 2, 2, 2]
    return ResNet(BasicBlock, stage_sizes,
                  num_classes=num_classes, include_top=include_top)
early stopping(提前停止迭代):在迭代过程中,训练集损失会持续减小,而验证集损失往往先减小后增大;在验证集损失开始回升的临界点提前停止迭代,可以防止过拟合。不过提前停止减少了迭代次数,效果可能不如 L2 正则化(但 L2 正则化需要调节更多超参数)。
下图中紫色线为验证集损失,蓝色线为训练集损失。可见随着迭代次数增加,验证集损失先减小后增大,而训练集损失一直减小,说明模型已经过拟合,应当提前停止迭代。
针对该问题,建议进行以下解决方案:
尝试使用更多的正则化方式,例如L1正则化、Dropout等,以减少过拟合情况。
建议尝试使用更多的数据集,尤其是阳性数据集,以增加训练数据的数量。
建议在进行数据集扩充时,注意扩充后的数据集是否和原数据集在分布上相同,并根据需要进行再平衡处理。
可以尝试使用迁移学习,将预训练好的模型作为初始模型进行微调,以加速训练且更好地解决过拟合问题。
建议对输入数据进行归一化处理,以加速计算和提高模型训练速度。
应该对模型进行梯度检验,以判断梯度是否出现了爆炸或消失的情况,进行调整和修正。
当前代码已使用 Adam 优化器,可进一步尝试调小学习率或配合学习率调度器(如 ReduceLROnPlateau),以降低过拟合风险并获得更好的结果。
应该尝试通过增加模型复杂度,增加层数或神经元的数量,来进一步提高模型性能。但要注意仅有在过拟合问题解决后,才可进行这一步骤。