用自己做的数据集做一个图像分类任务,但是训练的时候模型的loss和acc一直不变
class dp_cnn_torch(object):
    """Train and evaluate a small CNN image classifier with early stopping.

    Construction runs the whole training procedure (the original code was
    written that way); callers only pass the two data loaders.

    Parameters
    ----------
    train_loader : torch.utils.data.DataLoader
        Training batches of (image, label); images are expected as
        [b, 3, 28, 28] (see dp_cnn_model).
    test_loader : torch.utils.data.DataLoader
        Evaluation batches; must use batch_size=1 because dp_cnn_test
        calls label.item() per batch.
    """

    def __init__(self, train_loader, test_loader):
        # BUG FIX: the method was named `init`, so Python never called it and
        # `dp_cnn_torch(a, b)` raised "object() takes no arguments".
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # BUG FIX: the model must live on the same device as the batches
        # (data is moved with .to(device) in train/test), otherwise training
        # crashes on a CUDA machine.
        self.model = self.dp_cnn_model().to(self.device)
        # BUG FIX: lr=0.05 is ~50x Adam's default (1e-3); combined with the
        # Sigmoid bottleneck in the model it saturates training — the classic
        # "loss and acc never move" symptom the author reports.
        optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)

        best_acc = 0.0
        best_epoch = 0
        bad_count = 0
        patient_count = 10   # early-stop patience (epochs without improvement)
        max_epochs = 100     # hard cap so the loop always terminates
        privacy_model_save_path = './trained_privacy_model/dp_cnn_torch.tar'
        # Make sure the checkpoint directory exists before torch.save().
        import os
        os.makedirs(os.path.dirname(privacy_model_save_path), exist_ok=True)

        for epoch in range(max_epochs):
            self.dp_cnn_train(self.model, self.train_loader, optimizer, epoch,
                              self.device, delta=1e-5)
            current_acc = self.dp_cnn_test(self.model, self.test_loader, epoch,
                                           self.device)
            # Early stopping: keep the best checkpoint, count stale epochs.
            if current_acc > best_acc:
                best_acc = current_acc
                best_epoch = epoch
                bad_count = 0
                torch.save(self.model, privacy_model_save_path)
            else:
                bad_count += 1
                print("The last {0} epochs have not shown improvements on the validation set.".format(bad_count))
            # BUG FIX: the original `continue` taken when best_acc < 0.90
            # never reset bad_count and skipped `epoch += 1`, producing an
            # infinite loop with a frozen epoch counter.  Stop on patience
            # unconditionally; the for-range cap bounds the run regardless.
            if bad_count > patient_count:
                print("Early Stop!")
                break
        print("best_acc:{}, best_epoch:{}".format(best_acc, best_epoch))
        print("dp_model finished!")

    def dp_cnn_model(self):
        """Build and return the CNN.

        NOTE(review): with this exact layer stack the Flatten output is
        32*4*4 only for 28x28 inputs (28 -> 14 -> 13 -> 5 -> 4); the
        original "[b,3,32,32]" comment in dp_cnn_test would break at the
        first Linear layer — confirm the dataset's image size.
        """
        model = torch.nn.Sequential(
            torch.nn.Conv2d(3, 16, kernel_size=8, stride=2, padding=3),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(2, 1),
            torch.nn.Conv2d(16, 32, 4, 2),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(2, 1),
            torch.nn.Flatten(),
            torch.nn.Linear(32 * 4 * 4, 32),
            torch.nn.Sigmoid(),
            torch.nn.Linear(32, 2)
        )
        print(model)
        return model

    def dp_cnn_train(self, model, train_loader, optimizer, epoch, device, delta):
        """Run one training epoch and print the mean loss.

        If an Opacus PrivacyEngine is attached to `optimizer`, also prints
        the (epsilon, alpha) privacy budget spent for the given `delta`.
        """
        model.train()
        criterion = torch.nn.CrossEntropyLoss()
        losses = []
        for batch_idx, (data, target) in enumerate(tqdm(train_loader)):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        # BUG FIX: the original unconditionally read optimizer.privacy_engine,
        # but no PrivacyEngine is ever attached in this file, so the first
        # epoch died with AttributeError.  Report epsilon only when the engine
        # exists; attach one (opacus) to get actual DP guarantees.
        if hasattr(optimizer, "privacy_engine"):
            epsilon, best_alpha = optimizer.privacy_engine.get_privacy_spent(delta)
            print(
                f"Train Epoch:{epoch} \t"
                f"Loss:{np.mean(losses):.6f}"
                f"(epsilon = {epsilon:.2f}, delta={delta} for alpha = {best_alpha})"
            )
        else:
            print(
                f"Train Epoch:{epoch} \t"
                f"Loss:{np.mean(losses):.6f}"
            )

    def dp_cnn_test(self, model, test_loader, epoch, device):
        """Evaluate `model` on `test_loader`; return overall accuracy.

        The per-class error counters call label.item(), which is only valid
        when the loader uses batch_size=1 (as the __main__ block does).
        """
        model.eval()
        with torch.no_grad():
            total_correct = 0
            total_num = 0
            label_error = 0
            label_zero = 0
            label_one = 0
            label_two = 0
            for batch_idx, (x, label) in enumerate(test_loader):
                x, label = x.to(device), label.to(device)
                logits = model(x)            # [b, num_classes]
                pred = logits.argmax(dim=1)  # [b]
                correct = torch.eq(pred, label).float().sum().item()
                if not correct:
                    # Record which ground-truth class was mis-classified
                    # (assumes batch_size == 1).
                    label_error += 1
                    if label.item() == 0:
                        label_zero += 1
                    elif label.item() == 1:
                        label_one += 1
                    else:
                        label_two += 1
                total_correct += correct
                total_num += x.size(0)
            # ROBUSTNESS: avoid ZeroDivisionError on an empty loader.
            acc = total_correct / total_num if total_num else 0.0
            print(
                f"Test Epoch:{epoch} \t"
                f"accuracy:{acc:.6f} \n"
                f"label_error:{label_error} \t"
                f"label_zero:{label_zero} \t"
                f"label_one:{label_one} \t"
                f"label_two:{label_two} \t"
            )
            return acc
if __name__ == '__main__':
    # BUG FIX: the guard was `if name == 'main'`, which raises NameError at
    # import time; the dunder form is required for a script entry point.
    dp_train_path = 'privacy_data/raw_data/dp_train.txt'
    dp_test_path = 'privacy_data/raw_data/dp_test.txt'
    # LoadData is a project-local dataset class (defined elsewhere); the
    # second argument presumably toggles train/test preprocessing — confirm.
    dp_train_data = LoadData(dp_train_path, True)
    dp_test_data = LoadData(dp_test_path, False)
    dp_train_loader = torch.utils.data.DataLoader(dataset=dp_train_data,
                                                  batch_size=8,
                                                  shuffle=True)
    # batch_size must stay 1: dp_cnn_test calls label.item() per batch.
    dp_test_loader = torch.utils.data.DataLoader(dataset=dp_test_data,
                                                 batch_size=1,
                                                 shuffle=True)
    dp_cnn_torch(dp_train_loader, dp_test_loader)
一开始检查可能是标签与图像不匹配,但是检查之后发现还是这样;
又检查了各类数据数量是否均衡,处理完之后仍没有改善;
再次检查后发现模型只能识别一类,其他类识别均错误。改成二分类也只能识别一个类,好像把所有图像都归到了一个类里面
求各位指点。
你先调整一下学习率,看看loss和acc有没有改动
如果没有的话,就是模型和数据的问题
模型合并的时候出现的问题
loss不变,acc不变,那就是训练完成了。acc不高的可能性是,网络不适用于此类数据集的分类,更换别的模型看看,或者说两种类别的特征差别很小,网络没有办法区分开来。
看了一下你的模型代码和结果,发现你只训练了3个epoch就结束,明显是不正确的。
建议修改训练的代码:先指定固定的 epoch 数,不要使用"最近几轮没有提升就结束训练"这种方法,因为深度学习训练带有随机性,一旦暂时陷入某个梯度平台(plateau),指标就可能暂时不变。把 dp_cnn_torch
里面的训练循环修改为:
# Answerer's suggested replacement for the open-ended `while True` loop:
# train for a fixed number of epochs so the run is guaranteed to finish.
# (Quoted as posted — the snippet was pasted without indentation.)
for epoch in range(100):
self.dp_cnn_train(self.model, self.train_loader, optimizer, epoch, self.device, delta=1e-5)
current_acc = self.dp_cnn_test(self.model, self.test_loader, epoch, self.device)
# early stop
if current_acc > best_acc:
best_acc = current_acc
bad_count = 0
best_epoch = epoch
torch.save(self.model, privacy_model_save_path)
# NOTE(review): the original `else: bad_count += 1` branch was dropped in
# this suggestion, so bad_count can never grow here.
if best_acc < 0.90:
continue
else:
print("Early Stop!")
# NOTE(review): this "Early Stop!" path has no `break`, so once best_acc
# reaches 0.90 it prints every remaining epoch while training continues.
print("best_acc:{}, best_epoch:{}".format(best_acc, best_epoch))
print("dp_model finished!")
你的这个很可能是过拟合了:你的数据集会不会太少,或者特征不够?我以前做过一个项目,损失和精度一直不变,后来发现输入只有 3 个特征,增加特征数量就解决了。
几种可能性:学习率太高(容易震荡、不收敛)、数据集太少(容易过拟合);还有可能是准确率的计算方式本身有错误。