import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer
# Load the data
data = pd.read_csv('1.csv')
# Split out the SMILES strings and the NMR data
smiles = data['SMILES'].values
nmr_data = data['spectra'].values
# Split into training and test sets
smiles_train, smiles_test, nmr_train, nmr_test = train_test_split(smiles, nmr_data, test_size=0.2, random_state=42)
# Load the pretrained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
# Define a custom Transformer model
class NMRTransformer(nn.Module):
    def __init__(self, model):
        super(NMRTransformer, self).__init__()
        self.bert = model
        self.fc = nn.Linear(768, 1)  # 768 is the hidden-state dimension of the BERT model

    def forward(self, input_ids):
        outputs = self.bert(input_ids)
        last_hidden_state = outputs.last_hidden_state[:, 0, :]  # hidden state of the [CLS] token
        prediction = self.fc(last_hidden_state)
        return prediction
# Initialize the model
nmr_transformer = NMRTransformer(model)
# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(nmr_transformer.parameters(), lr=0.001)
# Train the model
num_epochs = 10
batch_size = 80
for epoch in range(num_epochs):
    running_loss = 0.0
    # Iterate over the training data in mini-batches
    for i in range(0, len(smiles_train), batch_size):
        batch_smiles = smiles_train[i:i+batch_size]
        batch_nmr = nmr_train[i:i+batch_size]
        # Convert the SMILES strings into BERT inputs
        batch_inputs = tokenizer.batch_encode_plus(list(batch_smiles), padding=True, truncation=True, return_tensors='pt')
        input_ids = batch_inputs['input_ids']
        # Forward pass
        outputs = nmr_transformer(input_ids)
        predicted_nmr = outputs.squeeze()
        # Parse the comma-separated spectra and zero-pad each one to 80 values
        batch_nmr_new = []
        for spectrum in batch_nmr:
            values = list(map(float, spectrum.split(',')))
            values = values + [0.0] * (80 - len(values))
            batch_nmr_new.append(values)
        targets = torch.tensor(batch_nmr_new, dtype=torch.float)
        # Compute the loss (averaged over the batch) and backpropagate
        loss = 0
        for target in targets:
            loss += criterion(predicted_nmr, target)
        loss = loss / targets.shape[0]
        print(loss)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        print(running_loss)
    # Print the loss for each epoch
    print(f"Epoch {epoch+1} loss: {running_loss}")
# Test the model
nmr_transformer.eval()
test_inputs = tokenizer.batch_encode_plus(list(smiles_test), padding=True, truncation=True, return_tensors='pt')['input_ids']
test_outputs = nmr_transformer(test_inputs)
test_predictions = test_outputs.squeeze().detach().numpy()
# Print the predictions alongside the true values
for i in range(len(smiles_test)):
    print(f"SMILES: {smiles_test[i]}, Predicted NMR: {test_predictions[i]}, True NMR: {nmr_test[i]}")
The loss is in the tens of thousands. I suspect there is a logic problem somewhere in the code, but I can't spot it myself.
The data is below; only the first two columns are used.
[The following answer was generated by GPT]
First, I notice that you are using BertModel as the underlying Transformer. BertModel is mainly intended for NLP tasks rather than sequence-to-sequence prediction, so I suggest using a Transformer model suited to sequence-to-sequence tasks, such as nn.Transformer. Here is the modified code:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
# Load the data
data = pd.read_csv('1.csv')
# Split out the SMILES strings and the NMR data
smiles = data['SMILES'].values
nmr_data = data['spectra'].values
# Split into training and test sets
smiles_train, smiles_test, nmr_train, nmr_test = train_test_split(smiles, nmr_data, test_size=0.2, random_state=42)
# Load the pretrained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Define a custom Transformer model
class NMRTransformer(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(NMRTransformer, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.model = nn.Transformer(d_model=hidden_size, nhead=8, num_encoder_layers=num_layers,
                                    num_decoder_layers=num_layers, dim_feedforward=2048,
                                    dropout=0.1, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, src):
        src = self.embedding(src) * np.sqrt(self.hidden_size)
        output = self.model(src, src)
        output = self.fc(output[:, 0, :])  # hidden state of the first ([CLS]) token
        return output
# Initialize the model
input_size = len(tokenizer)
hidden_size = 768
num_layers = 6
nmr_transformer = NMRTransformer(input_size, hidden_size, num_layers)
# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(nmr_transformer.parameters(), lr=0.001)
# Prepare the data
def prepare_data(inputs, targets):
    input_ids = tokenizer.batch_encode_plus(list(inputs), padding=True, truncation=True, return_tensors='pt')['input_ids']
    # NOTE: assumes the targets are already numeric; see the sketch after this listing
    # for parsing the comma-separated 'spectra' strings from the CSV
    targets = torch.tensor(targets, dtype=torch.float)
    return input_ids, targets
# Train the model
num_epochs = 10
batch_size = 80
for epoch in range(num_epochs):
    running_loss = 0.0
    # Iterate over the training data in mini-batches
    for i in range(0, len(smiles_train), batch_size):
        batch_smiles = smiles_train[i:i+batch_size]
        batch_nmr = nmr_train[i:i+batch_size]
        # Prepare the training data
        input_ids, targets = prepare_data(batch_smiles, batch_nmr)
        # Forward pass
        predictions = nmr_transformer(input_ids)
        loss = criterion(predictions, targets)
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss}")
Here is an explanation of the modified code:
1. nn.Transformer is used as the underlying Transformer model instead of BertModel.
2. The NMRTransformer class is built on nn.Transformer, with a linear layer added as the final prediction layer.
3. The prepare_data function converts the SMILES sequences into BERT-tokenized inputs and packages them, together with the corresponding targets, as tensors (see the parsing sketch above for the 'spectra' column).
4. The training loop uses prepare_data to prepare each batch and then runs the forward and backward passes through the newly defined model.
These modifications should help bring the loss down and improve the prediction accuracy.
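For completeness, evaluation on the held-out test set could mirror the final block of the original script; a minimal sketch, assuming the nmr_transformer and tokenizer defined above:
# Evaluate on the test set (no gradients needed at inference time)
nmr_transformer.eval()
with torch.no_grad():
    test_inputs = tokenizer.batch_encode_plus(list(smiles_test), padding=True, truncation=True, return_tensors='pt')['input_ids']
    test_predictions = nmr_transformer(test_inputs).squeeze().numpy()
# Print the predictions alongside the true values
for smile, pred, true in zip(smiles_test, test_predictions, nmr_test):
    print(f"SMILES: {smile}, Predicted NMR: {pred}, True NMR: {true}")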