import csv
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, BertTokenizer
# Load the pretrained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
# Custom linear layer for predicting the NMR spectrum values
linear_layer = nn.Linear(model.config.hidden_size, 1)
# Custom linear layer for predicting the peak-to-atom mapping
num_atoms = 20
peak_atom_linear_layer = nn.Linear(model.config.hidden_size, num_atoms)
# Define the loss functions and the optimizer
mse_loss_fn = nn.MSELoss()
cross_entropy_loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([
    {'params': model.parameters()},
    {'params': linear_layer.parameters()},
    {'params': peak_atom_linear_layer.parameters()}
], lr=0.001)
class SpectraDataset(Dataset):
    def __init__(self, smiles_list, spectra_list, atom_peak_mapping_list):
        self.smiles_list = smiles_list
        self.spectra_list = spectra_list
        self.atom_peak_mapping_list = atom_peak_mapping_list

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, index):
        smiles = self.smiles_list[index]
        spectra = self.spectra_list[index]
        atom_peak_mapping = self.atom_peak_mapping_list[index]
        input_ids = tokenizer.encode(smiles, add_special_tokens=True, max_length=80, pad_to_max_length=True)
        spectra = spectra + [0.0] * (80 - len(spectra))
        atom_peak_mapping = atom_peak_mapping + [0] * (80 - len(atom_peak_mapping))
        input_ids = torch.tensor([input_ids])
        spectra = torch.tensor([spectra], dtype=torch.float32)
        atom_peak_mapping = torch.tensor(atom_peak_mapping, dtype=torch.long)
        return input_ids, spectra, atom_peak_mapping
def train_model(input_ids, spectra, atom_peak_mapping):
    model.train()
    optimizer.zero_grad()
    # Pass the input through the BERT model
    outputs = model(input_ids)
    # Use the hidden state of the [CLS] token as the sequence representation
    pooled_output = outputs.last_hidden_state[:, 0]
    # Predict the NMR spectrum values with the linear layer
    predicted_spectra = linear_layer(pooled_output)
    # Compute the loss on the spectrum values
    spectra_loss = mse_loss_fn(predicted_spectra, spectra)
    # Predict the peak-to-atom mapping with the other linear layer
    predicted_atom_peak_mapping = peak_atom_linear_layer(pooled_output)
    # Compute the loss on the peak-to-atom mapping
    atom_peak_mapping_loss = cross_entropy_loss_fn(predicted_atom_peak_mapping, atom_peak_mapping)
    # Sum the two losses to get the total loss
    loss = spectra_loss + atom_peak_mapping_loss
    # Backpropagate and update the parameters
    loss.backward()
    optimizer.step()
    return loss.item()
def predict(input_ids):
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids)
        # Use the hidden state of the [CLS] token as the sequence representation
        pooled_output = outputs.last_hidden_state[:, 0]
        predicted_spectra = linear_layer(pooled_output)
        predicted_atom_peak_mapping = peak_atom_linear_layer(pooled_output)
        predicted_atom_peak_mapping = predicted_atom_peak_mapping[:, :num_atoms]
    return predicted_spectra.item(), torch.argmax(predicted_atom_peak_mapping).item()
# Read the dataset
def read_dataset(csv_file):
    smiles_list = []
    spectra_list = []
    atom_peak_mapping_list = []
    with open(csv_file, 'r') as file:
        reader = csv.reader(file)
        next(reader)  # skip the header row
        for row in reader:
            smiles = row[0]
            spectra = list(map(float, row[1].split(',')[:-1]))
            atom_peak_mapping = list(map(int, row[2].split(',')[:-1]))
            smiles_list.append(smiles)
            spectra_list.append(spectra)
            atom_peak_mapping_list.append(atom_peak_mapping)
    return smiles_list, spectra_list, atom_peak_mapping_list
def main():
    # Read the dataset
    csv_file = '1.csv'
    smiles_list, spectra_list, atom_peak_mapping_list = read_dataset(csv_file)
    # Create the data loader
    train_dataset = SpectraDataset(smiles_list, spectra_list, atom_peak_mapping_list)
    train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    # Number of training epochs
    num_epochs = 20
    # Training loop
    for epoch in range(num_epochs):
        total_loss = 0
        for data in train_dataloader:
            input_ids, spectra, atom_peak_mapping = data
            loss = train_model(input_ids, spectra, atom_peak_mapping)
            total_loss += loss
        # Print the training result
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")
    # Prediction example
    example_smiles = 'CCN(CC)[Si](C)(C)C'
    example_input_ids = tokenizer.encode(example_smiles, add_special_tokens=True)
    example_input_ids = torch.tensor([example_input_ids])
    spectra_prediction, atom_peak_mapping_prediction = predict(example_input_ids)
    print("Predicted NMR spectrum:", spectra_prediction)
    print("Predicted peak-to-atom mapping:", atom_peak_mapping_prediction)

if __name__ == '__main__':
    main()
C:\Users\Administrator\AppData\Local\Programs\Python\Python311\python.exe H:\niuniu\11.py
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\tokenization_utils_base.py:2377: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).
warnings.warn(
Traceback (most recent call last):
File "H:\niuniu\11.py", line 156, in <module>
main()
File "H:\niuniu\11.py", line 139, in main
loss = train_model(input_ids, spectra, atom_peak_mapping)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "H:\niuniu\11.py", line 57, in train_model
outputs = model(input_ids)
^^^^^^^^^^^^^^^^
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\torch\nn\modules\module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\models\bert\modeling_bert.py", line 974, in forward
batch_size, seq_length = input_shape
^^^^^^^^^^^^^^^^^^^^^^
ValueError: too many values to unpack (expected 2)
I was given a task to do this with a transformer, and the code above is what I have.
I've been debugging it for a long time and have tried many things, but nothing works and I can't tell where the problem is. The task is to use the first column of the data to predict the second and third columns; the data format is shown in the picture. I'm hoping someone who knows how can walk me through it.
[The following answer was generated by GPT]
The error does not come from the number of values returned by `__getitem__`; it comes from the tensor shapes. `torch.tensor([input_ids])` wraps every sample in an extra batch dimension, so each item has shape (1, 80). The DataLoader's default collation then stacks 32 such items into a (32, 1, 80) tensor, but `BertModel.forward` expects a 2-D (batch_size, seq_length) tensor, hence `ValueError: too many values to unpack (expected 2)`. The fix is to return 1-D tensors from `__getitem__` and let the DataLoader add the batch dimension:
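You can verify the collation behavior with a quick standalone check (toy token values, purely illustrative):

import torch

ids = [101, 7592, 102] + [0] * 77   # a padded id list of length 80
wrapped = torch.tensor([ids])       # shape (1, 80): extra leading dimension
flat = torch.tensor(ids)            # shape (80,): what a Dataset item should be
print(wrapped.shape, flat.shape)
# The default collate_fn stacks samples along a new first axis, so
# (1, 80) items become (batch, 1, 80), which BERT's forward rejects.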
class SpectraDataset(Dataset):
    def __init__(self, smiles_list, spectra_list, atom_peak_mapping_list):
        self.smiles_list = smiles_list
        self.spectra_list = spectra_list
        self.atom_peak_mapping_list = atom_peak_mapping_list

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, index):
        smiles = self.smiles_list[index]
        spectra = self.spectra_list[index]
        atom_peak_mapping = self.atom_peak_mapping_list[index]
        # padding='max_length' with truncation=True replaces the deprecated
        # pad_to_max_length=True and silences the FutureWarning in your log
        input_ids = tokenizer.encode(smiles, add_special_tokens=True, max_length=80,
                                     padding='max_length', truncation=True)
        spectra = spectra + [0.0] * (80 - len(spectra))
        atom_peak_mapping = atom_peak_mapping + [0] * (80 - len(atom_peak_mapping))
        # Return 1-D tensors; the DataLoader adds the batch dimension itself
        input_ids = torch.tensor(input_ids)
        spectra = torch.tensor(spectra, dtype=torch.float32)
        atom_peak_mapping = torch.tensor(atom_peak_mapping, dtype=torch.long)
        return input_ids, spectra, atom_peak_mapping
With the dataset fixed, `train_model` itself needs no changes; each batch the DataLoader yields unpacks directly into its three arguments:

# Usage example
dataset = SpectraDataset(smiles_list, spectra_list, atom_peak_mapping_list)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
for input_ids, spectra, atom_peak_mapping in dataloader:
    loss = train_model(input_ids, spectra, atom_peak_mapping)
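If you want to confirm the shapes before training, inspect one batch (assuming the dataset holds at least 32 samples):

input_ids, spectra, atom_peak_mapping = next(iter(dataloader))
print(input_ids.shape)           # torch.Size([32, 80]): 2-D, as BERT expects
print(spectra.shape)             # torch.Size([32, 80])
print(atom_peak_mapping.shape)   # torch.Size([32, 80])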
The key change is in `__getitem__`: each field is returned as a 1-D tensor, so the DataLoader's default collation stacks the samples into the 2-D (batch_size, seq_length) shape that `BertModel.forward` expects, and `model(input_ids)` no longer fails when unpacking the input shape.
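One caveat: after the DataLoader fix the forward pass will run, but both loss terms still have shape mismatches. `linear_layer` produces one value per molecule while `spectra` holds 80 values per sample, and `nn.CrossEntropyLoss` expects one class index per sample while `atom_peak_mapping` holds 80. Below is a sketch of per-token prediction heads that makes both shapes line up; this changes the modelling, so treat it as one possible design rather than the required fix:

# Hypothetical per-token heads: one shift value and one atom class per
# token position, instead of a single prediction per molecule.
spectra_head = nn.Linear(model.config.hidden_size, 1)
atom_head = nn.Linear(model.config.hidden_size, num_atoms)
# (Remember to add spectra_head/atom_head parameters to the optimizer.)

def train_step(input_ids, spectra, atom_peak_mapping):
    model.train()
    optimizer.zero_grad()
    hidden = model(input_ids).last_hidden_state        # (batch, 80, hidden_size)
    pred_spectra = spectra_head(hidden).squeeze(-1)    # (batch, 80)
    spectra_loss = mse_loss_fn(pred_spectra, spectra)  # targets: (batch, 80)
    pred_atoms = atom_head(hidden)                     # (batch, 80, num_atoms)
    atom_loss = cross_entropy_loss_fn(
        pred_atoms.reshape(-1, num_atoms),             # (batch*80, num_atoms)
        atom_peak_mapping.reshape(-1))                 # (batch*80,)
    # Note: padded positions are labeled 0 here; consider ignore_index
    # in CrossEntropyLoss if class 0 should not count as a real atom.
    loss = spectra_loss + atom_loss
    loss.backward()
    optimizer.step()
    return loss.item()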
I hope this solution helps! Feel free to ask if you have any questions.