I ran into this error while building the model: ValueError: too many values to unpack (expected 2)


import csv
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, BertTokenizer

# Load the pretrained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Custom linear layer for predicting the NMR spectra
linear_layer = nn.Linear(model.config.hidden_size, 1)

# Custom linear layer for predicting the peak-to-atom mapping
num_atoms = 20
peak_atom_linear_layer = nn.Linear(model.config.hidden_size, num_atoms)

# Define the loss functions and the optimizer
mse_loss_fn = nn.MSELoss()
cross_entropy_loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam([
    {'params': model.parameters()},
    {'params': linear_layer.parameters()},
    {'params': peak_atom_linear_layer.parameters()}
], lr=0.001)


class SpectraDataset(Dataset):
    def __init__(self, smiles_list, spectra_list, atom_peak_mapping_list):
        self.smiles_list = smiles_list
        self.spectra_list = spectra_list
        self.atom_peak_mapping_list = atom_peak_mapping_list

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, index):
        smiles = self.smiles_list[index]
        spectra = self.spectra_list[index]
        atom_peak_mapping = self.atom_peak_mapping_list[index]

        input_ids = tokenizer.encode(smiles, add_special_tokens=True, max_length=80, pad_to_max_length=True)
        spectra = spectra + [0.0] * (80 - len(spectra))
        atom_peak_mapping = atom_peak_mapping + [0] * (80 - len(atom_peak_mapping))
        input_ids = torch.tensor([input_ids])
        spectra = torch.tensor([spectra], dtype=torch.float32)
        atom_peak_mapping = torch.tensor(atom_peak_mapping, dtype=torch.long)

        return input_ids, spectra, atom_peak_mapping


def train_model(input_ids, spectra, atom_peak_mapping):
    model.train()
    optimizer.zero_grad()

    # Pass the input through the BERT model
    outputs = model(input_ids)

    # Get the BERT model's output
    pooled_output = outputs[1]  # the [CLS] token's output
    #pooled_output = outputs.pooler_output
    pooled_output = outputs.last_hidden_state[:, 0]

    # Predict the NMR spectra with the linear layer
    predicted_spectra = linear_layer(pooled_output)

    # Compute the loss on the NMR spectra
    spectra_loss = mse_loss_fn(predicted_spectra, spectra)

    # Predict the peak-to-atom mapping with the linear layer
    predicted_atom_peak_mapping = peak_atom_linear_layer(pooled_output)

    # Compute the loss on the peak-to-atom mapping
    atom_peak_mapping_loss = cross_entropy_loss_fn(predicted_atom_peak_mapping, atom_peak_mapping)

    # Combine the two losses into the total loss
    loss = spectra_loss + atom_peak_mapping_loss

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

    return loss.item()


def predict(input_ids):
    model.eval()

    with torch.no_grad():
        outputs = model(input_ids)
        pooled_output = outputs[1]
        #pooled_output = outputs.pooler_output
        pooled_output = outputs.last_hidden_state[:, 0]
        predicted_spectra = linear_layer(pooled_output)
        predicted_atom_peak_mapping = peak_atom_linear_layer(pooled_output)
        predicted_atom_peak_mapping = predicted_atom_peak_mapping[:, :num_atoms]
    return predicted_spectra.item(), torch.argmax(predicted_atom_peak_mapping).item()


# Read the dataset
def read_dataset(csv_file):
    smiles_list = []
    spectra_list = []
    atom_peak_mapping_list = []

    with open(csv_file, 'r') as file:
        reader = csv.reader(file)
        next(reader)  # skip the header row
        for row in reader:
            smiles = row[0]
            spectra = list(map(float, row[1].split(',')[:-1]))
            atom_peak_mapping = list(map(int, row[2].split(',')[:-1]))

            smiles_list.append(smiles)
            spectra_list.append(spectra)
            atom_peak_mapping_list.append(atom_peak_mapping)

    return smiles_list, spectra_list, atom_peak_mapping_list


def main():
    # Read the dataset
    csv_file = '1.csv'
    smiles_list, spectra_list, atom_peak_mapping_list = read_dataset(csv_file)

    # Create the data loader
    train_dataset = SpectraDataset(smiles_list, spectra_list, atom_peak_mapping_list)
    train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    # Number of training epochs
    num_epochs = 20

    # Training loop
    for epoch in range(num_epochs):
        total_loss = 0

        for data in train_dataloader:
            input_ids, spectra, atom_peak_mapping = data
            loss = train_model(input_ids, spectra, atom_peak_mapping)
            total_loss += loss

        # Print the training result
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")

    # Prediction example
    example_smiles = 'CCN(CC)[Si](C)(C)C'
    example_input_ids = tokenizer.encode(example_smiles, add_special_tokens=True)
    example_input_ids = torch.tensor([example_input_ids])
    spectra_prediction, atom_peak_mapping_prediction = predict(example_input_ids)

    print("Predicted NMR data:", spectra_prediction)
    print("Predicted peak-to-atom mapping:", atom_peak_mapping_prediction)


if __name__ == '__main__':
    main()
C:\Users\Administrator\AppData\Local\Programs\Python\Python311\python.exe H:\niuniu\11.py 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\tokenization_utils_base.py:2377: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).
  warnings.warn(
Traceback (most recent call last):
  File "H:\niuniu\11.py", line 156, in <module>
    main()
  File "H:\niuniu\11.py", line 139, in main
    loss = train_model(input_ids, spectra, atom_peak_mapping)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "H:\niuniu\11.py", line 57, in train_model
    outputs = model(input_ids)
              ^^^^^^^^^^^^^^^^
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\torch\nn\modules\module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Administrator\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\models\bert\modeling_bert.py", line 974, in forward
    batch_size, seq_length = input_shape
    ^^^^^^^^^^^^^^^^^^^^^^
ValueError: too many values to unpack (expected 2)

I was given a task to do this with a transformer; the code above is what I have.
I've been tweaking it for a long time and don't know what to change; I've tried many approaches and none of them work, and I can't tell where the problem is. The task is to use the first column of the data to predict the second and third columns. The data format is shown in the image below; I'm hoping someone who knows how can teach me.

[image: screenshot of the CSV data format]

[The following answer was generated by GPT]

The error does not actually come from the number of values returned by __getitem__; it comes from the extra pair of brackets used when building the tensors there. torch.tensor([input_ids]) gives every sample the shape [1, 80], and the DataLoader's default collation then stacks 32 such samples into a 3-D tensor of shape [32, 1, 80]. BertModel.forward expects input_ids to be 2-D, [batch_size, seq_length], so its line batch_size, seq_length = input_shape fails with "too many values to unpack (expected 2)".
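
A minimal sketch that reproduces the shape problem (80 and 32 are just the max_length and batch_size from the code above):

import torch

# Each sample built as torch.tensor([input_ids]) carries an extra
# leading dimension: shape [1, 80] instead of [80].
sample = torch.tensor([[0] * 80])

# The DataLoader's default collate_fn stacks samples along a new
# dimension 0, so a batch of 32 such samples becomes [32, 1, 80].
batch = torch.stack([sample] * 32)

# BertModel.forward unpacks the shape as (batch_size, seq_length),
# which only works for a 2-D tensor.
try:
    batch_size, seq_length = batch.shape
except ValueError as e:
    print(e)  # too many values to unpack (expected 2)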

The fix is to build 1-D tensors in __getitem__ and let the DataLoader add the batch dimension. While here, replacing the deprecated pad_to_max_length=True with padding='max_length' and truncation=True also silences the warnings in your log:

class SpectraDataset(Dataset):
    def __init__(self, smiles_list, spectra_list, atom_peak_mapping_list):
        self.smiles_list = smiles_list
        self.spectra_list = spectra_list
        self.atom_peak_mapping_list = atom_peak_mapping_list

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, index):
        smiles = self.smiles_list[index]
        spectra = self.spectra_list[index]
        atom_peak_mapping = self.atom_peak_mapping_list[index]

        input_ids = tokenizer.encode(smiles, add_special_tokens=True,
                                     max_length=80, padding='max_length',
                                     truncation=True)
        spectra = spectra + [0.0] * (80 - len(spectra))
        atom_peak_mapping = atom_peak_mapping + [0] * (80 - len(atom_peak_mapping))

        # No extra brackets here: each sample is a 1-D tensor of length 80,
        # so the default collation produces a 2-D batch of shape [batch_size, 80].
        input_ids = torch.tensor(input_ids)
        spectra = torch.tensor(spectra, dtype=torch.float32)
        atom_peak_mapping = torch.tensor(atom_peak_mapping, dtype=torch.long)

        return input_ids, spectra, atom_peak_mapping

With the batch shapes fixed, train_model still has a mismatch waiting: the heads are applied to a single pooled vector, so predicted_spectra is [batch_size, 1] against a [batch_size, 80] target, and the cross-entropy gets [batch_size, 20] logits against a [batch_size, 80] target, which nn.CrossEntropyLoss rejects. Since each of the 80 positions carries its own spectrum value and atom index, one consistent design (an assumption about your intent, not the only option) is to predict per token from last_hidden_state:

def train_model(input_ids, spectra, atom_peak_mapping):
    model.train()
    optimizer.zero_grad()

    # input_ids arrives from the DataLoader with shape [batch_size, 80]
    outputs = model(input_ids)
    # per-token hidden states: [batch_size, 80, hidden_size]
    token_states = outputs.last_hidden_state

    # spectra head: one value per position -> [batch_size, 80]
    predicted_spectra = linear_layer(token_states).squeeze(-1)
    spectra_loss = mse_loss_fn(predicted_spectra, spectra)

    # atom head: [batch_size, 80, num_atoms]; CrossEntropyLoss wants the class
    # dimension second, and the targets must lie in [0, num_atoms)
    predicted_atom_peak_mapping = peak_atom_linear_layer(token_states)
    atom_peak_mapping_loss = cross_entropy_loss_fn(
        predicted_atom_peak_mapping.permute(0, 2, 1), atom_peak_mapping)

    loss = spectra_loss + atom_peak_mapping_loss
    loss.backward()
    optimizer.step()
    return loss.item()
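
If train_model changes this way, predict needs matching shapes too; the original .item() calls only work on single-element tensors. A sketch under the same per-token assumption:

def predict(input_ids):
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids)
        token_states = outputs.last_hidden_state                            # [1, seq_len, hidden]
        predicted_spectra = linear_layer(token_states).squeeze(-1)          # [1, seq_len]
        predicted_atom_peak_mapping = peak_atom_linear_layer(token_states)  # [1, seq_len, num_atoms]
    # one spectrum value and one atom index per token position
    return (predicted_spectra.squeeze(0).tolist(),
            predicted_atom_peak_mapping.argmax(dim=-1).squeeze(0).tolist())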

# Usage example: the training loop in main() works unchanged
dataset = SpectraDataset(smiles_list, spectra_list, atom_peak_mapping_list)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
for input_ids, spectra, atom_peak_mapping in dataloader:
    train_model(input_ids, spectra, atom_peak_mapping)

In short: __getitem__ should return 1-D tensors and let the DataLoader's default collation stack them into a 2-D [batch_size, 80] batch, which is exactly the shape BertModel expects; the per-token heads then make the loss shapes consistent with the padded length-80 targets.
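
A quick sanity check with made-up dummy data (the SMILES and target values below are arbitrary; only the shapes matter):

dataset = SpectraDataset(['CCO'], [[1.2, 3.4]], [[1, 2]])
loader = DataLoader(dataset, batch_size=1)
input_ids, spectra, atom_peak_mapping = next(iter(loader))
print(input_ids.shape)          # torch.Size([1, 80]) -- 2-D, as BERT expects
print(spectra.shape)            # torch.Size([1, 80])
print(atom_peak_mapping.shape)  # torch.Size([1, 80])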

Hope this solution helps! If you have any questions, feel free to ask.

