So far I have only found settings for how many checkpoints to keep and how to save them, plus a load_best_model option, but that is not the same as saving the best model, is it?
HuggingFace's arguments cannot be set to save only the best model, but the checkpoints it keeps will certainly include the best one. If you configure evaluation_strategy, save_strategy, and load_best_model_at_end properly, the model loaded at the end of training will be the best checkpoint. You can then save that best model separately after training finishes; it is just a PyTorch model, so you can try torch.save() or HuggingFace's model.save_pretrained() or something along those lines. If that still does not work, just take the models HuggingFace provides and define the training, evaluation, and model-saving strategy yourself.
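For example, something like the minimal sketch below (the tiny toy dataset, the bert-base-uncased checkpoint, the epoch-level strategies, and the output paths are placeholders I made up for illustration, not anything from the original post):

import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

checkpoint = "bert-base-uncased"          # placeholder model
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Tiny toy dataset just so the sketch runs end to end; use your own data here
ds = Dataset.from_dict({"text": ["a good example", "a bad example"], "label": [1, 0]})
ds = ds.map(lambda x: tokenizer(x["text"], truncation=True, padding="max_length", max_length=32))

training_args = TrainingArguments(
    output_dir="./outputs/",
    num_train_epochs=3,
    evaluation_strategy="epoch",       # evaluate once per epoch...
    save_strategy="epoch",             # ...and save on the same schedule
    load_best_model_at_end=True,       # reload the best checkpoint when training ends
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_total_limit=2,
)
trainer = Trainer(model=model, args=training_args, train_dataset=ds, eval_dataset=ds)
trainer.train()

# trainer.model now holds the best checkpoint, so save it on its own
trainer.save_model("./best_model/")                         # HuggingFace format
torch.save(trainer.model.state_dict(), "./best_model.pt")   # or plain PyTorch weights

Note that save_strategy has to be on the same schedule as evaluation_strategy for load_best_model_at_end to work, which is why both are set to 'epoch' here.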
For a fuller pretraining example, you can refer to the following:
import os
import csv
from transformers import BertTokenizer, WEIGHTS_NAME
from model.modeling_nezha import NeZhaForSequenceClassification, NeZhaForMaskedLM
from model.configuration_nezha import NeZhaConfig
import tokenizers
import torch
from datasets import load_dataset, Dataset
from transformers import (
CONFIG_MAPPING,
MODEL_FOR_MASKED_LM_MAPPING,
AutoConfig,
AutoModelForMaskedLM,
AutoTokenizer,
DataCollatorForLanguageModeling,
HfArgumentParser,
Trainer,
TrainingArguments,
set_seed,
LineByLineTextDataset
)
## Build your own tokenizer
bwpt = tokenizers.BertWordPieceTokenizer()
filepath = "../excel2txt.txt"  # same corpus format as in the first part of this article
bwpt.train(
    files=[filepath],
    vocab_size=50000,
    min_frequency=1,
    limit_alphabet=1000
)
bwpt.save_model('./pretrained_models/')  # produces vocab.txt
## Load the tokenizer and model
model_path = '../tmp/nezha/'
token_path = './pretrained_models/vocab.txt'
tokenizer = BertTokenizer.from_pretrained(token_path, do_lower_case=True)
config = NeZhaConfig.from_pretrained(model_path)
model = NeZhaForMaskedLM.from_pretrained(model_path, config=config)
model.resize_token_embeddings(len(tokenizer))
# Load the data via the LineByLineTextDataset interface; block_size (max length) is 128.
# file_path points to a corpus in the same format as in the first part of this article.
train_dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path='../tmp/all_data_txt.txt', block_size=128)
# Data collator for the MLM objective (masks 15% of tokens)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
# Training arguments
pretrain_batch_size = 64
num_train_epochs = 300
training_args = TrainingArguments(
    output_dir='./outputs/', overwrite_output_dir=True, num_train_epochs=num_train_epochs, learning_rate=6e-5,
    per_device_train_batch_size=pretrain_batch_size, save_total_limit=10)  # save_steps=10000
# Train the model via the Trainer interface
trainer = Trainer(
    model=model, args=training_args, data_collator=data_collator, train_dataset=train_dataset)
# Start training
trainer.train()
trainer.save_model('./outputs/')
There is quite a lot of code below, and I think a correct approach is in there.
Check whether its arguments support what you need: if they do, you can get it done directly; if not, write the code yourself, or use Keras.
https://www.csdn.net/tags/MtTaMg1sODY4NTgwLWJsb2cO0O0O.html
In the end this is up to you; you need to set parameters that suit your own requirements.
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
# Tokenize the batch and attach labels so the model can compute the loss itself
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
batch["labels"] = torch.tensor([1, 1])

# One manual optimization step: forward pass, backward pass, parameter update
optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()
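If you do end up writing the loop yourself, the same idea extends to keeping only the best model: evaluate after every epoch and overwrite one saved copy whenever the metric improves. The sketch below is just an illustration of that pattern, with a tiny in-memory toy dataset, a bert-base-uncased checkpoint, and placeholder paths and hyperparameters that I made up; in practice you would plug in your own DataLoaders, model, and metric.

import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Toy data just to make the sketch runnable; replace with your own train/eval DataLoaders
texts = ["I've been waiting for a HuggingFace course my whole life.", "This course is amazing!"]
enc = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
dataset = [{"input_ids": i, "attention_mask": a, "labels": l}
           for i, a, l in zip(enc["input_ids"], enc["attention_mask"], torch.tensor([1, 1]))]

def collate(features):
    # Stack the per-example tensors back into a batch
    return {k: torch.stack([f[k] for f in features]) for k in features[0]}

train_loader = DataLoader(dataset, batch_size=2, collate_fn=collate)
eval_loader = DataLoader(dataset, batch_size=2, collate_fn=collate)

best_eval_loss = float("inf")
for epoch in range(3):
    # Training pass
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()

    # Evaluation pass: average loss over the eval batches
    model.eval()
    eval_loss = 0.0
    with torch.no_grad():
        for batch in eval_loader:
            eval_loss += model(**batch).loss.item()
    eval_loss /= len(eval_loader)

    # Keep only the best checkpoint: overwrite it whenever the eval loss improves
    if eval_loss < best_eval_loss:
        best_eval_loss = eval_loss
        model.save_pretrained("./best_model/")
        tokenizer.save_pretrained("./best_model/")
        # or: torch.save(model.state_dict(), "./best_model.pt")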