I'm modifying some code and need to compare direct one-hot input against embedding input. The embedding input already works — how do I change it to one-hot input? Please reply, urgent!
When asking a question, please include a detailed description and the relevant code, so that people can answer you more easily.
Partial code below, from the input processing through the network architecture:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import StratifiedKFold, train_test_split
df_train = pd.read_excel(r'/home/s/Ytrain.xlsx', sheet_name='训练集')
df_test = pd.read_excel(r'/home/s/Ytest.xlsx', sheet_name='测试集')
# Process the labels (y values)
labels_list = df_train['标签'].unique().tolist()
dig_labels = dict(enumerate(labels_list))
label_dig = dict((label, dig) for dig, label in dig_labels.items())
df_train['标签_数字'] = df_train['标签'].apply(lambda label: label_dig[label])  # map each label to an integer id (0 or 1)
num_classes = len(dig_labels)
train_labels = to_categorical(df_train['标签_数字'], num_classes=num_classes)  # one-hot label matrix (not used below; the model is compiled with the sparse integer labels)
num_words = 20  # vocabulary cap: keep only the 20 most frequent words
max_len = 21  # max tokens per row; shorter rows are zero-padded at the end, longer rows are truncated
# Create the Tokenizer object; num_words caps how many words are kept (rarer words are dropped)
tokenizer = Tokenizer(num_words=num_words)
df_all = pd.concat([df_train['文本'], df_test['文本']])  # concatenate train and test text so the tokenizer sees the full vocabulary
tokenizer.fit_on_texts(df_all)
train_sequences = tokenizer.texts_to_sequences(df_train['文本'])
train_data = pad_sequences(train_sequences, maxlen=max_len, padding='post')
# Process the test set
test_sequences = tokenizer.texts_to_sequences(df_test['文本'])
test_data = pad_sequences(test_sequences, maxlen=max_len, padding='post')
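# For reference (hypothetical toy values, not from the original data): the two
# steps above map each text row to a fixed-length integer id sequence, e.g.
#   tokenizer.texts_to_sequences(['今天 天气 好'])         -> [[4, 9, 2]]
#   pad_sequences([[4, 9, 2]], maxlen=21, padding='post') -> [[4, 9, 2, 0, ..., 0]]  (length 21)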
'''skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=True)
skf.get_n_splits(train_data, df_train['标签'])
for train_index, val_index in skf.split(train_data, df_train['标签']):
    x_train, x_val = train_data[train_index], train_data[val_index]  # train/validation data
    y_train, y_val = df_train['标签'][train_index], df_train['标签'][val_index]  # train/validation labels'''
seed = 7
np.random.seed(seed)
x_train, x_val, y_train, y_val = train_test_split(train_data, df_train['标签_数字'], test_size=0.2, random_state=0, shuffle=True)
model = tf.keras.Sequential()
vocab_size = len(tokenizer.word_index) + 1  # +1 for the reserved padding index 0
model.add(layers.Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True, embeddings_initializer='uniform'))  # maps each token id to a learned dense 128-d vector
model.add(layers.LSTM(128, return_sequences=False, dropout=0, recurrent_dropout=0))
model.add(layers.Dense(256, activation='relu', kernel_initializer='glorot_uniform'))
model.add(layers.Dense(256, activation='relu', kernel_initializer='glorot_uniform'))
model.add(layers.Dense(256,activation='relu', kernel_initializer='glorot_uniform'))
model.add(layers.BatchNormalization())
model.add(layers.Dense(2,activation='softmax'))
model.summary()
opt = tf.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])  # sparse loss pairs with the integer labels ('标签_数字'), not the one-hot train_labels
To restate the question: how do I remove the Embedding layer and feed the network one-hot input instead?
Input: the text column ('文本') and the labels ('标签').
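A minimal sketch of one possible approach (untested against the original data, and assuming the vocab_size, max_len, x_train and y_train variables from the code above): drop the Embedding layer and one-hot encode the token ids on the fly with a Lambda layer, so the integer data pipeline stays unchanged.

import tensorflow as tf
from tensorflow.keras import layers

# vocab_size and max_len are defined in the preprocessing code above.
model = tf.keras.Sequential()
model.add(tf.keras.Input(shape=(max_len,), dtype='int32'))  # still the padded integer sequences
# tf.one_hot expands each token id into a vocab_size-dimensional one-hot vector,
# so the LSTM sees (batch, max_len, vocab_size) instead of 128-d embeddings.
model.add(layers.Lambda(lambda x: tf.one_hot(tf.cast(x, tf.int32), depth=vocab_size)))
model.add(layers.LSTM(128, return_sequences=False))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.Dense(2, activation='softmax'))
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9),
              loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Training is unchanged: model.fit(x_train, y_train, validation_data=(x_val, y_val), ...)

Two caveats for the comparison: mask_zero=True is lost this way, since padding positions now become the one-hot vector for id 0 instead of being masked, so the two models are not perfectly apples-to-apples. Alternatively, one could precompute one-hot arrays with to_categorical(train_data, num_classes=vocab_size) and feed those directly, but at shape (N, 21, vocab_size) that costs far more memory than encoding on the fly.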