################### Shuffle the data
import random
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

# Shuffle an index list so features and labels stay aligned
a = list(range(len(labels)))
random.shuffle(a)
shuffled_features = [features[i] for i in a]
shuffled_labels = [labels[i] for i in a]

########## Split into training and test sets
# (train_test_split shuffles on its own, so the manual shuffle above is
# redundant for this step; it is kept for the K-fold split further down.)
X = shuffled_features
y = shuffled_labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(len(X_train), len(X_test), len(y_train), len(y_test))
# Check the class balance of each split (the labels are the strings '0' and '1')
print(y_train.count('0'), y_train.count('1'), y_test.count('0'), y_test.count('1'))
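########## Optional: stratified split (added sketch, not in the original)
# If the class counts printed above are skewed, train_test_split can keep the
# '0'/'1' ratio the same in both splits via its stratify argument. The *_s
# names below are illustrative only.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)
print(y_train_s.count('0'), y_train_s.count('1'), y_test_s.count('0'), y_test_s.count('1'))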
################## K-fold cross-validation of the model
lg_scores_1 = []  # F1 per fold with default hyperparameters
lg_scores_2 = []  # F1 per fold after grid search
X_all = np.array(shuffled_features)
# Recent xgboost releases require integer class labels, so cast the
# string labels '0'/'1' to int up front.
y_all = np.array(shuffled_labels).astype(int)
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(X_all):
    # Split into training and test folds
    X_train, X_test = X_all[train_index], X_all[test_index]
    y_train, y_test = y_all[train_index], y_all[test_index]

    ########################################## Train the XGBoost model
    # (The original comment said "logistic regression", but the model used here
    # is XGBClassifier; the lg_* list names are left over from that version.)
    model = XGBClassifier()  # renamed from "xgb", which shadowed the xgboost module
    model.fit(X_train, y_train)
    # Predict on the test fold
    y_pred = model.predict(X_test)
    # Evaluate
    f1_lg1 = f1_score(y_test, y_pred, average='weighted')
    lg_scores_1.append(f1_lg1)
    ####################### Grid search over hyperparameters
    # 'C' is a LogisticRegression parameter and means nothing to XGBClassifier,
    # so the original grid could not work; tune learning_rate instead. The
    # scoring argument was the undefined name `fscore`; 'f1_weighted' matches
    # the f1_score(average='weighted') used above.
    parameters = {'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1]}
    clf = GridSearchCV(XGBClassifier(), parameters, cv=5, scoring='f1_weighted')  # search for the best parameters
    clf.fit(X_train, y_train)
    print(clf.best_params_)
    y_pred = clf.predict(X_test)
    # Evaluate
    f1_lg2 = f1_score(y_test, y_pred, average='weighted')
    lg_scores_2.append(f1_lg2)
    print(f1_lg1, f1_lg2)  # before tuning vs. after tuning
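####################### Summarize the folds (added suggestion)
# The two lists collect one F1 score per fold but were never aggregated in the
# original; the mean over folds is the usual single number to report.
print('mean F1, default params:', np.mean(lg_scores_1))
print('mean F1, after tuning:  ', np.mean(lg_scores_2))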
This is my first write-up and I'm still pretty confused; if anything in the code looks odd, corrections are welcome.
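One simplification worth noting: the un-tuned half of the K-fold loop can be collapsed into a single scikit-learn call. This is just a sketch of the equivalent, reusing the X_all and y_all arrays from above:

from sklearn.model_selection import cross_val_score
# One F1 score per fold, same 5-fold setup as the manual loop
scores = cross_val_score(XGBClassifier(), X_all, y_all, cv=5, scoring='f1_weighted')
print(scores, scores.mean())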