from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
data = pd.read_excel(r'C:\Users\Administrator\Desktop\HF.xlsx')
target = data['平均日产油']
features = data.iloc[:, 0:]
X, y = features, target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=100)
param_grid = {
'criterion': ['entropy', 'gini'],
'max_depth': [5, 6, 7, 8], # 深度:这里是森林中每棵决策树的深度
'n_estimators': [11, 13, 15], # 决策树个数-随机森林特有参数
'max_features': [0.3, 0.4, 0.5],
# 每棵决策树使用的变量占比-随机森林特有参数(结合原理)
'min_samples_split': [4, 8, 12, 16] # 叶子的最小拆分样本量
}
import sklearn.ensemble as ensemble # ensemble learning: 集成学习
rfc = ensemble.RandomForestClassifier()
rfc_cv = GridSearchCV(estimator=rfc, param_grid=param_grid,
scoring='roc_auc', cv=4)
rfc_cv.fit(X_train, y_train)
# 使用随机森林对测试集进行预测
test_est = rfc_cv.predict(X_test)
print('随机森林精确度...')
print(metrics.classification_report(test_est, y_test))
print('随机森林 AUC...')
fpr_test, tpr_test, th_test = metrics.roc_curve(test_est, y_test)
# 构造 roc 曲线
print('AUC = %.4f' % metrics.auc(fpr_test, tpr_test))
报错信息:
Traceback (most recent call last):
File "D:\PythoProject\BISHE\caogao.py", line 28, in <module>
rfc_cv.fit(X_train, y_train)
File "D:\envir\lib\site-packages\sklearn\model_selection\_search.py", line 874, in fit
self._run_search(evaluate_candidates)
File "D:\envir\lib\site-packages\sklearn\model_selection\_search.py", line 1388, in _run_search
evaluate_candidates(ParameterGrid(self.param_grid))
File "D:\envir\lib\site-packages\sklearn\model_selection\_search.py", line 851, in evaluate_candidates
_warn_or_raise_about_fit_failures(out, self.error_score)
File "D:\envir\lib\site-packages\sklearn\model_selection\_validation.py", line 367, in _warn_or_raise_about_fit_failures
raise ValueError(all_fits_failed_message)
ValueError:
All the 1152 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.
这段代码中出现了一些问题。在从pandas读取数据之后,你需要将数据分成训练集和测试集,但是你没有使用train_test_split函数正确地分割数据。train_test_split函数需要分别将特征和目标数据集分割成训练集和测试集,因此你需要将以下代码:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=100)
改为:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.1, random_state=100)
另外,由于你的目标变量'target'是连续的值,所以你应该使用随机森林回归器RandomForestRegressor而不是随机森林分类器RandomForestClassifier。因此,你需要将以下代码:
import sklearn.ensemble as ensemble
rfc = ensemble.RandomForestClassifier()
改为:
import sklearn.ensemble as ensemble
rfc = ensemble.RandomForestRegressor()
修改完毕后,你可以再次运行代码,应该就能够成功地训练和测试随机森林模型了。
点击复制后,将打开C知道体验页