基本的框架
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
# 读取数据文件
data = pd.read_csv('data.csv')
# 划分特征和目标变量
X = data.drop(['ID', 'CD Account'], axis=1)
y = data['CD Account']
# 划分训练集和验证集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# 建立决策树模型
model = DecisionTreeClassifier()
# 选择最优的模型参数
param_grid = {'max_depth': range(1, 10), 'min_samples_split': range(2, 10)}
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)
# 输出最优参数
print(grid_search.best_params_)
# 使用最优模型进行预测
y_pred = grid_search.predict(X_test)
# 输出模型准确率
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
以下是使用Python分析五千个数据,划分训练集和验证集,建立决策树模型,选择最优决策树的示例代码:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
# 读入数据
data = pd.read_csv('data.csv')
# 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(data.drop('target', axis=1), data['target'], test_size=0.2, random_state=42)
# 建立决策树模型
dtc = DecisionTreeClassifier()
# 设置参数范围
param_grid = {
'criterion': ['gini', 'entropy'],
'max_depth': range(1, 11),
'min_samples_split': range(2, 21, 2),
'min_samples_leaf': range(1, 11),
}
# 使用GridSearchCV选择最优参数
grid_search = GridSearchCV(dtc, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
# 输出最优参数
print('Best parameters:', grid_search.best_params_)
# 使用最优参数训练模型
dtc = DecisionTreeClassifier(**grid_search.best_params_)
dtc.fit(X_train, y_train)
# 验证模型效果
print('Training accuracy:', dtc.score(X_train, y_train))
print('Validation accuracy:', dtc.score(X_val, y_val))
其中,data.csv是包含五千个数据的CSV文件,其中最后一列是目标变量(即分类结果)。在代码中,我们首先使用train_test_split函数将数据划分为训练集和验证集,然后使用GridSearchCV函数选择最优参数,最后使用最优参数训练模型并计算模型在训练集和验证集上的准确率。您可以根据自己的数据和需求进行修改。