使用xgboost模块,决策树预测汽车价格

大佬们,能不能帮我看一下这段代码?它用决策树做预测,但我看不太懂,想请大家帮忙加一下注释,提前万分感谢。
这段决策树预测代码放在 Flask 项目的 app.py 中,使用 xgboost 模块,预测的是汽车的价格。代码可能不完整(我不知道该截取多少),麻烦帮我稍微看一眼,可怜可怜我这个决策树菜鸟吧。

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    The mean squared error comes from ``mean_squared_error``; its square
    root is taken with NumPy.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


# The target is the minimum guide price; read it before the column is removed.
raw_min_price = dataset['最低指导价'].values
# Both guide-price columns are removed so neither remains among the features.
for price_column in ('最低指导价', '最高指导价'):
    del dataset[price_column]
all_x = dataset.values

# The remaining column names are reused as the DMatrix feature names.
df_columns = dataset.columns.values
print('---> cv train to choose best_num_boost_round')
# Train on log(1 + price); predict_car_price maps predictions back with
# np.expm1.
all_y = np.log1p(raw_min_price)

dtrain = xgb.DMatrix(all_x, label=all_y, feature_names=df_columns)

# NOTE(review): recent XGBoost releases document 'reg:linear' and 'silent' as
# deprecated (replaced by 'reg:squarederror' and 'verbosity') -- confirm
# against the installed version before changing them.
xgb_params = {
    'learning_rate': 0.01,
    'max_depth': 4,
    'eval_metric': 'rmse',
    'objective': 'reg:linear',
    'nthread': -1,
    'silent': 1,
    'booster': 'gbtree'
}

# Cross-validate with early stopping; a copy of the parameters is passed so
# the shared dict is left untouched.
cv_result = xgb.cv(
    xgb_params.copy(),
    dtrain,
    num_boost_round=4000,
    early_stopping_rounds=100,
    verbose_eval=100,
    show_stdv=False,
)
# The number of rows in the CV result is used as the number of boosting
# rounds for the final model.
best_num_boost_rounds = len(cv_result)

# Average the train/test RMSE over the labels best-11 .. best-1 (`.loc`
# slicing includes both endpoints). Despite the "logloss" in their names,
# these two variables hold RMSE means.
last_rounds = slice(best_num_boost_rounds - 11, best_num_boost_rounds - 1)
mean_train_logloss = cv_result.loc[last_rounds, 'train-rmse-mean'].mean()
mean_test_logloss = cv_result.loc[last_rounds, 'test-rmse-mean'].mean()
print('best_num_boost_rounds = {}'.format(best_num_boost_rounds))

summary_line = 'mean_train_rmse = {:.7f} , mean_valid_rmse = {:.7f}\n'.format(
    mean_train_logloss, mean_test_logloss)
print(summary_line)
print('---> training on total dataset to predict test and submit')
model = xgb.train(
    xgb_params.copy(),
    dtrain,
    num_boost_round=best_num_boost_rounds,
)

# Feature importance scores from the trained booster, highest score first.
fscore_by_feature = model.get_fscore()
feature_importance = sorted(
    fscore_by_feature.items(), key=lambda pair: pair[1], reverse=True)
print(feature_importance)
print(df_columns)


# The URL rule must declare the <key> variable: without it Flask would call
# the view with no arguments and the required `key` parameter would be missing.
@app.route('/get_all_unique_values/<key>')
def get_all_unique_values(key):
    """Return the distinct values of ``ori_dataset[key]`` as a JSON list.

    Args:
        key: Name taken from the URL path and used to index ``ori_dataset``.

    Returns:
        A JSON response holding the distinct values, with the empty string
        and the placeholder '未知' removed.
    """
    values = list(set(ori_dataset[key]))
    if '' in values:
        values.remove('')
    if '未知' in values:
        values.remove('未知')
    return jsonify(values)


# Each view parameter needs a matching <variable> in the URL rule; a rule made
# only of slashes would never supply the eight arguments this view requires.
@app.route(
    '/predict_car_price/<pinpai>/<pingfen>/<jibie>/<cheshenjiegou>'
    '/<fadongji>/<biansux>/<xuhanglich>/<diandongji>')
def predict_car_price(pinpai, pingfen, jibie, cheshenjiegou, fadongji, biansux, xuhanglich, diandongji):
    """Predict a car price from eight URL path segments.

    Every argument arrives as a string from the URL. ``pingfen`` is parsed
    with ``float``; ``pinpai``, ``jibie``, ``cheshenjiegou`` and ``biansux``
    are looked up in ``brand_map``, ``jibie_map``, ``jiegou_map`` and
    ``biansuxiang_map``; ``fadongji``, ``xuhanglich`` and ``diandongji`` are
    passed to ``fadongji_map``, ``xuhang`` and ``diandongji_map``.

    Returns:
        A JSON object ``{"predict_price": <float>}``. The model output is
        passed through ``np.expm1`` because the training target was
        transformed with ``np.log1p``.

    Raises:
        ValueError: If ``pingfen`` cannot be parsed as a float.
    """
    # NOTE(review): the order of these eight values presumably mirrors
    # df_columns -- confirm against the training data.
    test_x = [
        brand_map[pinpai],
        float(pingfen),
        jibie_map[jibie],
        jiegou_map[cheshenjiegou],
        fadongji_map(fadongji),
        biansuxiang_map[biansux],
        xuhang(xuhanglich),
        diandongji_map(diandongji)
    ]

    # DMatrix takes a 2-D (rows x features) input, so the single sample is
    # wrapped as one row instead of being passed as a flat list.
    sample = np.asarray([test_x], dtype=float)
    dtest = xgb.DMatrix(sample, feature_names=df_columns)
    predict_price = model.predict(dtest)[0]
    predict_price = np.expm1(predict_price)

    return jsonify({
        'predict_price': float(predict_price)
    })

你好,简单地为你写了一下代码注释,希望能帮到你:


# 导入numpy \xgboost \sklearn(主要是评价指标)\flask模块
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from flask import Flask, jsonify

# RMSE evaluation metric.
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    squared_error_mean = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error_mean)

# Take the minimum guide price as the target, then remove both guide-price
# columns so that the remaining columns form the feature matrix all_x.
raw_min_price = dataset['最低指导价'].values
for price_column in ('最低指导价', '最高指导价'):
    del dataset[price_column]
all_x = dataset.values

# Names of the remaining (feature) columns.
df_columns = dataset.columns.values

# Transform the target with log1p, i.e. log(1 + price).
all_y = np.log1p(raw_min_price)

# Wrap features and labels in a DMatrix for XGBoost.
dtrain = xgb.DMatrix(all_x, label=all_y, feature_names=df_columns)

# XGBoost parameters.
# NOTE(review): recent XGBoost releases document 'reg:linear' and 'silent' as
# deprecated (replaced by 'reg:squarederror' and 'verbosity') -- confirm
# against the installed version before changing them.
xgb_params = {
    'learning_rate': 0.01,
    'max_depth': 4,
    'eval_metric': 'rmse',
    'objective': 'reg:linear',
    'nthread': -1,
    'silent': 1,
    'booster': 'gbtree'
}

# Cross-validation with early stopping, used to choose the number of
# boosting rounds; a copy of the parameters is passed each time.
cv_result = xgb.cv(
    xgb_params.copy(),
    dtrain,
    num_boost_round=4000,
    early_stopping_rounds=100,
    verbose_eval=100,
    show_stdv=False,
)
# The number of rows in the CV result is used as the round count.
best_num_boost_rounds = len(cv_result)

# Mean train/test RMSE over the labels best-11 .. best-1 (`.loc` slicing
# includes both endpoints). Despite the "logloss" in their names, these two
# variables hold RMSE means.
last_rounds = slice(best_num_boost_rounds - 11, best_num_boost_rounds - 1)
mean_train_logloss = cv_result.loc[last_rounds, 'train-rmse-mean'].mean()
mean_test_logloss = cv_result.loc[last_rounds, 'test-rmse-mean'].mean()
print('best_num_boost_rounds = {}'.format(best_num_boost_rounds))

# Report the averaged training and validation errors.
summary_line = 'mean_train_rmse = {:.7f} , mean_valid_rmse = {:.7f}\n'.format(
    mean_train_logloss, mean_test_logloss)
print(summary_line)

# Train the final model on the full data, then list the feature importance
# scores from highest to lowest.
model = xgb.train(
    xgb_params.copy(),
    dtrain,
    num_boost_round=best_num_boost_rounds,
)
fscore_by_feature = model.get_fscore()
feature_importance = sorted(
    fscore_by_feature.items(), key=lambda pair: pair[1], reverse=True)
print(feature_importance)
print(df_columns)

# Flask route: list the distinct values stored under ori_dataset[key].
@app.route('/get_all_unique_values/<key>')
def get_all_unique_values(key):
    """Return the distinct values of ``ori_dataset[key]`` as a JSON list.

    The empty string and the placeholder '未知' are left out of the result.
    """
    placeholders = ('', '未知')
    distinct = set(ori_dataset[key])
    kept = [value for value in distinct if value not in placeholders]
    return jsonify(kept)

代码的后半部分主要是用 Flask 定义了接口:接收请求后,用训练好的模型进行预测并返回结果。

https://blog.csdn.net/kaxiaokui/article/details/105156784

你把源码发给我测试一下

不知道你这个问题是否已经解决, 如果还没有解决的话:

如果你已经解决了该问题, 非常希望你能够分享一下解决方案, 写成博客, 将相关链接放在评论区, 以帮助更多的人 ^-^