from sklearn.model_selection import KFold
import os
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from IPython.display import clear_output as clear
print('Loading data...')
# Load the data
path1 = r"D:\data\daqiyaogan\LightGBM数据文件\单独测试\删除异常值.csv"
test_file = r"D:\data\daqiyaogan\LightGBM数据文件\单独测试\删除异常值_2015.csv"
txt_path = r"D:\data\daqiyaogan\LightGBM数据文件\单独测试\2015.txt"
# Header row for the results file
content_ls = ['learning_rate', 'max_depth', 'num_leaves', 'RMSE', 'MAE', 'R2', 'RMSE_2015', 'MAE_2015', 'R2_2015']
if os.path.exists(txt_path):
    os.remove(txt_path)
with open(txt_path, 'w') as f:
    f.write(','.join(content_ls) + '\n')
df = pd.read_csv(path1)
Y = df['pm10_mean'].values
X = df.drop(['time', 'pm2_5', 'site', 'pm10_mean'],axis = 1).values
x_train,x_test,y_train,y_test = model_selection.train_test_split(X,Y,test_size = 0.1,random_state = 33)
test_df = pd.read_csv(test_file)
test_x = test_df.drop(['time', 'pm2_5', 'site', 'pm10_mean'],axis = 1).values
test_y = test_df['pm10_mean'].values
print('Starting training...')
# Model training
max_depth = 10
num_leaves = 490
learning_rate = 0.1
n_estimators = 850
min_child_samples = 1
min_child_weight = 0.001
feature_fraction = 1
lambda_l1 = 0
drop_rate = 0.1
lambda_l2 = 0
subsample = 1
max_drop = 100
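# Note: drop_rate and max_drop are DART-only parameters; with the default
# boosting_type='gbdt' used here, LightGBM ignores them.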
print('Validating STLightBDT model with learning_rate={}, max_depth={}, num_leaves={}'.format(round(learning_rate, 3), max_depth, num_leaves))
gbm = lgb.LGBMRegressor(num_leaves=num_leaves,
                        learning_rate=learning_rate,
                        max_depth=max_depth,
                        min_child_samples=min_child_samples,
                        min_child_weight=min_child_weight,
                        feature_fraction=feature_fraction,
                        extra_tree=False,
                        first_metric_only=True,
                        drop_rate=drop_rate,
                        lambda_l1=lambda_l1,
                        lambda_l2=lambda_l2,
                        subsample=subsample,
                        max_drop=max_drop,
                        n_estimators=n_estimators)
gbm.fit(x_train, y_train,
        eval_set=[(x_test, y_test)],
        eval_metric='l1',
        early_stopping_rounds=20)
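# Note (assumption about the installed LightGBM version): in LightGBM >= 4.0 the
# early_stopping_rounds argument was removed from fit(); the equivalent there is to
# pass a callback instead:
#   gbm.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric='l1',
#           callbacks=[lgb.early_stopping(stopping_rounds=20)])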
clear()
print('Starting predicting...')
def calc(real, pred):
    rmse = round(sqrt(mean_squared_error(real, pred)), 3)
    mae = round(mean_absolute_error(real, pred), 3)
    r2 = round(r2_score(real, pred), 3)
    return rmse, mae, r2
# Model prediction
y_pred = gbm.predict(x_test, num_iteration = gbm.best_iteration_)
rmse, mae, r2 = calc(y_test, y_pred)
print('Hold-out validation set: RMSE={}, MAE={}, R2={}'.format(rmse, mae, r2))
# Model evaluation on the separate 2015 test file
y_pred = gbm.predict(test_x, num_iteration = gbm.best_iteration_)
rmse, mae, r2 = calc(test_y, y_pred)
print('2015 test set: RMSE={}, MAE={}, R2={}'.format(rmse, mae, r2))
I change the data and run the code above once per year, but the resulting R2 does not increase year by year, which does not seem right. The output is as follows:
2015 test set: RMSE=21.148, MAE=14.754, R2=0.902
2017 test set: RMSE=18.222, MAE=12.74, R2=0.916
2018 test set: RMSE=18.281, MAE=12.317, R2=0.908
2019 test set: RMSE=16.447, MAE=11.404, R2=0.903
2020 test set: RMSE=15.908, MAE=11.174, R2=0.889
The first point worth adding is that RMSE and MAE are fairly strongly related to each other, but only loosely related to R2. Strictly speaking, RMSE, MAE and R2 are three separate metrics, and a lower RMSE does not automatically mean a higher R2: since R2 = 1 - MSE / Var(y_test), a test year whose PM10 values have smaller variance can show a worse R2 even though its RMSE is smaller. Another issue is that the question does not spell out how the training and test data are split; for now I will assume there are six years of data in total, 2015-2020. My reconstruction of where the reported results come from is a leave-one-year-out scheme: train on 2016-2020 => test on 2015; train on [2015-2016, 2018-2020] => test on 2017; train on [2015-2017, 2019-2020] => test on 2018; train on [2015-2018, 2020] => test on 2019; train on [2015-2019] => test on 2020.
If that reconstruction is correct, the feature distribution of the data offers an explanation: the less "future" data (years later than the test year) the training set contains, the better the model fits the test year. In other words, the data have a clear temporal character: the past is informative about the future, but the future should not be used to explain the past, so the time ordering of the data should be taken into account when building the model.
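As a minimal sketch of a time-respecting split under these assumptions (the combined CSV from the question, a time column that pandas can parse as dates, and an illustrative choice of test year; none of this is guaranteed to match the original setup):

import pandas as pd
import lightgbm as lgb
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

df = pd.read_csv(r"D:\data\daqiyaogan\LightGBM数据文件\单独测试\删除异常值.csv")
df['year'] = pd.to_datetime(df['time']).dt.year   # assumes 'time' parses as a date

test_year = 2020                                  # illustrative: evaluate on the last year
train_df = df[df['year'] < test_year]             # train strictly on earlier years
test_df = df[df['year'] == test_year]

drop_cols = ['time', 'pm2_5', 'site', 'pm10_mean', 'year']
x_train, y_train = train_df.drop(columns=drop_cols).values, train_df['pm10_mean'].values
x_test, y_test = test_df.drop(columns=drop_cols).values, test_df['pm10_mean'].values

model = lgb.LGBMRegressor(n_estimators=850, learning_rate=0.1, max_depth=10, num_leaves=490)
model.fit(x_train, y_train)
pred = model.predict(x_test)
print('RMSE={:.3f}, MAE={:.3f}, R2={:.3f}'.format(
    sqrt(mean_squared_error(y_test, pred)),
    mean_absolute_error(y_test, pred),
    r2_score(y_test, pred)))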
If instead the reported results come from training on 2015 and testing on 2015, training on 2016 and testing on 2016, ..., training on 2020 and testing on 2020, then what this shows is simply that the model fits different years with different quality. I would suggest computing summary statistics of each year's data (to get a sense of how complex each year is) and applying a single common mapping (or standardization) to the data.
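A short sketch of that suggestion under the same assumptions (parseable time column; StandardScaler is just one possible common mapping):

import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.read_csv(r"D:\data\daqiyaogan\LightGBM数据文件\单独测试\删除异常值.csv")
df['year'] = pd.to_datetime(df['time']).dt.year

# Per-year distribution of the target: count, mean, std, quartiles, min/max
print(df.groupby('year')['pm10_mean'].describe())

# Put the feature columns on a common scale (fit on all years together)
feature_cols = df.columns.difference(['time', 'pm2_5', 'site', 'pm10_mean', 'year'])
scaler = StandardScaler()
df[feature_cols] = scaler.fit_transform(df[feature_cols])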