# Separate the features from the target variable.
# y must hold the target column's values and X the feature columns' data.
# Using the column-name string as y (5 characters) and a DataFrame built from
# the list of column names as X (one row per name) is what made
# train_test_split raise "inconsistent numbers of samples".
y = data['23h结果']
X = data.drop(columns=['23h结果'])
# LightGBM model parameters
params = {'boosting_type': 'gbdt',
          'objective': 'regression',
          # MAE is intended as the default evaluation metric.
          # NOTE(review): 'r2' does not appear to be a built-in LightGBM
          # metric name -- confirm against the LightGBM parameter docs.
          'metric': ('mae', 'mse', 'r2'),
          'num_leaves': 31,
          'learning_rate': 0.05,
          'feature_fraction': 0.9}
# Sequential forward feature selection
selected_features = []
score_list = []
for i in range(len(X.columns)):
    scores = []
    for feature in X.columns:
        if feature not in selected_features:
            # Candidate set: everything selected so far plus this feature
            features = selected_features + [feature]
            X_selected = X[features]
            # Split into training and test sets
            X_train, X_test, y_train, y_test = train_test_split(
                X_selected, y, test_size=0.2, random_state=42)
            lgb_train = lgb.Dataset(X_train, y_train)
            lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)
            # Train the model and predict on the held-out split.
            # NOTE(review): newer LightGBM releases (4.0+) replace the
            # early_stopping_rounds / verbose_eval keywords with callbacks;
            # confirm against the installed version.
            gbm = lgb.train(params, lgb_train, num_boost_round=1000,
                            valid_sets=lgb_test, early_stopping_rounds=10,
                            verbose_eval=False)
            y_pred = gbm.predict(X_test)
运行上述代码时,出现了如下错误
Traceback (most recent call last):
File "C:/Users/86187/PycharmProjects/pythonProject10/23h预测.py", line 49, in <module>
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
File "D:\py\anaconda3\lib\site-packages\sklearn\model_selection\_split.py", line 2559, in train_test_split
arrays = indexable(*arrays)
File "D:\py\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 443, in indexable
check_consistent_length(*result)
File "D:\py\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 397, in check_consistent_length
raise ValueError(
ValueError: Found input variables with inconsistent numbers of samples: [18, 5]
具体的解决方法是什么?
根据报错信息,输入变量的样本数量不一致:X_selected 有 18 行,而 y 的长度为 5。原因是在分离特征和目标变量时,y 被赋值为列名字符串 '23h结果'(恰好 5 个字符),而 X 是由列名列表构成的 DataFrame(每个列名占一行,共 18 行),两者都不是 data 中的实际数据。应该修改为:
# Split the dataset into the target column and the feature columns
y = data['23h结果']
X = data.drop(columns=['23h结果'])
【以下回答由 GPT 生成】
这个错误并不是因为 train_test_split 函数不接受 DataFrame——它可以直接处理 DataFrame、Series 和数组。
真正的原因是原代码中 y 是列名字符串、X 是由列名组成的 DataFrame,二者的样本数量不一致。解决这个错误的方法是从 data 中取出真正的目标列和特征列;将它们转换为数组形式是可选的。
以下是修改后的代码示例:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
# Split the dataset into the target vector and the feature table
y = data['23h结果']
X = data.drop(columns=['23h结果'])
# Hyper-parameters handed to LightGBM
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric=('mae', 'mse', 'r2'),  # MAE is intended as the default metric
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
)
# Forward sequential feature selection
selected_features = []
score_list = []
for i in range(X.shape[1]):
    scores = []
    for feature in X.columns:
        # Features that were already picked are not candidates again
        if feature in selected_features:
            continue
        features = [*selected_features, feature]
        X_selected = X[features].values  # candidate features as an array
        y_selected = y.values  # target as an array
        # Hold out 20% of the rows as the test split
        X_train, X_test, y_train, y_test = train_test_split(
            X_selected, y_selected, test_size=0.2, random_state=42
        )
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)
        # Fit the booster, then predict on the held-out rows
        gbm = lgb.train(
            params,
            lgb_train,
            num_boost_round=1000,
            valid_sets=lgb_test,
            early_stopping_rounds=10,
            verbose_eval=False,
        )
        y_pred = gbm.predict(X_test)
请注意,上述代码中使用 values 属性将 DataFrame 转换为数组形式,这一步是可选的。
真正解决 train_test_split 函数抛出的错误的,是把 y 和 X 改为从 data 中取出的实际数据列(y = data['23h结果'],X = data.drop(['23h结果'], axis=1))。
【相关推荐】