I am training with sklearn and lightGBM, and a problem occurs when running the randomized search:
import random
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import numpy as np
import sys
import os
from osgeo import gdal
import joblib
# 1. Randomly pick a few blocks to build the training set
res = []  # block labels, list of str
# build the full list of block labels
for i in range(0, 7):
    for j in range(0, 16):
        res.append(str(i) + '_' + str(j))
def sample_block_to_name(seq, block):
    """
    Randomly pick `block` indices and return the corresponding block names.
    :param seq: list of available block indices
    :param block: number of blocks to sample
    :return: block names and the remaining indices
    """
    num = random.sample(seq, block)
    name = []
    for k in num:
        name.append(res[k])
        seq.remove(k)
    return name, seq
seq_original = list(range(0, 112))  # indices of all 7 x 16 = 112 blocks
name_original = []                  # names of all blocks, same order as res
for i in range(0, 7):
    for j in range(0, 16):
        name_original.append(str(i) + '_' + str(j))
name, seq_new = sample_block_to_name(seq_original, 70)
print(name)
print(seq_new)
# Read a tif file; used later in tif_to_array
def Read_img2array(img_file_path):
    """
    Read a raster file and convert it to an array.
    img_file_path: path to the raster file
    :return: the raster converted to a numpy array
    """
    dataset = gdal.Open(img_file_path)  # open the raster
    # make sure the file was actually opened
    if dataset is None:
        print('Unable to open *.tif')
        sys.exit(1)  # abort
    # read the whole dataset
    img_array = dataset.ReadAsArray()
    return img_array
# Convert the selected feature blocks (tif) into one array
def tif_to_array(path, name):
    """
    Convert all selected feature blocks into a single 2-D array (pixels x features).
    :param path: directory containing one sub-directory per block (not the .tif file itself)
    :param name: list of block names
    :return: stacked feature array
    """
    pathDir = os.listdir(path)
    result = [[0] * 15] * 1  # placeholder row; assumes 15 feature rasters per block
    for x in name:
        for y in pathDir:  # sub-directories named 0_1, 0_2, ...
            if x == y:
                pathDir_y = os.listdir(path + "/" + y)
                arr = np.zeros((41 * 36, 1), dtype=int)  # placeholder column; assumes 41 x 36 pixels per block
                for i in pathDir_y:
                    index = i.rfind('.')
                    if i[index:] == '.tif':
                        tif = Read_img2array(path + "/" + y + "/" + i)
                        mul = np.array(tif).reshape(-1, 1)  # flatten each band to one column
                        arr = np.column_stack((arr, mul))
                arr = arr[:, 1:]  # drop the placeholder column -> feature matrix of this block
                result = np.concatenate((result, arr), axis=0)
    result = result[1:, :]  # drop the placeholder row
    return result
# Convert the selected label blocks (tif) into one array
def tif_to_array_label(path, name):
    pathdir = os.listdir(path)
    rest = [[0] * 1] * 1  # placeholder row, dropped at the end
    for x in name:
        for y in pathdir:
            if y == 'dilei' + x + '.tif':  # label rasters are named dilei<block>.tif
                tif = Read_img2array(path + '/' + y)
                tif = tif.reshape(-1, 1)
                rest = np.concatenate((rest, tif), axis=0)
    rest = rest[1:, :].ravel()  # drop the placeholder and flatten to 1-D labels
    return rest
####################################################
iteration = 0
while iteration <= 5:
    # training data/labels come from the sampled blocks; the test set covers all blocks
    X_train = tif_to_array("D:\\Personality\\paper\\GBDT\\testshiyan\\train", name)
    y_train = tif_to_array_label("D:\\Personality\\paper\\GBDT\\testshiyan\\label", name)
    X_test = tif_to_array("D:\\Personality\\paper\\GBDT\\testshiyan\\train", name_original)
    y_test = tif_to_array_label("D:\\Personality\\paper\\GBDT\\testshiyan\\label", name_original)
    # 2. Randomized search for the best hyper-parameters
    clf = lgb.LGBMClassifier()  # LightGBM with default parameters
    param_test = {'n_estimators': list(range(200, 1000, 100)), 'num_leaves': list(range(10, 40, 2)),
                  'learning_rate': list(np.arange(0.1, 1, 0.1)), 'max_depth': list(range(-1, 50, 1))}  # search space; LightGBM requires learning_rate > 0
    rand = RandomizedSearchCV(clf, param_test, scoring='accuracy', n_iter=10,
                              random_state=5, n_jobs=1)  # randomized search, default 5-fold CV
    rand.fit(X_train, y_train)  # fit the search
    print('Randomized search - best parameters:', rand.best_params_)  # dict of the parameter values at the best score
    print('Randomized search - score on train:', rand.score(X_train, y_train))
    print('Randomized search - score on test:', rand.score(X_test, y_test))
    # 3. Train the model with the best parameters
    params = dict(rand.best_params_)  # copy as a plain dict
    setting = {"boosting_type": 'gbdt', "subsample": 1.0}
    params.update(setting)  # merge in the fixed settings
    clf_new = lgb.LGBMClassifier(**params)  # build the model
    clf_new.fit(X_train, y_train)  # fit
    # 4. Predict and store the results
    y_gbr = clf_new.predict(X_test)  # predict on the test set
    accuracy = accuracy_score(y_test, y_gbr)  # overall accuracy
    print("Accuracy:")
    print(accuracy)
    # 5. Find the blocks with the lowest accuracy
    n = 41 * 36  # pixels per block
    acc = []
    for i in range(0, len(y_test), n):  # per-block accuracy, in the order of name_original
        y_test1 = y_test[i:i + n]
        y_gbr1 = y_gbr[i:i + n]
        acc1 = accuracy_score(y_test1, y_gbr1)
        acc.append(acc1)
    sort_new = np.argsort(acc)  # block indices, lowest accuracy first
    sor_new = []
    for i in sort_new:
        for j in seq_new:
            if i == j:  # keep only blocks that are not yet in the training set
                sor_new.append(i)
    # 6. Add the 5 worst remaining blocks to the training set
    for i in sor_new[0:5]:
        name.append(res[i])
        seq_new.remove(i)
    joblib.dump(clf_new, f'train_model_result{iteration}.m')  # save this iteration's model
    iteration += 1
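In case it helps to localize the failure, here is a minimal sketch (my addition, not part of the pipeline above): it runs the same RandomizedSearchCV configuration on synthetic data from sklearn's make_classification instead of the raster blocks, and passes error_score='raise' so scikit-learn re-raises the real exception from a failed fit instead of scoring it as NaN. The feature count (15) and class count (3) are assumptions; if this sketch runs cleanly, the problem is more likely in the arrays produced by tif_to_array / tif_to_array_label than in the search itself.

# Minimal sketch: same search setup, synthetic stand-in data (assumed 15 features, 3 classes)
import numpy as np
import lightgbm as lgb
from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV

X, y = make_classification(n_samples=500, n_features=15, n_informative=8,
                           n_classes=3, random_state=5)
param_test = {'n_estimators': list(range(200, 1000, 100)),
              'num_leaves': list(range(10, 40, 2)),
              'learning_rate': list(np.arange(0.1, 1, 0.1)),  # LightGBM requires learning_rate > 0
              'max_depth': list(range(-1, 50, 1))}
rand = RandomizedSearchCV(lgb.LGBMClassifier(), param_test, scoring='accuracy',
                          n_iter=10, random_state=5, n_jobs=1,
                          error_score='raise')  # surface the underlying traceback from failed fits
rand.fit(X, y)
print(rand.best_params_)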