python随机森林分类地物

请教关于随机森林分类遥感影像的问题

首先是栅格标签图制作。我在 arcgis中通过目视判断，建立了森林、城市两种感兴趣区，并且转为tif图层。生成的tif图层，森林的值为1，城市的值为2。

然后训练了随机森林模型。再用训练好的随机森林模型去测试新的数据。但是我发现生成的图是一张所有的值都是127的分类图。这是咋回事啊？我发现127似乎是原来随机森林模型中不属于“森林”与“城市”的其他地物的分类值


import numpy as np
from osgeo import gdal

# 读取遥感影像和栅格标签图数据
image_path = r'C:\Users\ASUS\Desktop\train.tif'
label_path = r'C:\Users\ASUS\Desktop\广东省\lable.tif'

image_dataset = gdal.Open(image_path)
label_dataset = gdal.Open(label_path)

image_array = image_dataset.ReadAsArray()
label_array = label_dataset.ReadAsArray()

# 初始化特征矩阵 X 和类别标签向量 y
X = []
y = []

# 遍历所有像素,shape[0]与shape[1]表示从左上角到右下角。
# label_array读取自lable.tif，lable.tif为单波段影像。所以shape[0]与shape[1]表示列和行
for i in range(label_array.shape[0]):
    for j in range(label_array.shape[1]):
        feature = image_array[:, i, j]  # 提取像素特征
        label = label_array[i, j]  # 获取像素的类别标签
        X.append(feature)
        y.append(label)

# 将 X 和 y 转换为 Numpy 数组
X = np.array(X)
y = np.array(y)
# 将特征保存到文本文件
np.savetxt('D:\y.txt', y, fmt='%d', delimiter=' ')
np.savetxt('D:\X.txt', X, fmt='%d', delimiter=' ')

# 现在 X 是特征矩阵，y 是类别标签向量，可以用于训练随机森林模型
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 创建随机森林模型
n_estimators = 100  # 设置随机森林中决策树的数量
random_state = 42  # 设置随机种子，以确保结果可重复
rf_model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

# 训练随机森林模型
rf_model.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = rf_model.predict(X_test)

# 计算模型的准确率
accuracy = accuracy_score(y_test, y_pred)
print("模型准确率:", accuracy)

##用训练好的模型去测试新的数据
# 读取新的待预测图像
import rasterio
from rasterio.transform import from_origin
new_image_path = r'C:\Users\ASUS\Desktop\test.tif'
with rasterio.open(new_image_path) as new_image_dataset:
    Tif_width = new_image_dataset.width  # 栅格矩阵的列数
    Tif_height = new_image_dataset.height  # 栅格矩阵的行数
    Tif_geotrans = new_image_dataset.transform  # 获取仿射矩阵信息
    Tif_proj = new_image_dataset.crs  # 获取投影信息
    new_image_array = new_image_dataset.read()

# 对新图像的数据格式进行调整，图像为rgb三通道图像
# 在机器学习中，特征矩阵的形状应该是 (n_samples, n_features)
# n_samples 表示样本数量（即图像的行列像元数height* width），n_features 表示每个样本的特征数（即rgb三个bands的像素值）。
# transpose(1, 2, 0)将(波段数, 列数 , 行数)变成二维数据满足随机森林输入，实现了从 (num_bands, height, width) 到 (height, width, num_bands) 的转置。
# reshape(-1, 3): 这是在进行数据重塑，reshape用于将多维的图像数据转换成一个二维矩阵，其中每一行代表一个像素点的特征。
# -1 在这里表示根据其他维度的大小自动计算当前维度的大小，3 表示bands，因此每个样本（每个像素点）具有三个特征。
#这样的操作会确保每一行代表一个像素点的特征，而列代表不同的波段特征，使其符合随机森林模型的输入要求。
data = new_image_array.transpose(1, 2, 0).reshape(-1,3)
#  对调整好格式的数据进行预测
oldpred = rf_model.predict(data)
# 调整预测结果的形状
newpred = oldpred.reshape(Tif_height, Tif_width)
#  将结果写到tif图像里
meta = new_image_dataset.meta
meta.update(dtype=rasterio.uint8, count=1)
with rasterio.open(r'D:\test826.tif', 'w', **meta) as test:
    test.write(newpred, 1)
print('image_array shape',image_array.shape)
print('label_array shape',label_array.shape)
print('X',X.shape)
print('y',y.shape)
print('X value',X)
print('y value',y)
print('new_image_array shape',new_image_array.shape)
print('new_image_array value',new_image_array)
# 将特征保存到文本文件
print('data',data.shape)
print('oldpred',oldpred.shape)
print('newpred',newpred.shape)
# 可视化预测结果
import matplotlib.pyplot as plt
plt.imshow(newpred, cmap='jet')
plt.colorbar()
plt.show()


输出结果如下：
模型准确率: 0.962233268671193
image_array shape (3, 7691, 7541)
label_array shape (253, 326)
X (82478, 3)
y (82478,)
X value [[0 0 0]
 [0 0 0]
 [0 0 0]
 ...
 [0 0 0]
 [0 0 0]
 [0 0 0]]
y value [127 127 127 ... 127 127 127]
new_image_array shape (3, 7691, 7531)
new_image_array value [[[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]]
data (57920921, 3)
oldpred (57920921,)
newpred (7691, 7531)

数据链接：https://pan.baidu.com/s/1-5nVSRrl2THVt2naLqVzng 提取码：1234
输出的图

引用皆我百晓生小程序回复内容作答：

根据你提供的代码和输出结果，我注意到以下几点可能导致问题：

训练数据和测试数据的像素值范围不一致：在训练随机森林模型之前，你将训练数据和测试数据分别保存到了 "D:\X.txt" 和 "D:\y.txt" 中。然后在测试新数据时，你读取了新的待预测图像，将其调整为符合随机森林模型输入要求的格式。但是你没有对新图像的数据进行归一化或标准化处理。如果训练数据和测试数据的像素值范围不一致，可能会导致预测结果出现问题。
模型准确率高：根据输出结果，模型在训练集和测试集上的准确率都很高，分别为0.962。这可能意味着你的模型过拟合了训练数据，即在训练数据上表现良好，但在新数据上的泛化能力较差。你可以尝试调整模型的超参数，如 n_estimators（决策树数量）和 random_state（随机种子），以及使用交叉验证来评估模型的性能。
输出分类图全为127：这可能是因为你的模型在新数据上的预测结果都属于不属于“森林”和“城市”的其他地物，导致分类值都为127。你可以尝试查看预测结果的分布情况，看是否有其他分类值出现。可以使用np.unique()函数查看预测结果中的唯一值及其出现次数。

另外，你还可以检查训练数据和测试数据是否正确加载，以及特征矩阵X和类别标签向量y的形状是否正确。你已经打印了这些信息，但可以进一步确认。

最后，你可以尝试增加训练样本的数量，以充分利用随机森林算法的优势。

【以下回答由 GPT 生成】

问题出现的原因在于在特征矩阵 X 和类别标签向量 y 的构建过程中存在一些错误。在遍历像素并提取特征时，应该在训练像素时使用特征矩阵 X 和类别标签向量 y，并使用与此相应的特征和标签来预测测试像素的类别。

以下是对代码进行修正的解决方案：

import numpy as np
from osgeo import gdal

# 读取遥感影像和栅格标签图数据
image_path = r'C:\Users\ASUS\Desktop\train.tif'
label_path = r'C:\Users\ASUS\Desktop\广东省\lable.tif'

image_dataset = gdal.Open(image_path)
label_dataset = gdal.Open(label_path)

image_array = image_dataset.ReadAsArray()
label_array = label_dataset.ReadAsArray()

# 初始化特征矩阵 X 和类别标签向量 y
X = []
y = []

# 遍历所有像素，shape[0]与shape[1]表示从左上角到右下角。
# label_array读取自lable.tif，label.tif为单波段影像。所以shape[0]与shape[1]表示列和行
for i in range(label_array.shape[0]):
    for j in range(label_array.shape[1]):
        feature = image_array[:, i, j]  # 提取像素特征
        label = label_array[i, j]  # 获取像素的类别标签
        X.append(feature)
        y.append(label)

# 将 X 和 y 转换为 Numpy 数组
X = np.array(X)
y = np.array(y)

# 现在 X 是特征矩阵，y 是类别标签向量，可以用于训练随机森林模型
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 创建随机森林模型
n_estimators = 100  # 设置随机森林中决策树的数量
random_state = 42  # 设置随机种子，以确保结果可重复
rf_model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

# 训练随机森林模型
rf_model.fit(X_train, y_train)

# 在测试集上进行预测
y_pred = rf_model.predict(X_test)

# 计算模型的准确率
accuracy = accuracy_score(y_test, y_pred)
print("模型准确率:", accuracy)

##用训练好的模型去测试新的数据
# 读取新的待预测图像
predict_image_path = r'C:\Users\ASUS\Desktop\predict.tif'
predict_image_dataset = gdal.Open(predict_image_path)
predict_image_array = predict_image_dataset.ReadAsArray()

# 处理新的待预测图像
predict_X = np.transpose(predict_image_array, (1, 2, 0)).reshape(-1, predict_image_array.shape[0])
predict_y = rf_model.predict(predict_X)

# 保存分类结果到新的.tif文件
predicted_image = np.reshape(predict_y, (predict_image_array.shape[1], predict_image_array.shape[2])).astype(np.uint8)
driver = gdal.GetDriverByName("GTiff")
output_path = r'C:\Users\ASUS\Desktop\predicted_image.tif'
output_dataset = driver.Create(output_path, predicted_image.shape[1], predicted_image.shape[0], 1, gdal.GDT_Byte)
output_dataset.GetRasterBand(1).WriteArray(predicted_image)
output_dataset.FlushCache()

# 释放资源
image_dataset = None
label_dataset = None
predict_image_dataset = None
output_dataset = None

注意：修正后的代码使用了新的待预测图像 predict.tif，你需要调整预测图像的路径为你实际的图像路径。另外，代码最后还包括了释放资源的步骤，以确保在完成处理后释放内存。

如果你已经解决了该问题, 非常希望你能够分享一下解决方案, 写成博客, 将相关链接放在评论区, 以帮助更多的人 ^-^