Choosing k in the kNN algorithm and the overfitting problem — a classification overfitting demo
import numpy as np
import warnings
from sklearn import neighbors
# Build the experiment data
def createDataSet():
    # [activity, attention] scores for 13 students, plus their class labels
    dataSet = np.array([[19, 30], [30, 40], [39, 47], [40, 52], [47, 50], [50, 55], [60, 60],
                        [62, 65], [73, 70], [75, 82], [77, 85], [90, 95], [92, 90]])
    labels = ['0', '0', '0', '0', '0', '0', '1', '1', '1', '1', '1', '1', '1']
    return dataSet, labels

def knnClassifier(dataSet, labels, testData, k):
    knn = neighbors.KNeighborsClassifier(k)  # build the kNN classifier with n_neighbors=k
    knn.fit(dataSet, labels)                 # fit it on the training data
    return knn.predict([testData])
# Run the demo
if __name__ == "__main__":
    dataSet, labels = createDataSet()  # feature-space samples: activity data for the 13 students
    warnings.filterwarnings('ignore')  # suppress warning output (optional)
    # prediction for the test sample with k=3
    pred1 = knnClassifier(dataSet, labels, [55, 65], 3)
    print('Predicted class with k=3:', pred1)
    # prediction for the test sample with k=7
    pred2 = knnClassifier(dataSet, labels, [55, 65], 7)
    print('Predicted class with k=7:', pred2)
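To make the effect of k more visible than two single predictions, a small sketch (an addition, not part of the original post) scores several values of k with leave-one-out cross-validation on the same 13 samples; it reuses the createDataSet() above, and the choice of k values is arbitrary (k must stay below the 12-sample training size under leave-one-out):

from sklearn.model_selection import cross_val_score, LeaveOneOut

dataSet, labels = createDataSet()
for k in (1, 3, 5, 7, 9, 11):
    knn = neighbors.KNeighborsClassifier(n_neighbors=k)
    acc = cross_val_score(knn, dataSet, labels, cv=LeaveOneOut()).mean()  # leave-one-out accuracy
    print(f'k={k}: leave-one-out accuracy = {acc:.2f}')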
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
def createDataSet():
    # Note: if everything is run in one session, this redefines the createDataSet() above.
    data1 = np.array(
        [[19, 30], [30, 40], [39, 47], [40, 52], [47, 50], [50, 55], [60, 60], [62, 65], [73, 70],
         [75, 82], [77, 85], [90, 95], [92, 90]])
    X = data1[:, :1]   # single feature: activity score
    y = data1[:, 1:2]  # regression target: attention score
    return X, y

def knnRegressor(X, y, k):
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X, y)
    y_pred = knn.predict(X)  # predictions at the training points themselves
    knn.score(X, y)          # training R^2 (computed but unused in the original)
    return y_pred
if __name__ == "__main__":
    X, y = createDataSet()
    y_pred1 = knnRegressor(X, y, 3)  # fitted values with k=3
    y_pred2 = knnRegressor(X, y, 5)  # fitted values with k=5
    # SimSun font (the original labels were Chinese); the path assumes a default Windows install
    zhfont = FontProperties(fname='C:/Windows/Fonts/simsun.ttc', size=12)
    fig = plt.figure(figsize=(6, 4.5), dpi=80)
    ax = plt.subplot(111)
    p1 = ax.scatter(X, y, marker='o', c='blue', edgecolors='g', label='1', s=40)
    p2, = ax.plot(X, y_pred1, '-.')  # k=3 fit
    p3, = ax.plot(X, y_pred2, '--')  # k=5 fit
    x = np.linspace(0, 100, 4)
    y = np.linspace(0, 100, 4)
    plt.xticks(x, ('Not active', 'Average', 'Active', 'Very active'), color='blue', rotation=60, fontproperties=zhfont)
    plt.yticks(y, ('Not attentive', 'Average', 'Attentive', 'Very attentive'), color='red', fontproperties=zhfont)
    plt.xlabel('Performance in school activities', fontproperties=zhfont)
    plt.ylabel('Attention to activity announcements', fontproperties=zhfont)
    ax.legend([p1, p2, p3], ['Feature-space samples', 'k=3', 'k=5'], loc='upper left', prop=zhfont)
    plt.show()
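The fitted curves above are only evaluated at the 13 training abscissas. A short sketch (an addition, not in the original post) queries the fitted regressor on a dense grid of query points instead, which makes the piecewise-constant shape of the kNN regression function easier to see; the 200-point grid is an arbitrary choice:

X, y = createDataSet()  # the single-feature regression data defined above
grid = np.linspace(X.min(), X.max(), 200).reshape(-1, 1)  # dense grid of query points
for k, style in ((3, '-.'), (5, '--')):
    knn = KNeighborsRegressor(n_neighbors=k).fit(X, y.ravel())
    plt.plot(grid, knn.predict(grid), style, label=f'k={k}')
plt.scatter(X, y, c='blue', edgecolors='g', s=40, label='Feature-space samples')
plt.legend(loc='upper left')
plt.show()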
if __name__ == "__main__":
    X, y = createDataSet()
    y_pred = knnRegressor(X, y, 3)
    y_pred1 = knnRegressor(X, y, 10)
    y_pred2 = knnRegressor(X, y, 13)
    # Plot the fitted curves
    zhfont = FontProperties(fname='C:/Windows/Fonts/simsun.ttc', size=12)
    fig = plt.figure(figsize=(6, 4.5), dpi=80)
    ax = plt.subplot(111)
    p1 = ax.scatter(X, y, marker='o', c='blue', edgecolors='g', label='1', s=40)
    p2, = ax.plot(X, y_pred, '-')    # k=3 fit
    p3, = ax.plot(X, y_pred1, '-.')  # k=10 fit
    p4, = ax.plot(X, y_pred2, '--')  # k=13 fit
    x = np.linspace(0, 100, 4)
    y = np.linspace(0, 100, 4)
    plt.xticks(x, ('Not active', 'Average', 'Active', 'Very active'), color='blue', rotation=60, fontproperties=zhfont)
    plt.yticks(y, ('Not attentive', 'Average', 'Attentive', 'Very attentive'), color='red', fontproperties=zhfont)
    plt.xlabel('Performance in school activities', fontproperties=zhfont)
    plt.ylabel('Attention to activity announcements', fontproperties=zhfont)
    ax.legend([p1, p2, p3, p4], ['Feature-space samples', 'k=3', 'k=10', 'k=13'], loc='upper left', prop=zhfont)
    plt.show()
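To put numbers on what the figure shows, another short sketch (also an addition, not in the original post) prints the training-set R^2 for several k values on the same data: with k=1 the model memorizes the 13 points (training R^2 = 1), while k=13 predicts the global mean everywhere (training R^2 = 0).

X, y = createDataSet()  # the single-feature regression data defined above
for k in (1, 3, 10, 13):
    knn = KNeighborsRegressor(n_neighbors=k).fit(X, y.ravel())
    print(f'k={k}: training R^2 = {knn.score(X, y.ravel()):.3f}')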
# Everything above runs fine; the block below does not.
def knnClassifier(dataSet, labels, testData, k):
    knn = neighbors.KNeighborsClassifier(k)  # build the kNN classifier with n_neighbors=k
    knn.fit(dataSet, labels)                 # fit it on the training data
    return knn.predict([testData])

if __name__ == "__main__":
    dataSet, labels = createDataSet()  # feature-space samples: activity data for the 13 students
    warnings.filterwarnings('ignore')
    print('Predicted class with k=1:', knnClassifier(dataSet, labels, [41, 33], 1))
    print('Predicted class with k=5:', knnClassifier(dataSet, labels, [41, 33], 5))
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_15832/3389289976.py in
6 dataSet, labels=createDataSet() # feature-space samples: activity data for the 13 students
7 warnings.filterwarnings('ignore')
----> 8 print('Predicted class with k=1:', knnClassifier(dataSet, labels, [41,33], 1))
9 print('Predicted class with k=5:', knnClassifier(dataSet, labels, [41,33], 5))
10
~\AppData\Local\Temp/ipykernel_15832/3389289976.py in knnClassifier(dataSet, labels, testData, k)
2 knn = neighbors.KNeighborsClassifier(k) # build the kNN classifier
3 knn.fit(dataSet, labels) # fit it on the training data
----> 4 return knn.predict([testData])
5 if __name__ == "__main__":
6 dataSet, labels=createDataSet() # feature-space samples: activity data for the 13 students
D:\ANACONDA\lib\site-packages\sklearn\neighbors\_classification.py in predict(self, X)
212 Class labels for each data sample.
213 """
--> 214 neigh_dist, neigh_ind = self.kneighbors(X)
215 classes_ = self.classes_
216 _y = self._y
D:\ANACONDA\lib\site-packages\sklearn\neighbors\_base.py in kneighbors(self, X, n_neighbors, return_distance)
715 X = _check_precomputed(X)
716 else:
--> 717 X = self._validate_data(X, accept_sparse="csr", reset=False)
718 else:
719 query_is_train = True
D:\ANACONDA\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
583
584 if not no_val_X and check_params.get("ensure_2d", True):
--> 585 self._check_n_features(X, reset=reset)
586
587 return out
D:\ANACONDA\lib\site-packages\sklearn\base.py in _check_n_features(self, X, reset)
398
399 if n_features != self.n_features_in_:
--> 400 raise ValueError(
401 f"X has {n_features} features, but {self.__class__.__name__} "
402 f"is expecting {self.n_features_in_} features as input."
ValueError: X has 2 features, but KNeighborsClassifier is expecting 1 features as input.
Only this last block raises the error.
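For what it is worth, the message "X has 2 features, but KNeighborsClassifier is expecting 1 features as input" suggests the classifier was fitted on single-feature data: if all of the snippets above run in one notebook session, the regression createDataSet() (which returns a one-column X and y instead of the 2-feature dataSet and labels) silently replaces the classification one, so the final block trains on one feature and then queries with the two-feature point [41, 33]. A minimal sketch of one way to avoid the name clash, using a hypothetical createClassificationDataSet name:

import numpy as np
from sklearn.neighbors import KNeighborsClassifier

def createClassificationDataSet():  # hypothetical name, distinct from the regression builder
    dataSet = np.array([[19, 30], [30, 40], [39, 47], [40, 52], [47, 50], [50, 55], [60, 60],
                        [62, 65], [73, 70], [75, 82], [77, 85], [90, 95], [92, 90]])
    labels = ['0', '0', '0', '0', '0', '0', '1', '1', '1', '1', '1', '1', '1']
    return dataSet, labels

dataSet, labels = createClassificationDataSet()
for k in (1, 5):
    knn = KNeighborsClassifier(n_neighbors=k).fit(dataSet, labels)
    print(f'Predicted class with k={k}:', knn.predict([[41, 33]]))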