如图所示,假设平面上有8个点,坐标分别为A1=(2,10)、A2=(2,5)、A3=(8,4)、A4=(5,8)、A5=(7,5)、A6=(6,4)、A7=(1,2)、A8=(4,9)。假设初始以A1和A4作为两个簇的中心,请使用K-均值聚类方法,计算各点最终的聚类归属以及各簇新的中心坐标,并将聚类结果和簇中心在图中画出。
from numpy import *
import matplotlib.pyplot as plt
import operator
# Large finite stand-in for "infinity": kMeans uses it as the starting
# minimum when searching for the nearest centroid.
INF = 9999999.0
def distEclud(vecA, vecB):
    """Return the Euclidean distance between ``vecA`` and ``vecB``.

    Both arguments must support elementwise subtraction (NumPy arrays or
    matrices). ``sum``, ``sqrt`` and ``square`` are the NumPy versions brought
    in by the file's ``from numpy import *``.
    """
    delta = vecA - vecB
    squaredTotal = sum(square(delta))
    return sqrt(squaredTotal)
def kMeans(dataSet, k, distMeans=distEclud, initCentroids=None):
    """Cluster the rows of ``dataSet`` into ``k`` groups with K-means.

    Args:
        dataSet: m x n samples, one per row (a matrix or anything
            ``asmatrix`` accepts). The caller's object is not modified.
        k: Number of clusters.
        distMeans: Callable ``(vecA, vecB) -> distance``; it is called with a
            centroid row and a sample row.
        initCentroids: Optional k x n starting centroids. When omitted, the
            exercise's fixed start A1=(2, 10), A4=(5, 8) is used, which
            requires ``k == 2`` and two-column data.

    Returns:
        A tuple ``(centroids, clusterAssment)``. ``centroids`` is a k x n
        float matrix. ``clusterAssment`` is an m x 2 matrix: column 0 is the
        index of the assigned cluster, column 1 the squared distance to it.

    Raises:
        ValueError: If the starting centroids are not k rows with the same
            number of columns as ``dataSet``.
    """
    # Work in floats throughout; asmatrix replaces the `mat` alias that
    # NumPy 2.0 removed.
    dataSet = asmatrix(dataSet, dtype=float)
    m = shape(dataSet)[0]

    if initCentroids is None:
        initCentroids = [[2, 10], [5, 8]]
    # matrix(...) copies, so the caller's centroids are never mutated. The
    # float dtype matters: an integer matrix would silently truncate every
    # mean written into it below.
    centroids = matrix(initCentroids, dtype=float)
    if shape(centroids)[0] != k or shape(centroids)[1] != shape(dataSet)[1]:
        raise ValueError(
            'initial centroids have shape %s, expected (%d, %d)'
            % (shape(centroids), k, shape(dataSet)[1])
        )

    clusterAssment = asmatrix(zeros((m, 2)))
    # Start from an impossible cluster index so that points landing in
    # cluster 0 on the first pass are still counted as a change.
    clusterAssment[:, 0] = -1

    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):  # find the nearest centroid for sample i
            minDist = INF
            minIndex = -1
            for j in range(k):
                distJI = distMeans(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        for cent in range(k):  # move each centroid to the mean of its members
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            # An empty cluster keeps its previous centroid instead of
            # becoming NaN (the mean of zero rows).
            if shape(ptsInClust)[0] > 0:
                centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment
def plotFeature(dataSet, centroids, clusterAssment):
    """Scatter-plot each cluster's samples and mark the centroids.

    Args:
        dataSet: Matrix of samples; columns 0 and 1 are plotted as x and y.
        centroids: Matrix with one centroid per row; its row count decides
            how many clusters are drawn.
        clusterAssment: Matrix whose column 0 holds each sample's cluster
            index.

    The plot is drawn on a new figure. This function does not call
    ``plt.show()``.
    """
    markerCycle = ['s', 'o']
    colorCycle = ['black', 'red']
    figure = plt.figure()
    axes = figure.add_subplot(111)

    labels = clusterAssment[:, 0].A
    for clusterIdx in range(shape(centroids)[0]):
        members = dataSet[nonzero(labels == clusterIdx)[0], :]
        # Marker and colour wrap around when there are more clusters than styles.
        axes.scatter(
            members[:, 0].A1,
            members[:, 1].A1,
            marker=markerCycle[clusterIdx % len(markerCycle)],
            c=colorCycle[clusterIdx % len(colorCycle)],
            s=90,
        )

    # Centroids are drawn last, as large red '+' markers.
    axes.scatter(centroids[:, 0].A1, centroids[:, 1].A1, marker='+', c='red', s=300)
if __name__ == '__main__':
    # The eight sample points A1..A8, one (x, y) pair per row.
    # asmatrix is used because NumPy 2.0 removed the `mat` alias.
    dataSet = asmatrix([[2, 10], [2, 5], [8, 4], [5, 8], [7, 5], [6, 4], [1, 2], [4, 9]])
    resultCentroids, clustAssing = kMeans(dataSet, 2)
    print('*******************')
    print(resultCentroids)
    print('*******************')
    plotFeature(dataSet, resultCentroids, clustAssing)
    plt.show()