import numpy as np
from sklearn.datasets import load_iris
# Load the 150-sample, 4-feature iris dataset as the clustering input.
iris = load_iris()
x,y =iris.data,iris.target
# Small hand-made alternative dataset, kept commented out for debugging.
#x=np.array([[1,1,1,1],[10,10,10,10],[20,20,20,20],[5,5,5,5],[3,3,3,3],[6,6,6,6],[25,25,25,25]])
# Module-level accumulator read by init_random_centroids (shared state).
centroids_a=[]
# NOTE(review): list1 appears unused anywhere in this file.
list1=[]
# Pick the initial cluster centers.
def init_random_centroids(k, x):
    """Pick k distinct rows of x uniformly at random as initial centroids.

    Args:
        k: number of clusters.
        x: (n_samples, n_features) data array.

    Returns:
        (k, n_features) ndarray of initial centroids.
    """
    n_samples = x.shape[0]
    # replace=False guarantees k distinct sample indices.
    indices = np.random.choice(n_samples, k, replace=False)
    # Build the result locally instead of appending to the module-level
    # list `centroids_a`: the original grew that list on every call, so
    # a second call returned stale centroids from the first call too.
    return x[indices]
# Distance between a sample and a centroid.
def euclidean_distance(one_sample, x):
    """Return the squared Euclidean distance between one_sample and x.

    Note: the square root is intentionally omitted; squared distance
    preserves the ordering used by argmin, so nearest-centroid results
    are identical and slightly cheaper to compute.
    """
    diff = x - one_sample
    return np.sum(diff ** 2)
# Return the index of the centroid nearest to a sample.
def _closest_centroid(sample, centroids):
    """Return the index (0..k-1) of the centroid closest to sample.

    Ties are resolved by np.argmin, i.e. the lowest index wins.
    """
    distances = np.array(
        [euclidean_distance(one_sample=sample, x=c) for c in centroids]
    )
    return np.argmin(distances)
# Assign every sample to the cluster of its nearest centroid.
def create_clusters(k, centroids, x):
    """Group the samples of x into k clusters by nearest centroid.

    Returns:
        A list of k lists; element i holds the samples whose nearest
        centroid is centroids[i] (a cluster may be empty).
    """
    groups = [[] for _ in range(k)]
    for sample in x:
        nearest = _closest_centroid(sample, centroids)
        groups[nearest].append(sample)
    return groups
# Recompute each centroid as the mean of its assigned samples.
def update_centroids(k, clusters):
    """Update the module-level `centroids` array in place.

    Each centroid becomes the mean of the samples currently assigned to
    it. Empty clusters keep their previous centroid — the original code
    wrote np.mean([]) into the row, which is NaN and silently corrupted
    the centroid for all later iterations.

    Args:
        k: number of clusters.
        clusters: list of k lists of samples.

    Returns:
        The (mutated) module-level `centroids` array.
    """
    for i in range(k):
        # Guard against empty clusters: np.mean of an empty list is NaN.
        if len(clusters[i]) > 0:
            centroids[i] = np.mean(clusters[i], axis=0)
    return centroids
# Label every sample with the index of its nearest centroid.
def get_cluster_labels(clusters, x):
    """Return the predicted cluster index for every sample in x.

    Args:
        clusters: unused; kept for interface compatibility. Labels are
            derived from the module-level `centroids` array instead.
        x: iterable of samples.

    Returns:
        List of cluster indices, one per sample.
    """
    y_pred = [_closest_centroid(sample=sample, centroids=centroids)
              for sample in x]
    print(y_pred)
    return y_pred
# Randomly choose k initial centroids, then alternate assignment and
# update steps (Lloyd's algorithm) until the centroids stop moving.
centroids = init_random_centroids(3, x)
for number in range(300):  # k-means converges far sooner than the original 20 000 000 cap
    # Assignment step: group every sample with its nearest centroid.
    cluster = create_clusters(k=3, centroids=centroids, x=x)
    # Snapshot BEFORE updating. update_centroids mutates `centroids` in
    # place, so without .copy() both names alias the same array and the
    # convergence test always fired on the first iteration (original bug).
    former_centroids = centroids.copy()
    clusters = update_centroids(k=3, clusters=cluster)
    # Converged when no centroid moved more than the tolerance. The
    # original `diff.any() < 0.000001` compared a boolean to the
    # tolerance, which never expressed "all movements are tiny".
    if np.abs(centroids - former_centroids).max() < 0.000001:
        get_cluster_labels(clusters=cluster, x=x)
        break
K-Means算法从随机初始化簇质心开始。每次运行KMeans时,此选择都会有所不同,可能会产生不同的结果。为了得到可再现的结果,可以在KMeans中使用random_state参数,这将修复簇中心线的初始选择:
model = KMeans(n_clusters=number_of_clusters,
init='k-means++',
max_iter=100,
n_init=100,
random_state=123)
如果质心是随机生成的,那么结果有可能是不一样的
你里面有随机函数都改成固定的,结果就不会随机了。
在kmeans中改这个参数random_state
第86行,随机选取质心,聚类结果就无法保证是唯一的。另外,说句题外话,这样使用numpy是非常低效的,1万个样本的话,可能需要几秒钟——正常应该在几十毫秒。如有兴趣,可以参考这一篇博客: