想同时对DBSCAN算法进行多个最优参数的选择
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn import metrics
import time
import numpy as np
#读取文件
beer = pd.read_table(r"G:\Python photo\shuju\Python数据分析与机器学习\14.DBSCAN聚类\data.txt",
sep=' ', encoding='utf8', engine='python')
#传入变量(列名)
beer.columns
X = beer.iloc[:,1:]
eps_list = [10,15,20,25,30,35]
min_samples_list = [3,4,3,6,7,8]
def cluster_score(epss ,min_sampless):
global scores
z = 1
scores = []
for eps in eps_list:
for min_samples in min_samples_list:
start_time = time.time()
db = DBSCAN(eps=eps ,min_samples=min_samples).fit(X)
labels = db.labels_
beer['cluster_db'] = labels
beer.sort_values('cluster_db').mean()
score = metrics.silhouette_score(X, beer.cluster_db)
scores.append(score)
end_time = time.time()
print("第{}次...".format(z))
print("time spend:{:.2f}".format(end_time - start_time))
print("score值为:{}".format(score))
z +=1
return scores
cluster_score(eps_list, min_samples_list)
best_eps_var = eps_list[np.argmax(scores)]
best_min_samples = min_samples_list[np.argmax(scores)]
print()
print("最优参数为: {},{}".format(best_eps_var ,best_min_samples))
Traceback (most recent call last):
File "C:\Users\ADMINI~1\AppData\Local\Temp/ipykernel_14608/2023938558.py", line 37, in
cluster_score(eps_list, min_samples_list)
File "C:\Users\ADMINI~1\AppData\Local\Temp/ipykernel_14608/2023938558.py", line 28, in cluster_score
score = metrics.silhouette_score(X, beer.cluster_db)
File "E:\anaconda\lib\site-packages\sklearn\metrics\cluster_unsupervised.py", line 117, in silhouette_score
return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
File "E:\anaconda\lib\site-packages\sklearn\metrics\cluster_unsupervised.py", line 227, in silhouette_samples
check_number_of_labels(len(le.classes_), n_samples)
File "E:\anaconda\lib\site-packages\sklearn\metrics\cluster_unsupervised.py", line 33, in check_number_of_labels
raise ValueError(
ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1
想通过双循环遍历eps和min_samples两个参数list的集合得到最大化的轮廓系数,找到最优的这两个参数
上述代码在循环到第24次的时候报出这个错误
得到最优eps和min_samples两个参数
轮廓系数有其本身的约束建议尝试不同的评价方法,如兰的系数