Class 25: Evaluating Clustering

  1. Log onto Prismia

  2. (Forgot to use this on Monday, honoring post-Halloween) Share your favorite candy in the Zoom chat (or just say hi)

# %load http://drsmb.co/310
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import datasets
from sklearn import cluster
from sklearn import metrics
# Load the iris data as features (iris_X) and ground-truth targets (iris_y).
iris_X, iris_y = datasets.load_iris(return_X_y=True)
# Fit k-means with 3 clusters on the features only; iris_y is not used here.
km3 = cluster.KMeans(n_clusters=3)
km3.fit(iris_X)
KMeans(n_clusters=3)
# Learned centroids: one row per cluster, one column per feature.
km3.cluster_centers_
array([[5.9016129 , 2.7483871 , 4.39354839, 1.43387097],
       [5.006     , 3.428     , 1.462     , 0.246     ],
       [6.85      , 3.07368421, 5.74210526, 2.07105263]])
# Scatter the first two feature columns, colored by the 3-cluster assignment.
plt.scatter(iris_X[:,0],iris_X[:,1],c=km3.labels_)
<matplotlib.collections.PathCollection at 0x7f44a3ab1590>
../_images/2020-11-04_6_1.png
# Repeat with 2 clusters and plot the same two feature columns, colored by
# the 2-cluster assignment.
km2 = cluster.KMeans(n_clusters=2)
km2.fit(iris_X)
plt.scatter(iris_X[:,0],iris_X[:,1],c=km2.labels_)
<matplotlib.collections.PathCollection at 0x7f44a39cf250>
../_images/2020-11-04_7_1.png
# Repeat with 4 clusters and plot the same two feature columns, colored by
# the 4-cluster assignment.
km4 = cluster.KMeans(n_clusters=4)
km4.fit(iris_X)
plt.scatter(iris_X[:,0],iris_X[:,1],c=km4.labels_)
<matplotlib.collections.PathCollection at 0x7f44a39cff90>
../_images/2020-11-04_8_1.png
\[ s = \frac{b-a}{\max(a,b)}\]

a: The mean distance between a sample and all other points in the same cluster.

b: The mean distance between a sample and all other points in the next nearest cluster.

# Silhouette score needs only the features and the predicted cluster labels
# (iris_y is not used); computed in turn for the 3-, 2-, and 4-cluster fits.
metrics.silhouette_score(iris_X, km3.labels_)
0.5528190123564091
metrics.silhouette_score(iris_X, km2.labels_)
0.681046169211746
metrics.silhouette_score(iris_X, km4.labels_)
0.49745518901737446
# Fit a second, independent 3-cluster model and plot ITS assignments, so this
# run can be compared against the earlier km3 plot.
# NOTE(review): no random_state is set, so the cluster ids (and hence colors)
# may differ between runs -- re-run the cell to refresh the saved figure.
km3a = cluster.KMeans(n_clusters=3)
km3a.fit(iris_X)
plt.scatter(iris_X[:,0],iris_X[:,1],c=km3a.labels_)
<matplotlib.collections.PathCollection at 0x7f44a38db790>
../_images/2020-11-04_13_1.png
# Adjusted Rand score compares each clustering against the ground-truth
# targets iris_y; computed in turn for the 3-, 2-, and 4-cluster fits.
metrics.adjusted_rand_score(iris_y,km3.labels_)
0.7302382722834697
metrics.adjusted_rand_score(iris_y,km2.labels_)
0.5399218294207123
metrics.adjusted_rand_score(iris_y,km4.labels_)
0.6460787233460947
# Adjusted mutual information, again comparing each clustering against the
# ground-truth targets iris_y, for the 3-, 2-, and 4-cluster fits.
metrics.adjusted_mutual_info_score(iris_y,km3.labels_)
0.7551191675800482
metrics.adjusted_mutual_info_score(iris_y,km2.labels_)
0.6538380713762781
metrics.adjusted_mutual_info_score(iris_y,km4.labels_)
0.7151742230795862