Clustering
Grouping related examples, particularly during unsupervised learning. Once all the examples are grouped, a human can optionally supply meaning to each cluster.
Clustering Methods
k-means
# Imports for a minimal k-means example on synthetic blob data.
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
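A minimal usage sketch of k-means on synthetic data. The blob parameters and the choice of three clusters are illustrative assumptions, not values from the source.

# Generate synthetic data and fit k-means (cluster count chosen for illustration).
X, _ = make_blobs(n_samples=300, centers=3, random_state=42)
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
labels = kmeans.fit_predict(X)

# Visualize the clusters and their centroids.
plt.scatter(X[:, 0], X[:, 1], c=labels)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], marker="x")
plt.show()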
Finding the Number of Clusters
Elbow Method
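The elbow method fits k-means for a range of candidate k values and plots the within-cluster sum of squared distances (inertia) against k; the point where the curve bends and flattens (the "elbow") is taken as a reasonable cluster count. A minimal sketch, assuming the synthetic X from the k-means example above:

# Elbow method sketch: compute inertia for k = 1..10 and plot the curve.
inertias = []
k_values = range(1, 11)
for k in k_values:
    model = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)
    inertias.append(model.inertia_)

plt.plot(k_values, inertias, marker="o")
plt.xlabel("number of clusters k")
plt.ylabel("inertia")
plt.show()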
Bayesian Information Criterion (BIC)
import math
import numpy as np

def bic_score(X, labels):
    """
    BIC score for the goodness of fit of clusters.
    This Python function is directly translated from the Go code made by the author of the paper.
    The original code is available here:
    https://github.com/bobhancock/goxmeans/blob/a78e909e374c6f97ddd04a239658c7c5b7365e5c/km.go#L778
    """
    n_points = len(labels)
    n_clusters = len(set(labels))
    n_dimensions = X.shape[1]

    # Free parameters: (k - 1) cluster weights, k * d centroid coordinates, 1 shared variance.
    n_parameters = (n_clusters - 1) + (n_dimensions * n_clusters) + 1

    loglikelihood = 0
    for label_name in set(labels):
        X_cluster = X[labels == label_name]
        n_points_cluster = len(X_cluster)
        centroid = np.mean(X_cluster, axis=0)
        variance = np.sum((X_cluster - centroid) ** 2) / (len(X_cluster) - 1)
        loglikelihood += \
            n_points_cluster * np.log(n_points_cluster) \
            - n_points_cluster * np.log(n_points) \
            - n_points_cluster * n_dimensions / 2 * np.log(2 * math.pi * variance) \
            - (n_points_cluster - 1) / 2

    # Penalize model complexity: BIC = log-likelihood - (p / 2) * log(n).
    bic = loglikelihood - (n_parameters / 2) * np.log(n_points)

    return bic
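A usage sketch, not part of the original source: evaluate bic_score for several candidate k values on the synthetic X from above and keep the k with the highest score, since this formulation subtracts the complexity penalty from the log-likelihood (higher is better).

# Pick the k with the highest BIC score.
best_k, best_bic = None, -np.inf
for k in range(2, 11):
    candidate_labels = KMeans(n_clusters=k, n_init=10, random_state=42).fit_predict(X)
    score = bic_score(X, candidate_labels)
    if score > best_bic:
        best_k, best_bic = k, score
print("Best k by BIC:", best_k)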