Source code for pylearn.clustering.clustering
import numpy as np
[docs]
class Clustering:
"""
An abstract base class for Clustering Algorithms.
Defines the basic properties and methods that can be used.
Attributes:
:k (int): Number of clusters
:centroids (numpy.ndarray): Matrix of centroids of all k clusters
:data_points (numpy.ndarray): Matrix of all data points
:data_points_to_cluster (list): List of each data point's assigned cluster
:clusters (list): List of all k clusters
"""
def __init__(self, k=3) -> None:
self.k = k
self.centroids = None
self.data_points = None
self.data_points_to_cluster = None
self.clusters = None
[docs]
@staticmethod
def euclidean_distance(x: np.ndarray, centroids: np.ndarray) -> np.ndarray:
"""
Calculates distance of a data point x
to all k centroids.
Parameters:
:x (numpy.ndarray): Data point (vector)
:centroids (numpy.ndarray): Centroids in a matrix (each row is one centroid)
Returns:
Array of the distances
"""
# change axis to 1 because every centroid is stored in one row
return np.sqrt(np.sum((x - centroids)**2, axis = 1))
[docs]
def assigned_clusters(self, clusters: list | str | int) -> list[tuple]:
"""
All to the clusters assigned data points.
Parameters:
:clusters (list | str | int): Cluster name(s)
Returns:
List of the data points
"""
mapping = zip(self.data_points, self.data_points_to_cluster)
result = []
for data_point, cluster in mapping:
if (isinstance(clusters, list) and cluster in clusters) or (not isinstance(clusters, list) and cluster == clusters): # allow clusters as list, str or int
result.append((list(data_point), cluster))
return result
[docs]
def rename(self, old_clusters: list, new_clusters: list) -> list:
"""
Renames the clusters.
Parameters:
:old_clusters (list): List of all old clusters to get renamed
:new_clusters (list): List of the renamed clusters
Returns:
A list of the data points
"""
# TODO add parameter limit (int) to limit the output of each cluster data points
mapping = zip(old_clusters, new_clusters)
mapping = {old:new for old, new in mapping}
renamed_clusters = list(self.data_points_to_cluster)
for i, data_point in enumerate(self.data_points_to_cluster):
renamed_clusters[i] = mapping[data_point]
self.data_points_to_cluster[i] = mapping[data_point]
self.clusters = list(set(renamed_clusters))
return renamed_clusters