Source code for pylearn.clustering.clustering

import numpy as np


[docs]
class Clustering:
    """
    Common foundation for the clustering algorithms.
    Holds the shared state and helper methods they build on.

    Attributes:
        :k (int): Number of clusters
        :centroids (numpy.ndarray): Matrix holding the centroid of each of the k clusters
        :data_points (numpy.ndarray): Matrix holding every data point
        :data_points_to_cluster (list): Cluster assigned to each data point
        :clusters (list): All k clusters
    """
    def __init__(self, k=3) -> None:
        self.k = k
        # Every other attribute starts out as an empty placeholder (None).
        self.centroids = self.data_points = self.data_points_to_cluster = self.clusters = None


[docs]
    @staticmethod
    def euclidean_distance(x: np.ndarray, centroids: np.ndarray) -> np.ndarray:
        """
        Calculates distance of a data point x 
        to all k centroids.

        Parameters:
            :x (numpy.ndarray): Data point (vector)
            :centroids (numpy.ndarray): Centroids in a matrix (each row is one centroid) 

        Returns:
            Array of the distances
        """
        # change axis to 1 because every centroid is stored in one row
        return np.sqrt(np.sum((x - centroids)**2, axis = 1))

    

[docs]
    @staticmethod
    def median(x: np.ndarray) -> np.ndarray:
        """
        Determines the point with the median smallest distance to
        all other data points in the cluster.
        The median point must be one of the data points.

        Parameters:
            x (numpy.ndarray): Matrix of all data points in one cluster

        Returns:
            Data point as one-element array
        """  
        min_distance = float('inf') 
    
        for point in x:
            distance = 0
        
            for other_point in x:
                distance += Clustering.euclidean_distance(point, other_point)  
            
            if distance < min_distance:
                min_distance = distance
                centroid = point

        # returned array only has one element, but is an array of array, therefore [0]
        return centroid[0]



[docs]
    def assigned_clusters(self, clusters: list | str | int) -> list[tuple]:
        """
        All to the clusters assigned data points.

        Parameters:
            :clusters (list | str | int): Cluster name(s) 

        Returns:
            List of the data points
        """
        mapping = zip(self.data_points, self.data_points_to_cluster)
        result = []
        for data_point, cluster in mapping:
            if (isinstance(clusters, list) and cluster in clusters) or (not isinstance(clusters, list) and cluster == clusters):    # allow clusters as list, str or int
                result.append((list(data_point), cluster))
        return result



[docs]
    def rename(self, old_clusters: list, new_clusters: list) -> list: 
        """
        Renames the clusters.  

        Parameters:
            :old_clusters (list): List of all old clusters to get renamed
            :new_clusters (list): List of the renamed clusters

        Returns:
            A list of the data points
        """
        # TODO add parameter limit (int) to limit the output of each cluster data points 
        mapping = zip(old_clusters, new_clusters)
        mapping = {old:new for old, new in mapping}
        renamed_clusters = list(self.data_points_to_cluster)
        
        for i, data_point in enumerate(self.data_points_to_cluster):
            renamed_clusters[i] = mapping[data_point]
            self.data_points_to_cluster[i] = mapping[data_point]

        self.clusters = list(set(renamed_clusters))
        return renamed_clusters
PyLearn 1.2.0 documentation

Source code for pylearn.clustering.clustering