Source code for pylearn.classification.gaussian_naive_bayes

import numpy as np
import pandas as pd
import time


[docs]
class GaussianNaiveBayes:
    """
    Computes continuous classification problems by applying the Bayes theorem
    with a gaussian distribution.

    Attributes:
        :classes (list): A list of all classes
        :mean (numpy.ndarray | pandas.DataFrame): Mean of all features
        :variance (numpy.ndarray | pandas.DataFrame): Variance of all features
        :prior (numpy.ndarray | pandas.DataFrame): Prior of Bayes theorem
    """
    # loc selects rows by index label (can also be numeric, but numeric index can differ from real index), 
    # iloc selects row by actual index

[docs]
    def fit(self, X: np.ndarray | pd.DataFrame | pd.Series, Y: np.ndarray | pd.DataFrame | pd.Series, log_duration=True) -> None:
        """
        Trains the algorithm. Input can be a numpy or pandas object.

        Parameters:
            :X (numpy.ndarray | pandas.DataFrame | pd.Series): Training input
            :Y (numpy.ndarray | pandas.DataFrame | pd.Series): Training output
            :log_duration (bool, optional): Logs the duration of the training, default: True

        Returns:
            None
        """
        start = time.time()

        X = pd.DataFrame(X)
        Y = pd.DataFrame(Y)
        self.classes = sorted(list(Y.iloc[:, 0].unique()))
        num_of_samples, num_of_features = X.shape 
        self.mean = pd.DataFrame(data=0, index=range(len(self.classes)), columns=range(num_of_features)).astype(float)
        self.variance = pd.DataFrame(data=0, index=range(len(self.classes)), columns=range(num_of_features)).astype(float)
        self.prior = pd.DataFrame(data=0, index=range(len(self.classes)), columns=range(1)).astype(float)

        for index, c in enumerate(self.classes):
            class_df = X[Y.iloc[:, 0] == c]
            self.mean.iloc[index] = class_df.mean().values        # operator returns column names and values --> .values
            self.variance.iloc[index] = class_df.var().values     # NaN for one value due to 0 divison
            self.prior.iloc[index] = float(len(class_df)) / float(num_of_samples)

        end = time.time()

        if log_duration:
            print(f"Duration of training: {end - start}\n")



[docs]
    def predict(self, X: np.ndarray | pd.DataFrame) -> np.ndarray:
        """
        Computes the output of a given X.

        Parameters:
            :X (numpy.ndarray | pandas.DataFrame): Testing input

        Returns:
            Predicted classes as array
        """
        X = pd.DataFrame(X)
        y_pred = [self._predict(x[1:len(x)]) for x in X.itertuples()]           # x is itertuple object --> x[1:len(x)] removes index
        return np.array(y_pred)


    def _predict(self, x: tuple) -> int | str:
        """
        Helper function for predict.
        """
        posteriors = []
        # P(y) * ∏ P(x_i|y) --> ln P(y) + ∑ ln P(x_i|y) to prevent underflow
        for index, c in enumerate(self.classes):
            prior = np.log(self.prior.iloc[index])      # get prior at index
            posterior = np.sum(np.log(self._gaussian_distribution(index, x)))
            posterior += prior
            posteriors.append(posterior)
        return self.classes[np.argmax(posteriors)]

    def _gaussian_distribution(self, index: int, x: tuple) -> pd.Series:
        """
        The gaussian or normal distribution of a feature x_i (Probability Density Function).

        Parameters:
            :index (int): Index of the current class
            :x (tuple): Current row

        Returns:
            Gaussian distribution of each feature as pandas series
        """
        mean, variance = self.mean.iloc[index], self.variance.iloc[index]       # mean, variance for each class (row)
        # P(x_i|y) = N(μ, σ) with x = x_i, μ = μ_y, σ = σ_y
        return np.exp(-((x - mean) ** 2)/(2 * variance)) / np.sqrt(2 * np.pi * variance)
PyLearn 1.2.0 documentation

Source code for pylearn.classification.gaussian_naive_bayes