Source code for pylearn.classification.gaussian_naive_bayes
import numpy as np
import pandas as pd
import time
[docs]
class GaussianNaiveBayes:
"""
Computes continuous classification problems by applying the Bayes theorem
with a gaussian distribution.
Attributes:
:classes (list): A list of all classes
:mean (numpy.ndarray | pandas.DataFrame): Mean of all features
:variance (numpy.ndarray | pandas.DataFrame): Variance of all features
:prior (numpy.ndarray | pandas.DataFrame): Prior of Bayes theorem
"""
# loc selects rows by index label (can also be numeric, but numeric index can differ from real index),
# iloc selects row by actual index
[docs]
def fit(self, X: np.ndarray | pd.DataFrame | pd.Series, Y: np.ndarray | pd.DataFrame | pd.Series, log_duration=True) -> None:
"""
Trains the algorithm. Input can be a numpy or pandas object.
Parameters:
:X (numpy.ndarray | pandas.DataFrame | pd.Series): Training input
:Y (numpy.ndarray | pandas.DataFrame | pd.Series): Training output
:log_duration (bool, optional): Logs the duration of the training, default: True
Returns:
None
"""
start = time.time()
X = pd.DataFrame(X)
Y = pd.DataFrame(Y)
self.classes = sorted(list(Y.iloc[:, 0].unique()))
num_of_samples, num_of_features = X.shape
self.mean = pd.DataFrame(data=0, index=range(len(self.classes)), columns=range(num_of_features)).astype(float)
self.variance = pd.DataFrame(data=0, index=range(len(self.classes)), columns=range(num_of_features)).astype(float)
self.prior = pd.DataFrame(data=0, index=range(len(self.classes)), columns=range(1)).astype(float)
for index, c in enumerate(self.classes):
class_df = X[Y.iloc[:, 0] == c]
self.mean.iloc[index] = class_df.mean().values # operator returns column names and values --> .values
self.variance.iloc[index] = class_df.var().values # NaN for one value due to 0 divison
self.prior.iloc[index] = float(len(class_df)) / float(num_of_samples)
end = time.time()
if log_duration:
print(f"Duration of training: {end - start}\n")
[docs]
def predict(self, X: np.ndarray | pd.DataFrame) -> np.ndarray:
"""
Computes the output of a given X.
Parameters:
:X (numpy.ndarray | pandas.DataFrame): Testing input
Returns:
Predicted classes as array
"""
X = pd.DataFrame(X)
y_pred = [self._predict(x[1:len(x)]) for x in X.itertuples()] # x is itertuple object --> x[1:len(x)] removes index
return np.array(y_pred)
def _predict(self, x: tuple) -> int | str:
"""
Helper function for predict.
"""
posteriors = []
# P(y) * ∏ P(x_i|y) --> ln P(y) + ∑ ln P(x_i|y) to prevent underflow
for index, c in enumerate(self.classes):
prior = np.log(self.prior.iloc[index]) # get prior at index
posterior = np.sum(np.log(self._gaussian_distribution(index, x)))
posterior += prior
posteriors.append(posterior)
return self.classes[np.argmax(posteriors)]
def _gaussian_distribution(self, index: int, x: tuple) -> pd.Series:
"""
The gaussian or normal distribution of a feature x_i (Probability Density Function).
Parameters:
:index (int): Index of the current class
:x (tuple): Current row
Returns:
Gaussian distribution of each feature as pandas series
"""
mean, variance = self.mean.iloc[index], self.variance.iloc[index] # mean, variance for each class (row)
# P(x_i|y) = N(μ, σ) with x = x_i, μ = μ_y, σ = σ_y
return np.exp(-((x - mean) ** 2)/(2 * variance)) / np.sqrt(2 * np.pi * variance)