Source code for pylearn.classification.multinomial_naive_bayes

import numpy as np
import pandas as pd
import time
import re

[docs] class MultinomialNaiveBayes: """ Computes text classification problems by applying the Bayes theorem. Attributes: :texts (pandas.Series): Input texts used for training :classes (list): List of unique classes in the training data :num_of_samples (int): Total number of samples in the training data :prior (pandas.DataFrame): Prior probabilities of each class :vocab (list): List of unique words in the training data :posterior (pandas.DataFrame): Posterior probabilities of each word given each class """ # loc selects rows by index label (can also be numeric, but numeric index can differ from real index), # iloc selects row by actual index
[docs] def fit(self, X: pd.Series, Y: pd.Series, alpha=1, log_duration=True) -> None: """ Trains the algorithm. Data must not be continuous data. Parameters: :X (pandas.Series): Training input :Y (pandas.Series): Training output :alpha (float, optional): Smoothing parameter :log_duration (bool, optional): Logs the duration of the training, default: True Returns: None """ start = time.time() self.texts = X self.classes = sorted(list(Y.unique())) self.num_of_samples = len(X) self.prior = pd.DataFrame(data=0, index=range(len(self.classes)), columns=range(1)).astype(float) tokenized_texts = self._tokenize(self.texts) # list of lists of words of one text self.vocab = self._create_vocab(tokenized_texts) # contains all words self.posterior = pd.DataFrame(data=0, index=range(len(self.classes)), columns=self.vocab).astype(float) for index, c in enumerate(self.classes): class_df = X[Y == c] self.prior.iloc[index] = len(class_df) / self.num_of_samples # ratio of classes in the sample tokenized_class_texts = self._tokenize(class_df) # tokenized texts per class self.posterior.iloc[index] = self._bow(tokenized_class_texts) # Laplace smoothing: # P(x|y) = (N_x,y + α) / (N_y + α * D) # α: smoothing parameter N_xy = self.posterior.iloc[index] # N_x,y: occuramce of feature (word) x in class y N_y = 0 # N_y: number of features (words) in y for text in tokenized_class_texts: N_y += len(text) D = self.posterior.shape[1] # D: number of features (words) in whole data set self.posterior.iloc[index] = (N_xy + alpha) / (N_y + alpha * D) end = time.time() if log_duration: print(f"Duration of training: {end - start}\n")
[docs] def predict(self, X: pd.Series) -> np.ndarray: """ Computes the output of a given X. Parameters: :X (pandas.Series): Testing input Returns: Predicted classes as array """ y_pred = [self._predict(x[1:len(x)]) for x in X.items()] # x is item object --> x[1:len(x)] removes index (iteritems deprecated) return np.array(y_pred)
def _predict(self, x: tuple) -> int | str: """ Helper function for predict. Parameters: :x (tuple): A tuple with the tokenized text Returns: The class with the highest probability """ # P(y) * ∏ P(x_i|y) --> ln P(y) + ∑ ln P(x_i|y) to prevent underflow x = self._tokenize(pd.Series(x))[0] # get a list of all words prior = np.log(self.prior) # ln P(y) posterior = pd.DataFrame(data=0, index=range(len(self.classes)), columns=x).astype(float) try: posterior = np.log(self.posterior[x]) # ln P(x_i|y) except KeyError: # self.posterior[x] raises KeyError if word not found, happens if vocab not big enough for word in x: if word not in self.posterior.columns: posterior[word] = 1e-10 # add new column (word) with probability near 0 else: posterior[word] = self.posterior[word] posterior = posterior.sum(axis=1) # ∑ ln P(x_i|y) posterior = prior[0] + posterior # ln P(y) + ∑ ln P(x_i|y) (prior[0] to change DataFrame to Series) return self.classes[np.argmax(posterior)] def _bow(self, tokenized_texts: list) -> pd.DataFrame: """ Helper function for fit. Creates bag-of-words model from the vocabulary of a class. Parameters: :tokenized_texts (list): A list of the words of each sample Returns: The probabilities of each word as pandas dataframe """ bow_df = pd.DataFrame(np.zeros((len(tokenized_texts), len(self.vocab))), columns=self.vocab) for i, text in enumerate(tokenized_texts): for word in text: bow_df.at[i, word] += 1 word_probability = bow_df.sum(axis=0) return pd.DataFrame([word_probability]) @staticmethod def _create_vocab(tokenized_texts: list) -> list: """ Helper function for fit. Parameters: :tokenized_texts (list): A list of the words of each sample Returns: The vocabulary with all unique words """ return list(set(word for text in tokenized_texts for word in text)) @staticmethod def _tokenize(texts: pd.Series) -> list: """ Helper function for fit. Parameters: :texts (pandas.Series): All texts in a series Returns: The texts as a list of lists of tokenized texts, input is cleaned from special characters """ def clean_text(text): # remove everything but letters and whitespace (note: falsifies some words, e.g. Gerard's --> Gerards) text = re.sub(r"[^a-zA-Z\s]", "", text.lower()) tokens = text.split() # split words by whitespace return tokens texts = texts.tolist() return [clean_text(text) for text in texts]