Source code for pylearn.classification.multinomial_naive_bayes

import re
import time
from collections import Counter

import numpy as np
import pandas as pd


[docs]
class MultinomialNaiveBayes:
    """
    Computes text classification problems by applying the Bayes theorem.

    Attributes:
        :texts (pandas.Series): Input texts used for training
        :classes (list): List of unique classes in the training data
        :num_of_samples (int): Total number of samples in the training data
        :prior (pandas.DataFrame): Prior probabilities of each class
        :vocab (list): List of unique words in the training data
        :posterior (pandas.DataFrame): Posterior probabilities of each word given each class
    """
    # loc selects rows by index label (can also be numeric, but numeric index can differ from real index), 
    # iloc selects row by actual index

[docs]
    def fit(self, X: pd.Series, Y: pd.Series, alpha=1, log_duration=True) -> None:
        """
        Trains the algorithm. 
        Data must not be continuous data.

        Parameters:
            :X (pandas.Series): Training input
            :Y (pandas.Series): Training output
            :alpha (float, optional): Smoothing parameter
            :log_duration (bool, optional): Logs the duration of the training, default: True

        Returns:
            None
        """
        start = time.time()

        self.texts = X
        self.classes = sorted(list(Y.unique()))
        self.num_of_samples = len(X) 
        self.prior = pd.DataFrame(data=0, index=range(len(self.classes)), columns=range(1)).astype(float)
        tokenized_texts = self._tokenize(self.texts)        # list of lists of words of one text
        self.vocab = self._create_vocab(tokenized_texts)    # contains all words
        self.posterior = pd.DataFrame(data=0, index=range(len(self.classes)), columns=self.vocab).astype(float)
        for index, c in enumerate(self.classes):
            class_df = X[Y == c]
            self.prior.iloc[index] = len(class_df) / self.num_of_samples                # ratio of classes in the sample
            tokenized_class_texts = self._tokenize(class_df)                            # tokenized texts per class
            self.posterior.iloc[index] = self._bow(tokenized_class_texts)         
            # Laplace smoothing:
            # P(x|y) = (N_x,y + α) / (N_y + α * D)  
            # α: smoothing parameter
            N_xy = self.posterior.iloc[index]           # N_x,y: occuramce of feature (word) x in class y    
            N_y = 0                                     # N_y: number of features (words) in y
            for text in tokenized_class_texts:          
                N_y += len(text)
            D = self.posterior.shape[1]                 # D: number of features (words) in whole data set
            self.posterior.iloc[index] = (N_xy + alpha) / (N_y + alpha * D)
            
        end = time.time()

        if log_duration:
            print(f"Duration of training: {end - start}\n")

    

[docs]
    def predict(self, X: pd.Series) -> np.ndarray:
        """
        Computes the output of a given X.

        Parameters:
            :X (pandas.Series): Testing input

        Returns:
            Predicted classes as array
        """
        y_pred = [self._predict(x[1:len(x)]) for x in X.items()]           # x is item object --> x[1:len(x)] removes index (iteritems deprecated)
        return np.array(y_pred)                                        


    def _predict(self, x: tuple) -> int | str:
        """
        Helper function for predict.

        Parameters:
            :x (tuple): A tuple with the tokenized text

        Returns:
            The class with the highest probability
        """
        # P(y) * ∏ P(x_i|y) --> ln P(y) + ∑ ln P(x_i|y) to prevent underflow

        x = self._tokenize(pd.Series(x))[0]             # get a list of all words
        prior = np.log(self.prior)                      # ln P(y)
        posterior = pd.DataFrame(data=0, index=range(len(self.classes)), columns=x).astype(float)
        try:
            posterior = np.log(self.posterior[x])       # ln P(x_i|y)
        except KeyError:                                # self.posterior[x] raises KeyError if word not found, happens if vocab not big enough
            for word in x:
                if word not in self.posterior.columns:
                    posterior[word] = 1e-10             # add new column (word) with probability near 0
                else:
                    posterior[word] = self.posterior[word]
        posterior = posterior.sum(axis=1)               # ∑ ln P(x_i|y)
        posterior = prior[0] + posterior                # ln P(y) + ∑ ln P(x_i|y)       (prior[0] to change DataFrame to Series)
        return self.classes[np.argmax(posterior)]
    
    def _bow(self, tokenized_texts: list) -> pd.DataFrame:
        """
        Helper function for fit.
        Creates bag-of-words model from the vocabulary of a class.

        Parameters:
            :tokenized_texts (list): A list of the words of each sample

        Returns:
            The probabilities of each word as pandas dataframe
        """
        bow_df = pd.DataFrame(np.zeros((len(tokenized_texts), len(self.vocab))), columns=self.vocab)

        for i, text in enumerate(tokenized_texts):
            for word in text:
                bow_df.at[i, word] += 1

        word_probability = bow_df.sum(axis=0) 
   
        return pd.DataFrame([word_probability])    
        
    @staticmethod
    def _create_vocab(tokenized_texts: list) -> list:
        """
        Helper function for fit.

        Parameters:
            :tokenized_texts (list): A list of the words of each sample

        Returns:
            The vocabulary with all unique words
        """
        return list(set(word for text in tokenized_texts for word in text))
    
    @staticmethod
    def _tokenize(texts: pd.Series) -> list:
        """
        Helper function for fit.

        Parameters:
            :texts (pandas.Series): All texts in a series

        Returns:
            The texts as a list of lists of tokenized texts, input is cleaned from special characters
        """
        def clean_text(text):
                # remove everything but letters and whitespace (note: falsifies some words, e.g. Gerard's --> Gerards)
                text = re.sub(r"[^a-zA-Z\s]", "", text.lower())     
                tokens = text.split()   # split words by whitespace
                return tokens

        texts = texts.tolist()
        return [clean_text(text) for text in texts]
PyLearn 1.2.0 documentation

Source code for pylearn.classification.multinomial_naive_bayes