Naive Bayes from scratch


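For reference, the decision rule that the classifier below computes is the standard naive Bayes rule: pick the class that maximizes the log prior plus the sum of log conditional likelihoods, under the conditional-independence ("naive") assumption. In LaTeX:

\hat{y} \;=\; \arg\max_{c \in \{0, 1\}} \Big( \log P(y = c) + \sum_{i} \log P(x_i \mid y = c) \Big)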
In [2]:
import pandas as pd
import numpy as np

In [3]:
def get_accuracy(x: pd.DataFrame, y: pd.Series, y_hat: pd.Series):
    # Compare predictions with the true labels and report per-class
    # and overall accuracy. Note that x itself is not used here.
    correct = y_hat == y
    acc = np.sum(correct) / len(y)
    cond = y == 1
    y1 = len(y[cond])
    y0 = len(y[~cond])

    print(f'Class 0: tested {y0}, correctly classified {correct[~cond].sum()}')
    print(f'Class 1: tested {y1}, correctly classified {correct[cond].sum()}')
    print(f'Overall: tested {len(y)}, correctly classified {correct.sum()}')
    print(f'Accuracy = {acc:.2f}')

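A quick sanity check of get_accuracy on hand-written values (illustrative only; these labels and predictions are made up, and since the x argument is unused it can be anything):

y_true = pd.Series([0, 1, 1, 0])
y_pred = pd.Series([0, 1, 0, 0])
get_accuracy(None, y_true, y_pred)  # should report 2/2 for class 0, 1/2 for class 1, accuracy 0.75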
In [4]:
class Classifier:
    def __init__(self, dataset: str = None, mle: bool=True):
        if dataset:
            # reader is expected to return a (features, labels) pair for a dataset file.
            x_train, y_train = reader(f'datasets/{dataset}-train.txt')
            x_test, y_test = reader(f'datasets/{dataset}-test.txt')
            self.train(x_train, y_train, mle)
            print('Training accuracy')
            print('=' * 10)
            self.accuracy(x_train, y_train)
            print('Test accuracy')
            print('=' * 10)
            self.accuracy(x_test, y_test)
            
    def accuracy(self, x: pd.DataFrame, y: pd.Series) -> None:
        y_hat = self.predict(x)
        get_accuracy(x, y, y_hat)

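The Classifier constructor relies on a reader helper that is not shown in this section. Below is a minimal sketch of what such a helper might look like, assuming each dataset file is whitespace-separated with binary features followed by the label in the last column; the file format and the x1, x2, ... column names are assumptions for illustration, not the original helper.

def reader(path: str):
    # Assumed format: whitespace-separated values, label in the last column.
    df = pd.read_csv(path, sep=r'\s+', header=None)
    x = df.iloc[:, :-1]
    x.columns = [f'x{i + 1}' for i in range(x.shape[1])]  # hypothetical column names
    y = df.iloc[:, -1]
    return x, y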
In [5]:
class NB(Classifier):
    def __init__(self, dataset: str = None, mle: bool=True):
        self.prior = None
        self.p_xi_given_y = {0: {}, 1: {}}
        self.prior_x = {}
        self.cols = None
        super().__init__(dataset, mle)
    
    def train(self, x: pd.DataFrame, y: pd.Series, mle: bool=True):
        # MLE uses raw counts; otherwise apply add-one (Laplace) smoothing.
        adj_den = 0 if mle else 2
        adj_num = 0 if mle else 1
        self.prior = y.value_counts().to_dict()
        for c in [0, 1]:
            self.prior[c] += adj_num
            self.prior[c] /= (len(y) + adj_den)
        
        self.cols = x.columns
        for col in x.columns:
            self.prior_x[col] = (x[col].value_counts() / len(y)).to_dict()
        
        cond = y == 1
        y1 = np.sum(cond)
        y0 = len(y) - y1
        y1 += adj_den
        y0 += adj_den
        x_pos = x[cond]
        x_neg = x[~cond]
        for cls in [0, 1]:
            for col in x.columns:
                x_cls = x_pos if cls == 1 else x_neg
                y_cls = y1 if cls == 1 else y0
                x1 = len(x_cls.query(f'{col} == 1'))
                x0 = len(x_cls.query(f'{col} == 0'))
                
                x1 += adj_num
                x0 += adj_num
                
                self.p_xi_given_y[cls][col] = {
                    0: x0 / y_cls,
                    1: x1 / y_cls
                }
    
    def predict(self, x: pd.DataFrame) -> pd.Series:
        out = []
        for _, row in x.iterrows():
            m = {}
            for cls in [0, 1]:
                m[cls] = np.log([self.prior[cls]] + [
                    self.p_xi_given_y[cls][col][row[col]]
                    for col in x.columns
                ]).sum()
            out.append(1 if m[1] >= m[0] else 0)
        return pd.Series(out, index=x.index)
    
    def _get_ind(self, col):
        # Ratio P(y=1 | x_i=1) / P(y=1 | x_i=0); the prior P(y=1) cancels out.
        num = self.prior_x[col][0] * self.p_xi_given_y[1][col][1]
        den = self.prior_x[col][1] * self.p_xi_given_y[1][col][0]
        return num / den
    
    def most_indicative(self):
        return pd.Series({
            col: self._get_ind(col)
            for col in self.cols
        }).sort_values(ascending=False)

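For reference, when mle=False the adjusted counts in train implement the usual add-one (Laplace) estimate for a binary feature,

P(x_i = v \mid y = c) \;=\; \frac{\#\{x_i = v,\; y = c\} + 1}{\#\{y = c\} + 2}

and the class prior is smoothed the same way, with 1 added to each class count and 2 to the total number of examples.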
In [6]:
x = pd.DataFrame({'x1': [0, 0, 1, 1], 'x2': [0, 1, 0, 1]})
y = pd.Series([0, 0, 1, 1])

In [8]:
x


Out[8]:
   x1  x2
0   0   0
1   0   1
2   1   0
3   1   1

In [10]:
nb = NB()
nb.train(x, y)
nb.accuracy(x, y)


Class 0: tested 2, correctly classified 2
Class 1: tested 2, correctly classified 2
Overall: tested 4, correctly classified 4
Accuracy = 1.00
/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:50: RuntimeWarning: divide by zero encountered in log
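The divide-by-zero warning comes from taking log(0): with MLE estimates on this tiny dataset, x1 perfectly predicts y, so P(x1=0 | y=1) and P(x1=1 | y=0) are exactly zero. Training with mle=False applies the Laplace smoothing described above and avoids the zero probabilities; a sketch (output not reproduced here):

nb_map = NB()
nb_map.train(x, y, mle=False)   # smoothed estimates, no zero probabilities
nb_map.accuracy(x, y)
nb_map.most_indicative()        # per-feature ratio P(y=1 | x_i=1) / P(y=1 | x_i=0)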