In [2]:
import pandas as pd
import numpy as np
In [3]:
def get_accuracy(x: pd.DataFrame, y: pd.Series, y_hat: pd.Series):
    # Print per-class and overall accuracy of predictions y_hat against labels y.
    # (x is accepted for interface consistency but is not used.)
    correct = y_hat == y
    acc = np.sum(correct) / len(y)
    cond = y == 1
    y1 = len(y[cond])
    y0 = len(y[~cond])
    print(f'Class 0: tested {y0}, correctly classified {correct[~cond].sum()}')
    print(f'Class 1: tested {y1}, correctly classified {correct[cond].sum()}')
    print(f'Overall: tested {len(y)}, correctly classified {correct.sum()}')
    print(f'Accuracy = {acc:.2f}')
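As a quick sanity check, `get_accuracy` can be exercised on hand-made labels. The two series below are hypothetical, and the empty frame stands in for the unused `x` argument:

y_true = pd.Series([0, 1, 1, 0])
y_pred = pd.Series([0, 1, 0, 0])   # one class-1 example misclassified
get_accuracy(pd.DataFrame(), y_true, y_pred)  # prints Accuracy = 0.75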
In [4]:
class Classifier:
    def __init__(self, dataset: str = None, mle: bool = True):
        if dataset:
            # `reader` is assumed to be defined earlier in the notebook;
            # it loads a dataset file into a (features, labels) pair.
            x_train, y_train = reader(f'datasets/{dataset}-train.txt')
            x_test, y_test = reader(f'datasets/{dataset}-test.txt')
            self.train(x_train, y_train, mle)
            print('Training accuracy')
            print('=' * 10)
            self.accuracy(x_train, y_train)
            print('Test accuracy')
            print('=' * 10)
            self.accuracy(x_test, y_test)

    def accuracy(self, x: pd.DataFrame, y: pd.Series) -> None:
        y_hat = self.predict(x)
        get_accuracy(x, y, y_hat)
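The base class leaves `train` and `predict` to subclasses. A minimal sketch of that contract, using a hypothetical majority-class baseline (not part of the original notebook):

class Majority(Classifier):
    # Hypothetical baseline: always predict the most frequent training label.
    def train(self, x: pd.DataFrame, y: pd.Series, mle: bool = True):
        self.label = int(y.value_counts().idxmax())

    def predict(self, x: pd.DataFrame) -> pd.Series:
        return pd.Series([self.label] * len(x))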
In [5]:
class NB(Classifier):
    def __init__(self, dataset: str = None, mle: bool = True):
        self.prior = None                   # P(y = c) for c in {0, 1}
        self.p_xi_given_y = {0: {}, 1: {}}  # P(x_i = v | y = c) per class, column, value
        self.prior_x = {}                   # P(x_i = v) per column and value
        self.cols = None
        super().__init__(dataset, mle)

    def train(self, x: pd.DataFrame, y: pd.Series, mle: bool = True):
        # With mle=False, use add-one (Laplace) smoothing: +1 to every count,
        # +2 to every denominator, since all variables are binary.
        adj_den = 0 if mle else 2
        adj_num = 0 if mle else 1
        # Class priors P(y = c).
        self.prior = y.value_counts().to_dict()
        for c in [0, 1]:
            self.prior[c] += adj_num
            self.prior[c] /= (len(y) + adj_den)
        self.cols = x.columns
        # Feature marginals P(x_i = v), used by most_indicative.
        for col in x.columns:
            self.prior_x[col] = (x[col].value_counts() / len(y)).to_dict()
        # Per-class example counts (smoothed denominators).
        cond = y == 1
        y1 = np.sum(cond)
        y0 = len(y) - y1
        y1 += adj_den
        y0 += adj_den
        x_pos = x[cond]
        x_neg = x[~cond]
        # Conditional likelihoods P(x_i = v | y = c).
        for cls in [0, 1]:
            for col in x.columns:
                x_cls = x_pos if cls == 1 else x_neg
                y_cls = y1 if cls == 1 else y0
                x1 = len(x_cls.query(f'{col} == 1'))
                x0 = len(x_cls.query(f'{col} == 0'))
                x1 += adj_num
                x0 += adj_num
                self.p_xi_given_y[cls][col] = {
                    0: x0 / y_cls,
                    1: x1 / y_cls
                }
    def predict(self, x: pd.DataFrame) -> pd.Series:
        out = []
        for _, row in x.iterrows():
            m = {}
            for cls in [0, 1]:
                # Log-posterior up to a shared constant:
                # log P(y = cls) + sum_i log P(x_i | y = cls).
                m[cls] = np.log([self.prior[cls]] + [
                    self.p_xi_given_y[cls][col][row[col]]
                    for col in x.columns
                ]).sum()
            out.append(1 if m[1] >= m[0] else 0)
        return pd.Series(out)
    def _get_ind(self, col):
        # Ratio P(y=1 | x_i=1) / P(y=1 | x_i=0); by Bayes' rule this equals
        # P(x_i=1 | y=1) P(x_i=0) / (P(x_i=0 | y=1) P(x_i=1)).
        num = self.prior_x[col][0] * self.p_xi_given_y[1][col][1]
        den = self.prior_x[col][1] * self.p_xi_given_y[1][col][0]
        return num / den

    def most_indicative(self):
        # Rank features by how strongly x_i = 1 indicates class 1.
        return pd.Series({
            col: self._get_ind(col)
            for col in self.cols
        }).sort_values(ascending=False)
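For reference, `predict` implements the usual naive Bayes decision rule in log space, which assumes the features are conditionally independent given the class:

$$\hat{y} = \arg\max_{c \in \{0, 1\}} \Big( \log P(y = c) + \sum_i \log P(x_i \mid y = c) \Big)$$

With `mle=True`, a feature value never observed in a class gets probability 0, so its log is $-\infty$ and that class is ruled out for any row containing the value; `mle=False` avoids this with Laplace smoothing.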
In [6]:
x = pd.DataFrame({'x1': [0, 0, 1, 1], 'x2': [0, 1, 0, 1]})
y = pd.Series([0, 0, 1, 1])
In [8]:
x
Out[8]:
   x1  x2
0   0   0
1   0   1
2   1   0
3   1   1
In [10]:
nb = NB()
nb.train(x, y)
nb.accuracy(x, y)
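Since the MLE fit above assigns zero probability to feature values unseen within a class, a smoothed fit is often preferable. A sketch of that usage on the same toy data, together with the per-feature ranking (outputs not shown):

nb_map = NB()
nb_map.train(x, y, mle=False)  # Laplace-smoothed estimates
nb_map.accuracy(x, y)
nb_map.most_indicative()       # features ranked by how strongly x_i = 1 indicates y = 1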