In [1]:
import pandas as pd
import numpy as np
In [2]:
def get_accuracy(x: pd.DataFrame, y: pd.Series, y_hat: pd.Series):
    """Print per-class and overall classification counts plus the accuracy.

    Parameters
    ----------
    x : pd.DataFrame
        Feature matrix. Accepted for signature symmetry; not read here.
    y : pd.Series
        True labels. Entries equal to 1 are reported as "Class 1", every
        other entry as "Class 0".
    y_hat : pd.Series
        Predicted labels, compared element-wise against ``y``.

    Returns
    -------
    None
        The four report lines are written to stdout.
    """
    hits = y_hat == y
    is_positive = y == 1
    is_negative = ~is_positive

    n_total = len(y)
    n_positive = len(y[is_positive])
    n_negative = len(y[is_negative])
    accuracy = np.sum(hits) / n_total

    print(f'Class 0: tested {n_negative}, correctly classified {hits[is_negative].sum()}')
    print(f'Class 1: tested {n_positive}, correctly classified {hits[is_positive].sum()}')
    print(f'Overall: tested {n_total}, correctly classified {hits.sum()}')
    print(f'Accuracy = {accuracy:.2f}')
In [3]:
class Classifier:
    """Base class that wires training and accuracy reporting together.

    Subclasses are expected to provide ``train(x, y, mle)`` and
    ``predict(x)``; this class only orchestrates them.
    """

    def __init__(self, dataset: str = None, mle: bool = True):
        """Optionally load a dataset, train on it and print both accuracies.

        When ``dataset`` is falsy nothing happens. Otherwise the files
        ``datasets/<dataset>-train.txt`` and ``datasets/<dataset>-test.txt``
        are loaded through ``reader``, the model is trained on the training
        split, and accuracy is printed for the training and test splits.

        NOTE(review): ``reader`` is not defined in the visible part of this
        notebook; it has to be in scope before a dataset name is passed.
        """
        if not dataset:
            return

        x_train, y_train = reader(f'datasets/{dataset}-train.txt')
        x_test, y_test = reader(f'datasets/{dataset}-test.txt')
        self.train(x_train, y_train, mle)

        reports = (
            ('Training accuracy', x_train, y_train),
            ('Test accuracy', x_test, y_test),
        )
        for title, features, labels in reports:
            print(title)
            print('=' * 10)
            self.accuracy(features, labels)

    def accuracy(self, x: pd.DataFrame, y: pd.DataFrame) -> None:
        """Predict on ``x`` and print the accuracy report against ``y``."""
        get_accuracy(x, y, self.predict(x))
In [4]:
class LR(Classifier):
    """Binary logistic regression fitted by batch gradient ascent.

    The model is ``P(y = 1 | x) = sigmoid(w0 + w . x)``. ``train`` maximises
    the log-likelihood with a fixed step size ``eta`` for ``epochs`` full
    passes over the data, starting from all-zero weights.

    Attributes
    ----------
    ws : np.ndarray or None
        Weight vector, intercept first. ``None`` until ``train`` has run.
    eta : float
        Step size used for every gradient update.
    epochs : int
        Number of full-batch gradient updates performed by ``train``.
    """

    def __init__(self,
                 eta: float = 1e-4,
                 epochs: int = int(1e4),
                 dataset: str = None):
        """Store the hyper-parameters and optionally train on ``dataset``.

        Parameters
        ----------
        eta : float
            Gradient-ascent step size.
        epochs : int
            Number of gradient updates.
        dataset : str, optional
            Dataset name forwarded to the base-class constructor, which
            trains and prints accuracies when it is truthy.
        """
        self.ws = None
        self.eta = eta
        self.epochs = epochs
        # The attributes above must exist before the base constructor runs,
        # because it may call train() straight away.
        super().__init__(dataset, mle=True)

    @staticmethod
    def _sigmoid(x: np.array):
        """Return the logistic function ``1 / (1 + exp(-x))`` element-wise.

        Evaluated as ``exp(-log(1 + exp(-x)))`` through ``np.logaddexp`` so
        that large negative inputs do not overflow inside ``exp``.
        """
        return np.exp(-np.logaddexp(0, -x))

    @staticmethod
    def _prepend_intercept(x: pd.DataFrame) -> pd.DataFrame:
        """Return ``x`` with a leading column of ones named ``'intercept'``.

        The intercept column reuses ``x.index``: ``pd.concat(axis=1)`` aligns
        on the index, so a fresh 0..n-1 index would add NaN rows whenever
        ``x`` carries any other index (e.g. after filtering or splitting).
        """
        intercept = pd.DataFrame([1] * len(x),
                                 columns=['intercept'],
                                 index=x.index)
        return pd.concat([intercept, x], axis=1)

    def train(self, x: pd.DataFrame, y: pd.Series, mle=None) -> None:
        """Fit ``self.ws`` by full-batch gradient ascent on the log-likelihood.

        Parameters
        ----------
        x : pd.DataFrame
            Feature matrix without the intercept column.
        y : pd.Series
            0/1 labels carrying the same index as ``x``.
        mle : optional
            Accepted for compatibility with the base-class call; not used.
        """
        xx = self._prepend_intercept(x)
        self.ws = np.zeros(xx.shape[1])
        for _ in range(self.epochs):
            # Log-likelihood gradient: X^T (y - sigmoid(X w)).
            gs = (y - self._sigmoid(xx @ self.ws)) @ xx
            self.ws += self.eta * gs.values

    def predict(self, x: pd.DataFrame) -> pd.Series:
        """Return a boolean Series that is True where P(y = 1 | x) > 0.5.

        ``sigmoid(z) > 0.5`` holds exactly when ``z > 0``, so the linear
        score is compared against 0. Comparing the raw score against 0.5
        would only predict the positive class above a probability of about
        0.62.
        """
        xx = self._prepend_intercept(x)
        return (xx @ self.ws) > 0

    def _ll(self, x: pd.DataFrame,
            y: pd.DataFrame, ws: np.array) -> float:
        """Return the Bernoulli log-likelihood of ``y`` under weights ``ws``.

        ``x`` must already contain the intercept column. The two log terms
        are written with ``np.logaddexp``:
        ``log(sigmoid(z)) = -logaddexp(0, -z)`` and
        ``log(1 - sigmoid(z)) = -logaddexp(0, z)``.
        This keeps the result finite when the sigmoid saturates, where the
        naive form evaluates ``0 * log(0)`` and yields NaN.
        """
        z = x @ ws
        return -(y @ np.logaddexp(0, -z) + (1 - y) @ np.logaddexp(0, z))

    def ll(self, x: pd.DataFrame, y: pd.Series) -> float:
        """Return the log-likelihood of ``(x, y)`` under the fitted weights."""
        xx = self._prepend_intercept(x)
        return self._ll(xx, y, self.ws)

    def ll_all_zero(self, x: pd.DataFrame, y: pd.Series) -> float:
        """Return the log-likelihood of ``(x, y)`` under all-zero weights.

        With zero weights every predicted probability is 0.5, which gives a
        baseline to compare the fitted log-likelihood against.
        """
        xx = self._prepend_intercept(x)
        ws_zero = np.zeros(xx.shape[1])
        return self._ll(xx, y, ws_zero)
In [5]:
# Toy dataset: two binary features, one row per combination; the label
# equals the first feature (x1).
x = pd.DataFrame([[0, 0], [0, 1], [1, 0], [1, 1]], columns=['x1', 'x2'])
y = pd.Series([0, 0, 1, 1])
In [6]:
# Display the toy feature matrix.
x
Out[6]:
In [7]:
# Display the toy labels.
y
Out[7]:
In [17]:
%%time
# Build an LR with 10 epochs and step size 0.1. No dataset name is passed,
# so the constructor itself does not train.
lr = LR(epochs=10, eta=1e-1)
# Fit on the toy data, then print the accuracy report for those same rows.
lr.train(x, y)
lr.accuracy(x, y)
In [18]:
# Inspect the fitted weights: intercept first, then one weight per feature column.
lr.ws
Out[18]: