Logistic regression from scratch


In [1]:
import pandas as pd
import numpy as np

In [2]:
def get_accuracy(y: pd.Series, y_hat: pd.Series) -> None:
    # Report per-class and overall accuracy of the predictions y_hat.
    correct = y_hat == y
    acc = np.sum(correct) / len(y)
    cond = y == 1
    y1 = len(y[cond])   # number of class-1 examples
    y0 = len(y[~cond])  # number of class-0 examples

    print(f'Class 0: tested {y0}, correctly classified {correct[~cond].sum()}')
    print(f'Class 1: tested {y1}, correctly classified {correct[cond].sum()}')
    print(f'Overall: tested {len(y)}, correctly classified {correct.sum()}')
    print(f'Accuracy = {acc:.2f}')

In [3]:
class Classifier:
    def __init__(self, dataset: str = None, mle: bool = True):
        # `reader` is assumed to be defined elsewhere; it loads a dataset
        # file into a feature DataFrame and a label Series.
        if dataset:
            x_train, y_train = reader(f'datasets/{dataset}-train.txt')
            x_test, y_test = reader(f'datasets/{dataset}-test.txt')
            self.train(x_train, y_train, mle)
            print('Training accuracy')
            print('=' * 10)
            self.accuracy(x_train, y_train)
            print('Test accuracy')
            print('=' * 10)
            self.accuracy(x_test, y_test)

    def accuracy(self, x: pd.DataFrame, y: pd.Series) -> None:
        y_hat = self.predict(x)
        get_accuracy(y, y_hat)

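The constructor leans on a `reader` helper that is not defined in this notebook. A minimal sketch of what it might look like, assuming whitespace-separated text files with the label in the last column (the file format and return convention are assumptions, not shown in the original):

import pandas as pd

def reader(path: str):
    # Hypothetical loader: whitespace-separated columns, label last.
    df = pd.read_csv(path, sep=r'\s+', header=None)
    return df.iloc[:, :-1], df.iloc[:, -1]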
In [4]:
class LR(Classifier):
    def __init__(self,
                 eta: float = 1e-4,
                 epochs: int = int(1e4),
                 dataset: str = None):
        self.ws = None        # learned weights, intercept first
        self.eta = eta        # learning rate
        self.epochs = epochs
        
        super().__init__(dataset, mle=True)
        
    @staticmethod
    def _sigmoid(x: np.ndarray):
        # Note: overflows for large negative inputs;
        # scipy.special.expit is a numerically stable alternative.
        return 1 / (1 + np.exp(-x))
        
    @staticmethod
    def _prepend_intercept(x: pd.DataFrame) -> pd.DataFrame:
        # Add a constant column so the bias is learned as w_0.
        return pd.concat([pd.DataFrame([1] * len(x),
                                       columns=['intercept'],
                                       index=x.index), x], axis=1)
    
    def train(self, x: pd.DataFrame, y: pd.Series, mle=None) -> None:
        # Batch gradient ascent on the log-likelihood; `mle` is unused
        # here but kept for the Classifier interface.
        xx = self._prepend_intercept(x)
        self.ws = np.zeros(xx.shape[1])
        for _ in range(self.epochs):
            gs = (y - self._sigmoid(xx @ self.ws)) @ xx
            self.ws += self.eta * gs.values
    
    def predict(self, x: pd.DataFrame) -> pd.Series:
        xx = self._prepend_intercept(x)
        # Predict class 1 when sigma(x.w) > 0.5, i.e. the probability
        # threshold applies to the sigmoid output, not the raw logit.
        return self._sigmoid(xx @ self.ws) > 0.5
    
    def _ll(self, x: pd.DataFrame,
            y: pd.Series, ws: np.ndarray) -> float:
        s = self._sigmoid(x @ ws)
        return y @ np.log(s) + (1 - y) @ np.log(1 - s)
    
    def ll(self, x: pd.DataFrame, y: pd.Series) -> float:
        xx = self._prepend_intercept(x)
        return self._ll(xx, y, self.ws)
    
    def ll_all_zero(self, x: pd.DataFrame, y: pd.Series) -> float:
        # Log-likelihood of the baseline model with all weights at zero.
        xx = self._prepend_intercept(x)
        ws_zero = np.zeros(xx.shape[1])
        return self._ll(xx, y, ws_zero)

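The update in train is plain batch gradient ascent on the log-likelihood. For labels y_i in {0, 1}, design matrix X (with the intercept column prepended) and sigmoid \sigma, the objective and its gradient are

    \ell(w) = \sum_i \left[ y_i \log \sigma(x_i^\top w) + (1 - y_i) \log\bigl(1 - \sigma(x_i^\top w)\bigr) \right]

    \nabla_w \ell(w) = X^\top \bigl( y - \sigma(X w) \bigr)

The gradient is exactly `(y - self._sigmoid(xx @ self.ws)) @ xx` in the loop, and each epoch moves the weights by eta times it.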
In [5]:
x = pd.DataFrame({'x1': [0, 0, 1, 1], 'x2': [0, 1, 0, 1]})
y = pd.Series([0, 0, 1, 1])

In [6]:
x


Out[6]:
   x1  x2
0   0   0
1   0   1
2   1   0
3   1   1

In [7]:
y


Out[7]:
0    0
1    0
2    1
3    1
dtype: int64

In [17]:
%%time
lr = LR(epochs=10, eta=1e-1)
lr.train(x, y)
lr.accuracy(x, y)


Class 0: tested 2, correctly classified 2
Class 1: tested 2, correctly classified 2
Overall: tested 4, correctly classified 4
Accuracy = 1.00
CPU times: user 18.6 ms, sys: 2.61 ms, total: 21.2 ms
Wall time: 18.8 ms

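As a sanity check, the learned boundary can be compared against scikit-learn (a sketch, assuming scikit-learn is installed; the coefficients will not match exactly because LogisticRegression applies L2 regularisation by default):

from sklearn.linear_model import LogisticRegression

sk = LogisticRegression()       # regularised with C=1.0 by default
sk.fit(x, y)
print(sk.intercept_, sk.coef_)  # compare with lr.ws
print(sk.score(x, y))           # mean accuracy on the training data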
In [18]:
lr.ws


Out[18]:
array([-0.14223617,  0.83037099, -0.06601339])
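
The ll and ll_all_zero methods allow judging the fit beyond raw accuracy. One way to use them is a McFadden-style pseudo-R-squared with the all-zero-weights model as the baseline (a sketch; McFadden's original definition uses an intercept-only null model rather than all-zero weights):

# How much the fitted model improves on the all-zero baseline
# (1.0 = perfect fit, 0.0 = no better than the baseline).
pseudo_r2 = 1 - lr.ll(x, y) / lr.ll_all_zero(x, y)
print(f'pseudo-R^2 = {pseudo_r2:.3f}')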