Gaussian Discriminant Analysis

This notebook runs 1D Gaussian discriminant analysis on a database of match metadata (player APMs and winners).


In [1]:
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

from w3ml.db import Database

In [125]:
class GDA(object):
    """One-dimensional Gaussian Discriminant Analysis.

    Fits class-conditional Gaussians p(x|y=0) ~ N(mu0, Sigma) and
    p(x|y=1) ~ N(mu1, Sigma) with a single shared variance Sigma,
    plus the class prior phi = p(y=1), from labeled training data.
    """

    def __init__(self, x, y):
        """Fit the GDA parameters from training data.

        Parameters
        ----------
        x : 1d float array
            Feature values.
        y : 1d boolean array
            Class labels, same length as x.

        Raises
        ------
        ValueError
            If either class has no examples, since the class means
            (and hence the model) would be undefined.
        """
        self.x_i = x
        self.y_i = y
        self.m = m = len(x)
        numtrue = int(np.sum(y))
        numfalse = m - numtrue
        if numtrue == 0 or numfalse == 0:
            raise ValueError("each class must have at least one example")
        self.phi = numtrue / float(m)
        # class-conditional means
        self.mu0 = mu0 = np.sum(x[~y]) / numfalse
        self.mu1 = mu1 = np.sum(x[y]) / numtrue
        # pooled (shared) variance over both classes
        self.Sigma = Sigma = (np.sum((x[~y] - mu0)**2) + np.sum((x[y] - mu1)**2)) / float(m)

        # normalization constant of the 1d gaussian density
        self._k = 1.0 / np.sqrt(2 * np.pi * Sigma)

    def p(self, x, y=True):
        """Return the class-conditional density p(x | y) at x.

        Note this is the gaussian density for the given class, not the
        posterior probability p(y | x).
        """
        mu = self.mu1 if y else self.mu0
        return self._k * np.exp(-((x - mu)**2) / (2 * self.Sigma))

    def decision_boundary(self, mid=0.5):
        """Return the x at which the posterior p(y=1|x) equals ``mid``.

        Solving phi*(1-mid)*p(x|1) = mid*(1-phi)*p(x|0) for x gives

            x = (2*Sigma*a + mu0**2 - mu1**2) / (2*(mu0 - mu1))

        with a = log(phi*(1-mid) / (mid*(1-phi))).
        """
        phi = self.phi
        mu0 = self.mu0
        mu1 = self.mu1
        a = np.log(phi * (1.0 - mid)/(mid*(1.0 - phi)))
        # BUGFIX: the quadratic terms enter halved -- the previous
        # version dropped the factor of 2 in the denominator, which put
        # the mid=0.5 boundary for balanced classes at mu0+mu1 instead
        # of the midpoint (mu0+mu1)/2.
        numer = 2.0 * self.Sigma * a + mu0**2 - mu1**2
        return numer / (2.0 * (mu0 - mu1))

    def plot(self, mid=0.5):
        """Plot the training data, both fitted class densities, and the
        decision boundary for posterior level ``mid``.
        """
        # compute
        x = self.x_i
        y = self.y_i

        # span 3 standard deviations beyond both class means
        s = np.sqrt(self.Sigma)
        distx = np.linspace(min(self.mu0, self.mu1) - 3*s, max(self.mu0, self.mu1) + 3*s, 101)
        dist0 = self.p(distx, y=False)
        dist1 = self.p(distx, y=True)
        b = self.decision_boundary(mid=mid)
        x0 = x[~y]
        x1 = x[y]

        # left axis: category labels, nudged off 0/1 so points stay visible
        fig, ax1 = plt.subplots()
        fig.set_size_inches(10.0, 6.0)
        ax1.plot(x0, np.zeros(len(x0)) + 0.01, 'rx')
        ax1.plot(x1, np.ones(len(x1)) - 0.01, 'bo')
        ax1.plot([b, b], [0, 1], 'k--')  # decision boundary
        ax1.set_xlabel('x')
        ax1.set_ylabel('category')

        # right axis: the two class-conditional gaussian densities
        ax2 = ax1.twinx()
        ax2.plot(distx, dist0, 'r-')
        ax2.plot(distx, dist1, 'b-')
        ax2.set_ylabel('p(x), gaussian')

First, load the data:


In [126]:
# Open the w3ml database file and copy all match metadata into memory;
# the context manager closes the file when the block exits.
with Database('db.h5') as db:
    metadata = db.metadata[:]

Try looking at each player's total APM, treating every player in every match as one sample


In [127]:
# Stack both players' actions-per-minute into one 1d sample array, and
# build matching boolean labels: True where that player won the match.
winner = metadata['winner']
apm = np.concatenate([metadata['apm1'], metadata['apm2']])
won = np.concatenate([metadata['pid1'] == winner, metadata['pid2'] == winner])

In [128]:
# Fit GDA on raw APM: does a player's APM alone separate winners from losers?
gda = GDA(apm, won)

In [129]:
# Show training points, both fitted densities, and the decision boundary.
gda.plot()


Try with the difference between the two players' APMs


In [130]:
# Label: did player 1 win?  Feature: player 1's APM advantage over player 2.
p1_is_winner = metadata['pid1'] == metadata['winner']
apm_gap = metadata['apm1'] - metadata['apm2']
gda = GDA(apm_gap, p1_is_winner)

In [131]:
# Visualize the fit on the APM difference.
gda.plot()


Try with relative APM


In [132]:
# Relative APM: the fraction by which player 1's APM exceeds player 2's.
p1_is_winner = metadata['pid1'] == metadata['winner']
rel_apm = 1.0 - (metadata['apm2'] / metadata['apm1'])
gda = GDA(rel_apm, p1_is_winner)

In [133]:
# Visualize the fit on the relative APM.
gda.plot()


Threshold the data:


In [134]:
threshold = 300  # minimum APM for a game to count
# Keep only games in which at least one player exceeds the threshold,
# then refit on relative APM over that subset.
fast_games = (metadata['apm1'] > threshold) | (metadata['apm2'] > threshold)
rel_apm = 1.0 - (metadata['apm2'] / metadata['apm1'])
p1won = metadata['pid1'] == metadata['winner']
gda = GDA(rel_apm[fast_games], p1won[fast_games])

In [135]:
# Visualize the fit on the thresholded subset.
gda.plot()



In [135]: