Gaussian Discriminant Analysis

This notebook runs 1D Gaussian discriminant analysis on a database of match metadata (player APMs and winners).


In [1]:
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

from w3ml.db import Database

In [125]:
class GDA(object):
    """One-dimensional Gaussian Discriminant Analysis.

    Fits class-conditional Gaussians p(x|y=0) ~ N(mu0, Sigma) and
    p(x|y=1) ~ N(mu1, Sigma) with a single shared variance Sigma,
    plus the class prior phi = p(y=1), from labeled training data.
    """

    def __init__(self, x, y):
        """Fit the GDA parameters from training data.

        Parameters
        ----------
        x : 1d float array
            Feature values.
        y : 1d boolean array
            Class labels, same length as x.

        Raises
        ------
        ValueError
            If either class has no examples, since the class means
            (and hence the model) would be undefined.
        """
        self.x_i = x
        self.y_i = y
        self.m = m = len(x)
        numtrue = int(np.sum(y))
        numfalse = m - numtrue
        if numtrue == 0 or numfalse == 0:
            raise ValueError("each class must have at least one example")
        self.phi = numtrue / float(m)
        # class-conditional means
        self.mu0 = mu0 = np.sum(x[~y]) / numfalse
        self.mu1 = mu1 = np.sum(x[y]) / numtrue
        # pooled (shared) variance over both classes
        self.Sigma = Sigma = (np.sum((x[~y] - mu0)**2) + np.sum((x[y] - mu1)**2)) / float(m)

        # normalization constant of the 1d gaussian density
        self._k = 1.0 / np.sqrt(2 * np.pi * Sigma)

    def p(self, x, y=True):
        """Return the class-conditional density p(x | y) at x.

        Note this is the gaussian density for the given class, not the
        posterior probability p(y | x).
        """
        mu = self.mu1 if y else self.mu0
        return self._k * np.exp(-((x - mu)**2) / (2 * self.Sigma))

    def decision_boundary(self, mid=0.5):
        """Return the x at which the posterior p(y=1|x) equals ``mid``.

        Solving phi*(1-mid)*p(x|1) = mid*(1-phi)*p(x|0) for x gives

            x = (2*Sigma*a + mu0**2 - mu1**2) / (2*(mu0 - mu1))

        with a = log(phi*(1-mid) / (mid*(1-phi))).
        """
        phi = self.phi
        mu0 = self.mu0
        mu1 = self.mu1
        a = np.log(phi * (1.0 - mid)/(mid*(1.0 - phi)))
        # BUGFIX: the quadratic terms enter halved -- the previous
        # version dropped the factor of 2 in the denominator, which put
        # the mid=0.5 boundary for balanced classes at mu0+mu1 instead
        # of the midpoint (mu0+mu1)/2.
        numer = 2.0 * self.Sigma * a + mu0**2 - mu1**2
        return numer / (2.0 * (mu0 - mu1))

    def plot(self, mid=0.5):
        """Plot the training data, both fitted class densities, and the
        decision boundary for posterior level ``mid``.
        """
        # compute
        x = self.x_i
        y = self.y_i

        # span 3 standard deviations beyond both class means
        s = np.sqrt(self.Sigma)
        distx = np.linspace(min(self.mu0, self.mu1) - 3*s, max(self.mu0, self.mu1) + 3*s, 101)
        dist0 = self.p(distx, y=False)
        dist1 = self.p(distx, y=True)
        b = self.decision_boundary(mid=mid)
        x0 = x[~y]
        x1 = x[y]

        # left axis: category labels, nudged off 0/1 so points stay visible
        fig, ax1 = plt.subplots()
        fig.set_size_inches(10.0, 6.0)
        ax1.plot(x0, np.zeros(len(x0)) + 0.01, 'rx')
        ax1.plot(x1, np.ones(len(x1)) - 0.01, 'bo')
        ax1.plot([b, b], [0, 1], 'k--')  # decision boundary
        ax1.set_xlabel('x')
        ax1.set_ylabel('category')

        # right axis: the two class-conditional gaussian densities
        ax2 = ax1.twinx()
        ax2.plot(distx, dist0, 'r-')
        ax2.plot(distx, dist1, 'b-')
        ax2.set_ylabel('p(x), gaussian')

First, load the data:


In [126]:
# Open the w3ml database file and copy all match metadata into memory;
# the context manager closes the file when the block exits.
with Database('db.h5') as db:
    metadata = db.metadata[:]

Try looking at each player's total APM, treating every player in every match as one sample


In [127]:
# Stack both players' actions-per-minute into one 1d sample array, and
# build matching boolean labels: True where that player won the match.
winner = metadata['winner']
apm = np.concatenate([metadata['apm1'], metadata['apm2']])
won = np.concatenate([metadata['pid1'] == winner, metadata['pid2'] == winner])

In [128]:
# Fit GDA on raw APM: does a player's APM alone separate winners from losers?
gda = GDA(apm, won)

In [129]:
# Show training points, both fitted densities, and the decision boundary.
gda.plot()


Try with the difference between the two players' APMs


In [130]:
# Label: did player 1 win?  Feature: player 1's APM advantage over player 2.
p1_is_winner = metadata['pid1'] == metadata['winner']
apm_gap = metadata['apm1'] - metadata['apm2']
gda = GDA(apm_gap, p1_is_winner)

In [131]:
# Visualize the fit on the APM difference.
gda.plot()


Try with relative APM


In [132]:
# Relative APM: the fraction by which player 1's APM exceeds player 2's.
p1_is_winner = metadata['pid1'] == metadata['winner']
rel_apm = 1.0 - (metadata['apm2'] / metadata['apm1'])
gda = GDA(rel_apm, p1_is_winner)

In [133]:
# Visualize the fit on the relative APM.
gda.plot()


Threshold the data:


In [134]:
threshold = 300  # minimum APM for a game to count
# Keep only games in which at least one player exceeds the threshold,
# then refit on relative APM over that subset.
fast_games = (metadata['apm1'] > threshold) | (metadata['apm2'] > threshold)
rel_apm = 1.0 - (metadata['apm2'] / metadata['apm1'])
p1won = metadata['pid1'] == metadata['winner']
gda = GDA(rel_apm[fast_games], p1won[fast_games])

In [135]:
# Visualize the fit on the thresholded subset.
gda.plot()



In [135]: