In [1]:
import numpy as np
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
from w3ml.db import Database
In [125]:
class GDA(object):
    """One-dimensional Gaussian discriminant analysis for a binary label.

    Fits a Bernoulli prior ``phi = p(y=1)``, one mean per class (``mu0`` for
    ``y=False``, ``mu1`` for ``y=True``) and a single pooled variance
    ``Sigma`` that both class-conditional Gaussians share.
    """

    def __init__(self, x, y):
        """Estimate the model parameters from training data.

        Parameters
        ----------
        x : 1d array-like of numbers
            Feature values; converted to a float array.
        y : 1d array-like of booleans
            Class labels, same shape as ``x``; converted to a boolean array
            so that ``~y`` is a logical (not bitwise) negation.

        Raises
        ------
        ValueError
            If ``x`` and ``y`` are not 1d arrays of the same shape, or if one
            of the two classes has no samples (its mean would be undefined).
        """
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=bool)
        if x.ndim != 1 or x.shape != y.shape:
            raise ValueError(
                "x and y must be 1d arrays of the same length, got shapes "
                "%r and %r" % (x.shape, y.shape))

        self.x_i = x
        self.y_i = y
        self.m = m = len(x)

        numtrue = int(np.sum(y))
        numfalse = m - numtrue
        if numtrue == 0 or numfalse == 0:
            raise ValueError(
                "both classes need at least one sample "
                "(got %d True and %d False)" % (numtrue, numfalse))

        self.phi = numtrue / float(m)
        self.mu0 = mu0 = np.sum(x[~y]) / numfalse
        self.mu1 = mu1 = np.sum(x[y]) / numtrue
        # Pooled variance: squared deviations from each sample's own class
        # mean, averaged over all m samples.
        self.Sigma = Sigma = (np.sum((x[~y] - mu0)**2)
                              + np.sum((x[y] - mu1)**2)) / float(m)
        # Gaussian normalisation constant, shared by both classes.
        self._k = 1.0 / np.sqrt(2 * np.pi * Sigma)

    def p(self, x, y=True):
        """Return the class-conditional Gaussian density p(x | y).

        ``x`` may be a scalar or an array; ``y`` selects which class mean is
        used (``mu1`` when truthy, ``mu0`` otherwise).
        """
        mu = self.mu1 if y else self.mu0
        return self._k * np.exp(-((x - mu)**2) / (2 * self.Sigma))

    def decision_boundary(self, mid=0.5):
        """Return the x at which the posterior p(y=1 | x) equals ``mid``.

        Setting ``p(x|1) * phi * (1 - mid) == p(x|0) * (1 - phi) * mid`` and
        taking logs gives::

            a + ((x - mu0)**2 - (x - mu1)**2) / (2 * Sigma) = 0
            a = log(phi * (1 - mid) / (mid * (1 - phi)))

        whose solution is
        ``x = (2 * Sigma * a + mu0**2 - mu1**2) / (2 * (mu0 - mu1))``.
        For equal priors and ``mid=0.5`` this is the midpoint of the two
        class means. The boundary is undefined when ``mu0 == mu1`` (the
        expression divides by zero).
        """
        phi = self.phi
        mu0 = self.mu0
        mu1 = self.mu1
        a = np.log(phi * (1.0 - mid) / (mid * (1.0 - phi)))
        numer = 2.0 * self.Sigma * a + mu0**2 - mu1**2
        return numer / (2.0 * (mu0 - mu1))

    def plot(self, mid=0.5):
        """Make a figure of the training data and the fitted model.

        Draws the training points (class 0 as red crosses near 0, class 1 as
        blue circles near 1), the decision boundary for ``mid`` as a dashed
        vertical line, and both class-conditional Gaussians on a second
        y axis.
        """
        # compute
        x = self.x_i
        y = self.y_i
        s = np.sqrt(self.Sigma)
        # Cover three standard deviations beyond the outermost class mean.
        distx = np.linspace(min(self.mu0, self.mu1) - 3*s,
                            max(self.mu0, self.mu1) + 3*s, 101)
        dist0 = self.p(distx, y=False)
        dist1 = self.p(distx, y=True)
        b = self.decision_boundary(mid=mid)
        x0 = x[~y]
        x1 = x[y]
        # plot
        fig, ax1 = plt.subplots()
        fig.set_size_inches(10.0, 6.0)
        # Offset the markers slightly so they are not drawn on the axes frame.
        ax1.plot(x0, np.zeros(len(x0)) + 0.01, 'rx')
        ax1.plot(x1, np.ones(len(x1)) - 0.01, 'bo')
        ax1.plot([b, b], [0, 1], 'k--')
        ax1.set_xlabel('x')
        ax1.set_ylabel('category')
        ax2 = ax1.twinx()
        ax2.plot(distx, dist0, 'r-')
        ax2.plot(distx, dist1, 'b-')
        ax2.set_ylabel('p(x), gaussian')
First, load the data:
In [126]:
# Open the database only for as long as it takes to copy the metadata out.
with Database('db.h5') as db:
    # Slice the metadata table so it can still be used after the block exits.
    # NOTE(review): assumes `[:]` returns an in-memory copy — confirm against w3ml.db.
    metadata = db.metadata[:]
Try looking at each player's APM, pooling both players of every game:
In [127]:
# Pool both players' APM into a single sample vector (player 1 entries first).
apm = np.concatenate((np.ravel(metadata['apm1']), np.ravel(metadata['apm2'])))
winner = metadata['winner']
# Labels in the same order as `apm`: True where that player's id equals the winner.
won = np.concatenate((np.ravel(metadata['pid1'] == winner),
                      np.ravel(metadata['pid2'] == winner)))
In [128]:
# Fit the model: pooled APM as the feature, "this player won" as the label.
gda = GDA(x=apm, y=won)
In [129]:
# Draw the training points, both class Gaussians and the 0.5 decision boundary.
gda.plot(mid=0.5)
Try with the difference in APM between the two players:
In [130]:
# Label: True where player 1's id equals the recorded winner.
p1won = metadata['winner'] == metadata['pid1']
# Feature: signed APM gap, player 1 minus player 2.
# NOTE(review): if the apm columns are unsigned integers this subtraction
# would wrap around — confirm their dtype.
delta_apm = metadata['apm1'] - metadata['apm2']
gda = GDA(x=delta_apm, y=p1won)
In [131]:
# Show the fit on the APM difference with the default 0.5 boundary.
gda.plot(mid=0.5)
Try with relative APM
In [132]:
# Label: True where player 1's id equals the recorded winner.
p1won = metadata['winner'] == metadata['pid1']
# Feature: player 1's APM advantage relative to their own APM (1 - apm2/apm1).
# NOTE(review): rows with apm1 == 0 divide by zero here and would make the
# fitted means/variance non-finite — confirm such rows cannot occur.
rel_apm = 1.0 - (metadata['apm2'] / metadata['apm1'])
gda = GDA(x=rel_apm, y=p1won)
In [133]:
# Show the fit on the relative APM with the default 0.5 boundary.
gda.plot(mid=0.5)
Threshold the data:
In [134]:
# Minimum APM used to select games for this fit.
threshold = 300  # apms
# Same label and feature as the un-thresholded relative-APM fit.
p1won = metadata['winner'] == metadata['pid1']
rel_apm = 1.0 - (metadata['apm2'] / metadata['apm1'])
# Keep a game when at least one of its two players exceeds the threshold.
mask = (metadata['apm2'] > threshold) | (metadata['apm1'] > threshold)
gda = GDA(x=rel_apm[mask], y=p1won[mask])
In [135]:
# Show the fit on the thresholded subset with the default 0.5 boundary.
gda.plot(mid=0.5)
In [135]: