In [1]:
%matplotlib inline
from util import *
import sklearn as skl
from sklearn import preprocessing, neighbors, decomposition, mixture, cluster
from sklearn.mixture import GMM  # renamed GaussianMixture in sklearn >= 0.18
from sklearn.cluster import KMeans
import seaborn as sns

In [2]:
our_data = SKLData(*process_data())

In [3]:
our_data.train(skl.neighbors.KNeighborsClassifier(10))


Out[3]:
0.68918918918918914

In [2]:
# Read in our data
data = SKLData("./data/keystroke0.csv",
               [lambda r: [float(rv) for rv in r[3:]],  # features: the timing columns
                lambda r: int(r[0][1:])])               # label: numeric part of the subject id
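
In [ ]:
# util.py is not shown in this notebook. The cells below assume SKLData behaves
# roughly like the sketch here: read a CSV, build X and y with the two row
# functions, and have .train() return accuracy on a held-out split. This is an
# assumed reconstruction for reference, not the actual util.py code (the real
# class also has .plot() and .transform()).
import csv
import numpy as np
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer releases

class SKLDataSketch(object):
    def __init__(self, path, row_fns):
        x_fn, y_fn = row_fns
        X, y = [], []
        with open(path) as f:
            for row in csv.reader(f):
                try:
                    X.append(x_fn(row))
                    y.append(y_fn(row))
                except (ValueError, IndexError):
                    continue  # skip header / malformed rows
        self.X, self.y = np.array(X), np.array(y)

    def train(self, clf, xform=None):
        # Optionally transform the features, then score on a held-out split.
        X = xform(self.X) if xform is not None else self.X
        X_tr, X_te, y_tr, y_te = train_test_split(X, self.y, test_size=0.3)
        clf.fit(X_tr, y_tr)
        return clf.score(X_te, y_te)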

In [24]:
# First we have to ask: what does the data look like?
# For KNN, we want the data to fall into reasonably tight per-user clusters,
# and it looks like it does, at least in the highest-variance dimensions.

# The long tails bias our sample: we can either throw out outliers before
# predicting, or apply a square-root or log transform to compress the tails.
data.plot(2, marker='.', alpha=0.3, color='purple', s=70)
data.plot(3, marker='.', alpha=0.3, color='darkgreen', s=70)
data.plot(5, marker='.', alpha=0.3, color='darkred', s=70)
plt.title("Three Users (2 Highest Variance Dimensions)")


Out[24]:
<matplotlib.text.Text at 0x11004bc10>

In [5]:
data.train(skl.neighbors.KNeighborsClassifier(n_neighbors=2))


Out[5]:
0.74117647058823533

In [6]:
data.train(skl.neighbors.KNeighborsClassifier(n_neighbors=10, weights='uniform', p=1))


Out[6]:
0.82843137254901966

In [7]:
def run_power(p=1, data=data):
    # Signed power transform: compress the long tails while preserving sign.
    powerfun = lambda X: np.sign(X) * np.power(np.abs(X), p)
    clf = skl.neighbors.KNeighborsClassifier(n_neighbors=20)
    return data.train(clf, powerfun)

In [9]:
def run_neighbor(n=20, data=data):
    # Held-out accuracy as a function of the number of neighbors.
    return data.train(skl.neighbors.KNeighborsClassifier(n_neighbors=n))

n_xs = np.array(range(1,40))
n_ys = [run_neighbor(n) for n in n_xs]
ppl.plot(n_xs, n_ys)


Out[9]:
[<matplotlib.lines.Line2D at 0x10e510110>]
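
In [ ]:
# The manual sweep above (and the power sweep below) can also be run with
# GridSearchCV, which cross-validates each setting instead of relying on a
# single split. A sketch only; the parameter ranges are illustrative.
from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in newer releases
from sklearn.neighbors import KNeighborsClassifier

grid = GridSearchCV(KNeighborsClassifier(),
                    {"n_neighbors": list(range(1, 40)), "p": [1, 2]},
                    cv=5)
grid.fit(data.X, data.y)
grid.best_params_, grid.best_score_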

In [8]:
power_xs = np.array(range(1,31))/10.
power_ys = [run_power(p) for p in power_xs]
ppl.plot(power_xs, power_ys)


Out[8]:
[<matplotlib.lines.Line2D at 0x13166130>]
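
In [ ]:
# The signed power transform can also be packaged as a Pipeline step so that it
# always travels with the classifier. A sketch, assuming sklearn >= 0.17 for
# FunctionTransformer; the exponent and k are illustrative values.
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.neighbors import KNeighborsClassifier

def signed_power(X, p=0.5):
    # Same idea as run_power: compress the long tails while preserving sign.
    return np.sign(X) * np.power(np.abs(X), p)

power_knn = Pipeline([
    ("power", FunctionTransformer(signed_power)),
    ("knn", KNeighborsClassifier(n_neighbors=20)),
])
# data.train(power_knn) should then behave like run_power(0.5) above.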

In [9]:
# We also want to ask: if an ideal model used only as many dimensions
# as it actually needs, how many dimensions would that be?

def run_pca(n=2, data=data):
    # Project onto the first n principal components before training KNN.
    lpca = lambda x: skl.decomposition.PCA(n).fit_transform(x)
    clf = skl.neighbors.KNeighborsClassifier(n_neighbors=20, weights='uniform', p=1)
    return data.train(clf, lpca)

In [10]:
pca_xs = range(1, 31)
pca_acc = [run_pca(i) for i in pca_xs]
# It looks like there are about 15 effective dimensions in our data.
# The remaining components add little, probably because people tend
# to type in similar ways.
# Future work: define a more specific model.
ppl.plot(pca_xs, pca_acc)


Out[10]:
[<matplotlib.lines.Line2D at 0x152a1930>]
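
In [ ]:
# If data.train applies the transform to all rows before splitting (as the
# SKLData sketch above assumes), run_pca fits PCA on the held-out rows as well.
# Putting PCA inside a Pipeline keeps the projection fit on training rows only.
# A sketch; n_components=15 simply reflects the observation above.
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score  # sklearn.model_selection in newer releases

pca_knn = Pipeline([
    ("pca", PCA(n_components=15)),
    ("knn", KNeighborsClassifier(n_neighbors=20, p=1)),
])
cross_val_score(pca_knn, data.X, data.y, cv=5).mean()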

In [10]:
# Recast the problem as impostor detection: users in real_users become
# label 1, everyone else becomes label 0.
real_users = [2, 3, 4, 5, 12]

from collections import defaultdict
rdict = defaultdict(lambda: 0, {r: 1 for r in real_users})
data.transform(lambda x: x, lambda ys: np.array([rdict[y] for y in ys]))

In [11]:
# Re-run the neighbor sweep on the new binary (real vs. impostor) labels.
def run_neighbor(n=20, data=data):
    return data.train(skl.neighbors.KNeighborsClassifier(n_neighbors=n))

n_xs = np.array(range(1,40))
n_ys = [run_neighbor(n) for n in n_xs]
ppl.plot(n_xs, n_ys)


Out[11]:
[<matplotlib.lines.Line2D at 0x10e69e050>]
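
In [ ]:
# With only five "real" users the binary labels are heavily imbalanced, so
# accuracy can look good simply by predicting "impostor" everywhere. A sketch
# of a more informative check; the split size and k are illustrative.
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer releases
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

X_tr, X_te, y_tr, y_te = train_test_split(data.X, data.y, test_size=0.3)
clf = KNeighborsClassifier(n_neighbors=20).fit(X_tr, y_tr)
print(classification_report(y_te, clf.predict(X_te)))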

In [28]:
# GMM code.
# Note: sklearn's GMM is unsupervised, so the component indices it predicts
# have no fixed correspondence to the labels; accuracy from data.train will
# therefore look terrible regardless of how well the mixture fits.
ys = [data.train(skl.mixture.GMM(i*5)) for i in xrange(1,21)]
xs = np.array(xrange(1,21))*5
ppl.plot(xs, ys)


Out[28]:
[<matplotlib.lines.Line2D at 0x10d822450>]

In [4]:
gmm = GMM(2)
gmm.fit(data.X)        # unsupervised: the labels are not used
gmm.predict(data.X)


Out[4]:
array([55, 36, 36, ..., 36, 36, 36])

In [3]:
data.train(GMM())


Out[3]:
0.0
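
In [ ]:
# As noted above, GMM component indices are unrelated to the true labels, which
# is why the raw accuracy collapses. Mapping each component to its most common
# true label gives a fairer (if optimistic, since it peeks at y) score.
# A sketch; assumes data.y is an integer numpy array.
import numpy as np
from sklearn.mixture import GMM  # GaussianMixture in sklearn >= 0.18

gmm = GMM(n_components=len(np.unique(data.y))).fit(data.X)
assigned = gmm.predict(data.X)
mapped = np.empty_like(assigned)
for comp in np.unique(assigned):
    members = assigned == comp
    mapped[members] = np.bincount(data.y[members]).argmax()
np.mean(mapped == data.y)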

In [ ]:
from sklearn import cluster
# Agglomerative clustering is unsupervised; data.y is only used here to pick
# the number of clusters.
xvals = skl.cluster.AgglomerativeClustering(len(np.unique(data.y))).fit_predict(data.X)
# This direct comparison understates the agreement, because the cluster indices
# are arbitrary relative to the labels (see the adjusted Rand index below).
np.sum(xvals == data.y) / float(len(xvals))
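
In [ ]:
# The adjusted Rand index compares two labelings while ignoring how the cluster
# indices are named, so it is a better fit here than element-wise accuracy.
from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(data.y, xvals)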

In [13]:
# Project the data and the fitted GMM means into two PCA dimensions.
pca = skl.decomposition.PCA(2)
xnew = pca.fit_transform(data.X)
xn, yn = zip(*xnew)
plt.scatter(xn, yn, color='darkblue', alpha=0.03)
plt.scatter(*zip(*pca.transform(gmm.means_)), color='darkturquoise', alpha=0.5)
plt.xlim([-1,5])
plt.ylim([-3,3])


Out[13]:
(-3, 3)

In [13]:
from sklearn import mixture
# Standardize each feature before fitting the mixture model.
data.transform(lambda x: skl.preprocessing.scale(x, axis=0), lambda y: y)
data.train(skl.mixture.GMM(56))


Out[13]:
0.017156862745098041

In [29]:
def run_gauss_pca(n=2, data=data):
    # Fit the GMM on the first n principal components.
    lpca = lambda x: skl.decomposition.PCA(n).fit_transform(x)
    clf = skl.mixture.GMM(56)
    return data.train(clf, lpca)

pca_xs = range(1,31)
pca_gauss_acc = [run_gauss_pca(i) for i in pca_xs]
ppl.plot(pca_xs, pca_gauss_acc)


Out[29]:
[<matplotlib.lines.Line2D at 0x106e74fd0>]

In [36]:
# Update only the means and covariances during EM (params='mc'), keeping the weights fixed.
data.train(skl.mixture.GMM(n_components=30, params='mc'))


Out[36]:
0.019117647058823531

In [20]:
# We now move on to the GREYC dataset.
def getdata(row):
    # The first 60 timing values live in column 5 as a space-separated string.
    x = [int(r) for r in row[5].strip().split(" ")[:60]]
    if len(x) == 0:
        raise ValueError("Row has no timing data")
    return x


greyc = SKLData("./data/keystroke1.csv", [getdata, lambda r: int(r[7])])

In [12]:
greyc.train(skl.neighbors.KNeighborsClassifier(n_neighbors=20, weights='uniform', p=1))


Out[12]:
0.78835978835978837

In [13]:
greyc.plot(1)
greyc.plot(2)
greyc.plot(3)
greyc.plot(4)


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x152b0ab0>

In [27]:
# How many samples do we have per user?
ppl.hist(np.bincount(greyc.y))


Out[27]:
<matplotlib.axes.AxesSubplot at 0x10cc44050>
