In [1]:
%pylab
%matplotlib inline


Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib

In [2]:
cd ..


/afs/inf.ed.ac.uk/user/s11/s1145806/Documents/git/neukrill-net-work

In [3]:
import sys
import numpy as np
import skimage
import cv2
import sklearn
import imp

In [4]:
from holoviews import *


:0: FutureWarning: IPython widgets are experimental and may change in the future.

In [5]:
import neukrill_net.utils
import neukrill_net.highlevelfeatures

In [6]:
import time

In [7]:
settings = neukrill_net.utils.Settings('settings.json')

In [8]:
X,y = settings.flattened_train_paths(settings.classes)

In [9]:
hlf = neukrill_net.highlevelfeatures.ThresholdAdjacency()

In [10]:
t0 = time.time()
XF = hlf.transform(X)
print("Computing features took {}".format(time.time()-t0))


Computing features took 85.849462986

In [20]:
XF.shape


Out[20]:
(1, 30336, 54)

In [24]:
sklearn.externals.joblib.dump((hlf,XF,y),'cache/pftas.pkl')


Out[24]:
['cache/pftas.pkl', 'cache/pftas.pkl_01.npy']

Naive Bayes


In [11]:
import sklearn.naive_bayes

In [12]:
clf = sklearn.naive_bayes.GaussianNB()

In [13]:
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=0.0756931304932
Accuracy=0.191455696203
Logloss=14.5961701725

Logistic Regression


In [14]:
clf = sklearn.linear_model.LogisticRegression(random_state=42)

In [15]:
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=97.2654531002
Accuracy=0.32865242616
Logloss=2.76713927682

Random Forest


In [16]:
import sklearn.ensemble

In [17]:
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5)

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=310.334695101
Accuracy=0.404535864979
Logloss=2.40882472837

Linear SVC


In [18]:
clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=106.997659206
Accuracy=0.380472046414
Logloss=2.49029657269

Non-linear SVC

one-vs-one


In [19]:
clf = sklearn.svm.SVC(probability=True, random_state=42)

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=113.672945023
Accuracy=0.382977320675
Logloss=2.36932021533