In [1]:
    
%pylab
%matplotlib inline
    
    
In [2]:
    
cd ..
    
    
In [3]:
    
import sys
import numpy as np
import skimage
import cv2
import sklearn
    
In [4]:
    
import imp
    
In [5]:
    
import holoviews
    
    
In [6]:
    
%load_ext holoviews.ipython
    
    
    
    
    
In [7]:
    
import neukrill_net.utils
import neukrill_net.highlevelfeatures
    
In [8]:
    
import time
    
In [9]:
    
import mahotas
    
In [10]:
    
# Load the project settings; the relative path is resolved against the working
# directory, which the earlier `cd ..` cell moved one level up.
settings = neukrill_net.utils.Settings('settings.json')
    
In [11]:
    
# X and y are parallel sequences over the training set; presumably X holds image
# paths (X[0] is later handed to loadimage) and y the class labels — TODO confirm.
X,y = settings.flattened_train_paths(settings.classes)
    
In [12]:
    
# Sanity check: raw mahotas Haralick texture features for the first entry of X.
H = mahotas.features.haralick(neukrill_net.highlevelfeatures.loadimage(X[0]))
    
In [13]:
    
H
    
    Out[13]:
In [14]:
    
H.shape
    
    Out[14]:
In [15]:
    
# Shape of the per-column range (max minus min taken along axis 0) of H.
(np.amax(H, 0) - np.amin(H, 0)).shape
    
    Out[15]:
In [16]:
    
# Project-level Haralick feature extractor, built with its default arguments.
hlf = neukrill_net.highlevelfeatures.Haralick()
    
In [17]:
    
# Run the extractor on a single loaded image to inspect what it returns.
hlf.extract_image(neukrill_net.highlevelfeatures.loadimage(X[0]))
    
    Out[17]:
In [18]:
    
# Extract features for every entry of X and report the wall-clock cost.
t0 = time.time()
XF = hlf.transform(X)
elapsed = time.time() - t0
print("Computing features took {}".format(elapsed))
    
    
In [19]:
    
XF.shape
    
    Out[19]:
In [23]:
    
# Persist the extractor object, the computed features and the labels as one tuple.
# NOTE(review): this cell's execution count (23) is higher than that of the cells
# after it, so it was run out of order; it also assumes a `cache/` directory
# exists under the working directory — confirm.
sklearn.externals.joblib.dump((hlf,XF,y),'cache/haralick.pkl')
    
    Out[23]:
In [20]:
    
# Linear model with logistic loss and an elastic-net penalty, trained by SGD.
# SGD shuffles the training data, so the seed is pinned: without it, score
# differences between runs cannot be told apart from shuffling noise.
clf = sklearn.linear_model.SGDClassifier(loss='log', penalty='elasticnet', random_state=42)
    
In [21]:
    
# Fit the current classifier on a seeded 50/50 split and report fit time,
# accuracy and log loss. The log loss is computed on predictions blended
# half-and-half with the class priors.
t0 = time.time()
features = XF.squeeze(0)  # drop the leading length-1 axis of XF
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    features, y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1 - t0
print("Time={}".format(total))
accuracy = clf.score(X_test, y_test)
print("Accuracy={}".format(accuracy))
blended = 0.5*settings.class_priors + 0.5*clf.predict_proba(X_test)
logloss = sklearn.metrics.log_loss(y_test, blended)
print("Logloss={}".format(logloss))
    
    
    
    
In [ ]:
    
settings.class_priors.shape
    
In [ ]:
    
clf.predict_proba(X_test).shape
    
Maybe we can do better if we run for longer
In [ ]:
    
# Same SGD model, trained for 100 passes over the data. The seed is pinned so the
# comparison with other iteration counts reflects the setting, not SGD's
# random shuffling.
clf = sklearn.linear_model.SGDClassifier(loss='log', penalty='elasticnet', n_iter=100,
                                         random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(XF.squeeze(0), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
# Log loss is computed on predictions blended half-and-half with the class priors.
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, 0.5*settings.class_priors + 0.5*clf.predict_proba(X_test))))
    
That was worse!
How about even longer?
In [ ]:
    
# Same SGD model, trained for 1000 passes over the data. The seed is pinned so the
# comparison with other iteration counts reflects the setting, not SGD's
# random shuffling.
clf = sklearn.linear_model.SGDClassifier(loss='log', penalty='elasticnet', n_iter=1000,
                                         random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(XF.squeeze(0), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
# Log loss is computed on predictions blended half-and-half with the class priors.
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, 0.5*settings.class_priors + 0.5*clf.predict_proba(X_test))))
    
Okay, forget trying to do gradient descent.
In [26]:
    
import sklearn.ensemble
    
In [27]:
    
# 1000-tree random forest with capped depth and a minimum leaf size.
# The forest draws bootstrap samples and feature subsets at random, so the seed
# is pinned to make the reported scores reproducible.
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5,
                                              random_state=42)
    
In [28]:
    
# Fit the current classifier on a seeded 50/50 split and report fit time,
# accuracy and log loss on the held-out half.
t0 = time.time()
features = XF.squeeze(0)  # drop the leading length-1 axis of XF
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    features, y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1 - t0
print("Time={}".format(total))
accuracy = clf.score(X_test, y_test)
print("Accuracy={}".format(accuracy))
logloss = sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))
print("Logloss={}".format(logloss))
    
    
In [29]:
    
np.unique(y_test)
    
    Out[29]:
In [30]:
    
np.unique(y_test)
    
    Out[30]:
In [31]:
    
# Count-based confusion matrix on the held-out half. Passing labels=np.unique(y)
# fixes the row/column order to the sorted labels of the full label set, so
# classes missing from y_test still get a (zero) row.
CM = sklearn.metrics.confusion_matrix(y_test, clf.predict(X_test), labels=np.unique(y))
    
In [32]:
    
# Heat-map of the raw confusion counts (rows: true label, columns: predicted).
# `plt` is not imported explicitly in the cells shown; presumably it comes from
# the `%pylab` magic in the first cell — confirm.
plt.matshow(CM)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
    
    
In [33]:
    
holoviews.Image(CM)
    
    Out[33]:
In [34]:
    
CM.sum(0)
    
    Out[34]:
In [35]:
    
CM.sum(1)
    
    Out[35]:
In [36]:
    
# Row-normalise the confusion matrix so each row (true class) sums to 1.
# CM.sum(1) has shape (n,), and dividing an (n, n) array by it broadcasts along
# columns, i.e. divides entry [i, j] by the total of row j. The totals are
# reshaped to a column vector so entry [i, j] is divided by the total of row i.
# Rows with no samples have a total of 0; the divisor is clamped to 1 there so
# those rows stay all-zero instead of becoming NaN.
CM1 = CM.astype(float) / np.maximum(CM.sum(1)[:, np.newaxis], 1)
    
In [37]:
    
# Heat-map of the normalised confusion matrix CM1, with the same axis
# conventions as the raw-count plot.
plt.matshow(CM1)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
    
    
In [38]:
    
def confusion_matrix_from_proba(y_true, y_pred, labels=None):
    """Build a "soft" confusion matrix from predicted class probabilities.

    Row ``i`` of the result is the mean of the rows of ``y_pred`` belonging to
    samples whose true label equals ``labels[i]``.

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        True label of each sample.
    y_pred : array-like, shape (n_samples, n_columns)
        Predicted probabilities, one column per predicted class. The column
        order is not checked here; it is assumed to follow ``labels``.
    labels : array-like, optional
        Label value for each row of the result. Defaults to the sorted union
        of the values in ``y_true`` and ``0 .. n_columns - 1``.

    Returns
    -------
    M : ndarray, shape (len(labels), n_columns)
        Mean predicted probabilities per true label. A label with no samples
        in ``y_true`` yields an all-zero row.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if labels is None:
        labels = np.union1d(y_true, np.arange(y_pred.shape[1]))
    labels = np.asarray(labels)
    # One row per label, one column per probability column of y_pred.
    M = np.zeros((len(labels), y_pred.shape[1]))
    for i, label in enumerate(labels):
        # Match on the label value, not on its position in `labels`.
        li = (y_true == label)
        # Skip absent classes: the mean of an empty selection would be NaN.
        if li.any():
            M[i, :] = y_pred[li, :].mean(axis=0)
    return M
    
In [39]:
    
# Probability-based confusion matrix: mean predicted probability vector per
# true class, using the same label ordering (np.unique(y)) as CM.
CM2 = confusion_matrix_from_proba(y_test, clf.predict_proba(X_test), labels=np.unique(y))
    
In [40]:
    
CM2.shape
    
    Out[40]:
In [41]:
    
# Heat-map of the probability-based confusion matrix CM2.
plt.matshow(CM2)
plt.title('Confusion matrix v2')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
    
    
In [42]:
    
import scipy.cluster.hierarchy
    
In [43]:
    
# Single-linkage hierarchical clustering of the classes, using the Euclidean
# distance between rows of the raw-count confusion matrix CM.
# Y is the condensed pairwise distance vector, Z the linkage matrix.
# Note: dendrogram() returns a dict describing the plot, not a matplotlib Axes,
# despite the name `ax`.
Y = scipy.cluster.hierarchy.distance.pdist(CM, metric='euclidean')
Z = scipy.cluster.hierarchy.linkage(Y, method='single')
ax = scipy.cluster.hierarchy.dendrogram(Z, show_contracted=True)
    
    
In [44]:
    
# Same single-linkage clustering, applied to the rows of the normalised
# confusion matrix CM1. Overwrites Y, Z and ax from the previous cell.
Y = scipy.cluster.hierarchy.distance.pdist(CM1, metric='euclidean')
Z = scipy.cluster.hierarchy.linkage(Y, method='single')
ax = scipy.cluster.hierarchy.dendrogram(Z, show_contracted=True)
    
    
In [45]:
    
# Same single-linkage clustering, applied to the rows of the probability-based
# confusion matrix CM2. Overwrites Y, Z and ax from the previous cell.
Y = scipy.cluster.hierarchy.distance.pdist(CM2, metric='euclidean')
Z = scipy.cluster.hierarchy.linkage(Y, method='single')
ax = scipy.cluster.hierarchy.dendrogram(Z, show_contracted=True)
    
    
In [ ]:
    
    
In [ ]:
    
# Logistic regression with library-default hyperparameters and a fixed seed.
clf = sklearn.linear_model.LogisticRegression(random_state=42)
    
In [ ]:
    
# Fit the current classifier on a seeded 50/50 split of the unscaled features
# and report fit time, accuracy and log loss on the held-out half.
t0 = time.time()
features = XF.squeeze(0)  # drop the leading length-1 axis of XF
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    features, y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1 - t0
print("Time={}".format(total))
accuracy = clf.score(X_test, y_test)
print("Accuracy={}".format(accuracy))
logloss = sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))
print("Logloss={}".format(logloss))
    
Forgot to scale, and that is important for Logistic Regression!
In [ ]:
    
# Same evaluation as before, but on standardised features.
# The split is done first and the scaler is fitted on the training half only;
# fitting it on all samples before splitting would leak the held-out half's
# mean/variance into training and bias the scores.
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    XF.squeeze(0), y, test_size=0.5, random_state=42)
scaler = sklearn.preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)  # reuse the training statistics
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
    
Now we're about as good as the Random Forest!
In [ ]:
    
import sklearn.naive_bayes
    
In [ ]:
    
# Gaussian naive Bayes with default settings.
clf = sklearn.naive_bayes.GaussianNB()
    
In [ ]:
    
# Fit the current classifier on a seeded 50/50 split of the unscaled features
# and report fit time, accuracy and log loss on the held-out half.
t0 = time.time()
features = XF.squeeze(0)  # drop the leading length-1 axis of XF
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    features, y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1 - t0
print("Time={}".format(total))
accuracy = clf.score(X_test, y_test)
print("Accuracy={}".format(accuracy))
logloss = sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))
print("Logloss={}".format(logloss))
    
Well that sucked. Maybe better with a Z-score?
In [ ]:
    
# Evaluate the current classifier on standardised (z-scored) features.
# The split is done first and the scaler is fitted on the training half only;
# fitting it on all samples before splitting would leak the held-out half's
# mean/variance into training and bias the scores.
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    XF.squeeze(0), y, test_size=0.5, random_state=42)
scaler = sklearn.preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)  # reuse the training statistics
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
    
Nope. Naive Bayes just plain sucks.
In [ ]:
    
# Linear support-vector classifier with default hyperparameters.
# Seeded for reproducibility, matching the seeded LogisticRegression used
# elsewhere in this notebook.
clf = sklearn.svm.LinearSVC(random_state=42)
    
In [ ]:
    
# Evaluate the current classifier on standardised features.
# The split is done first and the scaler is fitted on the training half only;
# fitting it on all samples before splitting would leak the held-out half's
# mean/variance into training and bias the scores.
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    XF.squeeze(0), y, test_size=0.5, random_state=42)
scaler = sklearn.preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)  # reuse the training statistics
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
# Not every estimator exposes class probabilities (LinearSVC does not), so only
# compute the log loss when predict_proba is available instead of raising.
if hasattr(clf, "predict_proba"):
    print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
else:
    print("Logloss=n/a ({} has no predict_proba)".format(type(clf).__name__))
    
LinearSVC has no `predict_proba` method, so the log loss cannot be computed for it.
Let's do proper CV
In [ ]:
    
# Stratified shuffle-split cross-validation over the labels, with the library's
# default number of splits and test fraction. The seed is pinned so the same
# folds are produced on every run and results stay comparable.
cv = sklearn.cross_validation.StratifiedShuffleSplit(y, random_state=42)
    
In [ ]:
    
# Fresh logistic regression (default hyperparameters, fixed seed) for the
# cross-validation loop.
clf = sklearn.linear_model.LogisticRegression(random_state=42)
    
In [ ]:
    
# Cross-validated log loss of `clf` with per-fold standardisation.
print('Cross-validating')
# Loop-invariant conversions are done once instead of on every fold.
X_all = XF.squeeze(0)   # drop the leading length-1 axis of XF
y_all = np.array(y)     # array form so the labels can be fancy-indexed
results = []
for train, test in cv:
    # Fit the scaler on the training fold only, then apply it to both folds.
    sc = sklearn.preprocessing.StandardScaler()
    clf.fit(sc.fit_transform(X_all[train, :]), y_all[train])
    res = sklearn.metrics.log_loss(y_all[test], clf.predict_proba(sc.transform(X_all[test, :])))
    print(res)
    results.append(res)
# Summarise the folds so the outcome is a single comparable number.
print("Mean logloss={} (std={})".format(np.mean(results), np.std(results)))
    
In [ ]:
    
# Fit the current classifier on a seeded 50/50 split of the unscaled features
# and report fit time, accuracy and log loss on the held-out half.
t0 = time.time()
features = XF.squeeze(0)  # drop the leading length-1 axis of XF
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    features, y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1 - t0
print("Time={}".format(total))
accuracy = clf.score(X_test, y_test)
print("Accuracy={}".format(accuracy))
logloss = sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))
print("Logloss={}".format(logloss))
    
In [ ]:
    
# NOTE(review): `estimator` and `param_grid` are not defined in any cell visible
# here, so this looks like a pasted call signature rather than a runnable cell
# and would raise NameError as written — supply a real estimator and parameter
# grid, or confirm they are defined elsewhere.
sklearn.grid_search.GridSearchCV(estimator, param_grid, scoring=None)