In [1]:
%pylab
%matplotlib inline
In [2]:
cd ..
In [3]:
import sys
import numpy as np
import skimage
import cv2
import sklearn
In [5]:
import imp
In [6]:
import holoviews
In [7]:
%load_ext holoviews.ipython
In [8]:
import neukrill_net.utils
import neukrill_net.highlevelfeatures
In [9]:
import time
In [10]:
import mahotas
In [11]:
settings = neukrill_net.utils.Settings('settings.json')
In [12]:
X,y = settings.flattened_train_paths(settings.classes)
In [13]:
H = mahotas.features.haralick(neukrill_net.highlevelfeatures.loadimage(X[0]))
In [14]:
H
Out[14]:
In [15]:
H.shape
Out[15]:
In [16]:
(np.amax(H, 0) - np.amin(H, 0)).shape
Out[16]:
In [17]:
hlf = neukrill_net.highlevelfeatures.Haralick()
In [18]:
hlf.extract_image(neukrill_net.highlevelfeatures.loadimage(X[0]))
Out[18]:
In [23]:
# Load the cached object stored under cache/haralick.pkl.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute arbitrary code;
# only load cache files that were produced locally by this project.
tmp = sklearn.externals.joblib.load('cache/'+'haralick.pkl')
# presumably element 0 is the Haralick feature extractor used to build the cache -- TODO confirm against the writer
hlf = tmp[0]
# presumably element 1 is the feature array; later cells call XF.squeeze(0) and split it alongside y
XF = tmp[1]
In [24]:
XF.shape
Out[24]:
In [25]:
# SGD-trained linear classifier with logistic loss and an elastic-net penalty.
# The seed is fixed so that re-running the notebook reproduces the same scores.
clf = sklearn.linear_model.SGDClassifier(loss='log', penalty='elasticnet', random_state=42)
In [26]:
# Hold out half of the samples, fit the current classifier and time split + fit together.
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    XF.squeeze(0),
    y,
    test_size=0.5,
    random_state=42,
)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1 - t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
# The log loss is computed on an equal-weight blend of the class priors and the model probabilities.
blended_proba = 0.5*settings.class_priors + 0.5*clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, blended_proba)))
In [ ]:
settings.class_priors.shape
In [ ]:
clf.predict_proba(X_test).shape
Maybe we can do better if we run for longer
In [27]:
# SGD logistic classifier with n_iter=100.
# random_state is fixed so that score differences between n_iter settings are not just
# run-to-run noise from the stochastic optimiser.
clf = sklearn.linear_model.SGDClassifier(loss='log', penalty='elasticnet', n_iter=100, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(XF.squeeze(0), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
# Log loss on an equal-weight blend of class priors and predicted probabilities.
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, 0.5*settings.class_priors + 0.5*clf.predict_proba(X_test))))
That was worse!
How about even longer?
In [28]:
# SGD logistic classifier with n_iter=1000.
# random_state is fixed so that score differences between n_iter settings are not just
# run-to-run noise from the stochastic optimiser.
clf = sklearn.linear_model.SGDClassifier(loss='log', penalty='elasticnet', n_iter=1000, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(XF.squeeze(0), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
# Log loss on an equal-weight blend of class priors and predicted probabilities.
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, 0.5*settings.class_priors + 0.5*clf.predict_proba(X_test))))
In [29]:
# SGD logistic classifier with shuffling enabled, 12 parallel jobs and n_iter=1000.
# Shuffling makes the fit depend on the random generator, so the seed is fixed for repeatability.
clf = sklearn.linear_model.SGDClassifier(loss='log', penalty='elasticnet', shuffle=True, n_jobs=12, n_iter=1000, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(XF.squeeze(0), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
# Log loss on an equal-weight blend of class priors and predicted probabilities.
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, 0.5*settings.class_priors + 0.5*clf.predict_proba(X_test))))
In [31]:
# SGD logistic classifier (shuffled, n_iter=1000) on standardised features.
# Shuffling makes the fit depend on the random generator, so the seed is fixed for repeatability.
clf = sklearn.linear_model.SGDClassifier(loss='log', penalty='elasticnet', shuffle=True, n_jobs=12, n_iter=1000, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(XF.squeeze(0), y, test_size=0.5, random_state=42)
# Fit the scaler on the training half only and reuse its statistics for the test half.
sclr = sklearn.preprocessing.StandardScaler()
X_train = sclr.fit_transform(X_train)
X_test = sclr.transform(X_test)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
# Log loss on an equal-weight blend of class priors and predicted probabilities.
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, 0.5*settings.class_priors + 0.5*clf.predict_proba(X_test))))
In [30]:
# SGD logistic classifier with shuffling enabled, 12 parallel jobs and n_iter=10000.
# Shuffling makes the fit depend on the random generator, so the seed is fixed for repeatability.
clf = sklearn.linear_model.SGDClassifier(loss='log', penalty='elasticnet', shuffle=True, n_jobs=12, n_iter=10000, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(XF.squeeze(0), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
# Log loss on an equal-weight blend of class priors and predicted probabilities.
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, 0.5*settings.class_priors + 0.5*clf.predict_proba(X_test))))
In [32]:
# SGD logistic classifier (shuffled, n_iter=10000) on standardised features.
# Shuffling makes the fit depend on the random generator, so the seed is fixed for repeatability.
clf = sklearn.linear_model.SGDClassifier(loss='log', penalty='elasticnet', shuffle=True, n_jobs=12, n_iter=10000, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(XF.squeeze(0), y, test_size=0.5, random_state=42)
# Fit the scaler on the training half only and reuse its statistics for the test half.
sclr = sklearn.preprocessing.StandardScaler()
X_train = sclr.fit_transform(X_train)
X_test = sclr.transform(X_test)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
# Report the log loss both with and without blending in the class priors.
print("Logloss with prior mixin={}".format(sklearn.metrics.log_loss(y_test, 0.5*settings.class_priors + 0.5*clf.predict_proba(X_test))))
print("Logloss without prior mixin={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
In [35]:
print("Logloss without prior mixin={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
In [38]:
# Show the class names stored at indices 48, 92, 94 and 95.
# print() is called as a function (valid on Python 2 and 3), matching the other cells.
for class_index in (48, 92, 94, 95):
    print(settings.classes[class_index])
In [26]:
import sklearn.ensemble
In [27]:
# Random forest of 1000 trees, depth-limited to 20 with at least 5 samples per leaf.
# The seed is fixed so that re-running the notebook reproduces the same forest and scores.
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, random_state=42)
In [28]:
# Time a 50/50 hold-out split plus fit of the current classifier.
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    XF.squeeze(0), y,
    test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1 - t0
print("Time={}".format(total))
# Hold-out metrics: the classifier's own score and the log loss of its raw class probabilities.
test_accuracy = clf.score(X_test, y_test)
print("Accuracy={}".format(test_accuracy))
test_logloss = sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))
print("Logloss={}".format(test_logloss))
In [29]:
np.unique(y_test)
Out[29]:
In [30]:
np.unique(y_test)
Out[30]:
In [31]:
CM = sklearn.metrics.confusion_matrix(y_test, clf.predict(X_test), labels=np.unique(y))
In [32]:
plt.matshow(CM)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [33]:
holoviews.Image(CM)
Out[33]:
In [34]:
CM.sum(0)
Out[34]:
In [35]:
CM.sum(1)
Out[35]:
In [36]:
# Row-normalise the confusion matrix: divide every row (true label) by its own total.
# keepdims=True keeps the row sums as a column vector; dividing by the 1-D CM.sum(1)
# would broadcast along the last axis and divide column j by the total of row j instead.
CM1 = CM.astype(float) / CM.sum(axis=1, keepdims=True)
In [37]:
plt.matshow(CM1)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [38]:
def confusion_matrix_from_proba(y_true, y_pred, labels=None):
    """Build a "soft" confusion matrix from predicted class probabilities.

    Row ``i`` of the result is the mean of the probability vectors predicted
    for the samples whose true label equals ``labels[i]``.

    Parameters
    ----------
    y_true : array-like, shape (n_samples,)
        True label of every sample.
    y_pred : array-like, shape (n_samples, n_classes)
        Predicted probabilities, one row per sample. The number of columns
        must equal ``len(labels)``.
    labels : array-like, optional
        Label value associated with each row of the result. Defaults to the
        sorted union of the values in ``y_true`` and
        ``0 .. y_pred.shape[1] - 1``.

    Returns
    -------
    M : ndarray, shape (len(labels), len(labels))
        Mean predicted probabilities per true label. Rows for labels that do
        not occur in ``y_true`` are left as zeros.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if labels is None:
        labels = np.union1d(y_true, np.arange(y_pred.shape[1]))
    n_classes = len(labels)
    M = np.zeros((n_classes, n_classes))
    for i, label in enumerate(labels):
        # Select samples by label *value*; comparing against the row index is
        # only correct when the labels happen to be exactly 0..n_classes-1.
        li = (y_true == label)
        # Skip labels with no samples: the mean of an empty selection is NaN.
        if li.any():
            M[i, :] = np.mean(y_pred[li, :], 0)
    return M
In [39]:
CM2 = confusion_matrix_from_proba(y_test, clf.predict_proba(X_test), labels=np.unique(y))
In [40]:
CM2.shape
Out[40]:
In [41]:
plt.matshow(CM2)
plt.title('Confusion matrix v2')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [42]:
import scipy.cluster.hierarchy
In [43]:
Y = scipy.cluster.hierarchy.distance.pdist(CM, metric='euclidean')
Z = scipy.cluster.hierarchy.linkage(Y, method='single')
ax = scipy.cluster.hierarchy.dendrogram(Z, show_contracted=True)
In [44]:
Y = scipy.cluster.hierarchy.distance.pdist(CM1, metric='euclidean')
Z = scipy.cluster.hierarchy.linkage(Y, method='single')
ax = scipy.cluster.hierarchy.dendrogram(Z, show_contracted=True)
In [45]:
Y = scipy.cluster.hierarchy.distance.pdist(CM2, metric='euclidean')
Z = scipy.cluster.hierarchy.linkage(Y, method='single')
ax = scipy.cluster.hierarchy.dendrogram(Z, show_contracted=True)
In [ ]:
In [ ]:
clf = sklearn.linear_model.LogisticRegression(random_state=42)
In [ ]:
# Time a 50/50 hold-out split plus fit of the current classifier on the raw (unscaled) features.
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    XF.squeeze(0),
    y,
    test_size=0.5,
    random_state=42,
)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1 - t0
print("Time={}".format(total))
# Hold-out metrics: the classifier's own score and the log loss of its raw class probabilities.
holdout_accuracy = clf.score(X_test, y_test)
print("Accuracy={}".format(holdout_accuracy))
holdout_logloss = sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))
print("Logloss={}".format(holdout_logloss))
Forgot to scale, and that is important for Logistic Regression!
In [ ]:
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(XF.squeeze(0), y, test_size=0.5, random_state=42)
# Standardise AFTER splitting: the scaler is fit on the training half only, so the
# test half's mean/variance cannot leak into the features the model is trained on.
sclr = sklearn.preprocessing.StandardScaler()
X_train = sclr.fit_transform(X_train)
X_test = sclr.transform(X_test)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
Now we're about as good as the Random Forest!
In [ ]:
import sklearn.naive_bayes
In [ ]:
clf = sklearn.naive_bayes.GaussianNB()
In [ ]:
# Time a 50/50 hold-out split plus fit of the current classifier on the raw (unscaled) features.
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    XF.squeeze(0), y, test_size=0.5, random_state=42
)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1 - t0
print("Time={}".format(total))
# Hold-out metrics: the classifier's own score and the log loss of its raw class probabilities.
nb_accuracy = clf.score(X_test, y_test)
print("Accuracy={}".format(nb_accuracy))
nb_logloss = sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))
print("Logloss={}".format(nb_logloss))
Well that sucked. Maybe better with a Z-score?
In [ ]:
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(XF.squeeze(0), y, test_size=0.5, random_state=42)
# Standardise AFTER splitting: the scaler is fit on the training half only, so the
# test half's mean/variance cannot leak into the features the model is trained on.
sclr = sklearn.preprocessing.StandardScaler()
X_train = sclr.fit_transform(X_train)
X_test = sclr.transform(X_test)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
Nope. Naive Bayes just plain sucks.
In [ ]:
clf = sklearn.svm.LinearSVC()
In [ ]:
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(XF.squeeze(0), y, test_size=0.5, random_state=42)
# Standardise AFTER splitting: the scaler is fit on the training half only, so the
# test half's mean/variance cannot leak into the features the model is trained on.
sclr = sklearn.preprocessing.StandardScaler()
X_train = sclr.fit_transform(X_train)
X_test = sclr.transform(X_test)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
# LinearSVC has no predict_proba, so only report a log loss when the current
# classifier can produce probabilities instead of ending the cell with an exception.
if hasattr(clf, 'predict_proba'):
    print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
else:
    print("Logloss=n/a (classifier has no predict_proba)")
`LinearSVC` has no `predict_proba` option, so no log loss can be computed for it.
Let's do proper CV
In [ ]:
# Stratified shuffle-split over the labels; seeded so every run evaluates the same folds.
cv = sklearn.cross_validation.StratifiedShuffleSplit(y, random_state=42)
In [ ]:
clf = sklearn.linear_model.LogisticRegression(random_state=42)
In [ ]:
print('Cross-validating')
# Loop-invariant conversions are done once instead of several times per fold.
features = XF.squeeze(0)
targets = np.array(y)
results = []
for train, test in cv:
    # Fit the scaler on the training fold only, then apply the same transform to the test fold.
    sc = sklearn.preprocessing.StandardScaler()
    sc.fit(features[train, :])
    clf.fit(sc.transform(features[train, :]), targets[train])
    res = sklearn.metrics.log_loss(targets[test], clf.predict_proba(sc.transform(features[test, :])))
    print(res)
    results.append(res)
# Summarise the per-fold log losses collected above.
print("Mean logloss={} (std={})".format(np.mean(results), np.std(results)))
In [ ]:
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(XF.squeeze(0), y, test_size=0.5, random_state=42)
# Standardise the features, as the cross-validation loop does; the scaler is fit on
# the training half only so the test half does not influence the transform.
sclr = sklearn.preprocessing.StandardScaler()
X_train = sclr.fit_transform(X_train)
X_test = sclr.transform(X_test)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
In [ ]:
# NOTE(review): `estimator` and `param_grid` are not defined in any visible cell, so unless
# they are defined elsewhere this call raises NameError. The constructed GridSearchCV is
# also not assigned to a name and is never fitted here.
sklearn.grid_search.GridSearchCV(estimator, param_grid, scoring=None)