In [1]:
    
%pylab
%matplotlib inline
    
    
In [2]:
    
cd ..
    
    
In [3]:
    
import sys
import numpy as np
import skimage
import cv2
import sklearn
import sklearn.cross_validation  # these submodules are used later and need explicit imports
import sklearn.feature_selection
import sklearn.linear_model
import sklearn.metrics
import sklearn.preprocessing
import sklearn.svm
import imp
    
In [4]:
    
import holoviews
    
    
In [5]:
    
import neukrill_net.utils
import neukrill_net.image_features
import neukrill_net.highlevelfeatures
import neukrill_net.stacked
    
In [6]:
    
import skimage.feature
    
In [7]:
    
import sklearn.ensemble
    
In [8]:
    
import time
    
In [9]:
    
#%pdb
    
In [10]:
    
settings = neukrill_net.utils.Settings('settings.json')
    
In [11]:
    
X,y = settings.flattened_train_paths(settings.classes)
    
In [12]:
    
reload(neukrill_net.highlevelfeatures)
    
    Out[12]:
In [13]:
    
reload(neukrill_net.image_features)
    
    Out[13]:
In [14]:
    
attrlst = ['height','width','numpixels','aspectratio','mean','std','stderr',
           'numwhite','propwhite','numnonwhite','propnonwhite','numblack','propblack','numbool','propbool']
hlf  = neukrill_net.highlevelfeatures.BasicAttributes(attrlst)
hlf += neukrill_net.highlevelfeatures.Haralick()
hlf += neukrill_net.highlevelfeatures.ThresholdAdjacency()
hlf += neukrill_net.highlevelfeatures.ContourMoments()
hlf += neukrill_net.highlevelfeatures.ContourHistogram()
hlf += neukrill_net.highlevelfeatures.CoocurProps()
    
In [15]:
    
hlf.preprocess_and_extract_image(neukrill_net.highlevelfeatures.loadimage(X[0]))
    
    Out[15]:
In [16]:
    
kprf_base = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=25,
                                                    min_samples_leaf=20, n_jobs=12, random_state=42)
    
In [17]:
    
max_num_kp = 150
detector_list = [lambda image: neukrill_net.image_features.get_ORB_keypoints(image, n=max_num_kp, patchSize=9),
                 lambda image: neukrill_net.image_features.get_BRISK_keypoints(image, n=max_num_kp),
                 lambda image: neukrill_net.image_features.get_MSER_keypoints(image, n=max_num_kp)]
describer_list = [neukrill_net.image_features.get_ORB_descriptions,
                  neukrill_net.image_features.get_BRISK_descriptions,
                  neukrill_net.image_features.get_ORB_descriptions]
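
The detector and describer wrappers above come from the project's image_features module; note that the MSER keypoints are paired with ORB descriptions, since MSER is a detector only. As a rough sketch of what the wrappers presumably do internally, here is raw keypoint detection and description with the OpenCV 2.4-era cv2.ORB interface (an assumption about the API version, mirroring the patchSize=9 argument above):
In [ ]:
    
# Rough sketch of raw OpenCV ORB detection and description (assuming the
# OpenCV 2.4 cv2.ORB API); the project wrappers presumably do something similar.
img = cv2.imread(X[0], 0)                 # load the first training image as grayscale
orb = cv2.ORB(nfeatures=max_num_kp, patchSize=9)
kp = orb.detect(img, None)                # keypoints only
kp, des = orb.compute(img, kp)            # one 32-byte binary descriptor per keypoint
print("{} keypoints, descriptors: {}".format(len(kp), None if des is None else des.shape))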
    
In [18]:
    
for index, detector in enumerate(detector_list):
    hlf += neukrill_net.highlevelfeatures.KeypointEnsembleClassifier(detector, describer_list[index], kprf_base,
                                                                     return_num_kp=True, summary_method='vote')
    
In [19]:
    
rf_clf = sklearn.ensemble.RandomForestClassifier(n_estimators=2500, max_depth=30,
                                                 min_samples_leaf=1, n_jobs=12, random_state=42)
    
In [20]:
    
import sklearn.pipeline
    
In [21]:
    
selector = sklearn.feature_selection.SelectPercentile(sklearn.feature_selection.f_classif, percentile=33)
    
In [22]:
    
stack_pipe = sklearn.pipeline.Pipeline([('filter', selector), ('clf', rf_clf)])
stacked_clf = neukrill_net.stacked.StackedClassifier(hlf, stack_pipe, inner_prop=0.25, random_state=42)
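
StackedClassifier is project code, but the idea behind inner_prop is standard stacking: a fraction of the training data is held out to fit the trainable feature extractors (here the keypoint ensemble classifiers), and the downstream pipeline is then fitted on features extracted from the remainder, so it never sees features produced from the extractors' own training data. A hypothetical sketch of that split (the fit/transform calls are assumed interfaces, not the project's actual API):
In [ ]:
    
# Hypothetical sketch of the inner/outer split behind stacking.
# inner_prop=0.25: a quarter of the training data fits the feature-level models.
X_inner, X_outer, y_inner, y_outer = sklearn.cross_validation.train_test_split(
    X, y, train_size=0.25, random_state=42)
# hlf.fit(X_inner, y_inner)             # assumed interface
# features = hlf.transform(X_outer)     # assumed interface
# stack_pipe.fit(features, y_outer)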
    
In [23]:
    
import neukrill_net.taxonomy
    
In [24]:
    
neukrill_net.taxonomy.taxonomy
    
    Out[24]:
In [25]:
    
marked_taxonomy = neukrill_net.stacked.propagate_labels_to_leaves(neukrill_net.taxonomy.taxonomy, settings.classes)
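
propagate_labels_to_leaves is project code; the point is that a hierarchy classifier needs to know which of the competition classes sit beneath each node of the taxonomy. A hypothetical illustration of that kind of traversal over a nested-dict taxonomy (not the project's actual implementation):
In [ ]:
    
def leaf_labels(tree):
    # Collect the leaf names under a nested-dict taxonomy node
    # (illustrative sketch only; the real helper lives in neukrill_net.stacked).
    labels = []
    for name, subtree in tree.items():
        if subtree:
            labels.extend(leaf_labels(subtree))
        else:
            labels.append(name)
    return labels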
    
In [26]:
    
marked_taxonomy
    
    Out[26]:
In [27]:
    
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X, y, test_size=0.5, random_state=42)
    
In [35]:
    
reload(neukrill_net.stacked)
    
    Out[35]:
In [36]:
    
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, stacked_clf)
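
HierarchyClassifier is also project code; the usual scheme trains one classifier per internal node to choose among that node's children, and a leaf's predicted probability is the product of the branch probabilities along the root-to-leaf path. A minimal sketch of that product rule (hypothetical; the project implementation may differ):
In [ ]:
    
# Minimal sketch of the path-product rule for hierarchical prediction.
def leaf_proba(branch_probs):
    # branch_probs: probability of taking each branch on the root-to-leaf path
    return np.prod(branch_probs)

# e.g. P(branch at root)=0.6, P(this leaf | that subtree)=0.5
print(leaf_proba([0.6, 0.5]))  # -> 0.3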
    
In [37]:
    
t0 = time.time()
hier_clf.fit(X_train, y_train)
print("Time={}".format(time.time()-t0))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
    
    
    
On the original feature set
This performs about the same as using just the Contour Moments and Haralick features.
On the reduced feature set
In [22]:
    
my_X = X_new  # X_new: the reduced feature matrix from an earlier feature-selection step
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
    
    
It does slightly worse with fewer features.
Maybe we kept too few?
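
One way to check would be to sweep k and compare log loss. A sketch (the k values are guesses; note that, like the surrounding cells, this fits the selector on all of the data before splitting, which leaks a little information):
In [ ]:
    
# Sweep the number of selected features and compare log loss (sketch).
for k in [50, 100, 200, 400]:
    sel = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=k)
    my_X = sklearn.preprocessing.StandardScaler().fit_transform(sel.fit_transform(XF.squeeze(0), y))
    X_tr, X_te, y_tr, y_te = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
    clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20,
                                                  min_samples_leaf=5, n_jobs=12, random_state=42)
    clf.fit(X_tr, y_tr)
    print("k={} Logloss={}".format(k, sklearn.metrics.log_loss(y_te, clf.predict_proba(X_te))))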
In [23]:
    
# XF: the full feature matrix extracted earlier; squeeze(0) drops its leading singleton axis
my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
    
    
In [24]:
    
import neukrill_net.taxonomy
    
In [29]:
    
reload(neukrill_net.stacked)
    
    Out[29]:
In [36]:
    
reload(neukrill_net.taxonomy)
    
    Out[36]:
In [37]:
    
neukrill_net.taxonomy.taxonomy
    
    Out[37]:
In [38]:
    
settings.classes
    
    Out[38]:
In [39]:
    
marked_taxonomy = neukrill_net.stacked.propagate_labels_to_leaves(neukrill_net.taxonomy.taxonomy, settings.classes)
    
In [40]:
    
marked_taxonomy
    
    Out[40]:
In [41]:
    
base_clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_clf)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
    
    
In [42]:
    
base_clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_clf)
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
    
    
Try with a pipeline so the number of features is reduced separately at each level of the hierarchy
In [44]:
    
import sklearn.pipeline
    
In [47]:
    
base_clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)
best_filter = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100)
base_pipe = sklearn.pipeline.Pipeline([('filter', best_filter), ('clf', base_clf)])
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_pipe)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
    
    
    
In [48]:
    
clf = sklearn.linear_model.LogisticRegression(random_state=42)
    
In [49]:
    
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
    
    
In [50]:
    
clf = sklearn.linear_model.LogisticRegression(random_state=42)
my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
    
    
In [55]:
    
base_clf = sklearn.linear_model.LogisticRegression(random_state=42)
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_clf)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
    
    
In [52]:
    
base_clf = sklearn.linear_model.LogisticRegression(random_state=42)
best_filter = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100)
base_pipe = sklearn.pipeline.Pipeline([('filter', best_filter), ('clf', base_clf)])
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_pipe)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
    
    
In [56]:
    
clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
    
    
In [57]:
    
clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)
my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
    
    
In [58]:
    
base_clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_clf)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
    
    
In [59]:
    
base_clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)
best_filter = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100)
base_pipe = sklearn.pipeline.Pipeline([('filter', best_filter), ('clf', base_clf)])
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_pipe)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
    
    
One-vs-one: sklearn's SVC handles multiclass internally with a one-vs-one scheme (one binary SVM per pair of classes), unlike LogisticRegression, which defaults to one-vs-rest. The cells below also switch to the default RBF kernel.
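The two schemes can also be compared directly with sklearn's explicit wrappers around a linear SVM (a hypothetical side experiment; LinearSVC has no predict_proba, so only accuracy is shown):
In [ ]:
    
# Explicit multiclass wrappers for comparison (illustrative sketch).
import sklearn.multiclass
for name, wrapper in [('one-vs-one', sklearn.multiclass.OneVsOneClassifier),
                      ('one-vs-rest', sklearn.multiclass.OneVsRestClassifier)]:
    wrapped = wrapper(sklearn.svm.LinearSVC(random_state=42))
    wrapped.fit(X_train, y_train)
    print("{} accuracy={}".format(name, wrapped.score(X_test, y_test)))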
In [60]:
    
clf = sklearn.svm.SVC(probability=True, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
    
    
In [61]:
    
clf = sklearn.svm.SVC(kernel='rbf', probability=True, random_state=42)
my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
    
    
In [62]:
    
base_clf = sklearn.svm.SVC(kernel='rbf', probability=True, random_state=42)
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_clf)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
    
    
In [63]:
    
base_clf = sklearn.svm.SVC(kernel='rbf', probability=True, random_state=42)
best_filter = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100)
base_pipe = sklearn.pipeline.Pipeline([('filter', best_filter), ('clf', base_clf)])
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_pipe)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
    
    
In [ ]: