In [1]:
    
%pylab
%matplotlib inline
    
    
In [2]:
    
cd ..
    
    
In [3]:
    
import sys
import numpy as np
import skimage
import cv2
import sklearn
import sklearn.cross_validation  # these submodules are used later and need explicit imports
import sklearn.feature_selection
import sklearn.linear_model
import sklearn.metrics
import sklearn.preprocessing
import sklearn.svm
import imp
    
In [4]:
    
import holoviews
    
    
In [5]:
    
import neukrill_net.utils
import neukrill_net.image_features
import neukrill_net.highlevelfeatures
import neukrill_net.stacked
    
In [6]:
    
import skimage.feature
    
In [7]:
    
import sklearn.ensemble
    
In [8]:
    
import time
    
In [9]:
    
#%pdb
    
In [10]:
    
settings = neukrill_net.utils.Settings('settings.json')
    
In [11]:
    
X,y = settings.flattened_train_paths(settings.classes)
    
In [12]:
    
reload(neukrill_net.highlevelfeatures)
    
    Out[12]:
In [13]:
    
reload(neukrill_net.image_features)
    
    Out[13]:
In [14]:
    
attrlst = ['height','width','numpixels','aspectratio','mean','std','stderr',
           'numwhite','propwhite','numnonwhite','propnonwhite','numblack','propblack','numbool','propbool']
hlf  = neukrill_net.highlevelfeatures.BasicAttributes(attrlst)
hlf += neukrill_net.highlevelfeatures.Haralick()
hlf += neukrill_net.highlevelfeatures.ThresholdAdjacency()
hlf += neukrill_net.highlevelfeatures.ContourMoments()
hlf += neukrill_net.highlevelfeatures.ContourHistogram()
hlf += neukrill_net.highlevelfeatures.CoocurProps()
    
In [15]:
    
hlf.preprocess_and_extract_image(neukrill_net.highlevelfeatures.loadimage(X[0]))
    
    Out[15]:
In [16]:
    
kprf_base = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=25,
                                                    min_samples_leaf=20, n_jobs=12, random_state=42)
    
In [17]:
    
max_num_kp = 150
detector_list = [lambda image: neukrill_net.image_features.get_ORB_keypoints(image, n=max_num_kp, patchSize=9),
                 lambda image: neukrill_net.image_features.get_BRISK_keypoints(image, n=max_num_kp),
                 lambda image: neukrill_net.image_features.get_MSER_keypoints(image, n=max_num_kp)]
describer_list = [neukrill_net.image_features.get_ORB_descriptions,
                  neukrill_net.image_features.get_BRISK_descriptions,
                  neukrill_net.image_features.get_ORB_descriptions]
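
The detector and describer wrappers above come from the project's image_features module; note that the MSER keypoints are paired with ORB descriptions, since MSER is a detector only. As a rough sketch of what the wrappers presumably do internally, here is raw keypoint detection and description with the OpenCV 2.4-era cv2.ORB interface (an assumption about the API version, mirroring the patchSize=9 argument above):
In [ ]:
    
# Rough sketch of raw OpenCV ORB detection and description (assuming the
# OpenCV 2.4 cv2.ORB API); the project wrappers presumably do something similar.
img = cv2.imread(X[0], 0)                 # load the first training image as grayscale
orb = cv2.ORB(nfeatures=max_num_kp, patchSize=9)
kp = orb.detect(img, None)                # keypoints only
kp, des = orb.compute(img, kp)            # one 32-byte binary descriptor per keypoint
print("{} keypoints, descriptors: {}".format(len(kp), None if des is None else des.shape))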
    
In [18]:
    
for index, detector in enumerate(detector_list):
    hlf += neukrill_net.highlevelfeatures.KeypointEnsembleClassifier(detector, describer_list[index], kprf_base,
                                                                     return_num_kp=True, summary_method='vote')
    
In [19]:
    
rf_clf = sklearn.ensemble.RandomForestClassifier(n_estimators=2500, max_depth=30,
                                                 min_samples_leaf=1, n_jobs=12, random_state=42)
    
In [20]:
    
import sklearn.pipeline
    
In [21]:
    
selector = sklearn.feature_selection.SelectPercentile(sklearn.feature_selection.f_classif, percentile=33)
    
In [22]:
    
stack_pipe = sklearn.pipeline.Pipeline([('filter', selector), ('clf', rf_clf)])
stacked_clf = neukrill_net.stacked.StackedClassifier(hlf, stack_pipe, inner_prop=0.25, random_state=42)
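
StackedClassifier is project code, but the idea behind inner_prop is standard stacking: a fraction of the training data is held out to fit the trainable feature extractors (here the keypoint ensemble classifiers), and the downstream pipeline is then fitted on features extracted from the remainder, so it never sees features produced from the extractors' own training data. A hypothetical sketch of that split (the fit/transform calls are assumed interfaces, not the project's actual API):
In [ ]:
    
# Hypothetical sketch of the inner/outer split behind stacking.
# inner_prop=0.25: a quarter of the training data fits the feature-level models.
X_inner, X_outer, y_inner, y_outer = sklearn.cross_validation.train_test_split(
    X, y, train_size=0.25, random_state=42)
# hlf.fit(X_inner, y_inner)             # assumed interface
# features = hlf.transform(X_outer)     # assumed interface
# stack_pipe.fit(features, y_outer)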
    
In [23]:
    
import neukrill_net.taxonomy
    
In [24]:
    
neukrill_net.taxonomy.taxonomy
    
    Out[24]:
In [25]:
    
marked_taxonomy = neukrill_net.stacked.propagate_labels_to_leaves(neukrill_net.taxonomy.taxonomy, settings.classes)
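
propagate_labels_to_leaves is project code; the point is that a hierarchy classifier needs to know which of the competition classes sit beneath each node of the taxonomy. A hypothetical illustration of that kind of traversal over a nested-dict taxonomy (not the project's actual implementation):
In [ ]:
    
def leaf_labels(tree):
    # Collect the leaf names under a nested-dict taxonomy node
    # (illustrative sketch only; the real helper lives in neukrill_net.stacked).
    labels = []
    for name, subtree in tree.items():
        if subtree:
            labels.extend(leaf_labels(subtree))
        else:
            labels.append(name)
    return labels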
    
In [26]:
    
marked_taxonomy
    
    Out[26]:
In [27]:
    
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X, y, test_size=0.5, random_state=42)
    
In [35]:
    
reload(neukrill_net.stacked)
    
    Out[35]:
In [36]:
    
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, stacked_clf)
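
HierarchyClassifier is also project code; the usual scheme trains one classifier per internal node to choose among that node's children, and a leaf's predicted probability is the product of the branch probabilities along the root-to-leaf path. A minimal sketch of that product rule (hypothetical; the project implementation may differ):
In [ ]:
    
# Minimal sketch of the path-product rule for hierarchical prediction.
def leaf_proba(branch_probs):
    # branch_probs: probability of taking each branch on the root-to-leaf path
    return np.prod(branch_probs)

# e.g. P(branch at root)=0.6, P(this leaf | that subtree)=0.5
print(leaf_proba([0.6, 0.5]))  # -> 0.3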
    
In [37]:
    
t0 = time.time()
hier_clf.fit(X_train, y_train)
print("Time={}".format(time.time()-t0))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
    
    
    
On the original feature set
This performs about the same as using just the Contour Moments and Haralick features.
On the reduced feature set
In [22]:
    
my_X = X_new  # X_new: the reduced feature matrix from an earlier feature-selection step
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
    
    
It does slightly worse with fewer features.
Maybe we kept too few?
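
One way to check would be to sweep k and compare log loss. A sketch (the k values are guesses; note that, like the surrounding cells, this fits the selector on all of the data before splitting, which leaks a little information):
In [ ]:
    
# Sweep the number of selected features and compare log loss (sketch).
for k in [50, 100, 200, 400]:
    sel = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=k)
    my_X = sklearn.preprocessing.StandardScaler().fit_transform(sel.fit_transform(XF.squeeze(0), y))
    X_tr, X_te, y_tr, y_te = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
    clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20,
                                                  min_samples_leaf=5, n_jobs=12, random_state=42)
    clf.fit(X_tr, y_tr)
    print("k={} Logloss={}".format(k, sklearn.metrics.log_loss(y_te, clf.predict_proba(X_te))))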
In [23]:
    
# XF: the full feature matrix extracted earlier; squeeze(0) drops its leading singleton axis
my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
    
    
In [24]:
    
import neukrill_net.taxonomy
    
In [29]:
    
reload(neukrill_net.stacked)
    
    Out[29]:
In [36]:
    
reload(neukrill_net.taxonomy)
    
    Out[36]:
In [37]:
    
neukrill_net.taxonomy.taxonomy
    
    Out[37]:
In [38]:
    
settings.classes
    
    Out[38]:
In [39]:
    
marked_taxonomy = neukrill_net.stacked.propagate_labels_to_leaves(neukrill_net.taxonomy.taxonomy, settings.classes)
    
In [40]:
    
marked_taxonomy
    
    Out[40]:
In [41]:
    
base_clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_clf)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
    
    
In [42]:
    
base_clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_clf)
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
    
    
Try with a pipeline so the number of features is reduced separately at each level of the hierarchy
In [44]:
    
import sklearn.pipeline
    
In [47]:
    
base_clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)
best_filter = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100)
base_pipe = sklearn.pipeline.Pipeline([('filter', best_filter), ('clf', base_clf)])
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_pipe)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
    
    
    
In [48]:
    
clf = sklearn.linear_model.LogisticRegression(random_state=42)
    
In [49]:
    
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
    
    
In [50]:
    
clf = sklearn.linear_model.LogisticRegression(random_state=42)
my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
    
    
In [55]:
    
base_clf = sklearn.linear_model.LogisticRegression(random_state=42)
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_clf)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
    
    
In [52]:
    
base_clf = sklearn.linear_model.LogisticRegression(random_state=42)
best_filter = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100)
base_pipe = sklearn.pipeline.Pipeline([('filter', best_filter), ('clf', base_clf)])
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_pipe)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
    
    
In [56]:
    
clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
    
    
In [57]:
    
clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)
my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
    
    
In [58]:
    
base_clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_clf)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
    
    
In [59]:
    
base_clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)
best_filter = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100)
base_pipe = sklearn.pipeline.Pipeline([('filter', best_filter), ('clf', base_clf)])
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_pipe)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
    
    
One-vs-one: sklearn's SVC handles multiclass internally with a one-vs-one scheme (one binary SVM per pair of classes), unlike LogisticRegression, which defaults to one-vs-rest. The cells below also switch to the default RBF kernel.
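The two schemes can also be compared directly with sklearn's explicit wrappers around a linear SVM (a hypothetical side experiment; LinearSVC has no predict_proba, so only accuracy is shown):
In [ ]:
    
# Explicit multiclass wrappers for comparison (illustrative sketch).
import sklearn.multiclass
for name, wrapper in [('one-vs-one', sklearn.multiclass.OneVsOneClassifier),
                      ('one-vs-rest', sklearn.multiclass.OneVsRestClassifier)]:
    wrapped = wrapper(sklearn.svm.LinearSVC(random_state=42))
    wrapped.fit(X_train, y_train)
    print("{} accuracy={}".format(name, wrapped.score(X_test, y_test)))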
In [60]:
    
clf = sklearn.svm.SVC(probability=True, random_state=42)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
    
    
In [61]:
    
clf = sklearn.svm.SVC(kernel='rbf', probability=True, random_state=42)
my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))
    
    
In [62]:
    
base_clf = sklearn.svm.SVC(kernel='rbf', probability=True, random_state=42)
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_clf)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
    
    
In [63]:
    
base_clf = sklearn.svm.SVC(kernel='rbf', probability=True, random_state=42)
best_filter = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100)
base_pipe = sklearn.pipeline.Pipeline([('filter', best_filter), ('clf', base_clf)])
hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_pipe)
my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)
t0 = time.time()
hier_clf.fit(X_train, y_train)
t1 = time.time()
total = t1-t0
print("Time={}".format(total))
t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))
print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))
    
    
In [ ]: