In [22]:
# Paths of the positive / negative graph files handed to gspan_to_eden.
pos = 'bursi.pos.gspan'
neg = 'bursi.neg.gspan'

# Passed as relative_size to random_bipartition_iter when splitting each
# class into its train and test parts.
train_test_split = 0.7

# Passed as pos2neg_ratio to self_training.
pos2neg_ratio = 0.05

# n_jobs is forwarded to fit and estimate; cv is forwarded to fit.
n_jobs = -1
cv = 10

In [15]:
from eden.graph import Vectorizer

# Single vectorizer instance shared by every fit / self_training / estimate /
# annotate call in the cells below.
vectorizer = Vectorizer(complexity=3)

In [16]:
%%time
#fit and evaluate on full dataset

#create iterable from data
from eden.converter.graph.gspan import gspan_to_eden
iterable_pos = gspan_to_eden( pos )
iterable_neg = gspan_to_eden( neg )

#split train/test
from eden.util import random_bipartition_iter
iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)

#copy iterators for later use
from itertools import tee
iterable_pos_train,iterable_pos_train_ = tee(iterable_pos_train)
iterable_neg_train,iterable_neg_train_ = tee(iterable_neg_train)
iterable_pos_test,iterable_pos_test_ = tee(iterable_pos_test)
iterable_neg_test,iterable_neg_test_ = tee(iterable_neg_test)

from eden.util import fit, estimate
estimator = fit(iterable_pos_train_, iterable_neg_train_, vectorizer, n_jobs=n_jobs, cv=cv)
estimate(iterable_pos_test_, iterable_neg_test_, estimator, vectorizer, n_jobs=n_jobs)


Classifier:
SGDClassifier(alpha=8.97631593083e-05, class_weight='auto', epsilon=0.1,
       eta0=6.75034750726, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=13, n_jobs=-1,
       penalty='elasticnet', power_t=0.677131793837, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.836 +- 0.018
           precision: 0.855 +- 0.038
              recall: 0.853 +- 0.047
                  f1: 0.852 +- 0.017
   average_precision: 0.918 +- 0.017
             roc_auc: 0.906 +- 0.013
--------------------------------------------------------------------------------
Test set
Instances: 1302 ; Features: 1048577 with an avg of 181 features per instance
--------------------------------------------------------------------------------
Test Estimate
             precision    recall  f1-score   support

         -1       0.82      0.82      0.82       581
          1       0.85      0.85      0.85       721

avg / total       0.84      0.84      0.84      1302

ROC: 0.905
APR: 0.916
CPU times: user 25.7 s, sys: 10.7 s, total: 36.4 s
Wall time: 1min 25s

In [23]:
%%time
#fit and evaluate on fraction of negatives

iterable_pos_train,iterable_pos_train_ = tee(iterable_pos_train)
iterable_neg_train,iterable_neg_train_ = tee(iterable_neg_train)
iterable_pos_test,iterable_pos_test_ = tee(iterable_pos_test)
iterable_neg_test,iterable_neg_test_ = tee(iterable_neg_test)

#train and evaluate
from eden.util import self_training
estimator = self_training(iterable_pos_train_, iterable_neg_train_, vectorizer=vectorizer, pos2neg_ratio=pos2neg_ratio, num_iterations=1, threshold=-1, mode='more_than' )
estimate(iterable_pos_test_, iterable_neg_test_, estimator, vectorizer)


Positives:
Instances: 1680 ; Features: 1048577 with an avg of 188 features per instance
Iteration: 1/1
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 153 features per instance
Classifier:
SGDClassifier(alpha=0.000148599680904, class_weight='auto', epsilon=0.1,
       eta0=8.30729929027, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=29, n_jobs=-1,
       penalty='l2', power_t=0.962092086917, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.945 +- 0.012
           precision: 0.971 +- 0.007
              recall: 0.971 +- 0.010
                  f1: 0.971 +- 0.006
   average_precision: 0.994 +- 0.003
             roc_auc: 0.895 +- 0.040
--------------------------------------------------------------------------------
Test set
Instances: 1302 ; Features: 1048577 with an avg of 181 features per instance
--------------------------------------------------------------------------------
Test Estimate
             precision    recall  f1-score   support

         -1       0.88      0.31      0.46       581
          1       0.64      0.97      0.77       721

avg / total       0.75      0.68      0.63      1302

ROC: 0.855
APR: 0.875
CPU times: user 54.6 s, sys: 40.6 s, total: 1min 35s
Wall time: 2min 8s

In [24]:
%%time 
#train with self training strategy

iterable_pos_train,iterable_pos_train_ = tee(iterable_pos_train)
iterable_neg_train,iterable_neg_train_ = tee(iterable_neg_train)
iterable_pos_test,iterable_pos_test_ = tee(iterable_pos_test)
iterable_neg_test,iterable_neg_test_ = tee(iterable_neg_test)

estimator = self_training(iterable_pos_train_, iterable_neg_train_, vectorizer=vectorizer, pos2neg_ratio=pos2neg_ratio, num_iterations=20, threshold= -1, mode='more_than' )
estimate(iterable_pos_test_, iterable_neg_test_, estimator, vectorizer)


Positives:
Instances: 1680 ; Features: 1048577 with an avg of 188 features per instance
Iteration: 1/20
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 153 features per instance
Classifier:
SGDClassifier(alpha=0.000336484649532, class_weight='auto', epsilon=0.1,
       eta0=6.47038640862, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=49, n_jobs=-1,
       penalty='l2', power_t=0.2348462725, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.918 +- 0.019
           precision: 0.975 +- 0.008
              recall: 0.938 +- 0.020
                  f1: 0.956 +- 0.011
   average_precision: 0.993 +- 0.003
             roc_auc: 0.891 +- 0.036
--------------------------------------------------------------------------------
Iteration: 2/20
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 181 features per instance
Classifier:
SGDClassifier(alpha=0.000384637968996, class_weight='auto', epsilon=0.1,
       eta0=7.3423659076, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=20, n_jobs=-1,
       penalty='l2', power_t=0.291397971212, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.888 +- 0.016
           precision: 0.971 +- 0.008
              recall: 0.910 +- 0.017
                  f1: 0.939 +- 0.009
   average_precision: 0.986 +- 0.009
             roc_auc: 0.815 +- 0.080
--------------------------------------------------------------------------------
Iteration: 3/20
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 166 features per instance
Classifier:
SGDClassifier(alpha=0.000733829297335, class_weight='auto', epsilon=0.1,
       eta0=1.16576984895, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='hinge', n_iter=24, n_jobs=-1,
       penalty='l2', power_t=0.1666954585, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.758 +- 0.093
           precision: 0.984 +- 0.010
              recall: 0.759 +- 0.104
                  f1: 0.852 +- 0.068
   average_precision: 0.989 +- 0.004
             roc_auc: 0.813 +- 0.056
--------------------------------------------------------------------------------
Iteration: 4/20
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 168 features per instance
Classifier:
SGDClassifier(alpha=9.37761801746e-05, class_weight='auto', epsilon=0.1,
       eta0=5.47663903026, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=12, n_jobs=-1,
       penalty='l2', power_t=0.557367911782, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.934 +- 0.017
           precision: 0.960 +- 0.006
              recall: 0.971 +- 0.020
                  f1: 0.965 +- 0.009
   average_precision: 0.984 +- 0.005
             roc_auc: 0.768 +- 0.060
--------------------------------------------------------------------------------
Iteration: 5/20
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 161 features per instance
Classifier:
SGDClassifier(alpha=0.000477356331166, class_weight='auto', epsilon=0.1,
       eta0=6.73668205814, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='hinge', n_iter=85, n_jobs=-1,
       penalty='l2', power_t=0.466525687371, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.868 +- 0.019
           precision: 0.977 +- 0.009
              recall: 0.883 +- 0.020
                  f1: 0.927 +- 0.011
   average_precision: 0.990 +- 0.006
             roc_auc: 0.854 +- 0.073
--------------------------------------------------------------------------------
Iteration: 6/20
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 160 features per instance
Classifier:
SGDClassifier(alpha=0.00018998189191, class_weight='auto', epsilon=0.1,
       eta0=2.10687395137, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=9, n_jobs=-1,
       penalty='l2', power_t=1.05671785769, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.918 +- 0.019
           precision: 0.965 +- 0.009
              recall: 0.949 +- 0.024
                  f1: 0.957 +- 0.010
   average_precision: 0.985 +- 0.007
             roc_auc: 0.796 +- 0.070
--------------------------------------------------------------------------------
Iteration: 7/20
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 175 features per instance
Classifier:
SGDClassifier(alpha=0.000321489477344, class_weight='auto', epsilon=0.1,
       eta0=4.59916635655, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=31, n_jobs=-1,
       penalty='l2', power_t=0.669693409659, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.908 +- 0.023
           precision: 0.973 +- 0.007
              recall: 0.930 +- 0.019
                  f1: 0.951 +- 0.013
   average_precision: 0.990 +- 0.004
             roc_auc: 0.846 +- 0.055
--------------------------------------------------------------------------------
Iteration: 8/20
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 150 features per instance
Classifier:
SGDClassifier(alpha=0.000270085229934, class_weight='auto', epsilon=0.1,
       eta0=6.56673581292, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=9, n_jobs=-1,
       penalty='l2', power_t=0.109571788647, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.884 +- 0.021
           precision: 0.970 +- 0.005
              recall: 0.906 +- 0.023
                  f1: 0.937 +- 0.012
   average_precision: 0.990 +- 0.004
             roc_auc: 0.833 +- 0.048
--------------------------------------------------------------------------------
Iteration: 9/20
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 167 features per instance
Classifier:
SGDClassifier(alpha=0.000103811031891, class_weight='auto', epsilon=0.1,
       eta0=9.47731259819, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=22, n_jobs=-1,
       penalty='l2', power_t=0.812456665481, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.935 +- 0.008
           precision: 0.953 +- 0.004
              recall: 0.980 +- 0.010
                  f1: 0.966 +- 0.004
   average_precision: 0.989 +- 0.006
             roc_auc: 0.814 +- 0.096
--------------------------------------------------------------------------------
Iteration: 10/20
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 170 features per instance
Classifier:
SGDClassifier(alpha=0.000307254703885, class_weight='auto', epsilon=0.1,
       eta0=7.70476320984, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=94, n_jobs=-1,
       penalty='l2', power_t=0.255264497877, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.902 +- 0.014
           precision: 0.964 +- 0.007
              recall: 0.932 +- 0.017
                  f1: 0.948 +- 0.008
   average_precision: 0.986 +- 0.007
             roc_auc: 0.796 +- 0.066
--------------------------------------------------------------------------------
Iteration: 11/20
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 160 features per instance
Classifier:
SGDClassifier(alpha=5.23799006015e-05, class_weight='auto', epsilon=0.1,
       eta0=2.08221511386, fit_intercept=True, l1_ratio=0.15,
       learning_rate='constant', loss='hinge', n_iter=71, n_jobs=-1,
       penalty='l2', power_t=0.165088949459, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.904 +- 0.060
           precision: 0.960 +- 0.007
              recall: 0.939 +- 0.068
                  f1: 0.948 +- 0.036
   average_precision: 0.984 +- 0.007
             roc_auc: 0.770 +- 0.078
--------------------------------------------------------------------------------
Iteration: 12/20
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 161 features per instance
Classifier:
SGDClassifier(alpha=0.000543331545919, class_weight='auto', epsilon=0.1,
       eta0=1.48169549008, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='hinge', n_iter=91, n_jobs=-1,
       penalty='l2', power_t=0.368076770158, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.835 +- 0.025
           precision: 0.972 +- 0.007
              recall: 0.852 +- 0.032
                  f1: 0.907 +- 0.016
   average_precision: 0.988 +- 0.006
             roc_auc: 0.818 +- 0.072
--------------------------------------------------------------------------------
Iteration: 13/20
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 189 features per instance
Classifier:
SGDClassifier(alpha=6.72601084939e-05, class_weight='auto', epsilon=0.1,
       eta0=2.77401512557, fit_intercept=True, l1_ratio=0.15,
       learning_rate='invscaling', loss='hinge', n_iter=30, n_jobs=-1,
       penalty='l2', power_t=0.288744435175, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.922 +- 0.017
           precision: 0.960 +- 0.006
              recall: 0.958 +- 0.017
                  f1: 0.959 +- 0.009
   average_precision: 0.986 +- 0.006
             roc_auc: 0.796 +- 0.068
--------------------------------------------------------------------------------
Iteration: 14/20
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 186 features per instance
Classifier:
SGDClassifier(alpha=1.10282726287e-05, class_weight='auto', epsilon=0.1,
       eta0=1.88436009051, fit_intercept=True, l1_ratio=0.15,
       learning_rate='constant', loss='hinge', n_iter=77, n_jobs=-1,
       penalty='l2', power_t=0.804730425688, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.943 +- 0.013
           precision: 0.959 +- 0.004
              recall: 0.982 +- 0.013
                  f1: 0.970 +- 0.007
   average_precision: 0.991 +- 0.006
             roc_auc: 0.854 +- 0.065
--------------------------------------------------------------------------------
Iteration: 15/20
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 187 features per instance
Classifier:
SGDClassifier(alpha=0.000430570160623, class_weight='auto', epsilon=0.1,
       eta0=6.45241993653, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=-1,
       penalty='l2', power_t=0.787022763905, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.883 +- 0.014
           precision: 0.973 +- 0.009
              recall: 0.902 +- 0.016
                  f1: 0.936 +- 0.008
   average_precision: 0.988 +- 0.007
             roc_auc: 0.836 +- 0.055
--------------------------------------------------------------------------------
Iteration: 16/20
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 147 features per instance
Classifier:
SGDClassifier(alpha=2.36019501808e-05, class_weight='auto', epsilon=0.1,
       eta0=9.93474088776, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=99, n_jobs=-1,
       penalty='l2', power_t=0.934205456405, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.951 +- 0.009
           precision: 0.958 +- 0.005
              recall: 0.993 +- 0.007
                  f1: 0.975 +- 0.004
   average_precision: 0.992 +- 0.004
             roc_auc: 0.865 +- 0.066
--------------------------------------------------------------------------------
Iteration: 17/20
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 178 features per instance
Classifier:
SGDClassifier(alpha=0.000302649298351, class_weight='auto', epsilon=0.1,
       eta0=0.219296058439, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=11, n_jobs=-1,
       penalty='l2', power_t=0.205583636727, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.904 +- 0.018
           precision: 0.975 +- 0.012
              recall: 0.923 +- 0.022
                  f1: 0.948 +- 0.010
   average_precision: 0.993 +- 0.003
             roc_auc: 0.878 +- 0.046
--------------------------------------------------------------------------------
Iteration: 18/20
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 174 features per instance
Classifier:
SGDClassifier(alpha=9.75543670139e-05, class_weight='auto', epsilon=0.1,
       eta0=8.90811972677, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=88, n_jobs=-1,
       penalty='l2', power_t=0.405378881742, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.936 +- 0.012
           precision: 0.957 +- 0.004
              recall: 0.976 +- 0.011
                  f1: 0.967 +- 0.006
   average_precision: 0.988 +- 0.006
             roc_auc: 0.830 +- 0.073
--------------------------------------------------------------------------------
Iteration: 19/20
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 166 features per instance
Classifier:
SGDClassifier(alpha=0.000972468302056, class_weight='auto', epsilon=0.1,
       eta0=9.4368426691, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=94, n_jobs=-1,
       penalty='l2', power_t=0.508140121323, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.768 +- 0.055
           precision: 0.980 +- 0.010
              recall: 0.772 +- 0.054
                  f1: 0.863 +- 0.036
   average_precision: 0.988 +- 0.007
             roc_auc: 0.812 +- 0.072
--------------------------------------------------------------------------------
Iteration: 20/20
Negatives:
Instances: 84 ; Features: 1048577 with an avg of 150 features per instance
Classifier:
SGDClassifier(alpha=0.000390662884465, class_weight='auto', epsilon=0.1,
       eta0=1.14085120818, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=89, n_jobs=-1,
       penalty='l2', power_t=0.207487319012, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Predictive performance:
            accuracy: 0.893 +- 0.018
           precision: 0.974 +- 0.008
              recall: 0.912 +- 0.018
                  f1: 0.942 +- 0.010
   average_precision: 0.989 +- 0.006
             roc_auc: 0.841 +- 0.054
--------------------------------------------------------------------------------
Test set
Instances: 1302 ; Features: 1048577 with an avg of 181 features per instance
--------------------------------------------------------------------------------
Test Estimate
             precision    recall  f1-score   support

         -1       0.82      0.49      0.61       581
          1       0.69      0.91      0.78       721

avg / total       0.75      0.72      0.71      1302

ROC: 0.851
APR: 0.877
CPU times: user 10min 43s, sys: 3min 30s, total: 14min 13s
Wall time: 25min 38s

Note: The recall on the negative class (-1) has gone from 0.31 with a single iteration to 0.49 with the 20-iteration self-training strategy, while the recall on the positive class (1) has dropped from 0.97 to 0.91.


In [19]:
%matplotlib inline

from eden.converter.graph.gspan import gspan_to_eden
graphs = gspan_to_eden( 'bursi.pos.gspan' )
graphs = vectorizer.annotate( graphs, estimator=estimator )

from eden.modifier.graph.vertex_attributes import colorize_binary
graphs = colorize_binary(graph_list = graphs, output_attribute = 'color_value', input_attribute='importance', level=0.5)

import itertools 
graphs = itertools.islice( graphs, 20 )

from eden.util.display import draw_graph
for graph in graphs: draw_graph( graph, size=6, node_border = False, vertex_color='color_value', colormap='YlOrRd', invert_colormap = False )