In [2]:
%matplotlib inline
import os, sys
import ntpath
from eden.util import display
# -----------------------------------------------------------------------------

In [3]:
from graph_finder import GraphFinder

Experiments:

read a directory of '.sto' files and create their graphs


In [4]:
pos_class_0_path = "StoExamples/StoClasses/pos_class_0"
pos_class_1_path = "StoExamples/StoClasses/pos_class_1"
neg_class_0_path = "StoExamples/StoClasses/neg_class_0"
neg_class_1_path = "StoExamples/StoClasses/neg_class_1"

pos_class_0_abs_path = os.path.abspath(pos_class_0_path)
pos_class_1_abs_path = os.path.abspath(pos_class_1_path)
neg_class_0_abs_path = os.path.abspath(neg_class_0_path)
neg_class_1_abs_path = os.path.abspath(neg_class_1_path)

convert files to graphs


In [5]:
gf = GraphFinder()
pos_0_graphs = gf.convert(pos_class_0_abs_path)
pos_1_graphs = gf.convert(pos_class_1_abs_path)
neg_0_graphs = gf.convert(neg_class_0_abs_path)
neg_1_graphs = gf.convert(neg_class_1_abs_path)


{'graph_title': '550-70730-0-0'}
{'graph_title': '550-70852-0-0'}
{'graph_title': '550-53949-1-0'}
{'graph_title': '550-69275-0-0'}
{'graph_title': '550-69410-0-0'}
{'graph_title': '550-751-0-0'}
{'graph_title': '550-1137-0-0'}
{'graph_title': '550-904-1-0'}
{'graph_title': '550-1143-1-0'}
{'graph_title': '550-1153-0-0'}

In [6]:
import numpy as np
def make_target(fname_pos, fname_neg):
    '''create target list'''
    
    target = [1]* len(fname_pos) + [-1] * len(fname_neg)
    y = np.asarray([int(value) for value in target])
    #print ('y',y)
    return y

In [7]:
from eden.graph import Vectorizer
def vectorizer(transform_train):
    '''this function vectorize the train data'''
    print 'vectorizer'
    %%time
  
    vectorizer = Vectorizer(complexity=3)

    '''extract features and build data matrix'''
    X = vectorizer.transform(transform_train)
    print ('X', X)
    print 'Instances: %d Features: %d with an avg of %d features per instance' % (X.shape[0], X.shape[1],  X.getnnz()/X.shape[0])
    return X

In [8]:
def make_data(fname_pos, fname_neg):
    '''this function make the train data, apply the transformation to the graphs,
        and apply the vectorization'''

    train_graphs = fname_pos + fname_neg

    '''create the transformed train graphs'''
    transform_trains = gf.graphs_transform(train_graphs, use_seq=True, use_cov = True)
    
    transform_fname_pos = gf.graphs_transform(fname_pos, use_seq=True, use_cov = True)
    print 'pos'

    transform_fname_neg = gf.graphs_transform(fname_neg, use_seq=True, use_cov = True)
    print 'neg'

    #test_graphs = make_test_graphs(fname_pos, fname_neg)
    #print ('test_graphs', test_graphs)
    
    X = vectorizer(transform_trains)

    y = make_target(transform_fname_pos, transform_fname_neg)
    print 'Done'
    return X, y

Classification

make train test data


In [9]:
X, y = make_data(pos_0_graphs, neg_0_graphs)
test_graphs = pos_1_graphs + neg_1_graphs
print ('test_graphs', test_graphs)
print X
print y


pos
neg
vectorizer
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.01 µs
('X', <4x1048577 sparse matrix of type '<type 'numpy.float64'>'
	with 10356 stored elements in Compressed Sparse Row format>)
Instances: 4 Features: 1048577 with an avg of 2589 features per instance
Done
('test_graphs', [<networkx.classes.graph.Graph object at 0x7f5a2f00c790>, <networkx.classes.graph.Graph object at 0x7f5a59e94650>, <networkx.classes.graph.Graph object at 0x7f5a2f00c910>, <networkx.classes.graph.Graph object at 0x7f5a2f00ca10>, <networkx.classes.graph.Graph object at 0x7f5a2f00ca50>, <networkx.classes.graph.Graph object at 0x7f5a2f00ca90>])
  (0, 771)	0.0296695414548
  (0, 846)	0.0320092199832
  (0, 1016)	0.0263523138347
  (0, 3408)	0.0252538136138
  (0, 3460)	0.0296695414548
  (0, 5207)	0.0296695414548
  (0, 5750)	0.0296695414548
  (0, 5936)	0.0320092199832
  (0, 6727)	0.0263523138347
  (0, 7240)	0.0263523138347
  (0, 7856)	0.0252538136138
  (0, 8606)	0.0263523138347
  (0, 8670)	0.0241684122261
  (0, 8871)	0.0263523138347
  (0, 8887)	0.0296695414548
  (0, 8994)	0.0464238345443
  (0, 9136)	0.0252538136138
  (0, 10799)	0.0263523138347
  (0, 11520)	0.0252538136138
  (0, 12052)	0.0252538136138
  (0, 12087)	0.0208333333333
  (0, 12210)	0.0252538136138
  (0, 13098)	0.0199521721117
  (0, 13378)	0.0399043442234
  (0, 14138)	0.0320092199832
  :	:
  (3, 1039960)	0.0157173653365
  (3, 1040661)	0.0160375074775
  (3, 1041052)	0.0152145154863
  (3, 1041204)	0.0187382922249
  (3, 1041395)	0.01610391566
  (3, 1041742)	0.0192879187453
  (3, 1042148)	0.0152145154863
  (3, 1042254)	0.0187382922249
  (3, 1042482)	0.0160375074775
  (3, 1042697)	0.017766726363
  (3, 1042744)	0.017766726363
  (3, 1043641)	0.0152145154863
  (3, 1043732)	0.0157173653365
  (3, 1043838)	0.0180421959122
  (3, 1044122)	0.0152145154863
  (3, 1044164)	0.0174183253573
  (3, 1044254)	0.017766726363
  (3, 1044332)	0.0174183253573
  (3, 1044420)	0.0150482316357
  (3, 1044721)	0.0152145154863
  (3, 1045143)	0.0170896481756
  (3, 1045828)	0.0150482316357
  (3, 1046582)	0.0150482316357
  (3, 1047835)	0.0152145154863
  (3, 1048087)	0.01610391566
[ 1  1 -1 -1]

Modelling


In [12]:
%%time
#induce a predictive model
from sklearn.linear_model import SGDClassifier
predictor = SGDClassifier(average=True, class_weight='balanced', shuffle=True, n_jobs=-1)

from sklearn import cross_validation
scores = cross_validation.cross_val_score(predictor, X, y, cv= 3, scoring='roc_auc')

import numpy as np
print('AUC ROC: %.4f +- %.4f' % (np.mean(scores),np.std(scores)))


/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/cross_validation.py:516: Warning: The least populated class in y has only 2 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=3.
  % (min_labels, self.n_folds)), Warning)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-12-1bfb8de16d1a> in <module>()
----> 1 get_ipython().run_cell_magic(u'time', u'', u"#induce a predictive model\nfrom sklearn.linear_model import SGDClassifier\npredictor = SGDClassifier(average=True, class_weight='balanced', shuffle=True, n_jobs=-1)\n\nfrom sklearn import cross_validation\nscores = cross_validation.cross_val_score(predictor, X, y, cv= 3, scoring='roc_auc')\n\nimport numpy as np\nprint('AUC ROC: %.4f +- %.4f' % (np.mean(scores),np.std(scores)))")

/home/alsheikm/miniconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_cell_magic(self, magic_name, line, cell)
   2291             magic_arg_s = self.var_expand(line, stack_depth)
   2292             with self.builtin_trap:
-> 2293                 result = fn(magic_arg_s, cell)
   2294             return result
   2295 

/home/alsheikm/miniconda2/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)

/home/alsheikm/miniconda2/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/home/alsheikm/miniconda2/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1165         else:
   1166             st = clock2()
-> 1167             exec(code, glob, local_ns)
   1168             end = clock2()
   1169             out = None

<timed exec> in <module>()

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/cross_validation.pyc in cross_val_score(estimator, X, y, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
   1431                                               train, test, verbose, None,
   1432                                               fit_params)
-> 1433                       for train, test in cv)
   1434     return np.array(scores)[:, 0]
   1435 

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
    802             self._iterating = True
    803 
--> 804             while self.dispatch_one_batch(iterator):
    805                 pass
    806 

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
    660                 return False
    661             else:
--> 662                 self._dispatch(tasks)
    663                 return True
    664 

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
    568 
    569         if self._pool is None:
--> 570             job = ImmediateComputeBatch(batch)
    571             self._jobs.append(job)
    572             self.n_dispatched_batches += 1

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __init__(self, batch)
    181         # Don't delay the application, to avoid keeping the input
    182         # arguments in memory
--> 183         self.results = batch()
    184 
    185     def get(self):

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
     73 
     74     def __len__(self):

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
   1548 
   1549     else:
-> 1550         test_score = _score(estimator, X_test, y_test, scorer)
   1551         if return_train_score:
   1552             train_score = _score(estimator, X_train, y_train, scorer)

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _score(estimator, X_test, y_test, scorer)
   1604         score = scorer(estimator, X_test)
   1605     else:
-> 1606         score = scorer(estimator, X_test, y_test)
   1607     if not isinstance(score, numbers.Number):
   1608         raise ValueError("scoring must return a number, got %s (%s) instead."

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/metrics/scorer.pyc in __call__(self, clf, X, y, sample_weight)
    163         else:
    164             try:
--> 165                 y_pred = clf.decision_function(X)
    166 
    167                 # For multi-output multi-class estimator

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/linear_model/base.pyc in decision_function(self, X)
    242                                  "yet" % {'name': type(self).__name__})
    243 
--> 244         X = check_array(X, accept_sparse='csr')
    245 
    246         n_features = self.coef_.shape[1]

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    405                              " minimum of %d is required%s."
    406                              % (n_samples, shape_repr, ensure_min_samples,
--> 407                                 context))
    408 
    409     if ensure_min_features > 0 and array.ndim == 2:

ValueError: Found array with 0 sample(s) (shape=(0, 1048577)) while a minimum of 1 is required.

In [11]:
%%time
#induce a predictive model
from sklearn.linear_model import SGDClassifier
predictor = SGDClassifier(average=True, class_weight='balanced', shuffle=True, n_jobs=-1)

import numpy as np
from sklearn.metrics import roc_auc_score
score = roc_auc_score(y, X)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-11-ff223b04dd18> in <module>()
----> 1 get_ipython().run_cell_magic(u'time', u'', u"#induce a predictive model\nfrom sklearn.linear_model import SGDClassifier\npredictor = SGDClassifier(average=True, class_weight='balanced', shuffle=True, n_jobs=-1)\n\nimport numpy as np\nfrom sklearn.metrics import roc_auc_score\nscore = roc_auc_score(y, X)")

/home/alsheikm/miniconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_cell_magic(self, magic_name, line, cell)
   2291             magic_arg_s = self.var_expand(line, stack_depth)
   2292             with self.builtin_trap:
-> 2293                 result = fn(magic_arg_s, cell)
   2294             return result
   2295 

/home/alsheikm/miniconda2/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)

/home/alsheikm/miniconda2/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/home/alsheikm/miniconda2/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1165         else:
   1166             st = clock2()
-> 1167             exec(code, glob, local_ns)
   1168             end = clock2()
   1169             out = None

<timed exec> in <module>()

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/metrics/ranking.pyc in roc_auc_score(y_true, y_score, average, sample_weight)
    251     return _average_binary_score(
    252         _binary_roc_auc_score, y_true, y_score, average,
--> 253         sample_weight=sample_weight)
    254 
    255 

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/metrics/base.pyc in _average_binary_score(binary_metric, y_true, y_score, average, sample_weight)
     77 
     78     if y_type == "binary":
---> 79         return binary_metric(y_true, y_score, sample_weight=sample_weight)
     80 
     81     check_consistent_length(y_true, y_score, sample_weight)

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/metrics/ranking.pyc in _binary_roc_auc_score(y_true, y_score, sample_weight)
    246 
    247         fpr, tpr, tresholds = roc_curve(y_true, y_score,
--> 248                                         sample_weight=sample_weight)
    249         return auc(fpr, tpr, reorder=True)
    250 

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/metrics/ranking.pyc in roc_curve(y_true, y_score, pos_label, sample_weight, drop_intermediate)
    495     """
    496     fps, tps, thresholds = _binary_clf_curve(
--> 497         y_true, y_score, pos_label=pos_label, sample_weight=sample_weight)
    498 
    499     # Attempt to drop thresholds corresponding to points in between and

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/metrics/ranking.pyc in _binary_clf_curve(y_true, y_score, pos_label, sample_weight)
    290     check_consistent_length(y_true, y_score)
    291     y_true = column_or_1d(y_true)
--> 292     y_score = column_or_1d(y_score)
    293     if sample_weight is not None:
    294         sample_weight = column_or_1d(sample_weight)

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/utils/validation.pyc in column_or_1d(y, warn)
    549         return np.ravel(y)
    550 
--> 551     raise ValueError("bad input shape {0}".format(shape))
    552 
    553 

ValueError: bad input shape (4, 1048577)

Note

draw the generated graphs


In [ ]:
for G in pos_0_graphs:
    display.draw_graph(G, size=40, node_size=400, font_size=20, node_border=True, prog='neato')
for G in pos_1_graphs:
    display.draw_graph(G, size=40, node_size=400, font_size=20, node_border=True, prog='neato')

for G in neg_0_graphs:
    display.draw_graph(G, size=40, node_size=400, font_size=20, node_border=True, prog='neato')
for G in neg_1_graphs:
    display.draw_graph(G, size=40, node_size=400, font_size=20, node_border=True, prog='neato')

transform the created graph to have the desired type of info(end user specification)


In [13]:
transform_pos_0_graphs = gf.graphs_transform(pos_0_graphs, use_seq=True, use_cov = False)
transform_pos_1_graphs = gf.graphs_transform(pos_1_graphs, use_seq=True, use_cov = False)

transform_neg_0_graphs = gf.graphs_transform(neg_0_graphs, use_seq=True, use_cov = False)
transform_neg_1_graphs = gf.graphs_transform(neg_1_graphs, use_seq=True, use_cov = False)

draw the transformed graphs


In [ ]:
for G in pos_0_graphs:
    display.draw_graph(G, size=40, node_size=400, font_size=20, node_border=True, prog='neato')
for G in pos_1_graphs:
    display.draw_graph(G, size=40, node_size=400, font_size=20, node_border=True, prog='neato')

for G in neg_0_graphs:
    display.draw_graph(G, size=40, node_size=400, font_size=20, node_border=True, prog='neato')
for G in neg_1_graphs:
    display.draw_graph(G, size=40, node_size=400, font_size=20, node_border=True, prog='neato')

In [ ]:
def make_test_graphs(fname_pos, fname_neg):
    print fname_pos
    print fname_neg
    
    if fname_pos == 'pos_0_graphs' and fname_neg == 'neg_0_graphs':
        test_graphs = pos_1_graphs + neg_1_graphs
        print 'test = pos_1_graphs + neg_1_graphs'
    if fname_pos == 'pos_0_graphs' and fname_neg == 'neg_1_graphs':
        test_graphs = pos_1_graphs + neg_0_graphs
        print 'test = pos_1_graphs + neg_0_graphs'
    if fname_pos == 'pos_1_graphs' and fname_neg == 'neg_0_graphs':
        test_graphs = pos_0_graphs + neg_1_graphs
        print 'test = pos_0_graphs + neg_0_graphs'
    if fname_pos == 'pos_1_graphs' and fname_neg == 'neg_1_graphs':
        test_graphs = pos_0_graphs + neg_0_graphs
        print 'test = pos_0_graphs + neg_0_graphs'
    return test