%matplotlib inline
import os, sys
import ntpath
from eden.util import display
# -----------------------------------------------------------------------------

from graph_finder import GraphFinder


read a directory of '.sto' files and create their graphs

# Each class directory is given relative to the working directory and is
# immediately paired with its absolute form.
pos_class_0_path = "StoExamples/StoClasses/pos_class_0"
pos_class_0_abs_path = os.path.abspath(pos_class_0_path)

pos_class_1_path = "StoExamples/StoClasses/pos_class_1"
pos_class_1_abs_path = os.path.abspath(pos_class_1_path)

neg_class_0_path = "StoExamples/StoClasses/neg_class_0"
neg_class_0_abs_path = os.path.abspath(neg_class_0_path)

neg_class_1_path = "StoExamples/StoClasses/neg_class_1"
neg_class_1_abs_path = os.path.abspath(neg_class_1_path)

convert files to graphs

# One GraphFinder instance performs all four conversions.
gf = GraphFinder()

# The directories are converted in the order pos_0, pos_1, neg_0, neg_1;
# each result is bound to the matching *_graphs name.
pos_0_graphs, pos_1_graphs, neg_0_graphs, neg_1_graphs = map(
    gf.convert,
    (
        pos_class_0_abs_path,
        pos_class_1_abs_path,
        neg_class_0_abs_path,
        neg_class_1_abs_path,
    ),
)

{'graph_title': '550-70730-0-0'}
{'graph_title': '550-70852-0-0'}
{'graph_title': '550-53949-1-0'}
{'graph_title': '550-69275-0-0'}
{'graph_title': '550-69410-0-0'}
{'graph_title': '550-751-0-0'}
{'graph_title': '550-1137-0-0'}
{'graph_title': '550-904-1-0'}
{'graph_title': '550-1143-1-0'}
{'graph_title': '550-1153-0-0'}

import numpy as np
def make_target(fname_pos, fname_neg):
    '''Build the label vector for a positive and a negative collection.

    Only the lengths of the two arguments are used: the result holds one
    ``1`` per element of ``fname_pos`` followed by one ``-1`` per element
    of ``fname_neg``.

    Returns a numpy array of those labels, positives first.
    '''
    labels = []
    labels.extend(1 for _ in range(len(fname_pos)))
    labels.extend(-1 for _ in range(len(fname_neg)))
    return np.asarray(labels)

from eden.graph import Vectorizer
def vectorizer(transform_train):
    '''Vectorize the transformed training graphs into a data matrix.

    Creates a ``Vectorizer`` with ``complexity=3``, applies its
    ``transform`` to ``transform_train``, and prints the matrix followed
    by a one-line summary (row count, column count, stored entries per
    row).

    Returns the matrix produced by ``Vectorizer.transform``.
    '''
    print('vectorizer')

    # extract features and build the data matrix
    graph_vectorizer = Vectorizer(complexity=3)
    X = graph_vectorizer.transform(transform_train)

    # A tuple is printed on purpose so the output keeps its ('X', ...) form.
    print(('X', X))

    n_instances, n_features = X.shape[0], X.shape[1]
    summary = 'Instances: %d Features: %d with an avg of %d features per instance'
    print(summary % (n_instances, n_features, X.getnnz() / n_instances))
    return X

def make_data(fname_pos, fname_neg):
    '''Build the training matrix and label vector from two graph collections.

    ``fname_pos`` and ``fname_neg`` are concatenated (positives first),
    transformed with the module-level ``gf`` and vectorized into ``X``.
    Each collection is also transformed on its own; those two results are
    handed to ``make_target`` to produce ``y``.

    Returns the tuple ``(X, y)``.
    '''
    # Every transform below runs with use_seq and use_cov both enabled.
    transform_opts = {'use_seq': True, 'use_cov': True}

    # Positives come first so the rows line up with the labels built below.
    all_transformed = gf.graphs_transform(fname_pos + fname_neg, **transform_opts)

    pos_transformed = gf.graphs_transform(fname_pos, **transform_opts)
    print('pos')

    neg_transformed = gf.graphs_transform(fname_neg, **transform_opts)
    print('neg')

    X = vectorizer(all_transformed)

    # The per-class transforms are passed on only to derive the labels.
    y = make_target(pos_transformed, neg_transformed)
    print('Done')
    return X, y


make the train and test data

# Training data comes from the class-0 graphs; the class-1 graphs are
# kept aside as the test set.
X, y = make_data(pos_0_graphs, neg_0_graphs)
test_graphs = pos_1_graphs + neg_1_graphs

# Show the test graphs, then the training matrix and its labels.
# The tuple keeps the ('test_graphs', ...) form of the printed output.
print(('test_graphs', test_graphs))
print(X)
print(y)

('X', <4x1048577 sparse matrix of type '<type 'numpy.float64'>'
	with 10356 stored elements in Compressed Sparse Row format>)
Instances: 4 Features: 1048577 with an avg of 2589 features per instance
('test_graphs', [<networkx.classes.graph.Graph object at 0x7f5a2f00c790>, <networkx.classes.graph.Graph object at 0x7f5a59e94650>, <networkx.classes.graph.Graph object at 0x7f5a2f00c910>, <networkx.classes.graph.Graph object at 0x7f5a2f00ca10>, <networkx.classes.graph.Graph object at 0x7f5a2f00ca50>, <networkx.classes.graph.Graph object at 0x7f5a2f00ca90>])
  (0, 771)	0.0296695414548
  (0, 846)	0.0320092199832
  (0, 1016)	0.0263523138347
  (0, 3408)	0.0252538136138
  (0, 3460)	0.0296695414548
  (0, 5207)	0.0296695414548
  (0, 5750)	0.0296695414548
  (0, 5936)	0.0320092199832
  (0, 6727)	0.0263523138347
  (0, 7240)	0.0263523138347
  (0, 7856)	0.0252538136138
  (0, 8606)	0.0263523138347
  (0, 8670)	0.0241684122261
  (0, 8871)	0.0263523138347
  (0, 8887)	0.0296695414548
  (0, 8994)	0.0464238345443
  (0, 9136)	0.0252538136138
  (0, 10799)	0.0263523138347
  (0, 11520)	0.0252538136138
  (0, 12052)	0.0252538136138
  (0, 12087)	0.0208333333333
  (0, 12210)	0.0252538136138
  (0, 13098)	0.0199521721117
  (0, 13378)	0.0399043442234
  (0, 14138)	0.0320092199832
  :	:
  (3, 1039960)	0.0157173653365
  (3, 1040661)	0.0160375074775
  (3, 1041052)	0.0152145154863
  (3, 1041204)	0.0187382922249
  (3, 1041395)	0.01610391566
  (3, 1041742)	0.0192879187453
  (3, 1042148)	0.0152145154863
  (3, 1042254)	0.0187382922249
  (3, 1042482)	0.0160375074775
  (3, 1042697)	0.017766726363
  (3, 1042744)	0.017766726363
  (3, 1043641)	0.0152145154863
  (3, 1043732)	0.0157173653365
  (3, 1043838)	0.0180421959122
  (3, 1044122)	0.0152145154863
  (3, 1044164)	0.0174183253573
  (3, 1044254)	0.017766726363
  (3, 1044332)	0.0174183253573
  (3, 1044420)	0.0150482316357
  (3, 1044721)	0.0152145154863
  (3, 1045143)	0.0170896481756
  (3, 1045828)	0.0150482316357
  (3, 1046582)	0.0150482316357
  (3, 1047835)	0.0152145154863
  (3, 1048087)	0.01610391566
[ 1  1 -1 -1]


# Induce a predictive model: a linear classifier trained with SGD.
from sklearn.linear_model import SGDClassifier
predictor = SGDClassifier(average=True, class_weight='balanced', shuffle=True, n_jobs=-1)

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20. Its replacement, sklearn.model_selection, provides cross_val_score
# with the same arguments used here. Prefer the new module, fall back for old
# installations, and keep the name `cross_validation` bound either way.
try:
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation

# 3-fold cross-validation of the predictor, scored by area under the ROC curve.
scores = cross_validation.cross_val_score(predictor, X, y, cv=3, scoring='roc_auc')

import numpy as np
print('AUC ROC: %.4f +- %.4f' % (np.mean(scores), np.std(scores)))

draw the generated graphs

# Draw every generated graph, one collection after another, with a single
# shared set of drawing options.
draw_options = dict(size=40, node_size=400, font_size=20, node_border=True, prog='neato')
for graphs in (pos_0_graphs, pos_1_graphs, neg_0_graphs, neg_1_graphs):
    for G in graphs:
        display.draw_graph(G, **draw_options)

transform the created graphs so that they carry the desired type of information (end-user specification)

# All four collections are transformed with use_seq enabled and use_cov
# disabled.
seq_only_opts = {'use_seq': True, 'use_cov': False}

transform_pos_0_graphs = gf.graphs_transform(pos_0_graphs, **seq_only_opts)
transform_pos_1_graphs = gf.graphs_transform(pos_1_graphs, **seq_only_opts)

transform_neg_0_graphs = gf.graphs_transform(neg_0_graphs, **seq_only_opts)
transform_neg_1_graphs = gf.graphs_transform(neg_1_graphs, **seq_only_opts)

draw the transformed graphs

# Draw the transformed graphs. The loops previously iterated over the
# original pos_*/neg_* collections again, so the results of
# gf.graphs_transform were never displayed.
for G in transform_pos_0_graphs:
    display.draw_graph(G, size=40, node_size=400, font_size=20, node_border=True, prog='neato')
for G in transform_pos_1_graphs:
    display.draw_graph(G, size=40, node_size=400, font_size=20, node_border=True, prog='neato')

for G in transform_neg_0_graphs:
    display.draw_graph(G, size=40, node_size=400, font_size=20, node_border=True, prog='neato')
for G in transform_neg_1_graphs:
    display.draw_graph(G, size=40, node_size=400, font_size=20, node_border=True, prog='neato')

def make_test_graphs(fname_pos, fname_neg):
    '''Return the test graphs that complement a named training pair.

    ``fname_pos`` and ``fname_neg`` are the *names* (strings) of the
    collections used for training, e.g. ``'pos_0_graphs'`` and
    ``'neg_0_graphs'``. The test set is built from the module-level
    collections that were not named: the other positive collection
    followed by the other negative collection.

    Both names are printed, as is a line describing the chosen test set.

    Raises ValueError if the pair of names is not one of the four known
    combinations.
    '''
    print(fname_pos)
    print(fname_neg)
    if fname_pos == 'pos_0_graphs' and fname_neg == 'neg_0_graphs':
        test_graphs = pos_1_graphs + neg_1_graphs
        print('test = pos_1_graphs + neg_1_graphs')
    elif fname_pos == 'pos_0_graphs' and fname_neg == 'neg_1_graphs':
        test_graphs = pos_1_graphs + neg_0_graphs
        print('test = pos_1_graphs + neg_0_graphs')
    elif fname_pos == 'pos_1_graphs' and fname_neg == 'neg_0_graphs':
        test_graphs = pos_0_graphs + neg_1_graphs
        # The message now names the collections that are actually combined.
        print('test = pos_0_graphs + neg_1_graphs')
    elif fname_pos == 'pos_1_graphs' and fname_neg == 'neg_1_graphs':
        test_graphs = pos_0_graphs + neg_0_graphs
        print('test = pos_0_graphs + neg_0_graphs')
    else:
        # Fail loudly instead of falling through with nothing selected.
        raise ValueError(
            'unknown training pair: %r, %r' % (fname_pos, fname_neg))
    # The original returned the undefined name `test`, raising NameError.
    return test_graphs