In [2]:
%matplotlib inline
import os, sys
import ntpath
from eden.util import display
# -----------------------------------------------------------------------------
In [3]:
from graph_finder import GraphFinder
In [4]:
# Relative locations of the four example classes (two positive, two negative).
pos_class_0_path, pos_class_1_path = (
    "StoExamples/StoClasses/pos_class_0",
    "StoExamples/StoClasses/pos_class_1",
)
neg_class_0_path, neg_class_1_path = (
    "StoExamples/StoClasses/neg_class_0",
    "StoExamples/StoClasses/neg_class_1",
)

# Absolute versions of the same paths, resolved against the current working directory.
(pos_class_0_abs_path,
 pos_class_1_abs_path,
 neg_class_0_abs_path,
 neg_class_1_abs_path) = map(
    os.path.abspath,
    (pos_class_0_path, pos_class_1_path, neg_class_0_path, neg_class_1_path),
)
Convert the input files to graphs.
In [5]:
# A single GraphFinder instance is shared by the rest of the notebook.
gf = GraphFinder()

# Convert every class directory to graphs, in the order pos 0, pos 1, neg 0, neg 1.
(pos_0_graphs,
 pos_1_graphs,
 neg_0_graphs,
 neg_1_graphs) = map(
    gf.convert,
    (pos_class_0_abs_path,
     pos_class_1_abs_path,
     neg_class_0_abs_path,
     neg_class_1_abs_path),
)
In [6]:
import numpy as np
def make_target(fname_pos, fname_neg):
    """Build the label vector for a positive/negative data set.

    Args:
        fname_pos: sized collection of positive instances; only ``len()`` is used.
        fname_neg: sized collection of negative instances; only ``len()`` is used.

    Returns:
        1-D integer numpy array holding ``+1`` repeated ``len(fname_pos)`` times
        followed by ``-1`` repeated ``len(fname_neg)`` times.
    """
    labels = [1] * len(fname_pos) + [-1] * len(fname_neg)
    # dtype=int keeps the result an integer array even when both inputs are
    # empty (numpy would otherwise fall back to float64 for an empty list).
    return np.asarray(labels, dtype=int)
In [7]:
from eden.graph import Vectorizer
def vectorizer(transform_train):
    """Vectorize the (transformed) training graphs into a feature matrix.

    Args:
        transform_train: the graphs handed to ``Vectorizer.transform``.

    Returns:
        Whatever ``Vectorizer(complexity=3).transform`` returns. The result must
        expose ``shape`` and ``getnnz()`` (presumably a scipy sparse matrix --
        TODO confirm against the EDeN documentation).
    """
    print('vectorizer')
    # No `%%time` here: a cell magic is only valid as the first line of a cell,
    # never inside a function body. To time the work, use
    # `%time X = vectorizer(...)` at the call site.
    # The local is deliberately not called `vectorizer`, to avoid shadowing
    # this function's own name.
    graph_vectorizer = Vectorizer(complexity=3)
    # Extract features and build the data matrix.
    X = graph_vectorizer.transform(transform_train)
    print(('X', X))
    n_instances = X.shape[0]
    n_features = X.shape[1]
    # Guard against an empty data set instead of raising ZeroDivisionError.
    avg_features = X.getnnz() // n_instances if n_instances else 0
    print('Instances: %d Features: %d with an avg of %d features per instance' % (n_instances, n_features, avg_features))
    return X
In [8]:
def make_data(fname_pos, fname_neg):
    """Build the training matrix and label vector from positive/negative graphs.

    Each class is transformed exactly once with the notebook-level ``gf``
    (``use_seq=True, use_cov=True``). The transformed positives followed by the
    transformed negatives are then vectorized, and the labels are derived from
    those same two transformed collections, so the rows of ``X`` and the entries
    of ``y`` line up by construction.

    Args:
        fname_pos: graphs of the positive class.
        fname_neg: graphs of the negative class.

    Returns:
        Tuple ``(X, y)``: ``X`` is the result of ``vectorizer`` on the
        transformed graphs, ``y`` the result of ``make_target``.
    """
    # list() materialises the result so it can be both measured and
    # concatenated, whatever iterable type graphs_transform returns.
    transform_fname_pos = list(gf.graphs_transform(fname_pos, use_seq=True, use_cov=True))
    print('pos')
    transform_fname_neg = list(gf.graphs_transform(fname_neg, use_seq=True, use_cov=True))
    print('neg')
    # Reuse the per-class results instead of transforming the concatenated
    # input a third time.
    # NOTE(review): assumes graphs_transform handles each graph independently
    # and preserves order -- confirm against GraphFinder.
    transform_trains = transform_fname_pos + transform_fname_neg
    X = vectorizer(transform_trains)
    y = make_target(transform_fname_pos, transform_fname_neg)
    print('Done')
    return X, y
In [9]:
# Held-out graphs: fold 1 of both classes.
test_graphs = pos_1_graphs + neg_1_graphs

# Training matrix and labels are built from fold 0 of both classes.
X, y = make_data(pos_0_graphs, neg_0_graphs)

print(('test_graphs', test_graphs))
print(X)
print(y)
In [12]:
%%time
#induce a predictive model
from sklearn.linear_model import SGDClassifier
predictor = SGDClassifier(average=True, class_weight='balanced', shuffle=True, n_jobs=-1)
from sklearn import cross_validation
scores = cross_validation.cross_val_score(predictor, X, y, cv= 3, scoring='roc_auc')
import numpy as np
print('AUC ROC: %.4f +- %.4f' % (np.mean(scores),np.std(scores)))
In [11]:
%%time
#induce a predictive model
from sklearn.linear_model import SGDClassifier
predictor = SGDClassifier(average=True, class_weight='balanced', shuffle=True, n_jobs=-1)
import numpy as np
from sklearn.metrics import roc_auc_score
score = roc_auc_score(y, X)
Draw the generated graphs.
In [ ]:
# Draw every generated graph, class by class, with identical layout settings.
for graph_group in (pos_0_graphs, pos_1_graphs, neg_0_graphs, neg_1_graphs):
    for G in graph_group:
        display.draw_graph(
            G,
            size=40,
            node_size=400,
            font_size=20,
            node_border=True,
            prog='neato',
        )
In [13]:
# Transform each class with sequence information enabled and covariance disabled.
transform_pos_0_graphs = gf.graphs_transform(
    pos_0_graphs, use_seq=True, use_cov=False)
transform_pos_1_graphs = gf.graphs_transform(
    pos_1_graphs, use_seq=True, use_cov=False)
transform_neg_0_graphs = gf.graphs_transform(
    neg_0_graphs, use_seq=True, use_cov=False)
transform_neg_1_graphs = gf.graphs_transform(
    neg_1_graphs, use_seq=True, use_cov=False)
Draw the transformed graphs.
In [ ]:
# Draw the transformed graphs: iterate over the transform_* collections, not
# the raw *_graphs lists, which would only repeat the untransformed drawings.
for transformed_group in (transform_pos_0_graphs, transform_pos_1_graphs,
                          transform_neg_0_graphs, transform_neg_1_graphs):
    for G in transformed_group:
        display.draw_graph(G, size=40, node_size=400, font_size=20, node_border=True, prog='neato')
In [ ]:
def make_test_graphs(fname_pos, fname_neg):
    """Return the held-out graphs matching a choice of training folds.

    The training folds are identified by NAME (strings). For each class the
    test set uses the fold that was not chosen for training.

    Args:
        fname_pos: ``'pos_0_graphs'`` or ``'pos_1_graphs'``.
        fname_neg: ``'neg_0_graphs'`` or ``'neg_1_graphs'``.

    Returns:
        The held-out positive graphs concatenated with the held-out negative
        graphs.

    Raises:
        ValueError: if either name is not one of the values listed above.
    """
    print(fname_pos)
    print(fname_neg)
    # Training fold name -> (held-out fold name, held-out graphs).
    held_out_pos = {
        'pos_0_graphs': ('pos_1_graphs', pos_1_graphs),
        'pos_1_graphs': ('pos_0_graphs', pos_0_graphs),
    }
    held_out_neg = {
        'neg_0_graphs': ('neg_1_graphs', neg_1_graphs),
        'neg_1_graphs': ('neg_0_graphs', neg_0_graphs),
    }
    try:
        pos_name, pos_graphs = held_out_pos[fname_pos]
        neg_name, neg_graphs = held_out_neg[fname_neg]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, e.g. a list of graphs passed
        # where a fold name was expected.
        raise ValueError(
            'unknown training folds: %r, %r' % (fname_pos, fname_neg))
    test_graphs = pos_graphs + neg_graphs
    print('test = %s + %s' % (pos_name, neg_name))
    return test_graphs