In [1]:
import numpy as np
from eden.io.gspan import load

# Read the two gSpan files; list() materialises whatever iterable load() yields.
pos_graphs = list(load('data/bursi.pos.gspan'))
neg_graphs = list(load('data/bursi.neg.gspan'))

# Positives come first, then negatives; the label vector follows the same order.
graphs = pos_graphs + neg_graphs

# Target vector: +1 for each graph from the .pos file, -1 for each from the .neg file.
y = np.array([1] * len(pos_graphs) + [-1] * len(neg_graphs))
EDeN exports a `vectorize` function that converts a list of input graphs into a data matrix.
The output is a SciPy Compressed Sparse Row (CSR) matrix.
In [2]:
%%time
from eden.graph import vectorize
X = vectorize(graphs, complexity=2)
print 'Instances: %d Features: %d with an avg of %d features per instance' % (X.shape[0], X.shape[1], X.getnnz()/X.shape[0])
Several predictive algorithms in the scikit-learn library can process data in CSR format directly.
In [3]:
%%time
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
predictor = SGDClassifier(average=True, class_weight='balanced', shuffle=True, n_jobs=-1)
scores = cross_val_score(predictor, X, y, cv=10, scoring='roc_auc')
print('AUC ROC: %.4f +- %.4f' % (np.mean(scores),np.std(scores)))