load a target file
In [1]:
import requests
import numpy as np
# Read the class labels: the file holds one integer label per line.
# Every non-blank line is collected into a 1-D array. The previous loop
# rebound `y` on each iteration, which left a 0-d array holding only the
# last label.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
target_path = '/home/alsheikm/GitDir/EeDN_work/StoExamples/output/data.target'
with open(target_path, 'r') as target_file:  # `with` closes the handle for us
    y = np.array([int(line) for line in target_file if line.strip()])
load data and convert it to graphs
In [2]:
from eden.converter.graph.gspan import gspan_to_eden

# Hand the remote gSpan file to EDeN's converter; the result is what the
# vectorizer consumes later on.
# NOTE(review): fetched over the network on every run -- presumably worth
# caching locally; confirm.
graphs = gspan_to_eden(
    'http://www.bioinf.uni-freiburg.de/~costa/bursi.gspan'
)
setup the vectorizer
In [3]:
from eden.graph import Vectorizer

# Build the object that will turn graphs into feature vectors.
# `complexity=3` is passed straight to the constructor; see the EDeN
# documentation for its exact meaning.
vectorizer = Vectorizer(
    complexity=3,
)
extract features and build data matrix
In [4]:
%%time
X = vectorizer.transform(graphs)
print 'Instances: %d Features: %d with an avg of %d features per instance' % (X.shape[0], X.shape[1], X.getnnz()/X.shape[0])
induce a predictor and evaluate its performance
In [5]:
%%time
#induce a predictive model
from sklearn.linear_model import SGDClassifier
predictor = SGDClassifier(average=True, class_weight='balanced', shuffle=True, n_jobs=-1)
from sklearn import cross_validation
scores = cross_validation.cross_val_score(predictor, X, y, cv=10, scoring='roc_auc')
import numpy as np
print('AUC ROC: %.4f +- %.4f' % (np.mean(scores),np.std(scores)))
In [ ]: