
1 Conversion

load a target file

In [1]:
import requests
import numpy as np
target_path = open('/home/alsheikm/GitDir/EeDN_work/StoExamples/output/', 'r')
for line in target_path:
    targets = line.split('\n')
    y = np.array(int(line.strip())) #for target in targets if target])

load data and convert it to graphs

In [2]:
from eden.converter.graph.gspan import gspan_to_eden
graphs = gspan_to_eden('')

2 Vectorization

setup the vectorizer

In [3]:
from eden.graph import Vectorizer
vectorizer = Vectorizer(complexity=3)

extract features and build data matrix

In [4]:
X = vectorizer.transform(graphs)
print 'Instances: %d Features: %d with an avg of %d features per instance' % (X.shape[0], X.shape[1],  X.getnnz()/X.shape[0])

Instances: 4337 Features: 1048577 with an avg of 180 features per instance
CPU times: user 19.8 s, sys: 96 ms, total: 19.9 s
Wall time: 33.7 s

3 Modelling

induce a predictor and evaluate its performance

In [5]:
#induce a predictive model
from sklearn.linear_model import SGDClassifier
predictor = SGDClassifier(average=True, class_weight='balanced', shuffle=True, n_jobs=-1)

from sklearn import cross_validation
scores = cross_validation.cross_val_score(predictor, X, y, cv=10, scoring='roc_auc')

import numpy as np
print('AUC ROC: %.4f +- %.4f' % (np.mean(scores),np.std(scores)))

TypeError                                 Traceback (most recent call last)
TypeError: Singleton array array(1) cannot be considered a valid collection.

