Classification

1 Conversion

load a target file


In [1]:
import requests
import numpy as np
target_path = open('/home/alsheikm/GitDir/EeDN_work/StoExamples/output/data.target', 'r')
for line in target_path:
    targets = line.split('\n')
    y = np.array(int(line.strip())) #for target in targets if target])

load data and convert it to graphs


In [2]:
from eden.converter.graph.gspan import gspan_to_eden
graphs = gspan_to_eden('http://www.bioinf.uni-freiburg.de/~costa/bursi.gspan')

2 Vectorization

setup the vectorizer


In [3]:
from eden.graph import Vectorizer
vectorizer = Vectorizer(complexity=3)

extract features and build data matrix


In [4]:
%%time
X = vectorizer.transform(graphs)
print 'Instances: %d Features: %d with an avg of %d features per instance' % (X.shape[0], X.shape[1],  X.getnnz()/X.shape[0])


Instances: 4337 Features: 1048577 with an avg of 180 features per instance
CPU times: user 19.8 s, sys: 96 ms, total: 19.9 s
Wall time: 33.7 s

3 Modelling

induce a predictor and evaluate its performance


In [5]:
%%time
#induce a predictive model
from sklearn.linear_model import SGDClassifier
predictor = SGDClassifier(average=True, class_weight='balanced', shuffle=True, n_jobs=-1)

from sklearn import cross_validation
scores = cross_validation.cross_val_score(predictor, X, y, cv=10, scoring='roc_auc')

import numpy as np
print('AUC ROC: %.4f +- %.4f' % (np.mean(scores),np.std(scores)))


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-5-2bcfcbef9be3> in <module>()
----> 1 get_ipython().run_cell_magic(u'time', u'', u"#induce a predictive model\nfrom sklearn.linear_model import SGDClassifier\npredictor = SGDClassifier(average=True, class_weight='balanced', shuffle=True, n_jobs=-1)\n\nfrom sklearn import cross_validation\nscores = cross_validation.cross_val_score(predictor, X, y, cv=10, scoring='roc_auc')\n\nimport numpy as np\nprint('AUC ROC: %.4f +- %.4f' % (np.mean(scores),np.std(scores)))")

/home/alsheikm/miniconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_cell_magic(self, magic_name, line, cell)
   2291             magic_arg_s = self.var_expand(line, stack_depth)
   2292             with self.builtin_trap:
-> 2293                 result = fn(magic_arg_s, cell)
   2294             return result
   2295 

/home/alsheikm/miniconda2/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)

/home/alsheikm/miniconda2/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/home/alsheikm/miniconda2/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1165         else:
   1166             st = clock2()
-> 1167             exec(code, glob, local_ns)
   1168             end = clock2()
   1169             out = None

<timed exec> in <module>()

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/cross_validation.pyc in cross_val_score(estimator, X, y, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
   1420         Array of scores of the estimator for each run of the cross validation.
   1421     """
-> 1422     X, y = indexable(X, y)
   1423 
   1424     cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/utils/validation.pyc in indexable(*iterables)
    199         else:
    200             result.append(np.array(X))
--> 201     check_consistent_length(*result)
    202     return result
    203 

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_consistent_length(*arrays)
    171     """
    172 
--> 173     uniques = np.unique([_num_samples(X) for X in arrays if X is not None])
    174     if len(uniques) > 1:
    175         raise ValueError("Found arrays with inconsistent numbers of samples: "

/home/alsheikm/.local/lib/python2.7/site-packages/sklearn/utils/validation.pyc in _num_samples(x)
    120         if len(x.shape) == 0:
    121             raise TypeError("Singleton array %r cannot be considered"
--> 122                             " a valid collection." % x)
    123         return x.shape[0]
    124     else:

TypeError: Singleton array array(1) cannot be considered a valid collection.

In [ ]: