In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
from eden import graph
from eden.util import eden_io
from eden.converter.graph import gspan

In [3]:
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import RandomizedSearchCV
from sklearn import cross_validation

from scipy.stats import randint
from scipy.stats import uniform

import numpy as np
from scipy import stats

import time

In [4]:
input_data_url='http://www.bioinf.uni-freiburg.de/~costa/bursi.gspan'
input_target_url='http://www.bioinf.uni-freiburg.de/~costa/bursi.target'

In [5]:
y=eden_io.load_target(input_target_url)
print('Target size:%d' % y.shape[0])
print('Target classes:%d' % len(set(y)))


Target size:4337
Target classes:2

In [6]:
%%time
#quick parameter analysis
clf = SGDClassifier()
results = []
for max_radius in range(2,8):
    for max_distance in [0,int(max_radius/2),max_radius, max_radius*2]:
        t0 = time.clock()
 
        vec=graph.Vectorizer(r=max_radius,d=max_distance)
        g_it=gspan.gspan_to_eden(input_data_url, 'url')
        X=vec.transform(g_it, n_jobs=-1)

        scores = cross_validation.cross_val_score(clf, X, y,cv=10, scoring='roc_auc', n_jobs=-1)
 
        #results 
        perf=np.mean(scores)
        std=np.std(scores)
        dt=time.clock() - t0
        err=1/(1-perf)
        result={'perf':perf, 'std':std, 'dt':dt, 'err':err, 'r':max_radius, 'd':max_distance}
        results.append(result)
        print('r=%d d=%d AUCROC=%.4f (+- %.4f) runtime=%.1f sec' % (max_radius, max_distance, perf,std,dt))


r=2 d=0 AUCROC=0.8834 (+- 0.0139) runtime=14.1 sec
r=2 d=1 AUCROC=0.8931 (+- 0.0135) runtime=15.9 sec
r=2 d=2 AUCROC=0.8972 (+- 0.0137) runtime=17.8 sec
r=2 d=4 AUCROC=0.8992 (+- 0.0157) runtime=23.0 sec
r=3 d=0 AUCROC=0.9000 (+- 0.0150) runtime=14.8 sec
r=3 d=1 AUCROC=0.9014 (+- 0.0155) runtime=17.4 sec
r=3 d=3 AUCROC=0.9050 (+- 0.0160) runtime=22.9 sec
r=3 d=6 AUCROC=0.9059 (+- 0.0162) runtime=28.1 sec
r=4 d=0 AUCROC=0.9027 (+- 0.0173) runtime=16.1 sec
r=4 d=2 AUCROC=0.9051 (+- 0.0150) runtime=22.5 sec
r=4 d=4 AUCROC=0.9085 (+- 0.0134) runtime=28.1 sec
r=4 d=8 AUCROC=0.9089 (+- 0.0150) runtime=33.2 sec
r=5 d=0 AUCROC=0.9063 (+- 0.0129) runtime=16.5 sec
r=5 d=2 AUCROC=0.9062 (+- 0.0141) runtime=25.0 sec
r=5 d=5 AUCROC=0.9092 (+- 0.0128) runtime=32.9 sec
r=5 d=10 AUCROC=0.9087 (+- 0.0126) runtime=37.8 sec
r=6 d=0 AUCROC=0.9058 (+- 0.0126) runtime=18.0 sec
r=6 d=3 AUCROC=0.9080 (+- 0.0139) runtime=29.3 sec
r=6 d=6 AUCROC=0.9088 (+- 0.0122) runtime=38.4 sec
r=6 d=12 AUCROC=0.9073 (+- 0.0123) runtime=42.9 sec
r=7 d=0 AUCROC=0.9058 (+- 0.0135) runtime=18.3 sec
r=7 d=3 AUCROC=0.9094 (+- 0.0125) runtime=31.3 sec
r=7 d=7 AUCROC=0.9092 (+- 0.0133) runtime=44.3 sec
r=7 d=14 AUCROC=0.9070 (+- 0.0134) runtime=48.9 sec
CPU times: user 9min 18s, sys: 1min 19s, total: 10min 37s
Wall time: 16min 49s

In [7]:
#plot
plt.figure(figsize=(10,10))
plt.grid(True)
for result in results:
    label='r:%d d:%d \np:%.3f'%(result['r'],result['d'],result['perf'])
    x2=result['dt']
    y2=result['err']
    plt.annotate(label,xy = (x2, y2), xytext = (-20, -25), textcoords = 'offset points')   
plt.scatter([result['dt'] for result in results],[result['err'] for result in results])
plt.show()



In [8]:
#optimal parameter analysis
clf = SGDClassifier()
param_dist = {"n_iter": randint(5, 100),
              "power_t": uniform(0.1),
              "alpha": uniform(1e-08,1e-03),
              "eta0" : uniform(1e-03,10),
              "penalty": ["l1", "l2", "elasticnet"],
              "learning_rate": ["invscaling", "constant","optimal"]}

results = []
for max_radius in range(2,8):
    for max_distance in [0,int(max_radius/2),max_radius, max_radius*2]:
        t0 = time.clock()
        
        #feature creation
        vec=graph.Vectorizer(r=max_radius,d=max_distance)
        g_it=gspan.gspan_to_eden(input_data_url,'url')
        X=vec.transform(g_it, n_jobs=-1)

 
        #parameter optimisation
        n_iter_search = 20
        random_search = RandomizedSearchCV(clf,param_distributions=param_dist,n_iter=n_iter_search,cv=3,scoring='roc_auc', n_jobs=-1)
        random_search.fit(X, y)
        optclf = SGDClassifier(**random_search.best_params_)
        scores = cross_validation.cross_val_score(optclf, X, y,cv=10, scoring='roc_auc')
        
        #performance results 
        dt=time.clock() - t0
        
        perf=np.mean(scores)
        std=np.std(scores)
        err=1/(1-perf)
        result={'perf':perf, 'std':std, 'dt':dt, 'err':err, 'r':max_radius, 'd':max_distance}
        results.append(result)
        print('r=%d d=%d AUCROC=%.4f (+- %.4f) runtime=%.1f sec' % (max_radius, max_distance, perf,std,dt))


r=2 d=0 AUCROC=0.8957 (+- 0.0144) runtime=18.1 sec
r=2 d=1 AUCROC=0.9059 (+- 0.0122) runtime=20.5 sec
r=2 d=2 AUCROC=0.9070 (+- 0.0122) runtime=36.2 sec
r=2 d=4 AUCROC=0.8966 (+- 0.0117) runtime=30.4 sec
r=3 d=0 AUCROC=0.9057 (+- 0.0120) runtime=17.1 sec
r=3 d=1 AUCROC=0.9112 (+- 0.0111) runtime=23.4 sec
r=3 d=3 AUCROC=0.9135 (+- 0.0109) runtime=39.3 sec
r=3 d=6 AUCROC=0.9127 (+- 0.0117) runtime=46.0 sec
r=4 d=0 AUCROC=0.8889 (+- 0.0159) runtime=19.6 sec
r=4 d=2 AUCROC=0.9057 (+- 0.0123) runtime=38.8 sec
r=4 d=4 AUCROC=0.9112 (+- 0.0113) runtime=49.5 sec
r=4 d=8 AUCROC=0.9101 (+- 0.0125) runtime=90.6 sec
r=5 d=0 AUCROC=0.9005 (+- 0.0122) runtime=21.0 sec
r=5 d=2 AUCROC=0.9123 (+- 0.0129) runtime=49.4 sec
r=5 d=5 AUCROC=0.9134 (+- 0.0105) runtime=65.5 sec
r=5 d=10 AUCROC=0.9102 (+- 0.0144) runtime=115.8 sec
r=6 d=0 AUCROC=0.9126 (+- 0.0122) runtime=23.3 sec
r=6 d=3 AUCROC=0.9147 (+- 0.0124) runtime=69.4 sec
r=6 d=6 AUCROC=0.9097 (+- 0.0119) runtime=106.8 sec
r=6 d=12 AUCROC=0.8902 (+- 0.0133) runtime=124.8 sec
r=7 d=0 AUCROC=0.9126 (+- 0.0122) runtime=24.2 sec
r=7 d=3 AUCROC=0.9130 (+- 0.0111) runtime=74.0 sec
r=7 d=7 AUCROC=0.9067 (+- 0.0120) runtime=134.7 sec
r=7 d=14 AUCROC=0.9116 (+- 0.0119) runtime=115.1 sec

In [9]:
#plot
plt.figure(figsize=(10,10))
plt.grid(True)
for result in results:
    label='r:%d d:%d \np:%.3f'%(result['r'],result['d'],result['perf'])
    x2=result['dt']
    y2=result['err']
    plt.annotate(label,xy = (x2, y2), xytext = (-20, -25), textcoords = 'offset points')   
plt.scatter([result['dt'] for result in results],[result['err'] for result in results])
plt.show()



In [10]:
#optimal parameter analysis
clf = SGDClassifier()
param_dist = {"n_iter": randint(5, 100),
              "power_t": uniform(0.1),
              "alpha": uniform(1e-08,1e-03),
              "eta0" : uniform(1e-03,10),
              "penalty": ["l1", "l2", "elasticnet"],
              "learning_rate": ["invscaling", "constant","optimal"]}

results = []
for max_radius in range(2,8):
    for max_distance in [0,int(max_radius/2),max_radius, max_radius*2]:
        t0 = time.clock()
        
        #feature creation
        vec=graph.Vectorizer(r=max_radius,d=max_distance)
        g_it=gspan.gspan_to_eden(input_data_url, 'url')
        X=vec.transform(g_it, n_jobs=-1)

 
        #parameter optimisation
        n_iter_search = 50
        random_search = RandomizedSearchCV(clf,param_distributions=param_dist,n_iter=n_iter_search,cv=3,scoring='roc_auc', n_jobs=-1)
        random_search.fit(X, y)
        optclf = SGDClassifier(**random_search.best_params_)
        scores = cross_validation.cross_val_score(optclf, X, y,cv=10, scoring='roc_auc')
        
        #performance results 
        dt=time.clock() - t0
        
        perf=np.mean(scores)
        std=np.std(scores)
        err=1/(1-perf)
        result={'perf':perf, 'std':std, 'dt':dt, 'err':err, 'r':max_radius, 'd':max_distance}
        results.append(result)
        print('r=%d d=%d AUCROC=%.4f (+- %.4f) runtime=%.1f sec' % (max_radius, max_distance, perf,std,dt))


r=2 d=0 AUCROC=0.8876 (+- 0.0142) runtime=18.3 sec
r=2 d=1 AUCROC=0.9002 (+- 0.0122) runtime=20.7 sec
r=2 d=2 AUCROC=0.9048 (+- 0.0123) runtime=40.5 sec
r=2 d=4 AUCROC=0.9045 (+- 0.0121) runtime=52.3 sec
r=3 d=0 AUCROC=0.9096 (+- 0.0115) runtime=21.5 sec
r=3 d=1 AUCROC=0.9002 (+- 0.0126) runtime=27.2 sec
r=3 d=3 AUCROC=0.9176 (+- 0.0125) runtime=71.8 sec
r=3 d=6 AUCROC=0.9019 (+- 0.0126) runtime=87.9 sec
r=4 d=0 AUCROC=0.9151 (+- 0.0114) runtime=23.2 sec
r=4 d=2 AUCROC=0.9146 (+- 0.0118) runtime=73.7 sec
r=4 d=4 AUCROC=0.9155 (+- 0.0114) runtime=82.8 sec
r=4 d=8 AUCROC=0.9168 (+- 0.0128) runtime=131.4 sec
r=5 d=0 AUCROC=0.9125 (+- 0.0108) runtime=22.8 sec
r=5 d=2 AUCROC=0.9141 (+- 0.0124) runtime=68.9 sec
r=5 d=5 AUCROC=0.9127 (+- 0.0111) runtime=200.7 sec
r=5 d=10 AUCROC=0.9153 (+- 0.0125) runtime=144.5 sec
r=6 d=0 AUCROC=0.9090 (+- 0.0112) runtime=25.9 sec
r=6 d=3 AUCROC=0.9051 (+- 0.0122) runtime=90.4 sec
r=6 d=6 AUCROC=0.9170 (+- 0.0113) runtime=143.9 sec
r=6 d=12 AUCROC=0.9148 (+- 0.0118) runtime=179.2 sec
r=7 d=0 AUCROC=0.9097 (+- 0.0118) runtime=28.4 sec
r=7 d=3 AUCROC=0.9139 (+- 0.0113) runtime=91.7 sec
r=7 d=7 AUCROC=0.9063 (+- 0.0120) runtime=173.5 sec
r=7 d=14 AUCROC=0.9134 (+- 0.0127) runtime=201.3 sec

In [11]:
#plot
plt.figure(figsize=(10,10))
plt.grid(True)
for result in results:
    label='r:%d d:%d \np:%.3f'%(result['r'],result['d'],result['perf'])
    x2=result['dt']
    y2=result['err']
    plt.annotate(label,xy = (x2, y2), xytext = (-20, -25), textcoords = 'offset points')   
plt.scatter([result['dt'] for result in results],[result['err'] for result in results])
plt.show()