In [1]:
%pylab inline
In [2]:
from eden import graph
from eden.util import eden_io
from eden.converter.graph import gspan
In [3]:
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import RandomizedSearchCV
from sklearn import cross_validation
from scipy.stats import randint
from scipy.stats import uniform
import numpy as np
from scipy import stats
import time
In [4]:
# Remote inputs for the whole notebook.
# input_data_url is read later through gspan.gspan_to_eden(input_data_url, 'url');
# input_target_url is read through eden_io.load_target(input_target_url).
input_data_url='http://www.bioinf.uni-freiburg.de/~costa/bursi.gspan'
input_target_url='http://www.bioinf.uni-freiburg.de/~costa/bursi.target'
In [5]:
# Fetch the target values and report how many there are and how many
# distinct class labels they contain.
y = eden_io.load_target(input_target_url)
n_targets = y.shape[0]
print('Target size:%d' % n_targets)
distinct_labels = set(y)
print('Target classes:%d' % len(distinct_labels))
In [6]:
%%time
#quick parameter analysis
clf = SGDClassifier()
results = []
for max_radius in range(2,8):
for max_distance in [0,int(max_radius/2),max_radius, max_radius*2]:
t0 = time.clock()
vec=graph.Vectorizer(r=max_radius,d=max_distance)
g_it=gspan.gspan_to_eden(input_data_url, 'url')
X=vec.transform(g_it, n_jobs=-1)
scores = cross_validation.cross_val_score(clf, X, y,cv=10, scoring='roc_auc', n_jobs=-1)
#results
perf=np.mean(scores)
std=np.std(scores)
dt=time.clock() - t0
err=1/(1-perf)
result={'perf':perf, 'std':std, 'dt':dt, 'err':err, 'r':max_radius, 'd':max_distance}
results.append(result)
print('r=%d d=%d AUCROC=%.4f (+- %.4f) runtime=%.1f sec' % (max_radius, max_distance, perf,std,dt))
In [7]:
# Plot: one point per (r, d) configuration in `results`, runtime on the x axis
# and the stored 'err' value on the y axis, each point annotated with its
# parameters and mean AUC.
plt.figure(figsize=(10, 10))
plt.grid(True)
for result in results:
    label = 'r:%d d:%d \np:%.3f' % (result['r'], result['d'], result['perf'])
    x2 = result['dt']
    y2 = result['err']
    plt.annotate(label, xy=(x2, y2), xytext=(-20, -25), textcoords='offset points')
plt.scatter([result['dt'] for result in results], [result['err'] for result in results])
# Axis labels so the figure can be read on its own. 'err' holds 1/(1-AUC),
# so points higher up are better, not worse.
plt.xlabel('runtime (sec)')
plt.ylabel('1 / (1 - mean ROC AUC)')
plt.show()
In [8]:
# Optimal parameter analysis: for every (radius, distance) pair, tune the
# SGDClassifier hyper-parameters with a randomized search (3-fold CV, ROC AUC),
# then score the best setting with 10-fold cross-validated ROC AUC.
clf = SGDClassifier()
# scipy.stats.uniform(loc, scale) samples from [loc, loc + scale] (scale
# defaults to 1); randint(low, high) samples the integers low .. high - 1.
param_dist = {"n_iter": randint(5, 100),
              "power_t": uniform(0.1),          # [0.1, 1.1]
              "alpha": uniform(1e-08, 1e-03),   # [1e-08, 1e-08 + 1e-03]
              "eta0": uniform(1e-03, 10),       # [1e-03, 10.001]
              "penalty": ["l1", "l2", "elasticnet"],
              "learning_rate": ["invscaling", "constant", "optimal"]}
# Number of parameter settings sampled per search; loop-invariant, so set once.
n_iter_search = 20
results = []
for max_radius in range(2, 8):
    for max_distance in [0, max_radius // 2, max_radius, max_radius * 2]:
        # Wall-clock timer. time.clock() reports the CPU time of the calling
        # process on Unix, which leaves out time spent in other processes and
        # waiting on the download; it was also removed in Python 3.8.
        t0 = time.time()
        # feature creation
        vec = graph.Vectorizer(r=max_radius, d=max_distance)
        g_it = gspan.gspan_to_eden(input_data_url, 'url')
        X = vec.transform(g_it, n_jobs=-1)
        # parameter optimisation
        random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search,
                                           cv=3, scoring='roc_auc', n_jobs=-1)
        random_search.fit(X, y)
        optclf = SGDClassifier(**random_search.best_params_)
        # NOTE(review): these scores use the same X, y the search was fitted on,
        # so they may be optimistic.
        scores = cross_validation.cross_val_score(optclf, X, y, cv=10, scoring='roc_auc')
        # performance results
        dt = time.time() - t0
        perf = np.mean(scores)
        std = np.std(scores)
        # Despite its name, 'err' is the reciprocal of (1 - AUC): larger is better.
        err = 1 / (1 - perf)
        result = {'perf': perf, 'std': std, 'dt': dt, 'err': err, 'r': max_radius, 'd': max_distance}
        results.append(result)
        print('r=%d d=%d AUCROC=%.4f (+- %.4f) runtime=%.1f sec' % (max_radius, max_distance, perf, std, dt))
In [9]:
# Plot: one point per (r, d) configuration in `results`, runtime on the x axis
# and the stored 'err' value on the y axis, each point annotated with its
# parameters and mean AUC.
plt.figure(figsize=(10, 10))
plt.grid(True)
for result in results:
    label = 'r:%d d:%d \np:%.3f' % (result['r'], result['d'], result['perf'])
    x2 = result['dt']
    y2 = result['err']
    plt.annotate(label, xy=(x2, y2), xytext=(-20, -25), textcoords='offset points')
plt.scatter([result['dt'] for result in results], [result['err'] for result in results])
# Axis labels so the figure can be read on its own. 'err' holds 1/(1-AUC),
# so points higher up are better, not worse.
plt.xlabel('runtime (sec)')
plt.ylabel('1 / (1 - mean ROC AUC)')
plt.show()
In [10]:
# Optimal parameter analysis with a larger search budget (50 sampled settings
# per configuration): for every (radius, distance) pair, tune the SGDClassifier
# hyper-parameters with a randomized search (3-fold CV, ROC AUC), then score
# the best setting with 10-fold cross-validated ROC AUC.
clf = SGDClassifier()
# scipy.stats.uniform(loc, scale) samples from [loc, loc + scale] (scale
# defaults to 1); randint(low, high) samples the integers low .. high - 1.
param_dist = {"n_iter": randint(5, 100),
              "power_t": uniform(0.1),          # [0.1, 1.1]
              "alpha": uniform(1e-08, 1e-03),   # [1e-08, 1e-08 + 1e-03]
              "eta0": uniform(1e-03, 10),       # [1e-03, 10.001]
              "penalty": ["l1", "l2", "elasticnet"],
              "learning_rate": ["invscaling", "constant", "optimal"]}
# Number of parameter settings sampled per search; loop-invariant, so set once.
n_iter_search = 50
results = []
for max_radius in range(2, 8):
    for max_distance in [0, max_radius // 2, max_radius, max_radius * 2]:
        # Wall-clock timer. time.clock() reports the CPU time of the calling
        # process on Unix, which leaves out time spent in other processes and
        # waiting on the download; it was also removed in Python 3.8.
        t0 = time.time()
        # feature creation
        vec = graph.Vectorizer(r=max_radius, d=max_distance)
        g_it = gspan.gspan_to_eden(input_data_url, 'url')
        X = vec.transform(g_it, n_jobs=-1)
        # parameter optimisation
        random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search,
                                           cv=3, scoring='roc_auc', n_jobs=-1)
        random_search.fit(X, y)
        optclf = SGDClassifier(**random_search.best_params_)
        # NOTE(review): these scores use the same X, y the search was fitted on,
        # so they may be optimistic.
        scores = cross_validation.cross_val_score(optclf, X, y, cv=10, scoring='roc_auc')
        # performance results
        dt = time.time() - t0
        perf = np.mean(scores)
        std = np.std(scores)
        # Despite its name, 'err' is the reciprocal of (1 - AUC): larger is better.
        err = 1 / (1 - perf)
        result = {'perf': perf, 'std': std, 'dt': dt, 'err': err, 'r': max_radius, 'd': max_distance}
        results.append(result)
        print('r=%d d=%d AUCROC=%.4f (+- %.4f) runtime=%.1f sec' % (max_radius, max_distance, perf, std, dt))
In [11]:
# Plot: one point per (r, d) configuration in `results`, runtime on the x axis
# and the stored 'err' value on the y axis, each point annotated with its
# parameters and mean AUC.
plt.figure(figsize=(10, 10))
plt.grid(True)
for result in results:
    label = 'r:%d d:%d \np:%.3f' % (result['r'], result['d'], result['perf'])
    x2 = result['dt']
    y2 = result['err']
    plt.annotate(label, xy=(x2, y2), xytext=(-20, -25), textcoords='offset points')
plt.scatter([result['dt'] for result in results], [result['err'] for result in results])
# Axis labels so the figure can be read on its own. 'err' holds 1/(1-AUC),
# so points higher up are better, not worse.
plt.xlabel('runtime (sec)')
plt.ylabel('1 / (1 - mean ROC AUC)')
plt.show()