In [26]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from IPython.core.display import HTML
HTML('<style>.container { width:95% !important; }</style><style>.output_png {display: table-cell;text-align: center;vertical-align: middle;}</style>')
Out[26]:
In [2]:
import logging
from eden.util import configure_logging
logger = logging.getLogger()
configure_logging(logger, verbosity=1)
In [3]:
assay_id = '624466' #apr88
assay_id = '1847' #apr83
assay_id = '588350' #apr86 p1k n600
assay_id = '119' #apr60 30k mols
assay_id = '588794' #apr76 p1k n1k
assay_id = '624325' #apr79
In [4]:
from toolz import curry
from eden_chem.io.pubchem import download
download_active = curry(download)(active=True)
download_inactive = curry(download)(active=False)
from eden_chem.obabel import load as babel_load
from toolz import memoize
@memoize
def get_active_graphs(assay_id):
return pipe(assay_id, download_active, babel_load, list)
@memoize
def get_inactive_graphs(assay_id):
return pipe(assay_id, download_inactive, babel_load, list)
In [5]:
from toolz import curry
from eden_chem.io.pubchem import download
download_active = curry(download)(active=True)
download_inactive = curry(download)(active=False)
from eden_chem.obabel import load as babel_load
from toolz import take
ctake_few = curry(take)(5)
from eden.display import draw_graph_set
cdraw_graph_set = curry(draw_graph_set)(n_graphs_per_line=5, size=6)
print 'Draw sample'
print 'Active'
from toolz import pipe
pipe(assay_id, download_active, babel_load, ctake_few, cdraw_graph_set)
In [6]:
def clean_label(orig_graph):
graph = orig_graph.copy()
for u in graph.nodes():
graph.node[u]['label']='_'
for e in graph.edges_iter():
graph.edge[e[0]][e[1]]['label']='_'
return graph
In [7]:
%%time
graphs = get_active_graphs(assay_id)
from toolz.curried import map
from eden.graph import auto_relabel
auto_relabel_ = curry(auto_relabel)(n_clusters=3, complexity=3)
relabled_graphs = pipe(graphs, map(clean_label), auto_relabel_)
In [8]:
pipe(relabled_graphs, ctake_few, cdraw_graph_set)
In [9]:
# map graphs to vectors
from eden.graph import vectorize
cvectorize = curry(vectorize)(complexity=4, nbits=17)
@memoize
def get_active_vecs(assay_id):
return pipe(assay_id, get_active_graphs, cvectorize)
@memoize
def get_inactive_vecs(assay_id):
return pipe(assay_id, get_inactive_graphs, cvectorize)
import numpy as np
from scipy.sparse import vstack
from sklearn.linear_model import SGDClassifier
def make_data(assay_id):
active_X = get_active_vecs(assay_id)
inactive_X = get_inactive_vecs(assay_id)
X = vstack((active_X, inactive_X))
y = np.array([1]*active_X.shape[0] + [0]*inactive_X.shape[0])
return X, y
import numpy as np
from scipy.sparse import vstack
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
def predictive_performance(estimator, data):
X, y = data
cv = ShuffleSplit(n_splits=15, test_size=0.3, random_state=34)
scoring = make_scorer(average_precision_score)
scores = cross_val_score(estimator, X, y, cv=cv, scoring=scoring)
return scores
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(average=True, class_weight='balanced', shuffle=True, n_jobs=-1)
predictive_performance_ = curry(predictive_performance)(sgd)
def output_avg_and_std(scores):
print('Acc: %.3f ±%.3f' % (np.mean(scores),np.std(scores)))
def output_stats(scores):
mean, std = np.mean(scores), np.std(scores)
median, low, high = np.percentile(scores, 50), np.percentile(scores, 25), np.percentile(scores, 75)
low10, high90 = np.percentile(scores, 10), np.percentile(scores, 90)
print ' score=%.2f ±%.2f [%.2f|%.2f|%.2f]' % (mean, std, low, median, high),
return median, low, high, low10, high90
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import classification_report
from eden.display import plot_confusion_matrices
from eden.display import plot_aucs
from sklearn.model_selection import train_test_split
def train(estimator, data):
X, y = data
return estimator.fit(X, y)
def test(estimator, X):
y_pred = estimator.predict(X)
y_score = estimator.decision_function(X)
return y_pred, y_score
def train_and_test(estimator, data):
X, y = data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
fit_estimator = train(estimator, (X_train, y_train))
y_pred, y_score = test(fit_estimator, X_test)
return y_test, y_pred, y_score
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(average=True, class_weight='balanced', shuffle=True, n_jobs=-1)
train_and_test_ = curry(train_and_test)(sgd)
def predictive_performance_report(data):
y_true, y_pred, y_score = data
print
print 'Accuracy: %.2f' % accuracy_score(y_true, y_pred)
print ' AUC ROC: %.2f' % roc_auc_score(y_true, y_score)
print ' AUC AP: %.2f' % average_precision_score(y_true, y_score)
print
print 'Classification Report:'
print classification_report(y_true, y_pred)
print
plot_confusion_matrices(y_true, y_pred, size=int(len(set(y_true))*2.5))
print
plot_aucs(y_true, y_score, size=10)
In [10]:
pipe(assay_id, make_data, train_and_test_, predictive_performance_report)
In [11]:
from eden.graph import auto_relabel
def make_data_clear_label(assay_id):
active_graphs = get_active_graphs(assay_id)
inactive_graphs = get_inactive_graphs(assay_id)
graphs = active_graphs + inactive_graphs
X = pipe(graphs, map(clean_label), cvectorize)
y = np.array([1]*len(active_graphs) + [0]*len(inactive_graphs))
return X, y
pipe(assay_id, make_data_clear_label, train_and_test_, predictive_performance_report)
In [12]:
from eden.graph import auto_relabel
def make_data_relabel(assay_id, n_clusters=8, complexity=4):
active_graphs = get_active_graphs(assay_id)
inactive_graphs = get_inactive_graphs(assay_id)
graphs = active_graphs + inactive_graphs
auto_relabel_ = curry(auto_relabel)(n_clusters=n_clusters, complexity=complexity)
X = pipe(graphs, map(clean_label), auto_relabel_, cvectorize)
y = np.array([1]*len(active_graphs) + [0]*len(inactive_graphs))
return X, y
In [13]:
make_data_relabel_ = curry(make_data_relabel)(n_clusters=2)
pipe(assay_id, make_data_relabel_, train_and_test_, predictive_performance_report)
In [14]:
make_data_relabel_ = curry(make_data_relabel)(n_clusters=30)
pipe(assay_id, make_data_relabel_, train_and_test_, predictive_performance_report)
In [21]:
from math import ceil
import numpy as np
from scipy import linalg
def lowess(x, y, f=2. / 3., iter=3):
"""lowess(x, y, f=2./3., iter=3) -> yest
Lowess smoother: Robust locally weighted regression.
The lowess function fits a nonparametric regression curve to a scatterplot.
The arrays x and y contain an equal number of elements; each pair
(x[i], y[i]) defines a data point in the scatterplot. The function returns
the estimated (smooth) values of y.
The smoothing span is given by f. A larger value for f will result in a
smoother curve. The number of robustifying iterations is given by iter. The
function will run faster with a smaller number of iterations.
"""
n = len(x)
r = int(ceil(f * n))
h = [np.sort(np.abs(x - x[i]))[r] for i in range(n)]
w = np.clip(np.abs((x[:, None] - x[None, :]) / h), 0.0, 1.0)
w = (1 - w ** 3) ** 3
yest = np.zeros(n)
delta = np.ones(n)
for iteration in range(iter):
for i in range(n):
weights = delta * w[:, i]
b = np.array([np.sum(weights * y), np.sum(weights * y * x)])
A = np.array([[np.sum(weights), np.sum(weights * x)],
[np.sum(weights * x), np.sum(weights * x * x)]])
beta = linalg.solve(A, b)
yest[i] = beta[0] + beta[1] * x[i]
residuals = y - yest
s = np.median(np.abs(residuals))
delta = np.clip(residuals / (6.0 * s), -1, 1)
delta = (1 - delta ** 2) ** 2
return yest
import pylab as plt
import numpy as np
def plot_stats(x=None,y=None, xlabel='', ylabel='', title='', label='', interpolation=False):
plt.figure(figsize=(15,6))
y = np.array(y)
y0 = y[:,0]
y1 = y[:,1]
y2 = y[:,2]
y3 = y[:,3]
y4 = y[:,4]
if interpolation:
ye = lowess(np.array(x),y0, f=0.3)
ye = y0
plt.fill_between(x, y3, y4, color='navy', alpha=0.08)
plt.fill_between(x, y1, y2, color='navy', alpha=0.08)
plt.plot(x,y0,'-', lw=2, color='navy', label=label)
if interpolation:
plt.plot(x,ye,'-', lw=3, color='r', label='lowess interpolation')
plt.plot(x,y0,
linestyle='None',
markerfacecolor='white',
markeredgecolor='navy',
marker='o',
markeredgewidth=2,
markersize=8)
plt.xticks(pipe(x,map(int),list))
#plt.ylim((.4,1))
plt.grid(linestyle=":")
plt.xlabel(xlabel)
plt.ylabel(ylabel)
plt.title(title)
plt.legend(loc='upper right')
plt.show()
In [22]:
import time
def cluster_analysis(n_cluster_list):
xs = []
ys = []
for n_clusters in n_cluster_list:
ts = time.time()
print 'num clusters=%3d' % n_clusters,
make_data_relabel_ = curry(make_data_relabel)(n_clusters=n_clusters)
y = pipe(assay_id, make_data_relabel_, predictive_performance_, output_stats)
ys.append(y)
xs.append(n_clusters)
te = time.time()
print ' %3.1f sec' % (te-ts)
plot_stats(xs,ys,
xlabel='num clusters',
ylabel='AUC APR',
title= 'Num clusters Analysis',
label= 'median+-quantiles')
In [23]:
%%time
cluster_analysis([2,3,6,8,9,10,11,12,14,16,18,20])
In [24]:
%%time
cluster_analysis([20,30,40,50,60,70,80])
In [25]:
%%time
cluster_analysis([2,4,8,16,32,64,128,256])