The objective is to see whether a continuous vector embedding of molecular structure (here, the latent representation of a molecular VAE) can help in predicting metabolite concentrations.
In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [2]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML
from scipy import stats
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
import pandas as pd
from rdkit import Chem
from keras.models import Sequential, Model, load_model
In [3]:
import sys
sys.path.append('/Users/joewandy/git/keras-molecules')
from molecules.model import MoleculeVAE
from molecules.utils import load_dataset
from embedding import to_one_hot_array, get_input_arr, autoencode, encode
from embedding import visualize_latent_rep, get_classifyre, get_scatter_colours
In [4]:
plt.style.use('seaborn-notebook')
In [5]:
df = pd.read_csv('data_all_standards.csv')
In [6]:
df.head()
Out[6]:
In [7]:
df.shape
Out[7]:
In [8]:
def plot_graph(df, conc_label, feature_label):
    x = np.array(df[feature_label], dtype=float)
    y = np.array(df[conc_label], dtype=float)
    plt.scatter(x, y, s=10)
    # fit and draw a least-squares line, reporting Pearson's r
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    z = np.polyfit(x, y, 1)
    p = np.poly1d(z)
    plt.plot(x, p(x), "r--")
    print("r_value = " + str(r_value))
    plt.title(feature_label + ' v intensity')
    plt.xlabel(feature_label)
    plt.ylabel('Intensity (@20uM)')
    plt.text(np.min(x) + 1, np.max(y) - 1, "y=%.6fx+(%.6f)" % (z[0], z[1]))
    return y
In [9]:
# the '4.321928095' column holds the intensities at 20 uM (the name is log2(20))
y = plot_graph(df, '4.321928095', 'ASA_P')
In [10]:
X = df[['molP', 'TPSA9', 'VDWSA',
        'ASA', 'ASA+', 'ASA-',
        'ASA_H', 'ASA_P', 'MW',
        'NC', 'PC', 'HLB',
        'Sol', 'Neg', 'Pos',
        'Ref', 'logP', 'logD',
        'TPSA', 'H-donors', 'H-acceptors']].values.astype(float)
In [11]:
print(X.shape)
print(y.shape)
Perform 10-fold cross-validation on the regression model. cross_val_predict returns an array of the same size as y, where each entry is a prediction obtained by cross-validation.
In [12]:
k = 10
lr = LinearRegression()
predicted = cross_val_predict(lr, X, y, cv=k)
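For intuition, cross_val_predict with an integer cv is roughly equivalent to the manual K-fold loop below (a sketch of the idea, not scikit-learn's exact implementation): each sample is predicted by a model that never saw it during fitting.
In [ ]:
from sklearn.model_selection import KFold

# hold each fold out in turn and predict it with a model fit on the rest
manual_pred = np.empty_like(y)
for train_idx, test_idx in KFold(n_splits=k).split(X):
    fold_model = LinearRegression().fit(X[train_idx], y[train_idx])
    manual_pred[test_idx] = fold_model.predict(X[test_idx])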
In [17]:
fig, ax = plt.subplots()
ax.scatter(y, predicted, s=10)
slope, intercept, r_value, p_value, std_err = stats.linregress(y, predicted)
z = np.polyfit(y, predicted, 1)
p = np.poly1d(z)
ax.plot(y, p(y), "r--")
print("r_value = " + str(r_value))
ax.set_xlim((10, 35))
ax.set_ylim((10, 35))
# diagonal reference line: perfect predictions fall on y = x
ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".3")
ax.set_xlabel('Measured Intensity')
ax.set_ylabel('Predicted Intensity')
plt.title('Predicted vs. Measured Intensity @20uM (10-fold CV)', size=14)
Out[17]:
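To summarise the fit numerically rather than just visually, the cross-validated R² and RMSE can be computed directly from predicted (a short sketch using scikit-learn's metrics):
In [ ]:
from sklearn.metrics import mean_squared_error, r2_score

print("CV R^2  = %.3f" % r2_score(y, predicted))
print("CV RMSE = %.3f" % np.sqrt(mean_squared_error(y, predicted)))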
Load model
In [18]:
base_dir = '/Users/joewandy/git/keras-molecules/'
data_file = base_dir + 'data/smiles_500k_processed.h5'
model_file = base_dir + 'data/smiles_500k_model_292.h5'
latent_dim = 292
In [19]:
_, charset = load_dataset(data_file, split=False)
print(charset)
In [20]:
model = MoleculeVAE()
model.load(charset, model_file, latent_rep_size=latent_dim)
Get the SMILES strings of our standards and convert them to canonical SMILES strings using RDKit.
In [21]:
smiles_list = df['Smiles'].values.tolist()
smiles_list = [Chem.MolToSmiles(Chem.MolFromSmiles(x)) for x in smiles_list]
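Note that Chem.MolFromSmiles returns None for a SMILES string RDKit cannot parse, which would make the line above raise. A more defensive version (a sketch; any rows dropped here would also need to be dropped from df and y to keep everything aligned):
In [ ]:
# parse all SMILES, keep only the indices that RDKit can handle
mols = [Chem.MolFromSmiles(s) for s in df['Smiles']]
valid = [i for i, m in enumerate(mols) if m is not None]
smiles_list = [Chem.MolToSmiles(mols[i]) for i in valid]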
Pre-process the SMILES strings into the one-hot input arrays that the VAE expects.
In [22]:
input_array = get_input_arr(smiles_list, charset)
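For reference, the encoding get_input_arr is assumed to perform looks roughly like the hypothetical helper below: each SMILES becomes a (max_len, len(charset)) binary matrix with one row per character position. The space-padding and max_len=120 are assumptions matching keras-molecules' defaults, not anything verified here.
In [ ]:
def one_hot_smiles(smiles, charset, max_len=120):
    # hypothetical sketch of one-hot encoding a single SMILES string
    index = {c: i for i, c in enumerate(charset)}
    arr = np.zeros((max_len, len(charset)))
    for i, c in enumerate(smiles.ljust(max_len)):  # pad with spaces (assumed)
        arr[i, index[c]] = 1.0
    return arr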
Auto-encode a few SMILES strings as a sanity check.
In [23]:
autoencode(model, charset, input_array, N=10)
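The kind of round-trip check autoencode is assumed to perform can be sketched as follows (assuming the MoleculeVAE object exposes an autoencoder Keras model, as in keras-molecules, and that charset entries are single-character strings):
In [ ]:
# run a few inputs through the full VAE and decode by argmax over the charset
recon = model.autoencoder.predict(input_array[:3])
for inp, out in zip(input_array[:3], recon):
    s_in = ''.join(charset[i] for i in inp.argmax(axis=1)).strip()
    s_out = ''.join(charset[i] for i in out.argmax(axis=1)).strip()
    print(s_in + ' -> ' + s_out)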
Extract the latent vectors
In [24]:
X_latent = encode(model, input_array)
print(X_latent.shape)
Visualise the latent vectors
In [25]:
visualize_latent_rep(input_array, model, latent_dim)
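A quick alternative view, independent of the visualize_latent_rep helper, is to project the latent vectors onto their first two principal components (a minimal sketch):
In [ ]:
from sklearn.decomposition import PCA

# 2-D PCA projection of the 292-dimensional latent vectors
proj = PCA(n_components=2).fit_transform(X_latent)
plt.scatter(proj[:, 0], proj[:, 1], s=10)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('PCA of latent vectors')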
Concatenate the latent + chemical features for regression
In [26]:
X_new = np.concatenate((X, X_latent), axis=1)
print(X.shape)
print(X_latent.shape)
print(X_new.shape)
Make new predictions using X_new
In [27]:
predicted_new = cross_val_predict(lr, X_new, y, cv=k)
In [30]:
def make_plot(predicted, predicted_new, y):
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(14, 7))
    # left panel: chemical features only
    ax1.scatter(y, predicted, s=10)
    slope, intercept, r_value, p_value, std_err = stats.linregress(y, predicted)
    z = np.polyfit(y, predicted, 1)
    p = np.poly1d(z)
    ax1.plot(y, p(y), "r--")
    ax1.set_title('Chemical Features')
    ax1.set_ylabel('Predicted Intensity')
    ax1.set_xlabel('Measured Intensity')
    # right panel: chemical + latent embedding features
    ax2.scatter(y, predicted_new, s=10)
    slope, intercept, r_value, p_value, std_err = stats.linregress(y, predicted_new)
    z = np.polyfit(y, predicted_new, 1)
    p = np.poly1d(z)
    ax2.plot(y, p(y), "r--")
    ax2.set_title('Chemical + Embedding Features')
    ax2.set_xlabel('Measured Intensity')
    ax1.set_xlim((10, 40))
    ax2.set_xlim((10, 40))
    ax1.set_ylim((10, 40))
    # diagonal reference lines (y = x)
    ax1.plot(ax1.get_xlim(), ax1.get_ylim(), ls="--", c=".3")
    ax2.plot(ax2.get_xlim(), ax2.get_ylim(), ls="--", c=".3")
    plt.suptitle('Predicted vs. Measured Intensity @20uM (10-fold CV)', size=14)
In [31]:
make_plot(predicted, predicted_new, y)
Try ridge regression
In [32]:
reg = linear_model.Ridge()
predicted = cross_val_predict(reg, X, y, cv=k)
predicted_new = cross_val_predict(reg, X_new, y, cv=k)
make_plot(predicted, predicted_new, y)
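Ridge's default regularisation strength (alpha=1.0) is arbitrary; a sketch of letting scikit-learn pick it over a grid with RidgeCV (the alpha grid here is an assumption, not tuned):
In [ ]:
from sklearn.linear_model import RidgeCV

# select alpha by internal cross-validation on each training fold
reg_cv = RidgeCV(alphas=np.logspace(-3, 3, 13))
predicted_cv = cross_val_predict(reg_cv, X_new, y, cv=k)
make_plot(predicted, predicted_cv, y)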
Try Gaussian process regression with the following kernel (an RBF term plus a white-noise term):
In [33]:
kernel = 1.0 * RBF(length_scale=100.0, length_scale_bounds=(1e-2, 1e3)) \
    + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))
gp = GaussianProcessRegressor(kernel=kernel, alpha=0.0)
predicted = cross_val_predict(gp, X, y, cv=k)
predicted_new = cross_val_predict(gp, X_new, y, cv=k)
make_plot(predicted, predicted_new, y)
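One caveat: the RBF kernel is sensitive to feature scale, and the chemical descriptors and latent dimensions live on very different scales. A sketch that standardises the features inside the cross-validation loop via a Pipeline, so no information leaks between folds:
In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# scale each feature to zero mean / unit variance before the GP
gp_scaled = make_pipeline(StandardScaler(),
                          GaussianProcessRegressor(kernel=kernel, alpha=0.0))
predicted = cross_val_predict(gp_scaled, X, y, cv=k)
predicted_new = cross_val_predict(gp_scaled, X_new, y, cv=k)
make_plot(predicted, predicted_new, y)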