The objective is to see whether continuous vector embedding can help in the prediction of concentrations of metabolites.
In [1]:
    
%matplotlib inline
%load_ext autoreload
%autoreload 2
    
In [2]:
    
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML
from scipy import stats
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
import pandas as pd
from rdkit import Chem
from keras.models import Sequential, Model, load_model
    
    
In [3]:
    
import sys
sys.path.append('/Users/joewandy/git/keras-molecules')
from molecules.model import MoleculeVAE
from molecules.utils import load_dataset
    
from embedding import to_one_hot_array, get_input_arr, autoencode, encode
from embedding import visualize_latent_rep, get_classifyre, get_scatter_colours
    
In [4]:
    
plt.style.use('seaborn-notebook')
    
In [5]:
    
df = pd.read_csv('data_all_standards.csv')
    
In [6]:
    
df.head()
    
    Out[6]:
In [7]:
    
df.shape
    
    Out[7]:
In [8]:
    
def plot_graph(df, conc_label, feature_label):
    x = np.array(df[feature_label], dtype=float)
    y = np.array(df[conc_label], dtype=float) 
    plt.scatter(x, y, s=10)
    
    slope, intercept, r_value, p_value, std_err=stats.linregress(x, y)
    z = np.polyfit(x, y, 1)
    p = np.poly1d(z)
    plt.plot(x, p(x), "r--")
    print ("r_value = " + str(r_value))
    plt.title(feature_label + ' v intensity')
    plt.xlabel(feature_label)
    plt.ylabel('Intensity (@20uM)')
    plt.text((np.min(x)+1), (np.max(y)-1), "y=%.6fx+(%.6f)"%(z[0], z[1]))
    
    return y
    
In [9]:
    
y = plot_graph(df, '4.321928095', 'ASA_P')
    
    
    
In [10]:
    
X = df[['molP', 'TPSA9', 'VDWSA', 
        'ASA', 'ASA+', 'ASA-', 
        'ASA_H', 'ASA_P', 'MW', 
        'NC', 'PC', 'HLB', 
        'Sol', 'Neg', 'Pos', 
        'Ref', 'logP', 'logD', 
        'TPSA', 'H-donors', 'H-acceptors']].values.astype(float)
    
In [11]:
    
print X.shape
print y.shape
    
    
Performs 10-folds cross-validation on the regression model.
cross_val_predict returns an array of the same size as y where each entry is a prediction obtained by cross validation
In [12]:
    
k = 10
lr = LinearRegression()
predicted = cross_val_predict(lr, X, y, cv=k)
    
In [17]:
    
fig, ax = plt.subplots()
ax.scatter(y, predicted, s=10)
slope, intercept, r_value, p_value, std_err=stats.linregress(y, predicted)
z = np.polyfit(y, predicted, 1)
p = np.poly1d(z)
ax.plot(y, p(y), "r--")
print ("r_value = " + str(r_value))
ax.set_xlim((10, 35))
ax.set_ylim((10, 35))
ax.plot(ax.get_xlim(), ax.get_ylim(), ls="--", c=".3")
ax.set_xlabel('Measured Intensity')
ax.set_ylabel('Predicted Intensity')
plt.title('Predicted vs. Measured Intensity @20uM (10-folds CV)', size=14)
    
    
    Out[17]:
    
Load model
In [18]:
    
base_dir = '/Users/joewandy/git/keras-molecules/'
data_file = base_dir + 'data/smiles_500k_processed.h5'
model_file = base_dir + 'data/smiles_500k_model_292.h5'
latent_dim = 292
    
In [19]:
    
_, charset = load_dataset(data_file, split=False)
print charset
    
    
In [20]:
    
model = MoleculeVAE()
model.load(charset, model_file, latent_rep_size=latent_dim)
    
Get the SMILES strings of our standards molecules, and convert them to canonical SMILES strings using rdkit.
In [21]:
    
smiles_list = df['Smiles'].values.tolist()
smiles_list = [Chem.MolToSmiles(Chem.MolFromSmiles(x)) for x in smiles_list]
    
Pre-process the SMILES strings.
In [22]:
    
input_array = get_input_arr(smiles_list, charset)
    
Try to auto-encode a few SMILES for sanity check.
In [23]:
    
autoencode(model, charset, input_array, N=10)
    
    
Extract the latent vectors
In [24]:
    
X_latent = encode(model, input_array)
print X_latent.shape
    
    
Visualise the latent vectors
In [25]:
    
visualize_latent_rep(input_array, model, latent_dim)
    
    
    
Concatenate the latent + chemical features for regression
In [26]:
    
X_new = np.concatenate((X, X_latent), axis=1)
print X.shape
print X_latent.shape
print X_new.shape
    
    
Make new predictions using X_new
In [27]:
    
predicted_new = cross_val_predict(lr, X_new, y, cv=k)
    
In [30]:
    
def make_plot(predicted, predicted_new, y):
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(14, 7))
    ax1.scatter(y, predicted, s=10)
    slope, intercept, r_value, p_value, std_err=stats.linregress(y, predicted)
    z = np.polyfit(y, predicted, 1)
    p = np.poly1d(z)
    ax1.plot(y, p(y), "r--")
    ax1.set_title('Chemical Features')
    ax1.set_ylabel('Predicted Intensity')
    ax1.set_xlabel('Measured Intensity')
    ax2.scatter(y, predicted_new, s=10)
    slope, intercept, r_value, p_value, std_err=stats.linregress(y, predicted_new)
    z = np.polyfit(y, predicted_new, 1)
    p = np.poly1d(z)
    ax2.plot(y, p(y), "r--")
    ax2.set_title('Chemical + Embedding Features')
    ax2.set_xlabel('Measured Intensity')
    ax1.set_xlim((10, 40))
    ax2.set_xlim((10, 40))
    ax1.set_ylim((10, 40))
    ax1.plot(ax1.get_xlim(), ax1.get_ylim(), ls="--", c=".3")
    ax2.plot(ax2.get_xlim(), ax2.get_ylim(), ls="--", c=".3")
    plt.suptitle('Predicted vs. Measured Intensity @20uM (10-folds CV)', size=14)
    
In [31]:
    
make_plot(predicted, predicted_new, y)
    
    
Try ridge regression
In [32]:
    
reg = linear_model.Ridge()
predicted = cross_val_predict(reg, X, y, cv=k)
predicted_new = cross_val_predict(reg, X_new, y, cv=k)
make_plot(predicted, predicted_new, y)
    
    
Try gaussian process regression with this kernel:
In [33]:
    
kernel = 1.0 * RBF(length_scale=100.0, length_scale_bounds=(1e-2, 1e3)) \
    + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))
    
gp = GaussianProcessRegressor(kernel=kernel, alpha=0.0)
predicted = cross_val_predict(gp, X, y, cv=k)
predicted_new = cross_val_predict(gp, X_new, y, cv=k)
make_plot(predicted, predicted_new, y)
    
    
In [ ]: