This notebook demonstrates a workflow for extracting structure–activity information from experimental data.
The data are saved in the database (DB) and can be loaded into a pandas data frame using the compound_db_utils package:
In [1]:
import os, pickle, math
from compound_db_utils import settings
from compound_db_utils.data_loaders import fetch_learning_data
SEED = 42
CV_FOLDS = 10
RELOAD = False # if True, ignore the pickled data and update from DB
def remove_inconsistent(smiles, ic50s):
    """
    Compute a consensus value for molecules that were tested
    multiple times. The consensus is the median of all
    available values.
    Molecules whose values vary too much (span more than one
    order of magnitude) while too few data points are available
    are marked as inconsistent and removed entirely.
    """
    if int(math.log10(ic50s.min())) != int(math.log10(ic50s.max())) and len(ic50s) < 20:
        return False
    else:
        return ic50s.median()
data = None
if os.path.exists('data.pickle') and not RELOAD:
    data = pickle.load(open('data.pickle', 'rb'))
else:
    data = fetch_learning_data(
        ['MI-T-15c25ba4']
        , col_names_map={
            settings.COMPOUNDS_TABLE + '_smiles' : 'smiles'
            , settings.BIOACTIVITIES_TABLE + '_value' : 'ic50'
        }
        , create_rdkit_mols=False
        , compute_descriptors=True
        , duplicates_handler=remove_inconsistent
    )
    pickle.dump(data, open('data.pickle', 'wb'))
For each compound, the loaded data provide the canonical SMILES string and the measured IC50 value. We also chose to compute molecular descriptors for the compounds; these are stored in the remaining columns (the descriptors are computed with the RDKit cheminformatics toolkit).
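The descriptor computation itself happens inside fetch_learning_data, so it is not shown here. As a rough, illustrative sketch of what computing RDKit descriptors for a single molecule looks like (the exact descriptor set used by compound_db_utils is internal to that package):
In [ ]:
# illustrative only: compute the standard RDKit descriptors for one molecule
from rdkit import Chem
from rdkit.Chem import Descriptors
mol = Chem.MolFromSmiles('CCO')  # ethanol as a toy input
descriptors = {name: func(mol) for name, func in Descriptors.descList}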
Note that the original dataset may contain duplicate entries, because a given compound was often measured in more than one experiment. Here, we decided to remove compounds whose IC50 values show large inconsistencies when we do not have enough data points to safely aggregate the values.
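To see what the duplicates handler does in practice, consider two hypothetical sets of repeated measurements (the SMILES argument is irrelevant here, so we pass None):
In [ ]:
import pandas as pd
# consistent replicates (same order of magnitude) -> the median is the consensus
print(remove_inconsistent(None, pd.Series([120.0, 150.0, 135.0])))  # 135.0
# inconsistent replicates (spanning orders of magnitude, too few points) -> dropped
print(remove_inconsistent(None, pd.Series([12.0, 4500.0])))  # False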
Next, we convert the IC50 values to pIC50, defined as the negative base-10 logarithm of the IC50 concentration expressed in moles. Since our database stores the values in nanomoles by default, we first divide by 10^9:
In [2]:
import numpy as np
data.ic50 = data.ic50.apply(lambda x : -1.0 * np.log10(x / 1.0e9))
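As a quick sanity check on a single hypothetical value: an IC50 of 100 nM corresponds to a pIC50 of exactly 7:
In [ ]:
assert abs(-np.log10(100.0 / 1.0e9) - 7.0) < 1e-12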
Because of the logarithmic scale, the converted data are much easier to work with (note that a higher pIC50 value means a more potent compound, whereas for the raw IC50 a lower value means higher potency):
In [3]:
%matplotlib inline
print('Original:')
print(data.ic50.apply(lambda x : np.power(10, - x) * 1.0e9).describe())
print('\nConverted:')
print(data.ic50.describe())
data.ic50.hist()
Out[3]:
In [4]:
import h2o
from h2o.frame import H2OFrame
def create_frame(data_frame, name):
    frame = H2OFrame.from_python(
        python_obj=data_frame
        , destination_frame=name
        , column_names=data_frame.columns.get_values().tolist()
        , header=1
    )
    return frame
In [ ]:
# from sklearn.cross_validation import train_test_split
# h2o.init()
# h2o.h2o.remove_all()
# # data structures used in scikit-learn
# data_train, data_test, pIC50_train, pIC50_test = train_test_split(
# data
# , data.ic50
# , test_size=0.4
# , random_state=SEED
# )
# data_train.head()
# # data structures used in h2o
# training_frame = create_frame(data_train, "training_frame")
# testing_frame = create_frame(data_test, "testing_frame")
# data_train.set_index('smiles', inplace=True)
# data_test.set_index('smiles', inplace=True)
# print(training_frame.shape, testing_frame.shape)
# print(data_train.shape, data_test.shape)
In [5]:
h2o.init()
h2o.h2o.remove_all()
# data structures used in h2o
frame = create_frame(data, 'frame')
training_frame, testing_frame = frame.split_frame(
    ratios=[0.6]
    , destination_frames=["data_all_train", "data_all_test"]
    , seed=SEED
)
# data structures used in scikit-learn
data_train = training_frame.as_data_frame()
data_test = testing_frame.as_data_frame()
data_train.set_index('smiles', inplace=True)
data_test.set_index('smiles', inplace=True)
print(training_frame.shape, testing_frame.shape)
print(data_train.shape, data_test.shape)
In this part of the notebook, we attempt to train an SVR estimator on the data using scikit-learn.
Before we attempt to build the model, we need to remove some variables that might have an adverse effect on the outcome. We will also remove features that carry no information at all and are therefore irrelevant.
We also define a couple of helper functions that will update our dataset as we go:
In [6]:
def get_removed_feats(data, model):
    """
    Find the names of the descriptors
    removed by the model.
    """
    return data.columns.values[1:][~model.get_support()]

def update_data(data, removed_descriptors, inplace=False):
    """
    Removes the descriptors marked for removal from the data frame.
    """
    if inplace:
        data.drop(removed_descriptors, axis=1, inplace=True)
        print(data.shape)
        return data
    else:
        new_data = data.drop(removed_descriptors, axis=1, inplace=False)
        print(new_data.shape)
        return new_data
In [7]:
from sklearn.feature_selection import VarianceThreshold
# find the names of the columns with zero variance
var_sel = VarianceThreshold()
var_sel.fit(data_train.iloc[:,1:])
removed_descriptors = get_removed_feats(data_train, var_sel)
# update the data frame
data_train = update_data(data_train, removed_descriptors)
data_test = update_data(data_test, removed_descriptors)
In [8]:
correlated_descs = dict()

def find_correlated(data):
    """
    Find descriptors that are strongly correlated with another
    descriptor (|Spearman| > 0.9); only the first member of each
    correlated group (in column order) is kept.
    """
    correlation_matrix = data.corr(method='spearman')
    removed_descs = set()
    all_descs = correlation_matrix.columns.values
    for label in all_descs:
        correlations_abs = correlation_matrix[label].abs()
        mask = (correlations_abs > 0.9).values
        correlated_names = all_descs[mask]
        correlated_descs[label] = set(correlated_names)
        correlated_descs[label].remove(label)
        if label not in removed_descs:
            removed_descs.update(correlated_descs[label])
    return removed_descs
removed_corr = find_correlated(data_train.iloc[:,1:])
data_train = update_data(data_train, removed_corr)
data_test = update_data(data_test, removed_corr)
pickle.dump(correlated_descs, open('correlated_descs_svr.pickle', 'wb'))
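The behaviour of find_correlated is easiest to see on a toy frame with made-up columns; only the first member of each correlated group (in column order) survives:
In [ ]:
import pandas as pd
toy = pd.DataFrame({
    'a': [1, 2, 3, 4, 5]
    , 'b': [2, 4, 6, 8, 10]  # monotone in 'a' -> Spearman correlation of 1.0
    , 'c': [5, 3, 4, 1, 2]
})
# note: this also records the toy groups in the global correlated_descs dict
print(find_correlated(toy))  # {'b'}: 'b' is marked for removal, 'a' is kept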
In [9]:
final_desc_set = data_train.iloc[:,1:].columns.tolist()
pickle.dump(final_desc_set, open('final_desc_set_svr.pickle', 'wb'))
In [10]:
from sklearn.preprocessing import StandardScaler
from pandas import DataFrame, concat
def scale_df(frame, scaler):
    return DataFrame(scaler.transform(frame), columns=frame.columns)

scaler = StandardScaler(copy=False)
# note: the scaler is fit on the union of the training and test descriptors
scaler.fit(concat( (data_train.iloc[:,1:], data_test.iloc[:,1:]) ))
descriptors_train = scale_df(data_train.iloc[:,1:], scaler)
descriptors_train.index = data_train.index
descriptors_test = scale_df(data_test.iloc[:,1:], scaler)
descriptors_test.index = data_test.index
print(descriptors_train.shape, descriptors_test.shape)
descriptors_train.head()
Out[10]:
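Whenever the model is later applied to unseen compounds, the same fitted scaler and the persisted descriptor subset have to be reused. A minimal sketch, where new_data is a hypothetical frame holding freshly computed descriptors for new compounds:
In [ ]:
# hypothetical: new_data holds the same descriptor columns for unseen compounds
# new_descriptors = scale_df(new_data[final_desc_set], scaler)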
Next, we run a grid search to optimize the hyperparameters of the model and select the best estimator for further evaluation:
In [13]:
from sklearn.svm import SVR
from sklearn.grid_search import GridSearchCV
param_grid = [
    {
        'C': [1, 10, 100, 1000]
        , 'epsilon': [0.0, 0.1, 0.2, 0.3, 0.4]
        , 'gamma': [1.0, 0.1, 0.01, 0.001]
        , 'kernel': ['rbf']
    },
]
gs = GridSearchCV(SVR(), param_grid, n_jobs=2, cv=CV_FOLDS)
gs.fit(descriptors_train, data_train.ic50)
model = SVR(**gs.best_estimator_.get_params())
gs.best_estimator_.get_params()
Out[13]:
In [14]:
from sklearn.cross_validation import cross_val_score
import numpy as np
scores = cross_val_score(model, descriptors_train, data_train.ic50, cv=CV_FOLDS)
scores_mse = cross_val_score(model, descriptors_train, data_train.ic50, cv=CV_FOLDS, scoring='mean_squared_error')
print("Mean R^2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print("Mean R: %0.2f (+/- %0.2f)" % (np.sqrt(scores).mean(), np.sqrt(scores).std() * 2))
print("Mean MSE: %0.2f (+/- %0.2f)" % (abs(scores_mse.mean()), scores_mse.std() * 2))
In [15]:
from sklearn.metrics import mean_squared_error
model.fit(descriptors_train, data_train.ic50)
print("R^2: %0.2f" % model.score(descriptors_test, data_test.ic50))
print("R: %0.2f" % np.sqrt(model.score(descriptors_test, data_test.ic50)))
print("MSE: %0.2f" % mean_squared_error(model.predict(descriptors_test), data_test.ic50))
In [16]:
validation_predictions = DataFrame()
validation_predictions['ic50_predicted'] = model.predict(descriptors_test)
validation_predictions['ic50_true'] = data_test.ic50.values
validation_predictions.index = descriptors_test.index
pickle.dump(validation_predictions, open('validation_set_predictions_svr.pickle', 'wb'))
validation_predictions.head()
Out[16]:
In [17]:
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = [15, 15]
span = (1,12)
axes = plt.gca()
axes.set_xlim(span)
axes.set_ylim(span)
plt.plot((span[0],span[1]), (span[0],span[1]), linestyle='--')
# plt.scatter(
# pIC50_train
# , model.predict(descriptors_train)
# , c='green'
# , s=20
# )
plt.scatter(
    data_test.ic50
    , model.predict(descriptors_test)
    , c='blue'
    , s=20
)
plt.show()
In [18]:
errors = (data_test.ic50 - model.predict(descriptors_test)).abs()
weights = np.ones_like(errors) / len(errors)
plt.hist(errors, weights=weights, bins=30)
plt.show()
In [19]:
from sklearn import decomposition
n_components = 5
all_data = data.loc[:,data_train.columns]
ic50s = all_data['ic50']
del all_data['ic50']
print(all_data.shape)
pca = decomposition.PCA(n_components=n_components)
pca.fit(all_data)
pca_result = pca.transform(all_data)
eigen_values = pca.explained_variance_ratio_
plt.rcParams["figure.figsize"] = [5, 5]
plt.bar(range(pca_result.shape[1]), eigen_values)
plt.show()
In [20]:
from mpl_toolkits.mplot3d import Axes3D
from itertools import combinations
combos = list(combinations(range(n_components), 3))
plt.rcParams["figure.figsize"] = [15, 30]
fig = plt.figure(len(combos) // 2)
for idx, combo in enumerate(combos):
    ax = fig.add_subplot(len(combos) // 2, 2, idx + 1, projection='3d')
    ax.scatter(
        pca_result[:,combo[0]]
        , pca_result[:,combo[1]]
        , pca_result[:,combo[2]]
        , c=ic50s
        , s=20
        , cmap='YlOrRd'  # red marks the compounds with higher pIC50 values
    )
    ax.view_init(elev=30, azim=45)
    ax.set_xlabel('PC%s' % (combo[0] + 1))
    ax.set_ylabel('PC%s' % (combo[1] + 1))
    ax.set_zlabel('PC%s' % (combo[2] + 1))
plt.show()
In this part of the notebook, we build a QSAR model using a random forest estimator from the h2o package, reusing the same training and testing sets as in the SVR approach above:
In [ ]:
# from h2o.estimators.random_forest import H2ORandomForestEstimator as RFE
# model = RFE(
# model_id = "random_forest_model"
# # , mtries=-1
# # , sample_rate=0.666
# # , sample_rate_per_class=[]
# # , col_sample_rate_per_tree=1
# # , col_sample_rate_change_per_level=1
# # , build_tree_one_node=False
# # , ntrees=50
# # , max_depth=20
# # , min_rows=1
# # , nbins=20
# # , nbins_cats=1024
# # , binomial_double_trees=False
# # , balance_classes=False
# , seed=SEED
# , nfolds=CV_FOLDS
# , fold_assignment="Random"
# # , keep_cross_validation_predictions=False
# # , keep_cross_validation_fold_assignment=False
# # , score_each_iteration=False
# # , score_tree_interval=0
# # , stopping_rounds=0
# # , stopping_metric="AUTO"
# # , stopping_tolerance=0.001
# # , min_split_improvement=0
# # , histogram_type="AUTO"
# )
# model.train(
# x=training_frame.names[2:]
# , y='ic50'
# , training_frame=training_frame
# )
# # model.show()
In [ ]:
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.random_forest import H2ORandomForestEstimator as RFE
parameters = {
    'ntrees' : [10, 50, 100, 150, 200]
    , 'max_depth' : [2, 4, 6, 8, 10, 20, 30]
    , 'col_sample_rate_per_tree' : [0.3, 0.6, 1.0]
    , 'sample_rate' : [0.3, 0.6, 1.0]
}
search_criteria = {
    'strategy': "RandomDiscrete"
    , 'seed': SEED
    , 'stopping_metric': "MSE"
    , 'stopping_tolerance': 0.01
    , 'stopping_rounds': 5
}
grid_search = H2OGridSearch(
    RFE(
        seed=SEED
        , nfolds=CV_FOLDS
        , fold_assignment="Random"
    )
    , grid_id='grid_search'
    , hyper_params=parameters
    , search_criteria=search_criteria
)
grid_search.train(
    x=training_frame.names[2:]
    , y='ic50'
    , training_frame=training_frame
)
In [43]:
grid_search_results = grid_search.sort_by('mse')
best_model_id = grid_search_results['Model Id'][0]
best_model = h2o.get_model(best_model_id)
model = best_model
print(grid_search_results)
print(grid_search.get_hyperparams(best_model_id))
In [30]:
importances = model.varimp(use_pandas=True)
pickle.dump(importances, open('importances_rf.pickle', 'wb'))
importances.head()
Out[30]:
In [31]:
performance = model.model_performance(xval=True)
print("R^2:", performance.r2())
print("R:", math.sqrt(performance.r2()))
print("MSE:", performance.mse())
In [32]:
performance = model.model_performance(test_data=testing_frame)
print("R^2:", performance.r2())
print("R:", math.sqrt(performance.r2()))
print("MSE:", performance.mse())
In [33]:
from pandas import DataFrame
predictions_train = model.predict(training_frame).as_data_frame()
predictions_test = model.predict(testing_frame).as_data_frame()
predictions_train.columns = ['ic50']
predictions_test.columns = ['ic50']
true_values_train = training_frame['ic50'].as_data_frame()
true_values_test = testing_frame['ic50'].as_data_frame()
validation_predictions = DataFrame()
validation_predictions['ic50_predicted'] = predictions_test['ic50']
validation_predictions['ic50_true'] = true_values_test['ic50'].values
validation_predictions.index = testing_frame['smiles'].as_data_frame()['smiles']
pickle.dump(validation_predictions, open('validation_set_predictions_rf.pickle', 'wb'))
validation_predictions.head()
Out[33]:
In [34]:
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = [15, 15]
span = (1,12)
axes = plt.gca()
axes.set_xlim(span)
axes.set_ylim(span)
axes.set_xlabel("True Values", fontsize='x-large')
axes.set_ylabel("Predicted Values", fontsize='x-large')
plt.plot((span[0],span[1]), (span[0],span[1]), linestyle='--')
# plt.scatter(
# true_values_train
# , predictions_train
# , c='green'
# , s=20
# , label='training set'
# )
plt.scatter(
    true_values_test
    , predictions_test
    , c='blue'
    , s=20
    , label='testing set'
)
plt.legend(loc='upper left', fontsize='x-large')
Out[34]:
In [35]:
errors = (true_values_test - predictions_test).abs().iloc[:,0]
weights = np.ones_like(errors) / len(errors)
plt.hist(errors, weights=weights, bins=30)
plt.show()
In [36]:
importances = model.varimp(use_pandas=True)
mask = importances["scaled_importance"] > 0.05
descs_important = data[importances["variable"][mask]]
print(descs_important.shape)
descs_important.head(10)
Out[36]:
In [37]:
from sklearn import decomposition
n_components=5
pca = decomposition.PCA(n_components=n_components)
pca.fit(descs_important)
pca_result = pca.transform(descs_important)
eigen_values = pca.explained_variance_ratio_
In [38]:
plt.rcParams["figure.figsize"] = [5, 3]
plt.bar(range(pca_result.shape[1]), eigen_values)
plt.show()
In [39]:
from mpl_toolkits.mplot3d import Axes3D
from itertools import combinations
combos = list(combinations(range(n_components), 3))
plt.rcParams["figure.figsize"] = [15, 30]
fig = plt.figure(len(combos) // 2)
for idx, combo in enumerate(combos):
    ax = fig.add_subplot(len(combos) // 2, 2, idx + 1, projection='3d')
    ax.scatter(
        pca_result[:,combo[0]]
        , pca_result[:,combo[1]]
        , pca_result[:,combo[2]]
        , c=data.ic50
        , s=20
        , cmap='YlOrRd'  # red marks the compounds with higher pIC50 values
    )
    ax.view_init(elev=30, azim=45)
    ax.set_xlabel('PC%s' % (combo[0] + 1))
    ax.set_ylabel('PC%s' % (combo[1] + 1))
    ax.set_zlabel('PC%s' % (combo[2] + 1))
plt.show()