In [41]:
# Jupyter notebook magic statements
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [42]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set_style('white')
from Bio import SeqIO
from util.isoelectric_point import * # imported from the 'util/' directory
from sklearn.model_selection import cross_val_score, ShuffleSplit, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelBinarizer
In [43]:
# List the files present in the "data" folder
! ls data/
In [44]:
# Load the protease resistance data into memory.
# [ ] TODO: Refactor this function out as a load_data() function.
resistance = pd.read_csv('data/hiv-protease-data-expanded.csv', index_col=0)
drug_cols = ['NFV', 'SQV', 'FPV', 'IDV', 'LPV', 'ATV', 'TPV', 'DRV']
resistance.set_index('seqid', inplace=True)
resistance[drug_cols] = resistance[drug_cols].apply(np.log10)
resistance.head()
Out[44]:
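A sketch of the load_data() refactor the TODO above asks for; the path, drug columns, and log10 transform are all taken from this cell, and only the function wrapper is new:
In [ ]:
def load_data(path='data/hiv-protease-data-expanded.csv'):
    """Load protease resistance data, indexed by seqid, with log10-scaled drug columns."""
    drug_cols = ['NFV', 'SQV', 'FPV', 'IDV', 'LPV', 'ATV', 'TPV', 'DRV']
    resistance = pd.read_csv(path, index_col=0).set_index('seqid')
    resistance[drug_cols] = resistance[drug_cols].apply(np.log10)
    return resistance, drug_cols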
In [45]:
# Count the number of NaNs in each of the drug resistance columns.
pd.isnull(resistance[drug_cols]).sum().sort_values()
Out[45]:
In [46]:
# Convert the FASTA file of sequences into a pandas dataframe.
# [ ] TODO: Refactor out into a load_sequences() function.
records = list(SeqIO.parse('data/hiv-protease-sequences-expanded.fasta', 'fasta'))
sequences = pd.DataFrame([list(str(r.seq)) for r in records], index=[r.id for r in records])
# Replace '*' characters with NaN, then drop the affected rows.
sequences = sequences.replace('*', np.nan).dropna()
# The data contain at least one lower-case character, so force everything to upper-case.
sequences = sequences.apply(lambda col: col.str.upper())
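Likewise, a sketch of the load_sequences() refactor suggested by the TODO (same logic as the cell above, just wrapped in a function):
In [ ]:
def load_sequences(path='data/hiv-protease-sequences-expanded.fasta'):
    """Load sequences as a one-character-per-column dataframe, dropping rows containing '*'."""
    records = list(SeqIO.parse(path, 'fasta'))
    seqs = pd.DataFrame([list(str(r.seq)) for r in records], index=[r.id for r in records])
    seqs = seqs.replace('*', np.nan).dropna()
    return seqs.apply(lambda col: col.str.upper())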
In [47]:
len(sequences)
Out[47]:
In [48]:
# Commands to inspect the utility scripts
# ! ls util/
! cat util/isoelectric_point.py
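For readers without the repository: judging from how it is used below, isoelectric_points is a dict mapping one-letter amino-acid codes to isoelectric point (pKa/pI) values. A hypothetical illustration, using standard textbook pI values for free amino acids; the actual file may differ:
In [ ]:
# Hypothetical sketch of the mapping defined in util/isoelectric_point.py --
# the real file may use different values or structure.
isoelectric_points_sketch = {
    'A': 6.00, 'R': 10.76, 'N': 5.41, 'D': 2.77, 'C': 5.07,
    # ... one entry per amino-acid letter
}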
In [49]:
# Replace the amino acid letters with their isoelectric points (pKa values).
# Note: '*' characters were already replaced with NaN and dropped above.
pKa_features = sequences.replace(isoelectric_points)
pKa_features.head()
Out[49]:
In [72]:
# Join in the resistance data for the chosen drug (LPV here).
# [ ] TODO: Refactor the data-matrix assembly below into a get_XY_matrix() function.
# Tunable parameters
X_cols = list(range(99))
drug = 'LPV'
weight_col = 'weight'
# Get the final data matrix, and split into X and Y
data_matrix = pKa_features.join(resistance[[drug, weight_col]], how='inner').dropna()
print(len(data_matrix))
X = data_matrix[X_cols]
Y = data_matrix[drug]
W = data_matrix[weight_col]
# Cross-validation splitter, plus a held-out train/test split.
cv = ShuffleSplit(n_splits=10, test_size=0.2)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
# Cross-validate a random forest regressor, scoring by mean absolute error.
mdl = RandomForestRegressor(n_estimators=100)
scores = cross_val_score(mdl, X, Y, cv=cv, scoring='neg_mean_absolute_error', n_jobs=-1)
np.abs(np.mean(scores))
Out[72]:
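A sketch of the get_XY_matrix() refactor the TODO above asks for; the function name comes from the TODO, the body is lifted from this cell, and it can be reused for the binarized features later in the notebook:
In [ ]:
def get_XY_matrix(features, resistance, drug, X_cols, weight_col='weight'):
    """Inner-join a feature matrix with one drug's resistance values and sample weights."""
    data_matrix = features.join(resistance[[drug, weight_col]], how='inner').dropna()
    return data_matrix[X_cols], data_matrix[drug], data_matrix[weight_col]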
In [73]:
np.std(scores)
Out[73]:
In [74]:
# Fit on the training set (with per-sequence sample weights) and predict on the test set.
def train_random_forest_regressor(X_train, Y_train, W, X_test, Y_test):
    mdl = RandomForestRegressor(n_estimators=100)
    mdl.fit(X_train, Y_train, sample_weight=W.loc[X_train.index].values)
    preds = mdl.predict(X_test)
    score = mdl.score(X_test, Y_test)  # R^2 on the test set
    return preds, score

preds, score = train_random_forest_regressor(X_train, Y_train, W, X_test, Y_test)
In [75]:
# Plot the predicted against actual values.
def plot_test_vs_preds(Y_test, preds, score, drug):
    fig = plt.figure(figsize=(4, 4))
    plt.scatter(Y_test, preds)
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.plot([min(Y_test), max(Y_test)], [min(Y_test), max(Y_test)], color='r', lw=3)
    plt.annotate('r2: {0:.02f}'.format(score), xy=(2, 0))
    plt.title('{0}, mse: {1:.03f}'.format(drug, mse(Y_test, preds)))

plot_test_vs_preds(Y_test, preds, score, drug)
In [76]:
# The average MSE is 0.02 and the average mean absolute error is 0.08, both in log10-transformed units.
# Back-transforming gives the mean fold-error in the original units:
# e.g. 10**0.08 ≈ 1.2, i.e. predictions are off by about 1.2-fold (20%) on average.
print('The mean fold-error is:')
np.power(10, np.abs(np.mean(scores)))
Out[76]:
In [77]:
# Use the scikit-learn LabelBinarizer to one-hot encode the amino acid letters at each position.
aa_letters = list(isoelectric_points.keys())
lb = LabelBinarizer()
lb.fit(aa_letters)
bin_features = pd.DataFrame()
X_cols = []
for col in sequences.columns:
    transformed = lb.transform(sequences[col])
    for i, letter in enumerate(lb.classes_):
        colname = '{0}_{1}'.format(col, letter)
        bin_features[colname] = transformed[:, i]
        X_cols.append(colname)
bin_features.index = sequences.index
bin_features.head()
Out[77]:
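A quick toy illustration of what LabelBinarizer does per column (not part of the pipeline):
In [ ]:
toy = LabelBinarizer()
toy.fit(['A', 'C', 'D'])
toy.transform(['A', 'D'])  # -> array([[1, 0, 0], [0, 0, 1]])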
In [78]:
# Repeat the pipeline with the binarized features.
data_matrix = bin_features.join(resistance[[drug, weight_col]], how='inner').dropna()
print(len(data_matrix))
X = data_matrix[X_cols]
Y = data_matrix[drug]
W = data_matrix[weight_col]
# Cross-validation splitter, plus a held-out train/test split.
cv = ShuffleSplit(n_splits=10, test_size=0.2)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
# Cross-validate a random forest regressor, scoring by mean absolute error.
mdl = RandomForestRegressor(n_estimators=100)
scores = cross_val_score(mdl, X, Y, cv=cv, scoring='neg_mean_absolute_error', n_jobs=-1)
np.abs(np.mean(scores))
Out[78]:
In [79]:
np.std(scores)
Out[79]:
In [80]:
# Do a fit/predict on the X and Y matrices.
preds, score = train_random_forest_regressor(X_train, Y_train, W, X_test, Y_test)
In [81]:
# Plot the predicted against actual.
plot_test_vs_preds(Y_test, preds, score, drug)
In [82]:
print('The mean fold-error is:')
np.power(10, np.abs(np.mean(scores)))
Out[82]:
In [ ]:
Binarized sequence features vs. pKa values: the two models perform roughly equally well.
However, both lack interpretability. I would like to try something like a convolutional network on graph representations of chemical structures.