In [8]:
# Automatically re-import local modules (e.g. util/) when they are edited on disk.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook.
%matplotlib inline
In [27]:
from Bio import SeqIO
from Bio.Align import MultipleSeqAlignment
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; import from sklearn.model_selection when available and fall back only
# for old installations so the notebook keeps working in either environment.
try:
    from sklearn.model_selection import ShuffleSplit, train_test_split, cross_val_score
except ImportError:
    from sklearn.cross_validation import ShuffleSplit, train_test_split, cross_val_score
from sklearn.ensemble import (AdaBoostRegressor, ExtraTreesRegressor,
                              GradientBoostingRegressor, RandomForestRegressor)
from sklearn.metrics import r2_score, mean_squared_error
from util.isoelectric_point import isoelectric_points

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Notebook-wide plotting defaults.
sns.set_context('notebook')
sns.set_style('white')
We will train a model to predict drug resistance values from sequence.
This is the other general variant of supervised learning: instead of predicting a discrete "label" for each example (classification), we predict a continuous "value". This is called regression, as in ordinary linear regression. (Note that despite its name, logistic regression is usually used for classification.)
In [10]:
# Load the aligned protease sequences as a DataFrame: one row per sequence
# (indexed by sequence id), one column per alignment position.
# Parse the FASTA file once instead of twice; the ids come from the same records.
records = list(SeqIO.parse('data/hiv-protease-sequences-expanded.fasta', 'fasta'))
alignment = MultipleSeqAlignment(records)
sequences = pd.DataFrame(np.array(alignment), index=[r.id for r in records])
# Upper-case every letter so the isoelectric-point lookup in the next cell
# matches, and treat stop codons ('*') as missing data.
sequences = sequences.apply(lambda col: col.str.upper()).replace('*', np.nan)
sequences.head()
Out[10]:
In [11]:
# Map each residue letter to its isoelectric point. Pass the mapping itself:
# DataFrame.replace accepts a dict directly, and Python-3 dict views
# (.keys()/.values()) are not the list-likes the two-argument form expects.
seqdf = sequences.replace(isoelectric_points)
seqdf.head()
Out[11]:
In [12]:
# Read the drug-resistance table and key it by sequence id so it can be
# joined against the sequence DataFrame. Chaining avoids inplace mutation.
dr_vals = (pd.read_csv('data/hiv-protease-data-expanded.csv', index_col=0)
           .set_index('seqid'))
dr_vals.head()
Out[12]:
In [13]:
# Join the sequence feature matrix with the resistance values for one drug
# of interest.
drug_name = 'FPV'
# scikit-learn estimators cannot handle NaN, so drop incomplete rows.
data_matrix = seqdf.join(dr_vals[drug_name]).dropna()
data_matrix.head()
Out[13]:
In [18]:
# Your Answer
# Hint: to select a set of columns from a dataframe, use: dataframe[[columns]]
# Features: the 99 alignment positions (columns 0 to 98).
X = data_matrix[list(range(99))]
# Target: the measured resistance value for the chosen drug.
Y = data_matrix[drug_name]
# random_state makes the split — and hence the scores below — reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)
In [19]:
# Answer
# Train a random-forest regressor on the training split, then predict the
# held-out test split; `preds` is reused by the plotting and metric cells below.
mdl = RandomForestRegressor()
mdl.fit(X_train, Y_train)
preds = mdl.predict(X_test)
Make a plot of the predictions (y-axis) against the actual values (x-axis).
In [20]:
plt.scatter(preds, Y_test)
Out[20]:
Look through the sklearn.metrics module. What might be a suitable metric to use?
Justify the use of two of them, and write the code that computes the evaluation metric.
In [21]:
# Metric 1: the coefficient of determination (R^2).
# NOTE: r2_score is NOT symmetric — its signature is r2_score(y_true, y_pred),
# so the observed values must come first.
r2_score(Y_test, preds)
Out[21]:
In [22]:
# Metric 2: mean squared error. MSE is symmetric in its arguments, but pass
# them as (y_true, y_pred) to match the scikit-learn convention.
mean_squared_error(Y_test, preds)
Out[22]:
In [28]:
X = data_matrix[list(range(99))]
# log10-transform the resistance values to reduce the heavy right skew of the
# raw measurements.
Y = data_matrix[drug_name].apply(np.log10)
# random_state keeps this run comparable with the un-transformed one above.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)
In [29]:
# Re-fit the random forest on the log10-transformed targets; the metric cells
# below evaluate these new predictions.
mdl = RandomForestRegressor()
mdl.fit(X_train, Y_train)
preds = mdl.predict(X_test)
Note: the MSE goes down after the log10 transform. That is expected essentially by definition — the transform compresses the scale of the target values, so the squared errors shrink along with it.
In [30]:
mean_squared_error(preds, Y_test)
Out[30]:
The r-squared score goes up. Reducing skew in the target generally makes it easier for these models to fit well.
In [31]:
r2_score(preds, Y_test)
Out[31]:
In [ ]:
In [33]:
# Compare several ensemble regressors "out of the box" using 10 rounds of
# shuffle-split cross-validation with 30% of the data held out each round.
# NOTE: the old ShuffleSplit(n=..., n_iter=...) signature and the
# 'mean_squared_error' scoring string were removed along with
# sklearn.cross_validation in scikit-learn 0.20; handle both APIs so this
# cell runs regardless of the installed version.
try:
    from sklearn.model_selection import ShuffleSplit, cross_val_score
    cv = ShuffleSplit(n_splits=10, test_size=0.3)
    scoring = 'neg_mean_squared_error'
except ImportError:  # very old scikit-learn (< 0.18)
    from sklearn.cross_validation import ShuffleSplit, cross_val_score
    cv = ShuffleSplit(n=len(X), n_iter=10, test_size=0.3)
    scoring = 'mean_squared_error'

models = dict()
models['rf'] = RandomForestRegressor()
models['gb'] = GradientBoostingRegressor()
models['ad'] = AdaBoostRegressor()
models['ex'] = ExtraTreesRegressor()

scores = dict()
for abbr, model in models.items():
    print(abbr, model)
    score = cross_val_score(model, X, Y, cv=cv, scoring=scoring)
    # scikit-learn reports error metrics as negated scores ("larger is
    # better"), so flip the sign back to get positive MSE values.
    scores[abbr] = -score
In [34]:
# Reshape the per-model score arrays into long ("tidy") form for seaborn:
# one row per (model, cross-validation fold) pair.
# (A half-written, commented-out violinplot call was removed here.)
score_summary = pd.DataFrame(scores)
score_summary = pd.DataFrame(score_summary.unstack()).reset_index()
score_summary.columns = ['model', 'idx', 'error']
sns.violinplot(x='model', y='error', data=score_summary)
Out[34]:
Here, we would interpret the ExtraTrees and RandomForest regressors as the best performing, "out-of-the-box" (i.e. without further tuning).
Statistical good practices:
In [ ]: