In [1]:
# Read in the data
import pandas as pd
import custom_funcs as cf
import numpy as np
from bokeh.charts import Histogram, Scatter
from bokeh import plotting as bplt
from bokeh.plotting import ColumnDataSource
from ipywidgets import widgets, interact, Dropdown
from IPython.display import display
from sklearn.preprocessing import LabelBinarizer
from sklearn import cross_validation as cv
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error as mse
from Bio import SeqIO
from Bio.Align import MultipleSeqAlignment
%load_ext autoreload
%autoreload 2
bplt.output_notebook()
By the end of this portion of the class, you should be equipped to answer questions such as these:

- If I gave you the protein sequence of an HIV protease, could you tell the degree to which the protease is resistant to a particular drug? What kind of data would you need?
- From a clinical perspective: if a new patient came in and you had to prescribe a drug combination for their infection, how would you tell which drugs to give, given the genomic data?
- From an epidemiological perspective: how would you tell whether drug resistance is shaping the evolution of the virus? What kind of data would you need to answer this question?
If you can set up your machine learning problem as "learning the relationship between features and outcomes", then you are good to go for doing machine learning.
| Feature 1 | Feature 2 | Feature 3 | Outcome 1 | Outcome 2 |
|---|---|---|---|---|
| Obs 1 | Obs 2 | Obs 3 | Val 1 | Val 2 |
| ... | ... | ... | ... | ... |
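To make this concrete, here is a minimal sketch of such a table as a pandas DataFrame, where each row is one observation. (The column names and values here are made up, purely for illustration.)

toy = pd.DataFrame({'feature_1': [0, 1, 0],
                    'feature_2': [1, 1, 0],
                    'outcome_1': [2.5, 13.1, 0.8]})
toy  # three observations: two feature columns and one outcome column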
The data are derived from the Stanford HIV Database; they were collected by measuring fold drug resistance in a standardized drug resistance assay known as the PhenoSense assay. As such, this is a high-quality, matched genotype-phenotype dataset.
I have written some custom functions in the custom_funcs.py module to aid with data loading. Run the cell below.
In [2]:
DATA_HANDLE = 'drug_data/hiv-protease-data.csv' # specify the relative path to the protease drug resistance data
N_DATA = 8 # specify the number of columns in the CSV file that are drug resistance measurements.
CONSENSUS = 'sequences/hiv-protease-consensus.fasta' # specify the relative path to the HIV protease consensus sequence
data, drug_cols, feat_cols = cf.read_data(DATA_HANDLE, N_DATA)
consensus_map = cf.read_consensus(CONSENSUS)
data = cf.clean_data(data, feat_cols, consensus_map)
In [3]:
data.head() # alternatively, data.tail()
Out[3]:
Explore
Before we go any further, I think it is valuable to explore the dataset. Below is an interactive tool for doing so.
As you look through it, discuss the following question with each other:
In [4]:
def histogram(drug, bins=50, log10='False'):
src = data[[drug]].dropna()
xlabel = 'Drug Resistance Values'
if log10 == "True":
src = src.apply(lambda x: np.log10(x))
xlabel = 'log10 (Drug Resistance Value)'
p = Histogram(data=src,
title='Distribution of {0} Drug Resistance Values'.format(drug),
xlabel=xlabel,
ylabel='Count',
bins=bins,
width=900,
height=300)
bplt.show(p)
interact(histogram,
         drug=list(drug_cols),
         bins=(10, 80, 10),  # (min, max, step) renders as an integer slider
         log10=["False", "True"])
Let's also take a look at the correlations between the drug resistance values. Think about the following question:
In [5]:
def scatter(drug1, drug2, log10="False"):
p = bplt.figure(x_axis_label=drug1,
y_axis_label=drug2,
plot_height=300,
plot_width=300)
src = data[[drug1, drug2]].dropna()
if log10 == "True":
src = src.apply(np.log10)
src = ColumnDataSource(src)
p.circle(source=src, x=drug1, y=drug2)
bplt.show(p)
interact(scatter,
         drug1=list(drug_cols)[::-1],
         drug2=list(drug_cols),
         log10=['False', 'True'])
Let's work with only one drug: FPV. We will reduce the data such that we only have FPV drug resistance values with the sequence feature matrix.
In [ ]:
cols_of_interest = []
cols_of_interest.append('FPV')
cols_of_interest.extend(feat_cols)
fpv = data[cols_of_interest]
fpv.head(10) # change to: fpv.dropna().head(10)
Notice how there are NaN values present. These will cause issues for us later on, so we will want to drop the rows that contain NaN values.
Exercise
Let's try to write a function together that will give us a cleaned data set, which you can use later on.
In [ ]:
def one_drug_data(drug_name):
    # append and extend are two methods on lists:
    # append adds a single item to the end of the list;
    # extend iterates over the sequence passed in, appending each of its items to the end of the list.
    # What is the method call that removes rows with NaN values in them?
    return drug_data
fpv = one_drug_data('FPV')
fpv.head()
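If you get stuck, here is one possible solution, using the append/extend pattern from above together with dropna(). (The name one_drug_data_solution is hypothetical, just so it doesn't clobber your own version.)

def one_drug_data_solution(drug_name):
    cols_of_interest = []
    cols_of_interest.append(drug_name)  # append adds a single item
    cols_of_interest.extend(feat_cols)  # extend adds each item of feat_cols
    drug_data = data[cols_of_interest].dropna()  # dropna() removes rows containing NaN values
    return drug_data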
As the scatterplots and histograms show, it may be wise to log10 transform the drug resistance value, so that we can work with a less skewed distribution.
Exercise
Fill in the function below.
In [ ]:
def log10_drug_data(drug_name):
    # Fill in the blanks below, using drug_name.
    drug_data = one_drug_data(________)
    drug_data[________] = drug_data[________].apply(np.log10)
    return drug_data
fpv = log10_drug_data("FPV")
fpv.head()
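Again, one possible way to fill in the blanks (shown under a hypothetical name, for comparison with your version):

def log10_drug_data_solution(drug_name):
    drug_data = one_drug_data(drug_name)
    drug_data[drug_name] = drug_data[drug_name].apply(np.log10)
    return drug_data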
In [ ]:
def split_xy(data, X_colnames, Y_colnames):
"""
The "X" matrix will be the feature columns.
The "Y" matrix (or column in this case) will be the outcome column(s).
"""
return data[_________], data[___________]
X, Y = split_xy(fpv, feat_cols, 'FPV')
Convince yourself that they've been split correctly, by running the following two cells.
In [ ]:
X.head()
In [ ]:
Y.head()
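For reference, one way to fill in the blanks in split_xy (again under a hypothetical name):

def split_xy_solution(data, X_colnames, Y_colnames):
    return data[X_colnames], data[Y_colnames]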
In [ ]:
lb = LabelBinarizer()
lb.fit(list('CHIMSVAGLPTRFYWDNEQK'))
X_binarized = pd.DataFrame()
for col in X.columns:
binarized_cols = lb.transform(X[col])
for i, c in enumerate(lb.classes_):
X_binarized[str(col) + '_' + str(c)] = binarized_cols[:,i]
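To see what the loop above is doing: the LabelBinarizer one-hot encodes each alignment position, turning a single amino acid letter into 20 binary columns, one per letter of the alphabet it was fit on. A quick sanity check:

lb.classes_          # the sorted amino acid alphabet; this is the column order of the transform
lb.transform(['A'])  # a 1 x 20 array with a single 1 in the 'A' position, and 0s elsewhere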
Convince yourself, now, that X_binarized is indeed binarized.
In [ ]:
X_binarized.head().iloc[:,0:20]
In [ ]:
# Split the data into two sets, the training and testing set.
X_train, X_test, Y_train, Y_test = cv.train_test_split(X_binarized, Y, test_size=0.3)
In [ ]:
# NOTE: This is the code you may want to use for your pset!
# The code associated with cells that are interactive may not be best suited to your use case.
# Initialize the model
rfr = RandomForestRegressor()
# Call on the "model.fit()" function.
rfr.fit(X_train, Y_train)
# Make a prediction on the test set.
Y_preds = rfr.predict(X_test)
# Evaluate the model by calculating the mean-squared error.
mse(Y_test, Y_preds)
Question
What is the mean-squared error metric actually measuring?
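As a refresher: for $n$ test samples with true values $y_i$ and predicted values $\hat{y}_i$,

$$\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2$$

i.e. the average squared difference between the predicted and the true drug resistance values; lower is better, and the units are the squared units of the outcome.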
During the model training and evaluation phase, we want to know which models will generalize best. Therefore, it is important to use cross-validation on the data.
The cross-validation procedure we are going to use here is called "fractional shuffled cross-validation": the data are randomly shuffled and split into training and testing fractions, the model is fit on the training fraction and scored on the testing fraction, and the whole procedure is repeated X times. (There's nothing special about the name; I made it up. :)
Exercise
Play around with the interactive widgets below, and let us know which model you think should be considered the best. Note: some models take a long time to run, and the more iterations you choose, the longer it will take.
In [ ]:
def assess_model(model, n_iters):
    model = eval(model)  # the widget passes the model's name as a string; eval() looks up the class
    n_iters = int(n_iters)  # likewise, the widget passes a string, so cast it to an int
cv_iterator = cv.ShuffleSplit(len(X_binarized), n_iter=n_iters, test_size=0.3)
cv_scores = cv.cross_val_score(model(),
X_binarized,
Y,
cv=cv_iterator,
scoring='mean_squared_error')
    # Note: scikit-learn's scorers follow a "larger is better" convention, so the
    # mean squared error comes back negated. To recover the positive MSE values,
    # we take the negative of the array.
    return -cv_scores
interact(assess_model,
model=['RandomForestRegressor', 'AdaBoostRegressor', 'ExtraTreesRegressor', 'GradientBoostingRegressor'],
n_iters=Dropdown(description='n_iters', options=['5', '6', '7', '8', '9', '10', '11', '12']))
Exercise
Modify the assess_model function such that it returns the mean and the standard deviation of the CV scores.
The following block of code should help illustrate an acceptable coding pattern:

def function(param1, param2):
    # do stuff with param1 and param2
    result1 = ...  # stuff done with param1
    result2 = ...  # stuff done with param2
    return result1, result2
Also, the numpy library has been imported under the np namespace, so you have access to the following functions:

- np.mean(array_of_values) will give you the mean.
- np.std(array_of_values) will give you the standard deviation.

When you're done, don't forget to re-run the cell, so that the function is updated, before playing with the drop-down menu values.
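If you want to check your work afterwards, here is one way the modified function could look (a sketch; it mirrors the original, with only the return line changed):

def assess_model(model, n_iters):
    model = eval(model)
    n_iters = int(n_iters)
    cv_iterator = cv.ShuffleSplit(len(X_binarized), n_iter=n_iters, test_size=0.3)
    cv_scores = cv.cross_val_score(model(),
                                   X_binarized,
                                   Y,
                                   cv=cv_iterator,
                                   scoring='mean_squared_error')
    # Return the mean and standard deviation of the (positive) CV scores.
    return np.mean(-cv_scores), np.std(-cv_scores)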
Question
Which model performs the best?
Once we know which model performs the best, we can go on to tweak it further.
Note: We haven't tweaked the model parameters yet! Thus far, we have not optimized model performance. Take a look at each of the following models' API documentation pages to see what parameters can be tweaked:

- RandomForestRegressor
- ExtraTreesRegressor
- GradientBoostingRegressor
- AdaBoostRegressor
In [ ]:
def optimize_random_forest_model(n_iters, n_estimators):
n_iters = int(n_iters) # I cast as an int only because widgets can only return strings. Likewise on next line.
n_estimators = int(n_estimators)
cv_iterator = cv.ShuffleSplit(len(X_binarized), n_iter=n_iters, test_size=0.3)
cv_scores = cv.cross_val_score(RandomForestRegressor(n_estimators),
X_binarized,
Y,
cv=cv_iterator,
scoring='mean_squared_error')
return np.mean(-cv_scores), np.std(-cv_scores)
interact(optimize_random_forest_model,
n_estimators=Dropdown(description='Number of Random Forest Estimators', options=['5', '10', '50', '100']),
n_iters=Dropdown(description='Number of Train/Test Iterations', options=['5', '6', '7', '8', '9', '10', '11', '12']))
In [ ]:
rfr = RandomForestRegressor(n_estimators=100)
rfr.fit(X_binarized, Y)
sorted(zip(X_binarized.columns, rfr.feature_importances_), key=lambda x: x[1], reverse=True)
The final thing we need to do before moving onto the next class is to make predictions on the rest of the data.
Recall the motivation:
In [ ]:
# Load the protease sequences, and flag any that are not the expected 99 residues long.
proteases = list(SeqIO.parse('sequences/proteases_downsampled.fasta', 'fasta'))
for s in proteases:
if len(s.seq) != 99:
print(s.id)
In [ ]:
proteases = [s for s in SeqIO.parse('sequences/proteases_downsampled.fasta', 'fasta') if len(s.seq) == 99]
proteases = MultipleSeqAlignment(proteases)
proteases
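Why wrap the list of records in a MultipleSeqAlignment? Because the alignment object supports column slicing, which the next cell relies on. For example:

proteases[:, 0]  # a string containing the first alignment column, one character per sequence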
In [ ]:
# Iterate over all of the columns in the multiple sequence alignment, and append each
# binarized column as a new column in the dataframe.
proteases_df = pd.DataFrame()
for col in range(proteases.get_alignment_length()):
binarized_cols = lb.transform([k for k in proteases[:,col]])
for i, c in enumerate(lb.classes_):
proteases_df[str(col + 1) + '_' + c] = binarized_cols[:,i]
# Add in the index: use the sequence IDs.
proteases_df.index = [s.id for s in proteases]
proteases_df.head()
In [ ]:
proteases_df.iloc[0:5, 0:20]
In [ ]:
# Fit all four regressor models.
def fit_model(model, X, Y):
    # Instantiate the model.
    # Fit the model to X and Y.
    # Return the fitted model and its predictions on X.
    return
# Train RF, ET, AB and GB models
rf_mdl, rf_preds = fit_model(RandomForestRegressor, X_binarized, Y)
# Continue below.
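A minimal sketch of what fit_model could look like, if you want to check your own version against it (the name fit_model_solution is hypothetical, to avoid overwriting yours):

def fit_model_solution(model, X, Y):
    mdl = model()  # instantiate the model class that was passed in
    mdl.fit(X, Y)  # fit to the binarized features and drug resistance values
    preds = mdl.predict(X)  # predictions on the data the model was fit on
    return mdl, preds

# e.g. rf_mdl, rf_preds = fit_model_solution(RandomForestRegressor, X_binarized, Y)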
In [ ]:
# Zip together the predicted values with the index of the protease dataframe.
protease_preds = [i for i in zip(proteases_df.index, rf_mdl.predict(proteases_df))]
# Convert it into a dataframe, set the index column to be the IDs, and apply a 10**x
# transformation to undo the earlier log10 transform.
protease_preds = pd.DataFrame(protease_preds).set_index(0).apply(lambda x: np.power(10, x))
# View the first few lines.
protease_preds.head()
In [ ]:
# Write them to disk.
protease_preds.to_csv('csv/FPV_random-forest_preds.tsv', sep='\t')
In [ ]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.hist(rf_preds)