In [ ]:
import custom_funcs as cf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import sklearn.cross_validation as cv

from Bio import SeqIO
from Bio.Align import MultipleSeqAlignment
from itertools import combinations
from random import sample
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelBinarizer

%matplotlib inline

%load_ext autoreload
%autoreload 2

Problem Set on Machine Learning

Problem 1

Identify an academic literature reference that descirbes the PhenoSense assay. Paste the URL to the PubMed article below, and write a 1-2 sentence summary on what is measured in the assay, and how it relates to drug resistance.

Compare and contrast it with the plaque reduction assay as mentioned in the literature - what would be one advantage of the plaque reduction assay that is lacking in PhenoSense, and vice versa?

Answer

Double-click on this cell to type in your answer. Use Markdown formatting if you'd like.

A new paragraph is delineated by having a line in between them. You can bold or italicize text.

  • Bulleted
  • Lists
    • are done this way.
    • 4 spaces for indents.
  1. Numbered
  2. Lists
    1. are done this way.
    2. 4 spaces for indents.
    3. The numbering is automatically parsed!

Leave the answer at the top so Claire can know where your answer is!

Problem 2

  1. Write code below to calculate the correlation between two drugs' resistance profiles. Identify the protease drugs for which the two drugs' resistance values are correlated.
  2. Speculate as to why they would be correlated.

In [ ]:
# This cell loads the data and cleans it for you, and log10 transforms the drug resistance values.
# Remember to run this cell if you want to have the data loaded into memory.
DATA_HANDLE = 'drug_data/hiv-protease-data.csv'  # specify the relative path to the protease drug resistance data
N_DATA = 8  # specify the number of columns in the CSV file that are drug resistance measurements.
CONSENSUS = 'sequences/hiv-protease-consensus.fasta'  # specify the relative path to the HIV protease consensus sequence
data, drug_cols, feat_cols = cf.read_data(DATA_HANDLE, N_DATA)
consensus_map = cf.read_consensus(CONSENSUS)
data = cf.clean_data(data, feat_cols, consensus_map)
for name in drug_cols:
    data[name] = data[name].apply(np.log10)
data.head()

In [ ]:
"""
Complete the function below to compute the correlation score.

Use the scipy.stats.pearsonr(x, y) function to find the correlation score between two arrays of things. 
You do not need to type the whole name, as I have imported the pearsonr name for you, so you only have to do:

    pearsonr(x, y)

Procedure:

1. Select two columns' names to compare.
2. Make sure to drop NaN values. the pearsonr function cannot deal with NaN values. 
   (Refer to the Lecture notebook if you forgot how to do this.)
3. Pass the data in to pearsonr().

"""
def corr_score(drug1, drug2):
    ### BEGIN SOLUTION
    # Get the subset of data, while dropping columns that have NaN in them.
    
    # Return the pearsonr score.
    return pearsonr(____________, ____________)
    ### END SOLUTION

In [ ]:
assert corr_score('IDV', 'FPV') == (0.79921991532901282, 2.6346448659104859e-306)
assert corr_score('ATV', 'FPV') == (0.82009597442033089, 2.5199367322520278e-231)
assert corr_score('NFV', 'DRV') == (0.69148264851159791, 4.0640711263961111e-82)
assert corr_score('LPV', 'SQV') == (0.76682619729899326, 4.2705737581002648e-234)

Question: Which two drugs are most correlated?

Answer

Question: Why might they be correlated? (Hint: you can look online for what they look like.)

Answer

Problem 3

Fill in the code below to plot the relationship between number of estimators (X-axis) and the MSE value for each of the estimators.

  • Try 10, 30, 50, 80, 100, 300, 500 and 800 estimators.
  • Use the ShuffleSplit iterator with cross-validation.
  • Use mean of at least 5 cross-validated MSE scores.

In [ ]:
def return_cleaned_data(drug_name, data):
    # Select the subsets of columns of interest.
    cols_of_interest = []
    cols_of_interest.append(drug_name)
    cols_of_interest.extend(feat_cols)
    subset = data[cols_of_interest].dropna() 
    Y = subset[drug_name]  
    X = subset[feat_cols]
    
    # Binarize the columns.
    lb = LabelBinarizer()
    lb.fit(list('CHIMSVAGLPTRFYWDNEQK'))

    X_binarized = pd.DataFrame()

    for col in X.columns:
        binarized_cols = lb.transform(X[col])

        for i, c in enumerate(lb.classes_):
            X_binarized[col + '_' + c] = binarized_cols[:,i]
    
    return X_binarized, Y

X_binarized, Y = return_cleaned_data('FPV', data)
len(X_binarized), len(Y)

In [ ]:
num_estimators = [_________]  # fill in the list of estimators to try here.
models = {'Random Forest':RandomForestRegressor,
          }  # fill in the other models here

# Initialize a dictionary to hold the models' MSE values.
mses = dict()
for model_name, model in models.items():
    mses[model_name] = dict()
    for n in num_estimators:
        mses[model_name][n] = 0

# Iterate over the models, and number of estimators.
for model_name, model in models.items():
    for n_est in num_estimators:
        print(model_name, n_est)
        ### Begin Here
        
        # Set up the cross-validation iterator
        
        # Initialize the model
        
        # Collect the cross-validation scores. Remember that mse will be negative, and needs to
        # be transformed to be positive.
        
        
        
        ### End Here
        
        
        # Store the mean MSEs.
        mses[model_name][n_est] = np.mean(-cv_scores)

In [ ]:
# When you're done, run the following cell to make your plot.
pd.DataFrame(mses).plot()
plt.xlabel('Num Estimators')
plt.ylabel('MSE')

Question: Given the data above, consider the following question from the viewpoint of a data scientist/data analyst. What factors do you need to consider when tweaking model parameters?

Answer

Problem 4

  • Pick the best model from above, and re-train it on the dataset again. Refer to the Lecture notebook for a version of the code that may help here!
  • Now, use it to make predictions on the global HIV protease dataset.
  • Plot the global distribution.

In [ ]:
# Load in the data and binarize it.
proteases = [s for s in SeqIO.parse('sequences/HIV1-protease.fasta', 'fasta') if len(s) == 99]
alignment = MultipleSeqAlignment(proteases)
proteases_df = pd.DataFrame(np.array([list(rec) for rec in alignment], str))
proteases_df.index = [s.id for s in proteases]
proteases_df.columns = [i for i in range(1, 100)]
X_global = cf.binarize_seqfeature(proteases_df)

In [ ]:
# Train your model here, with optimized parameters for best MSE minimization.
### BEGIN
model = ________________(__________)  # put your best model here, with optimized parameters.
model.fit(______________)
preds = model.predict(______________)
plt.hist(preds)
### END

Question:

How would you evaluate whether the predictions are correct?

Answer

Question: In the procedure we have used here, we have done the following:

  1. Randomly subdivide the whole training data into a subset training and testing set.
  2. Used cross-validation over multiple random splits to select the best model.
  3. Re-train best model on the entire dataset.
  4. Use the trained model to make predictions about new data.

Think through the procedure for a moment. What assumptions about the training data have we made in using this procedure to train the ML models?

Answer


In [ ]: