In [1]:
# Third-party imports: Biopython for sequence I/O, scikit-learn for the
# machine-learning pipeline, TPOT for automated model search, plus the
# usual numpy/matplotlib/pandas stack and project-local helpers.
from Bio import AlignIO, SeqIO
from Bio.Align import MultipleSeqAlignment
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18
# and removed in 0.20; on newer versions these names live in
# sklearn.model_selection, and ShuffleSplit's signature changed
# (n -> n_splits). The call in the cross-validation cell below uses the
# old signature, so both must be migrated together.
from sklearn.cross_validation import train_test_split, cross_val_score, ShuffleSplit
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.preprocessing import LabelBinarizer
# NOTE(review): newer TPOT releases expose TPOTClassifier/TPOTRegressor
# instead of a single TPOT class -- confirm against the pinned version.
from tpot import TPOT
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Project-local helpers (split_data_xy, train_model are used below).
import custom_funcs as cf
# Auto-reload edited local modules and render figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [2]:
# Use seaborn's "poster" context so figures read well when projected.
import seaborn as sns

sns.set_context(context='poster')
In this notebook, I write the code needed to train models on the training set and evaluate them on the held-out test set.
In [3]:
# Read in the protease inhibitor data; SeqID uniquely identifies each sequence.
data = pd.read_csv('../data/hiv_data/hiv-protease-data.csv', index_col='SeqID')
drug_cols = data.columns[0:8]  # first 8 columns: drug resistance values
feat_cols = data.columns[8:]   # remaining columns: amino acid positions

# Read in the consensus sequence and map each position index to its
# consensus letter.
consensus = SeqIO.read('../data/hiv_data/hiv-protease-consensus.fasta', 'fasta')
consensus_map = {i: letter for i, letter in enumerate(str(consensus.seq))}

# In the dataset, '-' means "same as the consensus at this position", so it
# is substituted with the actual consensus letter; '.' and 'X' mark
# missing/ambiguous calls and are mapped to NaN. A single replace() call
# per column handles all three cases at once (the targets never overlap:
# consensus letters are amino acids, never '.' or 'X').
for i, col in enumerate(feat_cols):
    data[col] = data[col].replace({'-': consensus_map[i], '.': np.nan, 'X': np.nan})

# Drop any rows with NaN in the sequence columns. We don't want low
# quality sequences.
data.dropna(inplace=True, subset=feat_cols)

# Show only the first few rows instead of dumping the entire frame.
data.head()
Out[3]:
In [4]:
# Grab out the drug name to be tested.
DRUG = drug_cols[0]

# Reduce the data to one drug column plus the corresponding amino acid
# sequences, dropping NaN values (most scikit-learn estimators cannot
# handle them). The resistance values come back log10-transformed.
X, Y = cf.split_data_xy(data, feat_cols, DRUG)

# One-hot ("binarize") the sequence features: every alignment position
# becomes 20 columns of 0/1 flags, one per amino acid, so that string
# labels turn into numeric features (99 x 20 columns in total).
lb = LabelBinarizer()
lb.fit(list('CHIMSVAGLPTRFYWDNEQK'))

# Collect all binarized columns in a dict first and construct the
# DataFrame once at the end -- inserting columns one at a time into an
# existing DataFrame reallocates repeatedly and is much slower.
binarized = {}
for col in X.columns:
    binarized_cols = lb.transform(X[col])
    for i, c in enumerate(lb.classes_):
        binarized[col + '_' + c] = binarized_cols[:, i]
X_binarized = pd.DataFrame(binarized)

# Show only the first few rows instead of dumping the entire frame.
X_binarized.head()
Out[4]:
In [5]:
# Build the categorical values of resistance:
#   value < 4-fold              -> sensitive
#   4-fold <= value <= 11-fold  -> partially sensitive (a.k.a. "intermediate")
#   value > 11-fold             -> resistant
#
# These values came from https://www.poz.com/basics/hiv-basics/hiv-drug-resistance,
# more particularly by observing the screen shot on the page.
#
# Final note: we log10 transform the cutoffs, because all of the resistance
# values are also log10-transformed right now.
lower_bound = np.log10(4)
upper_bound = np.log10(11)
from sklearn.preprocessing import LabelEncoder
def categorize(x, lower_bound=np.log10(4), upper_bound=np.log10(11)):
    """Map a log10-transformed fold-resistance value to a class label.

    :param x: log10 fold-resistance value.
    :param lower_bound: values below this are 'sensitive' (default log10(4)).
    :param upper_bound: values above this are 'resistant' (default log10(11)).
    :returns: one of 'sensitive', 'partially sensitive', 'resistant'.
    """
    if x < lower_bound:
        return 'sensitive'
    # Reaching here already implies x >= lower_bound, so only the upper
    # cutoff needs checking (the original re-tested the lower bound).
    elif x <= upper_bound:
        return 'partially sensitive'
    else:
        return 'resistant'
# Label every sample's resistance class, then integer-encode the class
# strings so classifiers can consume them. categorize is passed to apply
# directly (wrapping it in a lambda added nothing), and the intermediate
# one-column DataFrame -- whose name was then reused for the encoded
# array -- is skipped entirely.
le = LabelEncoder()
Y_cat = le.fit_transform(Y.apply(categorize))
In [6]:
# Split the data into a training set and a test set, for both the
# categorical and the regression targets, using scikit-learn's
# train_test_split. A fixed random_state makes the split -- and therefore
# every downstream score in this notebook -- reproducible across kernel
# restarts.
tts_cat = X_train_cat, X_test_cat, Y_train_cat, Y_test_cat = train_test_split(
    X_binarized, Y_cat, test_size=0.33, random_state=42)
tts_reg = X_train, X_test, Y_train, Y_test = train_test_split(
    X_binarized, Y, test_size=0.33, random_state=42)
In [7]:
# Sanity check: peek at the (log10-transformed) regression targets.
Y_train
Out[7]:
In [8]:
# # We will use TPOT to automatically find a best model
# tpot_model = TPOT(generations=10, verbosity=2)
# tpot_model.fit(X_train_cat, Y_train_cat)
In [9]:
# tpot_model.export('../scripts/fpv_classifer.py')
In [10]:
# !ls ../scripts
In [11]:
####
In [13]:
# # Try RandomForest Classifier
# from sklearn.ensemble import RandomForestClassifier
# rfc = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
# rfc.fit(X_train_cat, Y_train_cat)
# preds = rfc.predict(X_test_cat)
# from sklearn.metrics import normalized_mutual_info_score as nmi
# from sklearn.metrics import accuracy_score
# accuracy_score(preds, Y_test_cat)
In [ ]:
In [ ]:
We will skip past a discussion of other regression models, and instead focus here on the use of ensemble regressors.
I would encourage you to take a look at the documentation for each of the following regressors:
The purpose is to look for parameters that you can tweak to make the model better.
I have written a custom function called `train_model` that takes in the train/test split (`X_train, X_test, Y_train, Y_test`), a model class, and an optional dictionary of model keyword arguments, and returns (in order): the fitted model, its test-set predictions, the mean squared error, and the r² score.
In [28]:
# Train a bunch of ensemble regressors on the same train/test split and
# collect (fitted model, predictions, MSE, r^2) for each.

## Random Forest
# (the original kwargs dict listed 'n_jobs' twice; dict literals silently
# keep only the last duplicate, so a single entry is equivalent)
rfr_kwargs = {'n_estimators': 1000, 'n_jobs': -1}
rfr, rfr_preds, rfr_mse, rfr_r2 = cf.train_model(*tts_reg, model=RandomForestRegressor, modelargs=rfr_kwargs)

## Gradient Boosting
# NOTE: GradientBoostingRegressor has no n_jobs parameter (boosting is
# inherently sequential), so passing one -- as the original did -- raises
# a TypeError when the estimator is constructed.
gbr_kwargs = {'n_estimators': 1000}
gbr, gbr_preds, gbr_mse, gbr_r2 = cf.train_model(*tts_reg, model=GradientBoostingRegressor, modelargs=gbr_kwargs)

## AdaBoost
abr_kwargs = {'n_estimators': 1000}
abr, abr_preds, abr_mse, abr_r2 = cf.train_model(*tts_reg, model=AdaBoostRegressor, modelargs=abr_kwargs)

## ExtraTrees
etr_kwargs = {'n_estimators': 1000, 'n_jobs': -1}
etr, etr_preds, etr_mse, etr_r2 = cf.train_model(*tts_reg, model=ExtraTreesRegressor, modelargs=etr_kwargs)

## Bagging (default hyperparameters)
bgr, bgr_preds, bgr_mse, bgr_r2 = cf.train_model(*tts_reg, model=BaggingRegressor)
In [29]:
# XGBoost's gradient-boosting implementation as an external comparison
# point against the scikit-learn ensembles above.
from xgboost import XGBRegressor

xbr_kwargs = dict(max_depth=10, subsample=0.5)
xbr, xbr_preds, xbr_mse, xbr_r2 = cf.train_model(*tts_reg, model=XGBRegressor, modelargs=xbr_kwargs)
In [30]:
# Mean squared error of the XGBoost model on the held-out test set.
xbr_mse
Out[30]:
In [31]:
# Compare the trained models. Which one minimizes the mean squared error
# the most? (Lower MSE = better fit on the held-out test set.)
rfr_mse, gbr_mse, abr_mse, etr_mse, bgr_mse
Out[31]:
In [33]:
# Side-by-side predicted-vs-actual scatter plots for the gradient
# boosting and XGBoost models.
fig = plt.figure(figsize=(6.5, 3))

ax = fig.add_subplot(1, 2, 1)
ax.scatter(gbr_preds, Y_test, color='blue')
ax.set_ylabel('actual')
ax.set_xlabel('predicted')
ax.set_title('gradient boost')
ax.set_xticks(np.arange(-1.5, 3.5, 1))

# Share both axes with the first panel so the two plots are directly
# comparable at a glance.
ax2 = fig.add_subplot(1, 2, 2, sharey=ax, sharex=ax)
ax2.scatter(xbr_preds, Y_test, color='blue')
ax2.set_xlabel('predicted')
ax2.set_title('xgboost')

# Save via the figure object rather than the pyplot state machine, so the
# call is unambiguous about which figure is written out.
fig.savefig('figures/rf_gboost_baseline.pdf', bbox_inches='tight')
In [9]:
# Qualitatively, what is the math behind the MSE score? Are you looking to minimize or maximize it?
In [10]:
# Candidate regressors for cross-validated comparison, keyed by the same
# short names used for the fitted models above.
models = {
    'rfr': RandomForestRegressor(n_estimators=100, n_jobs=-1),
    'gbr': GradientBoostingRegressor(n_estimators=100),
    'abr': AdaBoostRegressor(n_estimators=100),
    'etr': ExtraTreesRegressor(n_estimators=100),
    'bgr': BaggingRegressor(),
    'xbr': XGBRegressor(),
}

# Cross-validate the gradient boosting model with a shuffle-split strategy.
# NOTE(review): ShuffleSplit(n=...) is the pre-0.18 scikit-learn signature
# matching the sklearn.cross_validation import at the top; newer versions
# take n_splits and infer the sample count from the data.
scores = cross_val_score(models['gbr'], X_binarized, Y, cv=ShuffleSplit(n=len(Y)))
scores
Out[10]:
In [ ]: