In [2]:
from Bio import AlignIO, SeqIO
from Bio.Align import MultipleSeqAlignment
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit  # lived in sklearn.cross_validation before scikit-learn 0.18
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.preprocessing import LabelBinarizer

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import custom_funcs as cf

%load_ext autoreload
%autoreload 2
%matplotlib inline

In this notebook, I will write the code needed to train models on the training set and evaluate them on the test set.


In [3]:
# Read in the protease inhibitor data
data = pd.read_csv('drug_data/hiv-protease-data.csv', index_col='SeqID')
drug_cols = data.columns[0:8]
feat_cols = data.columns[8:]

# Read in the consensus data
consensus = SeqIO.read('sequences/hiv-protease-consensus.fasta', 'fasta')

consensus_map = {i:letter for i, letter in enumerate(str(consensus.seq))}

# The dataset uses '-' to mean "same as the consensus" at a position, so each '-'
# is replaced with the actual consensus letter. '.' and 'X' entries are replaced
# with np.nan so that those rows can be dropped below.

for i, col in enumerate(feat_cols):
    data[col] = data[col].replace({'-': consensus_map[i], '.': np.nan, 'X': np.nan})
    
# Drop any rows that have np.nan in the sequence feature columns. We don't want low-quality sequences.
data.dropna(inplace=True, subset=feat_cols)
data


Out[3]:
FPV ATV IDV LPV NFV SQV TPV DRV P1 P2 ... P90 P91 P92 P93 P94 P95 P96 P97 P98 P99
SeqID
2996 2.5 NaN 16.3 NaN 38.6 16.1 NaN NaN P Q ... M T Q L G C T L N F
4387 0.7 NaN 0.8 NaN 0.8 1.1 NaN NaN P Q ... L T Q I G C T L N F
4432 1.5 NaN 1.0 NaN 2.2 1.1 NaN NaN P Q ... L T Q I G C T L N F
4482 3.9 NaN 20.2 NaN 21.6 9.2 NaN NaN P Q ... M T Q L G C T L N F
4486 9.5 20.0 8.2 11.0 72.0 46.0 NaN NaN P Q ... L T Q I G C T L N F
4538 NaN NaN 21.0 7.5 55.0 85.0 NaN NaN P Q ... M T Q I G C T L N F
4664 3.1 NaN 8.7 NaN 32.0 16.9 NaN NaN P Q ... M T Q I G C T L N F
4690 4.9 15.0 18.0 5.9 24.0 73.0 NaN NaN P Q ... M T Q I G C T L N F
4698 1.2 NaN 0.7 NaN 3.6 1.3 NaN NaN P Q ... L T Q L G C T L N F
5221 NaN NaN 0.8 0.8 1.2 0.7 NaN NaN P Q ... L T Q I G C T L N F
5279 8.3 79.0 16.0 12.0 600.0 1000.0 NaN NaN P Q ... M T Q I G C T L N F
5444 2.7 21.0 24.0 6.1 42.0 132.0 NaN NaN P Q ... M T Q I G C T L N F
5462 2.1 16.0 12.0 22.0 15.0 82.0 NaN NaN P Q ... L T Q I G C T L N F
5464 2.1 NaN 22.2 7.8 24.7 104.8 NaN NaN P Q ... M T Q L G C T L N F
5640 0.8 2.8 3.2 1.5 3.5 0.9 NaN NaN P Q ... L T Q L G C T L N F
5681 NaN NaN 26.0 25.0 37.0 7.4 NaN NaN P Q ... M T Q L G C T L N F
5707 6.1 10.0 28.0 11.0 41.0 65.0 NaN NaN P Q ... M T Q L G C T L N F
6024 NaN NaN 8.3 3.0 22.0 3.4 NaN NaN P Q ... M T Q L G C T L N F
6028 NaN NaN 16.0 20.0 37.0 7.9 NaN NaN P Q ... M T Q I G C T L N F
7038 NaN NaN 6.0 4.0 11.0 1.1 NaN NaN P Q ... L T Q I G C T L N F
7042 11.0 18.0 28.0 17.0 53.0 62.0 NaN NaN P Q ... M T Q L G C T L N F
7085 0.4 2.0 1.9 0.9 3.7 2.5 NaN NaN P Q ... M T Q I G C T L N F
7103 NaN NaN 0.7 0.7 11.0 0.4 NaN NaN P Q ... L T Q I G C T L N F
7119 1.4 0.9 1.0 0.8 1.6 0.8 NaN NaN P Q ... L T Q I G C T L N F
7235 NaN NaN 3.3 2.2 8.7 1.6 NaN NaN P Q ... M T Q I G C T L N F
7260 NaN NaN 33.0 15.0 16.0 18.0 NaN NaN P Q ... M T Q L G C T L N F
7412 6.2 NaN 12.0 NaN 10.2 591.5 NaN NaN P Q ... L T Q I G C T L N F
7414 3.5 NaN 4.4 NaN 6.2 1.1 NaN NaN P Q ... M T Q L G C T L N F
7415 2.2 NaN 6.5 NaN 9.5 19.0 NaN NaN P Q ... M T Q I G C T L N F
7430 2.8 NaN 48.9 NaN 80.7 42.1 NaN NaN P Q ... M T Q I G C T L N F
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
258507 0.5 0.8 0.8 0.8 1.2 0.7 0.9 0.6 P Q ... L T Q I G C T L N F
258509 2.5 5.0 4.5 2.0 9.1 3.5 2.4 1.3 P Q ... M T Q L G C T L N F
259173 0.7 1.0 1.2 1.1 2.0 1.0 1.0 0.8 P Q ... L T Q L G C T L N F
259175 0.9 0.8 1.0 1.0 0.8 0.8 0.7 0.8 P Q ... L T Q I G C T L N F
259177 0.9 1.1 1.0 0.9 1.1 1.0 1.2 0.9 P Q ... L T Q I G C T L N F
259181 2.6 9.3 21.0 6.8 13.0 21.0 1.4 1.5 P Q ... M T Q I G C T L N F
259183 0.7 1.0 0.9 0.7 2.2 0.9 1.2 0.8 P Q ... L T Q I G C T L N F
259185 1.2 1.1 1.6 1.3 1.6 1.2 1.0 0.8 P Q ... L T Q L G C T L N F
259187 0.7 2.0 1.9 1.4 4.0 2.0 1.4 0.8 P Q ... M T Q L G C T L N F
259189 0.4 0.7 0.8 0.5 0.9 0.9 0.6 0.6 P Q ... L T Q L G C T L N F
259191 1.1 27.0 30.0 36.0 36.0 200.0 0.6 0.6 P Q ... M T Q L G C T L N F
259193 0.5 0.7 0.8 0.7 1.2 0.5 0.6 0.3 P Q ... L T Q I G C T L N F
259195 1.1 1.5 1.6 1.1 1.4 1.3 1.5 0.9 P Q ... L T Q L G C T L N F
259197 0.5 0.8 0.8 0.8 1.1 0.7 0.9 0.7 P Q ... L T Q I G C T L N F
259199 0.5 0.8 0.7 0.6 1.1 0.6 0.6 0.6 P Q ... L T Q L G C T L N F
259203 1.0 1.1 0.9 1.0 1.1 1.0 1.1 1.1 P Q ... L T Q I G C T L N F
259207 1.6 1.6 1.4 1.4 1.9 1.1 1.0 0.9 P Q ... L T Q I G C T L N F
259215 0.6 0.8 1.0 0.7 1.3 0.6 0.7 0.6 P Q ... L T Q L G C T L N F
259219 0.7 0.9 0.8 0.7 1.0 0.8 0.7 0.8 P Q ... L T Q I G C T L N F
259223 0.8 1.0 0.9 0.9 1.4 0.8 0.9 0.9 P Q ... L T Q I G C T L N F
259227 6.3 6.2 6.3 3.4 20.5 5.3 6.7 2.9 P Q ... M T Q I G C T L N F
259233 1.4 1.5 1.3 1.0 2.5 1.0 1.3 0.8 P Q ... L T Q L G C T L N F
259237 0.7 1.2 1.0 0.8 1.4 1.0 1.2 0.6 P Q ... L T Q L G C T L N F
259241 0.8 1.0 1.1 0.8 1.2 1.0 1.0 0.8 P Q ... L T Q I G C T L N F
259245 0.6 0.8 0.8 0.8 1.1 1.0 0.9 0.9 P Q ... L T Q I G C T L N F
259249 0.4 0.5 0.5 0.4 0.6 0.6 0.6 0.6 P Q ... L T Q I G C T L N F
259253 0.9 0.8 0.9 0.9 1.5 0.7 0.7 0.6 P Q ... L T Q I G C T L N F
259257 0.8 0.8 0.8 0.6 1.7 0.7 1.1 0.5 P Q ... L T Q L G C T L N F
259261 0.6 0.7 0.7 0.5 0.7 0.7 0.8 0.6 P Q ... L T Q I G C T L N F
259265 0.3 0.9 0.6 0.6 0.7 0.7 0.8 0.8 P Q ... L T Q L G C T L N F

1540 rows × 107 columns


In [4]:
# Grab out the drug name to be tested.
DRUG = drug_cols[0]

# I have written a custom function (sketched after the output below) that takes in
# the data and reduces it to one drug column and the corresponding amino acid
# sequences. The NaN values are dropped, as most machine learning algorithms in
# scikit-learn cannot deal with NaN values. Finally, the data are log10-transformed.
X, Y = cf.split_data_xy(data, feat_cols, DRUG)

# Binarize the sequence features such that there are 99 x 20 = 1980 columns in total.
# The purpose of binarizing is to turn the string labels into numeric labels. The most naive way to do this is to
# take every amino acid position in the sequence alignment and turn it into 20 columns of 1s and 0s, corresponding
# to whether a particular amino acid is present or not.

lb = LabelBinarizer()
lb.fit(list('CHIMSVAGLPTRFYWDNEQK'))

X_binarized = pd.DataFrame()

for col in X.columns:
    binarized_cols = lb.transform(X[col])
    
    for i, c in enumerate(lb.classes_):
        X_binarized[col + '_' + c] = binarized_cols[:,i]
X_binarized


Out[4]:
P1_A P1_C P1_D P1_E P1_F P1_G P1_H P1_I P1_K P1_L ... P99_M P99_N P99_P P99_Q P99_R P99_S P99_T P99_V P99_W P99_Y
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
8 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
10 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
11 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
12 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
13 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
14 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
15 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
16 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
17 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
18 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
19 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
20 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
21 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
22 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
25 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
26 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
27 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
28 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
29 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1385 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1386 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1387 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1388 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1389 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1390 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1391 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1392 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1393 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1394 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1395 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1396 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1397 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1398 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1399 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1400 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1401 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1402 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1403 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1404 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1405 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1406 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1407 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1408 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1409 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1410 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1411 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1412 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1413 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1414 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

1415 rows × 1980 columns
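
Aside: the custom_funcs module is not shown in this notebook. Based on the description in the cell above, a minimal sketch of what split_data_xy might look like follows; this is a hypothetical reconstruction, not the actual implementation.

def split_data_xy(data, feat_cols, drug):
    # Hypothetical sketch: keep rows where the chosen drug has a measurement,
    # separate the sequence features from the target, and log10-transform the target.
    subset = data.dropna(subset=[drug])
    X = subset[feat_cols]
    Y = np.log10(subset[drug])
    return X, Y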


In [5]:
# Next step is to split the data into a training set and a test set. 
# We use the train_test_split function provided by the scikit-learn package to do this.
tts_data = X_train, X_test, Y_train, Y_test = train_test_split(X_binarized, Y, test_size=0.33)

We will skip past a discussion of other regression models, and instead focus here on the use of ensemble regressors.

I would encourage you to take a look at the documentation for each of the following regressors:

  • Random Forest
  • Gradient Boosting
  • AdaBoost
  • ExtraTrees
  • Bagging

The purpose is to look for parameters that you can tweak to make the model better.

I have written a custom function called train_model (sketched after this list) that takes in

  • the train/test split data,
  • a model, and
  • model arguments,

and returns (in order):

  • the trained model,
  • the model predictions,
  • the mean-squared error of the predictions,
  • and the correlation score (r-squared).
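
A minimal sketch of what such a function might look like (hypothetical; the actual implementation in custom_funcs may differ):

def train_model(X_train, X_test, Y_train, Y_test, model=None, modelargs=None):
    # Hypothetical sketch: fit the given model class on the training split,
    # then score its predictions on the held-out test split.
    from sklearn.metrics import mean_squared_error, r2_score
    mdl = model(**(modelargs or {}))
    mdl.fit(X_train, Y_train)
    preds = mdl.predict(X_test)
    return mdl, preds, mean_squared_error(Y_test, preds), r2_score(Y_test, preds)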

In [6]:
# Train a bunch of ensemble regressors:

## Random Forest
kwargs = {'n_jobs':-1, 'n_estimators':1000}
rfr, rfr_preds, rfr_mse, rfr_r2 = cf.train_model(*tts_data, model=RandomForestRegressor, modelargs=kwargs)

## Gradient Boosting
kwargs = {'n_estimators':1000}
gbr, gbr_preds, gbr_mse, gbr_r2 = cf.train_model(*tts_data, model=GradientBoostingRegressor, modelargs=kwargs)

## AdaBoost
kwargs = {'n_estimators':1000}
abr, abr_preds, abr_mse, abr_r2 = cf.train_model(*tts_data, model=AdaBoostRegressor, modelargs=kwargs)

## ExtraTrees
kwargs = {'n_estimators':1000, 'n_jobs':-1}
etr, etr_preds, etr_mse, etr_r2 = cf.train_model(*tts_data, model=ExtraTreesRegressor, modelargs=kwargs)

## Bagging
bgr, bgr_preds, bgr_mse, bgr_r2 = cf.train_model(*tts_data, model=BaggingRegressor)

In [7]:
# Compare the trained models. Which one gives the lowest mean squared error?

rfr_mse, gbr_mse, abr_mse, etr_mse, bgr_mse


Out[7]:
(0.56602412768855048,
 0.49571120460278129,
 0.69972776559692174,
 0.8629261739741434,
 0.61291876964437908)

In [8]:
# What is the math behind the MSE score? Are you looking to minimize or maximize it?
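
For reference: MSE = (1/n) * sum((y_true - y_pred)**2), the average squared difference between predictions and true values, so it is a quantity to minimize (0 would be a perfect fit). Assuming train_model scored on the held-out test set, the reported values can be reproduced directly:

from sklearn.metrics import mean_squared_error
mean_squared_error(Y_test, rfr_preds)  # should match rfr_mse above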

In [25]:
models = dict()
models['rfr'] = RandomForestRegressor(n_estimators=100, n_jobs=-1)
models['gbr'] = GradientBoostingRegressor(n_estimators=100)
models['abr'] = AdaBoostRegressor(n_estimators=100)
models['etr'] = ExtraTreesRegressor(n_estimators=100, n_jobs=-1)
models['bgr'] = BaggingRegressor()

scores = cross_val_score(models['gbr'], X_binarized, Y, cv=ShuffleSplit(n_splits=10, test_size=0.1))  # old API: ShuffleSplit(n=len(Y))
scores


Out[25]:
array([ 0.82395091,  0.89683899,  0.87186737,  0.85271116,  0.83535876,
        0.87401101,  0.91191317,  0.84443399,  0.86307088,  0.88454115])
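
Note that cross_val_score uses the estimator's default scorer, which for regressors is the coefficient of determination (R^2), so here higher is better, with 1.0 being a perfect fit. A common way to summarize the folds:

print('R^2: {0:.3f} +/- {1:.3f}'.format(scores.mean(), scores.std()))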

In [10]:
# Using the coding pattern illustrated above, tweak the parameters listed in the scikit-learn documentation.
# Can you improve the model predictions?
"""
Basically, the idea is to tweak the parameters.
"""


Out[10]:
'\nBasically, the idea is to tweak the parameters.\n'
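
As an illustrative (and untuned) starting point, GradientBoostingRegressor exposes parameters such as learning_rate, max_depth, and subsample that interact with n_estimators:

# Illustrative only; a lower learning rate typically needs more estimators.
tweaked = GradientBoostingRegressor(n_estimators=500, learning_rate=0.05, max_depth=5, subsample=0.8)
cross_val_score(tweaked, X_binarized, Y, cv=ShuffleSplit(n_splits=10, test_size=0.1)).mean()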

Switch Notebooks

We will leave this notebook for notebook 3. When we are done with NB3, we will come back here.


In [18]:
# Pick the model that gives the lowest MSE, and use it to make predictions.

# Read in the sequences that do not yet have resistance predictions, keeping only
# those whose length matches the consensus so that they form a valid alignment.
proteases_alignment = [s for s in SeqIO.parse('sequences/proteases_downsampled.fasta', 'fasta') if len(s) == len(consensus.seq)]
proteases_alignment = MultipleSeqAlignment(proteases_alignment)
proteases_alignment


Out[18]:
<<class 'Bio.Align.MultipleSeqAlignment'> instance (249 records of length 99, SingleLetterAlphabet()) at 1124b17f0>

In [19]:
# Binarize the multiple sequence alignment, naming the columns P1 ... P99 to match
# the feature columns that the models were trained on.
proteases_df = pd.DataFrame()

for col in range(proteases_alignment.get_alignment_length()):
    binarized_cols = lb.transform(list(proteases_alignment[:, col]))
    
    for i, c in enumerate(lb.classes_):
        proteases_df['P{0}_{1}'.format(col + 1, c)] = binarized_cols[:, i]
        
# Add in the index (sequence IDs).
proteases_df.index = [s.id for s in proteases_alignment]

In [20]:
proteases_df


Out[20]:
P1_A P1_C P1_D P1_E P1_F P1_G P1_H P1_I P1_K P1_L ... P99_M P99_N P99_P P99_Q P99_R P99_S P99_T P99_V P99_W P99_Y
B.US.2004.CA48243.GQ207976 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.US.2004.SD_PIRC_4339.KJ723303 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.US.2004.CR0068P.FJ469695 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.US.2004.CA51328.GQ208561 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.US.2004.CA42953.GQ207851 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.US.2004.PID_28_Day_254_28.KF470125 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.US.2004.NIa07292004.KC814463 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.US.2004.PID_12_3.KF469466 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.US.2004.HIV_US_BID-V5282_2004.JQ403107 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.US.2004.CA50992.GQ208351 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
C.ZA.2004.04ZAPS205B1.DQ093599 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
C.ZA.2004.SK54.HM593470 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
C.ZA.2004.04ZASK145B1.AY901976 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
C.ZA.2004.SK181.HM593189 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
C.ZA.2004.SK206.HM593214 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
C.ZA.2004.SK143B1.AY703910 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
C.ZA.2004.SK217.HM593225 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
C.ZA.2004.PS209.FJ199765 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
C.ZA.2004.PS186.FJ199744 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
C.ZA.2004.04ZASK234B1.DQ093605 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.JP.2004.C04_5555_5978_1.AB640410 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.JP.2004.DR5913.AB480696 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.JP.2004.DR6174.AB480692 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.JP.2004.DR6175.AB480694 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
01_AE.JP.2004.C04-2069050-1.AB867643 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.JP.2004.04JP.Y232KNG.AB735871 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.JP.2004.DR6089.AB286955 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.JP.2004.C04_5555_6242_1.AB640443 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.JP.2004.IMS0675.JX264493 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.JP.2004.C04_51110198_1.AB640298 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
C.ZA.2007.07M22ZA.GU201816 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
C.ZA.2007.C.704010042.w02.21dps.gp15.JX973857 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
C.ZA.2007.07ZA.KZN07_246147.KF736609 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
C.ZA.2007.patient_179_seq_117.KC422933 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
C.ZA.2007.HIV100397.JN132232 0 0 0 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0
C.ZA.2007.07ZA.GP07_217417.KF736578 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
C.ZA.2007.patient_625_seq_514.KC424107 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
C.ZA.2007.C.705010162.S.0dps.gp14.JX973959 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
C.ZA.2007.CP076300328.HQ994367 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
C.ZA.2007.503_11204_50.KT183268 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.ES.2003.1182_48_7201_20030808.DQ879026 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.ES.2003.1182_48_7082_20030910.DQ878937 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.ES.2003.1182_48_7144_20030805.DQ878982 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.ES.2003.1182_48_7014_20030715.DQ878896 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.ES.2003.1182_48_7205_20030911.DQ879030 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.ES.2003.1182_48_7024_20030915.DQ878909 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.ES.2003.1182_48_7028_20030710.DQ878912 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.ES.2003.1182_48_7207_20030911.DQ879032 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
C.ES.2003.1182_48_7101_20030723.DQ878954 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.ES.2003.243HDO032_ARA.HM460455 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
02_AG.ES.2006.PPT24.GQ241055 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
02_AG.ES.2006.06SP13_322971.EU342796 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.ES.2006.06SP41_321956.EU255414 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.ES.2006.06SP612967.EF583261 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
06_cpx.ES.2006.PN53.GQ240997 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
A1.ES.2006.06SP4327266.JX428567 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.ES.2006.HRC-ADG133.JX271418 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
B.ES.2006.06SP620727.EF583281 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
02_AG.ES.2006.PPT73.GQ241104 0 0 0 0 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
B.ES.2006.HCIII-00288250.JX271402 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

249 rows × 1980 columns


In [39]:
# Make predictions with a trained model. Note that in the MSE comparison above it was
# GradientBoosting, not ExtraTrees, that gave the lowest error; the ExtraTrees
# regressor is used here, but whichever model scored best can be swapped in.
etr_preds = pd.DataFrame(etr.predict(proteases_df))
etr_preds['traits'] = proteases_df.index
etr_preds.columns = ['{0}_resistance'.format(DRUG), 'traits']

In [48]:
etr_preds.set_index('traits', inplace=True)

In [51]:
etr_preds.to_csv('csv/{0}_etr_preds.tsv'.format(DRUG), sep='\t')
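
As a quick sanity check, the exported predictions can be read back in:

pd.read_csv('csv/{0}_etr_preds.tsv'.format(DRUG), sep='\t', index_col='traits')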

In [ ]: