In [1]:
import time

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import cross_val_score
#from sklearn.model_selection import StratifiedShuffleSplit
#from sklearn.model_selection import cross_val_score

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.utils import shuffle

from scipy import interp

%matplotlib inline


/home/irockafe/miniconda2/envs/isaac_revo_healthcare/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [296]:
# Get the positive mode data ~ 80 mb
local_path = '/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/'
project_path = 'revo_healthcare/data/processed/ST000578/ST000578_AN000888_Results.tsv'
data = pd.read_csv(local_path+project_path, sep='\t')

In [297]:
# Parse the class-labels from output
outcome = data.iloc[0,:]
print outcome.unique()
# convert all P. vivax into just P.vivax
susceptible_triplicate = outcome[outcome.str.contains('Current Malaria Infection:P.Vivax') & 
        outcome.str.contains('Chloroquine Resistance:Susceptible')]

resistant_triplicate = outcome[outcome.str.contains('Current Malaria Infection:P.Vivax') & 
        outcome.str.contains('Chloroquine Resistance:Resistant')]

print '\n\nSusceptible', susceptible_triplicate.shape[0] / 3
print 'Resistant', resistant_triplicate.shape[0] / 3

# Select one of the three triplicate samples
resistant = resistant_triplicate[~resistant_triplicate.index.str.contains('\.')]
susceptible = susceptible_triplicate[~susceptible_triplicate.index.str.contains('\.')]
print 'Resistant', resistant.values
# Relabel so that only two classes
resistant[:] = 'Chloroquine resistant'
susceptible[:] = 'Chloroquine susceptible'
print '\n\n Resistant', resistant
print '\n\n susceptible', susceptible
class_labels = pd.concat([resistant, susceptible])
print class_labels


['Factors'
 'Current Malaria Infection:None | Prior Malaria Infection:N/A | Chloroquine Resistance:N/A'
 'Current Malaria Infection:None | Prior Malaria Infection:NO | Chloroquine Resistance:N/A'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:N/A'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:NO | Chloroquine Resistance:N/A'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Susceptible'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:None | Prior Malaria Infection:YES | Chloroquine Resistance:N/A'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:NO | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant']


Susceptible 33
Resistant 31
Resistant [ 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:NO | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant']


 Resistant 2009227    Chloroquine resistant
2008791    Chloroquine resistant
2008700    Chloroquine resistant
2008675    Chloroquine resistant
2008630    Chloroquine resistant
2007260    Chloroquine resistant
2008235    Chloroquine resistant
2009263    Chloroquine resistant
2009264    Chloroquine resistant
2009249    Chloroquine resistant
2009203    Chloroquine resistant
2008712    Chloroquine resistant
2008676    Chloroquine resistant
2008652    Chloroquine resistant
2008625    Chloroquine resistant
2009250    Chloroquine resistant
2009204    Chloroquine resistant
2008715    Chloroquine resistant
2008677    Chloroquine resistant
2008653    Chloroquine resistant
2008626    Chloroquine resistant
2008613    Chloroquine resistant
2008614    Chloroquine resistant
2009267    Chloroquine resistant
2009255    Chloroquine resistant
2009226    Chloroquine resistant
2008727    Chloroquine resistant
2008699    Chloroquine resistant
2008674    Chloroquine resistant
2008629    Chloroquine resistant
2009256    Chloroquine resistant
Name: 0, dtype: object


 susceptible 2009261    Chloroquine susceptible
2008622    Chloroquine susceptible
2009262    Chloroquine susceptible
2009228    Chloroquine susceptible
2009109    Chloroquine susceptible
2008709    Chloroquine susceptible
2008678    Chloroquine susceptible
2008631    Chloroquine susceptible
2008623    Chloroquine susceptible
2009233    Chloroquine susceptible
2009110    Chloroquine susceptible
2008710    Chloroquine susceptible
2008679    Chloroquine susceptible
2008651    Chloroquine susceptible
2008624    Chloroquine susceptible
2007278    Chloroquine susceptible
2008603    Chloroquine susceptible
2009265    Chloroquine susceptible
2009266    Chloroquine susceptible
2009253    Chloroquine susceptible
2009212    Chloroquine susceptible
2008721    Chloroquine susceptible
2008680    Chloroquine susceptible
2008654    Chloroquine susceptible
2008627    Chloroquine susceptible
2009254    Chloroquine susceptible
2009225    Chloroquine susceptible
2008722    Chloroquine susceptible
2008702    Chloroquine susceptible
2008660    Chloroquine susceptible
2008628    Chloroquine susceptible
2008618    Chloroquine susceptible
2008621    Chloroquine susceptible
Name: 0, dtype: object
2009227      Chloroquine resistant
2008791      Chloroquine resistant
2008700      Chloroquine resistant
2008675      Chloroquine resistant
2008630      Chloroquine resistant
2007260      Chloroquine resistant
2008235      Chloroquine resistant
2009263      Chloroquine resistant
2009264      Chloroquine resistant
2009249      Chloroquine resistant
2009203      Chloroquine resistant
2008712      Chloroquine resistant
2008676      Chloroquine resistant
2008652      Chloroquine resistant
2008625      Chloroquine resistant
2009250      Chloroquine resistant
2009204      Chloroquine resistant
2008715      Chloroquine resistant
2008677      Chloroquine resistant
2008653      Chloroquine resistant
2008626      Chloroquine resistant
2008613      Chloroquine resistant
2008614      Chloroquine resistant
2009267      Chloroquine resistant
2009255      Chloroquine resistant
2009226      Chloroquine resistant
2008727      Chloroquine resistant
2008699      Chloroquine resistant
2008674      Chloroquine resistant
2008629      Chloroquine resistant
                    ...           
2009228    Chloroquine susceptible
2009109    Chloroquine susceptible
2008709    Chloroquine susceptible
2008678    Chloroquine susceptible
2008631    Chloroquine susceptible
2008623    Chloroquine susceptible
2009233    Chloroquine susceptible
2009110    Chloroquine susceptible
2008710    Chloroquine susceptible
2008679    Chloroquine susceptible
2008651    Chloroquine susceptible
2008624    Chloroquine susceptible
2007278    Chloroquine susceptible
2008603    Chloroquine susceptible
2009265    Chloroquine susceptible
2009266    Chloroquine susceptible
2009253    Chloroquine susceptible
2009212    Chloroquine susceptible
2008721    Chloroquine susceptible
2008680    Chloroquine susceptible
2008654    Chloroquine susceptible
2008627    Chloroquine susceptible
2009254    Chloroquine susceptible
2009225    Chloroquine susceptible
2008722    Chloroquine susceptible
2008702    Chloroquine susceptible
2008660    Chloroquine susceptible
2008628    Chloroquine susceptible
2008618    Chloroquine susceptible
2008621    Chloroquine susceptible
Name: 0, Length: 64, dtype: object

In [298]:
# Check other subsets of data
susceptible_triplicate = outcome[outcome.str.contains('Current Malaria Infection:P.Vivax') & 
        outcome.str.contains('Chloroquine Resistance:Susceptible')]

resistant_triplicate = outcome[outcome.str.contains('Current Malaria Infection:P.Vivax') & 
        outcome.str.contains('Chloroquine Resistance:Resistant')]

print susceptible_triplicate.unique()
print "Number susceptible", susceptible_triplicate.shape[0] / 3, '\n'
print resistant_triplicate.unique()
print "number resistant", resistant_triplicate.shape[0] /3
print 'num resistant, no prior malaria', resistant_triplicate.str.contains('NO').shape[0] / 3
print 'num resistant, no prior malaria', resistant_triplicate.str.contains('N\/A').shape[0] / 3
print resistant_triplicate.values

# 31 resistant
# 15 with n/a
# 15 with prior malaria
# 1 without prior malaria


[ 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Susceptible']
Number susceptible 33 

[ 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:NO | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant']
number resistant 31
num resistant, no prior malaria 31
num resistant, no prior malaria 31
[ 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:NO | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:NO | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:NO | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant']

In [299]:
def remove_zero_columns(X, threshold=1e-20):
    '''
    Requires dataframe
    '''
    # convert zeros to nan, drop all nan columns, the replace leftover nan with zeros
    X_non_zero_colum = X.replace(0, np.nan).dropna(how='all', axis=1).replace(np.nan, 0)
    #.dropna(how='all', axis=0).replace(np.nan,0)
    return X_non_zero_colum

def zero_fill_half_min(X, threshold=1e-20):
    # Fill zeros with 1/2 the minimum value of that column
    # input dataframe. Add only to zero values
    
    # Get a vector of 1/2 minimum values
    half_min = X[X > threshold].min(axis=0)*0.5
    
    # Add the half_min values to a dataframe where everything that isn't zero is NaN.
    # then convert NaN's to 0
    fill_vals = (X[X < threshold] + half_min).fillna(value=0)
    
    # Add the original dataframe to the dataframe of zeros and fill-values
    X_zeros_filled = X + fill_vals
    return X_zeros_filled

toy = pd.DataFrame([[1,2,3,0],
               [0,0,0,0],
               [0.5,1,0,0]], dtype=float)

toy_no_zeros = remove_zero_columns(toy)
toy_filled_zeros = zero_fill_half_min(toy_no_zeros)
print toy
print toy_no_zeros
print toy_filled_zeros


     0    1    2    3
0  1.0  2.0  3.0  0.0
1  0.0  0.0  0.0  0.0
2  0.5  1.0  0.0  0.0
     0    1    2
0  1.0  2.0  3.0
1  0.0  0.0  0.0
2  0.5  1.0  0.0
      0    1    2
0  1.00  2.0  3.0
1  0.25  0.5  1.5
2  0.50  1.0  1.5

In [336]:
# Grab samples that have correct class labels
df_raw = data[class_labels.index]
print df_raw.head()
raise he
# remove first column and convert to float
df_raw = df_raw.iloc[1:,:].astype('float64')
print "df_raw shape", df_raw.shape
print "class labels", class_labels.shape
# Make sure labels and df_raw-columns are in correct order
print "quick eyeball that y and X are in same order ", zip(df_raw.columns, class_labels.index)[0:5]
assert (df_raw.columns == class_labels.index).all()
print df_raw.head

# Conver to binary class labels
print class_labels.unique()
le = preprocessing.LabelEncoder()
le.fit(class_labels)
y = le.transform(class_labels)
print y

# Convert to numpy array
#X_raw = df_nonzero.as_matrix().T


                                             2009227  \
0  Current Malaria Infection:P.Vivax | Prior Mala...   
1                                        51065.03274   
2                                        34348.85255   
3                                        880363.6806   
4                                                  0   

                                             2008791  \
0  Current Malaria Infection:P.Vivax | Prior Mala...   
1                                        22154.69474   
2                                        49230.82536   
3                                        1057405.156   
4                                        125925.8035   

                                             2008700  \
0  Current Malaria Infection:P.Vivax | Prior Mala...   
1                                                  0   
2                                        70190.04916   
3                                        1362530.042   
4                                                  0   

                                             2008675  \
0  Current Malaria Infection:P.Vivax | Prior Mala...   
1                                        26024.15828   
2                                         36202.7997   
3                                        1204027.617   
4                                        233731.1325   

                                             2008630  \
0  Current Malaria Infection:P.Vivax | Prior Mala...   
1                                                  0   
2                                        242840.7866   
3                                        1139145.689   
4                                                  0   

                                             2007260  \
0  Current Malaria Infection:P.Vivax | Prior Mala...   
1                                        22649.12341   
2                                        41078.10758   
3                                        847851.6719   
4                                                  0   

                                             2008235  \
0  Current Malaria Infection:P.Vivax | Prior Mala...   
1                                                  0   
2                                        77487.93587   
3                                        1114489.331   
4                                                  0   

                                             2009263  \
0  Current Malaria Infection:P.Vivax | Prior Mala...   
1                                        23520.59625   
2                                        92097.95032   
3                                        1168790.522   
4                                                  0   

                                             2009264  \
0  Current Malaria Infection:P.Vivax | Prior Mala...   
1                                        325352.6865   
2                                        43108.38869   
3                                        1864196.648   
4                                        198412.5026   

                                             2009249  \
0  Current Malaria Infection:P.Vivax | Prior Mala...   
1                                        16710.64025   
2                                        32451.82977   
3                                        923555.7857   
4                                         175605.376   

                         ...                          \
0                        ...                           
1                        ...                           
2                        ...                           
3                        ...                           
4                        ...                           

                                             2008654  \
0  Current Malaria Infection:P.Vivax | Prior Mala...   
1                                                  0   
2                                        65779.75978   
3                                        1189716.167   
4                                                  0   

                                             2008627  \
0  Current Malaria Infection:P.Vivax | Prior Mala...   
1                                        189801.0396   
2                                        64947.13096   
3                                         1113456.76   
4                                        162600.7731   

                                             2009254  \
0  Current Malaria Infection:P.Vivax | Prior Mala...   
1                                        35398.58445   
2                                        45624.15079   
3                                        1981599.775   
4                                                  0   

                                             2009225  \
0  Current Malaria Infection:P.Vivax | Prior Mala...   
1                                         81391.2985   
2                                        79544.37776   
3                                        2298585.715   
4                                                  0   

                                             2008722  \
0  Current Malaria Infection:P.Vivax | Prior Mala...   
1                                        47856.12858   
2                                        62747.31446   
3                                        1237544.847   
4                                                  0   

                                             2008702  \
0  Current Malaria Infection:P.Vivax | Prior Mala...   
1                                        52430.35018   
2                                        76765.77108   
3                                         1103177.29   
4                                                  0   

                                             2008660  \
0  Current Malaria Infection:P.Vivax | Prior Mala...   
1                                         46734.1497   
2                                         40119.0363   
3                                        940232.3417   
4                                        71873.94041   

                                             2008628  \
0  Current Malaria Infection:P.Vivax | Prior Mala...   
1                                        32292.29959   
2                                        62304.90538   
3                                         1206203.62   
4                                                  0   

                                             2008618  \
0  Current Malaria Infection:P.Vivax | Prior Mala...   
1                                        245087.8537   
2                                        70368.39229   
3                                        585418.1445   
4                                                  0   

                                             2008621  
0  Current Malaria Infection:P.Vivax | Prior Mala...  
1                                        330209.7279  
2                                        32849.71032  
3                                        1458231.452  
4                                        76562.55983  

[5 rows x 64 columns]
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-336-9f46fa5b4def> in <module>()
      2 df_raw = data[class_labels.index]
      3 print df_raw.head()
----> 4 raise he
      5 # remove first column and convert to float
      6 df_raw = df_raw.iloc[1:,:].astype('float64')

NameError: name 'he' is not defined

Time to preprocess feature table


In [301]:
# TODO PQN normalization, and log-transformation, 
# and some feature selection (above certain threshold of intensity, use principal components), et

def pqn_normalize(X, integral_first=False, plot=False):
    '''
    Take a feature table and run PQN normalization on it
    '''
    # normalize by sum of intensities in each sample first. Not necessary
    if integral_first: 
        sample_sums = np.sum(X, axis=1)
        X = (X / sample_sums[:,np.newaxis])
    
    # Get the median value of each feature across all samples
    mean_intensities = np.median(X, axis=0)
    print 'mean_intensity shape', mean_intensities.shape
    print 'Mean intensity', mean_intensities[0:10]
    # Divde each feature by the median value of each feature - 
    # these are the quotients for each feature
    X_quotients = (X / mean_intensities[np.newaxis,:])
    
    if plot: # plot the distribution of quotients from one sample
        for i in range(1,len(X_quotients[:,1])):
            print 'allquotients reshaped!\n\n', 
            #all_quotients = X_quotients.reshape(np.prod(X_quotients.shape))
            all_quotients = X_quotients[i,:]
            print all_quotients.shape
            x = np.random.normal(loc=0, scale=1, size=len(all_quotients))
            sns.violinplot(all_quotients)
            plt.title("median val: %f\nMax val=%f" % (np.median(all_quotients), np.max(all_quotients)))
            plt.plot( title="median val: ")#%f" % np.median(all_quotients))
            plt.xlim([-0.5, 5])
            plt.show()

    # Define a quotient for each sample as the median of the feature-specific quotients
    # in that sample
    sample_quotients = np.median(X_quotients, axis=1)
    
    # Quotient normalize each samples
    X_pqn = X / sample_quotients[:,np.newaxis]
    return X_pqn

# Make a fake sample, with 2 samples at 1x and 2x dilutions
X_toy = np.array([[1,1,1,],
                  [2,2,2],
                  [3,6,9],
                  [6,12,18]], dtype=float)
print X_toy
print X_toy.reshape(1, np.prod(X_toy.shape))
X_toy_pqn_int = pqn_normalize(X_toy, integral_first=True)
print X_toy_pqn_int

print '\n\n\n'
X_toy_pqn = pqn_normalize(X_toy)
print X_toy_pqn


[[  1.   1.   1.]
 [  2.   2.   2.]
 [  3.   6.   9.]
 [  6.  12.  18.]]
[[  1.   1.   1.   2.   2.   2.   3.   6.   9.   6.  12.  18.]]
mean_intensity shape (3,)
Mean intensity [ 0.25        0.33333333  0.41666667]
[[ 0.33333333  0.33333333  0.33333333]
 [ 0.33333333  0.33333333  0.33333333]
 [ 0.16666667  0.33333333  0.5       ]
 [ 0.16666667  0.33333333  0.5       ]]




mean_intensity shape (3,)
Mean intensity [ 2.5  4.   5.5]
[[ 4.  4.  4.]
 [ 4.  4.  4.]
 [ 2.  4.  6.]
 [ 2.  4.  6.]]

In [302]:
def prevalence_threshold(X, threshold=0.5):
    '''
    input: numpy matrix
    output matrix, but with columns where presence/absence below threshold removed'''
    samples_present = (X > 1e-20).sum(axis=0)
    percent = np.divide(samples_present, float(X.shape[0]))
    print percent
    above_threshold = percent > threshold
    print above_threshold
    # only keep values b
    output = X[:,above_threshold]
    return output
    
test = np.array([[1,2,3], [0,0,3], [0,2,3]])
print 'Input\n', test
prevalence_threshold(test)


Input
[[1 2 3]
 [0 0 3]
 [0 2 3]]
[ 0.33333333  0.66666667  1.        ]
[False  True  True]
Out[302]:
array([[2, 3],
       [0, 3],
       [2, 3]])

In [303]:
# preprocess feature table

# remove zero features
df_nonzero = remove_zero_columns(df_raw.T)
print 'Zero full shape', df_raw.T.shape
print 'Zeros removed shape', df_nonzero.shape
X_nonzero = df_nonzero.as_matrix()
print X_nonzero.shape


# standardize input
X_scaled = preprocessing.scale(X_nonzero)
print 'scaled mean', X_scaled.mean(axis=0)
print 'scaled std', X_scaled.std(axis=0)
print X_scaled[0:4]
# require feature to be present in at least 50% of samples
X_50percent = prevalence_threshold(X_nonzero, threshold=0.5)
print "50% prevalence cutoff", X_50percent.shape

X_80 = prevalence_threshold(X_nonzero, threshold=0.8)
print "80% prevalence cutoff", X_80.shape
# PQN normalize?
#print X_nonzero[0:5]
#X_pqn = pqn_normalize(X_nonzero)
#print X_pqn
# Final decision to use


Zero full shape (64, 20347)
Zeros removed shape (64, 18041)
(64, 18041)
scaled mean [ -2.77555756e-17  -2.08166817e-17  -6.93889390e-17 ...,  -5.55111512e-17
  -2.08166817e-17   2.60208521e-17]
scaled std [ 1.  1.  1. ...,  1.  1.  1.]
[[-0.22015996 -0.67058828 -1.04242515 ..., -0.81946284 -0.50490175
  -0.76044203]
 [-0.51361324 -0.27116883 -0.59282246 ..., -0.17879717 -0.3860976
   0.07657816]
 [-0.73849362  0.29135884  0.1820523  ...,  0.17456447 -0.50490175
   0.0765257 ]
 [-0.4743364  -0.62082992 -0.22046986 ...,  0.23829698 -0.50490175
  -0.57318127]]
[ 0.75      0.953125  0.984375 ...,  0.640625  0.34375   0.578125]
[ True  True  True ...,  True False  True]
50% prevalence cutoff (64, 4541)
[ 0.75      0.953125  0.984375 ...,  0.640625  0.34375   0.578125]
[False  True  True ..., False False False]
80% prevalence cutoff (64, 2103)

Define Random forest function <h/2> TODO: turn this into a pipeline, silly


In [133]:
def roc_curve_cv(X, y, clf, cross_val,
                path='/home/irockafe/Desktop/roc.pdf',
                save=False, plot=True):  
    t1 = time.time()
    # collect vals for the ROC curves
    tpr_list = []
    mean_fpr = np.linspace(0,1,100)
    auc_list = []
    
    # Get the false-positive and true-positive rate
    for i, (train, test) in enumerate(cross_val):
        clf.fit(X[train], y[train])
        y_pred = clf.predict_proba(X[test])[:,1]
        
        # get fpr, tpr
        fpr, tpr, thresholds = roc_curve(y[test], y_pred)
        roc_auc = auc(fpr, tpr)
        #print 'AUC', roc_auc
        #sns.plt.plot(fpr, tpr, lw=10, alpha=0.6, label='ROC - AUC = %0.2f' % roc_auc,)
        #sns.plt.show()
        tpr_list.append(interp(mean_fpr, fpr, tpr))
        tpr_list[-1][0] = 0.0
        auc_list.append(roc_auc)
        
        if (i % 10 == 0):
            print '{perc}% done! {time}s elapsed'.format(perc=100*float(i)/cross_val.n_iter, time=(time.time() - t1))
        
            
        
        
    # get mean tpr and fpr
    mean_tpr = np.mean(tpr_list, axis=0)
    # make sure it ends up at 1.0
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(auc_list)
    
    if plot:
        # plot mean auc
        plt.plot(mean_fpr, mean_tpr, label='Mean ROC - AUC = %0.2f $\pm$ %0.2f' % (mean_auc, 
                                                                                       std_auc),
                    lw=5, color='b')

        # plot luck-line
        plt.plot([0,1], [0,1], linestyle = '--', lw=2, color='r',
                    label='Luck', alpha=0.5) 

        # plot 1-std
        std_tpr = np.std(tpr_list, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.2,
                            label=r'$\pm$ 1 stdev')

        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC curve, {iters} iterations of {cv} cross validation'.format(
            iters=cross_val.n_iter, cv='{train}:{test}'.format(test=cross_val.test_size, train=(1-cross_val.test_size)))
                 )
        plt.legend(loc="lower right")

        if save:
            plt.savefig(path,  format='pdf')


        plt.show()
    return tpr_list, auc_list, mean_fpr

In [305]:
rf_estimators = 500
n_iter = 25
test_size = 0.3
random_state = 1
cross_val_rf = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)
clf_rf = RandomForestClassifier(n_estimators=rf_estimators, random_state=random_state)

print cross_val_rf.n_iter
print cross_val_rf.test_size

# unscaled
tpr_vals, auc_vals, mean_fpr = roc_curve_cv(X_nonzero, y, clf_rf, cross_val_rf,
                                           save=False)


25
0.3
0.0% done! 1.81439590454s elapsed
40.0% done! 20.7781660557s elapsed
80.0% done! 39.6885519028s elapsed

In [304]:
# what about with a bit of feature pruning?


rf_estimators = 2000
n_iter = 50
test_size = 0.3
random_state = 1
cross_val_rf = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)
clf_rf = RandomForestClassifier(n_estimators=rf_estimators, random_state=random_state)

print cross_val_rf.n_iter
print cross_val_rf.test_size

# unscaled
tpr_vals, auc_vals, mean_fpr = roc_curve_cv(X_50percent, y, clf_rf, cross_val_rf,
                                           save=False)


50
0.3
0.0% done! 6.23399305344s elapsed
20.0% done! 76.0771548748s elapsed
40.0% done! 142.417345047s elapsed
60.0% done! 208.340952873s elapsed
80.0% done! 272.890228987s elapsed

In [193]:
# what about with a bit of feature pruning?
rf_estimators = 1000
n_iter = 25
test_size = 0.3
random_state = 1
cross_val_rf = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)
clf_rf = RandomForestClassifier(n_estimators=rf_estimators, random_state=random_state)

print cross_val_rf.n_iter
print cross_val_rf.test_size

# unscaled
tpr_vals, auc_vals, mean_fpr = roc_curve_cv(X_80, y, clf_rf, cross_val_rf,
                                           save=False)


25
0.3
0.0% done! 3.23655700684s elapsed
40.0% done! 35.9883110523s elapsed
80.0% done! 69.0458688736s elapsed

Plot the mz/rt space


In [267]:
data = data.iloc[1:, :]
data = data.set_index(data.iloc[:,0])

def prevalence_threshold(df, threshold=0.5):
    '''
    features x samples df
    '''
    output = df[((df > 1e-20).sum(axis=1) / df.shape[1]) > threshold]
    return output

df_raw = data[class_labels.index]

#print df_raw.head
df_50percent_prevalence = prevalence_threshold(df_raw, threshold=0.5)
print df_50percent_prevalence.index


Index([u'89.5059_435.1', u'90.0074_371.2', u'90.0075_8.6', u'90.0076_439.3',
       u'90.0902_271.2', u'90.5060_435.3', u'90.5060_7.9', u'90.5060_556.1',
       u'90.5090_369.6', u'90.5092_17.6',
       ...
       u'1565.0444_430.9', u'1566.0391_432.0', u'1577.8963_418.0',
       u'1578.8915_418.4', u'1735.1634_417.2', u'1735.6551_417.7',
       u'1762.1148_417.4', u'1763.1075_417.3', u'1983.3337_417.2',
       u'1984.3186_417.5'],
      dtype='object', name=u'Samples', length=5151)

In [251]:
mz_rt_df = pd.DataFrame([i.split('_') for i in df_50percent_prevalence.index],
            columns=['mz', 'rt'], dtype='float64')

plt.scatter(x=mz_rt_df['rt'], y=mz_rt_df['mz'], s=2
           )
plt.xlabel('rt')
plt.ylabel('mz')
plt.show()



In [ ]:


In [334]:
# Select between 350 and 350
df_slice = mz_rt_df[(mz_rt_df['rt'] > 350) & (mz_rt_df['rt'] < 475)]
print mz_rt_df.shape
print asdf.shape
print df_slice.shape

# convert df_slice into X and run rf on it
print 'df-slice', df_slice[-8:-1]
print df_slice.shape
print 'shape of x-50', X_nonzero.shape


(5155, 2)
(2923, 2)
(1390, 2)
df-slice              mz     rt
5147  1577.8963  418.0
5148  1578.8915  418.4
5149  1735.1634  417.2
5150  1735.6551  417.7
5151  1762.1148  417.4
5152  1763.1075  417.3
5153  1983.3337  417.2
(1390, 2)
shape of x-50 (64, 18041)

Try to back out the 22 patients that are repeats...?

Or just wait for Karan Uppal to give the info


In [338]:
# Get the positive mode data ~ 80 mb
local_path = '/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/'
project_path = 'revo_healthcare/data/processed/ST000578/ST000578_AN000888_Results.tsv'
data = pd.read_csv(local_path+project_path, sep='\t')

In [344]:
df_50percent_prevalence
mz_rt_df = pd.DataFrame([i.split('_') for i in df_50percent_prevalence.index],
            columns=['mz', 'rt'], dtype='float64',
                       index=df_50percent_prevalence.index)
mz_rt_df
df_50percent_prevalence_mzrt = pd.concat([mz_rt_df, df_50percent_prevalence],
                                        axis=1)
df_50percent_prevalence_mzrt


Out[344]:
mz rt 2009227 2008791 2008700 2008675 2008630 2007260 2008235 2009263 ... 2008654 2008627 2009254 2009225 2008722 2008702 2008660 2008628 2008618 2008621
Samples
89.5059_435.1 89.5059 435.1 0 0 8257841.742 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
90.0074_371.2 90.0074 371.2 10498841.66 9841835.671 9580520.612 10914163.22 9153719.487 10828925.47 11772214.82 10512799.3 ... 7026674.414 7032003.049 7719911.028 7256920.416 8141573.982 7984498.671 9110181.769 10796345.17 8011757.469 6165551.241
90.0075_8.6 90.0075 8.6 3965161.294 2781320.516 1717467.517 4557703.401 3319942.933 2346799.773 16144.20002 3191167.455 ... 3361374.609 3225108.739 3234679.55 2257954.805 2829996.817 3025325.917 3964125.1 3894615.297 3767459.37 1860727.982
90.0076_439.3 90.0076 439.3 0 0 1074652.702 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
90.0902_271.2 90.0902 271.2 149087.1152 0 0 0 0 74752.09517 0 0 ... 0 134253.2046 0 0 0 0 226999.9155 0 224761.5756 0
90.5060_435.3 90.5060 435.3 0 0 884390.5513 0 0 1974046.316 1700740.002 0 ... 0 0 2882807.655 780882.399 0 0 0 1167876.502 0 0
90.5060_7.9 90.5060 7.9 6058519.493 2231072.165 423239.0544 3850227.892 5264008.161 4112418.498 2444736.853 5342918.932 ... 6515632.634 0 4333684.698 3044488.261 0 3399020.112 4821618.007 6331004.895 6149395.473 4007774.961
90.5060_556.1 90.5060 556.1 0 0 4244221.354 4864197.202 0 4951916.472 4896935.91 1427984.56 ... 0 2333475.243 1396999.005 0 4685815.578 4315130.185 1679274.02 3989390.666 0 0
90.5090_369.6 90.5090 369.6 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 125527.8799 238728.988 0 0 0
90.5092_17.6 90.5092 17.6 0 115455.5355 0 0 28068.56722 0 0 0 ... 0 0 14851.66607 0 0 120916.2108 128653.7856 78832.70997 0 0
90.5247_196.0 90.5247 196.0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
90.5248_375.3 90.5248 375.3 3021854.317 0 0 2220677.433 0 0 0 0 ... 406403.6735 0 0 0 0 3537445.909 0 7253540.674 0 3327266.86
90.5249_36.0 90.5249 36.0 5448438.699 7198422.653 9807786.115 6275756.742 4291497.217 6524701.024 0 6394814.163 ... 8017484.285 8698940.475 6447841.305 10352306.69 9928524.935 7441818.805 4932827.72 8323500.152 7137622.677 0
90.9756_43.2 90.9756 43.2 5969641.319 7682736.664 6454374.441 5167697.656 7710570.018 5836218.512 5608189.373 4921445.729 ... 6676109.719 5564232.26 6342556.3 5943083.312 6767760.897 5100041.402 6431736.961 5525051.032 5507681.851 5640639.179
91.0077_10.9 91.0077 10.9 350121.5799 280759.0341 265857.4427 350320.5165 187919.9025 212206.6134 246438.1215 262296.1873 ... 242635.6677 221675.7999 229490.9118 248242.2838 224932.697 283475.8571 335048.9052 255997.4283 407961.83 357468.8174
91.0077_573.9 91.0077 573.9 0 0 0 0 0 71452.65731 0 0 ... 0 0 0 0 0 0 0 0 0 0
91.0225_34.9 91.0225 34.9 414013.9793 0 951154.1525 621837.0755 440505.0904 481915.8709 1085349.859 472663.6129 ... 701208.7806 695549.5147 593596.924 1011122.885 785552.2501 674660.9142 234798.2155 722024.3645 659349.3175 368513.9601
91.0225_308.7 91.0225 308.7 744811.4944 783890.241 0 0 0 0 1044914.369 0 ... 0 0 784667.4444 0 0 0 0 0 0 0
91.0266_36.8 91.0266 36.8 323496.014 419842.6729 600436.0097 393878.3168 262175.7103 421403.3179 550224.7103 397568.9655 ... 575749.0563 551845.2813 436312.5506 575679.7514 489581.8057 474608.3407 387506.319 545998.9721 489848.0843 198919.9106
91.0534_374.6 91.0534 374.6 0 0 0 0 0 37166.76623 297235.8426 0 ... 51769.32302 55167.76522 0 0 55177.71829 0 0 0 0 0
91.5011_18.2 91.5011 18.2 0 0 0 0 8205.302463 59620.41605 63188.63976 0 ... 0 0 6565.430852 0 0 0 40812.97441 38493.93123 0 0
91.5036_434.7 91.5036 434.7 0 0 0 0 0 0 650499.8168 0 ... 0 0 0 0 0 0 0 0 0 0
91.5037_8.1 91.5037 8.1 2113778.491 1035747.08 596398.0602 1648143.578 1999672.742 1023945.405 0 1702671.468 ... 1835541.686 1965144.129 1578519.992 1254855.662 1291675.706 1282284.369 1630517.095 2623260.837 1738926.971 985253.1105
91.5037_555.0 91.5037 555.0 0 796994.0787 1678355.833 1003823.215 1746155.359 889175.3619 1773346.163 766091.1939 ... 803211.2381 989886.4252 496448.3167 779719.8628 1759576.634 1075611.757 563945.8533 1605973.717 509069.1679 0
91.9790_41.8 91.9790 41.8 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
92.0049_573.6 92.0049 573.6 0 0 0 0 0 0 0 0 ... 0 73328.39513 0 143190.6162 56492.28231 0 0 0 0 57865.94175
92.0389_37.6 92.0389 37.6 0 0 0 0 0 55056.23856 31438.43584 34584.76624 ... 90017.77892 27108.19628 22884.95239 55619.15778 185537.0365 35938.26805 0 34736.3584 90954.89343 14103.02042
92.5025_10.1 92.5025 10.1 196277.8927 87930.64936 204554.4669 140074.234 75841.48121 116998.9605 203119.1601 203405.359 ... 125295.0378 98060.14562 129047.4041 188838.3328 72702.59088 185231.1968 117949.371 148770.824 191671.5297 202568.1988
92.5025_562.9 92.5025 562.9 0 145322.964 0 0 0 71125.79177 136892.2807 152640.6671 ... 0 109437.3279 0 138676.3812 0 0 0 0 0 122478.3809
92.5025_448.2 92.5025 448.2 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1333.9466_432.0 1333.9466 432.0 363706 254383 629712 380184 785763 0 125991 235487 ... 0 0 136440 185800 244905 0 259650 0 441671 0
1346.6934_39.8 1346.6934 39.8 234329 263865 894916 382636 257716 0 585774 0 ... 285468 182446 0 200518 0 12980.6 459267 0 0 0
1362.6598_41.6 1362.6598 41.6 369198 500438 0 0 0 515858 89219.1 104976 ... 208159 213668 0 0 22031.9 418287 565639 250577 0 838983
1487.5033_417.3 1487.5033 417.3 277395 1.21835e+06 1.56868e+06 1.4268e+06 1.50004e+06 433265 187134 1.27366e+06 ... 984057 2.2708e+06 2.49157e+06 2.90481e+06 374955 2.33353e+06 679352 1.08524e+06 5.33895e+06 1.22672e+06
1488.0020_417.5 1488.0020 417.5 1.11106e+06 7.14363e+06 7.32453e+06 7.466e+06 1.05085e+07 2.74557e+06 717243 7.20509e+06 ... 4.55781e+06 1.16489e+07 1.39584e+07 1.35243e+07 3.2937e+06 9.34309e+06 4.25661e+06 4.85552e+06 1.65558e+07 4.43804e+06
1488.5009_417.4 1488.5009 417.4 0 707461 775796 674553 617459 80112.2 0 710955 ... 895621 1.27398e+06 3.90967e+06 945761 329427 957294 507604 235312 3.30025e+06 242450
1490.0102_417.4 1490.0102 417.4 237892 942951 861405 1.09992e+06 904029 304760 354768 740417 ... 443855 1.2584e+06 1.7558e+06 2.18381e+06 0 1.58963e+06 878667 0 0 826558
1490.9995_417.5 1490.9995 417.5 0 502305 1.94181e+06 0 1.73085e+06 0 0 947171 ... 391002 1.11622e+06 252711 1.2137e+06 0 820919 909906 626392 3.70908e+06 123263
1497.9852_418.5 1497.9852 418.5 18198.8 725228 399984 908093 711426 706436 0 730910 ... 870677 757316 715944 2.49575e+06 551390 895318 400494 468692 517338 34710.9
1498.4782_418.0 1498.4782 418.0 1.473e+06 47696 562845 1.56137e+06 578754 0 0 86531.5 ... 316964 416897 2.95054e+06 1.32693e+06 251191 504571 444715 0 1.22777e+06 331470
1498.9934_417.3 1498.9934 417.3 586778 794281 318663 320658 292070 283998 0 662019 ... 606554 216305 525353 633034 0 219029 758872 220786 897147 1.05328e+06
1499.4869_417.4 1499.4869 417.4 13173.5 406366 49873.9 0 722235 202596 0 855035 ... 633549 396916 336799 837536 0 704534 349528 0 0 0
1506.4662_417.1 1506.4662 417.1 0 474581 660630 1.14833e+06 267524 0 0 139587 ... 579881 756347 836195 1.10743e+06 0 198979 591199 0 353258 935934
1506.9634_417.3 1506.9634 417.3 0 0 2.14088e+06 432544 1.7441e+06 0 0 511794 ... 1.18723e+06 1.94759e+06 588442 668157 0 0 0 0 0 0
1508.9547_418.5 1508.9547 418.5 0 0 0 0 0 0 0 0 ... 1.36492e+06 1.70563e+06 0 0 668008 1.48265e+06 868447 839569 2.10961e+06 1.22649e+06
1509.4864_417.5 1509.4864 417.5 272379 915517 371483 591921 1.47227e+06 1.35451e+06 0 1.10731e+06 ... 0 1.31202e+06 723156 635745 432109 0 1.52411e+06 633306 0 0
1509.9852_418.0 1509.9852 418.0 619030 1.08713e+06 1.08287e+06 1.17719e+06 1.06641e+06 977258 152933 1.5519e+06 ... 1.21676e+06 1.88511e+06 1.60785e+06 1.45525e+06 1.24603e+06 1.45683e+06 1.12249e+06 1.00433e+06 0 1.50663e+06
1510.9611_417.9 1510.9611 417.9 0 0 0 0 0 0 0 0 ... 499558 662341 625116 0 467204 487650 308450 264955 821691 874620
1514.4625_417.4 1514.4625 417.4 368950 963761 792611 1.30887e+06 896520 560616 218595 926808 ... 1.14567e+06 1.49725e+06 1.82213e+06 1.66125e+06 517588 1.03396e+06 533479 868694 0 1.40544e+06
1514.9637_417.6 1514.9637 417.6 356141 2.01115e+06 762706 4.31643e+06 1.06663e+06 567262 159991 1.14849e+06 ... 0 1.19869e+06 1.61884e+06 4.82512e+06 0 1.23353e+06 559614 1.57724e+06 0 0
1565.0444_430.9 1565.0444 430.9 1478.76 298353 25286.9 147028 320048 0 0 43190.4 ... 0 213184 181380 1.12326e+06 110779 0 0 0 902647 0
1566.0391_432.0 1566.0391 432.0 85943.4 587645 0 331745 263775 0 0 189343 ... 0 81710.9 408747 332965 131796 155424 145495 131871 633759 0
1577.8963_418.0 1577.8963 418.0 342378 204815 776990 252045 257158 499335 383471 186745 ... 263563 0 429490 477172 197101 744663 352338 309261 0 561984
1578.8915_418.4 1578.8915 418.4 469602 458640 0 144189 674929 295683 236287 341081 ... 940247 317598 353704 353458 389799 0 564795 0 0 0
1735.1634_417.2 1735.1634 417.2 358808 843433 925776 1.00208e+06 1.14441e+06 242722 0 909280 ... 469164 1.90716e+06 1.40435e+06 1.71562e+06 253114 1.47031e+06 910961 421894 2.81354e+06 1.40064e+06
1735.6551_417.7 1735.6551 417.7 0 1.25331e+06 0 1.6635e+06 1.22537e+06 0 0 1.0357e+06 ... 607725 2.31893e+06 2.17909e+06 1.83453e+06 509136 1.0987e+06 1.31286e+06 1.01901e+06 2.93617e+06 856519
1762.1148_417.4 1762.1148 417.4 0 701023 388293 475046 394144 746369 0 666752 ... 0 1.7547e+06 632190 515846 0 936565 1.0517e+06 0 633518 0
1763.1075_417.3 1763.1075 417.3 0 0 510225 0 0 0 0 176634 ... 520883 118599 367589 1.70472e+06 0 305821 43446.4 348186 488637 0
1983.3337_417.2 1983.3337 417.2 1036.54 460889 714522 760267 874229 228267 0 475948 ... 294456 845642 1.41258e+06 0 0 0 1.07847e+06 613841 2.0938e+06 0
1984.3186_417.5 1984.3186 417.5 0 322134 322114 72068.8 697438 0 0 0 ... 45538.6 565770 686524 691060 0 374726 17701.9 249300 984439 0

5151 rows × 66 columns


In [353]:


In [354]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter

def get_rt_slice(df, rt_bounds):
    '''
    PURPOSE: 
        Given a tidy feature table with 'mz' and 'rt' column headers, 
        retain only the features whose rt is between rt_left 
        and rt_right
    INPUT: 
        df - a tidy pandas dataframe with 'mz' and 'rt' column 
            headers
        rt_left, rt_right: the boundaries of your rt_slice, in seconds
    '''
    out_df = df.loc[ (df['rt'] > rt_bounds[0]) & 
                    (df['rt'] < rt_bounds[1])]
    return out_df

def plot_mz_rt(df, rt_bounds, path='/home/irockafe/Desktop/poop.pdf'):
    # the random data
    x = df['rt']
    y = df['mz']
    print np.max(x)
    print np.max(y)
    nullfmt = NullFormatter()         # no labels

    # definitions for the axes
    left, width = 0.1, 0.65
    bottom, height = 0.1, 0.65
    bottom_h = left_h = left + width + 0.02

    rect_scatter = [left, bottom, width, height]
    rect_histx = [left, bottom_h, width, 0.2]
    rect_histy = [left_h, bottom, 0.2, height]

    # start with a rectangular Figure
    #fig = plt.figure(1, figsize=(8, 8))
    fig = plt.figure(1, figsize=(10,10))
    
    axScatter = plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)

    # no labels
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    # the scatter plot:
    axScatter.scatter(x, y, s=1)

    # now determine nice limits by hand:
    binwidth = 0.25

    #xymax = np.max([np.max(np.fabs(x)), np.max(np.fabs(y))])

    #lim = (int(xymax/binwidth) + 1) * binwidth

    x_min = np.min(x)-50
    x_max = np.max(x)+50
    axScatter.set_xlim(x_min, x_max )
    y_min = np.min(y)-50
    y_max = np.max(y)+50
    axScatter.set_ylim(y_min, y_max)

    # Add vertical red line between 750-1050 retention time
    '''
    plt.plot([0,1], [0,1], linestyle = '--', lw=2, color='r',
                        label='Luck', alpha=0.5)
    '''
    print 'ymin: ', y_min
    # Add vertical/horizontal lines to scatter and histograms
    axScatter.axvline(x=rt_bounds[0], lw=2, color='r', alpha=0.5)
    axScatter.axvline(x=rt_bounds[1], lw=2, color='r', alpha=0.5)

    axHistx.axvline(x=rt_bounds[0], lw=2, color='r', alpha=0.5)
    axHistx.axvline(x=rt_bounds[1], lw=2, color='r', alpha=0.5)

    #bins = np.arange(-lim, lim + binwidth, binwidth)
    bins = 100
    axHistx.hist(x, bins=bins)
    axHisty.hist(y, bins=bins, orientation='horizontal')

    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    axScatter.set_ylabel('m/z', fontsize=30)
    axScatter.set_xlabel('Retention Time', fontsize=30)

    axHistx.set_ylabel('# of Features', fontsize=20)
    axHisty.set_xlabel('# of Features', fontsize=20)
        
    plt.savefig(path, 
                format='pdf')
    plt.show()


rt_slice = (350, 475)
df_50percent_slice = get_rt_slice(df_50percent_prevalence_mzrt, rt_slice)

# plot selection 
plot_mz_rt(df_50percent_prevalence_mzrt, (rt_slice))


596.6
1984.3186
ymin:  39.5059

In [394]:
X_50_slice = df_50percent_slice.T.as_matrix()
print "slice shape", X_50_slice.shape



rf_estimators = 1000
n_iter = 25
test_size = 0.3
random_state = 1
cross_val_rf = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)
clf_rf = RandomForestClassifier(n_estimators=rf_estimators, random_state=random_state)

print cross_val_rf.n_iter
print cross_val_rf.test_size

# unscaled
tpr_vals, auc_vals, mean_fpr = roc_curve_cv(X_50_slice, y, clf_rf, cross_val_rf,
                                           save=False)


slice shape (66, 2229)
25
0.3
0.0% done! 4.05469584465s elapsed
40.0% done! 37.6644158363s elapsed
80.0% done! 70.914978981s elapsed

Same story holds as before: By running only a subset of chromatography, you can still do just as well classifying

Try another subset of retention time


In [401]:
def slice_and_predict(df, y, rt_slice,
                     rf_estimators=1000, 
                     n_iter=10, test_size=0.3, random_state=1):
    df_slice = get_rt_slice(df, rt_slice)

    # plot selection 
    plot_mz_rt(df, (rt_slice))
    # remove mz and rt from dataframe...
    df_slice_no_mzrt = df_slice.drop(['mz', 'rt'], axis=1)
    X_slice = df_slice_no_mzrt.T.as_matrix()
    print "slice shape", X_slice.shape

    # Run RF
    rf_estimators = rf_estimators
    n_iter = n_iter
    test_size = test_size
    random_state = random_state
    cross_val_rf = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)
    clf_rf = RandomForestClassifier(n_estimators=rf_estimators, random_state=random_state)
    tpr_vals, auc_vals, mean_fpr = roc_curve_cv(X_slice, y, clf_rf, cross_val_rf,
                                               save=False)


rt_slice = (0,100)
df_50percent_prevalence_mzrt
#df_50percent_prevalence_mzrt.drop(['mz', 'rt'], axis=1)
slice_and_predict(df_50percent_prevalence_mzrt, y, rt_slice)


596.6
1984.3186
ymin:  39.5059
<bound method DataFrame.head of                     2009227      2008791      2008700      2008675  \
Samples                                                              
90.0075_8.6     3965161.294  2781320.516  1717467.517  4557703.401   
90.5060_7.9     6058519.493  2231072.165  423239.0544  3850227.892   
90.5092_17.6              0  115455.5355            0            0   
90.5249_36.0    5448438.699  7198422.653  9807786.115  6275756.742   
90.9756_43.2    5969641.319  7682736.664  6454374.441  5167697.656   
91.0077_10.9    350121.5799  280759.0341  265857.4427  350320.5165   
91.0225_34.9    414013.9793            0  951154.1525  621837.0755   
91.0266_36.8     323496.014  419842.6729  600436.0097  393878.3168   
91.5011_18.2              0            0            0            0   
91.5037_8.1     2113778.491   1035747.08  596398.0602  1648143.578   
91.9790_41.8              0            0            0            0   
92.0389_37.6              0            0            0            0   
92.5025_10.1    196277.8927  87930.64936  204554.4669   140074.234   
92.5213_33.7    773106.0023  189058.7431  233366.6464  127477.4813   
93.5024_15.7    8363.454429  124910.9258  79832.47732  5702.521546   
93.5246_36.8    27273.71485  27701.95149   31562.4255  53418.35481   
94.0441_38.1    237182.9477  351465.6874  679253.2611  390439.3065   
94.5141_36.3    21045.32852  20081.18228            0            0   
94.5447_38.7              0  132007.0237  361311.9523            0   
95.0152_57.8              0            0            0            0   
95.0230_16.6              0  253450.6983  219615.8261  186455.0579   
95.0427_38.9    45381.73524            0            0  58319.49927   
96.0797_94.5              0  50255.51454            0  124948.3142   
96.5139_14.9    45801.87214  91960.15387            0            0   
96.9207_46.7    2229178.891  2560462.625  3090669.778  2593588.989   
96.9999_37.5    32568.47941  42686.30616  100703.8001  64140.23025   
97.5134_8.3      616621.111  580573.7253  263931.6335  426650.2366   
97.9314_18.4    61451.37543  77527.53581            0  83520.84122   
97.9678_10.1    177791.9569            0  98961.88121   220931.936   
97.9904_35.9              0  5478627.934            0  17963823.89   
...                     ...          ...          ...          ...   
1150.7485_41.3       142016       317447       836359       303915   
1152.6906_41.8  1.34367e+06       185594       318880       151371   
1159.7096_41.8       716867      21283.9       100743            0   
1160.7025_42.6       325445            0       320834            0   
1164.6497_43.5       294450       318484            0       656525   
1166.7250_40.5            0            0            0            0   
1172.7414_44.5            0            0            0            0   
1176.6816_42.0       231377       105894            0       460137   
1182.7060_40.1            0            0            0       496689   
1184.7142_42.1       249643       314052       132478       204471   
1188.7304_40.8       411639       566085       763949       323268   
1200.6901_42.3       314384       667300       419340       224707   
1204.7010_40.7       276926       425151       177740       404186   
1210.7114_41.6       289660            0       316982       816834   
1216.6546_42.7            0            0            0       402922   
1226.6901_42.2  1.36364e+06       476723       641828  1.17269e+06   
1234.7163_39.7            0       245003            0            0   
1242.6644_42.5       891287            0            0      3440.72   
1250.6978_41.0            0       648949       139276       264940   
1252.6974_42.1       328404            0       358652       267347   
1256.7109_40.4       364786       504363       306756       654547   
1262.7252_41.3       330996       472009            0       272502   
1272.6851_41.2       646273       366600       402188       273924   
1274.4884_13.6            0            0            0       139544   
1278.7022_41.9       450561       563901       962647       804148   
1294.6764_42.5       627729       296142       898464            0   
1324.6985_41.2       368360            0       306798       589322   
1330.7125_41.2       790961       324127            0       106687   
1346.6934_39.8       234329       263865       894916       382636   
1362.6598_41.6       369198       500438            0            0   

                    2008630      2007260      2008235      2009263  \
Samples                                                              
90.0075_8.6     3319942.933  2346799.773  16144.20002  3191167.455   
90.5060_7.9     5264008.161  4112418.498  2444736.853  5342918.932   
90.5092_17.6    28068.56722            0            0            0   
90.5249_36.0    4291497.217  6524701.024            0  6394814.163   
90.9756_43.2    7710570.018  5836218.512  5608189.373  4921445.729   
91.0077_10.9    187919.9025  212206.6134  246438.1215  262296.1873   
91.0225_34.9    440505.0904  481915.8709  1085349.859  472663.6129   
91.0266_36.8    262175.7103  421403.3179  550224.7103  397568.9655   
91.5011_18.2    8205.302463  59620.41605  63188.63976            0   
91.5037_8.1     1999672.742  1023945.405            0  1702671.468   
91.9790_41.8              0            0            0            0   
92.0389_37.6              0  55056.23856  31438.43584  34584.76624   
92.5025_10.1    75841.48121  116998.9605  203119.1601   203405.359   
92.5213_33.7    87162.15831  138814.7661   187789.747  139786.8405   
93.5024_15.7              0  76968.31876  78529.15582  112017.0127   
93.5246_36.8    84371.93364  43383.64467  72514.67406  37776.90846   
94.0441_38.1    379367.2668  468965.7531  413461.1879  427036.1823   
94.5141_36.3              0            0            0  19881.56627   
94.5447_38.7    29494.28423            0            0  18741.45921   
95.0152_57.8              0            0            0            0   
95.0230_16.6    230590.6664  194719.6009  231039.6628  202508.7813   
95.0427_38.9              0            0            0            0   
96.0797_94.5              0            0            0            0   
96.5139_14.9    75305.55127  122974.0817  14113.79822  7531.516358   
96.9207_46.7    2307465.897   2391115.04  2282793.731  2178811.992   
96.9999_37.5    43443.45564   89798.2985  82841.35288  73589.10495   
97.5134_8.3     328030.4006  350734.0581  355718.0382  412500.7903   
97.9314_18.4    21356.74107            0  2761.592027            0   
97.9678_10.1    251764.8698  164091.5434  145010.7167            0   
97.9904_35.9    12214618.72            0  16715439.85            0   
...                     ...          ...          ...          ...   
1150.7485_41.3       253252       153609            0       241698   
1152.6906_41.8       221759       712906       437720       624170   
1159.7096_41.8       118403      50574.6            0       160497   
1160.7025_42.6       243704       394390            0       334405   
1164.6497_43.5       647271            0       234210            0   
1166.7250_40.5            0       433500       197358            0   
1172.7414_44.5            0            0            0       321350   
1176.6816_42.0       693270       193759       456820            0   
1182.7060_40.1       199376       660619       345962            0   
1184.7142_42.1       310800       162185       226698       171801   
1188.7304_40.8  1.61999e+06       533974       276045       431862   
1200.6901_42.3       287970        81408       158412       389458   
1204.7010_40.7       241683       281969       301487       243510   
1210.7114_41.6       652220       560594       337794       197932   
1216.6546_42.7  1.22842e+06            0       154846  1.20079e+06   
1226.6901_42.2            0       477903       800111       241062   
1234.7163_39.7       441357            0            0            0   
1242.6644_42.5       211118       116803       500413       162709   
1250.6978_41.0       117614       463785       698114            0   
1252.6974_42.1       317729            0       263105            0   
1256.7109_40.4       602697       419694       185495       593161   
1262.7252_41.3       350081            0            0       294905   
1272.6851_41.2       490005       193553       187344       310078   
1274.4884_13.6            0       195070      64733.7      46737.4   
1278.7022_41.9       738674      49401.4       286510       717521   
1294.6764_42.5       428067            0       659991      55798.6   
1324.6985_41.2       688980       270826            0            0   
1330.7125_41.2       668409       664346            0            0   
1346.6934_39.8       257716            0       585774            0   
1362.6598_41.6            0       515858      89219.1       104976   

                    2009264      2009249     ...           2008654  \
Samples                                      ...                     
90.0075_8.6     6628460.509  4859109.333     ...       3361374.609   
90.5060_7.9     5714550.409  4147867.037     ...       6515632.634   
90.5092_17.6              0            0     ...                 0   
90.5249_36.0    3530316.311  3574931.424     ...       8017484.285   
90.9756_43.2    5252845.756  4680603.346     ...       6676109.719   
91.0077_10.9    311465.8456  307836.6964     ...       242635.6677   
91.0225_34.9    497603.9063  446483.2171     ...       701208.7806   
91.0266_36.8    769231.8236  324397.3933     ...       575749.0563   
91.5011_18.2              0            0     ...                 0   
91.5037_8.1     2020360.392  1522009.861     ...       1835541.686   
91.9790_41.8              0  11123.72842     ...                 0   
92.0389_37.6    20462.98106            0     ...       90017.77892   
92.5025_10.1    219736.6895   142536.304     ...       125295.0378   
92.5213_33.7    102441.4833  156594.5796     ...        144992.212   
93.5024_15.7    52957.03155  68852.68548     ...       142169.8328   
93.5246_36.8    172734.8387  196224.1773     ...       270898.7648   
94.0441_38.1    473149.3673  580080.7362     ...       399036.1415   
94.5141_36.3              0            0     ...       53301.92095   
94.5447_38.7              0  31749.12229     ...       309228.9378   
95.0152_57.8              0            0     ...                 0   
95.0230_16.6    296189.2394  209981.6147     ...       198817.3782   
95.0427_38.9              0   27318.3374     ...       78373.60588   
96.0797_94.5              0            0     ...                 0   
96.5139_14.9    109323.8707   22105.0613     ...       113418.6128   
96.9207_46.7    2678920.487  1756197.048     ...       2494157.328   
96.9999_37.5    105832.4549  50210.94216     ...        80148.1356   
97.5134_8.3     1046059.215            0     ...       533657.5483   
97.9314_18.4       89317.72  30864.55386     ...                 0   
97.9678_10.1    505037.1538  168095.6049     ...       251666.9195   
97.9904_35.9    9857038.198  7197632.324     ...       13634659.49   
...                     ...          ...     ...               ...   
1150.7485_41.3            0            0     ...                 0   
1152.6906_41.8            0       173267     ...            713792   
1159.7096_41.8       152925      97433.4     ...           39313.9   
1160.7025_42.6       140182       877012     ...            124713   
1164.6497_43.5            0            0     ...             63349   
1166.7250_40.5   1.1227e+06       313999     ...            173853   
1172.7414_44.5       599857       550101     ...            190112   
1176.6816_42.0            0            0     ...            399313   
1182.7060_40.1       521186       161269     ...            349973   
1184.7142_42.1      91964.6  1.21821e+06     ...            170300   
1188.7304_40.8            0       986078     ...            342936   
1200.6901_42.3       647794            0     ...                 0   
1204.7010_40.7       380106            0     ...            802973   
1210.7114_41.6       413424       867446     ...            534480   
1216.6546_42.7       469990            0     ...                 0   
1226.6901_42.2  1.42154e+06       151472     ...       1.55489e+06   
1234.7163_39.7       274086            0     ...            190889   
1242.6644_42.5       426523            0     ...                 0   
1250.6978_41.0       486227       653927     ...            230726   
1252.6974_42.1      59348.6            0     ...                 0   
1256.7109_40.4       472934            0     ...            418116   
1262.7252_41.3       778930       140869     ...                 0   
1272.6851_41.2            0  1.03003e+06     ...            297656   
1274.4884_13.6        11786            0     ...           41206.3   
1278.7022_41.9       689772       356750     ...                 0   
1294.6764_42.5       435056       470897     ...                 0   
1324.6985_41.2       482574            0     ...            449095   
1330.7125_41.2       439209            0     ...           98812.2   
1346.6934_39.8            0            0     ...            285468   
1362.6598_41.6       598477      57690.2     ...            208159   

                    2008627      2009254      2009225      2008722  \
Samples                                                              
90.0075_8.6     3225108.739   3234679.55  2257954.805  2829996.817   
90.5060_7.9               0  4333684.698  3044488.261            0   
90.5092_17.6              0  14851.66607            0            0   
90.5249_36.0    8698940.475  6447841.305  10352306.69  9928524.935   
90.9756_43.2     5564232.26    6342556.3  5943083.312  6767760.897   
91.0077_10.9    221675.7999  229490.9118  248242.2838   224932.697   
91.0225_34.9    695549.5147   593596.924  1011122.885  785552.2501   
91.0266_36.8    551845.2813  436312.5506  575679.7514  489581.8057   
91.5011_18.2              0  6565.430852            0            0   
91.5037_8.1     1965144.129  1578519.992  1254855.662  1291675.706   
91.9790_41.8              0            0            0            0   
92.0389_37.6    27108.19628  22884.95239  55619.15778  185537.0365   
92.5025_10.1    98060.14562  129047.4041  188838.3328  72702.59088   
92.5213_33.7    165804.0998  225968.6949   156295.635  141690.4117   
93.5024_15.7              0  79452.75602  65749.38042  70760.57411   
93.5246_36.8    94667.28808  38438.90541  37853.17185   35734.7688   
94.0441_38.1    582135.9545  245083.0499  331295.3117  404814.3372   
94.5141_36.3    33341.85178            0  11769.64065            0   
94.5447_38.7    177607.0408  47938.94201            0  144052.2379   
95.0152_57.8              0            0            0            0   
95.0230_16.6    211432.6586  205648.8066  194927.9413  186343.3331   
95.0427_38.9              0  41093.98201  84362.75101  102676.2014   
96.0797_94.5     15358.0945            0            0  76227.30182   
96.5139_14.9      60975.294  92310.41495  98057.64143   54860.3108   
96.9207_46.7    2734187.828  2554378.261  2394356.747  2363498.852   
96.9999_37.5    57838.59424  55947.09973  68260.73311  133619.6022   
97.5134_8.3     575149.4191  645061.3328  385306.8238  130282.1941   
97.9314_18.4    15665.88267  41070.37212  1278.119339  99409.88349   
97.9678_10.1    141662.7153  219310.2226  102009.4101  180498.9118   
97.9904_35.9     13609343.6    6809282.7  18107813.33            0   
...                     ...          ...          ...          ...   
1150.7485_41.3       181801            0       518178            0   
1152.6906_41.8       342004            0            0       343708   
1159.7096_41.8       157540       268747            0            0   
1160.7025_42.6       553264       300919       571595       228457   
1164.6497_43.5       543977       576016       576967       604614   
1166.7250_40.5       172955       363509       291942       319444   
1172.7414_44.5            0            0       548462            0   
1176.6816_42.0       350277       357032            0            0   
1182.7060_40.1       254124            0       993605            0   
1184.7142_42.1            0            0       747151       435627   
1188.7304_40.8       407942       455438       219977            0   
1200.6901_42.3       155805       257166            0       375715   
1204.7010_40.7       368911       496600            0       620421   
1210.7114_41.6       548794       911557       883027       357790   
1216.6546_42.7            0  1.04993e+06            0       296741   
1226.6901_42.2       562946       428840  2.07956e+06            0   
1234.7163_39.7       514409       147664       381773       359884   
1242.6644_42.5       475908            0            0      78522.3   
1250.6978_41.0       126206            0            0            0   
1252.6974_42.1            0       926891            0            0   
1256.7109_40.4       612677       115381       334048       759451   
1262.7252_41.3       134085       547387       241160            0   
1272.6851_41.2       168241       428102       465527            0   
1274.4884_13.6       112977            0      80682.2            0   
1278.7022_41.9       600876       796615            0            0   
1294.6764_42.5            0      54759.1       613293            0   
1324.6985_41.2       762961       710039       127949            0   
1330.7125_41.2       319145       280412       374427            0   
1346.6934_39.8       182446            0       200518            0   
1362.6598_41.6       213668            0            0      22031.9   

                    2008702      2008660      2008628      2008618  \
Samples                                                              
90.0075_8.6     3025325.917    3964125.1  3894615.297   3767459.37   
90.5060_7.9     3399020.112  4821618.007  6331004.895  6149395.473   
90.5092_17.6    120916.2108  128653.7856  78832.70997            0   
90.5249_36.0    7441818.805   4932827.72  8323500.152  7137622.677   
90.9756_43.2    5100041.402  6431736.961  5525051.032  5507681.851   
91.0077_10.9    283475.8571  335048.9052  255997.4283    407961.83   
91.0225_34.9    674660.9142  234798.2155  722024.3645  659349.3175   
91.0266_36.8    474608.3407   387506.319  545998.9721  489848.0843   
91.5011_18.2              0  40812.97441  38493.93123            0   
91.5037_8.1     1282284.369  1630517.095  2623260.837  1738926.971   
91.9790_41.8              0            0            0            0   
92.0389_37.6    35938.26805            0   34736.3584  90954.89343   
92.5025_10.1    185231.1968   117949.371   148770.824  191671.5297   
92.5213_33.7    103485.8296  221280.6808  133518.8514  195216.9817   
93.5024_15.7    72830.96387  104458.2272  148581.3145  80203.22047   
93.5246_36.8    32709.98062  20709.95204  42166.12442  38281.83138   
94.0441_38.1    330787.7181    160632.37  631936.1649  465924.0994   
94.5141_36.3              0            0  24722.04712            0   
94.5447_38.7              0  72689.83068  117244.1161   463674.667   
95.0152_57.8              0            0            0            0   
95.0230_16.6      262597.61  197604.7725  237954.4153  210414.3169   
95.0427_38.9    77443.32765            0            0            0   
96.0797_94.5              0            0            0            0   
96.5139_14.9              0  85138.02335  153426.8202  152406.3402   
96.9207_46.7    2710644.056  2049517.417   2332387.31  2417197.841   
96.9999_37.5    68117.77475  37671.48808  89552.15565  65440.96744   
97.5134_8.3     215490.5143  379417.2145   488748.158  520599.7407   
97.9314_18.4    54467.64564  5564.985712  67416.28361   19853.1375   
97.9678_10.1              0  467239.6908            0  429457.5677   
97.9904_35.9    13531626.59  8369852.054  14622987.82  12939864.27   
...                     ...          ...          ...          ...   
1150.7485_41.3            0       144221            0            0   
1152.6906_41.8            0       222431            0            0   
1159.7096_41.8       399812       442560            0            0   
1160.7025_42.6       103770       302851       172828       366954   
1164.6497_43.5            0            0       170019            0   
1166.7250_40.5       147748       155121       122978       237372   
1172.7414_44.5       954418            0       194401      76583.6   
1176.6816_42.0       396745            0       805954            0   
1182.7060_40.1       143558       241327       642359       310534   
1184.7142_42.1            0       847336      6796.39            0   
1188.7304_40.8       128872       686349            0            0   
1200.6901_42.3       333187            0            0      83505.5   
1204.7010_40.7            0            0       879413       552615   
1210.7114_41.6       254607       967732       719544       774428   
1216.6546_42.7       309568            0       288102       176586   
1226.6901_42.2       501449       473750       897216            0   
1234.7163_39.7            0            0            0       516997   
1242.6644_42.5       617054       319314       224825            0   
1250.6978_41.0            0            0       418538            0   
1252.6974_42.1            0       441698            0            0   
1256.7109_40.4       635062       473151       271811            0   
1262.7252_41.3       376295       239076       438242      82801.7   
1272.6851_41.2            0       369878       841548       343527   
1274.4884_13.6       112784       177104            0       278859   
1278.7022_41.9            0       541889            0       502626   
1294.6764_42.5            0       201599            0       749223   
1324.6985_41.2       209186       258962            0            0   
1330.7125_41.2            0       373464      49976.9  1.01262e+06   
1346.6934_39.8      12980.6       459267            0            0   
1362.6598_41.6       418287       565639       250577            0   

                    2008621  
Samples                      
90.0075_8.6     1860727.982  
90.5060_7.9     4007774.961  
90.5092_17.6              0  
90.5249_36.0              0  
90.9756_43.2    5640639.179  
91.0077_10.9    357468.8174  
91.0225_34.9    368513.9601  
91.0266_36.8    198919.9106  
91.5011_18.2              0  
91.5037_8.1     985253.1105  
91.9790_41.8              0  
92.0389_37.6    14103.02042  
92.5025_10.1    202568.1988  
92.5213_33.7    202900.2753  
93.5024_15.7              0  
93.5246_36.8    5480.454853  
94.0441_38.1              0  
94.5141_36.3              0  
94.5447_38.7    31354.10138  
95.0152_57.8              0  
95.0230_16.6    283749.0126  
95.0427_38.9      24908.972  
96.0797_94.5    94288.61018  
96.5139_14.9              0  
96.9207_46.7    3265252.777  
96.9999_37.5    50637.10037  
97.5134_8.3     342516.6079  
97.9314_18.4     52442.8171  
97.9678_10.1    215695.3047  
97.9904_35.9    9083910.197  
...                     ...  
1150.7485_41.3            0  
1152.6906_41.8       636275  
1159.7096_41.8       216925  
1160.7025_42.6       572814  
1164.6497_43.5  1.03367e+06  
1166.7250_40.5       168988  
1172.7414_44.5       515194  
1176.6816_42.0       140231  
1182.7060_40.1            0  
1184.7142_42.1       349732  
1188.7304_40.8       449743  
1200.6901_42.3       500711  
1204.7010_40.7       471584  
1210.7114_41.6       500730  
1216.6546_42.7      91453.1  
1226.6901_42.2       455448  
1234.7163_39.7       107935  
1242.6644_42.5       546823  
1250.6978_41.0            0  
1252.6974_42.1        81849  
1256.7109_40.4       961921  
1262.7252_41.3       560192  
1272.6851_41.2       499323  
1274.4884_13.6            0  
1278.7022_41.9       486178  
1294.6764_42.5       434913  
1324.6985_41.2            0  
1330.7125_41.2       497848  
1346.6934_39.8            0  
1362.6598_41.6       838983  

[2229 rows x 64 columns]>
slice shape (64, 2229)
0.0% done! 4.78106617928s elapsed

In [388]:
def make_sliding_window(min_val, max_val, width, step):
    '''
    Width should be
    '''
    if step > width:
        raise ValueError, ("Your step should be less than"+
                           "or equal to the width of the window")
    left_bound = np.arange(min_val, max_val, step)
    right_bound = left_bound + width
    rt_bounds = zip(left_bound, right_bound)
    # remove any bounds that go past the maximum value
    for idx, i in enumerate(rt_bounds):
        if i[1] > max_val:
            rt_bounds.pop(idx)
    return rt_bounds

make_sliding_window(0, 100, 10, 5)


Out[388]:
[(0, 10),
 (5, 15),
 (10, 20),
 (15, 25),
 (20, 30),
 (25, 35),
 (30, 40),
 (35, 45),
 (40, 50),
 (45, 55),
 (50, 60),
 (55, 65),
 (60, 70),
 (65, 75),
 (70, 80),
 (75, 85),
 (80, 90),
 (85, 95),
 (90, 100)]

In [396]:
# Make sliding window
min_val = 0
max_val = df_50percent_prevalence_mzrt['rt'].max()
width = max_val / 5.0
step = width / 2
print "min: {mini}, max: {maxi}, width: {width}, step: {step}".format(
        mini=min_val, maxi=max_val, width=width, step=step)
sliding_window = make_sliding_window(min_val, max_val, width, step)

# plot and run classifier on sliding window
for rt_slice in sliding_window:
    print rt_slice
    slice_and_predict(df_50percent_prevalence_mzrt, y, rt_slice,
                      rf_estimators=1000, 
                     n_iter=25, test_size=0.3, random_state=1)
    print '\n\n\n'+'-'*50+'NEXT ROUND'+'-'*50+'\n\n\n'


min: 0, max: 596.6, width: 119.32, step: 59.66
(0.0, 119.32000000000001)
596.6
1984.3186
ymin:  39.5059
slice shape (66, 2248)
0.0% done! 3.12874293327s elapsed
40.0% done! 38.1319730282s elapsed
80.0% done! 74.1431500912s elapsed


--------------------------------------------------NEXT ROUND--------------------------------------------------



(59.660000000000004, 178.98000000000002)
596.6
1984.3186
ymin:  39.5059
slice shape (66, 243)
0.0% done! 4.19101309776s elapsed
40.0% done! 38.577999115s elapsed
80.0% done! 74.5704450607s elapsed


--------------------------------------------------NEXT ROUND--------------------------------------------------



(119.32000000000001, 238.64000000000001)
596.6
1984.3186
ymin:  39.5059
slice shape (66, 91)
0.0% done! 3.03325486183s elapsed
40.0% done! 33.2707278728s elapsed
80.0% done! 62.8525710106s elapsed


--------------------------------------------------NEXT ROUND--------------------------------------------------



(178.98000000000002, 298.30000000000001)
596.6
1984.3186
ymin:  39.5059
slice shape (66, 245)
0.0% done! 3.43220806122s elapsed
40.0% done! 31.9505441189s elapsed
80.0% done! 60.5420660973s elapsed


--------------------------------------------------NEXT ROUND--------------------------------------------------



(238.64000000000001, 357.96000000000004)
596.6
1984.3186
ymin:  39.5059
slice shape (66, 672)
0.0% done! 3.0954041481s elapsed
40.0% done! 32.2163431644s elapsed
80.0% done! 61.3447351456s elapsed


--------------------------------------------------NEXT ROUND--------------------------------------------------



(298.30000000000001, 417.62)
596.6
1984.3186
ymin:  39.5059
slice shape (66, 1314)
0.0% done! 3.45354104042s elapsed
40.0% done! 33.3407568932s elapsed
80.0% done! 63.2684009075s elapsed


--------------------------------------------------NEXT ROUND--------------------------------------------------



(357.96000000000004, 477.28000000000003)
596.6
1984.3186
ymin:  39.5059
slice shape (66, 1346)
0.0% done! 3.14566516876s elapsed
40.0% done! 34.858104229s elapsed
80.0% done! 69.7666180134s elapsed


--------------------------------------------------NEXT ROUND--------------------------------------------------



(417.62, 536.94000000000005)
596.6
1984.3186
ymin:  39.5059
slice shape (66, 919)
0.0% done! 4.22216081619s elapsed
40.0% done! 39.1709859371s elapsed
80.0% done! 76.0188238621s elapsed


--------------------------------------------------NEXT ROUND--------------------------------------------------



(477.28000000000003, 596.60000000000002)
596.6
1984.3186
ymin:  39.5059
slice shape (66, 793)
0.0% done! 4.71782708168s elapsed
40.0% done! 52.1401121616s elapsed
80.0% done! 85.8386499882s elapsed


--------------------------------------------------NEXT ROUND--------------------------------------------------