Feature detected using Craig's code

Not sure how that worked, but the PCA plot is wild


In [2]:
import time

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import cross_val_score
#from sklearn.model_selection import StratifiedShuffleSplit
#from sklearn.model_selection import cross_val_score

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.utils import shuffle

from scipy import interp

%matplotlib inline

In [110]:
def remove_zero_columns(X, threshold=1e-20):
    # convert zeros to nan, drop all nan columns, the replace leftover nan with zeros
    X_non_zero_colum = X.replace(0, np.nan).dropna(how='all', axis=1).replace(np.nan, 0)
    #.dropna(how='all', axis=0).replace(np.nan,0)
    return X_non_zero_colum

def zero_fill_half_min(X, threshold=1e-20):
    # Fill zeros with 1/2 the minimum value of that column
    # input dataframe. Add only to zero values
    
    # Get a vector of 1/2 minimum values
    half_min = X[X > threshold].min(axis=0)*0.5
    
    # Add the half_min values to a dataframe where everything that isn't zero is NaN.
    # then convert NaN's to 0
    fill_vals = (X[X < threshold] + half_min).fillna(value=0)
    
    # Add the original dataframe to the dataframe of zeros and fill-values
    X_zeros_filled = X + fill_vals
    return X_zeros_filled



toy = pd.DataFrame([[1,2,3,0],
               [0,0,0,0],
               [0.5,1,0,0]], dtype=float)

toy_no_zeros = remove_zero_columns(toy)
toy_filled_zeros = zero_fill_half_min(toy_no_zeros)
print toy
print toy_no_zeros
print toy_filled_zeros


     0  1  2  3
0  1.0  2  3  0
1  0.0  0  0  0
2  0.5  1  0  0
     0  1  2
0  1.0  2  3
1  0.0  0  0
2  0.5  1  0
      0    1    2
0  1.00  2.0  3.0
1  0.25  0.5  1.5
2  0.50  1.0  1.5

Import the dataframe and remove any features that are all zero


In [111]:
### Subdivide the data into a feature table
data_path = '/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/old_repo_revo_healthcare/'\
'data/MTBLS315/7-24-17_Malaria_pos_grouped_data_Craig_processed.csv'
## Import the data and remove extraneous columns
df = pd.read_csv(data_path)
print df.shape
print df.head()
# Make a new index of mz:rt
mz = df.loc[:,"mz"].astype('str')
rt = df.loc[:,"rt"].astype('str')
idx = mz+':'+rt
df.index = idx
df
# separate samples from xcms/camera things to make feature table
not_samples = ['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 
               'isotopes', 'adduct']
samples_list = df.columns.difference(not_samples)
mz_rt_df = df[not_samples]

# convert to samples x features
X_df_raw = df[samples_list].T
# Remove zero-full columns and fill zeroes with 1/2 minimum values
X_df = remove_zero_columns(X_df_raw)
X_df_zero_filled = zero_fill_half_min(X_df)

print (X_df < 1e-20).sum().sum()
print "original shape: %s \n# zeros: %f\n" % (X_df_raw.shape, (X_df_raw < 1e-20).sum().sum())
print "zero-columns replaced? shape: %s \n# zeros: %f\n" % (X_df.shape, 
                                                         (X_df < 1e-20).sum().sum())
print "zeros filled shape: %s \n#zeros: %f\n" % (X_df_zero_filled.shape, 
                                              (X_df_zero_filled < 1e-20).sum().sum())


# Convert to numpy matrix to play nicely with sklearn
X = X_df.as_matrix()
print X.shape


(6857, 68)
           mz       mzmin       mzmax          rt       rtmin       rtmax  \
0  100.075385  100.074986  100.075519  129.794110  127.975656  132.153297   
1  100.916388  100.916045  100.916718   45.296209   44.146884   47.819750   
2  101.008038  101.007614  101.008242   38.931362   38.142080   39.498050   
3  101.010206  101.009702  101.010503   38.950441    9.380345   40.465458   
4  101.095582  101.095295  101.095724  973.599729  953.662750  992.645495   

          X1001_P         X1002_P         X1003_P          X1004_P  \
0    42373.927779   179331.750974  2537721.750522    101845.009079   
1    11178.774973    20729.982156    20852.688848     12717.817680   
2  8791113.535859  6606203.430266  6546795.750105  12136100.900199   
3   147009.244240   239414.696757   172061.687778  10058152.743696   
4   723913.491959   604511.079660   701094.488497    660342.546434   

        ...              X1024_P         X1032_P         X1034_P  \
0       ...                  NaN             NaN             NaN   
1       ...                  NaN             NaN             NaN   
2       ...        8041070.74371  7900344.594391  3202083.926599   
3       ...                  NaN             NaN             NaN   
4       ...                  NaN             NaN             NaN   

          X1036_P         X1043_P         X1052_P         X1054_P  \
0             NaN             NaN             NaN             NaN   
1             NaN             NaN             NaN             NaN   
2  5383539.442712  5802213.220837  7213738.304376  5679683.988764   
3             NaN             NaN             NaN             NaN   
4             NaN             NaN             NaN             NaN   

         X1055_P         X1060_P         X1062_P  
0            NaN             NaN             NaN  
1            NaN             NaN             NaN  
2  4694471.26903  4398947.827277  6154061.829345  
3            NaN             NaN             NaN  
4            NaN             NaN             NaN  

[5 rows x 68 columns]
99966
original shape: (60, 6857) 
# zeros: 7422.000000

zero-columns replaced? shape: (60, 6857) 
# zeros: 99966.000000

zeros filled shape: (60, 6857) 
#zeros: 0.000000

(60, 6857)

Get mappings between sample names, file names, and sample classes


In [179]:
# Get mapping between sample name and assay names
path_sample_name_map = '/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/revo_healthcare/data/raw/'\
                'MTBLS315/metadata/a_UPLC_POS_nmfi_and_bsi_diagnosis.txt'
# Index is the sample name
sample_df = pd.read_csv(path_sample_name_map, 
                        sep='\t', index_col=0)
sample_df = sample_df['MS Assay Name']
sample_df.shape
print sample_df.head(10)

# get mapping between sample name and sample class
path_sample_class_map = '/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/revo_healthcare/data/raw/'\
                        'MTBLS315/metadata/s_NMFI and BSI diagnosis.txt'
class_df = pd.read_csv(path_sample_class_map,
                      sep='\t')
# Set index as sample name
class_df.set_index('Sample Name', inplace=True)
class_df = class_df['Factor Value[patient group]']
print class_df.head(10)

# convert all non-malarial classes into a single classes 
# (collapse non-malarial febril illness and bacteremia together)
class_map_df = pd.concat([sample_df, class_df], axis=1)
class_map_df.rename(columns={'Factor Value[patient group]': 'class'}, inplace=True)
class_map_df
print class_map_df

# remove sample from y (labels) if not present in X (feature table)
print X_df.index
print class_map_df['MS Assay Name'].tolist()
for i in class_map_df['MS Assay Name']:
    if i not in X_df.index:
        print '\n\n\nFound the missing one!: ', i, '\n\n\n'
        break
drop_idx = class_map_df[class_map_df['MS Assay Name'] == i].index
class_map_df = class_map_df.drop(drop_idx, axis=0)
print class_map_df


binary_class_map = class_map_df.replace(to_replace=['non-malarial febrile illness', 'bacterial bloodstream infection' ], 
                                        value='non-malarial fever')

binary_class_map


Sample Name
MCMA429    1001_P
MCMA430    1002_P
MCMA431    1003_P
MCMA433    1004_P
MCMA434    1005_P
MCMA435    1006_P
MCMA436    1007_P
MCMA442    1009_P
MCMA443    1010_P
MCMA444    1011_P
Name: MS Assay Name, dtype: object
Sample Name
MCMA429                            malaria
MCMA430                            malaria
MCMA431                            malaria
MCMA433                            malaria
MCMA434       non-malarial febrile illness
MCMA435                            malaria
MCMA436    bacterial bloodstream infection
MCMA442                            malaria
MCMA443       non-malarial febrile illness
MCMA444    bacterial bloodstream infection
Name: Factor Value[patient group], dtype: object
            MS Assay Name                            class
Sample Name                                               
MCMA429            1001_P                          malaria
MCMA430            1002_P                          malaria
MCMA431            1003_P                          malaria
MCMA433            1004_P                          malaria
MCMA434            1005_P     non-malarial febrile illness
MCMA435            1006_P                          malaria
MCMA436            1007_P  bacterial bloodstream infection
MCMA442            1009_P                          malaria
MCMA443            1010_P     non-malarial febrile illness
MCMA444            1011_P  bacterial bloodstream infection
MCMA446            1012_P  bacterial bloodstream infection
MCMA447            1013_P                          malaria
MCMA448            1014_P                          malaria
MCMA450            1016_P                          malaria
MCMA451            1017_P                          malaria
MCMA452            1018_P                          malaria
MCMA455            1019_P  bacterial bloodstream infection
MCMA458            1020_P     non-malarial febrile illness
MCMA459            1021_P     non-malarial febrile illness
MCMA462            1023_P     non-malarial febrile illness
MCMA463            1024_P     non-malarial febrile illness
MCMA465            1026_P  bacterial bloodstream infection
MCMA466            1027_P                          malaria
MCMA472            1029_P                          malaria
MCMA474            1030_P                          malaria
MCMA477            1031_P                          malaria
MCMA478            1032_P     non-malarial febrile illness
MCMA480            1033_P  bacterial bloodstream infection
MCMA484            1034_P     non-malarial febrile illness
MCMA489            1035_P                          malaria
...                   ...                              ...
MCMA493            1037_P                          malaria
MCMA495            1038_P                          malaria
MCMA496            1039_P                          malaria
MCMA497            1040_P                          malaria
MCMA501            1041_P                          malaria
MCMA502            1042_P                          malaria
MCMA503            1043_P     non-malarial febrile illness
MCMA513            1044_P                          malaria
MCMA515            1045_P  bacterial bloodstream infection
MCMA516            1046_P                          malaria
MCMA523            1048_P                          malaria
MCMA526            1049_P  bacterial bloodstream infection
MCMA528            1050_P  bacterial bloodstream infection
MCMA533            1051_P                          malaria
MCMA534            1052_P     non-malarial febrile illness
MCMA535            1053_P  bacterial bloodstream infection
MCMA537            1054_P     non-malarial febrile illness
MCMA538            1055_P     non-malarial febrile illness
MCMA550            1056_P                          malaria
MCMA557            1057_P  bacterial bloodstream infection
MCMA560            1058_P                          malaria
MCMA562            1059_P                          malaria
MCMA564            1060_P     non-malarial febrile illness
MCMA565            1061_P  bacterial bloodstream infection
MCMA566            1062_P     non-malarial febrile illness
MURB082            1064_P                          malaria
MURB085            1065_P                          malaria
MURB088            1066_P                          malaria
MURB092            1067_P                          malaria
MURB095            1068_P                          malaria

[61 rows x 2 columns]
Index([u'1001_P', u'1002_P', u'1003_P', u'1004_P', u'1005_P', u'1006_P',
       u'1007_P', u'1009_P', u'1010_P', u'1011_P', u'1012_P', u'1013_P',
       u'1014_P', u'1016_P', u'1017_P', u'1018_P', u'1019_P', u'1021_P',
       u'1023_P', u'1024_P', u'1026_P', u'1027_P', u'1029_P', u'1030_P',
       u'1031_P', u'1032_P', u'1033_P', u'1034_P', u'1035_P', u'1036_P',
       u'1037_P', u'1038_P', u'1039_P', u'1040_P', u'1041_P', u'1042_P',
       u'1043_P', u'1044_P', u'1045_P', u'1046_P', u'1048_P', u'1049_P',
       u'1050_P', u'1051_P', u'1052_P', u'1053_P', u'1054_P', u'1055_P',
       u'1056_P', u'1057_P', u'1058_P', u'1059_P', u'1060_P', u'1061_P',
       u'1062_P', u'1064_P', u'1065_P', u'1066_P', u'1067_P', u'1068_P'],
      dtype='object')
['1001_P', '1002_P', '1003_P', '1004_P', '1005_P', '1006_P', '1007_P', '1009_P', '1010_P', '1011_P', '1012_P', '1013_P', '1014_P', '1016_P', '1017_P', '1018_P', '1019_P', '1020_P', '1021_P', '1023_P', '1024_P', '1026_P', '1027_P', '1029_P', '1030_P', '1031_P', '1032_P', '1033_P', '1034_P', '1035_P', '1036_P', '1037_P', '1038_P', '1039_P', '1040_P', '1041_P', '1042_P', '1043_P', '1044_P', '1045_P', '1046_P', '1048_P', '1049_P', '1050_P', '1051_P', '1052_P', '1053_P', '1054_P', '1055_P', '1056_P', '1057_P', '1058_P', '1059_P', '1060_P', '1061_P', '1062_P', '1064_P', '1065_P', '1066_P', '1067_P', '1068_P']



Found the missing one!:  1020_P 



            MS Assay Name                            class
Sample Name                                               
MCMA429            1001_P                          malaria
MCMA430            1002_P                          malaria
MCMA431            1003_P                          malaria
MCMA433            1004_P                          malaria
MCMA434            1005_P     non-malarial febrile illness
MCMA435            1006_P                          malaria
MCMA436            1007_P  bacterial bloodstream infection
MCMA442            1009_P                          malaria
MCMA443            1010_P     non-malarial febrile illness
MCMA444            1011_P  bacterial bloodstream infection
MCMA446            1012_P  bacterial bloodstream infection
MCMA447            1013_P                          malaria
MCMA448            1014_P                          malaria
MCMA450            1016_P                          malaria
MCMA451            1017_P                          malaria
MCMA452            1018_P                          malaria
MCMA455            1019_P  bacterial bloodstream infection
MCMA459            1021_P     non-malarial febrile illness
MCMA462            1023_P     non-malarial febrile illness
MCMA463            1024_P     non-malarial febrile illness
MCMA465            1026_P  bacterial bloodstream infection
MCMA466            1027_P                          malaria
MCMA472            1029_P                          malaria
MCMA474            1030_P                          malaria
MCMA477            1031_P                          malaria
MCMA478            1032_P     non-malarial febrile illness
MCMA480            1033_P  bacterial bloodstream infection
MCMA484            1034_P     non-malarial febrile illness
MCMA489            1035_P                          malaria
MCMA491            1036_P     non-malarial febrile illness
MCMA493            1037_P                          malaria
MCMA495            1038_P                          malaria
MCMA496            1039_P                          malaria
MCMA497            1040_P                          malaria
MCMA501            1041_P                          malaria
MCMA502            1042_P                          malaria
MCMA503            1043_P     non-malarial febrile illness
MCMA513            1044_P                          malaria
MCMA515            1045_P  bacterial bloodstream infection
MCMA516            1046_P                          malaria
MCMA523            1048_P                          malaria
MCMA526            1049_P  bacterial bloodstream infection
MCMA528            1050_P  bacterial bloodstream infection
MCMA533            1051_P                          malaria
MCMA534            1052_P     non-malarial febrile illness
MCMA535            1053_P  bacterial bloodstream infection
MCMA537            1054_P     non-malarial febrile illness
MCMA538            1055_P     non-malarial febrile illness
MCMA550            1056_P                          malaria
MCMA557            1057_P  bacterial bloodstream infection
MCMA560            1058_P                          malaria
MCMA562            1059_P                          malaria
MCMA564            1060_P     non-malarial febrile illness
MCMA565            1061_P  bacterial bloodstream infection
MCMA566            1062_P     non-malarial febrile illness
MURB082            1064_P                          malaria
MURB085            1065_P                          malaria
MURB088            1066_P                          malaria
MURB092            1067_P                          malaria
MURB095            1068_P                          malaria
Out[179]:
MS Assay Name class
Sample Name
MCMA429 1001_P malaria
MCMA430 1002_P malaria
MCMA431 1003_P malaria
MCMA433 1004_P malaria
MCMA434 1005_P non-malarial fever
MCMA435 1006_P malaria
MCMA436 1007_P non-malarial fever
MCMA442 1009_P malaria
MCMA443 1010_P non-malarial fever
MCMA444 1011_P non-malarial fever
MCMA446 1012_P non-malarial fever
MCMA447 1013_P malaria
MCMA448 1014_P malaria
MCMA450 1016_P malaria
MCMA451 1017_P malaria
MCMA452 1018_P malaria
MCMA455 1019_P non-malarial fever
MCMA459 1021_P non-malarial fever
MCMA462 1023_P non-malarial fever
MCMA463 1024_P non-malarial fever
MCMA465 1026_P non-malarial fever
MCMA466 1027_P malaria
MCMA472 1029_P malaria
MCMA474 1030_P malaria
MCMA477 1031_P malaria
MCMA478 1032_P non-malarial fever
MCMA480 1033_P non-malarial fever
MCMA484 1034_P non-malarial fever
MCMA489 1035_P malaria
MCMA491 1036_P non-malarial fever
MCMA493 1037_P malaria
MCMA495 1038_P malaria
MCMA496 1039_P malaria
MCMA497 1040_P malaria
MCMA501 1041_P malaria
MCMA502 1042_P malaria
MCMA503 1043_P non-malarial fever
MCMA513 1044_P malaria
MCMA515 1045_P non-malarial fever
MCMA516 1046_P malaria
MCMA523 1048_P malaria
MCMA526 1049_P non-malarial fever
MCMA528 1050_P non-malarial fever
MCMA533 1051_P malaria
MCMA534 1052_P non-malarial fever
MCMA535 1053_P non-malarial fever
MCMA537 1054_P non-malarial fever
MCMA538 1055_P non-malarial fever
MCMA550 1056_P malaria
MCMA557 1057_P non-malarial fever
MCMA560 1058_P malaria
MCMA562 1059_P malaria
MCMA564 1060_P non-malarial fever
MCMA565 1061_P non-malarial fever
MCMA566 1062_P non-malarial fever
MURB082 1064_P malaria
MURB085 1065_P malaria
MURB088 1066_P malaria
MURB092 1067_P malaria
MURB095 1068_P malaria

In [128]:
### Remove any rows from binary map that aren't in craig's actual feature table
# Remove any samples from y (classes) that aren't in feature table
idx = X_df.index.tolist()
print idx
new_idx = [i.replace('X','') for i in idx]

X_df.index = new_idx
print X_df.shape


print binary_class_map.shape
print binary_class_map


['1001_P', '1002_P', '1003_P', '1004_P', '1005_P', '1006_P', '1007_P', '1009_P', '1010_P', '1011_P', '1012_P', '1013_P', '1014_P', '1016_P', '1017_P', '1018_P', '1019_P', '1021_P', '1023_P', '1024_P', '1026_P', '1027_P', '1029_P', '1030_P', '1031_P', '1032_P', '1033_P', '1034_P', '1035_P', '1036_P', '1037_P', '1038_P', '1039_P', '1040_P', '1041_P', '1042_P', '1043_P', '1044_P', '1045_P', '1046_P', '1048_P', '1049_P', '1050_P', '1051_P', '1052_P', '1053_P', '1054_P', '1055_P', '1056_P', '1057_P', '1058_P', '1059_P', '1060_P', '1061_P', '1062_P', '1064_P', '1065_P', '1066_P', '1067_P', '1068_P']
(60, 6857)
Index([u'1001_P', u'1002_P', u'1003_P', u'1004_P', u'1005_P', u'1006_P',
       u'1007_P', u'1009_P', u'1010_P', u'1011_P', u'1012_P', u'1013_P',
       u'1014_P', u'1016_P', u'1017_P', u'1018_P', u'1019_P', u'1021_P',
       u'1023_P', u'1024_P', u'1026_P', u'1027_P', u'1029_P', u'1030_P',
       u'1031_P', u'1032_P', u'1033_P', u'1034_P', u'1035_P', u'1036_P',
       u'1037_P', u'1038_P', u'1039_P', u'1040_P', u'1041_P', u'1042_P',
       u'1043_P', u'1044_P', u'1045_P', u'1046_P', u'1048_P', u'1049_P',
       u'1050_P', u'1051_P', u'1052_P', u'1053_P', u'1054_P', u'1055_P',
       u'1056_P', u'1057_P', u'1058_P', u'1059_P', u'1060_P', u'1061_P',
       u'1062_P', u'1064_P', u'1065_P', u'1066_P', u'1067_P', u'1068_P'],
      dtype='object')
['1001_P', '1002_P', '1003_P', '1004_P', '1005_P', '1006_P', '1007_P', '1009_P', '1010_P', '1011_P', '1012_P', '1013_P', '1014_P', '1016_P', '1017_P', '1018_P', '1019_P', '1020_P', '1021_P', '1023_P', '1024_P', '1026_P', '1027_P', '1029_P', '1030_P', '1031_P', '1032_P', '1033_P', '1034_P', '1035_P', '1036_P', '1037_P', '1038_P', '1039_P', '1040_P', '1041_P', '1042_P', '1043_P', '1044_P', '1045_P', '1046_P', '1048_P', '1049_P', '1050_P', '1051_P', '1052_P', '1053_P', '1054_P', '1055_P', '1056_P', '1057_P', '1058_P', '1059_P', '1060_P', '1061_P', '1062_P', '1064_P', '1065_P', '1066_P', '1067_P', '1068_P']



Found the missing one!:  1020_P
(60, 2)
            MS Assay Name               class
Sample Name                                  
MCMA429            1001_P             malaria
MCMA430            1002_P             malaria
MCMA431            1003_P             malaria
MCMA433            1004_P             malaria
MCMA434            1005_P  non-malarial fever
MCMA435            1006_P             malaria
MCMA436            1007_P  non-malarial fever
MCMA442            1009_P             malaria
MCMA443            1010_P  non-malarial fever
MCMA444            1011_P  non-malarial fever
MCMA446            1012_P  non-malarial fever
MCMA447            1013_P             malaria
MCMA448            1014_P             malaria
MCMA450            1016_P             malaria
MCMA451            1017_P             malaria
MCMA452            1018_P             malaria
MCMA455            1019_P  non-malarial fever
MCMA459            1021_P  non-malarial fever
MCMA462            1023_P  non-malarial fever
MCMA463            1024_P  non-malarial fever
MCMA465            1026_P  non-malarial fever
MCMA466            1027_P             malaria
MCMA472            1029_P             malaria
MCMA474            1030_P             malaria
MCMA477            1031_P             malaria
MCMA478            1032_P  non-malarial fever
MCMA480            1033_P  non-malarial fever
MCMA484            1034_P  non-malarial fever
MCMA489            1035_P             malaria
MCMA491            1036_P  non-malarial fever
MCMA493            1037_P             malaria
MCMA495            1038_P             malaria
MCMA496            1039_P             malaria
MCMA497            1040_P             malaria
MCMA501            1041_P             malaria
MCMA502            1042_P             malaria
MCMA503            1043_P  non-malarial fever
MCMA513            1044_P             malaria
MCMA515            1045_P  non-malarial fever
MCMA516            1046_P             malaria
MCMA523            1048_P             malaria
MCMA526            1049_P  non-malarial fever
MCMA528            1050_P  non-malarial fever
MCMA533            1051_P             malaria
MCMA534            1052_P  non-malarial fever
MCMA535            1053_P  non-malarial fever
MCMA537            1054_P  non-malarial fever
MCMA538            1055_P  non-malarial fever
MCMA550            1056_P             malaria
MCMA557            1057_P  non-malarial fever
MCMA560            1058_P             malaria
MCMA562            1059_P             malaria
MCMA564            1060_P  non-malarial fever
MCMA565            1061_P  non-malarial fever
MCMA566            1062_P  non-malarial fever
MURB082            1064_P             malaria
MURB085            1065_P             malaria
MURB088            1066_P             malaria
MURB092            1067_P             malaria
MURB095            1068_P             malaria

In [129]:
# convert classes to numbers
le = preprocessing.LabelEncoder()
le.fit(binary_class_map['class'])
y = le.transform(binary_class_map['class'])
print X.shape
print y.shape


(60, 6857)
(60,)

Plot the distribution of classification accuracy across multiple cross-validation splits - Kinda Dumb

Turns out doing this is kind of dumb, because you're not taking into account the prediction score your classifier assigned. Use AUC's instead. You want to give your classifier a lower score if it is really confident and wrong, than vice-versa


In [48]:
def rf_violinplot(X, y, n_iter=25, test_size=0.3, random_state=1,
                 n_estimators=1000):
    cross_val_skf = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, 
                                           random_state=random_state)
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

    scores = cross_val_score(clf, X, y, cv=cross_val_skf)

    sns.violinplot(scores,inner='stick')

rf_violinplot(X,y)


# TODO - Switch to using caret for this bs..?



In [49]:
# Do multi-fold cross validation for adaboost classifier
def adaboost_violinplot(X, y, n_iter=25, test_size=0.3, random_state=1,
                       n_estimators=200):
    cross_val_skf = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)
    clf = AdaBoostClassifier(n_estimators=n_estimators, random_state=random_state)

    scores = cross_val_score(clf, X, y, cv=cross_val_skf)

    sns.violinplot(scores,inner='stick')

adaboost_violinplot(X,y)



In [115]:
# TODO PQN normalization, and log-transformation, 
# and some feature selection (above certain threshold of intensity, use principal components), et

def pqn_normalize(X, integral_first=False, plot=False):
    '''
    Take a feature table and run PQN normalization on it
    '''
    # normalize by sum of intensities in each sample first. Not necessary
    if integral_first: 
        sample_sums = np.sum(X, axis=1)
        X = (X / sample_sums[:,np.newaxis])
    
    # Get the median value of each feature across all samples
    mean_intensities = np.median(X, axis=0)
    
    # Divde each feature by the median value of each feature - 
    # these are the quotients for each feature
    X_quotients = (X / mean_intensities[np.newaxis,:])
    
    if plot: # plot the distribution of quotients from one sample
        for i in range(1,len(X_quotients[:,1])):
            print 'allquotients reshaped!\n\n', 
            #all_quotients = X_quotients.reshape(np.prod(X_quotients.shape))
            all_quotients = X_quotients[i,:]
            print all_quotients.shape
            x = np.random.normal(loc=0, scale=1, size=len(all_quotients))
            sns.violinplot(all_quotients)
            plt.title("median val: %f\nMax val=%f" % (np.median(all_quotients), np.max(all_quotients)))
            plt.plot( title="median val: ")#%f" % np.median(all_quotients))
            plt.xlim([-0.5, 5])
            plt.show()

    # Define a quotient for each sample as the median of the feature-specific quotients
    # in that sample
    sample_quotients = np.median(X_quotients, axis=1)
    
    # Quotient normalize each samples
    X_pqn = X / sample_quotients[:,np.newaxis]
    return X_pqn

# Make a fake sample, with 2 samples at 1x and 2x dilutions
X_toy = np.array([[1,1,1,],
                  [2,2,2],
                  [3,6,9],
                  [6,12,18]], dtype=float)
print X_toy
print X_toy.reshape(1, np.prod(X_toy.shape))
X_toy_pqn_int = pqn_normalize(X_toy, integral_first=True, plot=True)
print X_toy_pqn_int

print '\n\n\n'
X_toy_pqn = pqn_normalize(X_toy)
print X_toy_pqn


[[  1.   1.   1.]
 [  2.   2.   2.]
 [  3.   6.   9.]
 [  6.  12.  18.]]
[[  1.   1.   1.   2.   2.   2.   3.   6.   9.   6.  12.  18.]]
allquotients reshaped!

(3,)
allquotients reshaped!

(3,)
allquotients reshaped!

(3,)
[[ 0.33333333  0.33333333  0.33333333]
 [ 0.33333333  0.33333333  0.33333333]
 [ 0.16666667  0.33333333  0.5       ]
 [ 0.16666667  0.33333333  0.5       ]]




[[ 4.  4.  4.]
 [ 4.  4.  4.]
 [ 2.  4.  6.]
 [ 2.  4.  6.]]

pqn normalize your features

Doesn't work with all the NaN values. Didn't bother fixing it right now.


In [118]:
print X.shape
X_pqn = pqn_normalize(X)
print X_pqn


(60, 6857)
[[ nan  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]
 ..., 
 [ nan  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]]

Random Forest & adaBoost with PQN-normalized data


In [231]:
rf_violinplot(X_pqn, y)



In [232]:
# Do multi-fold cross validation for adaboost classifier
adaboost_violinplot(X_pqn, y)


RF & adaBoost with PQN-normalized, log-transformed data

Turns out a monotonic transformation doesn't really affect any of these things. I guess they're already close to unit varinace...?


In [234]:
X_pqn_nlog = np.log(X_pqn)
rf_violinplot(X_pqn_nlog, y)



In [235]:
adaboost_violinplot(X_pqn_nlog, y)



In [72]:
def roc_curve_cv(X, y, clf, cross_val,
                path='/home/irockafe/Desktop/roc.pdf',
                save=False, plot=True):  
    t1 = time.time()
    # collect vals for the ROC curves
    tpr_list = []
    mean_fpr = np.linspace(0,1,100)
    auc_list = []
    
    # Get the false-positive and true-positive rate
    for i, (train, test) in enumerate(cross_val):
        clf.fit(X[train], y[train])
        y_pred = clf.predict_proba(X[test])[:,1]
        
        # get fpr, tpr
        fpr, tpr, thresholds = roc_curve(y[test], y_pred)
        roc_auc = auc(fpr, tpr)
        #print 'AUC', roc_auc
        #sns.plt.plot(fpr, tpr, lw=10, alpha=0.6, label='ROC - AUC = %0.2f' % roc_auc,)
        #sns.plt.show()
        tpr_list.append(interp(mean_fpr, fpr, tpr))
        tpr_list[-1][0] = 0.0
        auc_list.append(roc_auc)
        
        if (i % 10 == 0):
            print '{perc}% done! {time}s elapsed'.format(perc=100*float(i)/cross_val.n_iter, time=(time.time() - t1))
        
            
        
        
    # get mean tpr and fpr
    mean_tpr = np.mean(tpr_list, axis=0)
    # make sure it ends up at 1.0
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(auc_list)
    
    if plot:
        # plot mean auc
        plt.plot(mean_fpr, mean_tpr, label='Mean ROC - AUC = %0.2f $\pm$ %0.2f' % (mean_auc, 
                                                                                       std_auc),
                    lw=5, color='b')

        # plot luck-line
        plt.plot([0,1], [0,1], linestyle = '--', lw=2, color='r',
                    label='Luck', alpha=0.5) 

        # plot 1-std
        std_tpr = np.std(tpr_list, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.2,
                            label=r'$\pm$ 1 stdev')

        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC curve, {iters} iterations of {cv} cross validation'.format(
            iters=cross_val.n_iter, cv='{train}:{test}'.format(test=cross_val.test_size, train=(1-cross_val.test_size)))
                 )
        plt.legend(loc="lower right")

        if save:
            plt.savefig(path,  format='pdf')


        plt.show()
    return tpr_list, auc_list, mean_fpr

In [119]:
rf_estimators = 1000
n_iter = 3
test_size = 0.3
random_state = 1
cross_val_rf = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)
clf_rf = RandomForestClassifier(n_estimators=rf_estimators, random_state=random_state)
rf_graph_path = '''/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/revolutionizing_healthcare/data/MTBLS315/\
isaac_feature_tables/uhplc_pos/rf_roc_{trees}trees_{cv}cviter.pdf'''.format(trees=rf_estimators, cv=n_iter)

print cross_val_rf.n_iter
print cross_val_rf.test_size

tpr_vals, auc_vals, mean_fpr = roc_curve_cv(X, y, clf_rf, cross_val_rf,
                                           path=rf_graph_path, save=False)


3
0.3
0.0% done! 2.01393198967s elapsed

In [75]:
# For adaboosted
n_iter = 3
test_size = 0.3
random_state = 1
adaboost_estimators = 200
adaboost_path = '''/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/revolutionizing_healthcare/data/MTBLS315/\
isaac_feature_tables/uhplc_pos/adaboost_roc_{trees}trees_{cv}cviter.pdf'''.format(trees=adaboost_estimators, 
                                                                            cv=n_iter)


cross_val_adaboost = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)
clf = AdaBoostClassifier(n_estimators=adaboost_estimators, random_state=random_state)

adaboost_tpr, adaboost_auc, adaboost_fpr = roc_curve_cv(X, y, clf, cross_val_adaboost,
                                                       path=adaboost_path)


0.0% done! 4.04500412941s elapsed

In [120]:
print X.shape


(60, 6857)

Great, you can classify things. But make null models and do a sanity check to make sure you arent just classifying garbage


In [76]:
# Make a null model AUC curve

def make_null_model(X, y, clf, cross_val, random_state=1, num_shuffles=5, plot=True):
    '''
    Runs the true model, then sanity-checks by:
    
    Shuffles class labels and then builds cross-validated ROC curves from them.
    Compares true AUC vs. shuffled auc by t-test (assumes normality of AUC curve)
    '''
    null_aucs = []
    print y.shape
    print X.shape
    tpr_true, auc_true, fpr_true = roc_curve_cv(X, y, clf, cross_val)
    # shuffle y lots of times
    for i in range(0, num_shuffles):
        #Iterate through the shuffled y vals and repeat with appropriate params
        # Retain the auc vals for final plotting of distribution
        y_shuffle = shuffle(y)
        cross_val.y = y_shuffle
        cross_val.y_indices = y_shuffle
        print 'Number of differences b/t original and shuffle: %s' % (y == cross_val.y).sum()
        # Get auc values for number of iterations
        tpr, auc, fpr = roc_curve_cv(X, y_shuffle, clf, cross_val, plot=False)
        
        null_aucs.append(auc)
    
    
    #plot the outcome
    if plot:
        flattened_aucs = [j for i in null_aucs for j in i]
        my_dict = {'true_auc': auc_true, 'null_auc': flattened_aucs}
        df_poop =  pd.DataFrame.from_dict(my_dict, orient='index').T
        df_tidy = pd.melt(df_poop, value_vars=['true_auc', 'null_auc'],
                         value_name='auc', var_name='AUC_type')
        #print flattened_aucs
        sns.violinplot(x='AUC_type', y='auc',
            inner='points', data=df_tidy)
        # Plot distribution of AUC vals   
        plt.title("Distribution of aucs")
        #sns.plt.ylabel('count')
        plt.xlabel('AUC')
        #sns.plt.plot(auc_true, 0, color='red', markersize=10)
        plt.show()
    # Do a quick t-test to see if odds of randomly getting an AUC that good
    return auc_true, null_aucs

In [80]:
# Make a null model AUC curve & compare it to null-model

# Random forest magic!
rf_estimators = 1000
n_iter = 50
test_size = 0.3
random_state = 1

# Define Cross-validation and classifier
cross_val_rf = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)
clf_rf = RandomForestClassifier(n_estimators=rf_estimators, random_state=random_state)

true_auc, all_aucs = make_null_model(X, y, clf_rf, cross_val_rf, num_shuffles=5)


(60,)
(60, 6857)
0.0% done! 1.98951101303s elapsed
20.0% done! 23.2866311073s elapsed
40.0% done! 45.426803112s elapsed
60.0% done! 64.6907000542s elapsed
80.0% done! 84.3342819214s elapsed
Number of differences b/t original and shuffle: 28
0.0% done! 2.0375430584s elapsed
20.0% done! 24.2736289501s elapsed
40.0% done! 45.9073090553s elapsed
60.0% done! 66.4045600891s elapsed
80.0% done! 87.3796539307s elapsed
Number of differences b/t original and shuffle: 28
0.0% done! 2.00804305077s elapsed
20.0% done! 23.4076499939s elapsed
40.0% done! 44.6843700409s elapsed
60.0% done! 66.2608439922s elapsed
80.0% done! 88.4894390106s elapsed
Number of differences b/t original and shuffle: 36
0.0% done! 2.02911615372s elapsed
20.0% done! 21.7539160252s elapsed
40.0% done! 42.176071167s elapsed
60.0% done! 62.0842471123s elapsed
80.0% done! 83.4315152168s elapsed
Number of differences b/t original and shuffle: 36
0.0% done! 2.00507688522s elapsed
20.0% done! 23.0816810131s elapsed
40.0% done! 43.9805998802s elapsed
60.0% done! 64.5948519707s elapsed
80.0% done! 86.1674730778s elapsed
Number of differences b/t original and shuffle: 26
0.0% done! 2.01746702194s elapsed
20.0% done! 24.0207948685s elapsed
40.0% done! 44.8304729462s elapsed
60.0% done! 65.2819898129s elapsed
80.0% done! 86.4579348564s elapsed

In [81]:
# make dataframe from true and false aucs
flattened_aucs = [j for i in all_aucs for j in i]
my_dict = {'true_auc': true_auc, 'null_auc': flattened_aucs}
df_poop =  pd.DataFrame.from_dict(my_dict, orient='index').T
df_tidy = pd.melt(df_poop, value_vars=['true_auc', 'null_auc'],
                 value_name='auc', var_name='AUC_type')
print df_tidy.head()
#print flattened_aucs
sns.violinplot(x='AUC_type', y='auc',
    inner='points', data=df_tidy, bw=0.7)
plt.show()


   AUC_type  auc
0  true_auc    1
1  true_auc    1
2  true_auc    1
3  true_auc    1
4  true_auc    1

This seems kinda wild

Is there just an easy-to-spot difference in the number of zero-entries between the two?


In [184]:
from sklearn.decomposition import PCA

# Check PCA of things
def PCA_plot(X, y, n_components, plot_color, class_nums, class_names, title='PCA'):
    pca = PCA(n_components=n_components)
    X_pca = pca.fit(X).transform(X)

    print zip(plot_color, class_nums, class_names)
    for color, i, target_name in zip(plot_color, class_nums, class_names):
        
        # plot one class at a time, first plot all classes y == 0
        #print color
        #print y == i
        xvals = X_pca[y == i, 0]
        print xvals.shape
        yvals = X_pca[y == i, 1]
        plt.scatter(xvals, yvals, color=color, alpha=0.8, label=target_name)

    plt.legend(bbox_to_anchor=(1.01,1), loc='upper left', shadow=False)#, scatterpoints=1)
    plt.title('PCA of Malaria data')
    plt.show()


PCA_plot(X, y, 2, ['red', 'blue'], [0,1], ['malaria', 'non-malaria fever'])


[('red', 0, 'malaria'), ('blue', 1, 'non-malaria fever')]
(34,)
(26,)

Let's see if PCA differentiates all three classes easily, it looks like it might...

Or batch effects?


In [185]:
# convert classes to numbers
le = preprocessing.LabelEncoder()
le.fit(class_map_df['class'])
y_three_class = le.transform(class_map_df['class'])
print class_map_df.head(10)
print y_three_class
print X.shape
print y_three_class.shape

y_labels = np.sort(class_map_df['class'].unique())
print y_labels
colors = ['green', 'red', 'blue']

print np.unique(y_three_class)
PCA_plot(X, y_three_class, 2, colors, np.unique(y_three_class), y_labels)


            MS Assay Name                            class
Sample Name                                               
MCMA429            1001_P                          malaria
MCMA430            1002_P                          malaria
MCMA431            1003_P                          malaria
MCMA433            1004_P                          malaria
MCMA434            1005_P     non-malarial febrile illness
MCMA435            1006_P                          malaria
MCMA436            1007_P  bacterial bloodstream infection
MCMA442            1009_P                          malaria
MCMA443            1010_P     non-malarial febrile illness
MCMA444            1011_P  bacterial bloodstream infection
[1 1 1 1 2 1 0 1 2 0 0 1 1 1 1 1 0 2 2 2 0 1 1 1 1 2 0 2 1 2 1 1 1 1 1 1 2
 1 0 1 1 0 0 1 2 0 2 2 1 0 1 1 2 0 2 1 1 1 1 1]
(60, 6857)
(60,)
['bacterial bloodstream infection' 'malaria' 'non-malarial febrile illness']
[0 1 2]
[('green', 0, 'bacterial bloodstream infection'), ('red', 1, 'malaria'), ('blue', 2, 'non-malarial febrile illness')]
(12,)
(34,)
(14,)