Feature detected using Craig's code

Not sure how that worked, but the PCA plot is wild



In [2]:

    
import time

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import cross_val_score
#from sklearn.model_selection import StratifiedShuffleSplit
#from sklearn.model_selection import cross_val_score

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.utils import shuffle

from scipy import interp

%matplotlib inline



In [110]:

    
def remove_zero_columns(X, threshold=1e-20):
    # convert zeros to nan, drop all nan columns, the replace leftover nan with zeros
    X_non_zero_colum = X.replace(0, np.nan).dropna(how='all', axis=1).replace(np.nan, 0)
    #.dropna(how='all', axis=0).replace(np.nan,0)
    return X_non_zero_colum

def zero_fill_half_min(X, threshold=1e-20):
    # Fill zeros with 1/2 the minimum value of that column
    # input dataframe. Add only to zero values
    
    # Get a vector of 1/2 minimum values
    half_min = X[X > threshold].min(axis=0)*0.5
    
    # Add the half_min values to a dataframe where everything that isn't zero is NaN.
    # then convert NaN's to 0
    fill_vals = (X[X < threshold] + half_min).fillna(value=0)
    
    # Add the original dataframe to the dataframe of zeros and fill-values
    X_zeros_filled = X + fill_vals
    return X_zeros_filled



toy = pd.DataFrame([[1,2,3,0],
               [0,0,0,0],
               [0.5,1,0,0]], dtype=float)

toy_no_zeros = remove_zero_columns(toy)
toy_filled_zeros = zero_fill_half_min(toy_no_zeros)
print toy
print toy_no_zeros
print toy_filled_zeros









    



     0  1  2  3
0  1.0  2  3  0
1  0.0  0  0  0
2  0.5  1  0  0
     0  1  2
0  1.0  2  3
1  0.0  0  0
2  0.5  1  0
      0    1    2
0  1.00  2.0  3.0
1  0.25  0.5  1.5
2  0.50  1.0  1.5

Import the dataframe and remove any features that are all zero



In [111]:

    
### Subdivide the data into a feature table
data_path = '/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/old_repo_revo_healthcare/'\
'data/MTBLS315/7-24-17_Malaria_pos_grouped_data_Craig_processed.csv'
## Import the data and remove extraneous columns
df = pd.read_csv(data_path)
print df.shape
print df.head()
# Make a new index of mz:rt
mz = df.loc[:,"mz"].astype('str')
rt = df.loc[:,"rt"].astype('str')
idx = mz+':'+rt
df.index = idx
df
# separate samples from xcms/camera things to make feature table
not_samples = ['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 
               'isotopes', 'adduct']
samples_list = df.columns.difference(not_samples)
mz_rt_df = df[not_samples]

# convert to samples x features
X_df_raw = df[samples_list].T
# Remove zero-full columns and fill zeroes with 1/2 minimum values
X_df = remove_zero_columns(X_df_raw)
X_df_zero_filled = zero_fill_half_min(X_df)

print (X_df < 1e-20).sum().sum()
print "original shape: %s \n# zeros: %f\n" % (X_df_raw.shape, (X_df_raw < 1e-20).sum().sum())
print "zero-columns replaced? shape: %s \n# zeros: %f\n" % (X_df.shape, 
                                                         (X_df < 1e-20).sum().sum())
print "zeros filled shape: %s \n#zeros: %f\n" % (X_df_zero_filled.shape, 
                                              (X_df_zero_filled < 1e-20).sum().sum())


# Convert to numpy matrix to play nicely with sklearn
X = X_df.as_matrix()
print X.shape









    



(6857, 68)
           mz       mzmin       mzmax          rt       rtmin       rtmax  \
0  100.075385  100.074986  100.075519  129.794110  127.975656  132.153297   
1  100.916388  100.916045  100.916718   45.296209   44.146884   47.819750   
2  101.008038  101.007614  101.008242   38.931362   38.142080   39.498050   
3  101.010206  101.009702  101.010503   38.950441    9.380345   40.465458   
4  101.095582  101.095295  101.095724  973.599729  953.662750  992.645495   

          X1001_P         X1002_P         X1003_P          X1004_P  \
0    42373.927779   179331.750974  2537721.750522    101845.009079   
1    11178.774973    20729.982156    20852.688848     12717.817680   
2  8791113.535859  6606203.430266  6546795.750105  12136100.900199   
3   147009.244240   239414.696757   172061.687778  10058152.743696   
4   723913.491959   604511.079660   701094.488497    660342.546434   

        ...              X1024_P         X1032_P         X1034_P  \
0       ...                  NaN             NaN             NaN   
1       ...                  NaN             NaN             NaN   
2       ...        8041070.74371  7900344.594391  3202083.926599   
3       ...                  NaN             NaN             NaN   
4       ...                  NaN             NaN             NaN   

          X1036_P         X1043_P         X1052_P         X1054_P  \
0             NaN             NaN             NaN             NaN   
1             NaN             NaN             NaN             NaN   
2  5383539.442712  5802213.220837  7213738.304376  5679683.988764   
3             NaN             NaN             NaN             NaN   
4             NaN             NaN             NaN             NaN   

         X1055_P         X1060_P         X1062_P  
0            NaN             NaN             NaN  
1            NaN             NaN             NaN  
2  4694471.26903  4398947.827277  6154061.829345  
3            NaN             NaN             NaN  
4            NaN             NaN             NaN  

[5 rows x 68 columns]
99966
original shape: (60, 6857) 
# zeros: 7422.000000

zero-columns replaced? shape: (60, 6857) 
# zeros: 99966.000000

zeros filled shape: (60, 6857) 
#zeros: 0.000000

(60, 6857)

Get mappings between sample names, file names, and sample classes



In [179]:

    
# Get mapping between sample name and assay names
path_sample_name_map = '/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/revo_healthcare/data/raw/'\
                'MTBLS315/metadata/a_UPLC_POS_nmfi_and_bsi_diagnosis.txt'
# Index is the sample name
sample_df = pd.read_csv(path_sample_name_map, 
                        sep='\t', index_col=0)
sample_df = sample_df['MS Assay Name']
sample_df.shape
print sample_df.head(10)

# get mapping between sample name and sample class
path_sample_class_map = '/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/revo_healthcare/data/raw/'\
                        'MTBLS315/metadata/s_NMFI and BSI diagnosis.txt'
class_df = pd.read_csv(path_sample_class_map,
                      sep='\t')
# Set index as sample name
class_df.set_index('Sample Name', inplace=True)
class_df = class_df['Factor Value[patient group]']
print class_df.head(10)

# convert all non-malarial classes into a single classes 
# (collapse non-malarial febril illness and bacteremia together)
class_map_df = pd.concat([sample_df, class_df], axis=1)
class_map_df.rename(columns={'Factor Value[patient group]': 'class'}, inplace=True)
class_map_df
print class_map_df

# remove sample from y (labels) if not present in X (feature table)
print X_df.index
print class_map_df['MS Assay Name'].tolist()
for i in class_map_df['MS Assay Name']:
    if i not in X_df.index:
        print '\n\n\nFound the missing one!: ', i, '\n\n\n'
        break
drop_idx = class_map_df[class_map_df['MS Assay Name'] == i].index
class_map_df = class_map_df.drop(drop_idx, axis=0)
print class_map_df


binary_class_map = class_map_df.replace(to_replace=['non-malarial febrile illness', 'bacterial bloodstream infection' ], 
                                        value='non-malarial fever')

binary_class_map









    



Sample Name
MCMA429    1001_P
MCMA430    1002_P
MCMA431    1003_P
MCMA433    1004_P
MCMA434    1005_P
MCMA435    1006_P
MCMA436    1007_P
MCMA442    1009_P
MCMA443    1010_P
MCMA444    1011_P
Name: MS Assay Name, dtype: object
Sample Name
MCMA429                            malaria
MCMA430                            malaria
MCMA431                            malaria
MCMA433                            malaria
MCMA434       non-malarial febrile illness
MCMA435                            malaria
MCMA436    bacterial bloodstream infection
MCMA442                            malaria
MCMA443       non-malarial febrile illness
MCMA444    bacterial bloodstream infection
Name: Factor Value[patient group], dtype: object
            MS Assay Name                            class
Sample Name                                               
MCMA429            1001_P                          malaria
MCMA430            1002_P                          malaria
MCMA431            1003_P                          malaria
MCMA433            1004_P                          malaria
MCMA434            1005_P     non-malarial febrile illness
MCMA435            1006_P                          malaria
MCMA436            1007_P  bacterial bloodstream infection
MCMA442            1009_P                          malaria
MCMA443            1010_P     non-malarial febrile illness
MCMA444            1011_P  bacterial bloodstream infection
MCMA446            1012_P  bacterial bloodstream infection
MCMA447            1013_P                          malaria
MCMA448            1014_P                          malaria
MCMA450            1016_P                          malaria
MCMA451            1017_P                          malaria
MCMA452            1018_P                          malaria
MCMA455            1019_P  bacterial bloodstream infection
MCMA458            1020_P     non-malarial febrile illness
MCMA459            1021_P     non-malarial febrile illness
MCMA462            1023_P     non-malarial febrile illness
MCMA463            1024_P     non-malarial febrile illness
MCMA465            1026_P  bacterial bloodstream infection
MCMA466            1027_P                          malaria
MCMA472            1029_P                          malaria
MCMA474            1030_P                          malaria
MCMA477            1031_P                          malaria
MCMA478            1032_P     non-malarial febrile illness
MCMA480            1033_P  bacterial bloodstream infection
MCMA484            1034_P     non-malarial febrile illness
MCMA489            1035_P                          malaria
...                   ...                              ...
MCMA493            1037_P                          malaria
MCMA495            1038_P                          malaria
MCMA496            1039_P                          malaria
MCMA497            1040_P                          malaria
MCMA501            1041_P                          malaria
MCMA502            1042_P                          malaria
MCMA503            1043_P     non-malarial febrile illness
MCMA513            1044_P                          malaria
MCMA515            1045_P  bacterial bloodstream infection
MCMA516            1046_P                          malaria
MCMA523            1048_P                          malaria
MCMA526            1049_P  bacterial bloodstream infection
MCMA528            1050_P  bacterial bloodstream infection
MCMA533            1051_P                          malaria
MCMA534            1052_P     non-malarial febrile illness
MCMA535            1053_P  bacterial bloodstream infection
MCMA537            1054_P     non-malarial febrile illness
MCMA538            1055_P     non-malarial febrile illness
MCMA550            1056_P                          malaria
MCMA557            1057_P  bacterial bloodstream infection
MCMA560            1058_P                          malaria
MCMA562            1059_P                          malaria
MCMA564            1060_P     non-malarial febrile illness
MCMA565            1061_P  bacterial bloodstream infection
MCMA566            1062_P     non-malarial febrile illness
MURB082            1064_P                          malaria
MURB085            1065_P                          malaria
MURB088            1066_P                          malaria
MURB092            1067_P                          malaria
MURB095            1068_P                          malaria

[61 rows x 2 columns]
Index([u'1001_P', u'1002_P', u'1003_P', u'1004_P', u'1005_P', u'1006_P',
       u'1007_P', u'1009_P', u'1010_P', u'1011_P', u'1012_P', u'1013_P',
       u'1014_P', u'1016_P', u'1017_P', u'1018_P', u'1019_P', u'1021_P',
       u'1023_P', u'1024_P', u'1026_P', u'1027_P', u'1029_P', u'1030_P',
       u'1031_P', u'1032_P', u'1033_P', u'1034_P', u'1035_P', u'1036_P',
       u'1037_P', u'1038_P', u'1039_P', u'1040_P', u'1041_P', u'1042_P',
       u'1043_P', u'1044_P', u'1045_P', u'1046_P', u'1048_P', u'1049_P',
       u'1050_P', u'1051_P', u'1052_P', u'1053_P', u'1054_P', u'1055_P',
       u'1056_P', u'1057_P', u'1058_P', u'1059_P', u'1060_P', u'1061_P',
       u'1062_P', u'1064_P', u'1065_P', u'1066_P', u'1067_P', u'1068_P'],
      dtype='object')
['1001_P', '1002_P', '1003_P', '1004_P', '1005_P', '1006_P', '1007_P', '1009_P', '1010_P', '1011_P', '1012_P', '1013_P', '1014_P', '1016_P', '1017_P', '1018_P', '1019_P', '1020_P', '1021_P', '1023_P', '1024_P', '1026_P', '1027_P', '1029_P', '1030_P', '1031_P', '1032_P', '1033_P', '1034_P', '1035_P', '1036_P', '1037_P', '1038_P', '1039_P', '1040_P', '1041_P', '1042_P', '1043_P', '1044_P', '1045_P', '1046_P', '1048_P', '1049_P', '1050_P', '1051_P', '1052_P', '1053_P', '1054_P', '1055_P', '1056_P', '1057_P', '1058_P', '1059_P', '1060_P', '1061_P', '1062_P', '1064_P', '1065_P', '1066_P', '1067_P', '1068_P']



Found the missing one!:  1020_P 



            MS Assay Name                            class
Sample Name                                               
MCMA429            1001_P                          malaria
MCMA430            1002_P                          malaria
MCMA431            1003_P                          malaria
MCMA433            1004_P                          malaria
MCMA434            1005_P     non-malarial febrile illness
MCMA435            1006_P                          malaria
MCMA436            1007_P  bacterial bloodstream infection
MCMA442            1009_P                          malaria
MCMA443            1010_P     non-malarial febrile illness
MCMA444            1011_P  bacterial bloodstream infection
MCMA446            1012_P  bacterial bloodstream infection
MCMA447            1013_P                          malaria
MCMA448            1014_P                          malaria
MCMA450            1016_P                          malaria
MCMA451            1017_P                          malaria
MCMA452            1018_P                          malaria
MCMA455            1019_P  bacterial bloodstream infection
MCMA459            1021_P     non-malarial febrile illness
MCMA462            1023_P     non-malarial febrile illness
MCMA463            1024_P     non-malarial febrile illness
MCMA465            1026_P  bacterial bloodstream infection
MCMA466            1027_P                          malaria
MCMA472            1029_P                          malaria
MCMA474            1030_P                          malaria
MCMA477            1031_P                          malaria
MCMA478            1032_P     non-malarial febrile illness
MCMA480            1033_P  bacterial bloodstream infection
MCMA484            1034_P     non-malarial febrile illness
MCMA489            1035_P                          malaria
MCMA491            1036_P     non-malarial febrile illness
MCMA493            1037_P                          malaria
MCMA495            1038_P                          malaria
MCMA496            1039_P                          malaria
MCMA497            1040_P                          malaria
MCMA501            1041_P                          malaria
MCMA502            1042_P                          malaria
MCMA503            1043_P     non-malarial febrile illness
MCMA513            1044_P                          malaria
MCMA515            1045_P  bacterial bloodstream infection
MCMA516            1046_P                          malaria
MCMA523            1048_P                          malaria
MCMA526            1049_P  bacterial bloodstream infection
MCMA528            1050_P  bacterial bloodstream infection
MCMA533            1051_P                          malaria
MCMA534            1052_P     non-malarial febrile illness
MCMA535            1053_P  bacterial bloodstream infection
MCMA537            1054_P     non-malarial febrile illness
MCMA538            1055_P     non-malarial febrile illness
MCMA550            1056_P                          malaria
MCMA557            1057_P  bacterial bloodstream infection
MCMA560            1058_P                          malaria
MCMA562            1059_P                          malaria
MCMA564            1060_P     non-malarial febrile illness
MCMA565            1061_P  bacterial bloodstream infection
MCMA566            1062_P     non-malarial febrile illness
MURB082            1064_P                          malaria
MURB085            1065_P                          malaria
MURB088            1066_P                          malaria
MURB092            1067_P                          malaria
MURB095            1068_P                          malaria






    Out[179]:






  
    
      
      MS Assay Name
      class
    
    
      Sample Name
      
      
    
  
  
    
      MCMA429
      1001_P
      malaria
    
    
      MCMA430
      1002_P
      malaria
    
    
      MCMA431
      1003_P
      malaria
    
    
      MCMA433
      1004_P
      malaria
    
    
      MCMA434
      1005_P
      non-malarial fever
    
    
      MCMA435
      1006_P
      malaria
    
    
      MCMA436
      1007_P
      non-malarial fever
    
    
      MCMA442
      1009_P
      malaria
    
    
      MCMA443
      1010_P
      non-malarial fever
    
    
      MCMA444
      1011_P
      non-malarial fever
    
    
      MCMA446
      1012_P
      non-malarial fever
    
    
      MCMA447
      1013_P
      malaria
    
    
      MCMA448
      1014_P
      malaria
    
    
      MCMA450
      1016_P
      malaria
    
    
      MCMA451
      1017_P
      malaria
    
    
      MCMA452
      1018_P
      malaria
    
    
      MCMA455
      1019_P
      non-malarial fever
    
    
      MCMA459
      1021_P
      non-malarial fever
    
    
      MCMA462
      1023_P
      non-malarial fever
    
    
      MCMA463
      1024_P
      non-malarial fever
    
    
      MCMA465
      1026_P
      non-malarial fever
    
    
      MCMA466
      1027_P
      malaria
    
    
      MCMA472
      1029_P
      malaria
    
    
      MCMA474
      1030_P
      malaria
    
    
      MCMA477
      1031_P
      malaria
    
    
      MCMA478
      1032_P
      non-malarial fever
    
    
      MCMA480
      1033_P
      non-malarial fever
    
    
      MCMA484
      1034_P
      non-malarial fever
    
    
      MCMA489
      1035_P
      malaria
    
    
      MCMA491
      1036_P
      non-malarial fever
    
    
      MCMA493
      1037_P
      malaria
    
    
      MCMA495
      1038_P
      malaria
    
    
      MCMA496
      1039_P
      malaria
    
    
      MCMA497
      1040_P
      malaria
    
    
      MCMA501
      1041_P
      malaria
    
    
      MCMA502
      1042_P
      malaria
    
    
      MCMA503
      1043_P
      non-malarial fever
    
    
      MCMA513
      1044_P
      malaria
    
    
      MCMA515
      1045_P
      non-malarial fever
    
    
      MCMA516
      1046_P
      malaria
    
    
      MCMA523
      1048_P
      malaria
    
    
      MCMA526
      1049_P
      non-malarial fever
    
    
      MCMA528
      1050_P
      non-malarial fever
    
    
      MCMA533
      1051_P
      malaria
    
    
      MCMA534
      1052_P
      non-malarial fever
    
    
      MCMA535
      1053_P
      non-malarial fever
    
    
      MCMA537
      1054_P
      non-malarial fever
    
    
      MCMA538
      1055_P
      non-malarial fever
    
    
      MCMA550
      1056_P
      malaria
    
    
      MCMA557
      1057_P
      non-malarial fever
    
    
      MCMA560
      1058_P
      malaria
    
    
      MCMA562
      1059_P
      malaria
    
    
      MCMA564
      1060_P
      non-malarial fever
    
    
      MCMA565
      1061_P
      non-malarial fever
    
    
      MCMA566
      1062_P
      non-malarial fever
    
    
      MURB082
      1064_P
      malaria
    
    
      MURB085
      1065_P
      malaria
    
    
      MURB088
      1066_P
      malaria
    
    
      MURB092
      1067_P
      malaria
    
    
      MURB095
      1068_P
      malaria



In [128]:

    
### Remove any rows from binary map that aren't in craig's actual feature table
# Remove any samples from y (classes) that aren't in feature table
idx = X_df.index.tolist()
print idx
new_idx = [i.replace('X','') for i in idx]

X_df.index = new_idx
print X_df.shape


print binary_class_map.shape
print binary_class_map









    



['1001_P', '1002_P', '1003_P', '1004_P', '1005_P', '1006_P', '1007_P', '1009_P', '1010_P', '1011_P', '1012_P', '1013_P', '1014_P', '1016_P', '1017_P', '1018_P', '1019_P', '1021_P', '1023_P', '1024_P', '1026_P', '1027_P', '1029_P', '1030_P', '1031_P', '1032_P', '1033_P', '1034_P', '1035_P', '1036_P', '1037_P', '1038_P', '1039_P', '1040_P', '1041_P', '1042_P', '1043_P', '1044_P', '1045_P', '1046_P', '1048_P', '1049_P', '1050_P', '1051_P', '1052_P', '1053_P', '1054_P', '1055_P', '1056_P', '1057_P', '1058_P', '1059_P', '1060_P', '1061_P', '1062_P', '1064_P', '1065_P', '1066_P', '1067_P', '1068_P']
(60, 6857)
Index([u'1001_P', u'1002_P', u'1003_P', u'1004_P', u'1005_P', u'1006_P',
       u'1007_P', u'1009_P', u'1010_P', u'1011_P', u'1012_P', u'1013_P',
       u'1014_P', u'1016_P', u'1017_P', u'1018_P', u'1019_P', u'1021_P',
       u'1023_P', u'1024_P', u'1026_P', u'1027_P', u'1029_P', u'1030_P',
       u'1031_P', u'1032_P', u'1033_P', u'1034_P', u'1035_P', u'1036_P',
       u'1037_P', u'1038_P', u'1039_P', u'1040_P', u'1041_P', u'1042_P',
       u'1043_P', u'1044_P', u'1045_P', u'1046_P', u'1048_P', u'1049_P',
       u'1050_P', u'1051_P', u'1052_P', u'1053_P', u'1054_P', u'1055_P',
       u'1056_P', u'1057_P', u'1058_P', u'1059_P', u'1060_P', u'1061_P',
       u'1062_P', u'1064_P', u'1065_P', u'1066_P', u'1067_P', u'1068_P'],
      dtype='object')
['1001_P', '1002_P', '1003_P', '1004_P', '1005_P', '1006_P', '1007_P', '1009_P', '1010_P', '1011_P', '1012_P', '1013_P', '1014_P', '1016_P', '1017_P', '1018_P', '1019_P', '1020_P', '1021_P', '1023_P', '1024_P', '1026_P', '1027_P', '1029_P', '1030_P', '1031_P', '1032_P', '1033_P', '1034_P', '1035_P', '1036_P', '1037_P', '1038_P', '1039_P', '1040_P', '1041_P', '1042_P', '1043_P', '1044_P', '1045_P', '1046_P', '1048_P', '1049_P', '1050_P', '1051_P', '1052_P', '1053_P', '1054_P', '1055_P', '1056_P', '1057_P', '1058_P', '1059_P', '1060_P', '1061_P', '1062_P', '1064_P', '1065_P', '1066_P', '1067_P', '1068_P']



Found the missing one!:  1020_P
(60, 2)
            MS Assay Name               class
Sample Name                                  
MCMA429            1001_P             malaria
MCMA430            1002_P             malaria
MCMA431            1003_P             malaria
MCMA433            1004_P             malaria
MCMA434            1005_P  non-malarial fever
MCMA435            1006_P             malaria
MCMA436            1007_P  non-malarial fever
MCMA442            1009_P             malaria
MCMA443            1010_P  non-malarial fever
MCMA444            1011_P  non-malarial fever
MCMA446            1012_P  non-malarial fever
MCMA447            1013_P             malaria
MCMA448            1014_P             malaria
MCMA450            1016_P             malaria
MCMA451            1017_P             malaria
MCMA452            1018_P             malaria
MCMA455            1019_P  non-malarial fever
MCMA459            1021_P  non-malarial fever
MCMA462            1023_P  non-malarial fever
MCMA463            1024_P  non-malarial fever
MCMA465            1026_P  non-malarial fever
MCMA466            1027_P             malaria
MCMA472            1029_P             malaria
MCMA474            1030_P             malaria
MCMA477            1031_P             malaria
MCMA478            1032_P  non-malarial fever
MCMA480            1033_P  non-malarial fever
MCMA484            1034_P  non-malarial fever
MCMA489            1035_P             malaria
MCMA491            1036_P  non-malarial fever
MCMA493            1037_P             malaria
MCMA495            1038_P             malaria
MCMA496            1039_P             malaria
MCMA497            1040_P             malaria
MCMA501            1041_P             malaria
MCMA502            1042_P             malaria
MCMA503            1043_P  non-malarial fever
MCMA513            1044_P             malaria
MCMA515            1045_P  non-malarial fever
MCMA516            1046_P             malaria
MCMA523            1048_P             malaria
MCMA526            1049_P  non-malarial fever
MCMA528            1050_P  non-malarial fever
MCMA533            1051_P             malaria
MCMA534            1052_P  non-malarial fever
MCMA535            1053_P  non-malarial fever
MCMA537            1054_P  non-malarial fever
MCMA538            1055_P  non-malarial fever
MCMA550            1056_P             malaria
MCMA557            1057_P  non-malarial fever
MCMA560            1058_P             malaria
MCMA562            1059_P             malaria
MCMA564            1060_P  non-malarial fever
MCMA565            1061_P  non-malarial fever
MCMA566            1062_P  non-malarial fever
MURB082            1064_P             malaria
MURB085            1065_P             malaria
MURB088            1066_P             malaria
MURB092            1067_P             malaria
MURB095            1068_P             malaria



In [129]:

    
# convert classes to numbers
le = preprocessing.LabelEncoder()
le.fit(binary_class_map['class'])
y = le.transform(binary_class_map['class'])
print X.shape
print y.shape









    



(60, 6857)
(60,)

Plot the distribution of classification accuracy across multiple cross-validation splits - Kinda Dumb

Turns out doing this is kind of dumb, because you're not taking into account the prediction score your classifier assigned. Use AUC's instead. You want to give your classifier a lower score if it is really confident and wrong, than vice-versa



In [48]:

    
def rf_violinplot(X, y, n_iter=25, test_size=0.3, random_state=1,
                 n_estimators=1000):
    cross_val_skf = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, 
                                           random_state=random_state)
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)

    scores = cross_val_score(clf, X, y, cv=cross_val_skf)

    sns.violinplot(scores,inner='stick')

rf_violinplot(X,y)


# TODO - Switch to using caret for this bs..?



In [49]:

    
# Do multi-fold cross validation for adaboost classifier
def adaboost_violinplot(X, y, n_iter=25, test_size=0.3, random_state=1,
                       n_estimators=200):
    cross_val_skf = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)
    clf = AdaBoostClassifier(n_estimators=n_estimators, random_state=random_state)

    scores = cross_val_score(clf, X, y, cv=cross_val_skf)

    sns.violinplot(scores,inner='stick')

adaboost_violinplot(X,y)



In [115]:

    
# TODO PQN normalization, and log-transformation, 
# and some feature selection (above certain threshold of intensity, use principal components), et

def pqn_normalize(X, integral_first=False, plot=False):
    '''
    Take a feature table and run PQN normalization on it
    '''
    # normalize by sum of intensities in each sample first. Not necessary
    if integral_first: 
        sample_sums = np.sum(X, axis=1)
        X = (X / sample_sums[:,np.newaxis])
    
    # Get the median value of each feature across all samples
    mean_intensities = np.median(X, axis=0)
    
    # Divde each feature by the median value of each feature - 
    # these are the quotients for each feature
    X_quotients = (X / mean_intensities[np.newaxis,:])
    
    if plot: # plot the distribution of quotients from one sample
        for i in range(1,len(X_quotients[:,1])):
            print 'allquotients reshaped!\n\n', 
            #all_quotients = X_quotients.reshape(np.prod(X_quotients.shape))
            all_quotients = X_quotients[i,:]
            print all_quotients.shape
            x = np.random.normal(loc=0, scale=1, size=len(all_quotients))
            sns.violinplot(all_quotients)
            plt.title("median val: %f\nMax val=%f" % (np.median(all_quotients), np.max(all_quotients)))
            plt.plot( title="median val: ")#%f" % np.median(all_quotients))
            plt.xlim([-0.5, 5])
            plt.show()

    # Define a quotient for each sample as the median of the feature-specific quotients
    # in that sample
    sample_quotients = np.median(X_quotients, axis=1)
    
    # Quotient normalize each samples
    X_pqn = X / sample_quotients[:,np.newaxis]
    return X_pqn

# Make a fake sample, with 2 samples at 1x and 2x dilutions
X_toy = np.array([[1,1,1,],
                  [2,2,2],
                  [3,6,9],
                  [6,12,18]], dtype=float)
print X_toy
print X_toy.reshape(1, np.prod(X_toy.shape))
X_toy_pqn_int = pqn_normalize(X_toy, integral_first=True, plot=True)
print X_toy_pqn_int

print '\n\n\n'
X_toy_pqn = pqn_normalize(X_toy)
print X_toy_pqn









    



[[  1.   1.   1.]
 [  2.   2.   2.]
 [  3.   6.   9.]
 [  6.  12.  18.]]
[[  1.   1.   1.   2.   2.   2.   3.   6.   9.   6.  12.  18.]]
allquotients reshaped!

(3,)






    












    



allquotients reshaped!

(3,)






    












    



allquotients reshaped!

(3,)






    












    



[[ 0.33333333  0.33333333  0.33333333]
 [ 0.33333333  0.33333333  0.33333333]
 [ 0.16666667  0.33333333  0.5       ]
 [ 0.16666667  0.33333333  0.5       ]]




[[ 4.  4.  4.]
 [ 4.  4.  4.]
 [ 2.  4.  6.]
 [ 2.  4.  6.]]

pqn normalize your features

Doesn't work with all the NaN values. Didn't bother fixing it right now.



In [118]:

    
print X.shape
X_pqn = pqn_normalize(X)
print X_pqn









    



(60, 6857)
[[ nan  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]
 ..., 
 [ nan  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]
 [ nan  nan  nan ...,  nan  nan  nan]]

Random Forest & adaBoost with PQN-normalized data



In [231]:

    
rf_violinplot(X_pqn, y)



In [232]:

    
# Do multi-fold cross validation for adaboost classifier
adaboost_violinplot(X_pqn, y)

RF & adaBoost with PQN-normalized, log-transformed data

Turns out a monotonic transformation doesn't really affect any of these things. I guess they're already close to unit varinace...?



In [234]:

    
X_pqn_nlog = np.log(X_pqn)
rf_violinplot(X_pqn_nlog, y)



In [235]:

    
adaboost_violinplot(X_pqn_nlog, y)



In [72]:

    
def roc_curve_cv(X, y, clf, cross_val,
                path='/home/irockafe/Desktop/roc.pdf',
                save=False, plot=True):  
    t1 = time.time()
    # collect vals for the ROC curves
    tpr_list = []
    mean_fpr = np.linspace(0,1,100)
    auc_list = []
    
    # Get the false-positive and true-positive rate
    for i, (train, test) in enumerate(cross_val):
        clf.fit(X[train], y[train])
        y_pred = clf.predict_proba(X[test])[:,1]
        
        # get fpr, tpr
        fpr, tpr, thresholds = roc_curve(y[test], y_pred)
        roc_auc = auc(fpr, tpr)
        #print 'AUC', roc_auc
        #sns.plt.plot(fpr, tpr, lw=10, alpha=0.6, label='ROC - AUC = %0.2f' % roc_auc,)
        #sns.plt.show()
        tpr_list.append(interp(mean_fpr, fpr, tpr))
        tpr_list[-1][0] = 0.0
        auc_list.append(roc_auc)
        
        if (i % 10 == 0):
            print '{perc}% done! {time}s elapsed'.format(perc=100*float(i)/cross_val.n_iter, time=(time.time() - t1))
        
            
        
        
    # get mean tpr and fpr
    mean_tpr = np.mean(tpr_list, axis=0)
    # make sure it ends up at 1.0
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(auc_list)
    
    if plot:
        # plot mean auc
        plt.plot(mean_fpr, mean_tpr, label='Mean ROC - AUC = %0.2f $\pm$ %0.2f' % (mean_auc, 
                                                                                       std_auc),
                    lw=5, color='b')

        # plot luck-line
        plt.plot([0,1], [0,1], linestyle = '--', lw=2, color='r',
                    label='Luck', alpha=0.5) 

        # plot 1-std
        std_tpr = np.std(tpr_list, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.2,
                            label=r'$\pm$ 1 stdev')

        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC curve, {iters} iterations of {cv} cross validation'.format(
            iters=cross_val.n_iter, cv='{train}:{test}'.format(test=cross_val.test_size, train=(1-cross_val.test_size)))
                 )
        plt.legend(loc="lower right")

        if save:
            plt.savefig(path,  format='pdf')


        plt.show()
    return tpr_list, auc_list, mean_fpr



In [119]:

    
rf_estimators = 1000
n_iter = 3
test_size = 0.3
random_state = 1
cross_val_rf = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)
clf_rf = RandomForestClassifier(n_estimators=rf_estimators, random_state=random_state)
rf_graph_path = '''/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/revolutionizing_healthcare/data/MTBLS315/\
isaac_feature_tables/uhplc_pos/rf_roc_{trees}trees_{cv}cviter.pdf'''.format(trees=rf_estimators, cv=n_iter)

print cross_val_rf.n_iter
print cross_val_rf.test_size

tpr_vals, auc_vals, mean_fpr = roc_curve_cv(X, y, clf_rf, cross_val_rf,
                                           path=rf_graph_path, save=False)









    



3
0.3
0.0% done! 2.01393198967s elapsed



In [75]:

    
# For adaboosted
n_iter = 3
test_size = 0.3
random_state = 1
adaboost_estimators = 200
adaboost_path = '''/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/revolutionizing_healthcare/data/MTBLS315/\
isaac_feature_tables/uhplc_pos/adaboost_roc_{trees}trees_{cv}cviter.pdf'''.format(trees=adaboost_estimators, 
                                                                            cv=n_iter)


cross_val_adaboost = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)
clf = AdaBoostClassifier(n_estimators=adaboost_estimators, random_state=random_state)

adaboost_tpr, adaboost_auc, adaboost_fpr = roc_curve_cv(X, y, clf, cross_val_adaboost,
                                                       path=adaboost_path)









    



0.0% done! 4.04500412941s elapsed



In [120]:

    
print X.shape

Great, you can classify things. But make null models and do a sanity check to make sure you arent just classifying garbage



In [76]:

    
# Make a null model AUC curve

def make_null_model(X, y, clf, cross_val, random_state=1, num_shuffles=5, plot=True):
    '''
    Runs the true model, then sanity-checks by:
    
    Shuffles class labels and then builds cross-validated ROC curves from them.
    Compares true AUC vs. shuffled auc by t-test (assumes normality of AUC curve)
    '''
    null_aucs = []
    print y.shape
    print X.shape
    tpr_true, auc_true, fpr_true = roc_curve_cv(X, y, clf, cross_val)
    # shuffle y lots of times
    for i in range(0, num_shuffles):
        #Iterate through the shuffled y vals and repeat with appropriate params
        # Retain the auc vals for final plotting of distribution
        y_shuffle = shuffle(y)
        cross_val.y = y_shuffle
        cross_val.y_indices = y_shuffle
        print 'Number of differences b/t original and shuffle: %s' % (y == cross_val.y).sum()
        # Get auc values for number of iterations
        tpr, auc, fpr = roc_curve_cv(X, y_shuffle, clf, cross_val, plot=False)
        
        null_aucs.append(auc)
    
    
    #plot the outcome
    if plot:
        flattened_aucs = [j for i in null_aucs for j in i]
        my_dict = {'true_auc': auc_true, 'null_auc': flattened_aucs}
        df_poop =  pd.DataFrame.from_dict(my_dict, orient='index').T
        df_tidy = pd.melt(df_poop, value_vars=['true_auc', 'null_auc'],
                         value_name='auc', var_name='AUC_type')
        #print flattened_aucs
        sns.violinplot(x='AUC_type', y='auc',
            inner='points', data=df_tidy)
        # Plot distribution of AUC vals   
        plt.title("Distribution of aucs")
        #sns.plt.ylabel('count')
        plt.xlabel('AUC')
        #sns.plt.plot(auc_true, 0, color='red', markersize=10)
        plt.show()
    # Do a quick t-test to see if odds of randomly getting an AUC that good
    return auc_true, null_aucs



In [80]:

    
# Make a null model AUC curve & compare it to null-model

# Random forest magic!
rf_estimators = 1000
n_iter = 50
test_size = 0.3
random_state = 1

# Define Cross-validation and classifier
cross_val_rf = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size, random_state=random_state)
clf_rf = RandomForestClassifier(n_estimators=rf_estimators, random_state=random_state)

true_auc, all_aucs = make_null_model(X, y, clf_rf, cross_val_rf, num_shuffles=5)









    



(60,)
(60, 6857)
0.0% done! 1.98951101303s elapsed
20.0% done! 23.2866311073s elapsed
40.0% done! 45.426803112s elapsed
60.0% done! 64.6907000542s elapsed
80.0% done! 84.3342819214s elapsed






    












    



Number of differences b/t original and shuffle: 28
0.0% done! 2.0375430584s elapsed
20.0% done! 24.2736289501s elapsed
40.0% done! 45.9073090553s elapsed
60.0% done! 66.4045600891s elapsed
80.0% done! 87.3796539307s elapsed
Number of differences b/t original and shuffle: 28
0.0% done! 2.00804305077s elapsed
20.0% done! 23.4076499939s elapsed
40.0% done! 44.6843700409s elapsed
60.0% done! 66.2608439922s elapsed
80.0% done! 88.4894390106s elapsed
Number of differences b/t original and shuffle: 36
0.0% done! 2.02911615372s elapsed
20.0% done! 21.7539160252s elapsed
40.0% done! 42.176071167s elapsed
60.0% done! 62.0842471123s elapsed
80.0% done! 83.4315152168s elapsed
Number of differences b/t original and shuffle: 36
0.0% done! 2.00507688522s elapsed
20.0% done! 23.0816810131s elapsed
40.0% done! 43.9805998802s elapsed
60.0% done! 64.5948519707s elapsed
80.0% done! 86.1674730778s elapsed
Number of differences b/t original and shuffle: 26
0.0% done! 2.01746702194s elapsed
20.0% done! 24.0207948685s elapsed
40.0% done! 44.8304729462s elapsed
60.0% done! 65.2819898129s elapsed
80.0% done! 86.4579348564s elapsed



In [81]:

    
# make dataframe from true and false aucs
flattened_aucs = [j for i in all_aucs for j in i]
my_dict = {'true_auc': true_auc, 'null_auc': flattened_aucs}
df_poop =  pd.DataFrame.from_dict(my_dict, orient='index').T
df_tidy = pd.melt(df_poop, value_vars=['true_auc', 'null_auc'],
                 value_name='auc', var_name='AUC_type')
print df_tidy.head()
#print flattened_aucs
sns.violinplot(x='AUC_type', y='auc',
    inner='points', data=df_tidy, bw=0.7)
plt.show()









    



   AUC_type  auc
0  true_auc    1
1  true_auc    1
2  true_auc    1
3  true_auc    1
4  true_auc    1

This seems kinda wild

Is there just an easy-to-spot difference in the number of zero-entries between the two?



In [184]:

    
from sklearn.decomposition import PCA

# Check PCA of things
def PCA_plot(X, y, n_components, plot_color, class_nums, class_names, title='PCA'):
    pca = PCA(n_components=n_components)
    X_pca = pca.fit(X).transform(X)

    print zip(plot_color, class_nums, class_names)
    for color, i, target_name in zip(plot_color, class_nums, class_names):
        
        # plot one class at a time, first plot all classes y == 0
        #print color
        #print y == i
        xvals = X_pca[y == i, 0]
        print xvals.shape
        yvals = X_pca[y == i, 1]
        plt.scatter(xvals, yvals, color=color, alpha=0.8, label=target_name)

    plt.legend(bbox_to_anchor=(1.01,1), loc='upper left', shadow=False)#, scatterpoints=1)
    plt.title('PCA of Malaria data')
    plt.show()


PCA_plot(X, y, 2, ['red', 'blue'], [0,1], ['malaria', 'non-malaria fever'])









    



[('red', 0, 'malaria'), ('blue', 1, 'non-malaria fever')]
(34,)
(26,)

Let's see if PCA differentiates all three classes easily, it looks like it might...

Or batch effects?



In [185]:

    
# convert classes to numbers
le = preprocessing.LabelEncoder()
le.fit(class_map_df['class'])
y_three_class = le.transform(class_map_df['class'])
print class_map_df.head(10)
print y_three_class
print X.shape
print y_three_class.shape

y_labels = np.sort(class_map_df['class'].unique())
print y_labels
colors = ['green', 'red', 'blue']

print np.unique(y_three_class)
PCA_plot(X, y_three_class, 2, colors, np.unique(y_three_class), y_labels)









    



            MS Assay Name                            class
Sample Name                                               
MCMA429            1001_P                          malaria
MCMA430            1002_P                          malaria
MCMA431            1003_P                          malaria
MCMA433            1004_P                          malaria
MCMA434            1005_P     non-malarial febrile illness
MCMA435            1006_P                          malaria
MCMA436            1007_P  bacterial bloodstream infection
MCMA442            1009_P                          malaria
MCMA443            1010_P     non-malarial febrile illness
MCMA444            1011_P  bacterial bloodstream infection
[1 1 1 1 2 1 0 1 2 0 0 1 1 1 1 1 0 2 2 2 0 1 1 1 1 2 0 2 1 2 1 1 1 1 1 1 2
 1 0 1 1 0 0 1 2 0 2 2 1 0 1 1 2 0 2 1 1 1 1 1]
(60, 6857)
(60,)
['bacterial bloodstream infection' 'malaria' 'non-malarial febrile illness']
[0 1 2]
[('green', 0, 'bacterial bloodstream infection'), ('red', 1, 'malaria'), ('blue', 2, 'non-malarial febrile illness')]
(12,)
(34,)
(14,)

	MS Assay Name	class
Sample Name
MCMA429	1001_P	malaria
MCMA430	1002_P	malaria
MCMA431	1003_P	malaria
MCMA433	1004_P	malaria
MCMA434	1005_P	non-malarial fever
MCMA435	1006_P	malaria
MCMA436	1007_P	non-malarial fever
MCMA442	1009_P	malaria
MCMA443	1010_P	non-malarial fever
MCMA444	1011_P	non-malarial fever
MCMA446	1012_P	non-malarial fever
MCMA447	1013_P	malaria
MCMA448	1014_P	malaria
MCMA450	1016_P	malaria
MCMA451	1017_P	malaria
MCMA452	1018_P	malaria
MCMA455	1019_P	non-malarial fever
MCMA459	1021_P	non-malarial fever
MCMA462	1023_P	non-malarial fever
MCMA463	1024_P	non-malarial fever
MCMA465	1026_P	non-malarial fever
MCMA466	1027_P	malaria
MCMA472	1029_P	malaria
MCMA474	1030_P	malaria
MCMA477	1031_P	malaria
MCMA478	1032_P	non-malarial fever
MCMA480	1033_P	non-malarial fever
MCMA484	1034_P	non-malarial fever
MCMA489	1035_P	malaria
MCMA491	1036_P	non-malarial fever
MCMA493	1037_P	malaria
MCMA495	1038_P	malaria
MCMA496	1039_P	malaria
MCMA497	1040_P	malaria
MCMA501	1041_P	malaria
MCMA502	1042_P	malaria
MCMA503	1043_P	non-malarial fever
MCMA513	1044_P	malaria
MCMA515	1045_P	non-malarial fever
MCMA516	1046_P	malaria
MCMA523	1048_P	malaria
MCMA526	1049_P	non-malarial fever
MCMA528	1050_P	non-malarial fever
MCMA533	1051_P	malaria
MCMA534	1052_P	non-malarial fever
MCMA535	1053_P	non-malarial fever
MCMA537	1054_P	non-malarial fever
MCMA538	1055_P	non-malarial fever
MCMA550	1056_P	malaria
MCMA557	1057_P	non-malarial fever
MCMA560	1058_P	malaria
MCMA562	1059_P	malaria
MCMA564	1060_P	non-malarial fever
MCMA565	1061_P	non-malarial fever
MCMA566	1062_P	non-malarial fever
MURB082	1064_P	malaria
MURB085	1065_P	malaria
MURB088	1066_P	malaria
MURB092	1067_P	malaria
MURB095	1068_P	malaria