Shrink datasets down to mz, try to classify them, then try to combine features

You'll have to... mean center and standardize all your features?


In [540]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import preprocessing

from IPython.display import display

import pickle

# My code
import data.preprocessing as preproc
import util.combine_by_mz as combine_mz
import util.rt_window_prediction as rtwin


/home/irockafe/miniconda2/envs/isaac_revo_healthcare/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

Start with mtbls315

load up the data


In [260]:
### import two datasets
def reindex_xcms_by_mzrt(df):
    df.index = (df.loc[:,'mz'].astype('str') + 
               ':' + df.loc[:, 'rt'].astype('str'))
    return df

local_path = ('/home/irockafe/Dropbox (MIT)/Alm_Lab/' +
'projects/')
# malaria
malaria_path = (local_path + '/revo_healthcare/data/processed/MTBLS315/'+
            'uhplc_pos/xcms_result_4.csv')
df_malaria_raw = pd.read_csv(malaria_path, index_col=0)

# convert column names to remove X added to them
new_idx = [i.replace('X', '') for i in df_malaria_raw.columns]
df_malaria_raw.columns = new_idx
print df_malaria_raw.columns
# replace 0 values with nans, so it's easier to replace them later
df_malaria_raw = df_malaria_raw.replace(to_replace=0.0 , value=np.nan, )

# Make a new index of mz:rt
mz = df_malaria_raw.loc[:,"mz"].astype('str')
rt = df_malaria_raw.loc[:,"rt"].astype('str')
idx = mz+':'+rt
df_malaria_raw.index = idx

# separate samples from xcms/camera things to make feature table
not_samples = ['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 
               'npeaks', 'uhplc_pos', 
               ]
samples_list = df_malaria_raw.columns.difference(not_samples)
mz_rt_df = df_malaria_raw[not_samples]

# convert to samples x features
X_df_malaria_raw = df_malaria_raw[samples_list].T

print "original shape: %s \n# nans: %f\n" % (X_df_malaria_raw.shape, X_df_malaria_raw.isnull().sum().sum())


Index([u'mz', u'mzmin', u'mzmax', u'rt', u'rtmin', u'rtmax', u'npeaks',
       u'uhplc_pos', u'1001_P', u'1002_P', u'1003_P', u'1004_P', u'1005_P',
       u'1006_P', u'1007_P', u'1009_P', u'1010_P', u'1011_P', u'1012_P',
       u'1013_P', u'1014_P', u'1016_P', u'1017_P', u'1018_P', u'1019_P',
       u'1020_P', u'1021_P', u'1023_P', u'1024_P', u'1026_P', u'1027_P',
       u'1029_P', u'1030_P', u'1031_P', u'1032_P', u'1033_P', u'1034_P',
       u'1035_P', u'1036_P', u'1037_P', u'1038_P', u'1039_P', u'1040_P',
       u'1041_P', u'1042_P', u'1043_P', u'1044_P', u'1045_P', u'1046_P',
       u'1048_P', u'1049_P', u'1050_P', u'1051_P', u'1052_P', u'1053_P',
       u'1054_P', u'1055_P', u'1056_P', u'1057_P', u'1058_P', u'1059_P',
       u'1060_P', u'1061_P', u'1062_P', u'1064_P', u'1065_P', u'1066_P',
       u'1067_P', u'1068_P'],
      dtype='object')
original shape: (61, 6737) 
# nans: 3727.000000

Mapping between class, sample name, run name


In [261]:
# Get mapping between sample name and assay names
path_sample_name_map = (local_path + 'revo_healthcare/data/raw/' +
                'MTBLS315/metadata/a_UPLC_POS_nmfi_and_bsi_diagnosis.txt')

# Sample name to Assay name
# Index is the sample name
# value we want is the Assay name
sample_df = pd.read_csv(path_sample_name_map, 
                        sep='\t', index_col=0)
sample_df = sample_df['MS Assay Name']
sample_df.shape

# sample name to sample class
path_sample_class_map = (local_path + 'revo_healthcare/data/raw/' +
                        'MTBLS315/metadata/s_NMFI and BSI diagnosis.txt')
class_df = pd.read_csv(path_sample_class_map,
                      sep='\t')
class_df.set_index('Sample Name', inplace=True)
class_df = class_df['Factor Value[patient group]']

# Combine sample > assay and sample > class into one
class_map_df = pd.concat([sample_df, class_df], axis=1)
class_map_df.rename(columns={'Factor Value[patient group]': 'class'}, inplace=True)

# convert all non-malarial classes into a single classes 
# (collapse non-malarial febril illness and bacteremia together)
binary_class_map = class_map_df.replace(to_replace=['non-malarial febrile illness', 'bacterial bloodstream infection' ], 
                                        value='non-malarial fever')

print "binary class map:\n", binary_class_map.head()

# Get case and control samples in their own dataframes
case_str = 'malaria'
control_str = 'non-malarial fever'
# get that assay names based on class
case_labels = binary_class_map[binary_class_map['class'] == case_str]['MS Assay Name']
control_labels = binary_class_map[binary_class_map['class'] == control_str]['MS Assay Name']
# select the assay names from X_df_raw based on class
case = X_df_malaria_raw.loc[case_labels]
control = X_df_malaria_raw.loc[control_labels]

print 'case shape: ', case.shape
print 'control shape: ', control.shape


binary class map:
            MS Assay Name               class
Sample Name                                  
MCMA429            1001_P             malaria
MCMA430            1002_P             malaria
MCMA431            1003_P             malaria
MCMA433            1004_P             malaria
MCMA434            1005_P  non-malarial fever
case shape:  (34, 6737)
control shape:  (27, 6737)

In [262]:
# convert classes to numbers
le = preprocessing.LabelEncoder()
le.fit(binary_class_map['class'])
assert(binary_class_map['MS Assay Name'] == X_df_malaria_raw.index).all()
y_malaria = le.transform(binary_class_map['class'])
y_malaria


Out[262]:
array([0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0])

Get p. vivax data to useful format


In [12]:
# Get p. vivax data
project_path = ('/revo_healthcare/data/processed/' +
                'ST000578/ST000578_AN000888_Results.tsv')

p_vivax_path = local_path + project_path
df_p_vivax = pd.read_csv(p_vivax_path, sep='\t')


/home/irockafe/miniconda2/envs/isaac_revo_healthcare/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,402,403,404,405,406,407,408,409,410,411,412,413,414,415,416,417,418,419,420,421,422,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511,512,513,514,515,516,517,518,519,520,521,522,523,524,525,526,527,528,529,530,531,532,533,534,535,536,537,538,539,540,541,542,543,544,545,546,547,548,549,550,551,552,553,554,555,556,557,558,559,560,561,562,563,564,565,566,567,568,569,570,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602,603,604,605,606,607,608,609,610,611,612,613,614,615,616,617,618,619,620,621,622,623,624,625,626,627,628,629,630,631,632,633,634,635,636,637,638,639,640,641,642,643,644,645,646,647,648,649,650,651,652,653,654,655,656,657,658,659,660,661,662,663,664,665,666,667,668,669,670,671,672,673,674,675,676,677,678,679,680,681,682,683,684,685,686,687,688,689,690,691,692,693,694,695,696,697,698,699,700,701,702,703,704,705,706,707,708,709,710,711,712,713,714,715,716,717,718,719,720,721,722,723,724,725,726,727,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767,768,769,770,771,772,773,774,775,776,777,778,779,780,781,782,783,784,785,786,787,788,789,790,791,792,793,794,795,796,797,798,799,800,801,802,803,804,805,806,807,808,809,810,811,812,813,814,815,816,817,818,819,820,821,822,823,824,825,826,827,828,829,830,831,832,833,834,835,836,837,838,839,840) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [13]:
## Parse the class-labels from output
outcome = df_p_vivax.iloc[0,:]
print outcome.unique()

# convert all P. vivax into just P.vivax
susceptible_triplicate = outcome[outcome.str.contains('Current Malaria Infection:P.Vivax') & 
        outcome.str.contains('Chloroquine Resistance:Susceptible')]

resistant_triplicate = outcome[outcome.str.contains('Current Malaria Infection:P.Vivax') & 
        outcome.str.contains('Chloroquine Resistance:Resistant')]

control_triplicate = outcome[outcome.str.contains('Current Malaria Infection:None') ]
print '\n\nSusceptible\n', susceptible_triplicate.shape[0] / 3
print '\n\nResistant\n', resistant_triplicate.shape[0] / 3
print '\n\nControl\n', control_triplicate.shape[0] /3
# Select one of the three triplicate samples ( the one that doesn't have a period in the name)
resistant = resistant_triplicate[~resistant_triplicate.index.str.contains('\.')]
susceptible = susceptible_triplicate[~susceptible_triplicate.index.str.contains('\.')]
control = control_triplicate[~control_triplicate.index.str.contains('\.')]

# Relabel so that only three classes
resistant[:] = 'Chloroquine resistant'
susceptible[:] = 'Chloroquine susceptible'
control[:] = 'Control'

class_labels = pd.concat([resistant, susceptible, control])
print "how many samples?\n", class_labels.shape
print "What are the classes?\n", class_labels.unique()


['Factors'
 'Current Malaria Infection:None | Prior Malaria Infection:N/A | Chloroquine Resistance:N/A'
 'Current Malaria Infection:None | Prior Malaria Infection:NO | Chloroquine Resistance:N/A'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:N/A'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:NO | Chloroquine Resistance:N/A'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Susceptible'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:YES | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:None | Prior Malaria Infection:YES | Chloroquine Resistance:N/A'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:NO | Chloroquine Resistance:Resistant'
 'Current Malaria Infection:P.Vivax | Prior Malaria Infection:N/A | Chloroquine Resistance:Resistant']


Susceptible
33


Resistant
31


Control
66
how many samples?
(123,)
What are the classes?
['Chloroquine resistant' 'Chloroquine susceptible' 'Control']

Parse the feature table and encode the class labels


In [16]:
# Grab samples that have correct class labels
# get rid of the class-label row
df_p_vivax_raw = df_p_vivax.iloc[1:, :]
# index based on the 'sample' row
df_p_vivax_raw = df_p_vivax_raw.set_index(df_p_vivax_raw['Samples'])
# Keep only samples and convert to (samples x features)
df_p_vivax_raw = df_p_vivax_raw[class_labels.index].T
display(df_p_vivax_raw.head())

# remove first column and convert to float
print "df_p_vivax_raw shape", df_p_vivax_raw.shape
print "class labels", class_labels.shape

# Make sure labels and df_p_vivax_raw-columns are in matching order
print "quick eyeball that y and X are in same order ", zip(df_p_vivax_raw.columns, class_labels.index)[0:5]
assert (df_p_vivax_raw.index == class_labels.index).all()
display(df_p_vivax_raw.head())

# Conver to binary class labels
print class_labels.unique()
le = preprocessing.LabelEncoder()
le.fit(class_labels)
y_pvivax = le.transform(class_labels)
print y_pvivax


Samples 85.5089_36.2 85.5146_37.6 86.0955_48.6 86.5166_270.6 87.0429_204.8 87.0432_263.8 87.0988_50.9 87.5085_353.2 87.5086_13.4 88.0747_258.9 ... 1762.6041_417.6 1763.1075_417.3 1810.4472_46.3 1810.4763_411.8 1815.3909_407.5 1982.3328_417.0 1982.8332_416.7 1983.3337_417.2 1983.8177_417.2 1984.3186_417.5
2009227 51065.03274 34348.85255 880363.6806 0 0 0 0 0 134251.0278 0 ... 0 0 0 0 0 124759 0 1036.54 0 0
2008791 22154.69474 49230.82536 1057405.156 125925.8035 0 0 0 234774.6689 37484.24267 0 ... 0 0 0 0 0 430687 2.06874e+06 460889 40449.8 322134
2008700 0 70190.04916 1362530.042 0 0 115557.2717 48285.29025 282774.3529 155057.2188 0 ... 0 510225 0 0 0 469729 369764 714522 0 322114
2008675 26024.15828 36202.7997 1204027.617 233731.1325 186770.4664 0 25228.12601 156219.2191 227745.6642 0 ... 0 0 0 0 0 568305 295861 760267 0 72068.8
2008630 0 242840.7866 1139145.689 0 0 0 42667.85198 189666.9579 177576.6134 0 ... 0 0 0 0 0 623452 362512 874229 0 697438

5 rows × 20347 columns

df_p_vivax_raw shape (123, 20347)
class labels (123,)
quick eyeball that y and X are in same order  [('85.5089_36.2', '2009227'), ('85.5146_37.6', '2008791'), ('86.0955_48.6', '2008700'), ('86.5166_270.6', '2008675'), ('87.0429_204.8', '2008630')]
Samples 85.5089_36.2 85.5146_37.6 86.0955_48.6 86.5166_270.6 87.0429_204.8 87.0432_263.8 87.0988_50.9 87.5085_353.2 87.5086_13.4 88.0747_258.9 ... 1762.6041_417.6 1763.1075_417.3 1810.4472_46.3 1810.4763_411.8 1815.3909_407.5 1982.3328_417.0 1982.8332_416.7 1983.3337_417.2 1983.8177_417.2 1984.3186_417.5
2009227 51065.03274 34348.85255 880363.6806 0 0 0 0 0 134251.0278 0 ... 0 0 0 0 0 124759 0 1036.54 0 0
2008791 22154.69474 49230.82536 1057405.156 125925.8035 0 0 0 234774.6689 37484.24267 0 ... 0 0 0 0 0 430687 2.06874e+06 460889 40449.8 322134
2008700 0 70190.04916 1362530.042 0 0 115557.2717 48285.29025 282774.3529 155057.2188 0 ... 0 510225 0 0 0 469729 369764 714522 0 322114
2008675 26024.15828 36202.7997 1204027.617 233731.1325 186770.4664 0 25228.12601 156219.2191 227745.6642 0 ... 0 0 0 0 0 568305 295861 760267 0 72068.8
2008630 0 242840.7866 1139145.689 0 0 0 42667.85198 189666.9579 177576.6134 0 ... 0 0 0 0 0 623452 362512 874229 0 697438

5 rows × 20347 columns

['Chloroquine resistant' 'Chloroquine susceptible' 'Control']
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2]

In [5]:
# Get map between feature names and mz, rt
df_p_vivax_raw.columns
mz_rt_list = [i.split('_') for i in df_p_vivax_raw.columns]
feature_map = pd.DataFrame(mz_rt_list, columns=['mz', 'rt'], index=df_p_vivax_raw.columns)

display(feature_map.head())


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-0a4208cc46f9> in <module>()
      1 # Get map between feature names and mz, rt
----> 2 df_p_vivax_raw.columns
      3 mz_rt_list = [i.split('_') for i in df_p_vivax_raw.columns]
      4 feature_map = pd.DataFrame(mz_rt_list, columns=['mz', 'rt'], index=df_p_vivax_raw.columns)
      5 

NameError: name 'df_p_vivax_raw' is not defined

Preprocess

MTBLS315 first, since we know it can classify well


In [1]:
# Don't normalize by dilution factor - have seen that doesn't help
# do set prevalence threshold
thresh = 0.5
X_df_malaria_filter = preproc.prevalence_threshold(X_df_malaria_raw, thresh)
print "raw shape", X_df_malaria_raw.shape
print "50% filter shape", X_df_malaria_filter.shape

# fill nans with 1/2 min from each sample
fill = X_df_malaria_filter.min(axis=1) / 2.0
X_df_malaria_filter_filled = X_df_malaria_filter.T.fillna(value=fill).T
# sanity check
print "how many nulls?", X_df_malaria_filter.isnull().sum().sum()
print "how many nulls?", X_df_malaria_filter_filled.isnull().sum().sum()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-ad3c3cb708ef> in <module>()
      2 # do set prevalence threshold
      3 thresh = 0.5
----> 4 X_df_malaria_filter = preproc.prevalence_threshold(X_df_malaria_raw, thresh)
      5 print "raw shape", X_df_malaria_raw.shape
      6 print "50% filter shape", X_df_malaria_filter.shape

NameError: name 'preproc' is not defined

collapse rt


In [264]:
ppm = 30

# add mz and rt vals into dataframe
print df_malaria_raw.shape
mz_malaria = df_malaria_raw.loc[X_df_malaria_filter_filled.columns]['mz']

ppm_mat_malaria = combine_mz.ppm_matrix(mz_malaria, mz_malaria)


(6737, 69)

In [265]:
# make triangular
test = np.array(np.arange(1,10), dtype=np.float).reshape([3,3])
display(test)
display(np.tril(test, k=-1))

idx = np.tril_indices(test.shape[0])

test[idx] = np.nan
print test

# Convert to upper triangular matrix
idx_ppm = np.tril_indices(ppm_mat_malaria.shape[0])
ppm_mat_malaria[idx_ppm] = np.nan


array([[ 1.,  2.,  3.],
       [ 4.,  5.,  6.],
       [ 7.,  8.,  9.]])
array([[ 0.,  0.,  0.],
       [ 4.,  0.,  0.],
       [ 7.,  8.,  0.]])
[[ nan   2.   3.]
 [ nan  nan   6.]
 [ nan  nan  nan]]

In [266]:
ppm_mat_malaria


Out[266]:
array([[             nan,   9.80917292e+03,   9.83392448e+03, ...,
          8.98246100e+05,   8.98348785e+05,   8.98451299e+05],
       [             nan,              nan,   2.49967620e+01, ...,
          8.97238091e+05,   8.97341793e+05,   8.97445322e+05],
       [             nan,              nan,              nan, ...,
          8.97235522e+05,   8.97339227e+05,   8.97442759e+05],
       ..., 
       [             nan,              nan,              nan, ...,
                     nan,   1.00915190e+03,   2.01661639e+03],
       [             nan,              nan,              nan, ...,
                     nan,              nan,   1.00848220e+03],
       [             nan,              nan,              nan, ...,
                     nan,              nan,              nan]])

Ignore the runtime errors - just means we're ignoring hte nans like i want to


In [267]:
print '# of overlaps:', (ppm_mat_malaria < ppm).sum()


# of overlaps: 3405
/home/irockafe/miniconda2/envs/isaac_revo_healthcare/lib/python2.7/site-packages/ipykernel/__main__.py:1: RuntimeWarning: invalid value encountered in less
  if __name__ == '__main__':

This doesn't give us the number of features, tho. That's the goal


In [92]:
def group_by_mz(mat, ppm):
    '''
    Given square matrix from ppm comparison,
    group all isomers together
    '''
    isomers = np.argwhere(mat < ppm)
    print np.nonzero(mat < ppm)


# match (a,b), (c,d), no others
toy = np.array([[np.nan]*4,
               [0.1, np.nan, np.nan, np.nan],
               [10,100, np.nan, np.nan],
               [20,200,0.2, np.nan]
               ])
print toy
group_by_mz(toy, 1)


[[             nan              nan              nan              nan]
 [  1.00000000e-01              nan              nan              nan]
 [  1.00000000e+01   1.00000000e+02              nan              nan]
 [  2.00000000e+01   2.00000000e+02   2.00000000e-01              nan]]
[[False False False False]
 [ True False False False]
 [False False False False]
 [False False  True False]]
[[1 0]
 [3 2]]
(array([1, 3]), array([0, 2]))
/home/irockafe/miniconda2/envs/isaac_revo_healthcare/lib/python2.7/site-packages/ipykernel/__main__.py:2: RuntimeWarning: invalid value encountered in less
  from ipykernel import kernelapp as app
/home/irockafe/miniconda2/envs/isaac_revo_healthcare/lib/python2.7/site-packages/ipykernel/__main__.py:4: RuntimeWarning: invalid value encountered in less
/home/irockafe/miniconda2/envs/isaac_revo_healthcare/lib/python2.7/site-packages/ipykernel/__main__.py:5: RuntimeWarning: invalid value encountered in less

In [285]:
def group_isomer_indices(indices):
    '''
    Given a list of isomer pairs, combine all isomers pairs into 
    a single group - i.e. [[0,1], [1,5]] becomes [0,1,5]
    '''
    output = []
    for idx in indices:
        #print '\n\nfinding indices that match ', idx 
        # Find any matches to the current index in the list of indices
        match_idx = np.argwhere([np.in1d(idx, poop).any() for poop in indices])
        #print 'Indices that match\n', match_idx
        #print 'Do these match %s?: \n %s' % (idx, indices[match_idx])
        unique_matches = np.unique(indices[match_idx])
        #print 'Unique matches', unique_matches
        
        # Check if any values from unique_matches are present in output
        # if not, append
        # if so, append to place where they're found
        in_output = [np.in1d(unique_matches, poop).any() for poop in output]
        #print 'Output\n', output
        #print 'Is it in the output?\n', in_output
        where_in_output = np.argwhere(in_output)
        #print 'Where?\n', where_in_output, 'Size', where_in_output.size, 'Greater than 1?', where_in_output.size
        
        
        # if found in output, append it to where you found it in the output
        if  where_in_output.size != 0:
            if where_in_output.size > 1:
                raise ValueError('You should only find an index one place in the output. Somehting is wrong')
            #print 'Append unique vals to entry that overlaps in output'
            #print int(where_in_output)
            #print output[int(where_in_output)]
            
            output[int(where_in_output)] = np.append(output[int(where_in_output)], unique_matches)
        # If not found in an entry of the output, append values found to the output 
        else:
            output.append(unique_matches)
        #print 'Output', output
        #
    output = [np.unique(poop) for poop in output]
    #print ('-'*50, 'Final output\n', output)
    return output


test = np.array([[0,1], [2,3], [0,4], [4,5], [1,6]])

iso = group_isomer_indices(test)
print test
print iso


[[0 1]
 [2 3]
 [0 4]
 [4 5]
 [1 6]]
[array([0, 1, 4, 5, 6]), array([2, 3])]

In [275]:
ppm = 30
isomer_indices = np.argwhere(ppm_mat_malaria < 30)
isomer_indices.shape
print isomer_indices[0:10]


[[ 1  2]
 [ 4  5]
 [10 11]
 [10 12]
 [11 12]
 [18 19]
 [18 20]
 [19 20]
 [36 37]
 [36 38]]
/home/irockafe/miniconda2/envs/isaac_revo_healthcare/lib/python2.7/site-packages/ipykernel/__main__.py:2: RuntimeWarning: invalid value encountered in less
  from ipykernel import kernelapp as app

In [286]:
isomer_groups = group_isomer_indices(isomer_indices)
print isomer_groups


[array([1, 2]), array([4, 5]), array([10, 11, 12]), array([18, 19, 20]), array([36, 37, 38]), array([41, 42]), array([47, 48, 49, 50, 51, 52, 53]), array([54, 55]), array([64, 65]), array([67, 68, 69, 70]), array([73, 74, 75, 76, 77]), array([80, 81]), array([82, 83]), array([97, 98, 99]), array([101, 102]), array([103, 104]), array([126, 127]), array([128, 129, 130]), array([134, 135]), array([137, 138, 139, 140]), array([141, 142, 143]), array([148, 149, 150, 151]), array([152, 153, 154]), array([155, 156, 157]), array([161, 162, 163, 164, 165]), array([166, 167]), array([168, 169]), array([170, 171, 172]), array([173, 174, 175]), array([179, 180]), array([184, 185]), array([190, 191, 192, 193, 194, 195, 196, 197, 198, 199]), array([200, 201, 202]), array([205, 206]), array([209, 210, 211]), array([220, 221, 222]), array([231, 232]), array([236, 237]), array([239, 240, 241]), array([243, 244, 245]), array([251, 252]), array([259, 260]), array([261, 262]), array([272, 273]), array([274, 275, 276]), array([277, 278]), array([282, 283, 284]), array([285, 286, 287]), array([290, 291]), array([292, 293]), array([299, 300]), array([317, 318]), array([321, 322]), array([324, 325, 326, 327, 328]), array([331, 332, 333]), array([338, 339]), array([340, 341, 342, 343, 344, 345]), array([352, 353]), array([356, 357, 358]), array([359, 360]), array([363, 364, 365]), array([368, 369, 370]), array([376, 377, 378]), array([379, 380]), array([382, 383, 384]), array([385, 386]), array([388, 389]), array([394, 395]), array([399, 400, 401, 402]), array([414, 415, 416]), array([420, 421, 422]), array([424, 425, 426, 427]), array([428, 429, 430]), array([431, 432]), array([441, 442, 443]), array([465, 466]), array([468, 469, 470]), array([471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483,
       484, 485, 486, 487, 488]), array([492, 493, 494]), array([495, 496, 497]), array([503, 504, 505]), array([511, 512, 513, 514, 515, 517, 518, 519]), array([528, 529]), array([533, 534]), array([536, 537]), array([540, 541]), array([557, 558]), array([565, 566, 567]), array([569, 570]), array([574, 575, 576]), array([577, 578, 579]), array([580, 581]), array([590, 591, 592]), array([594, 595]), array([596, 597, 598, 599]), array([618, 619, 620]), array([621, 622]), array([623, 624, 625]), array([634, 635]), array([644, 645, 646]), array([652, 653]), array([664, 665, 666]), array([669, 670]), array([671, 672]), array([673, 674, 675]), array([682, 683]), array([688, 689, 690]), array([691, 692, 693]), array([694, 695, 696, 697, 698]), array([701, 702]), array([703, 704]), array([708, 709]), array([712, 713, 714]), array([722, 723]), array([728, 729, 730]), array([731, 732]), array([741, 742]), array([743, 744, 745]), array([748, 749]), array([755, 756]), array([757, 758]), array([759, 760]), array([761, 762]), array([766, 767]), array([768, 769]), array([770, 771, 772, 773]), array([774, 775, 776]), array([779, 780, 781]), array([782, 783, 784]), array([797, 798]), array([808, 809, 810]), array([813, 814]), array([822, 823]), array([828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840,
       841, 842, 844]), array([849, 850]), array([851, 852, 853]), array([854, 855, 856]), array([864, 865, 866]), array([868, 869]), array([871, 872]), array([877, 878]), array([884, 885]), array([897, 898]), array([899, 900, 901]), array([903, 904]), array([913, 914, 915]), array([916, 917, 918]), array([922, 923]), array([926, 927, 928]), array([929, 930]), array([940, 941]), array([943, 944]), array([947, 948, 949]), array([955, 956, 957, 958, 959]), array([964, 965]), array([967, 968]), array([969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981,
       982, 983, 984]), array([987, 988, 989]), array([994, 995, 996]), array([1005, 1006, 1007]), array([1016, 1017, 1018]), array([1019, 1020, 1021]), array([1022, 1023]), array([1027, 1028]), array([1030, 1031]), array([1034, 1035]), array([1038, 1039]), array([1042, 1043]), array([1045, 1046, 1047]), array([1049, 1050]), array([1060, 1061, 1062]), array([1066, 1067]), array([1078, 1079, 1080]), array([1082, 1083, 1084]), array([1085, 1086, 1087]), array([1088, 1089]), array([1090, 1091]), array([1092, 1093]), array([1095, 1096]), array([1101, 1102]), array([1103, 1104]), array([1108, 1109]), array([1110, 1111]), array([1113, 1114]), array([1135, 1136, 1137]), array([1138, 1139]), array([1140, 1141]), array([1142, 1143]), array([1156, 1157]), array([1159, 1160]), array([1162, 1163]), array([1164, 1165]), array([1167, 1168]), array([1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177, 1178, 1179, 1180,
       1181, 1182, 1183]), array([1185, 1186]), array([1190, 1191, 1192, 1193]), array([1195, 1196, 1197, 1198]), array([1200, 1201, 1202]), array([1203, 1204, 1205]), array([1208, 1209, 1210, 1211]), array([1215, 1216]), array([1219, 1220]), array([1223, 1224, 1225]), array([1226, 1227]), array([1232, 1233]), array([1235, 1236, 1237]), array([1238, 1239]), array([1240, 1241]), array([1244, 1245, 1246]), array([1248, 1249, 1250]), array([1258, 1259, 1260]), array([1263, 1264]), array([1279, 1280]), array([1291, 1292]), array([1293, 1294]), array([1295, 1296, 1297, 1298, 1299]), array([1320, 1321]), array([1323, 1324, 1325]), array([1326, 1327]), array([1332, 1333]), array([1338, 1339]), array([1340, 1341]), array([1345, 1346]), array([1360, 1361]), array([1362, 1363]), array([1365, 1366, 1367]), array([1369, 1370]), array([1372, 1373]), array([1378, 1379]), array([1381, 1382]), array([1383, 1384, 1385, 1386]), array([1390, 1391]), array([1393, 1394]), array([1410, 1411]), array([1413, 1414]), array([1415, 1416]), array([1418, 1419, 1420]), array([1421, 1422, 1423]), array([1425, 1426]), array([1428, 1429, 1430]), array([1431, 1432]), array([1437, 1438]), array([1439, 1440, 1441, 1442]), array([1443, 1444]), array([1445, 1446]), array([1448, 1449]), array([1450, 1451]), array([1454, 1455]), array([1457, 1458]), array([1475, 1476]), array([1484, 1485]), array([1486, 1487]), array([1490, 1491]), array([1495, 1496]), array([1498, 1499, 1500]), array([1502, 1503]), array([1518, 1519]), array([1526, 1527]), array([1530, 1531]), array([1534, 1535]), array([1537, 1538]), array([1542, 1543, 1544]), array([1553, 1554, 1555, 1556, 1557]), array([1565, 1566]), array([1567, 1568]), array([1577, 1578, 1579]), array([1580, 1581]), array([1582, 1583]), array([1586, 1587]), array([1593, 1594, 1595]), array([1600, 1601, 1602]), array([1603, 1604]), array([1608, 1609, 1610]), array([1612, 1613]), array([1618, 1619]), array([1621, 1622]), array([1623, 1624, 1625]), array([1633, 1634, 1635, 1636, 1637]), array([1640, 1641, 1642, 1643]), array([1646, 1647]), array([1653, 1654, 1655, 1656, 1657, 1658]), array([1661, 1662]), array([1667, 1668]), array([1673, 1674, 1675]), array([1676, 1677]), array([1678, 1679, 1680]), array([1687, 1688, 1689]), array([1690, 1691]), array([1698, 1699]), array([1700, 1701]), array([1703, 1704, 1705]), array([1707, 1708, 1709]), array([1711, 1712]), array([1714, 1715]), array([1727, 1728]), array([1729, 1730, 1731, 1732, 1733, 1734, 1735, 1736]), array([1739, 1740]), array([1744, 1745]), array([1747, 1748]), array([1754, 1755]), array([1756, 1757, 1758, 1759, 1760, 1761, 1762, 1763]), array([1768, 1769, 1770, 1771, 1772, 1773, 1774]), array([1782, 1783]), array([1784, 1785]), array([1786, 1787, 1788]), array([1789, 1790]), array([1791, 1792]), array([1796, 1797, 1798, 1799]), array([1800, 1801]), array([1807, 1808]), array([1812, 1813]), array([1814, 1815]), array([1816, 1817]), array([1822, 1823]), array([1825, 1826]), array([1828, 1829]), array([1836, 1837, 1838]), array([1842, 1843, 1844, 1845]), array([1847, 1848]), array([1855, 1856, 1857, 1858]), array([1866, 1867]), array([1868, 1869]), array([1871, 1872]), array([1874, 1875]), array([1876, 1877]), array([1883, 1884]), array([1885, 1886]), array([1887, 1888]), array([1890, 1891]), array([1893, 1894, 1895, 1896]), array([1897, 1898]), array([1899, 1900]), array([1906, 1907]), array([1908, 1909, 1910, 1911]), array([1912, 1913, 1914, 1915]), array([1917, 1918]), array([1923, 1924]), array([1925, 1926]), array([1928, 1929]), array([1930, 1931, 1932, 1933]), array([1935, 1936]), array([1943, 1944]), array([1946, 1947]), array([1948, 1949]), array([1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965]), array([1966, 1967]), array([1969, 1970]), array([1975, 1976]), array([1981, 1982]), array([1984, 1985]), array([1986, 1987, 1988, 1989]), array([1990, 1991]), array([1992, 1993, 1994]), array([1995, 1996]), array([2002, 2003, 2004, 2005]), array([2006, 2007, 2008]), array([2009, 2010]), array([2014, 2015, 2016]), array([2019, 2020, 2021, 2022, 2023]), array([2033, 2034]), array([2035, 2036]), array([2044, 2045, 2046]), array([2047, 2048]), array([2052, 2053]), array([2054, 2055]), array([2056, 2057]), array([2064, 2065]), array([2068, 2069, 2070, 2071]), array([2079, 2080, 2081]), array([2082, 2083]), array([2091, 2092, 2093, 2094, 2095, 2096]), array([2097, 2098]), array([2106, 2107, 2108, 2109, 2110, 2111, 2112, 2113]), array([2116, 2117]), array([2121, 2122, 2123, 2124]), array([2125, 2126, 2127, 2128, 2129]), array([2134, 2135, 2136]), array([2137, 2138, 2139]), array([2150, 2151]), array([2152, 2153, 2154, 2155]), array([2156, 2157, 2158]), array([2161, 2162, 2163, 2164, 2165, 2166, 2167]), array([2168, 2169]), array([2180, 2181, 2182, 2183]), array([2185, 2186]), array([2191, 2192, 2193, 2194, 2195]), array([2197, 2198]), array([2200, 2201, 2202]), array([2206, 2207, 2208]), array([2210, 2211, 2212, 2213]), array([2223, 2224]), array([2232, 2233]), array([2239, 2240]), array([2246, 2247]), array([2249, 2250, 2251, 2252]), array([2254, 2255]), array([2256, 2257, 2258, 2259]), array([2264, 2265]), array([2268, 2269]), array([2278, 2279, 2280, 2281, 2282, 2283, 2284, 2285, 2286, 2287, 2288]), array([2292, 2293, 2294, 2295]), array([2296, 2297]), array([2300, 2301]), array([2309, 2310, 2311]), array([2312, 2313, 2314, 2315]), array([2316, 2317]), array([2318, 2319, 2320]), array([2324, 2325, 2326]), array([2338, 2339, 2340, 2341, 2342]), array([2343, 2344, 2345, 2346, 2347]), array([2348, 2349]), array([2355, 2356, 2357]), array([2358, 2359, 2360, 2361, 2362]), array([2363, 2364, 2365]), array([2373, 2374]), array([2375, 2376, 2377, 2378]), array([2379, 2380, 2381, 2382]), array([2383, 2384]), array([2385, 2386, 2387]), array([2390, 2391]), array([2394, 2395]), array([2396, 2397, 2398]), array([2399, 2400, 2401, 2402]), array([2406, 2407, 2408]), array([2412, 2413, 2414, 2415]), array([2416, 2417]), array([2421, 2422]), array([2425, 2426]), array([2428, 2429]), array([2430, 2431, 2432, 2433]), array([2434, 2435]), array([2436, 2437]), array([2440, 2441]), array([2442, 2443]), array([2445, 2446]), array([2450, 2451]), array([2462, 2463]), array([2464, 2465, 2466, 2467, 2468, 2469]), array([2470, 2471]), array([2474, 2475]), array([2477, 2478]), array([2479, 2480, 2481, 2482, 2483, 2484]), array([2485, 2486, 2487]), array([2488, 2489, 2490, 2491, 2492, 2493]), array([2498, 2499, 2500, 2501, 2502, 2503, 2504, 2505, 2506, 2507, 2508]), array([2511, 2512, 2513]), array([2517, 2518]), array([2519, 2520, 2521, 2522]), array([2527, 2528]), array([2530, 2531]), array([2537, 2538, 2539, 2540, 2541, 2542]), array([2543, 2544]), array([2545, 2546]), array([2548, 2549, 2550, 2551]), array([2555, 2556]), array([2560, 2561]), array([2562, 2563]), array([2564, 2565, 2566, 2567, 2568, 2569, 2570, 2571, 2572]), array([2577, 2578]), array([2579, 2580]), array([2581, 2582]), array([2585, 2586]), array([2595, 2596, 2597]), array([2598, 2599]), array([2600, 2601, 2602]), array([2603, 2604, 2605]), array([2606, 2607, 2608]), array([2609, 2610]), array([2612, 2613]), array([2617, 2618, 2619, 2620, 2621, 2622]), array([2624, 2625, 2626]), array([2629, 2630]), array([2631, 2632, 2633, 2634]), array([2635, 2636, 2637]), array([2638, 2639, 2640, 2641]), array([2646, 2647, 2648]), array([2656, 2657]), array([2661, 2662]), array([2665, 2666, 2667, 2668]), array([2669, 2670]), array([2673, 2674]), array([2675, 2676]), array([2678, 2679, 2680]), array([2682, 2683]), array([2686, 2687, 2688]), array([2689, 2690]), array([2691, 2692, 2693]), array([2698, 2699]), array([2700, 2701]), array([2703, 2704]), array([2706, 2707, 2708]), array([2709, 2710]), array([2712, 2713]), array([2717, 2718]), array([2719, 2720]), array([2721, 2722]), array([2723, 2724, 2725, 2726, 2727]), array([2728, 2729, 2730]), array([2734, 2735, 2736]), array([2737, 2738]), array([2741, 2742]), array([2743, 2744]), array([2749, 2750, 2751, 2752, 2753]), array([2754, 2755]), array([2760, 2761, 2762]), array([2766, 2767]), array([2768, 2769, 2770, 2771, 2772]), array([2776, 2777]), array([2778, 2779, 2780, 2781, 2782]), array([2784, 2785, 2786]), array([2790, 2791, 2792, 2793, 2794]), array([2799, 2800, 2801, 2802, 2803, 2804]), array([2805, 2806]), array([2811, 2812]), array([2813, 2814, 2815]), array([2819, 2820, 2821]), array([2825, 2826, 2827]), array([2830, 2831]), array([2832, 2833, 2834, 2835, 2836]), array([2837, 2838]), array([2842, 2843]), array([2844, 2845]), array([2848, 2849]), array([2850, 2851, 2852, 2853, 2854, 2855, 2856]), array([2858, 2859]), array([2861, 2862, 2863]), array([2865, 2866]), array([2867, 2868, 2869]), array([2873, 2874]), array([2878, 2879]), array([2880, 2881]), array([2882, 2883]), array([2886, 2887, 2888, 2889, 2890, 2891]), array([2893, 2894]), array([2895, 2896, 2897, 2898]), array([2901, 2902, 2903]), array([2906, 2907]), array([2909, 2910]), array([2912, 2913]), array([2916, 2917]), array([2922, 2923, 2924]), array([2928, 2929, 2930, 2931, 2932, 2933]), array([2937, 2938]), array([2940, 2941, 2942, 2943, 2944]), array([2945, 2946, 2947]), array([2948, 2949]), array([2951, 2952, 2953]), array([2956, 2957, 2958]), array([2962, 2963]), array([2971, 2972]), array([2978, 2979]), array([2980, 2981]), array([2985, 2986, 2987, 2988]), array([2991, 2992]), array([2994, 2995, 2996, 2997, 2998]), array([2999, 3000, 3001]), array([3004, 3005, 3006]), array([3011, 3012, 3013, 3014, 3015]), array([3016, 3017]), array([3020, 3021, 3022]), array([3030, 3031]), array([3034, 3035, 3036, 3037]), array([3038, 3039, 3040, 3041, 3042]), array([3043, 3044]), array([3046, 3047, 3048]), array([3055, 3056, 3057]), array([3058, 3059, 3060, 3061]), array([3063, 3064, 3065]), array([3068, 3069, 3070]), array([3071, 3072]), array([3073, 3074, 3075, 3076, 3077, 3078, 3079]), array([3080, 3081]), array([3083, 3084]), array([3085, 3086, 3087, 3088, 3089]), array([3093, 3094, 3095, 3096, 3097]), array([3101, 3102, 3103, 3104, 3105, 3106]), array([3108, 3109]), array([3111, 3112, 3113]), array([3119, 3120, 3121]), array([3126, 3127]), array([3128, 3129, 3130]), array([3131, 3132]), array([3133, 3134, 3135, 3136]), array([3137, 3138]), array([3143, 3144, 3145, 3146, 3147]), array([3149, 3150]), array([3151, 3152, 3153]), array([3157, 3158]), array([3159, 3160]), array([3164, 3165]), array([3167, 3168, 3169, 3170, 3171]), array([3172, 3173]), array([3174, 3175]), array([3176, 3177]), array([3180, 3181]), array([3182, 3183]), array([3186, 3187, 3188, 3189]), array([3190, 3191]), array([3193, 3194, 3195]), array([3199, 3200]), array([3202, 3203, 3204]), array([3207, 3208, 3209]), array([3211, 3212, 3213, 3214]), array([3219, 3220, 3221, 3222, 3223, 3224, 3225]), array([3231, 3232, 3233, 3234]), array([3235, 3236, 3237, 3238, 3239, 3240, 3241, 3242]), array([3244, 3245]), array([3246, 3247]), array([3249, 3250, 3251]), array([3255, 3256, 3257, 3258]), array([3259, 3260, 3261]), array([3263, 3264, 3265]), array([3266, 3267]), array([3268, 3269]), array([3270, 3271]), array([3276, 3277, 3278, 3279, 3280]), array([3281, 3282, 3283]), array([3284, 3285]), array([3288, 3289]), array([3293, 3294]), array([3296, 3297, 3298, 3299]), array([3303, 3304]), array([3305, 3306, 3307, 3308, 3309]), array([3311, 3312, 3313, 3314]), array([3318, 3319]), array([3320, 3321]), array([3322, 3323]), array([3324, 3325, 3326, 3327, 3328]), array([3330, 3331, 3332, 3333]), array([3336, 3337]), array([3339, 3340]), array([3342, 3343, 3344]), array([3345, 3346]), array([3348, 3349]), array([3350, 3351]), array([3354, 3355, 3356]), array([3359, 3360]), array([3361, 3362, 3363]), array([3366, 3367]), array([3372, 3373, 3374]), array([3375, 3376, 3377]), array([3386, 3387]), array([3388, 3389]), array([3396, 3397]), array([3401, 3402, 3403]), array([3405, 3406, 3407]), array([3409, 3410]), array([3414, 3415, 3416, 3417]), array([3419, 3420]), array([3423, 3424]), array([3429, 3430]), array([3436, 3437, 3438, 3439, 3440, 3441]), array([3457, 3458]), array([3461, 3462, 3463]), array([3472, 3473, 3474]), array([3479, 3480]), array([3482, 3483]), array([3497, 3498]), array([3502, 3503, 3504]), array([3506, 3507]), array([3509, 3510]), array([3515, 3516, 3517, 3518]), array([3539, 3540]), array([3541, 3542]), array([3549, 3550]), array([3552, 3553, 3554]), array([3557, 3558]), array([3562, 3563]), array([3565, 3566]), array([3573, 3574, 3575]), array([3584, 3585, 3586]), array([3591, 3592]), array([3595, 3596]), array([3600, 3601]), array([3602, 3603]), array([3610, 3611]), array([3618, 3619]), array([3620, 3621, 3622]), array([3624, 3625]), array([3631, 3632]), array([3635, 3636]), array([3642, 3643]), array([3644, 3645]), array([3647, 3648]), array([3650, 3651]), array([3664, 3665, 3666, 3667]), array([3670, 3671, 3672]), array([3673, 3674, 3675, 3676]), array([3677, 3678]), array([3685, 3686]), array([3694, 3695]), array([3696, 3697, 3698]), array([3700, 3701]), array([3704, 3705]), array([3708, 3709]), array([3719, 3720, 3721, 3722]), array([3725, 3726]), array([3730, 3731]), array([3733, 3734, 3735, 3736, 3737]), array([3738, 3739]), array([3741, 3742, 3743]), array([3744, 3745]), array([3748, 3749, 3750]), array([3751, 3752, 3753]), array([3755, 3756, 3757, 3758]), array([3759, 3760]), array([3761, 3762]), array([3763, 3764]), array([3767, 3768, 3769, 3770]), array([3771, 3772]), array([3775, 3776]), array([3779, 3780]), array([3784, 3785]), array([3787, 3788]), array([3803, 3804]), array([3811, 3812]), array([3815, 3816]), array([3818, 3819]), array([3820, 3821]), array([3822, 3823]), array([3824, 3825]), array([3829, 3830, 3831]), array([3838, 3839]), array([3841, 3842]), array([3848, 3849, 3850]), array([3852, 3853]), array([3856, 3857, 3858]), array([3860, 3861]), array([3864, 3865]), array([3872, 3873]), array([3878, 3879]), array([3884, 3885]), array([3887, 3888]), array([3889, 3890]), array([3894, 3895, 3896]), array([3898, 3899, 3900]), array([3902, 3903, 3904, 3905, 3906]), array([3914, 3915]), array([3917, 3918]), array([3922, 3923, 3924, 3925, 3926, 3927, 3928]), array([3929, 3930]), array([3931, 3932]), array([3935, 3936, 3937, 3938]), array([3945, 3946, 3947, 3948]), array([3949, 3950]), array([3958, 3959]), array([3963, 3964, 3965]), array([3972, 3973]), array([3975, 3976]), array([3981, 3982, 3983, 3984, 3985]), array([3988, 3989]), array([3992, 3993]), array([3994, 3995]), array([4000, 4001]), array([4016, 4017]), array([4027, 4028]), array([4038, 4039, 4040, 4041]), array([4044, 4045]), array([4046, 4047]), array([4050, 4051, 4052]), array([4057, 4058]), array([4064, 4065]), array([4066, 4067]), array([4068, 4069]), array([4077, 4078]), array([4083, 4084]), array([4086, 4087]), array([4096, 4097, 4098]), array([4102, 4103, 4104, 4105]), array([4106, 4107]), array([4113, 4114]), array([4115, 4116]), array([4120, 4121]), array([4122, 4123]), array([4128, 4129, 4130]), array([4136, 4137, 4138]), array([4141, 4142]), array([4154, 4155]), array([4157, 4158]), array([4160, 4161]), array([4166, 4167]), array([4182, 4183]), array([4195, 4196, 4197]), array([4199, 4200]), array([4205, 4206]), array([4207, 4208, 4209]), array([4217, 4218]), array([4222, 4223]), array([4224, 4225]), array([4226, 4227, 4228]), array([4232, 4233]), array([4237, 4238]), array([4241, 4242, 4243]), array([4244, 4245, 4246, 4247]), array([4251, 4252]), array([4256, 4257, 4258]), array([4266, 4267, 4268]), array([4269, 4270, 4271]), array([4276, 4277]), array([4278, 4279]), array([4280, 4281]), array([4285, 4286, 4287]), array([4294, 4295]), array([4297, 4298, 4299]), array([4300, 4301, 4302]), array([4307, 4308]), array([4310, 4311, 4312]), array([4320, 4321, 4322, 4323]), array([4340, 4341]), array([4342, 4343, 4344, 4345]), array([4350, 4351]), array([4352, 4353]), array([4354, 4355]), array([4368, 4369, 4370]), array([4371, 4372]), array([4383, 4384, 4385, 4386]), array([4388, 4389]), array([4395, 4396, 4397]), array([4402, 4403]), array([4412, 4413]), array([4422, 4423, 4424]), array([4435, 4436]), array([4438, 4439]), array([4440, 4441]), array([4447, 4448, 4449, 4450, 4451, 4452]), array([4461, 4462]), array([4464, 4465]), array([4469, 4470]), array([4474, 4475, 4476]), array([4486, 4487]), array([4488, 4489, 4490, 4491]), array([4493, 4494, 4495, 4496, 4497]), array([4498, 4499, 4500]), array([4505, 4506, 4507, 4508]), array([4512, 4513]), array([4520, 4521, 4522]), array([4523, 4524, 4525, 4526]), array([4528, 4529, 4530]), array([4533, 4534]), array([4535, 4536]), array([4537, 4538]), array([4539, 4540, 4541]), array([4542, 4543]), array([4553, 4554]), array([4555, 4556]), array([4562, 4563]), array([4571, 4572]), array([4580, 4581]), array([4587, 4588]), array([4604, 4605]), array([4611, 4612]), array([4616, 4617, 4618]), array([4620, 4621, 4622]), array([4628, 4629, 4630]), array([4632, 4633]), array([4640, 4641, 4642]), array([4646, 4647, 4648]), array([4654, 4655, 4656, 4657]), array([4664, 4665, 4666, 4667]), array([4680, 4681, 4682]), array([4683, 4684]), array([4687, 4688]), array([4694, 4695]), array([4700, 4701]), array([4702, 4703]), array([4710, 4711, 4712, 4713]), array([4715, 4716]), array([4718, 4719]), array([4720, 4721, 4722, 4723]), array([4736, 4737]), array([4744, 4745]), array([4750, 4751]), array([4762, 4763]), array([4767, 4768]), array([4783, 4784]), array([4788, 4789]), array([4794, 4795]), array([4797, 4798]), array([4803, 4804, 4805, 4806]), array([4807, 4808]), array([4817, 4818, 4819]), array([4821, 4822]), array([4826, 4827, 4828, 4829, 4830]), array([4835, 4836]), array([4840, 4841, 4842, 4843]), array([4844, 4845]), array([4846, 4847]), array([4852, 4853]), array([4856, 4857]), array([4871, 4872, 4873, 4874]), array([4876, 4877]), array([4881, 4882, 4883, 4884]), array([4888, 4889, 4890, 4891, 4892]), array([4898, 4899, 4900]), array([4909, 4910]), array([4913, 4914]), array([4915, 4916]), array([4918, 4919, 4920]), array([4927, 4928]), array([4929, 4930]), array([4934, 4935]), array([4945, 4946]), array([4950, 4951]), array([4954, 4955]), array([4960, 4961]), array([4963, 4964]), array([4978, 4979]), array([4981, 4982]), array([4988, 4989]), array([4990, 4991]), array([5000, 5001]), array([5002, 5003]), array([5004, 5005]), array([5012, 5013]), array([5020, 5021]), array([5025, 5026]), array([5037, 5038]), array([5046, 5047]), array([5054, 5055]), array([5059, 5060, 5061, 5062]), array([5065, 5066]), array([5076, 5077]), array([5078, 5079]), array([5086, 5087]), array([5095, 5096]), array([5104, 5105]), array([5114, 5115]), array([5121, 5122]), array([5149, 5150]), array([5155, 5156]), array([5157, 5158, 5159, 5160]), array([5167, 5168]), array([5169, 5170]), array([5233, 5234]), array([5235, 5236]), array([5247, 5248, 5249]), array([5250, 5251]), array([5256, 5257]), array([5258, 5259]), array([5265, 5266]), array([5269, 5270]), array([5272, 5273]), array([5295, 5296, 5297, 5298]), array([5304, 5305]), array([5306, 5307]), array([5312, 5313]), array([5317, 5318]), array([5335, 5336]), array([5348, 5349, 5350]), array([5354, 5355]), array([5356, 5357]), array([5371, 5372]), array([5373, 5374, 5375]), array([5380, 5381]), array([5384, 5385]), array([5388, 5389]), array([5393, 5394]), array([5405, 5406]), array([5420, 5421]), array([5431, 5432]), array([5437, 5438]), array([5440, 5441]), array([5442, 5443, 5444]), array([5464, 5465, 5466]), array([5467, 5468]), array([5479, 5480]), array([5488, 5489]), array([5490, 5491]), array([5499, 5500]), array([5505, 5506]), array([5509, 5510]), array([5514, 5515, 5516]), array([5517, 5518]), array([5526, 5527]), array([5528, 5529]), array([5537, 5538]), array([5542, 5543, 5544]), array([5546, 5547]), array([5548, 5549]), array([5550, 5551]), array([5552, 5553]), array([5554, 5555]), array([5557, 5558, 5559, 5560, 5561]), array([5588, 5589]), array([5595, 5596, 5597, 5598, 5599, 5600, 5601]), array([5605, 5606]), array([5609, 5610, 5611, 5612, 5613, 5614]), array([5618, 5619, 5620, 5621]), array([5622, 5623, 5624, 5625, 5626, 5627, 5628]), array([5630, 5631]), array([5634, 5635]), array([5636, 5637]), array([5639, 5640, 5641, 5642]), array([5645, 5646, 5647, 5648, 5649]), array([5652, 5653]), array([5655, 5656, 5657, 5658, 5659]), array([5661, 5662]), array([5663, 5664, 5665]), array([5668, 5669, 5670, 5671]), array([5672, 5673]), array([5675, 5676]), array([5677, 5678, 5679]), array([5681, 5682]), array([5683, 5684, 5685, 5686]), array([5687, 5688]), array([5690, 5691]), array([5693, 5694]), array([5698, 5699]), array([5700, 5701]), array([5703, 5704]), array([5707, 5708]), array([5709, 5710]), array([5713, 5714, 5715]), array([5716, 5717]), array([5718, 5719, 5720]), array([5723, 5724]), array([5737, 5738]), array([5739, 5740]), array([5742, 5743, 5744]), array([5746, 5747]), array([5748, 5749]), array([5750, 5751]), array([5752, 5753]), array([5754, 5755]), array([5758, 5759, 5760, 5761]), array([5762, 5763, 5764]), array([5770, 5771, 5772, 5773, 5774]), array([5778, 5779]), array([5781, 5782, 5783]), array([5784, 5785]), array([5790, 5791, 5792]), array([5794, 5795]), array([5796, 5797]), array([5799, 5800, 5801]), array([5804, 5805]), array([5809, 5810, 5811, 5812]), array([5813, 5814]), array([5815, 5816]), array([5818, 5819, 5820]), array([5821, 5822, 5823]), array([5826, 5827, 5828, 5829, 5830]), array([5833, 5834, 5835]), array([5840, 5841]), array([5842, 5843]), array([5846, 5847, 5848, 5849]), array([5852, 5853]), array([5855, 5856]), array([5858, 5859, 5860, 5861]), array([5867, 5868, 5869]), array([5870, 5871]), array([5874, 5875, 5876, 5877]), array([5881, 5882]), array([5884, 5885]), array([5888, 5889]), array([5891, 5892, 5893]), array([5895, 5896]), array([5899, 5900, 5901]), array([5906, 5907]), array([5908, 5909, 5910]), array([5913, 5914]), array([5919, 5920]), array([5924, 5925]), array([5927, 5928, 5929, 5930]), array([5931, 5932, 5933]), array([5934, 5935]), array([5936, 5937, 5938]), array([5939, 5940, 5941]), array([5943, 5944]), array([5946, 5947, 5948, 5949]), array([5950, 5951, 5952]), array([5955, 5956]), array([5959, 5960, 5961, 5962]), array([5963, 5964]), array([5967, 5968, 5969, 5970, 5971]), array([5972, 5973]), array([5974, 5975]), array([5984, 5985, 5986, 5987, 5988]), array([5989, 5990]), array([5991, 5992]), array([6000, 6001, 6002]), array([6006, 6007]), array([6008, 6009]), array([6011, 6012]), array([6014, 6015, 6016]), array([6017, 6018]), array([6019, 6020, 6021, 6022]), array([6024, 6025]), array([6026, 6027, 6028, 6029, 6030]), array([6031, 6032]), array([6033, 6034]), array([6035, 6036]), array([6038, 6039, 6040, 6041, 6042]), array([6043, 6044, 6045, 6046, 6047, 6048]), array([6050, 6051, 6052, 6053, 6054, 6055]), array([6056, 6057]), array([6058, 6059, 6060, 6061]), array([6065, 6066, 6067, 6068, 6069, 6070]), array([6072, 6073, 6074]), array([6075, 6076]), array([6077, 6078]), array([6079, 6080]), array([6081, 6082, 6083]), array([6084, 6085, 6086]), array([6089, 6090]), array([6091, 6092, 6093]), array([6094, 6095]), array([6096, 6097]), array([6099, 6100, 6101]), array([6102, 6103, 6104, 6105]), array([6106, 6107]), array([6112, 6113, 6114, 6115]), array([6116, 6117]), array([6118, 6119]), array([6120, 6121]), array([6123, 6124, 6125, 6126, 6127]), array([6128, 6129]), array([6130, 6131]), array([6134, 6135, 6136]), array([6138, 6139]), array([6141, 6142, 6143]), array([6145, 6146]), array([6147, 6148]), array([6151, 6152, 6153, 6154, 6155]), array([6156, 6157, 6158, 6159]), array([6160, 6161, 6162]), array([6163, 6164, 6165]), array([6166, 6167]), array([6169, 6170, 6171, 6172]), array([6173, 6174, 6175]), array([6177, 6178]), array([6180, 6181, 6182, 6183, 6184, 6185]), array([6186, 6187]), array([6193, 6194, 6195]), array([6200, 6201]), array([6205, 6206, 6207]), array([6208, 6209, 6210]), array([6215, 6216]), array([6220, 6221, 6222]), array([6223, 6224]), array([6225, 6226]), array([6230, 6231, 6232, 6233]), array([6235, 6236]), array([6238, 6239, 6240, 6241, 6242]), array([6244, 6245]), array([6246, 6247, 6248, 6249, 6250, 6251]), array([6253, 6254, 6255]), array([6257, 6258, 6259]), array([6262, 6263, 6264]), array([6266, 6267, 6268]), array([6271, 6272]), array([6273, 6274, 6275]), array([6276, 6277]), array([6279, 6280]), array([6281, 6282, 6283, 6284]), array([6285, 6286]), array([6290, 6291, 6292, 6293]), array([6294, 6295, 6296]), array([6302, 6303, 6304, 6305, 6306, 6307]), array([6308, 6309, 6310]), array([6314, 6315, 6316, 6317]), array([6318, 6319]), array([6321, 6322]), array([6323, 6324, 6325, 6326]), array([6328, 6329, 6330]), array([6331, 6332]), array([6333, 6334]), array([6335, 6336, 6337]), array([6339, 6340, 6341]), array([6342, 6343, 6344]), array([6345, 6346]), array([6350, 6351]), array([6352, 6353]), array([6354, 6355]), array([6359, 6360]), array([6363, 6364, 6365]), array([6367, 6368]), array([6370, 6371]), array([6372, 6373, 6374]), array([6375, 6376]), array([6377, 6378]), array([6380, 6381]), array([6382, 6383]), array([6385, 6386, 6387]), array([6389, 6390]), array([6391, 6392]), array([6395, 6396]), array([6402, 6403]), array([6410, 6411]), array([6412, 6413]), array([6427, 6428]), array([6429, 6430]), array([6431, 6432]), array([6434, 6435, 6436, 6437]), array([6440, 6441]), array([6442, 6443, 6444, 6445, 6446, 6447]), array([6453, 6454, 6455, 6456]), array([6457, 6458]), array([6459, 6460]), array([6461, 6462]), array([6465, 6466]), array([6468, 6469]), array([6470, 6471, 6472, 6473, 6474]), array([6479, 6480, 6481]), array([6482, 6483]), array([6486, 6487, 6488, 6489]), array([6492, 6493, 6494]), array([6495, 6496]), array([6506, 6507]), array([6515, 6516]), array([6518, 6519]), array([6522, 6523, 6524]), array([6531, 6532, 6533, 6534]), array([6538, 6539, 6540]), array([6543, 6544]), array([6547, 6548]), array([6557, 6558, 6559]), array([6570, 6571]), array([6572, 6573]), array([6583, 6584]), array([6586, 6587]), array([6607, 6608]), array([6613, 6614, 6615]), array([6619, 6620]), array([6652, 6653, 6654]), array([6676, 6677])]

In [290]:
mtbls315_path = '/revo_healthcare/presentations/isaac_bats/rt_window_plots/MTBLS315/'
output = local_path + mtbls315_path + 'isomer_indices.pkl'
pickle.dump(isomer_groups, open(output, 'wb'))

In [483]:
print 'Number of isomer groups:',  len(isomer_groups)
print 'Number of features that make up those groups isomers', np.concatenate(isomer_groups).size
group_size = [len(i) for i in isomer_groups]
plt.hist(group_size, bins=range(min(group_size), max(group_size)), 
        align='left', rwidth=0.7)
plt.title('Most isomer-groups are the combination of 2 features')
plt.xlabel('Number of isomers')
plt.ylabel('Count')
plt.savefig(local_path + mtbls315_path + '/number_of_features_per_isomer_group.pdf', 
           format='pdf')
plt.show()


Number of isomer groups: 1225
Number of features that make up those groups isomers 3358

Now Learn to sum intensities of isomer features, remove (? or treat as feature engineering?) others


In [526]:
def sum_isomer_intensities(df, isomer_groups):
    '''
    GOAL - Sum the intensity values of isomers
    INPUT - 
        df (samples x features) of intensity values
        isomer_groups - np array of arrays
    OUTPUT - 
        dataframe containing the summed values
        mapping between name of feature and values that make it up
    '''
    feature_names = ['isomer_group_%s' % i 
                     for i in range(0,len(isomer_groups))]
    output_df = pd.DataFrame(index=df.index, columns=feature_names)
    feature_mapping = pd.Series(index=feature_names)
    
    count = 0
    for i, group in enumerate(isomer_groups):
        summed_val = df.iloc[:,group].sum(axis=1, skipna=True)
        # Add to dataframe
        output_df['isomer_group_%s' % i] = summed_val
        feature_mapping['isomer_group_%s' % i] = np.array(df.iloc[:, group].columns)
        #if type(df.iloc[:, group].columns == )
        count += len(np.array(df.iloc[:, group].columns))
    print 'Final count', count
    return output_df, feature_mapping

df_summed_isomers, summed_isomer_features = sum_isomer_intensities(
    X_df_malaria_filter_filled, isomer_groups)

# Double check that # of isomers is same in df and group
print 'original size', np.concatenate(isomer_groups).size  
print 'other shapes', summed_isomer_features.shape
print 'summed_df shape', df_summed_isomers.shape


Final count 3358
original size 3358
other shapes (1225,)
summed_df shape (61, 1225)

In [488]:
# TODO - decide if these need to be normalized...?
sns.distplot(np.log10(summed_isomer_df).mean(axis=0), bins=100,
        label='isomer_groups')
sns.distplot(np.log10(X_df_malaria_filter_filled).mean(axis=0), bins=100,
        label='original features' )
plt.xlabel('log_10 Inensity')
plt.legend()
plt.title('Mean feature intensity distribution of isomer features vs normal')
plt.show()
plt.close()



In [489]:
# Case and control look about the same
# How does case/control of isomer features look?
case_poop = summed_isomer_df.loc[case_labels]
ctrl_poop = summed_isomer_df.loc[control_labels]
print case_poop.shape
print 'ctrl', ctrl_poop.shape
sns.distplot(np.log10(ctrl_poop).mean(axis=0),
            bins=100, label='Control')
sns.distplot(np.log10(case_poop).mean(axis=0), bins=100,
            label='Case')
plt.legend()
plt.title('Mean feature intensity distribution of isomer ' +
          'features: case vs control')
plt.xlabel('log_10 Intensity')
plt.show()


(34, 1225)
ctrl (27, 1225)

Now let's try to classify by removing the features from isomer groups and adding new features


In [535]:
# Remove features
print summed_isomer_features.values
isomer_idx = np.concatenate(summed_isomer_features.values)

#print summed_isomer_features.values
#print summed_isomer_index.get_level_values(0)
isomers_to_drop = X_df_malaria_filter_filled[isomer_idx]
print 'Shape of original', X_df_malaria_filter_filled.shape
print 'Shape of what to drop, should be 3358', isomers_to_drop.shape


[ array(['102.008758281:39.019987471', '102.011308234:39.0611977932'], dtype=object)
 array(['103.005418678:39.0289939892', '103.008163158:38.9530452658'], dtype=object)
 array(['104.106482797:769.364889063', '104.106436464:46.4052594739',
       '104.106485307:798.017378255'], dtype=object)
 ...,
 array(['879.671423793:792.320765852', '879.692213794:397.480873199'], dtype=object)
 array(['896.534826858:954.927273907', '896.554808795:1015.60791894',
       '896.564938271:1044.58261003'], dtype=object)
 array(['908.69010713:811.066920876', '908.696864178:531.392078889'], dtype=object)]
Shape of original (61, 6737)
Shape of what to drop, should be 3358 (61, 3358)

In [539]:
# Drop and add
X_df_malaria_filter_filled_isomers_grouped = pd.concat([
    # drop first
    X_df_malaria_filter_filled.drop(isomers_to_drop.columns, axis=1)
    ,
    df_summed_isomers], axis=1)
print 'new shape. Should be 4604', X_df_malaria_filter_filled_isomers_grouped.shape


new shape. Should be 4604 (61, 4604)

In [567]:
print not_samples

# combine the rt back
X = (pd.concat([X_df_malaria_filter_filled_isomers_grouped,
              df_malaria_raw[not_samples].T], axis=0)
     .T
     .fillna(value=0)
    )

min_val = 0
max_val = df_malaria_raw['rt'].max()
width = max_val 
step = width 
sliding_window = rtwin.make_sliding_window(min_val, 
                                           max_val, width, step)

# run classifier on all data
n_iter = 3
test_size = 0.3
rf_trees = 1000
all_aucs = np.full([len(sliding_window), n_iter], np.nan)
path = ('/revo_healthcare/presentations/isaac_bats/' +
        'combine_isomers')
output_path = local_path + path

# Run rt-sliding-window classifier
rtwin.sliding_rt_window_aucs(X, y_malaria, sliding_window, not_samples,
                      rf_trees=rf_trees, n_iter=n_iter, test_size=test_size,
                      output_path=output_path)


['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax', 'npeaks', 'uhplc_pos']
RT plot (0.0, 1495.6946680256501)
1495.69466803
994.676795832
slice shape (61, 6736)
y shape (61,)
0.0% done! 3.47418117523s elapsed


--------------------------------------------------NEXT ROUND--------------------------------------------------



Out[567]:
array([[ 0.85227273,  0.95454545,  0.56818182]])