In [1]:
import pandas as pd
import glob
import numpy as np
import sys
import multiprocessing as mp
import os
from matplotlib import pyplot as plt
%matplotlib notebook

def group_duplicates(df,group_col,make_string=False):
    """
    takes in a list of grouping columns and turns the rest into arrays
    """
    
    all_cols = np.asarray(df.columns)
    #get the index of the grouping term as array
    idx_group = np.argwhere(all_cols == group_col).flatten()
    #get the indices of all other terms as array
    idx_list = np.argwhere(all_cols != group_col).flatten()
    cols = all_cols[idx_list]
    
    # create a sorted numpy array (sorted by column=group_col)
    a = df.sort_values(group_col).values.T

    #get the indices of the first instance of each unique identifier
    ukeys, index = np.unique(a[idx_group,:],return_index=True)

    #split the other rows of the array into separate arrays using the 
    #unique index
    arrays = np.split(a[idx_list,:],index[1:],axis=1)

    #make a list of dicts with column headings as keys;
    #every value stays an array, even when a group has only one item
    ucpds = [dict([(c,aa) for c,aa in zip(cols,a)]) for a in arrays]

    #make a dataframe from the list of dicts
    df2 = pd.DataFrame(ucpds,index=ukeys)
    
    #convert array columns to strings so the result can be written to csv/hdf
    if make_string:
        for c in cols:
            df2[c] = df2[c].apply(lambda x: str(x.tolist()))
            
    df2.index = df2.index.set_names(group_col)
    df2.reset_index(inplace=True)

    #return dataframe
    return df2
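
A toy example (a sketch) of what group_duplicates does: rows sharing a label collapse into one row whose remaining columns become numpy arrays.

In [ ]:
toy = pd.DataFrame({'label':[1,1,2],'rt':[0.1,0.2,0.5],'i':[10.,20.,5.]})
group_duplicates(toy,'label')
# label 1 -> rt array [0.1, 0.2] and i array [10., 20.]
# label 2 -> single-element arrays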

In [2]:
files = glob.glob('/project/projectdirs/metatlas/projects/cs_ffff/*_features.h5')
# files = glob.glob('/project/projectdirs/metatlas/projects/ffff/20171114_KBL_AS-JC_502921_MCom_1to3mem_QE139_ZHILIC5um_736960_POS*.h5')
df = pd.read_hdf(files[0],'ms1_summary')
# eic = group_duplicates(df[['label','rt','i','in_feature']],'label',make_string=False)
# eic.head()
df.head()


Out[2]:
mz_centroid num_datapoints peak_area peak_height rt_peak
label
1 84.960321 37.0 4.806260e+07 1.455724e+06 6.571208
6 89.507234 37.0 1.654031e+08 7.371130e+06 10.831382
7 89.940499 33.0 2.151427e+07 7.398588e+05 1.253665
13 96.027754 1.0 3.604715e+03 3.604715e+03 0.771313
32 108.044790 33.0 7.019225e+07 7.123824e+06 0.385624

Idea: you only have to build the EIC and MS/MS arrays for the file with the maximum peak height per feature? That could save a factor of num_files in time.

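A hedged sketch of that idea, assuming the merged data frame built further below (the pivot and variable names here are illustrative):

In [ ]:
ph = pd.pivot_table(data.reset_index(), values='peak_height', index=['label'], columns=['filename'], fill_value=0.)
best_file = ph.idxmax(axis=1)  # per label: the filename holding the max peak_height
# restrict the expensive grouping to rows whose label's best file is this file
flat = pd.read_hdf(files[0],'flat_ms1')
keep = flat['label'].map(best_file) == os.path.basename(files[0])
eic_max_only = group_duplicates(flat[keep][['label','rt','mz','i','in_feature']],'label')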

In [77]:
files


Out[77]:
['/project/projectdirs/metatlas/projects/cs_ffff/20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_4_S4-MC-RCG-R4_4_Rg80to1200-CE205060--S1_Run47_features.h5',
 '/project/projectdirs/metatlas/projects/cs_ffff/20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_5_G1-MC-RCG-R1_1_Rg80to1200-CE102040--S1_Run17_features.h5',
 '/project/projectdirs/metatlas/projects/cs_ffff/20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_10_Control-MC-RCG-R2_2_Rg80to1200-CE102040--S1_Run26_features.h5',
 '/project/projectdirs/metatlas/projects/cs_ffff/20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_3_S4-MC-RCG-R3_3_Rg80to1200-CE102040--S1_Run41_features.h5',
 '/project/projectdirs/metatlas/projects/cs_ffff/20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_9_Control-MC-RCG-R1_1_Rg80to1200-CE102040--S1_Run14_features.h5',
 '/project/projectdirs/metatlas/projects/cs_ffff/20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_1_S4-MC-RCG-R1_1_Rg80to1200-CE102040--S1_Run20_features.h5',
 '/project/projectdirs/metatlas/projects/cs_ffff/20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_6_G1-MC-RCG-R2_2_Rg80to1200-CE102040--S1_Run23_features.h5',
 '/project/projectdirs/metatlas/projects/cs_ffff/20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_12_Control-MC-RCG-R4_4_Rg80to1200-CE205060--S1_Run53_features.h5',
 '/project/projectdirs/metatlas/projects/cs_ffff/20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_2_S4-MC-RCG-R2_2_Rg80to1200-CE102040--S1_Run32_features.h5',
 '/project/projectdirs/metatlas/projects/cs_ffff/20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_8_G1-MC-RCG-R4_4_Rg80to1200-CE205060--S1_Run56_features.h5',
 '/project/projectdirs/metatlas/projects/cs_ffff/20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_11_Control-MC-RCG-R3_3_Rg80to1200-CE102040--S1_Run35_features.h5',
 '/project/projectdirs/metatlas/projects/cs_ffff/20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_7_G1-MC-RCG-R3_3_Rg80to1200-CE102040--S1_Run44_features.h5']

In [111]:
%%time

# goal data structure: metatlas_dataset[file_idx][compound_idx]['ms1_summary'] = dict
# rows are in the same order as the atlas

atlas = pd.read_hdf(files[0],'atlas')
atlas.set_index('label',drop=True,inplace=True)
# atlas = atlas[[]] #we only need the index as a placeholder to organize the merge

def setup_dataframe(filename):
    df = pd.read_hdf(filename,'ms1_summary')
    
    eic = pd.read_hdf(filename,'flat_ms1')
    eic = group_duplicates(eic[['label','rt','mz','i','in_feature']],'label',make_string=False).set_index('label',drop=True)
    eic.columns = ['eic_%s'%c for c in eic.columns]
    df = pd.merge(df,eic,how='outer',left_index=True,right_index=True)
    
    msms = pd.read_hdf(filename,'flat_ms2')
    msms = group_duplicates(msms[['label','mz','i','rt','in_feature']],'label',make_string=False).set_index('label',drop=True)
    msms.columns = ['msms_%s'%c for c in msms.columns]
    df = pd.merge(df,msms,left_index=True,right_index=True,how='left')
#         df = pd.merge(atlas,msms,left_index=True,right_index=True,how='left')

    df['filename'] = os.path.basename(filename)
    return df

# df = setup_dataframe(files[0])
pool = mp.Pool(processes=12)
data = pool.map(setup_dataframe,files)
pool.close()
pool.join()
data = pd.concat(data)
data.index.name = 'label'

# timing notes from separate runs:
# never finishes with 40 cores
# 7 min 43 s with 20 cores
# 19 min 14 s with 5 cores


CPU times: user 575 ms, sys: 675 ms, total: 1.25 s
Wall time: 1.6 s

In [112]:
data.head()


Out[112]:
mz_centroid num_datapoints peak_area peak_height rt_peak eic_i eic_in_feature eic_mz eic_rt msms_i msms_in_feature msms_mz msms_rt filename
label
1 84.960321 37.0 4.806260e+07 1.455724e+06 6.571208 [1192286.875, 1371415.125, 1260008.0, 1212800.... [True, True, True, True, True, True, True, Tru... [84.9603347778, 84.9603118896, 84.9603271484, ... [6.56609296799, 6.51328706741, 6.51810359955, ... [99598.4921875, 1542.93115234, 6641.25439453, ... [True, True, True, True, True, True, True, Tru... [84.9602966309, 84.9602966309, 84.9602966309, ... [6.62827205658, 6.62827205658, 6.62827205658, ... 20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C1...
6 89.507234 37.0 1.654031e+08 7.371130e+06 10.831382 [5465512.0, 6092759.5, 5921025.5, 5720820.0, 5... [True, True, True, True, True, True, True, Tru... [89.5072402954, 89.5072402954, 89.5072250366, ... [10.8871250153, 10.8815727234, 10.8760528564, ... [905781.75, 42632.7148438, 4903.70458984, 3684... [True, True, True, True, True, True, True, Tru... [89.5072021484, 89.5072021484, 89.5072021484, ... [10.9332647324, 10.9332647324, 10.9332647324, ... 20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C1...
7 89.940499 33.0 2.151427e+07 7.398588e+05 1.253665 [619433.75, 618308.9375, 659818.3125, 645762.1... [True, True, True, True, True, True, True, Tru... [89.9404830933, 89.9404754639, 89.9404907227, ... [1.20521366596, 1.2112865448, 1.21732211113, 1... [1598.48669434, 1865.84631348, 2096.5, 1449.68... [True, True, True, True, True, True, True, Tru... [89.9404983521, 89.9404983521, 89.9404983521, ... [1.19687259197, 1.19687259197, 1.19687259197, ... 20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C1...
13 96.027754 1.0 3.604715e+03 3.604715e+03 0.771313 [3604.71533203] [True] [96.0277557373] [0.771312832832] NaN NaN NaN NaN 20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C1...
32 108.044790 33.0 7.019225e+07 7.123824e+06 0.385624 [4837.21191406, 3779.58032227, 2097595.75, 205... [True, True, True, True, True, True, True, Tru... [108.044784546, 108.044815063, 108.044868469, ... [0.30367693305, 0.297187328339, 0.427196621895... [27090.4570312, 5315.66699219, 4353.57421875, ... [True, True, True, True, True, True, True, Tru... [108.044799805, 108.044799805, 108.044799805, ... [0.388961911201, 0.388961911201, 0.38896191120... 20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C1...

In [126]:
# note: peak_height is the pivot table built in the next cell; this assumes it already exists from an earlier run
temp = peak_height.T.reset_index()
def get_group_name(filename):
    if '_Control-' in filename:
        return 'Mean_Control'
    elif '_S4-' in filename:
        return 'Mean_S4'
    elif '_G1-' in filename:
        return 'Mean_G1'
    
temp['filename'] = temp['filename'].apply(get_group_name)
mean_groups = temp.groupby('filename',axis=0).mean().T
# the +1.0 pseudocount keeps features absent from a group (mean 0) from producing log2(0);
# e.g. label 5 below: log2(1/(1+2289.487)) ~ -11.16
mean_groups['S4_log2'] = np.log2((mean_groups['Mean_S4']+1.0)/(1.0 + mean_groups['Mean_Control']))
mean_groups['G1_log2'] = np.log2((mean_groups['Mean_G1']+1.0)/(1.0 + mean_groups['Mean_Control']))
mean_groups.head()


Out[126]:
filename Mean_Control Mean_G1 Mean_S4 S4_log2 G1_log2
label
1 1.366927e+06 1.476195e+06 1.443770e+06 0.078905 0.110947
5 2.289487e+03 0.000000e+00 0.000000e+00 -11.161439 -11.161439
6 5.849534e+06 6.193266e+06 6.659633e+06 0.187121 0.082379
7 7.414602e+05 7.211796e+05 7.221202e+05 -0.038130 -0.040011
12 8.817485e+03 9.460507e+03 4.139517e+03 -1.090720 0.101539

In [127]:
peak_height = pd.pivot_table(data.reset_index(), values='peak_height',index=['label'], columns=['filename'],fill_value=0.)
temp = peak_height.T.reset_index()
def get_group_name(filename):
    if '_Control-' in filename:
        return 'Mean_Control'
    elif '_S4-' in filename:
        return 'Mean_S4'
    elif '_G1-' in filename:
        return 'Mean_G1'
    
temp['filename'] = temp['filename'].apply(get_group_name)
mean_groups = temp.groupby('filename',axis=0).mean().T
mean_groups['S4_log2'] = np.log2((mean_groups['Mean_S4']+1.0)/(1.0 + mean_groups['Mean_Control']))
mean_groups['G1_log2'] = np.log2((mean_groups['Mean_G1']+1.0)/(1.0 + mean_groups['Mean_Control']))
peak_height = peak_height.merge(mean_groups,how='outer',left_index=True,right_index=True)
peak_height = peak_height.merge(atlas,how='outer',left_index=True,right_index=True)
peak_height.to_csv('/global/homes/b/bpb/Downloads/cs_ffff_peak_height_7.csv')
peak_height.head()


Out[127]:
20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_10_Control-MC-RCG-R2_2_Rg80to1200-CE102040--S1_Run26_features.h5 20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_11_Control-MC-RCG-R3_3_Rg80to1200-CE102040--S1_Run35_features.h5 20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_12_Control-MC-RCG-R4_4_Rg80to1200-CE205060--S1_Run53_features.h5 20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_1_S4-MC-RCG-R1_1_Rg80to1200-CE102040--S1_Run20_features.h5 20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_2_S4-MC-RCG-R2_2_Rg80to1200-CE102040--S1_Run32_features.h5 20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_3_S4-MC-RCG-R3_3_Rg80to1200-CE102040--S1_Run41_features.h5 20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_4_S4-MC-RCG-R4_4_Rg80to1200-CE205060--S1_Run47_features.h5 20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_5_G1-MC-RCG-R1_1_Rg80to1200-CE102040--S1_Run17_features.h5 20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_6_G1-MC-RCG-R2_2_Rg80to1200-CE102040--S1_Run23_features.h5 20180427_KBL_MO-CS_FungSM_CpxMed_S4G1_QE-HF_C18_105_POS_MSMS_7_G1-MC-RCG-R3_3_Rg80to1200-CE102040--S1_Run44_features.h5 ... Mean_G1 Mean_S4 S4_log2 G1_log2 mz rt_peak inchi_key rt_min rt_max group_index
label
1 1311738.00 1502384.250 1.346962e+06 1.400423e+06 1.424484e+06 1.494449e+06 1455724.00 1.390986e+06 1.481474e+06 1.568711e+06 ... 1.476195e+06 1.443770e+06 0.078905 0.110947 84.960 6.604186 NaN 6.504186 6.704186 0
5 0.00 0.000 9.157949e+03 0.000000e+00 0.000000e+00 0.000000e+00 0.00 0.000000e+00 0.000000e+00 0.000000e+00 ... 0.000000e+00 0.000000e+00 -11.161439 -11.161439 88.037 5.812425 NaN 5.712425 5.912425 1
6 5769812.00 6494037.500 6.311342e+06 5.994042e+06 6.357763e+06 6.915598e+06 7371130.50 5.520660e+06 5.924286e+06 7.023860e+06 ... 6.193266e+06 6.659633e+06 0.187121 0.082379 89.507 10.930367 NaN 10.830367 11.030367 2
7 725063.25 643727.875 7.733621e+05 7.694987e+05 7.364027e+05 6.427208e+05 739858.75 7.811588e+05 7.521926e+05 6.360627e+05 ... 7.211796e+05 7.221202e+05 -0.038130 -0.040011 89.941 1.240956 NaN 1.140956 1.340956 3
8 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 90.622 5.861404 NaN 5.761404 5.961404 4

5 rows × 23 columns


In [ ]:
a = data.reset_index().values
labels, row_pos = np.unique(a[:, 0], return_inverse=True) #these are feature labels
basenames, col_pos = np.unique(a[:, -1], return_inverse=True) #these are file basenames

pivot_table = np.zeros((len(labels), len(basenames), 5), dtype=float)
# columns 1-5 are the scalar ms1_summary values: mz_centroid, num_datapoints, peak_area, peak_height, rt_peak
pivot_table[row_pos, col_pos] = a[:, [1,2,3,4,5]]
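
A quick sanity check (a sketch): within the [1,2,3,4,5] selection, peak_height sits at position 3, so that slice of the numpy pivot should match a pandas pivot of the same values. ph_np and ph_pd are illustrative names.

In [ ]:
ph_np = pd.DataFrame(pivot_table[:, :, 3], index=labels, columns=basenames)
ph_pd = pd.pivot_table(data.reset_index(), values='peak_height', index=['label'], columns=['filename'], fill_value=0.)
print(np.allclose(ph_np.values, ph_pd.values))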

In [ ]:
# columns 6-9 hold the per-feature arrays: eic_i, eic_in_feature, eic_mz, eic_rt
eic_table = np.zeros((len(labels), len(basenames), 4), dtype=object)
eic_table[row_pos, col_pos] = a[:, [6,7,8,9]]

In [ ]:
eic_table[0,0]

In [ ]:
data.index.name = 'label'
peak_height = pd.pivot_table(data.reset_index(), values='peak_height',index=['label'], columns=['filename'],fill_value=None)
peak_height.head()

In [ ]:
#20 seconds to do 64 of them

# metatlas_dataset[file_idx][compound_idx]['ms1_summary'] = dict
# rows are same order as atlas
metatlas_dataset = []
atlas = pd.read_hdf(files[0],'atlas')
atlas.set_index('label',drop=True,inplace=True)
atlas = atlas[[]] #we only need the index as a placeholder to organize the merge

def make_data_for_metatlas_dataset(f):
    df = pd.read_hdf(f,'ms1_summary')
    ms1 = pd.merge(atlas,df,left_index=True,right_index=True,how='left').to_dict('records')
    
    df = pd.read_hdf(f,'flat_ms1')
    eic = group_duplicates(df[['label','rt','mz','i','in_feature']],'label',make_string=False).set_index('label',drop=True)
    eic = pd.merge(atlas,eic,left_index=True,right_index=True,how='left').to_dict('records')

    df = pd.read_hdf(f,'flat_ms2')
    msms = group_duplicates(df[['label','mz','i','rt','in_feature']],'label',make_string=False).set_index('label',drop=True)
    msms = pd.merge(atlas,msms,left_index=True,right_index=True,how='left').to_dict('records')

    return [{'ms1_summary':m,'file':f,'msms':msms[j],'eic':eic[j]} for j,m in enumerate(ms1)]
    
# pool = mp.Pool(processes=10)
# data = pool.map(make_data_for_metatlas_dataset,files)
# pool.close()
# pool.join()
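
A sketch of indexing the nested result once the commented-out pool.map above is run (here built serially on just the first file for illustration):

In [ ]:
metatlas_dataset = [make_data_for_metatlas_dataset(f) for f in files[:1]]
entry = metatlas_dataset[0][0]   # first file, first compound in atlas order
print(entry['file'])
print(entry['ms1_summary'])      # dict of scalars: mz_centroid, peak_height, ...
print(entry['eic'].keys())       # arrays: rt, mz, i, in_feature (NaN if no points)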

In [ ]:
df = pd.read_hdf(files[0],'flat_ms2')
msms = group_duplicates(df[['label','mz','i','rt','in_feature']],'label',make_string=False).set_index('label',drop=True)
msms.head(100)

In [ ]:
# %%time
df = pd.read_hdf(files[0],'flat_ms1')
eic = group_duplicates(df[['label','rt','mz','i','in_feature']],'label',make_string=False).set_index('label',drop=True)
eic.head()
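
A sketch of plotting one EIC from the grouped frame above (the feature choice and the sort are illustrative):

In [ ]:
row = eic.iloc[0]
order = np.argsort(row['rt'])
plt.figure()
plt.plot(np.asarray(row['rt'],dtype=float)[order], np.asarray(row['i'],dtype=float)[order])
plt.xlabel('retention time')
plt.ylabel('intensity')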