In [1]:
    
import pandas as pd
import glob
import numpy as np
import sys
import multiprocessing as mp
import os
from matplotlib import pyplot as plt
%matplotlib notebook
def group_duplicates(df,group_col,make_string=False):
    """
    takes in a list of grouping columns and turns the rest into arrays
    """
    
    all_cols = np.asarray(df.columns)
    #get the index of the grouping term as array
    idx_group = np.argwhere(all_cols == group_col).flatten()
    #get the indices of all other terms as array
    idx_list = np.argwhere(all_cols != group_col).flatten()
    cols = all_cols[idx_list]
    
    # create a sorted numpy array (sorted by column=group_col)
    a = df.sort_values(group_col).values.T
    #get the indices of the first instance of each unique identifier
    ukeys, index = np.unique(a[idx_group,:],return_index=True)
    #split the other rows of the array into separate arrays using the 
    #unique index
    arrays = np.split(a[idx_list,:],index[1:],axis=1)
    #make a list of dicts with column headings as keys; every value is the
    #array of grouped entries for that column
    #(the commented variant below unpacks singleton arrays to bare scalars)
    
#     ucpds = [dict([(c,aa) if len(aa)>1 else (c,aa[0]) for c,aa in zip(cols,a)]) for a in arrays ]
    ucpds = [dict([(c,aa) for c,aa in zip(cols,a)]) for a in arrays ]
    #make a dataframe from the list of dicts
    df2 = pd.DataFrame(ucpds,index=ukeys)
    
    #convert the array columns to strings so the result can be saved (e.g. to CSV)
    if make_string:
        for c in cols:
#             df2[c] = df2[c].apply(lambda x: np.array2string(x, precision=5, separator=','))
            df2[c] = df2[c].apply(lambda x: str(x.tolist()))
            
    df2.index = df2.index.set_names(group_col)
    df2.reset_index(inplace=True)
    #return dataframe
    return df2
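    
In [ ]:
    
# A minimal usage sketch on toy data (not from the project files): duplicate
# 'label' rows collapse so that 'rt' and 'i' each become a per-label array.
toy = pd.DataFrame({'label':['a','a','b'],'rt':[1.0,2.0,3.0],'i':[10.0,20.0,30.0]})
grouped = group_duplicates(toy,'label')
# one row per label; grouped['rt'][0] holds both rt values for label 'a'
grouped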
    
In [2]:
    
files = glob.glob('/project/projectdirs/metatlas/projects/cs_ffff/*_features.h5')
# files = glob.glob('/project/projectdirs/metatlas/projects/ffff/20171114_KBL_AS-JC_502921_MCom_1to3mem_QE139_ZHILIC5um_736960_POS*.h5')
df = pd.read_hdf(files[0],'ms1_summary')
# eic = group_duplicates(df[['label','rt','i','in_feature']],'label',make_string=False)
# eic.head()
df.head()
    
    Out[2]:
In [77]:
    
files
    
    Out[77]:
In [111]:
    
%%time
# metatlas_dataset[file_idx][compound_idx]['ms1_summary'] = dict
# rows are in the same order as the atlas
metatlas_dataset = []
atlas = pd.read_hdf(files[0],'atlas')
atlas.set_index('label',drop=True,inplace=True)
# atlas = atlas[[]] #we only need the index as a placeholder to organize the merge
def setup_dataframe(filename):
    df = pd.read_hdf(filename,'ms1_summary')
    
    eic = pd.read_hdf(filename,'flat_ms1')
    eic = group_duplicates(eic[['label','rt','mz','i','in_feature']],'label',make_string=False).set_index('label',drop=True)
    eic.columns = ['eic_%s'%c for c in eic.columns]
    print(eic.columns)
    df = pd.merge(df,eic,how='outer',left_index=True,right_index=True)
    
    msms = pd.read_hdf(filename,'flat_ms2')
    msms = group_duplicates(msms[['label','mz','i','rt','in_feature']],'label',make_string=False).set_index('label',drop=True)
    msms.columns = ['msms_%s'%c for c in msms.columns]
    df = pd.merge(df,msms,left_index=True,right_index=True,how='left')
#         df = pd.merge(atlas,msms,left_index=True,right_index=True,how='left')
    df['filename'] = os.path.basename(filename)
    return df
# df = setup_dataframe(files[0])
pool = mp.Pool(processes=12)
data = pool.map(setup_dataframe,files)
pool.close()
pool.join()
data = pd.concat(data)
data.index.name = 'label'
# never finishes with 40 cores
# 7 min 43 s with 20 cores
# 19 min 14 s with 5 cores
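    
In [ ]:
    
# An alternative sketch (assumption, not from the original run): a context-
# managed Pool guarantees close/join even if a worker raises, and capping the
# worker count at the machine's cores keeps the concurrent HDF5 reads bounded.
# n_workers = min(12, mp.cpu_count())
# with mp.Pool(processes=n_workers) as pool:
#     data = pd.concat(pool.map(setup_dataframe, files))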
    
    
In [112]:
    
data.head()
    
    Out[112]:
In [126]:
    
# note: peak_height must already exist here; it is built by the pivot in the next cell
temp = peak_height.T.reset_index()
def get_group_name(filename):
    if '_Control-' in filename:
        return 'Mean_Control'
    elif '_S4-' in filename:
        return 'Mean_S4'
    elif '_G1-' in filename:
        return 'Mean_G1'
    
temp['filename'] = temp['filename'].apply(get_group_name)
mean_groups = temp.groupby('filename').mean().T
mean_groups['S4_log2'] = np.log2((mean_groups['Mean_S4']+1.0)/(1.0 + mean_groups['Mean_Control']))
mean_groups['G1_log2'] = np.log2((mean_groups['Mean_G1']+1.0)/(1.0 + mean_groups['Mean_Control']))
mean_groups.head()
    
    Out[126]:
In [127]:
    
peak_height = pd.pivot_table(data.reset_index(), values='peak_height',index=['label'], columns=['filename'],fill_value=0.)
temp = peak_height.T.reset_index()
def get_group_name(filename):
    if '_Control-' in filename:
        return 'Mean_Control'
    elif '_S4-' in filename:
        return 'Mean_S4'
    elif '_G1-' in filename:
        return 'Mean_G1'
    
temp['filename'] = temp['filename'].apply(get_group_name)
mean_groups = temp.groupby('filename').mean().T
mean_groups['S4_log2'] = np.log2((mean_groups['Mean_S4']+1.0)/(1.0 + mean_groups['Mean_Control']))
mean_groups['G1_log2'] = np.log2((mean_groups['Mean_G1']+1.0)/(1.0 + mean_groups['Mean_Control']))
peak_height = peak_height.merge(mean_groups,how='outer',left_index=True,right_index=True)
peak_height = peak_height.merge(atlas,how='outer',left_index=True,right_index=True)
peak_height.to_csv('/global/homes/b/bpb/Downloads/cs_ffff_peak_height_7.csv')
peak_height.head()
    
    Out[127]:
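In [ ]:
    
# Illustrative check (not part of the original output) of the +1.0 pseudocount
# in the log2 ratios above: a zero peak height in either group would otherwise
# produce log2(0) = -inf or a division by zero.
print(np.log2((0.0 + 1.0)/(1.0 + 1000.0)))    # absent in S4: ~ -9.97
print(np.log2((1000.0 + 1.0)/(1.0 + 0.0)))    # absent in Control: ~ +9.97
    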
In [ ]:
    
a = data.reset_index().values
labels, row_pos = np.unique(a[:, 0], return_inverse=True) #these are feature labels
basenames, col_pos = np.unique(a[:, -1], return_inverse=True) #these are file basenames
pivot_table = np.zeros((len(labels), len(basenames),5), dtype=float)
pivot_table[row_pos, col_pos] = a[:, [1,2,3,4,5]]
# labels, row_pos = np.unique(a[:, 0], return_inverse=True) #these are feature labels
# rt, col_pos = np.unique(a[:, 1], return_inverse=True) #these are rt values
# pivot_table = np.zeros((len(labels), len(rt),3), dtype=float)
# pivot_table[row_pos, col_pos] = a[:, [2,3,4]]
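    
In [ ]:
    
# A toy illustration (synthetic data) of the return_inverse trick used above:
# np.unique's inverse array maps each flat row straight to its (row, col) slot
# in the dense output, so the whole pivot is one fancy-indexing assignment.
toy = np.array([['f1','s1',1.0],['f1','s2',2.0],['f2','s1',3.0]],dtype=object)
r_labels, r_pos = np.unique(toy[:,0],return_inverse=True)
c_labels, c_pos = np.unique(toy[:,1],return_inverse=True)
dense = np.zeros((len(r_labels),len(c_labels)))
dense[r_pos, c_pos] = toy[:,2]
dense # [[1., 2.], [3., 0.]]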
    
In [ ]:
    
eic_table = np.zeros((len(labels), len(basenames),4),dtype=object)
eic_table[row_pos, col_pos] = a[:, [6,7,8,9]]
    
In [ ]:
    
eic_table[0,0]
    
In [ ]:
    
data.index.name = 'label'
peak_height = pd.pivot_table(data.reset_index(), values='i',index=['label'], columns=['filename'],fill_value=None)
peak_height.head()
    
In [ ]:
    
#20 seconds to do 64 of them
# metatlas_dataset[file_idx][compound_idx]['ms1_summary'] = dict
# rows are in the same order as the atlas
metatlas_dataset = []
atlas = pd.read_hdf(files[0],'atlas')
atlas.set_index('label',drop=True,inplace=True)
atlas = atlas[[]] #we only need the index as a placeholder to organize the merge
def make_data_for_metatlas_dataset(f):
    df = pd.read_hdf(f,'ms1_summary')
    ms1 = pd.merge(atlas,df,left_index=True,right_index=True,how='left').to_dict('records')
    
    df = pd.read_hdf(f,'flat_ms1')
    eic = group_duplicates(df[['label','rt','mz','i','in_feature']],'label',make_string=False).set_index('label',drop=True)
    eic = pd.merge(atlas,eic,left_index=True,right_index=True,how='left').to_dict('records')
    df = pd.read_hdf(f,'flat_ms2')
    msms = group_duplicates(df[['label','mz','i','rt','in_feature']],'label',make_string=False).set_index('label',drop=True)
    msms = pd.merge(atlas,msms,left_index=True,right_index=True,how='left').to_dict('records')
    return [{'ms1_summary':m,'file':f,'msms':msms[j],'eic':eic[j]} for j,m in enumerate(ms1)]
    
# pool = mp.Pool(processes=10)
# data = pool.map(make_data_for_metatlas_dataset,files)
# pool.close()
# pool.join()
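    
In [ ]:
    
# A minimal usage sketch (assumes the HDF5 files above are readable): the
# nested list is indexed as metatlas_dataset[file_idx][compound_idx], matching
# the convention noted at the top of this block.
metatlas_dataset = [make_data_for_metatlas_dataset(f) for f in files[:2]]
entry = metatlas_dataset[0][0]
print(entry['file'], sorted(entry['ms1_summary'].keys()))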
    
In [ ]:
    
df = pd.read_hdf(files[0],'flat_ms2')
msms = group_duplicates(df[['label','mz','i','rt','in_feature']],'label',make_string=False).set_index('label',drop=True)
msms.head(100)
    
In [ ]:
    
# %%time
df = pd.read_hdf(files[0],'flat_ms1')
eic = group_duplicates(df[['label','rt','mz','i','in_feature']],'label',make_string=False).set_index('label',drop=True)
eic.head()
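    
In [ ]:
    
# A closing sketch of the make_string option: converting the grouped arrays to
# strings makes the table safe to write with to_csv, which would otherwise
# truncate the raw numpy arrays. The output path is hypothetical.
eic_str = group_duplicates(df[['label','rt','mz','i','in_feature']],'label',make_string=True)
# eic_str.to_csv('flat_ms1_eic.csv',index=False)
eic_str.head()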