In [1]:
import pandas as pd
import glob
import numpy as np
import sys
import multiprocessing as mp
import os
from matplotlib import pyplot as plt
%matplotlib notebook
def group_duplicates(df, group_col, make_string=False):
    """
    Collapses rows of df that share the same value in group_col:
    the grouping column becomes the index and every other column
    is turned into a numpy array of the grouped values.
    """
    all_cols = np.asarray(df.columns)
    # index of the grouping column as an array
    idx_group = np.argwhere(all_cols == group_col).flatten()
    # indices of all the other columns as an array
    idx_list = np.argwhere(all_cols != group_col).flatten()
    cols = all_cols[idx_list]
    # create a numpy array sorted by the grouping column
    a = df.sort_values(group_col).values.T
    # indices of the first occurrence of each unique identifier
    ukeys, index = np.unique(a[idx_group, :], return_index=True)
    # split the remaining rows into one array per unique identifier
    arrays = np.split(a[idx_list, :], index[1:], axis=1)
    # build a list of dicts with column headings as keys
    # ucpds = [dict([(c, aa) if len(aa) > 1 else (c, aa[0]) for c, aa in zip(cols, a)]) for a in arrays]
    ucpds = [dict([(c, aa) for c, aa in zip(cols, a)]) for a in arrays]
    # make a dataframe from the list of dicts
    df2 = pd.DataFrame(ucpds, index=ukeys)
    # stringify the array columns if you want to save the result to disk
    if make_string:
        for c in cols:
            # df2[c] = df2[c].apply(lambda x: np.array2string(x, precision=5, separator=','))
            df2[c] = df2[c].apply(lambda x: str(x.tolist()))
    df2.index = df2.index.set_names(group_col)
    df2.reset_index(inplace=True)
    return df2
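A quick sanity check on a toy frame (values invented for illustration) shows what the grouping does:
In [ ]:
# Toy input: three rows share label 'a', one has label 'b'.
toy = pd.DataFrame({'label': ['a', 'a', 'b', 'a'],
                    'rt': [1.0, 2.0, 5.0, 3.0],
                    'i': [10., 20., 50., 30.]})
# Expect one row per label, with the rt and i values for each label
# collapsed into numpy arrays.
group_duplicates(toy, 'label')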
In [2]:
files = glob.glob('/project/projectdirs/metatlas/projects/cs_ffff/*_features.h5')
# files = glob.glob('/project/projectdirs/metatlas/projects/ffff/20171114_KBL_AS-JC_502921_MCom_1to3mem_QE139_ZHILIC5um_736960_POS*.h5')
df = pd.read_hdf(files[0],'ms1_summary')
# eic = group_duplicates(df[['label','rt','i','in_feature']],'label',make_string=False)
# eic.head()
df.head()
Out[2]:
In [77]:
files
Out[77]:
In [111]:
%%time
# metatlas_dataset[file_idx][compound_idx]['ms1_summary'] = dict
# rows are same order as atlas
metatlas_dataset = []
atlas = pd.read_hdf(files[0],'atlas')
atlas.set_index('label',drop=True,inplace=True)
# atlas = atlas[[]] #we only need the index as a placeholder to organize the merge
def setup_dataframe(filename):
df = pd.read_hdf(filename,'ms1_summary')
eic = pd.read_hdf(filename,'flat_ms1')
eic = group_duplicates(eic[['label','rt','mz','i','in_feature']],'label',make_string=False).set_index('label',drop=True)
eic.columns = ['eic_%s'%c for c in eic.columns]
print(eic.columns)
df = pd.merge(df,eic,how='outer',left_index=True,right_index=True)
msms = pd.read_hdf(filename,'flat_ms2')
msms = group_duplicates(msms[['label','mz','i','rt','in_feature']],'label',make_string=False).set_index('label',drop=True)
msms.columns = ['msms_%s'%c for c in msms.columns]
df = pd.merge(df,msms,left_index=True,right_index=True,how='left')
# df = pd.merge(atlas,msms,left_index=True,right_index=True,how='left')
df['filename'] = os.path.basename(filename)
return df
# df = setup_dataframe(files[0])
pool = mp.Pool(processes=12)
data = pool.map(setup_dataframe,files)
pool.close()
pool.join()
data = pd.concat(data)
data.index.name = 'label'
# never finishes with 40 cores
# 7 min 43 s with 20 cores
# 19 min 14 s with 5 cores
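The timings above suggest the pool stops scaling, and eventually hangs, well before 40 workers. A sketch of one mitigation, assuming contention on the shared filesystem is the culprit (nothing here is benchmarked): cap the worker count and batch files per worker.
In [ ]:
# Sketch (assumption: HDF5/filesystem contention limits scaling).
# Cap workers at 12 and hand pool.map an explicit chunksize so each
# worker processes a batch of files instead of one task at a time.
n_workers = min(12, len(files))
pool = mp.Pool(processes=n_workers)
data = pool.map(setup_dataframe, files, chunksize=max(1, len(files) // n_workers))
pool.close()
pool.join()
data = pd.concat(data)
data.index.name = 'label'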
In [112]:
data.head()
Out[112]:
In [127]:
peak_height = pd.pivot_table(data.reset_index(), values='peak_height',index=['label'], columns=['filename'],fill_value=0.)
temp = peak_height.T.reset_index()
def get_group_name(filename):
    if '_Control-' in filename:
        return 'Mean_Control'
    elif '_S4-' in filename:
        return 'Mean_S4'
    elif '_G1-' in filename:
        return 'Mean_G1'
    # filenames outside these three groups map to None and are dropped by groupby
temp['filename'] = temp['filename'].apply(get_group_name)
mean_groups = temp.groupby('filename').mean().T
mean_groups['S4_log2'] = np.log2((mean_groups['Mean_S4']+1.0)/(1.0 + mean_groups['Mean_Control']))
mean_groups['G1_log2'] = np.log2((mean_groups['Mean_G1']+1.0)/(1.0 + mean_groups['Mean_Control']))
peak_height = peak_height.merge(mean_groups,how='outer',left_index=True,right_index=True)
peak_height = peak_height.merge(atlas,how='outer',left_index=True,right_index=True)
peak_height.to_csv('/global/homes/b/bpb/Downloads/cs_ffff_peak_height_7.csv')
peak_height.head()
Out[127]:
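The +1.0 pseudocount keeps the fold change finite when a feature is absent from one group. A quick numeric check with invented peak heights:
In [ ]:
# Invented values: a feature at 1023 in S4 but absent in Control gives
# log2(1024/1) = 10; absent in both gives log2(1/1) = 0 rather than inf.
np.log2((1023 + 1.0) / (1.0 + 0.0)), np.log2((0.0 + 1.0) / (1.0 + 0.0))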
In [ ]:
a = data.reset_index().values
labels, row_pos = np.unique(a[:, 0], return_inverse=True) #these are feature labels
basenames, col_pos = np.unique(a[:, -1], return_inverse=True) #these are file basenames
pivot_table = np.zeros((len(labels), len(basenames),5), dtype=float)
pivot_table[row_pos, col_pos] = a[:, [1,2,3,4,5]]
# labels, row_pos = np.unique(a[:, 0], return_inverse=True) #these are feature labels
# rt, col_pos = np.unique(a[:, 1], return_inverse=True) #these are rt values
# pivot_table = np.zeros((len(labels), len(rt),3), dtype=float)
# pivot_table[row_pos, col_pos] = a[:, [2,3,4]]
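The return_inverse trick maps every row of a onto its (row, column) slot in the dense array in one shot. A self-contained miniature with invented values:
In [ ]:
# Toy version of the return_inverse pivot: labels down the rows,
# file basenames across the columns, one value per cell.
toy_a = np.array([['f1', 10., 'run1'],
                  ['f2', 20., 'run1'],
                  ['f1', 30., 'run2']], dtype=object)
r_labels, r_pos = np.unique(toy_a[:, 0], return_inverse=True)
c_labels, c_pos = np.unique(toy_a[:, -1], return_inverse=True)
table = np.zeros((len(r_labels), len(c_labels)))
table[r_pos, c_pos] = toy_a[:, 1]
table  # rows f1,f2 x cols run1,run2 -> [[10., 30.], [20., 0.]]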
In [ ]:
eic_table = np.zeros((len(labels), len(basenames),4),dtype=object)
eic_table[row_pos, col_pos] = a[:, [6,7,8,9]]
In [ ]:
eic_table[0,0]
In [ ]:
data.index.name = 'label'
peak_height = pd.pivot_table(data.reset_index(), values='i',index=['label'], columns=['filename'],fill_value=None)
peak_height.head()
In [ ]:
#20 seconds to do 64 of them
# metatlas_dataset[file_idx][compound_idx]['ms1_summary'] = dict
# rows are same order as atlas
metatlas_dataset = []
atlas = pd.read_hdf(files[0],'atlas')
atlas.set_index('label',drop=True,inplace=True)
atlas = atlas[[]] #we only need the index as a placeholder to organize the merge
def make_data_for_metatlas_dataset(f):
df = pd.read_hdf(f,'ms1_summary')
ms1 = pd.merge(atlas,df,left_index=True,right_index=True,how='left').to_dict('records')
df = pd.read_hdf(f,'flat_ms1')
eic = group_duplicates(df[['label','rt','mz','i','in_feature']],'label',make_string=False).set_index('label',drop=True)
eic = pd.merge(atlas,eic,left_index=True,right_index=True,how='left').to_dict('records')
df = pd.read_hdf(f,'flat_ms2')
msms = group_duplicates(df[['label','mz','i','rt','in_feature']],'label',make_string=False).set_index('label',drop=True)
msms = pd.merge(atlas,msms,left_index=True,right_index=True,how='left').to_dict('records')
return [{'ms1_summary':m,'file':f,'msms':msms[j],'eic':eic[j]} for j,m in enumerate(ms1)]
# pool = mp.Pool(processes=10)
# data = pool.map(make_data_for_metatlas_dataset,files)
# pool.close()
# pool.join()
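With the pool lines commented out, a serial spot check on one file shows the nested shape the function returns; the indices below are illustrative:
In [ ]:
# Serial spot check; metatlas_dataset[file_idx][compound_idx] is a dict
# with 'ms1_summary', 'file', 'msms', and 'eic' keys.
metatlas_dataset = [make_data_for_metatlas_dataset(files[0])]
metatlas_dataset[0][0]['ms1_summary']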
In [ ]:
df = pd.read_hdf(files[0],'flat_ms2')
msms = group_duplicates(df[['label','mz','i','rt','in_feature']],'label',make_string=False).set_index('label',drop=True)
msms.head(100)
In [ ]:
# %%time
df = pd.read_hdf(files[0],'flat_ms1')
eic = group_duplicates(df[['label','rt','mz','i','in_feature']],'label',make_string=False).set_index('label',drop=True)
eic.head()
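Since matplotlib is already loaded, the grouped arrays plot directly. A sketch for the first compound in eic, sorting by rt first since group_duplicates does not guarantee within-group order:
In [ ]:
# Sketch: extracted-ion chromatogram for the first compound in eic.
# Each cell of 'rt' and 'i' holds a per-scan numpy array after grouping.
row = eic.iloc[0]
order = np.argsort(row['rt'])
fig, ax = plt.subplots()
ax.plot(row['rt'][order], row['i'][order], '.-')
ax.set_xlabel('rt')
ax.set_ylabel('intensity')
ax.set_title(str(eic.index[0]))
plt.show()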