The goal of this notebook is to investigate the odd batch-like structure ("batchiness") in Craig's processed malaria dataset.
In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [52]:
# Input files, given relative to the notebook's working directory.
fname = '../7-24-17_Malaria_pos_grouped_data.csv'
fmeta = '../7-22-17_malaria_metadata.txt'
fmap = '../a_UPLC_POS_nmfi_and_bsi_diagnosis.txt'

# The grouped data file is comma-separated; the metadata and the
# assay/diagnosis files are tab-delimited.
df = pd.read_csv(fname)
meta = pd.read_csv(fmeta, delimiter='\t')
smap = pd.read_csv(fmap, delimiter='\t')

# Preview the first rows of the grouped data table.
df.head()
Out[52]:
In [53]:
# Label every feature row with an "mz:rt" string and use it as the row index.
mz = df['mz'].astype(str)
rt = df['rt'].astype(str)
idx = mz + ':' + rt
df.index = idx

# Columns produced by xcms/camera that describe a feature rather than hold
# per-sample values; every remaining column is treated as a sample.
not_samples = [
    'mz', 'mzmin', 'mzmax',
    'rt', 'rtmin', 'rtmax',
    'isotopes', 'adduct',
]
# NOTE: Index.difference returns the remaining labels sorted, so the sample
# columns end up in sorted order rather than file order.
samples_list = df.columns.difference(not_samples)

# NOTE: `df` is rebound to the samples-only table here, so this cell cannot be
# re-run without reloading the data first ('mz'/'rt' are no longer columns).
df = df.loc[:, samples_list]
df.head()
Out[53]:
In [54]:
# Index the metadata by sample name so rows can be looked up by label; the
# 'Sample Name' column itself is kept in the table.
meta = meta.set_index('Sample Name', drop=False)
meta.head()
Out[54]:
In [55]:
# List the distinct patient groups present in the metadata.
meta.loc[:, 'Factor Value[patient group]'].unique()
Out[55]:
In [56]:
# Preview the two columns used to translate MS assay names into sample names.
smap.loc[:, ['Sample Name', 'MS Assay Name']].head()
Out[56]:
In [57]:
# Change columns in feature table to match sample name in metadata.

# MS Assay Name -> Sample Name, taken from the assay map.
ms_to_sample = dict(zip(smap['MS Assay Name'], smap['Sample Name']))

# Feature table column name -> MS Assay Name: take the text before the first
# underscore, drop its first character, and append '_P'.
df_to_ms = {col: col.split('_')[0][1:] + '_P' for col in df.columns}

# Fail with a readable message listing every column whose derived assay name
# is absent from the assay map, instead of a bare KeyError on the first miss.
unmapped = {col: assay for col, assay in df_to_ms.items()
            if assay not in ms_to_sample}
if unmapped:
    raise KeyError(
        "No 'MS Assay Name' entry for feature table columns "
        "(column -> derived assay name): {}".format(unmapped)
    )

# Feature table column names -> sample names, in the same order as df.columns.
df_to_sample = [ms_to_sample[df_to_ms[col]] for col in df.columns]
In [58]:
# Sanity check: number of mapped sample names, then the shapes of the
# feature table and the metadata table.
print(len(df_to_sample), df.shape, meta.shape)
In [59]:
# Relabel the feature table columns with sample names. df_to_sample was built
# by iterating over df.columns, so its order lines up with the columns.
df.columns = df_to_sample
df.head()
Out[59]:
In [60]:
# Metadata column holding each sample's patient group.
discol = 'Factor Value[patient group]'

# One row per feature table column, carrying that sample's patient group.
sample_colors = meta.loc[df.columns, discol].to_frame()

# Colour assigned to each patient group.
colormap = {
    'malaria': 'red',
    'non-malarial febrile illness': 'blue',
    'bacterial bloodstream infection': 'green',
}
# A group missing from `colormap` raises KeyError here.
sample_colors['color'] = [colormap[group] for group in sample_colors[discol]]

# Show which patient groups actually occur among the feature table samples.
sample_colors[discol].unique()
Out[60]:
In [61]:
# Missing values and zeros are set to 1 first, so they become 0 on the
# log10 scale.
filled = df.fillna(1).replace(0, 1)
toplot = np.log10(filled)

# Rank features (rows) by their mean log10 value across all columns and keep
# the labels of the 100 highest.
mean_by_feature = toplot.mean(axis=1)
top100feats = mean_by_feature.sort_values(ascending=False).index[:100]
In [62]:
# Heatmap of the selected features: only the columns are clustered, and each
# column is annotated with its patient-group colour.
heatmap_data = toplot.loc[top100feats]
sns.clustermap(
    heatmap_data,
    row_cluster=False,
    col_colors=sample_colors['color'],
)
Out[62]: