In [1]:
# Install the `mwtab` dependency into the environment of the running kernel.
# Running pip as ``<this interpreter> -m pip`` guarantees the package lands in
# the Python that executes this notebook (a bare ``pip`` on PATH may belong to
# a different installation). Passing an argument list avoids shell=True, and
# check_call raises CalledProcessError instead of silently returning a
# non-zero exit status when the install fails.
import subprocess
import sys
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'mwtab'])
Out[1]:
In [2]:
%load_ext autoreload
import src.project_fxns.organize_xcms as xcms_fxns
import src.data.preprocessing as preproc
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import src.data.data_exploration as explore
import pandas as pd
import scipy.stats as stats
from sklearn.decomposition import PCA
from sklearn import preprocessing
%matplotlib inline
%autoreload 2
In [3]:
# Read the XCMS result table for the MTBLS315 positive-mode UHPLC data, then
# remove the 'X' prefix from its column names.
mtbls315_pos = xcms_fxns.Xcms_organize(
    '/home/data/processed/MTBLS315/uhplc_pos/xcms_result.tsv')
mtbls315_pos.remove_column_prefix(prefix='X')

# Build the class <-> sample-name mapping from the two study metadata files,
# using the patient-group factor column as the class label.
class_col = 'Factor Value[patient group]'
mtbls315_pos.mtbls_class_mapping(
    '/home/data/raw/MTBLS315/a_UPLC_POS_nmfi_and_bsi_diagnosis.txt',
    '/home/data/raw/MTBLS315/s_NMFI and BSI diagnosis.txt',
    class_col,
)

# Encode the classes as numeric values.
mtbls315_pos.class_encoder()
In [4]:
# Build a two-class mapping: the 'malaria' samples versus the pooled samples
# of the two non-malarial groups. (A bare `mtbls315_pos.class_dict` expression
# used to open this cell; it was not the last statement, so it displayed
# nothing and has been dropped.)
mal = mtbls315_pos.class_dict['malaria']
no_mal = np.concatenate([
    mtbls315_pos.class_dict['bacterial bloodstream infection'],
    mtbls315_pos.class_dict['non-malarial febrile illness'],
])
new_class_dict = {'malaria': mal, 'non-malarial fever': no_mal}
In [5]:
# Preview only the first rows of the feature table instead of rendering the
# whole frame into the notebook output.
mtbls315_pos.feature_table.head()
Out[5]:
In [12]:
# Feature-sparsity plot of the feature table using the two-class
# (malaria / non-malarial fever) mapping; the helper's return value is kept
# in `sparse_graph`.
sparse_graph = explore.plot_feature_sparsity(
    mtbls315_pos.feature_table,
    new_class_dict,
)
In [7]:
# Report how many entries of the feature table are NaN before filling.
n_missing = mtbls315_pos.feature_table.isnull().sum().sum()
print('Original number of nans %s ' % n_missing)

# `zero_filled` is whatever explore.fill_zero_nan returns for the table
# (presumably zeros/NaNs replaced by a fill value -- confirm in the helper).
zero_filled = explore.fill_zero_nan(mtbls315_pos.feature_table)

# NaN count after filling; shown as the cell output.
zero_filled.isnull().sum().sum()
Out[7]:
In [8]:
# Smallest value in each row of the filled table (a quick check of what the
# fill step left as the per-row floor).
zero_filled.min(axis='columns')
Out[8]:
In [9]:
# Reshape the filled table with explore.tidy, then put its 'value' column on
# a log10 scale.
tidy = explore.tidy(zero_filled)
log10_values = np.log10(tidy['value'])
tidy['value'] = log10_values
In [10]:
# Intensity plot of the log-scaled tidy data, passing the original class
# mapping; the returned object is kept in `axes` for saving later.
axes = explore.sample_feature_intensity(
    tidy,
    mtbls315_pos.class_dict,
)
In [10]:
# Hand the axes to explore.save_axes together with the target directory and
# the labels 'a' .. 'h'.
explore.save_axes(axes, '/home/deletable', list('abcdefgh'))
In [11]:
# Same intensity view as above, but drawn with seaborn box plots and without
# passing a class mapping.
explore.sample_feature_intensity(tidy, plot_type=sns.boxplot)
Out[11]:
In [53]:
# Plot the sparseness distribution per feature
feat_sparsity = ((mtbls315_pos.feature_table < 1e-15).sum(axis=0)
/ mtbls315_pos.feature_table.shape[0])
explore.distplot_classes(feat_sparsity)
Out[53]:
In [51]:
# Sparseness distribution per sample: for every row, the fraction of columns
# whose value is below 1e-15, plotted per class of the two-class mapping.
sample_sparsity = ((mtbls315_pos.feature_table < 1e-15).sum(axis=1)
                   / mtbls315_pos.feature_table.shape[1])
# Plot the value computed in this cell. The call previously referenced
# `sparsity`, a name that is only defined in a later cell, so it failed (or
# used stale data) when the notebook was run top to bottom.
explore.distplot_classes(sample_sparsity, new_class_dict)
Out[51]:
In [48]:
# Sparseness per sample
sparsity = ((mtbls315_pos.feature_table < 1e-15).sum(axis=1)
/ mtbls315_pos.feature_table.shape[1])
explore.plot_feature_sparsity(mtbls315_pos.feature_table,
mtbls315_pos.class_dict)
Out[48]:
In [47]:
# Feature-sparsity plot using the two-class mapping, with the helper's
# default settings.
explore.plot_feature_sparsity(
    mtbls315_pos.feature_table,
    new_class_dict,
)
Out[47]:
In [41]:
# Per-class distribution plot for the two-class mapping. The table is put on
# a log10 scale first and np.mean is passed as the summary function `fxn`.
log10_filled = np.log10(zero_filled)
explore.distplot_classes(
    log10_filled,
    new_class_dict,
    fxn=np.mean,
    axlabel='log10(Mean Intensity)',
    bins=100,
)
Out[41]:
In [14]:
# plot intensities
#sns.distplot(np.log10(zero_filled.mean(axis=0)), bins=100)
explore.plot_mean_intensity(np.log10(zero_filled),
class_dict=mtbls315_pos.class_dict,
axlabel='log10(Intensity)')
Out[14]:
In [15]:
# Run scipy's Mann-Whitney U test between the two classes through
# explore.two_group_stat; later cells read element [1] of each result as the
# p-value.
mw_vals = explore.two_group_stat(
    zero_filled,
    new_class_dict,
    stats.mannwhitneyu,
)
In [16]:
# Collect the p-value (second element) of every test result into an array.
mw_pvals = np.array([result[1] for result in mw_vals])

# Histogram of the doubled p-values.
# NOTE(review): the factor 2 presumably turns a one-sided p-value into a
# two-sided one -- confirm against the installed SciPy's mannwhitneyu default.
doubled_pvals = mw_pvals * 2
sns.distplot(doubled_pvals, bins=50, kde=False)
Out[16]:
In [18]:
# Histogram the Mann-Whitney p-values separately for groups of features that
# have similar standard deviation.
ngroups = 10
n_cols = 3

# Order the features by standard deviation and cut them into `ngroups`
# consecutive chunks.
a = np.array_split(zero_filled.std(axis=0).sort_values(), ngroups)

# Integer grid size: enough rows of `n_cols` panels to hold every group.
# float() keeps the division exact under Python 2 as well.
n_rows = int(np.ceil(ngroups / float(n_cols)))
print(n_rows)  # print() form works on both Python 2 and Python 3

# squeeze=False keeps `axes` two-dimensional even when the grid has a single
# row or column, so axes[row, col] below is always valid.
fig, axes = plt.subplots(n_rows, n_cols,
                         sharex=True, sharey=True, squeeze=False)
for i, arr in enumerate(a):
    # Test results for the features that fall in this chunk.
    stratified_mw = mw_vals[arr.index]
    pvals = [val[1] for val in stratified_mw]
    # Fill the grid row by row.
    row, col = divmod(i, n_cols)
    ax = sns.distplot(pvals, bins=50, kde=False, ax=axes[row, col])
    ax.set_title(i + 1)
In [55]:
# Rebind `mw_pvals` to the second element (the p-value) of every entry of
# `mw_vals`, this time via .apply so the result keeps mw_vals' index.
mw_pvals = mw_vals.apply(lambda x: x[1])
In [54]:
# Stratified p-value plot from the project helper: features are described by
# their standard deviation, the doubled Mann-Whitney p-values are plotted,
# and nine groups are requested.
feature_std = zero_filled.std(axis=0)
axes = explore.plot_pvals_stratified(
    zero_filled,
    feature_std,
    mw_pvals * 2,
    'MW pval',
    ngroups=9,
)
In [42]:
# Walk over the two classes and select the rows of each one.
# .items() is used instead of the Python-2-only .iteritems() so the loop runs
# on both Python 2 and Python 3.
for class_label, samples in new_class_dict.items():
    # split by class and plot
    # mean intensity dist,
    # intensity-std distribution
    #
    # TODO: the per-class plots are not implemented yet; the selection below
    # is evaluated and discarded.
    zero_filled.loc[samples]
In [45]:
# Mean of every column of the filled table.
zero_filled.mean(axis=0)
Out[45]:
In [36]:
# Per-feature covariates reported alongside the intensities: m/z and
# retention time, each with its min/max.
# The columns are selected directly with .loc[:, ...]. The previous
# transpose -> row-select -> transpose round trip copied the whole table
# twice and could upcast the result to object dtype when the table mixes
# column types.
covariate_cols = ['mz', 'rt', 'mzmin', 'mzmax', 'rtmin', 'rtmax']
covariates = mtbls315_pos.all_data.loc[:, covariate_cols]
covariates
Out[36]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [41]:
# Fit a two-component PCA on the filled table and keep the projected
# coordinates; later cells read column 0 and column 1 of each row of
# `pca_out`.
pca = PCA(n_components=2)
pca_out = pca.fit_transform(zero_filled)
In [125]:
# Scatter the samples on the first two principal components, coloured by
# their encoded class.
#
# The labels are encoded here so the cell runs on a fresh kernel; it used to
# rely on `y_coded` being left behind by a cell further down the notebook.
# NOTE(review): assumes the rows of `sample_classes` are in the same order as
# the rows of `zero_filled` that were fed to the PCA -- confirm.
le = preprocessing.LabelEncoder()
y_coded = le.fit_transform(
    mtbls315_pos.sample_classes[mtbls315_pos.class_label_col])

# First and second component scores for every sample.
x = pca_out[:, 0]
y = pca_out[:, 1]

# `ax` actually holds the collection returned by plt.scatter (name kept for
# compatibility with the rest of the notebook).
ax = plt.scatter(x, y, c=y_coded)
In [90]:
# Encode the per-sample class labels as integers.
# `le` was never created anywhere in the notebook, so this cell raised a
# NameError on a fresh kernel; instantiate the encoder explicitly.
le = preprocessing.LabelEncoder()
y_coded = le.fit_transform(
    mtbls315_pos.sample_classes[mtbls315_pos.class_label_col])
In [91]:
# Map the integer codes back through the encoder to display the labels they
# stand for.
le.inverse_transform(y_coded)
Out[91]:
In [88]:
# Display `y`, the second-component PCA coordinate of every sample (set in
# the scatter-plot cell).
y
Out[88]:
In [ ]: