Here's a template of the kind of exploration, preprocessing, model construction, and evaluation I want to do with each project:
Distributions (broken down by class)
Covariates
Statistics
Unsupervised viz (visualize by class)
In [73]:
import subprocess
# install the mwtab dependency ('!pip install mwtab' would also work in a notebook)
subprocess.call('pip install mwtab', shell=True)
%load_ext autoreload
import src.project_fxns.organize_xcms as xcms_fxns
import src.data.preprocessing as preproc
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import src.data.data_exploration as explore
import pandas as pd
import scipy.stats as stats
# scikit
from sklearn.decomposition import PCA
from sklearn import preprocessing
%matplotlib inline
%autoreload 2
In [2]:
mtbls315_pos = xcms_fxns.Xcms_organize(
'/home/data/processed/MTBLS315/uhplc_pos/xcms_result.tsv'
)
mtbls315_pos.remove_column_prefix(prefix='X')
# get class to samplename mapping
class_col = 'Factor Value[patient group]'
mtbls315_pos.mtbls_class_mapping(
'/home/data/raw/MTBLS315/a_UPLC_POS_nmfi_and_bsi_diagnosis.txt',
'/home/data/raw/MTBLS315/s_NMFI and BSI diagnosis.txt',
class_col)
# encode classes as numeric
mtbls315_pos.class_encoder()
In [3]:
# Samples come from 3 classes; here I collapse them to 2: malaria vs. non-malarial fever
mtbls315_pos.class_dict
no_mal = np.concatenate([mtbls315_pos.class_dict['bacterial bloodstream infection'],
mtbls315_pos.class_dict['non-malarial febrile illness']])
mal = mtbls315_pos.class_dict['malaria']
new_class_dict = {'malaria': mal, 'non-malarial fever': no_mal}
In [247]:
num_feats = mtbls315_pos.feature_table.shape[0] * mtbls315_pos.feature_table.shape[1]
zeros = (mtbls315_pos.feature_table < 1e-15).sum().sum()
print('Original number of zeros: %s (%.2f%%)' % (zeros, 100 * float(zeros) / num_feats))
print('Original number of NaNs: %s' % mtbls315_pos.feature_table.isnull().sum().sum())
zero_filled = explore.fill_zero_nan(mtbls315_pos.feature_table)
print('Zero-filled number of zeros: %s' % (zero_filled < 1e-15).sum().sum())
print('Zero-filled number of NaNs: %s' % zero_filled.isnull().sum().sum())
print('Zero-filled min: %s' % zero_filled.min().min())
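explore.fill_zero_nan is project code, so its implementation isn't shown here. A common metabolomics convention it may follow is half-minimum imputation: treat near-zero intensities as missing, then fill each feature with half of its smallest observed value. A minimal sketch of that idea (the function name and behavior are assumptions, not the project's actual code):
In [ ]:
# Hypothetical stand-in for explore.fill_zero_nan: half-minimum imputation
# for left-censored intensities.
def fill_zero_nan_sketch(feature_table):
    # treat near-zero values as missing, matching the < 1e-15 checks above
    filled = feature_table.mask(feature_table < 1e-15)
    # fill each feature's missing values with half its smallest observed value
    return filled.fillna(filled.min(axis=0) / 2.0)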
In [14]:
# Sample Intensity distributions
tidy = explore.tidy(zero_filled)
tidy['value'] = np.log10(tidy['value'])
tidy
In [55]:
axes = explore.sample_feature_intensity(tidy,
new_class_dict, ylabel='log10(Intensity)',
xlabel='Samples')
In [70]:
# Mean intensity per feature
axes = explore.distplot_classes(np.log10(zero_filled),
class_dict=new_class_dict, fxn=np.mean,
axlabel='log10(Mean Intensity)', bins=100,
kde=False)
plt.show()
# Mean intensity per feature per class
df_mean = pd.DataFrame({k: np.mean(zero_filled.loc[v], axis=0)
                        for k, v in new_class_dict.items()})
sns.boxplot(data=np.log10(df_mean))
In [71]:
explore.distplot_classes(
np.log10(zero_filled), class_dict=new_class_dict,
fxn=np.var, axlabel='$log_{10}$(Variance)', bins=50, kde=False)
plt.show()
df_var = pd.DataFrame({k: np.var(zero_filled.loc[v], axis=0)
                       for k, v in new_class_dict.items()})
sns.boxplot(data=np.log10(df_var))
plt.show()
In [10]:
# Sparsity per sample
sample_sparsity = ((mtbls315_pos.feature_table < 1e-15).sum(axis=1)
/ mtbls315_pos.feature_table.shape[1])
ax = explore.distplot_classes(sample_sparsity, new_class_dict,
kde=False, bins=10)
for graph in ax:
    graph.set_title('Feature sparsity per sample')
    graph.set_xlabel('Sparsity ratio')
In [39]:
# Distribution of per-feature sparsity, by class
for k, v in new_class_dict.items():
    feature_sparsity = ((mtbls315_pos.feature_table.loc[v] < 1e-15).sum(axis=0)
                        / mtbls315_pos.feature_table.loc[v].shape[0])
    ax = explore.distplot_classes(feature_sparsity, kde=False)
    ax.set_title('Class: %s' % k)
    plt.show()
In [122]:
mw_output = explore.two_group_stat(zero_filled, new_class_dict,
stats.mannwhitneyu, alternative='two-sided')
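explore.two_group_stat is also project code; presumably it runs the supplied two-sample test feature-by-feature, one class against the other. A sketch of that pattern (a hypothetical helper, assuming class_dict maps exactly two class names to sample-index arrays, as above):
In [ ]:
# Hypothetical equivalent of explore.two_group_stat: apply a two-sample test
# (e.g. stats.mannwhitneyu) to every feature and return one result
# (statistic, pvalue) per column.
def two_group_stat_sketch(feature_table, class_dict, stat_fxn, **kwargs):
    group_a, group_b = [feature_table.loc[idx] for idx in class_dict.values()]
    return pd.Series({feat: stat_fxn(group_a[feat], group_b[feat], **kwargs)
                      for feat in feature_table.columns})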
In [123]:
# p-value distribution (already two-tailed via alternative='two-sided')
mw_pvals = np.array([i[1] for i in mw_output])
sns.distplot(mw_pvals, bins=50, kde=False,
axlabel='Raw Mann-Whitney pval')
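Under the global null, the raw p-values would be uniform on [0, 1]; an excess near zero suggests real signal. A quick Kolmogorov-Smirnov check against the uniform distribution:
In [ ]:
# KS test of the raw Mann-Whitney p-values against Uniform(0, 1); a small
# p-value here only says the histogram departs from flat.
stats.kstest(mw_pvals, 'uniform')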
In [260]:
g = sns.jointplot(np.log10(np.mean(zero_filled, axis=0)),
np.log10(np.var(zero_filled, axis=0)),
s=3, alpha=0.3)
g.set_axis_labels('$log_{10}$(Mean Feature Intensity)',
                  '$log_{10}$(Feature Variance)')
In [241]:
# Stratify by covariates
covariates = mtbls315_pos.all_data.T.loc[['mz', 'rt',
'mzmin', 'mzmax', 'rtmin', 'rtmax']].T
# Get just the pvalue as a series
mw_pval_series = mw_output.apply(lambda x: x[1])
In [264]:
def pval_strat_hist_and_scatter(covar, pvals, covar_name='Covariate',
                                **kwargs):
    # ('pvals' rather than 'stats', to avoid shadowing scipy.stats)
    # First, p-value histograms stratified by the covariate
    axes = explore.plot_pvals_stratified(covar, pvals, covar_name,
                                         **kwargs)
    plt.show()
    # Next, a scatter plot against the covariate's actual values
    df = pd.concat([covar, pvals], axis=1, keys=[covar_name, 'pval'])
    g = sns.jointplot(df[covar_name], -np.log10(df['pval']),
                      s=10, alpha=0.3)
    g.set_axis_labels(covar_name, '$-log_{10}$(pval)')
    plt.show()
    # Finally, a scatter plot against the covariate's rank
    g = sns.jointplot(df[covar_name].rank(), -np.log10(df['pval']),
                      dropna=False, s=10, alpha=0.3)
    g.set_axis_labels('Rank %s' % covar_name, '$-log_{10}$(pval)')
    plt.show()
In [265]:
# Variance
pval_strat_hist_and_scatter(np.log10(np.var(zero_filled, axis=0)),
mw_pval_series,
covar_name='$log_{10}$(Variance)',
ngroups=8,
title_fxn=None)
Looks like feature variance is:
(See the IHW vignette, section 3.4.2, for more: https://www.bioconductor.org/packages/release/bioc/vignettes/IHW/inst/doc/introduction_to_ihw.html)
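For intuition: IHW gains power by spending the FDR budget unevenly across covariate strata. IHW itself is an R/Bioconductor package; as a rough Python stand-in for the stratification idea only (not equivalent to IHW, which learns per-stratum weights by cross-validation), one can run Benjamini-Hochberg within covariate bins. The function below is a hypothetical sketch using statsmodels:
In [ ]:
from statsmodels.stats.multitest import multipletests

# Illustration only: BH applied within covariate octiles rather than once
# globally; IHW proper learns optimal per-stratum weights.
def stratified_bh(pvals, covariate, ngroups=8, alpha=0.05):
    strata = np.asarray(pd.qcut(covariate, ngroups, labels=False,
                                duplicates='drop'))
    reject = pd.Series(False, index=pvals.index)
    for s in np.unique(strata):
        mask = strata == s
        reject[mask] = multipletests(pvals[mask], alpha=alpha,
                                     method='fdr_bh')[0]
    return reject

# e.g., with feature variance as the covariate:
# stratified_bh(mw_pval_series, np.var(zero_filled, axis=0)).sum()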
In [266]:
# Mean Intensity
pval_strat_hist_and_scatter(
np.log10(np.mean(zero_filled, axis=0)),
mw_pval_series,
covar_name='Mean Intensity',
ngroups=8,
title_fxn=None
)
Looks like mean feature intensity is:
But this is odd, because variance and mean intensity are so tightly correlated.
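To put a number on how tight that coupling is (Pearson correlation on the log scale; if log-mean and log-variance are nearly collinear, stratifying p-values by one largely duplicates stratifying by the other):
In [ ]:
r, p = stats.pearsonr(np.log10(np.mean(zero_filled, axis=0)),
                      np.log10(np.var(zero_filled, axis=0)))
print('Pearson r (log-mean vs. log-variance): %.3f, p = %.2e' % (r, p))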
In [269]:
# Feature sparsity
feature_sparsity = ((mtbls315_pos.feature_table < 1e-15).sum(axis=0)
/ mtbls315_pos.feature_table.shape[0])
pval_strat_hist_and_scatter(
feature_sparsity,
mw_pval_series,
covar_name='Sparsity',
ngroups=8,
title_fxn=np.log10
)
Looks like feature sparsity is:
In [272]:
pval_strat_hist_and_scatter(covariates['rt'], mw_pval_series,
covar_name='RT', ngroups=8,
title_fxn=None)
Looks like RT is:
In [273]:
pval_strat_hist_and_scatter(covariates['mz'],
mw_pval_series,
covar_name='mz',
ngroups=8)
Looks like mz is:
In [276]:
rt_width = covariates['rtmax'] - covariates['rtmin']
pval_strat_hist_and_scatter(rt_width,
mw_pval_series,
covar_name='rt width',
ngroups=9,
title_fxn=None)
Looks like RT width is:
In [282]:
# Check out mz-width
mz_wid = covariates['mzmax'] - covariates['mzmin']
pval_strat_hist_and_scatter(mz_wid,
mw_pval_series,
covar_name='mz-width',
ngroups=6,
title_fxn=None
)
Looks like mz width is:
In [150]:
# Get class for each sample
new_class_labels = xcms_fxns.encode_new_classes(new_class_dict)
# Make sure sample orders match
assert (new_class_labels.index == zero_filled.index).all()
le = preprocessing.LabelEncoder()
y = le.fit_transform(new_class_labels.values)
X = zero_filled.values
pca = PCA(n_components=2, whiten=True)
pca_out = pca.fit_transform(X)
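Worth checking how much variance those two components actually capture; explained_variance_ratio_ is the standard scikit-learn attribute:
In [ ]:
# fraction of total variance captured by each component
print('Explained variance ratio:', pca.explained_variance_ratio_)
print('Total: %.1f%%' % (100 * pca.explained_variance_ratio_.sum()))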
In [165]:
# sanity check: number of class-0 samples in the PC2 scores
pca_out[y == 0, 1].shape
In [159]:
# which class names map to the encoded labels 0 and 1?
le.inverse_transform([0, 1])
In [175]:
le.inverse_transform(np.unique(y))
In [386]:
def plot_PCA(data, class_labels):
    '''Scatter plot of the first two principal components, colored by class.

    data - samples x features (DataFrame or array)
    class_labels - pd.Series of string or numeric labels; if data carries
        an index, it should match class_labels' index
    '''
    # If the indices don't match, reindex and retry
    if hasattr(data, 'index') and not (data.index == class_labels.index).all():
        data = data.sort_index()
        class_labels = class_labels.sort_index()
        assert (data.index == class_labels.index).all(), \
            "The indices in class_labels and data don't match"
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(class_labels.values)
    pca = PCA(n_components=2, whiten=False)
    pca_out = pca.fit_transform(data)
    colors = ['darkorange', 'turquoise']
    encoded_classes = np.unique(y)
    target_names = le.inverse_transform(encoded_classes)
    for color, i, target_name in zip(colors, encoded_classes, target_names):
        plt.scatter(pca_out[y == i, 0], pca_out[y == i, 1],
                    color=color, alpha=0.7, label=target_name, s=20)
    plt.legend(loc='best', scatterpoints=1)
    plt.title('PCA')
    return pca_out
pca_out = plot_PCA(zero_filled, new_class_labels)
In [265]:
X_scaled = preprocessing.scale(zero_filled, axis=0)
In [266]:
pca_out = plot_PCA(X_scaled, new_class_labels)
In [359]:
# try t-sne
from sklearn import manifold
from sklearn import feature_selection
In [404]:
# Preprocessing: variance filter, then standardize
assert (new_class_labels.index == zero_filled.index).all()
X = zero_filled
x_var = np.var(X, axis=0)
sort_idx = x_var.argsort()
num_splits = 8
splits = np.array_split(sort_idx, num_splits)
# Threshold at the max variance of the second-highest octile, i.e. keep
# only the highest-variance ~1/8 of features
var_thresh = x_var[splits[num_splits - 2]].max()
print('Variance Threshold', var_thresh)
selector = feature_selection.VarianceThreshold(var_thresh)
X = selector.fit_transform(X)
print('X shape', X.shape)
X = preprocessing.scale(X)
print('X shape', X.shape)
In [406]:
for perplex in [2, 5, 10, 20, 30, 40, 50, 75, 100, 200]:
    tsne = manifold.TSNE(n_components=2, perplexity=perplex,
                         init='random', random_state=1)
    Y = tsne.fit_transform(X)
    # y and le come from In [150]
    colors = ['navy', 'turquoise']
    classes = le.inverse_transform(np.unique(y))
    for color, i, name in zip(colors, np.unique(y), classes):
        plt.scatter(Y[y == i, 0], Y[y == i, 1], color=color, label=name)
    plt.legend(bbox_to_anchor=(1.01, 1.01))
    plt.title('Perplexity: %s' % perplex)
    plt.show()
In [407]:
pca_out = plot_PCA(X, new_class_labels)
X.shape