In [2]:
import time
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import cross_val_score
#from sklearn.model_selection import StratifiedShuffleSplit
#from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.utils import shuffle
from scipy import interp
import scipy.stats as stats
import pickle
# My libraries
import data.preprocessing as preproc
import project_fxns.rt_window_prediction as rtwin
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [3]:
# Small sanity-check table for preproc.prevalence_threshold:
# three samples x four features, with varying numbers of zero entries.
toy_values = [[1.0, 2.0, 3.0, 0.0],
              [0.0, 0.0, 0.0, 0.0],
              [0.5, 1.0, 0.0, 0.0]]
toy = pd.DataFrame(toy_values,
                   columns=['1', '2', '3', '4'],
                   index=['sample_%d' % n for n in (1, 2, 3)])
preproc.prevalence_threshold(toy)
Out[3]:
In [4]:
### Subdivide the data into a feature table
local_path = '/home/irockafe/Dropbox (MIT)/Alm_Lab/'\
'projects'
data_path = local_path + '/revo_healthcare/data/processed/MTBLS72/positive_mode/'\
'mtbls_no_retcor_bw2.csv'
## Import the data and remove extraneous columns
df = pd.read_csv(data_path, index_col=0)
# Replace X's at the beginning of sample names
new_idx = [i.replace('X', '') for i in df.columns]
df.columns = new_idx
print df.columns
df.shape
df.head()
# Make a new index of mz:rt
mz = df.loc[:,"mz"].astype('str')
rt = df.loc[:,"rt"].astype('str')
idx = mz+':'+rt
df.index = idx
# separate samples from xcms/camera things to make feature table
not_samples = ['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax',
'npeaks', 'positive_mode',
]
samples_list = df.columns.difference(not_samples)
mz_rt_df = df[not_samples]
# Convert to (samples x features) format - our standard
X_df_raw = df[samples_list].T
new_idx = [i.replace('X', '') for i in X_df_raw.index]
X_df_raw.index = new_idx
In [5]:
# get mapping between sample name and sample class
path_sample_class_map = (local_path +
'/revo_healthcare/data/raw/MTBLS72/s_Plasma_AD_Lipidomics.txt')
class_column = 'Factor Value[Cognitive Status]'
class_df = pd.read_csv(path_sample_class_map,
sep='\t')
# Set index as sample name
class_df.set_index('Sample Name', inplace=True)
class_df = class_df[class_column]
# select only positive values from positive-ion mode
class_df = class_df[class_df.index.str.contains('POS')]
print class_df.head(10)
print "Class label shape: ", class_df.shape
print "feature table shape: ", X_df_raw.shape
#class_df.rename('class', inplace=True)
print class_df.head()
print '\n\nClass labels: ', pd.unique(class_df.values)
# Get case and control dataframes
case_labels = class_df[class_df=='aMCI/AD'].index
control_labels = class_df[class_df == 'Normal Control'].index
case = X_df_raw.loc[case_labels]
control = X_df_raw.loc[control_labels]
In [15]:
# Match between feature table and metadata and assert that they're in the same order.
# then define the numpy-arrays for X and y
class_df = class_df[X_df_raw.index].sort_index()
X_df_raw = X_df_raw.sort_index()
print 'Class values:', class_df.unique()
assert (class_df.index == X_df_raw.index).all()
# convert classes to numbers
le = preprocessing.LabelEncoder()
le.fit(class_df)
y = le.transform(class_df)
print y
print "class-labels: ", le.classes_
In [28]:
# fill nan values with 1/2 the minimum from each sample
fill_val = X_df_raw.min(axis=1) / 2.0
# Only keep features present in at least 70% of samples
# Note that this means 70% total, not 70% in a particular class
X_df_filtered = preproc.prevalence_threshold(X_df_raw,
threshold=0.7)
print 'Raw shape', X_df_raw.shape
print 'Filtered at 70% prevalence', X_df_filtered.shape
# must transpose, b/c fillna only operates along columns
X_df_filled = X_df_filtered.fillna(value=fill_val, )
#X_pqn_df_raw = preproc.correct_dilution_factor(X_df_raw, plot=True)
X_pqn_df_filled = preproc.correct_dilution_factor(X_df_filled, plot=True)
X_pqn_df_filled_log = np.log10(X_pqn_df_filled)
In [30]:
# Do mann-whitney on case vs control
def mw_pval_dist(case, control):
'''
case - dataframe containing case
control - dataframe with control samples
All should have same features (columns)
'''
# get parametric pvals
mann_whitney_vals = pd.DataFrame(np.full([case.shape[1],2], np.nan),
index=case.columns, columns= ['u', 'pval'])
for idx, case_vals in case.iteritems():
control_vals = control[idx]
u, pval = stats.mannwhitneyu(case_vals, control_vals)
mann_whitney_vals.loc[idx, 'u'] = u
mann_whitney_vals.loc[idx, 'pval'] = pval
# plot mw pval distribution
mann_whitney_vals.hist('pval')
plt.title('mann-whitney pval between case and control')
plt.show()
# plot distribution of mean intensities
case_mean = case.mean(axis=0)
ctrl_mean = control.mean(axis=0)
sns.distplot(np.log10(case_mean), label='case')
sns.distplot(np.log10(ctrl_mean), label='control')
plt.xlabel('log_10 intensity')
plt.title('Mean intensity of case vs. control')
plt.legend()
plt.show()
u, pval = stats.mannwhitneyu(case_mean, ctrl_mean)
print 'pval (MannW) of intensities between case and control: ', pval
#print('Raw intensities\n\n')
#mw_pval_dist(X_df_raw.loc[case_labels], X_df_raw.loc[control_labels])
#print('*'*50+'\n Raw pqn_normalized')
#mw_pval_dist(X_pqn_df_raw.loc[case_labels], X_pqn_df_raw.loc[control_labels])

# Run the case-vs-control comparison at each preprocessing stage
stages = [('NaN filled with 1/2 min', X_df_filled),
          (' NaN filled with 1/2 min, pqn normalized', X_pqn_df_filled),
          (' NaN filled with 1/2 min, pqn normalized, log10 transformed',
           X_pqn_df_filled_log)]
for stage_label, feature_table in stages:
    print('*' * 50 + '\n' + stage_label)
    mw_pval_dist(feature_table.loc[case_labels],
                 feature_table.loc[control_labels])
Note that the reported p-values rely on scipy's normal approximation to the Mann-Whitney U distribution — I didn't compute them via a permutation test, out of laziness at the current moment.
In [145]:
# Re-attach the mz/rt metadata columns to the normalized feature table and
# reshape back to (feats x samples)
X_pqn_df_filled_mzrt = pd.concat([df[not_samples].T, X_pqn_df_filled],
                                 axis=0).T

# Build a sliding retention-time window: width = 1/5 of the rt range,
# stepping by half a window (50% overlap)
rt_lo = 0
rt_hi = df['rt'].max()
window_width = rt_hi / 5
window_step = window_width / 2
sliding_window = rtwin.make_sliding_window(rt_lo, rt_hi,
                                           window_width, window_step)

# Where the per-window plots get written
plot_subdir = ('/revo_healthcare/presentations/isaac_bats/' +
               'rt_window_plots/MTBLS72/')
output_path = local_path + plot_subdir

# Resampling / random-forest settings for the per-window classifier
n_iter = 50
test_size = 0.3
rf_trees = 1000
# Run rt-sliding-window classifier
rtwin.sliding_rt_window_aucs(X_pqn_df_filled_mzrt, y, sliding_window,
                             not_samples, rf_trees=rf_trees, n_iter=n_iter,
                             test_size=test_size, output_path=output_path)
Out[145]:
In [150]:
# Load the cached per-window AUCs and plot them against retention time.
# BUGFIX: pickle.load(open(...)) leaked the file handle; use a context
# manager so the file is always closed.
# NOTE(review): pickle.load is only safe here because we wrote this file
# ourselves in the previous cell -- never unpickle untrusted data.
with open(output_path + '/auc_vals.pkl', 'rb') as pkl_file:
    auc_vals = pickle.load(pkl_file)
fig_path = output_path + 'auc_vs_rt.pdf'
rtwin.plot_auc_vs_rt(auc_vals, sliding_window, df, fig_path)
That would be a shocking finding — that plasma metabolites differ this dramatically between people about to experience cognitive impairment and those who aren't.
I need to re-process the data and ask the authors for their processing parameters.
But this result still feels fishy to me, and it reads as a cautionary note for my analyses.