You'll have to... mean center and standardize all your features?
In [540]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from IPython.display import display
import pickle
# My code
import data.preprocessing as preproc
import util.combine_by_mz as combine_mz
import util.rt_window_prediction as rtwin
In [260]:
### import two datasets
def reindex_xcms_by_mzrt(df):
df.index = (df.loc[:,'mz'].astype('str') +
':' + df.loc[:, 'rt'].astype('str'))
return df
local_path = ('/home/irockafe/Dropbox (MIT)/Alm_Lab/' +
'projects/')
# malaria
malaria_path = (local_path + '/revo_healthcare/data/processed/MTBLS315/'+
'uhplc_pos/xcms_result_4.csv')
df_malaria_raw = pd.read_csv(malaria_path, index_col=0)
# convert column names to remove X added to them
new_idx = [i.replace('X', '') for i in df_malaria_raw.columns]
df_malaria_raw.columns = new_idx
print df_malaria_raw.columns
# replace 0 values with nans, so it's easier to replace them later
df_malaria_raw = df_malaria_raw.replace(to_replace=0.0 , value=np.nan, )
# Make a new index of mz:rt
mz = df_malaria_raw.loc[:,"mz"].astype('str')
rt = df_malaria_raw.loc[:,"rt"].astype('str')
idx = mz+':'+rt
df_malaria_raw.index = idx
# separate samples from xcms/camera things to make feature table
not_samples = ['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax',
'npeaks', 'uhplc_pos',
]
samples_list = df_malaria_raw.columns.difference(not_samples)
mz_rt_df = df_malaria_raw[not_samples]
# convert to samples x features
X_df_malaria_raw = df_malaria_raw[samples_list].T
print "original shape: %s \n# nans: %f\n" % (X_df_malaria_raw.shape, X_df_malaria_raw.isnull().sum().sum())
In [261]:
# Get mapping between sample name and assay names
path_sample_name_map = (local_path + 'revo_healthcare/data/raw/' +
'MTBLS315/metadata/a_UPLC_POS_nmfi_and_bsi_diagnosis.txt')
# Sample name to Assay name
# Index is the sample name
# value we want is the Assay name
sample_df = pd.read_csv(path_sample_name_map,
sep='\t', index_col=0)
sample_df = sample_df['MS Assay Name']
sample_df.shape
# sample name to sample class
path_sample_class_map = (local_path + 'revo_healthcare/data/raw/' +
'MTBLS315/metadata/s_NMFI and BSI diagnosis.txt')
class_df = pd.read_csv(path_sample_class_map,
sep='\t')
class_df.set_index('Sample Name', inplace=True)
class_df = class_df['Factor Value[patient group]']
# Combine sample > assay and sample > class into one
class_map_df = pd.concat([sample_df, class_df], axis=1)
class_map_df.rename(columns={'Factor Value[patient group]': 'class'}, inplace=True)
# convert all non-malarial classes into a single classes
# (collapse non-malarial febril illness and bacteremia together)
binary_class_map = class_map_df.replace(to_replace=['non-malarial febrile illness', 'bacterial bloodstream infection' ],
value='non-malarial fever')
print "binary class map:\n", binary_class_map.head()
# Get case and control samples in their own dataframes
case_str = 'malaria'
control_str = 'non-malarial fever'
# get that assay names based on class
case_labels = binary_class_map[binary_class_map['class'] == case_str]['MS Assay Name']
control_labels = binary_class_map[binary_class_map['class'] == control_str]['MS Assay Name']
# select the assay names from X_df_raw based on class
case = X_df_malaria_raw.loc[case_labels]
control = X_df_malaria_raw.loc[control_labels]
print 'case shape: ', case.shape
print 'control shape: ', control.shape
In [262]:
# convert classes to numbers
le = preprocessing.LabelEncoder()
le.fit(binary_class_map['class'])
assert(binary_class_map['MS Assay Name'] == X_df_malaria_raw.index).all()
y_malaria = le.transform(binary_class_map['class'])
y_malaria
Out[262]:
In [12]:
# Get p. vivax data
project_path = ('/revo_healthcare/data/processed/' +
'ST000578/ST000578_AN000888_Results.tsv')
p_vivax_path = local_path + project_path
df_p_vivax = pd.read_csv(p_vivax_path, sep='\t')
In [13]:
## Parse the class-labels from output
outcome = df_p_vivax.iloc[0,:]
print outcome.unique()
# convert all P. vivax into just P.vivax
susceptible_triplicate = outcome[outcome.str.contains('Current Malaria Infection:P.Vivax') &
outcome.str.contains('Chloroquine Resistance:Susceptible')]
resistant_triplicate = outcome[outcome.str.contains('Current Malaria Infection:P.Vivax') &
outcome.str.contains('Chloroquine Resistance:Resistant')]
control_triplicate = outcome[outcome.str.contains('Current Malaria Infection:None') ]
print '\n\nSusceptible\n', susceptible_triplicate.shape[0] / 3
print '\n\nResistant\n', resistant_triplicate.shape[0] / 3
print '\n\nControl\n', control_triplicate.shape[0] /3
# Select one of the three triplicate samples ( the one that doesn't have a period in the name)
resistant = resistant_triplicate[~resistant_triplicate.index.str.contains('\.')]
susceptible = susceptible_triplicate[~susceptible_triplicate.index.str.contains('\.')]
control = control_triplicate[~control_triplicate.index.str.contains('\.')]
# Relabel so that only three classes
resistant[:] = 'Chloroquine resistant'
susceptible[:] = 'Chloroquine susceptible'
control[:] = 'Control'
class_labels = pd.concat([resistant, susceptible, control])
print "how many samples?\n", class_labels.shape
print "What are the classes?\n", class_labels.unique()
In [16]:
# Grab samples that have correct class labels
# get rid of the class-label row
df_p_vivax_raw = df_p_vivax.iloc[1:, :]
# index based on the 'sample' row
df_p_vivax_raw = df_p_vivax_raw.set_index(df_p_vivax_raw['Samples'])
# Keep only samples and convert to (samples x features)
df_p_vivax_raw = df_p_vivax_raw[class_labels.index].T
display(df_p_vivax_raw.head())
# remove first column and convert to float
print "df_p_vivax_raw shape", df_p_vivax_raw.shape
print "class labels", class_labels.shape
# Make sure labels and df_p_vivax_raw-columns are in matching order
print "quick eyeball that y and X are in same order ", zip(df_p_vivax_raw.columns, class_labels.index)[0:5]
assert (df_p_vivax_raw.index == class_labels.index).all()
display(df_p_vivax_raw.head())
# Conver to binary class labels
print class_labels.unique()
le = preprocessing.LabelEncoder()
le.fit(class_labels)
y_pvivax = le.transform(class_labels)
print y_pvivax
In [5]:
# Get map between feature names and mz, rt
df_p_vivax_raw.columns
mz_rt_list = [i.split('_') for i in df_p_vivax_raw.columns]
feature_map = pd.DataFrame(mz_rt_list, columns=['mz', 'rt'], index=df_p_vivax_raw.columns)
display(feature_map.head())
In [1]:
# Don't normalize by dilution factor - have seen that doesn't help
# do set prevalence threshold
thresh = 0.5
X_df_malaria_filter = preproc.prevalence_threshold(X_df_malaria_raw, thresh)
print "raw shape", X_df_malaria_raw.shape
print "50% filter shape", X_df_malaria_filter.shape
# fill nans with 1/2 min from each sample
fill = X_df_malaria_filter.min(axis=1) / 2.0
X_df_malaria_filter_filled = X_df_malaria_filter.T.fillna(value=fill).T
# sanity check
print "how many nulls?", X_df_malaria_filter.isnull().sum().sum()
print "how many nulls?", X_df_malaria_filter_filled.isnull().sum().sum()
In [264]:
ppm = 30
# add mz and rt vals into dataframe
print df_malaria_raw.shape
mz_malaria = df_malaria_raw.loc[X_df_malaria_filter_filled.columns]['mz']
ppm_mat_malaria = combine_mz.ppm_matrix(mz_malaria, mz_malaria)
In [265]:
# make triangular
test = np.array(np.arange(1,10), dtype=np.float).reshape([3,3])
display(test)
display(np.tril(test, k=-1))
idx = np.tril_indices(test.shape[0])
test[idx] = np.nan
print test
# Convert to upper triangular matrix
idx_ppm = np.tril_indices(ppm_mat_malaria.shape[0])
ppm_mat_malaria[idx_ppm] = np.nan
In [266]:
ppm_mat_malaria
Out[266]:
In [267]:
print '# of overlaps:', (ppm_mat_malaria < ppm).sum()
In [92]:
def group_by_mz(mat, ppm):
'''
Given square matrix from ppm comparison,
group all isomers together
'''
isomers = np.argwhere(mat < ppm)
print np.nonzero(mat < ppm)
# match (a,b), (c,d), no others
toy = np.array([[np.nan]*4,
[0.1, np.nan, np.nan, np.nan],
[10,100, np.nan, np.nan],
[20,200,0.2, np.nan]
])
print toy
group_by_mz(toy, 1)
In [285]:
def group_isomer_indices(indices):
'''
Given a list of isomer pairs, combine all isomers pairs into
a single group - i.e. [[0,1], [1,5]] becomes [0,1,5]
'''
output = []
for idx in indices:
#print '\n\nfinding indices that match ', idx
# Find any matches to the current index in the list of indices
match_idx = np.argwhere([np.in1d(idx, poop).any() for poop in indices])
#print 'Indices that match\n', match_idx
#print 'Do these match %s?: \n %s' % (idx, indices[match_idx])
unique_matches = np.unique(indices[match_idx])
#print 'Unique matches', unique_matches
# Check if any values from unique_matches are present in output
# if not, append
# if so, append to place where they're found
in_output = [np.in1d(unique_matches, poop).any() for poop in output]
#print 'Output\n', output
#print 'Is it in the output?\n', in_output
where_in_output = np.argwhere(in_output)
#print 'Where?\n', where_in_output, 'Size', where_in_output.size, 'Greater than 1?', where_in_output.size
# if found in output, append it to where you found it in the output
if where_in_output.size != 0:
if where_in_output.size > 1:
raise ValueError('You should only find an index one place in the output. Somehting is wrong')
#print 'Append unique vals to entry that overlaps in output'
#print int(where_in_output)
#print output[int(where_in_output)]
output[int(where_in_output)] = np.append(output[int(where_in_output)], unique_matches)
# If not found in an entry of the output, append values found to the output
else:
output.append(unique_matches)
#print 'Output', output
#
output = [np.unique(poop) for poop in output]
#print ('-'*50, 'Final output\n', output)
return output
test = np.array([[0,1], [2,3], [0,4], [4,5], [1,6]])
iso = group_isomer_indices(test)
print test
print iso
In [275]:
ppm = 30
isomer_indices = np.argwhere(ppm_mat_malaria < 30)
isomer_indices.shape
print isomer_indices[0:10]
In [286]:
isomer_groups = group_isomer_indices(isomer_indices)
print isomer_groups
In [290]:
mtbls315_path = '/revo_healthcare/presentations/isaac_bats/rt_window_plots/MTBLS315/'
output = local_path + mtbls315_path + 'isomer_indices.pkl'
pickle.dump(isomer_groups, open(output, 'wb'))
In [483]:
print 'Number of isomer groups:', len(isomer_groups)
print 'Number of features that make up those groups isomers', np.concatenate(isomer_groups).size
group_size = [len(i) for i in isomer_groups]
plt.hist(group_size, bins=range(min(group_size), max(group_size)),
align='left', rwidth=0.7)
plt.title('Most isomer-groups are the combination of 2 features')
plt.xlabel('Number of isomers')
plt.ylabel('Count')
plt.savefig(local_path + mtbls315_path + '/number_of_features_per_isomer_group.pdf',
format='pdf')
plt.show()
In [526]:
def sum_isomer_intensities(df, isomer_groups):
'''
GOAL - Sum the intensity values of isomers
INPUT -
df (samples x features) of intensity values
isomer_groups - np array of arrays
OUTPUT -
dataframe containing the summed values
mapping between name of feature and values that make it up
'''
feature_names = ['isomer_group_%s' % i
for i in range(0,len(isomer_groups))]
output_df = pd.DataFrame(index=df.index, columns=feature_names)
feature_mapping = pd.Series(index=feature_names)
count = 0
for i, group in enumerate(isomer_groups):
summed_val = df.iloc[:,group].sum(axis=1, skipna=True)
# Add to dataframe
output_df['isomer_group_%s' % i] = summed_val
feature_mapping['isomer_group_%s' % i] = np.array(df.iloc[:, group].columns)
#if type(df.iloc[:, group].columns == )
count += len(np.array(df.iloc[:, group].columns))
print 'Final count', count
return output_df, feature_mapping
df_summed_isomers, summed_isomer_features = sum_isomer_intensities(
X_df_malaria_filter_filled, isomer_groups)
# Double check that # of isomers is same in df and group
print 'original size', np.concatenate(isomer_groups).size
print 'other shapes', summed_isomer_features.shape
print 'summed_df shape', df_summed_isomers.shape
In [488]:
# TODO - decide if these need to be normalized...?
sns.distplot(np.log10(summed_isomer_df).mean(axis=0), bins=100,
label='isomer_groups')
sns.distplot(np.log10(X_df_malaria_filter_filled).mean(axis=0), bins=100,
label='original features' )
plt.xlabel('log_10 Inensity')
plt.legend()
plt.title('Mean feature intensity distribution of isomer features vs normal')
plt.show()
plt.close()
In [489]:
# Case and control look about the same
# How does case/control of isomer features look?
case_poop = summed_isomer_df.loc[case_labels]
ctrl_poop = summed_isomer_df.loc[control_labels]
print case_poop.shape
print 'ctrl', ctrl_poop.shape
sns.distplot(np.log10(ctrl_poop).mean(axis=0),
bins=100, label='Control')
sns.distplot(np.log10(case_poop).mean(axis=0), bins=100,
label='Case')
plt.legend()
plt.title('Mean feature intensity distribution of isomer ' +
'features: case vs control')
plt.xlabel('log_10 Intensity')
plt.show()
In [535]:
# Remove features
print summed_isomer_features.values
isomer_idx = np.concatenate(summed_isomer_features.values)
#print summed_isomer_features.values
#print summed_isomer_index.get_level_values(0)
isomers_to_drop = X_df_malaria_filter_filled[isomer_idx]
print 'Shape of original', X_df_malaria_filter_filled.shape
print 'Shape of what to drop, should be 3358', isomers_to_drop.shape
In [539]:
# Drop and add
X_df_malaria_filter_filled_isomers_grouped = pd.concat([
# drop first
X_df_malaria_filter_filled.drop(isomers_to_drop.columns, axis=1)
,
df_summed_isomers], axis=1)
print 'new shape. Should be 4604', X_df_malaria_filter_filled_isomers_grouped.shape
In [567]:
print not_samples
# combine the rt back
X = (pd.concat([X_df_malaria_filter_filled_isomers_grouped,
df_malaria_raw[not_samples].T], axis=0)
.T
.fillna(value=0)
)
min_val = 0
max_val = df_malaria_raw['rt'].max()
width = max_val
step = width
sliding_window = rtwin.make_sliding_window(min_val,
max_val, width, step)
# run classifier on all data
n_iter = 3
test_size = 0.3
rf_trees = 1000
all_aucs = np.full([len(sliding_window), n_iter], np.nan)
path = ('/revo_healthcare/presentations/isaac_bats/' +
'combine_isomers')
output_path = local_path + path
# Run rt-sliding-window classifier
rtwin.sliding_rt_window_aucs(X, y_malaria, sliding_window, not_samples,
rf_trees=rf_trees, n_iter=n_iter, test_size=test_size,
output_path=output_path)
Out[567]: