In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import numpy as np
%matplotlib inline
Start with MTBLS315, the malaria vs fever dataset. Could get ~0.85 AUC for whole dataset.
In [2]:
# Get the data
### Subdivide the data into a feature table
local_path = '/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/'
data_path = local_path + '/revo_healthcare/data/processed/MTBLS315/'\
'uhplc_pos/xcms_camera_results.csv'
## Import the data and remove extraneous columns
df = pd.read_csv(data_path, index_col=0)
df.shape
df.head()
not_samples=['']
Almost everything is below 30sec rt-window
In [3]:
# Show me a scatterplot of m/z rt dots
# distribution along mass-axis and rt axist
plt.scatter(df['mz'], df['rt'], s=1,)
plt.xlabel('mz')
plt.ylabel('rt')
plt.title('mz vs. rt')
plt.show()
In [4]:
# Check out the distribution of rt-windows to choose your rt bin
rt_dist = df['rtmax'] - df['rtmin']
rt_dist.hist(bins=100)
# Choose to use 40 second bins...?
Out[4]:
In [ ]:
In [ ]:
# Divide m/z and rt into windows of certain width
def subdivide_mz_rt(df, rt_bins, mz_bins, samples, not_samples):
'''
GOAL - Subdivide mz/rt space into a grid of defined widths. Sum entrie sof the grid
IPUT - df - dataframe from xcms (rt, mz, patient names as columns)
OUTPUT - Matrix containing summed intensities
NOTE - Do adducts play a large roll here? Uncertain. Could choose your rt-widths based on
rt-distribution
'''
# Define the bins based on separation criteria
# todo this is hacky and will always round down
rt_window = df['rt'].max() / rt_bins
rt_bounds = [(i*rt_window, i*rt_window+rt_window)
for i in range(0, rt_bins)]
mz_window = df['mz'].max() / mz_bins
mz_bounds = [(i*mz_window, i*mz_window+mz_window)
for i in range(0, mz_bins)]
# Tidy up by converting patient labels to column using melt
tidy = pd.melt(df, id_vars=not_samples,
value_vars=samples, var_name='Samples',
value_name='Intensity')
# Go through each sample and sum intensities onto grid
grid_dict = {}
for sample in samples:
# Initiate empty dict with shape mz_windo x rt_window
grid_vals = np.zeros([mz_bins, rt_bins])
df_sample = tidy[tidy['Samples'] == sample]
# use floor, b/c zero is the start of indexes, not 1
# TODO Deal with
y_vals = np.floor(df_sample['rt'].div(
df_sample['rt'].max()+1e-9)*rt_bins
).astype(int)
x_vals = np.floor(df_sample['mz'].div(
df_sample['mz'].max()+1e-9)*mz_bins
).astype(int)
df_sample['rt_bin'] = x_vals
df_sample['mz_bin'] = y_vals
# Go through each row and add intensity to the correct
# grid-entry
for idx, row in df_sample.iterrows():
grid_vals[row['rt_bin'], row['mz_bin']] += row['Intensity']
# Add to grid dict
grid_dict[sample] = grid_vals
ax = sns.heatmap(grid_vals)
ax.set_xlabel('mz')
ax.set_ylabel('rt')
ax.invert_yaxis()
plt.title(sample)
plt.show()
# Create matrix to add intensities to
# Get the number of bins by finding hte max values of m/z and
return grid_dict
not_samples = ['mz', 'mzmin', 'mzmax', 'rt', 'rtmin', 'rtmax',
'adduct', 'isotopes', 'npeaks', 'pcgroup', 'uhplc_pos']
samples = df.columns.difference(not_samples)
print 'max:', df['mz'].max()
rt_divisor = 50 # base this on distribution of rt-widths..?
mz_bins = 5
rt_bins = 5
print mz_bins
print rt_bins
grid_dict = subdivide_mz_rt(df, mz_bins, rt_bins, samples, not_samples)
In [92]:
# Make shit tidy
test_df = pd.DataFrame({'mz': [10,20,30,100], 'rt':[100,200,300,1000],
'A': [1,2,3,4], 'B': [10,20,30,40],
'C': [5,15,25,35]})
print 'Original: \n', test_df
tidy_df = pd.melt(test_df, id_vars=['mz', 'rt'], value_vars=['A', 'B', 'C'],
var_name='Subject', value_name='Intensity')
print '\n\n Tidy:\n', tidy_df
feature_table_df = pd.pivot_table(tidy_df, index=['mz', 'rt'], values='Intensity',
columns='Subject')
print '\n\n Unpivoted, step 1:\n',feature_table_df
feature_table_df.reset_index(inplace=True)
print feature_table_df
abc = tidy_df[tidy_df['Subject'] == 'A']
In [120]:
a = np.zeros([2,6])
a[0.0,2.0] += 3
a[0,2]+=7
a
In [84]:
# divide feature table into slices of retention time
def get_rt_slice(df, rt_bounds):
'''
PURPOSE:
Given a tidy feature table with 'mz' and 'rt' column headers,
retain only the features whose rt is between rt_left
and rt_right
INPUT:
df - a tidy pandas dataframe with 'mz' and 'rt' column
headers
rt_left, rt_right: the boundaries of your rt_slice, in seconds
'''
out_df = df.loc[ (df['rt'] > rt_bounds[0]) &
(df['rt'] < rt_bounds[1])]
return out_df
def sliding_window_rt(df, rt_width, step=rt_width*0.25):
pass
# get range of values [(0, rt_width)
#get_rt_slice(df, )
rt_min = np.min(df['rt'])
rt_max = np.max(df['rt'])
# define the ranges
left_bound = np.arange(rt_min, rt_max, step)
right_bound = left_bound + rt_width
rt_bounds = zip(left_bound, right_bound)
for rt_slice in rt_bounds:
rt_window = get_rt_slice(df, rt_slice)
#print rt_window.head()
print 'shape', rt_window.shape
raise hee
# TODO Send to ml pipeline here? Or separate function?
print type(np.float64(3.5137499999999999))
a = get_rt_slice(df, (750, 1050))
print 'Original dataframe shape: ', df.shape
print '\n Shape:', a.shape, '\n\n\n\n'
sliding_window_rt(df, 100)
In [16]:
# Convert selected slice to feature table, X, get labels y
Out[16]:
In [88]:
### Subdivide the data into a feature table
local_path = '/home/irockafe/Dropbox (MIT)/Alm_Lab/'\
'projects'
data_path = local_path + '/revo_healthcare/data/processed/MTBLS72/positive_mode/'\
'mtbls_no_retcor_bw2.csv'
## Import the data and remove extraneous columns
df = pd.read_csv(data_path, index_col=0)
df.shape
df.head()
# Make a new index of mz:rt
mz = df.loc[:,"mz"].astype('str')
rt = df.loc[:,"rt"].astype('str')
idx = mz+':'+rt
df.index = idx
df.head()
a = get_rt_slice(df, (0,100))
print df.shape
print a.shape