In [17]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import seaborn as sns
%matplotlib inline
In [2]:
# import the data
local_path = '/home/irockafe/Dropbox (MIT)/Alm_Lab/projects/'
project_path = ('/revo_healthcare/data/processed/Husermet_MTBLS97/'+
'Husermet_UPLCMS_positive_ion_mode.xlsx')
metadata = pd.read_excel(local_path+project_path, sheetname=1,
index_col = 0)
peaks = pd.read_excel(local_path+project_path, sheetname=2, index_col=0)
# samples x features
df = pd.read_excel(local_path+project_path, sheetname=3,
dtype=np.float64)
# Replace X from df column labels
df.columns = pd.Series([i.replace('X', '') for i in df.columns],
dtype='Int64')
In [243]:
df.info()
In [244]:
# Peaks is all the features detected, pre-QC, it seems. Select
# Only those that made it to the dataframe.
print 'df columns', df.columns
print 'peak index', peaks.index
# Sanity check that all the peaks are accounted for in the
# peaklist
for i in df.columns:
if i not in peaks.index:
print ("Oh shit, you couldn't find one of the"+
"df columns in the peaklist index")
raise hell
else:
print "Found {i}".format(i=i)
In [20]:
# Make a matrix of the pairwise-ppm difference between peaks
def pairwise_ppm_matrix(peak_mz):
'''
GOAL - Make a matrix containing pairwise ppm differences
from a pandas series
INPUT - peak_mz: pandas series with index as feature identifier
OUTPUT - matrix of pairwise ppm values. half-full with comparisons
Other (redundant) half is nan values. Using nans so
you can ask "ppm_matrix < 20" and sum rows/columns
to get an answer
'''
ppm_pairwise_matrix = pd.DataFrame(
np.full([len(peak_mz), len(peak_mz)], np.nan),
index=peak_mz.index, columns=peak_mz.index)
for i, mz in enumerate(peak_mz):
for idx, mz2 in enumerate(peak_mz[i+1:]):
j=i+1+idx #
min_ppm = abs(
(float(mz-mz2)/max(mz,mz2)) * 10**6)
ppm_pairwise_matrix.iloc[j,i] = min_ppm
return ppm_pairwise_matrix
test_mz = pd.Series([1,2,3], index=['a', 'b', 'c'], dtype='float64')
print test_mz
test_val = pairwise_ppm_matrix(test_mz)
should_val = pd.DataFrame({'a': [np.nan, 0.5*10**6, (2.0/3)*10**6],
'b': [np.nan, np.nan, (1.0/3)*10**6],
'c': [np.nan, np.nan, np.nan]},
index=['a', 'b', 'c'])
print '\nOutput from test_vals:\n', test_val
print '\nShould be this:\n', should_val
assert(test_val.all() == should_val.all()).all()
if (test_val.all() == should_val.all()).all():
print '\n\nYou passed the test! (might be other bugs, but idk)'
In [21]:
# Select out the peaks from dataframe (those that presumably passed QC)
features = peaks.loc[df.columns]
feature_mz = features['mz']
In [15]:
def plot_mz_rt(df, save=False,path=None, rt_bounds=[-1e5,-1e5]):
# the random data
x = df['rt']
y = df['mz']
print np.max(x)
print np.max(y)
nullfmt = NullFormatter() # no labels
# definitions for the axes
left, width = 0.1, 0.65
bottom, height = 0.1, 0.65
bottom_h = left_h = left + width + 0.02
rect_scatter = [left, bottom, width, height]
rect_histx = [left, bottom_h, width, 0.2]
rect_histy = [left_h, bottom, 0.2, height]
# start with a rectangular Figure
#fig = plt.figure(1, figsize=(8, 8))
fig = plt.figure(1, figsize=(10,10))
axScatter = plt.axes(rect_scatter)
axHistx = plt.axes(rect_histx)
axHisty = plt.axes(rect_histy)
# no labels
axHistx.xaxis.set_major_formatter(nullfmt)
axHisty.yaxis.set_major_formatter(nullfmt)
# the scatter plot:
axScatter.scatter(x, y, s=1)
# now determine nice limits by hand:
binwidth = 0.25
#xymax = np.max([np.max(np.fabs(x)), np.max(np.fabs(y))])
#lim = (int(xymax/binwidth) + 1) * binwidth
x_min = np.min(x)-50
x_max = np.max(x)+50
axScatter.set_xlim(x_min, x_max )
y_min = np.min(y)-50
y_max = np.max(y)+50
axScatter.set_ylim(y_min, y_max)
# Add vertical red line between 750-1050 retention time
'''
plt.plot([0,1], [0,1], linestyle = '--', lw=2, color='r',
label='Luck', alpha=0.5)
'''
print 'ymin: ', y_min
# Add vertical/horizontal lines to scatter and histograms
axScatter.axvline(x=rt_bounds[0], lw=2, color='r', alpha=0.5)
axScatter.axvline(x=rt_bounds[1], lw=2, color='r', alpha=0.5)
#bins = np.arange(-lim, lim + binwidth, binwidth)
bins = 100
axHistx.hist(x, bins=bins)
axHisty.hist(y, bins=bins, orientation='horizontal')
axHistx.set_xlim(axScatter.get_xlim())
axHisty.set_ylim(axScatter.get_ylim())
axScatter.set_ylabel('m/z', fontsize=30)
axScatter.set_xlabel('Retention Time', fontsize=30)
axHistx.set_ylabel('# of Features', fontsize=20)
axHisty.set_xlabel('# of Features', fontsize=20)
if save:
plt.savefig(path,
format='pdf')
plt.show()
In [16]:
plot_mz_rt(features)
In [22]:
# Do a quick test on own data
ppm_matrix = pairwise_ppm_matrix(feature_mz)
Also, there's a distinct possibility that they removed other redundant features... Check on that below
In [23]:
def plot_ppm_overlaps(ppm_matrix, x_vals):
x = x_vals
y = (np.array([(ppm_matrix < i).sum().sum() for i in x]) /
float(ppm_matrix.shape[0]))*100
plt.scatter(x,y)
plt.xlabel('ppm')
plt.ylabel('% of overlapping m/z')
plt.title('Few overlapping m/z values in Husermet dataset'
+ ' (# Features = %s)' % ppm_matrix.shape[0])
plt.axvline(5, color='red', alpha=0.2, label='Instrument precision')
plt.legend()
plt.show()
plot_ppm_overlaps(ppm_matrix, range(1,20))
In [167]:
# Check how many of the annotated features (peaks?)
# have overlapping m/z
# This takes a long time. Maybe try to matrix-ify some of the code?
all_feats = pairwise_ppm_matrix(peaks['mz'])
In [173]:
plot_ppm_overlaps(all_feats, range(1,20))
In [28]:
# Get the overlapping features...
# Stack() pivots values and drops nan values - yay!
overlapping_mz_pairs = list(ppm_matrix[ppm_matrix < 6].stack().index)
len(overlapping_mz_pairs)
print overlapping_mz_pairs
# write a function to get intensities for these features
def plot_overlapping_mz_intensities(df, feature_pair):
'''
GOAL - Take in tuple of feature indices, return Intensity values
for that pair.
INPUT -
df - pandas dataframe. A feature table with
(samples x features), with column
index that has same index as feature_pair
feature_pair - Tuple. Contains indexes to get intensity vals
OUTPUT -
Dataframe of (sample, intensity) for each feature pair
'''
feats = df[list(feature_pair)]
# convert to tidy data
tidy_feats = feats.melt(id_vars=feats.index,
value_vars=feats.columns,
var_name='feature',
value_name='intensity').dropna(axis=1, how='all')
# Get mann-whitney values
u, pval_u = stats.mannwhitneyu(df[feature_pair[0]], df[feature_pair[1]])
# Convert dtype of intensity values! float..?
sns.boxplot(x='feature', y='intensity',
data=tidy_feats)
ax = sns.stripplot(data=tidy_feats,
x='feature', y='intensity',
jitter=True)
plt.title("mann-whitney: {u}, pval: {pval:.2e}".format(
u=u, pval=pval_u))
plt.show()
#TODO fix bug here that says
# I'm using different-length arrays
for i in range(0,len(overlapping_mz_pairs)):
plot_overlapping_mz_intensities(df , overlapping_mz_pairs[i])
In [252]:
test = pd.DataFrame({'A': [1,2,3], 'B':[10,20,30], 'C':[100,200,300]})
print test
test.melt(id_vars=test.index, value_vars=test.columns,
var_name='feature', value_name='intensity').dropna(axis=1)
Out[252]: