as per http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3459425/#SD2
also see https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3097418/ for suggestions on analysis of glycan arrays
In the paper by Cholleti and Cummings http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3459425/#SD2
"To avoid using an arbitrary threshold in determining binders and non-binders, we used the z-score as the statistical test for significance of a sample. The z-score transformation is calculated by comparing the value of a sample, relative to the sample mean and standard deviation, with the null hypothesis being that a random sample pulled from the population would be a non-binder. If the converted p value is less than 0.15, the null hypothesis is rejected and the sample is considered a binding glycan. We used a non-conservative p value to allow more glycans in the list of candidate binders as an input to GLYMMR. The z-score transformation is based on the sum of the RFU intensity values for the three different concentrations of the glycan. This statistical test allows the program to discard not only non-binding glycans, but glycans that exhibit non-specific binding, which could distort the motif discovery algorithm. "
In [ ]:
## Housekeeping tasks
In [ ]:
%reset -f
In [ ]:
# standard imports
import urllib2
import os
import json
import StringIO
import pickle
# dataframe and numerical
import pandas as pd
import numpy as np
# plotting
import matplotlib.pyplot as plt
%matplotlib inline
#scipy
from scipy import stats
from scipy.special import erf
from scipy import sqrt
In [ ]:
from IPython.display import HTML
def addToggle():
    """Return an HTML/JS snippet that hides the notebook's input cells by
    default and adds a link to toggle their visibility."""
    snippet = '''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.'''
    return snippet
HTML(addToggle())
In [ ]:
## variables for this project
# Project paths: the input sample JSON, the results directory, and the
# pickled-dataframes cache written by the download notebook.
samples_in = "../data/sna/SNA_4.0_plant.json"
results_dir = "../results/sna/"
dataframe_out = results_dir + "dataframes_sna.pkl"
In [ ]:
# Check whether or not the dataframes exist
subdir="./"
dataframefile=dataframe_out
if not os.path.isfile(dataframefile):
print "calling the notebook that loads the data"
%run download_cfg_for_sna.ipynb
with open(os.path.join(subdir, dataframefile)) as f:
dataframes = pickle.load(f)
In [ ]:
# Peek at the data in frame 0.  Each entry of `dataframes` is a dict with
# (at least) "dataframe" and "sample" keys -- see the loading cell above.
frame=dataframes[0]["dataframe"]
frame.head()
In [ ]:
# Recompute the coefficient of variation (%CV = 100 * STDEV / RFU) and a
# percent-of-maximum "rank" column for frame 0, then preview it.
rfu = frame["RFU"]
frame["CV"] = 100 * frame["STDEV"] / rfu
frame["rank"] = 100 * rfu / rfu.max()
frame.head()
In [ ]:
# Choose the data to work with: the first three samples' dataframes and
# their (utf-8 encoded) sample names, used below as concat keys.
frames = [dataframes[i]["dataframe"] for i in range(3)]
sample_keys = [dataframes[i]["sample"].encode('utf-8') for i in range(3)]
# Recompute %CV and the percent-of-maximum rank on every frame.
# NOTE: the loop variable `frame` intentionally leaks -- later cells use it.
for frame in frames:
    frame["CV"] = 100 * frame["STDEV"] / frame["RFU"]
    frame["rank"] = 100 * frame["RFU"] / frame["RFU"].max()
# Peek at all frames stacked with a hierarchical index keyed by sample.
result = pd.concat(frames, keys=sample_keys)
result
In [ ]:
# For every frame: recompute %CV and rank, then add a per-frame z-score of
# the RFU column and its upper-tail p-value under a standard normal.
# The frames are then merged on glycan identity and the summed-RFU binder
# statistic from the paper is computed.
RFU = "RFU"
for aframe in frames:
    aframe["CV"] = 100 * aframe["STDEV"] / aframe["RFU"]
    aframe["rank"] = 100 * aframe["RFU"] / aframe["RFU"].max()
    aframe["z-score"] = stats.zscore(aframe[RFU])
    # 1 - Phi(z): one-sided (upper-tail) p-value.
    aframe["p-value"] = 1 - 0.5 * (1 + erf(aframe["z-score"] / sqrt(2)))
# Merge all frames on glycan identity via successive inner joins
# (pandas suffixes overlapping columns with _x/_y).
df_final = frames[0]
for other in frames[1:]:
    df_final = pd.merge(df_final, other, on=['Structure', 'Chart Number'])
df_final
# Average rank across the per-sample rank columns (selected by regex).
rank_cols = df_final.filter(regex=("rank.*"))
df_final["avg_rank"] = rank_cols.sum(axis=1) / rank_cols.shape[1]
# Sum of the RFU intensities across the three samples.
df_final["summed_RFU"] = df_final.filter(regex=("RFU.*")).sum(axis=1)
# z-score / p-value of the summed RFU -- the paper's binder statistic.
df_final.head()
df_final["summed_RFU_z-score"] = stats.zscore(df_final["summed_RFU"])
df_final["summed_RFU_p-value"] = 1 - 0.5 * (1 + erf(df_final["summed_RFU_z-score"] / sqrt(2)))
df_final.sort_values("avg_rank", ascending=False)
The p-values computed here are in agreement with those reported in the paper.
In [ ]:
# Extract the high binders (summed-RFU p-value < 0.15, the paper's
# deliberately non-conservative threshold) and cross-check the result
# against the published binder list.
binder_mask = df_final["summed_RFU_p-value"] < 0.15
df_final_high_binders = df_final[binder_mask]
df_final_high_binders.sort_values("avg_rank", ascending=False)
# Chart numbers reported as binders in the paper.
high_binders_from_paper = [353, 256, 327, 341, 259, 343, 54, 315, 52, 314,
                           51, 46, 258, 275, 260, 325, 257, 321, 53, 342,
                           255, 407, 300, 292, 340, 373]
# Sanity checks: same count and same set of chart numbers as the paper.
assert df_final_high_binders.shape[0] == len(high_binders_from_paper)
assert set(df_final_high_binders["Chart Number"]) == set(high_binders_from_paper)
In [ ]:
df_final[df_final["Chart Number"]==340]
See https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3097418/
There are three statements made about %CV:
- above 50%, the data should be disregarded;
- between 20% and 40%, the CV is high and the results for binding may not be reliable;
- around 30%, when binding is low with high imprecision, the data are classified as inconclusive.
In [ ]:
# Keep glycans whose %CV lies in (0, 20].  Negative %CV values occur in
# this sample (negative mean RFU), so they are explicitly excluded.
cv_cols = df_final.filter(regex=("%CV.*"))
df_cv_20 = cv_cols <= 20
df_cv_0 = cv_cols > 0
df_cv_0_20 = df_cv_0 & df_cv_20
# andmask: 0 < %CV <= 20 in ALL three samples; ormask: in at least one.
andmask = df_cv_0_20["%CV_x"] & df_cv_0_20["%CV_y"] & df_cv_0_20["%CV"]
ormask = df_cv_0_20["%CV_x"] | df_cv_0_20["%CV_y"] | df_cv_0_20["%CV"]
glycan_ids_cv_20 = df_final["Chart Number"][andmask]
print(len(glycan_ids_cv_20))
In [ ]:
df_final_high_binders.sort_values("avg_rank",ascending=False)["Chart Number"]
In [ ]:
df_final[andmask].sort_values("avg_rank",ascending=False)
In [ ]:
#. any in common with high_binders_from_paper (intersection)
in_common=set(high_binders_from_paper)&set(glycan_ids_cv_20)
print "Common:", in_common, len(in_common)
# differences
print "Highbind-CV: ",set(high_binders_from_paper)-set(glycan_ids_cv_20), len(set(high_binders_from_paper)-set(glycan_ids_cv_20))
print "CV- Highbind: ",set(glycan_ids_cv_20)-set(high_binders_from_paper), len(set(glycan_ids_cv_20)-set(high_binders_from_paper))
In [ ]:
# see http://pandas.pydata.org/pandas-docs/stable/options.html for pandas options
# NOTE(review): 'max_columns' is an alias of 'display.max_columns', so the
# second call overrides the 1000 set just above with 100 -- confirm which
# limit was actually intended.
pd.set_option('display.max_columns',1000)
pd.set_option('max_columns', 100)
df_final[df_final["Chart Number"]==340]
In [ ]:
# Slice `frame` into views at the %CV thresholds discussed above
# (<50, <30, <20, and 0<%CV<20).
# NOTE(review): `frame` here is the last frame processed by the earlier
# loop, not frame 0 as the original cell comment claimed -- confirm intent.
cv = frame["CV"]
df_cv_50 = frame[cv < 50]
df_cv_30 = frame[cv < 30]
df_cv_20 = frame[cv < 20]
df_cv_20_0 = frame[(cv < 20) & (cv > 0)]
In [ ]:
# Plot %CV (left axis) and STDEV (right axis, green) for each %CV subset,
# one figure per subset, in the same order as before.
subsets = [
    (df_cv_20, '%CV<=20%'),
    (df_cv_20_0, '0<%CV<=20%'),
    (df_cv_30, '%CV<=30%'),
    (df_cv_50, '%CV<=50%'),
]
for subset, title in subsets:
    plt.figure()
    subset["CV"].plot(legend=True, title=title)
    subset["STDEV"].plot(secondary_y=True, style='g', legend=True)
In [ ]:
# Display the 0 < %CV < 20 subset ordered by rank, descending.
# Allow up to 300 rows so the whole filtered table renders in the notebook.
pd.set_option('max_rows', 300)
df_cv_20_0.sort_values("rank",ascending=False)
In [ ]: