also see https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3097418/ for suggestions on analysis of glycan arrays
In the paper by Cholleti and Cummings http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3459425/#SD2
"To avoid using an arbitrary threshold in determining binders and non-binders, we used the z-score as the statistical test for significance of a sample. The z-score transformation is calculated by comparing the value of a sample, relative to the sample mean and standard deviation, with the null hypothesis being that a random sample pulled from the population would be a non-binder. If the converted p value is less than 0.15, the null hypothesis is rejected and the sample is considered a binding glycan. We used a non-conservative p value to allow more glycans in the list of candidate binders as an input to GLYMMR. The z-score transformation is based on the sum of the RFU intensity values for the three different concentrations of the glycan. This statistical test allows the program to discard not only non-binding glycans, but glycans that exhibit non-specific binding, which could distort the motif discovery algorithm. "
In [ ]:
## House keeping tasks
In [ ]:
%reset -f
In [ ]:
# standard imports
import urllib2
import os
import json
import StringIO
import pickle
# dataframe and numerical
import pandas as pd
import numpy as np
# plotting
import matplotlib.pyplot as plt
%matplotlib inline
#scipy
from scipy import stats
from scipy.special import erf
from scipy import sqrt
In [ ]:
from IPython.display import HTML
def addToggle():
return '''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.'''
HTML(addToggle())
In [ ]:
## variables for this project
samples_in="../data/galectin-3/galectin-3_5.0_human.json"
results_dir = "../results/galectin-3/"
dataframe_out=results_dir+"dataframes_galectin.pkl"
In [ ]:
# Check whether or not the dataframes exist
subdir="./"
dataframefile=dataframe_out
if not os.path.isfile(dataframefile):
print "calling the notebook that loads the data"
%run download_cfg_for_sna.ipynb
with open(os.path.join(subdir, dataframefile)) as f:
dataframes = pickle.load(f)
In [ ]:
# peak at the data in frame 0
frame=dataframes[0]["dataframe"]
frame.head()
In [ ]:
# recalculate CV
# rank the glycans by RFU
STDEV="StDev"
RFU="Average RFU"
frame["CV"]=100*frame[STDEV]/frame[RFU]
maxrfu=frame[RFU].max()
frame["rank"]=100*frame[RFU]/maxrfu
frame.head()
In [ ]:
# choose data to work with
# ignore the 0.5 ug, use 2,5,10 ()
frames=[dataframes[1]["dataframe"],dataframes[2]["dataframe"], dataframes[3]["dataframe"]]
sample_keys=[dataframes[1]["sample"].encode('utf-8'),dataframes[2]["sample"].encode('utf-8'), dataframes[3]["sample"].encode('utf-8')]
# recalculate CV and rank the glycans by RFU
for frame in frames:
frame["%CV"]=frame["% CV"]
frame["CV"]=100*frame[STDEV]/frame[RFU]
maxrfu=frame[RFU].max()
frame["rank"]=100*frame[RFU]/maxrfu
# peak at all frames
result = pd.concat(frames, keys=sample_keys)
result
In [ ]:
# calculate rank, %CV for all frames, z-score, p-value for all frames and sort by average rank
Structure="Structure on Masterlist"
for aframe in frames:
aframe["CV"]=100*aframe[STDEV]/aframe[RFU]
maxrfu=aframe[RFU].max()
aframe["rank"]=100*aframe[RFU]/maxrfu
aframe["z-score"]=stats.zscore(aframe[RFU])
aframe["p-value"]=1- 0.5 * (1 + erf(aframe["z-score"] / sqrt(2)))
#. merge_frames
df_final = reduce(lambda left,right: pd.merge(left,right,on=[Structure,'Chart Number']), frames)
df_final
In [ ]:
frames[2]["CV"], sample_keys[2]
In [ ]:
#. calculate the average rank
df_final["avg_rank"]=df_final.filter(regex=("rank.*")).sum(axis=1)/df_final.filter(regex=("rank.*")).shape[1] # http://stackoverflow.com/questions/30808430/how-to-select-columns-from-dataframe-by-regex
#. calculate the summed RFU
df_final["summed_RFU"]=df_final.filter(regex=("RFU.*")).sum(axis=1)
#. calculate the z-score and p-value for the summed RFU
df_final.head()
df_final["summed_RFU_z-score"]=stats.zscore(df_final["summed_RFU"])
df_final["summed_RFU_p-value"]=1- 0.5 * (1 + erf(df_final["summed_RFU_z-score"] / sqrt(2)))
df_final.sort_values("avg_rank",ascending=False)
#frames_RFU_sum["p-value_them"]=1- 0.5 * (1 + erf(frames_RFU_sum["stats_z-score"] / sqrt(2)))
In [ ]:
#. extract the high binders. p-value < 0.15
df_final_high_binders = df_final[df_final["summed_RFU_p-value"] <0.15]
df_final_high_binders.sort_values("avg_rank",ascending=False)
#print df_final_high_binders.shape
In [ ]:
high_binders= set(df_final_high_binders["Chart Number"])
high_binders
In [ ]:
df_final[df_final["Chart Number"]==582]
See https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3097418/
There are three statements made about %CV
50% the data should be disregarded
20-40% the CV is high and the results for binding may not be reliable
30% when binding is low with high imprecision the data are classified as inconclusive.
In [ ]:
#. lets pull out any %CV column of df_final and ensure CV <20
#.. remember there are negative CV in this sample, exclude these
df_cv_20=df_final.filter(regex=("%CV.*")) <=20
df_cv_0=df_final.filter(regex=("%CV.*")) >0
df_cv_0_20=(df_cv_0 & df_cv_20)
#print df_cv_0_20.head(340)
andmask=df_cv_0_20["%CV_x"]&df_cv_0_20["%CV_y"]&df_cv_0_20["%CV"]
ormask=df_cv_0_20["%CV_x"]|df_cv_0_20["%CV_y"]|df_cv_0_20["%CV"]
ormask1=df_cv_0_20["%CV_x"]
ormask2=df_cv_0_20["%CV_y"]
ormask3=df_cv_0_20["%CV"]
#mask
glycan_ids_cv_20=df_final["Chart Number"][andmask]
print len(glycan_ids_cv_20)
In [ ]:
In [ ]:
df_final_high_binders.sort_values("avg_rank",ascending=False)["Chart Number"]
In [ ]:
sample_keys # but note the way I made the frame means that rank_x is 2mg, rank_y is 5mg and rank is 10mg
In [ ]:
glycan_ids_cv_20_1= df_final[ormask1].sort_values("rank_x",ascending=False)
print len(glycan_ids_cv_20_1)
In [ ]:
glycan_ids_cv_20_2= df_final[ormask2].sort_values("rank_y",ascending=False)
print len(glycan_ids_cv_20_2)
In [ ]:
glycan_ids_cv_20_3= df_final[ormask3].sort_values("rank",ascending=False)
print len(glycan_ids_cv_20_3)
In [ ]:
# create dictionary to store results
results = {}
def highbinders(dataframe, pvalue="p-value", rank="rank",pvalue_cutoff=0.15, rank_cutoff=75):
"""
A function which filter the input dataframe by pvalue and rank
returns the filtered dataframe and a list of glycan chart number for the current array
"""
dataframe_p=dataframe[dataframe[pvalue]<pvalue_cutoff]
dataframe_p_r = dataframe_p[dataframe_p[rank]>rank_cutoff]
dataframe_p_r.sort_values(rank,ascending=False)
return dataframe_p_r,set(dataframe_p_r["Chart Number"])
In [ ]:
#. extract the high binders. p-value < 0.15 rank> 75
average_high_df, average_highbind_list = highbinders(df_final,pvalue="summed_RFU_p-value", rank="avg_rank")
average_highbind_list, len(average_highbind_list)
results["average"]=average_highbind_list
In [ ]:
# rank filter 75
rf=75
#. extract the high binders for 2mg . p-value < 0.15 rank> 75
twomggly, twomg_highbind_list = highbinders(glycan_ids_cv_20_1,pvalue="p-value_x",rank="rank_x")
results["twomg_filter"+str(rf)]=set(twomg_highbind_list)
#. extract the high binders for 5mg . p-value < 0.15 rank> 75
fivemggly, fivemg_highbind_list = highbinders(glycan_ids_cv_20_2,pvalue="p-value_y",rank="rank_y")
results["fivemg_filter"+str(rf)]=set(fivemg_highbind_list)
#. extract the high binders for 5mg . p-value < 0.15 rank> 75
tenmggly, tenmg_highbind_list = highbinders(glycan_ids_cv_20_3,pvalue="p-value",rank="rank")
results["tenmg_filter"+str(rf)]=set(tenmg_highbind_list)
In [ ]:
# rank filter 50
rf=50
#. extract the high binders for 2mg . p-value < 0.15 rank> 75
twomggly, twomg_highbind_list = highbinders(glycan_ids_cv_20_1,pvalue="p-value_x",rank="rank_x",rank_cutoff=rf)
results["twomg_filter"+str(rf)]=set(twomg_highbind_list)
#. extract the high binders for 5mg . p-value < 0.15 rank> 75
fivemggly, fivemg_highbind_list = highbinders(glycan_ids_cv_20_2,pvalue="p-value_y",rank="rank_y",rank_cutoff=rf)
results["fivemg_filter"+str(rf)]=set(fivemg_highbind_list)
#. extract the high binders for 5mg . p-value < 0.15 rank> 75
tenmggly, tenmg_highbind_list = highbinders(glycan_ids_cv_20_3,pvalue="p-value",rank="rank",rank_cutoff=rf)
results["tenmg_filter"+str(rf)]=set(tenmg_highbind_list)
In [ ]:
# top 10 without filtering
results["twomg_topten_nofilter"]=set(df_final.sort_values("rank_x",ascending=False)[0:10]["Chart Number"])
results["fivemg_topten_nofilter"]=set(df_final.sort_values("rank_y",ascending=False)[0:10]["Chart Number"])
results["tenmg_topten_nofilter"]=set(df_final.sort_values("rank",ascending=False)[0:10]["Chart Number"])
In [ ]:
results
#a={"average":average_highbind_list, "twomg":set(twomg_highbind_list),"fivemg":set(fivemg_highbind_list),"tenmg":set(tenmg_highbind_list)}
#a
In [ ]:
# see http://pandas.pydata.org/pandas-docs/stable/options.html for pandas options
pd.set_option('display.max_columns',1000)
pd.set_option('max_columns', 100)
df_final[df_final["Chart Number"]==340]
In [ ]:
# make various views of frame 0 based on the %CV
df_cv_50 = frame[frame.CV <50]
df_cv_30 = frame[frame.CV <30]
df_cv_20 = frame[frame.CV <20]
df_cv_20_0 = df_cv_20[df_cv_20.CV>0]
In [ ]:
# plot rank v %CV
# plot comparison of different %CV subsets
plt.figure()
df_cv_20["CV"].plot(legend=True, title='%CV<=20%')
df_cv_20[STDEV].plot(secondary_y=True, style='g', legend=True)
plt.figure()
df_cv_20_0["CV"].plot(legend=True, title='0<%CV<=20%')
df_cv_20_0[STDEV].plot(secondary_y=True, style='g', legend=True)
plt.figure()
df_cv_30["CV"].plot(legend=True, title='%CV<=30%')
df_cv_30[STDEV].plot(secondary_y=True, style='g', legend=True)
plt.figure()
df_cv_50["CV"].plot(legend=True, title='%CV<=50%')
df_cv_50[STDEV].plot(secondary_y=True, style='g', legend=True)
In [ ]:
# use 0<cv<20 and order by rank
pd.set_option('max_rows', 300)
df_cv_20_0.sort_values("rank",ascending=False)
In [ ]:
plt.figure()
df_cv_20[RFU].plot(legend=True, title='%CV<=20%')
df_cv_20[STDEV].plot(secondary_y=True, style='g', legend=True)