check z-score and features of galectin data

also see https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3097418/ for suggestions on analysis of glycan arrays

z-score as the statistical test for significance of a sample

In the paper by Cholleti and Cummings http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3459425/#SD2

"To avoid using an arbitrary threshold in determining binders and non-binders, we used the z-score as the statistical test for significance of a sample. The z-score transformation is calculated by comparing the value of a sample, relative to the sample mean and standard deviation, with the null hypothesis being that a random sample pulled from the population would be a non-binder. If the converted p value is less than 0.15, the null hypothesis is rejected and the sample is considered a binding glycan. We used a non-conservative p value to allow more glycans in the list of candidate binders as an input to GLYMMR. The z-score transformation is based on the sum of the RFU intensity values for the three different concentrations of the glycan. This statistical test allows the program to discard not only non-binding glycans, but glycans that exhibit non-specific binding, which could distort the motif discovery algorithm. "


In [ ]:
## House keeping tasks

In [ ]:
%reset -f

import all required dependencies


In [ ]:
# standard imports
import urllib2
import os
import json
import StringIO
import pickle

# dataframe and numerical
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt
%matplotlib inline

#scipy
from scipy import stats
from scipy.special import erf
from scipy import sqrt

In [ ]:
from IPython.display import HTML
def addToggle():
    return '''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.'''
HTML(addToggle())

In [ ]:
## variables for this project

samples_in="../data/galectin-3/galectin-3_5.0_human.json"
results_dir = "../results/galectin-3/"
dataframe_out=results_dir+"dataframes_galectin.pkl"

Load data from pickle


In [ ]:
# Check whether or not the dataframes exist

subdir="./"
dataframefile=dataframe_out

if not os.path.isfile(dataframefile):
    print "calling the notebook that loads the data"
    %run download_cfg_for_sna.ipynb
with open(os.path.join(subdir, dataframefile)) as f:
    dataframes = pickle.load(f)

In [ ]:
# peak at the data in frame 0 
frame=dataframes[0]["dataframe"]
frame.head()

In [ ]:
# recalculate CV
# rank the glycans by RFU
STDEV="StDev"
RFU="Average RFU"

frame["CV"]=100*frame[STDEV]/frame[RFU]
maxrfu=frame[RFU].max()
frame["rank"]=100*frame[RFU]/maxrfu
frame.head()

In [ ]:
# choose data to work with 
# ignore the 0.5 ug, use 2,5,10 ()
frames=[dataframes[1]["dataframe"],dataframes[2]["dataframe"], dataframes[3]["dataframe"]]
sample_keys=[dataframes[1]["sample"].encode('utf-8'),dataframes[2]["sample"].encode('utf-8'), dataframes[3]["sample"].encode('utf-8')]

# recalculate CV and rank the glycans by RFU
for frame in frames:
    frame["%CV"]=frame["% CV"]
    frame["CV"]=100*frame[STDEV]/frame[RFU]
    maxrfu=frame[RFU].max()
    frame["rank"]=100*frame[RFU]/maxrfu

# peak at all frames
result = pd.concat(frames, keys=sample_keys)
result

RFU, zscore and p-value

Must convert from z-score to p-value. In the paper, the RFU is summed across all datasets and this is used to calculate a p-value


In [ ]:
# calculate rank, %CV for all frames, z-score, p-value for all frames and sort by average rank
Structure="Structure on Masterlist"

for aframe in frames:
    aframe["CV"]=100*aframe[STDEV]/aframe[RFU]
    maxrfu=aframe[RFU].max()
    aframe["rank"]=100*aframe[RFU]/maxrfu
    aframe["z-score"]=stats.zscore(aframe[RFU])
    aframe["p-value"]=1- 0.5 * (1 + erf(aframe["z-score"] / sqrt(2)))

#. merge_frames
df_final = reduce(lambda left,right: pd.merge(left,right,on=[Structure,'Chart Number']), frames)
df_final

In [ ]:
frames[2]["CV"], sample_keys[2]

In [ ]:
#. calculate the average rank

df_final["avg_rank"]=df_final.filter(regex=("rank.*")).sum(axis=1)/df_final.filter(regex=("rank.*")).shape[1]  # http://stackoverflow.com/questions/30808430/how-to-select-columns-from-dataframe-by-regex


#. calculate the summed RFU

df_final["summed_RFU"]=df_final.filter(regex=("RFU.*")).sum(axis=1)


#. calculate the z-score and p-value for the summed RFU
df_final.head()
df_final["summed_RFU_z-score"]=stats.zscore(df_final["summed_RFU"])
df_final["summed_RFU_p-value"]=1- 0.5 * (1 + erf(df_final["summed_RFU_z-score"] / sqrt(2)))
df_final.sort_values("avg_rank",ascending=False)
#frames_RFU_sum["p-value_them"]=1- 0.5 * (1 + erf(frames_RFU_sum["stats_z-score"] / sqrt(2)))

extract only the high binders


In [ ]:
#. extract the high binders. p-value < 0.15

df_final_high_binders = df_final[df_final["summed_RFU_p-value"] <0.15]
df_final_high_binders.sort_values("avg_rank",ascending=False)
#print df_final_high_binders.shape

In [ ]:
high_binders= set(df_final_high_binders["Chart Number"])
high_binders

In [ ]:
df_final[df_final["Chart Number"]==582]

What about the % CV?

See https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3097418/

There are three statements made about %CV

  • 50% the data should be disregarded

  • 20-40% the CV is high and the results for binding may not be reliable

  • 30% when binding is low with high imprecision the data are classified as inconclusive.

comparing % CV with the high binders from z-scores.


In [ ]:
#. lets pull out any %CV column of df_final and ensure CV <20
#.. remember there are negative CV in this sample, exclude these 
df_cv_20=df_final.filter(regex=("%CV.*")) <=20
df_cv_0=df_final.filter(regex=("%CV.*")) >0

df_cv_0_20=(df_cv_0 & df_cv_20)


#print df_cv_0_20.head(340)
andmask=df_cv_0_20["%CV_x"]&df_cv_0_20["%CV_y"]&df_cv_0_20["%CV"]
ormask=df_cv_0_20["%CV_x"]|df_cv_0_20["%CV_y"]|df_cv_0_20["%CV"]

ormask1=df_cv_0_20["%CV_x"]
ormask2=df_cv_0_20["%CV_y"]
ormask3=df_cv_0_20["%CV"]
#mask

glycan_ids_cv_20=df_final["Chart Number"][andmask]
print len(glycan_ids_cv_20)

In [ ]:


In [ ]:
df_final_high_binders.sort_values("avg_rank",ascending=False)["Chart Number"]

In [ ]:
sample_keys # but note the way I made the frame means that rank_x is 2mg, rank_y is 5mg and rank is 10mg

In [ ]:
glycan_ids_cv_20_1= df_final[ormask1].sort_values("rank_x",ascending=False)
print len(glycan_ids_cv_20_1)

In [ ]:
glycan_ids_cv_20_2= df_final[ormask2].sort_values("rank_y",ascending=False)
print len(glycan_ids_cv_20_2)

In [ ]:
glycan_ids_cv_20_3= df_final[ormask3].sort_values("rank",ascending=False)
print len(glycan_ids_cv_20_3)

extract highbinders and other sets for MCAW analysis

  • these should be 2,5,10, but this is manually coded so watch out

In [ ]:
# create dictionary to store results
results = {}

def highbinders(dataframe, pvalue="p-value", rank="rank",pvalue_cutoff=0.15, rank_cutoff=75):
    """
    A function which filter the input dataframe by pvalue and rank
    returns the filtered dataframe and a list of glycan chart number for the current array
    """
    dataframe_p=dataframe[dataframe[pvalue]<pvalue_cutoff]
    dataframe_p_r = dataframe_p[dataframe_p[rank]>rank_cutoff]
    dataframe_p_r.sort_values(rank,ascending=False)
    return dataframe_p_r,set(dataframe_p_r["Chart Number"])

In [ ]:
#. extract the high binders. p-value < 0.15 rank> 75
average_high_df, average_highbind_list = highbinders(df_final,pvalue="summed_RFU_p-value", rank="avg_rank")
average_highbind_list, len(average_highbind_list)
results["average"]=average_highbind_list

In [ ]:
# rank filter 75
rf=75
#. extract the high binders for 2mg . p-value < 0.15 rank> 75
twomggly, twomg_highbind_list = highbinders(glycan_ids_cv_20_1,pvalue="p-value_x",rank="rank_x")
results["twomg_filter"+str(rf)]=set(twomg_highbind_list)

#. extract the high binders for 5mg . p-value < 0.15 rank> 75
fivemggly, fivemg_highbind_list = highbinders(glycan_ids_cv_20_2,pvalue="p-value_y",rank="rank_y")
results["fivemg_filter"+str(rf)]=set(fivemg_highbind_list)
#. extract the high binders for 5mg . p-value < 0.15 rank> 75
tenmggly, tenmg_highbind_list = highbinders(glycan_ids_cv_20_3,pvalue="p-value",rank="rank")
results["tenmg_filter"+str(rf)]=set(tenmg_highbind_list)

In [ ]:
# rank filter 50
rf=50
#. extract the high binders for 2mg . p-value < 0.15 rank> 75
twomggly, twomg_highbind_list = highbinders(glycan_ids_cv_20_1,pvalue="p-value_x",rank="rank_x",rank_cutoff=rf)
results["twomg_filter"+str(rf)]=set(twomg_highbind_list)

#. extract the high binders for 5mg . p-value < 0.15 rank> 75
fivemggly, fivemg_highbind_list = highbinders(glycan_ids_cv_20_2,pvalue="p-value_y",rank="rank_y",rank_cutoff=rf)
results["fivemg_filter"+str(rf)]=set(fivemg_highbind_list)
#. extract the high binders for 5mg . p-value < 0.15 rank> 75
tenmggly, tenmg_highbind_list = highbinders(glycan_ids_cv_20_3,pvalue="p-value",rank="rank",rank_cutoff=rf)
results["tenmg_filter"+str(rf)]=set(tenmg_highbind_list)

In [ ]:
# top 10 without filtering


results["twomg_topten_nofilter"]=set(df_final.sort_values("rank_x",ascending=False)[0:10]["Chart Number"])
results["fivemg_topten_nofilter"]=set(df_final.sort_values("rank_y",ascending=False)[0:10]["Chart Number"])
results["tenmg_topten_nofilter"]=set(df_final.sort_values("rank",ascending=False)[0:10]["Chart Number"])

In [ ]:
results
#a={"average":average_highbind_list,  "twomg":set(twomg_highbind_list),"fivemg":set(fivemg_highbind_list),"tenmg":set(tenmg_highbind_list)}
#a

In [ ]:
# see http://pandas.pydata.org/pandas-docs/stable/options.html for pandas options
pd.set_option('display.max_columns',1000)
pd.set_option('max_columns', 100)
df_final[df_final["Chart Number"]==340]

A consideration of the glycans by %CV for the first frame


In [ ]:
# make various views of frame 0 based on the %CV
df_cv_50 = frame[frame.CV <50]
df_cv_30 = frame[frame.CV <30]
df_cv_20 = frame[frame.CV <20]
df_cv_20_0 = df_cv_20[df_cv_20.CV>0]

In [ ]:
# plot rank v %CV
# plot comparison of different %CV subsets
plt.figure()
df_cv_20["CV"].plot(legend=True, title='%CV<=20%')
df_cv_20[STDEV].plot(secondary_y=True, style='g', legend=True)
plt.figure()
df_cv_20_0["CV"].plot(legend=True, title='0<%CV<=20%')
df_cv_20_0[STDEV].plot(secondary_y=True, style='g', legend=True)
plt.figure()
df_cv_30["CV"].plot(legend=True, title='%CV<=30%')
df_cv_30[STDEV].plot(secondary_y=True, style='g', legend=True)
plt.figure()
df_cv_50["CV"].plot(legend=True, title='%CV<=50%')
df_cv_50[STDEV].plot(secondary_y=True, style='g', legend=True)

In [ ]:
# use 0<cv<20 and order by rank
pd.set_option('max_rows', 300)
df_cv_20_0.sort_values("rank",ascending=False)

In [ ]:
plt.figure()
df_cv_20[RFU].plot(legend=True, title='%CV<=20%')
df_cv_20[STDEV].plot(secondary_y=True, style='g', legend=True)