Comparison of Coverage by Credible Interval


In [1]:
%run '../ipython_startup.py'


Importing commonly used libraries: 
            os, sys 
            numpy as np 
            scipy as sp 
            pandas as pd 
            matplotlib as mp 
            matplotlib.pyplot as plt
            datetime as dt 
            mclib_Python/flagging as fg

Creating project level variables: 
        MCLAB = /home/jfear/mclab 
        PROJ = /home/jfear/mclab/cegs_ase_paper 
        TODAY = 20160214

Adding ['scripts/mclib_Python', 'scripts/ase_Python'] to PYTHONPATH


In [4]:
# Import additional libraries
import cPickle as pickle
from scipy.stats import chi2_contingency
from sas7bdat import SAS7BDAT as SAS

In [6]:
# Import Data
with SAS(os.path.join(PROJ, 'sas_data/emp_bayesian_results_w_flags.sas7bdat')) as FH:
    df = FH.to_data_frame()


[emp_bayesian_results_w_flags.sas7bdat] header length 65536 != 8192
[emp_bayesian_results_w_flags.sas7bdat] [emp_bayesian_results_w_flags.sas7bdat] header length 65536 != 8192
[emp_bayesian_results_w_flags.sas7bdat] [emp_bayesian_results_w_flags.sas7bdat] [emp_bayesian_results_w_flags.sas7bdat] header length 65536 != 8192
WARNING:/home/jfear/mclab/cegs_ase_paper/sas_data/emp_bayesian_results_w_flags.sas7bdat:[emp_bayesian_results_w_flags.sas7bdat] [emp_bayesian_results_w_flags.sas7bdat] [emp_bayesian_results_w_flags.sas7bdat] header length 65536 != 8192

In [7]:
# Import drop list from 100 genome simulation
toDrop = pickle.load(open(os.path.join(PROJ, 'pipeline_output/100_genome_simulation/exonic_region_drop_list.pkl'), 'rb'))

# Drop exonic regions in drop list
print 'Original DataFrame has {} rows'.format(df.shape[0])
df = df[-df['fusion_id'].isin(toDrop)].copy()
print 'After dropping DataFrame has {} rows'.format(df.shape[0])


Original DataFrame has 1939828 rows
After dropping DataFrame has 1936215 rows

In [35]:
df['apn_bin'], bins = pd.qcut(df['mean_apn'], q=3, labels=['low', 'medium', 'high'], retbins=True)

display(pd.crosstab(df['apn_bin'], df['flag_all_AI'], margins=True))
pprint(chi2_contingency(pd.crosstab(df['apn_bin'], df['flag_all_AI'], margins=False)))


flag_all_AI 0.0 1.0 All
apn_bin
low 593376 52036 645412
medium 551950 93448 645398
high 540086 105319 645405
All 1685412 250803 1936215
(21506.221051377073,
 0.0,
 2,
 array([[ 561810.09327167,   83601.90672833],
       [ 561797.90672833,   83600.09327167],
       [ 561804.        ,   83601.        ]]))

In [10]:
bins


Out[10]:
array([  4.78468900e-03,   2.79458795e+00,   1.03217792e+01,
         7.57773368e+04])

In [23]:
((df['mean_apn'] > bins[0]) & (df['mean_apn'] <= bins[1])).sum()


Out[23]:
645411

In [24]:
((df['mean_apn'] > bins[1]) & (df['mean_apn'] <= bins[2])).sum()


Out[24]:
645398

In [25]:
((df['mean_apn'] > bins[2]) & (df['mean_apn'] <= bins[3])).sum()


Out[25]:
645405

bin 1: .004 to 2.79 bin 2: 2.8 to 10.3 bin 3: 10.33


In [29]:
df['apn_bin2'] = False
df.loc[df['mean_apn'] <= 25, 'apn_bin2'] = 'Low'
df.loc[(df['mean_apn'] > 25) & (df['mean_apn'] <= 33), 'apn_bin2'] = 'Medium'
df.loc[(df['mean_apn'] > 33), 'apn_bin2'] = 'High'

In [30]:
display(pd.crosstab(df['apn_bin2'], df['flag_all_AI'], margins=True))
pprint(chi2_contingency(pd.crosstab(df['apn_bin2'], df['flag_all_AI'], margins=False)))


flag_all_AI 0.0 1.0 All
apn_bin2
High 170680 42686 213366
Low 1450834 197347 1648181
Medium 63898 10770 74668
All 1685412 250803 1936215
(10958.627203809903,
 0.0,
 2,
 array([[  185728.14320311,    27637.85679689],
       [ 1434687.79839636,   213493.20160364],
       [   64996.05840054,     9671.94159946]]))

In [34]:
print('Low Percent: {}'.format(197347 / float(1648181) * 100))
print('Medium Percent: {}'.format(10770 / float(74668) * 100))
print('High Percent: {}'.format(42686 / float(213366) * 100))


Low Percent: 11.9736242561
Medium Percent: 14.4238495741
High Percent: 20.0059990814

In [40]:
df['apn_bin3'], bins = pd.qcut(df['mean_apn'], q=20, retbins=True)

CT = pd.crosstab(df['apn_bin3'], df['flag_all_AI'], margins=True)
CT['Percent'] = CT[1.0] / CT['All'] * 100
display(CT)
pprint(chi2_contingency(pd.crosstab(df['apn_bin3'], df['flag_all_AI'], margins=False)))


flag_all_AI 0.0 1.0 All Percent
apn_bin3
[0.00478, 0.292] 95447 1425 96872 1.471013
(0.292, 0.588] 92461 4334 96795 4.477504
(0.588, 0.943] 89905 6861 96766 7.090300
(0.943, 1.36] 87946 8866 96812 9.157956
(1.36, 1.841] 86444 10376 96820 10.716794
(1.841, 2.388] 85039 11784 96823 12.170662
(2.388, 3.0118] 84125 12694 96819 13.111063
(3.0118, 3.723] 83415 13364 96779 13.808781
(3.723, 4.545] 83001 13819 96820 14.272878
(4.545, 5.498] 82440 14364 96804 14.838230
(5.498, 6.629] 82594 14231 96825 14.697650
(6.629, 8] 83309 14500 97809 14.824812
(8, 9.674] 81834 14017 95851 14.623739
(9.674, 11.81] 82570 14217 96787 14.688956
(11.81, 14.63] 82508 14271 96779 14.745968
(14.63, 18.642] 82802 14009 96811 14.470463
(18.642, 24.807] 82948 13863 96811 14.319654
(24.807, 36] 82861 13954 96815 14.413056
(36, 65.829] 81495 15312 96807 15.817038
(65.829, 75777.337] 72268 24542 96810 25.350687
All 1685412 250803 1936215 12.953262
(38460.778611379239,
 0.0,
 19,
 array([[ 84323.91612708,  12548.08387292],
       [ 84256.89013875,  12538.10986125],
       [ 84231.6465847 ,  12534.3534153 ],
       [ 84271.68808423,  12540.31191577],
       [ 84278.65182327,  12541.34817673],
       [ 84281.26322542,  12541.73677458],
       [ 84277.78135589,  12541.21864411],
       [ 84242.96266065,  12536.03733935],
       [ 84278.65182327,  12541.34817673],
       [ 84264.72434518,  12539.27565482],
       [ 84283.00416018,  12541.99583982],
       [ 85139.54406303,  12669.45593697],
       [ 83435.16893114,  12415.83106886],
       [ 84249.9263997 ,  12537.0736003 ],
       [ 84242.96266065,  12536.03733935],
       [ 84270.81761685,  12540.18238315],
       [ 84270.81761685,  12540.18238315],
       [ 84274.29948637,  12540.70051363],
       [ 84267.33574732,  12539.66425268],
       [ 84269.94714946,  12540.05285054]]))

In [ ]: