Comparison of Coverage by Credible Interval



In [1]:

    
%run '../ipython_startup.py'









    



Importing commonly used libraries: 
            os, sys 
            numpy as np 
            scipy as sp 
            pandas as pd 
            matplotlib as mp 
            matplotlib.pyplot as plt
            datetime as dt 
            mclib_Python/flagging as fg

Creating project level variables: 
        MCLAB = /home/jfear/mclab 
        PROJ = /home/jfear/mclab/cegs_ase_paper 
        TODAY = 20160214

Adding ['scripts/mclib_Python', 'scripts/ase_Python'] to PYTHONPATH



In [4]:

    
# Import additional libraries
import cPickle as pickle
from scipy.stats import chi2_contingency
from sas7bdat import SAS7BDAT as SAS



In [6]:

    
# Import Data
with SAS(os.path.join(PROJ, 'sas_data/emp_bayesian_results_w_flags.sas7bdat')) as FH:
    df = FH.to_data_frame()









    



[emp_bayesian_results_w_flags.sas7bdat] header length 65536 != 8192
[emp_bayesian_results_w_flags.sas7bdat] [emp_bayesian_results_w_flags.sas7bdat] header length 65536 != 8192
[emp_bayesian_results_w_flags.sas7bdat] [emp_bayesian_results_w_flags.sas7bdat] [emp_bayesian_results_w_flags.sas7bdat] header length 65536 != 8192
WARNING:/home/jfear/mclab/cegs_ase_paper/sas_data/emp_bayesian_results_w_flags.sas7bdat:[emp_bayesian_results_w_flags.sas7bdat] [emp_bayesian_results_w_flags.sas7bdat] [emp_bayesian_results_w_flags.sas7bdat] header length 65536 != 8192



In [7]:

    
# Import drop list from 100 genome simulation
toDrop = pickle.load(open(os.path.join(PROJ, 'pipeline_output/100_genome_simulation/exonic_region_drop_list.pkl'), 'rb'))

# Drop exonic regions in drop list
print 'Original DataFrame has {} rows'.format(df.shape[0])
df = df[-df['fusion_id'].isin(toDrop)].copy()
print 'After dropping DataFrame has {} rows'.format(df.shape[0])









    



Original DataFrame has 1939828 rows
After dropping DataFrame has 1936215 rows



In [35]:

    
df['apn_bin'], bins = pd.qcut(df['mean_apn'], q=3, labels=['low', 'medium', 'high'], retbins=True)

display(pd.crosstab(df['apn_bin'], df['flag_all_AI'], margins=True))
pprint(chi2_contingency(pd.crosstab(df['apn_bin'], df['flag_all_AI'], margins=False)))









    






  
    
      flag_all_AI
      0.0
      1.0
      All
    
    
      apn_bin
      
      
      
    
  
  
    
      low
      593376
      52036
      645412
    
    
      medium
      551950
      93448
      645398
    
    
      high
      540086
      105319
      645405
    
    
      All
      1685412
      250803
      1936215
    
  








    



(21506.221051377073,
 0.0,
 2,
 array([[ 561810.09327167,   83601.90672833],
       [ 561797.90672833,   83600.09327167],
       [ 561804.        ,   83601.        ]]))



In [10]:

    
bins









    Out[10]:





array([  4.78468900e-03,   2.79458795e+00,   1.03217792e+01,
         7.57773368e+04])



In [23]:

    
((df['mean_apn'] > bins[0]) & (df['mean_apn'] <= bins[1])).sum()









    Out[23]:





645411



In [24]:

    
((df['mean_apn'] > bins[1]) & (df['mean_apn'] <= bins[2])).sum()









    Out[24]:





645398



In [25]:

    
((df['mean_apn'] > bins[2]) & (df['mean_apn'] <= bins[3])).sum()









    Out[25]:





645405

bin 1: .004 to 2.79 bin 2: 2.8 to 10.3 bin 3: 10.33



In [29]:

    
df['apn_bin2'] = False
df.loc[df['mean_apn'] <= 25, 'apn_bin2'] = 'Low'
df.loc[(df['mean_apn'] > 25) & (df['mean_apn'] <= 33), 'apn_bin2'] = 'Medium'
df.loc[(df['mean_apn'] > 33), 'apn_bin2'] = 'High'



In [30]:

    
display(pd.crosstab(df['apn_bin2'], df['flag_all_AI'], margins=True))
pprint(chi2_contingency(pd.crosstab(df['apn_bin2'], df['flag_all_AI'], margins=False)))









    






  
    
      flag_all_AI
      0.0
      1.0
      All
    
    
      apn_bin2
      
      
      
    
  
  
    
      High
      170680
      42686
      213366
    
    
      Low
      1450834
      197347
      1648181
    
    
      Medium
      63898
      10770
      74668
    
    
      All
      1685412
      250803
      1936215
    
  








    



(10958.627203809903,
 0.0,
 2,
 array([[  185728.14320311,    27637.85679689],
       [ 1434687.79839636,   213493.20160364],
       [   64996.05840054,     9671.94159946]]))



In [34]:

    
print('Low Percent: {}'.format(197347 / float(1648181) * 100))
print('Medium Percent: {}'.format(10770 / float(74668) * 100))
print('High Percent: {}'.format(42686 / float(213366) * 100))









    



Low Percent: 11.9736242561
Medium Percent: 14.4238495741
High Percent: 20.0059990814



In [40]:

    
df['apn_bin3'], bins = pd.qcut(df['mean_apn'], q=20, retbins=True)

CT = pd.crosstab(df['apn_bin3'], df['flag_all_AI'], margins=True)
CT['Percent'] = CT[1.0] / CT['All'] * 100
display(CT)
pprint(chi2_contingency(pd.crosstab(df['apn_bin3'], df['flag_all_AI'], margins=False)))









    






  
    
      flag_all_AI
      0.0
      1.0
      All
      Percent
    
    
      apn_bin3
      
      
      
      
    
  
  
    
      [0.00478, 0.292]
      95447
      1425
      96872
      1.471013
    
    
      (0.292, 0.588]
      92461
      4334
      96795
      4.477504
    
    
      (0.588, 0.943]
      89905
      6861
      96766
      7.090300
    
    
      (0.943, 1.36]
      87946
      8866
      96812
      9.157956
    
    
      (1.36, 1.841]
      86444
      10376
      96820
      10.716794
    
    
      (1.841, 2.388]
      85039
      11784
      96823
      12.170662
    
    
      (2.388, 3.0118]
      84125
      12694
      96819
      13.111063
    
    
      (3.0118, 3.723]
      83415
      13364
      96779
      13.808781
    
    
      (3.723, 4.545]
      83001
      13819
      96820
      14.272878
    
    
      (4.545, 5.498]
      82440
      14364
      96804
      14.838230
    
    
      (5.498, 6.629]
      82594
      14231
      96825
      14.697650
    
    
      (6.629, 8]
      83309
      14500
      97809
      14.824812
    
    
      (8, 9.674]
      81834
      14017
      95851
      14.623739
    
    
      (9.674, 11.81]
      82570
      14217
      96787
      14.688956
    
    
      (11.81, 14.63]
      82508
      14271
      96779
      14.745968
    
    
      (14.63, 18.642]
      82802
      14009
      96811
      14.470463
    
    
      (18.642, 24.807]
      82948
      13863
      96811
      14.319654
    
    
      (24.807, 36]
      82861
      13954
      96815
      14.413056
    
    
      (36, 65.829]
      81495
      15312
      96807
      15.817038
    
    
      (65.829, 75777.337]
      72268
      24542
      96810
      25.350687
    
    
      All
      1685412
      250803
      1936215
      12.953262
    
  








    



(38460.778611379239,
 0.0,
 19,
 array([[ 84323.91612708,  12548.08387292],
       [ 84256.89013875,  12538.10986125],
       [ 84231.6465847 ,  12534.3534153 ],
       [ 84271.68808423,  12540.31191577],
       [ 84278.65182327,  12541.34817673],
       [ 84281.26322542,  12541.73677458],
       [ 84277.78135589,  12541.21864411],
       [ 84242.96266065,  12536.03733935],
       [ 84278.65182327,  12541.34817673],
       [ 84264.72434518,  12539.27565482],
       [ 84283.00416018,  12541.99583982],
       [ 85139.54406303,  12669.45593697],
       [ 83435.16893114,  12415.83106886],
       [ 84249.9263997 ,  12537.0736003 ],
       [ 84242.96266065,  12536.03733935],
       [ 84270.81761685,  12540.18238315],
       [ 84270.81761685,  12540.18238315],
       [ 84274.29948637,  12540.70051363],
       [ 84267.33574732,  12539.66425268],
       [ 84269.94714946,  12540.05285054]]))



In [ ]:

flag_all_AI	0.0	1.0	All
apn_bin
low	593376	52036	645412
medium	551950	93448	645398
high	540086	105319	645405
All	1685412	250803	1936215

flag_all_AI	0.0	1.0	All
apn_bin2
High	170680	42686	213366
Low	1450834	197347	1648181
Medium	63898	10770	74668
All	1685412	250803	1936215

flag_all_AI	0.0	1.0	All	Percent
apn_bin3
[0.00478, 0.292]	95447	1425	96872	1.471013
(0.292, 0.588]	92461	4334	96795	4.477504
(0.588, 0.943]	89905	6861	96766	7.090300
(0.943, 1.36]	87946	8866	96812	9.157956
(1.36, 1.841]	86444	10376	96820	10.716794
(1.841, 2.388]	85039	11784	96823	12.170662
(2.388, 3.0118]	84125	12694	96819	13.111063
(3.0118, 3.723]	83415	13364	96779	13.808781
(3.723, 4.545]	83001	13819	96820	14.272878
(4.545, 5.498]	82440	14364	96804	14.838230
(5.498, 6.629]	82594	14231	96825	14.697650
(6.629, 8]	83309	14500	97809	14.824812
(8, 9.674]	81834	14017	95851	14.623739
(9.674, 11.81]	82570	14217	96787	14.688956
(11.81, 14.63]	82508	14271	96779	14.745968
(14.63, 18.642]	82802	14009	96811	14.470463
(18.642, 24.807]	82948	13863	96811	14.319654
(24.807, 36]	82861	13954	96815	14.413056
(36, 65.829]	81495	15312	96807	15.817038
(65.829, 75777.337]	72268	24542	96810	25.350687
All	1685412	250803	1936215	12.953262