In [1]:
    
# Execute the shared startup script so the names it defines land in this notebook's namespace.
# NOTE(review): later cells use os, PROJ, pd, display and pprint without importing them
# here -- presumably they are provided by this script; confirm.
%run '../ipython_startup.py'
    
    
In [4]:
    
# Import additional libraries
import cPickle as pickle
from scipy.stats import chi2_contingency
from sas7bdat import SAS7BDAT as SAS
    
In [6]:
    
# Read the flagged empirical Bayesian results from the SAS dataset into a DataFrame
sas_path = os.path.join(PROJ, 'sas_data/emp_bayesian_results_w_flags.sas7bdat')
with SAS(sas_path) as sas_reader:
    df = sas_reader.to_data_frame()
    
    
In [7]:
    
# Import drop list from 100 genome simulation.
# NOTE: unpickling can execute arbitrary code; only load drop lists written by this
# project's own pipeline.
drop_list_path = os.path.join(PROJ, 'pipeline_output/100_genome_simulation/exonic_region_drop_list.pkl')
with open(drop_list_path, 'rb') as drop_fh:  # context manager closes the file handle
    toDrop = pickle.load(drop_fh)

# Drop exonic regions in drop list
print('Original DataFrame has {} rows'.format(df.shape[0]))
# `~` is the boolean NOT for a mask; unary `-` on a boolean mask is rejected by newer numpy/pandas
df = df[~df['fusion_id'].isin(toDrop)].copy()
print('After dropping DataFrame has {} rows'.format(df.shape[0]))
    
    
In [35]:
    
# Split mean APN into three quantile bins; keep the bin edges in `bins` for the
# range-count cells that follow
apn_terciles, bins = pd.qcut(
    df['mean_apn'],
    q=3,
    labels=['low', 'medium', 'high'],
    retbins=True
)
df['apn_bin'] = apn_terciles

# Counts of flag_all_AI per bin, including the row/column totals
tercile_counts_with_totals = pd.crosstab(df['apn_bin'], df['flag_all_AI'], margins=True)
display(tercile_counts_with_totals)

# Chi-square test on the same table without the totals
tercile_counts = pd.crosstab(df['apn_bin'], df['flag_all_AI'], margins=False)
pprint(chi2_contingency(tercile_counts))
    
    
    
In [10]:
    
# Display the bin edges currently held in `bins`
bins
    
    Out[10]:
In [23]:
    
# Number of rows in the first bin. pd.qcut's first interval includes its lowest edge,
# so the lower bound is inclusive here; a strict `>` would miss rows equal to the minimum.
((df['mean_apn'] >= bins[0]) & (df['mean_apn'] <= bins[1])).sum()
    
    Out[23]:
In [24]:
    
# Number of rows with mean_apn in the half-open range (bins[1], bins[2]]
in_second_bin = df['mean_apn'].gt(bins[1]) & df['mean_apn'].le(bins[2])
in_second_bin.sum()
    
    Out[24]:
In [25]:
    
# Number of rows with mean_apn in the half-open range (bins[2], bins[3]]
in_third_bin = df['mean_apn'].gt(bins[2]) & df['mean_apn'].le(bins[3])
in_third_bin.sum()
    
    Out[25]:
Bin ranges: bin 1: 0.004 to 2.79; bin 2: 2.8 to 10.3; bin 3: 10.33 and above.
In [29]:
    
# Manual APN classes from fixed cut points (25 and 33) rather than quantiles.
# Rows that match none of the three conditions keep the initial False
# (comparisons against a missing mean_apn are all False).
apn = df['mean_apn']
df['apn_bin2'] = False
df.loc[apn > 33, 'apn_bin2'] = 'High'
df.loc[(apn > 25) & (apn <= 33), 'apn_bin2'] = 'Medium'
df.loc[apn <= 25, 'apn_bin2'] = 'Low'
    
In [30]:
    
# Counts of flag_all_AI per manual APN class, including the row/column totals
manual_counts_with_totals = pd.crosstab(df['apn_bin2'], df['flag_all_AI'], margins=True)
display(manual_counts_with_totals)

# Chi-square test on the same table without the totals
manual_counts = pd.crosstab(df['apn_bin2'], df['flag_all_AI'], margins=False)
pprint(chi2_contingency(manual_counts))
    
    
    
In [34]:
    
# Percent per APN class, computed as count / total * 100.
# The counts and totals are hard-coded in this cell rather than read from a table.
for message, count, total in [
    ('Low Percent: {}', 197347, 1648181),
    ('Medium Percent: {}', 10770, 74668),
    ('High Percent: {}', 42686, 213366),
]:
    print(message.format(count / float(total) * 100))
    
    
In [40]:
    
# Split mean APN into 20 quantile bins. The edges get their own name so the
# three-bin edges held in `bins` (read by the range-count cells) are not overwritten,
# which would make those cells depend on execution order.
df['apn_bin3'], apn_bin3_edges = pd.qcut(df['mean_apn'], q=20, retbins=True)

# Table with totals for display; the 'All' column is the per-bin row total
CT = pd.crosstab(df['apn_bin3'], df['flag_all_AI'], margins=True)
# Percent of each row (including the 'All' row) that falls in the flag_all_AI == 1.0 column
CT['Percent'] = CT[1.0] / CT['All'] * 100
display(CT)

# Chi-square test on the same table without the totals
apn_bin3_counts = pd.crosstab(df['apn_bin3'], df['flag_all_AI'], margins=False)
pprint(chi2_contingency(apn_bin3_counts))
    
    
    
In [ ]: