In [1]:
%run '../ipython_startup.py'
In [4]:
# Import additional libraries
import cPickle as pickle
from scipy.stats import chi2_contingency
from sas7bdat import SAS7BDAT as SAS
In [6]:
# Import Data
# Load the emp_bayesian_results_w_flags SAS dataset into a DataFrame.
# NOTE(review): `os` and `PROJ` are not defined in the visible cells; presumably
# they are provided by ipython_startup.py -- confirm.
with SAS(os.path.join(PROJ, 'sas_data/emp_bayesian_results_w_flags.sas7bdat')) as FH:
    # The read must happen inside the `with` so the SAS file handle is still open.
    df = FH.to_data_frame()
In [7]:
# Import drop list from 100 genome simulation
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this pipeline.
drop_list_path = os.path.join(PROJ, 'pipeline_output/100_genome_simulation/exonic_region_drop_list.pkl')
with open(drop_list_path, 'rb') as drop_fh:
    # Context manager guarantees the file handle is closed after loading.
    toDrop = pickle.load(drop_fh)

# Drop exonic regions in drop list
print('Original DataFrame has {} rows'.format(df.shape[0]))
# `~` is the idiomatic element-wise NOT for a boolean mask
# (NumPy rejects unary `-` on boolean arrays).
df = df[~df['fusion_id'].isin(toDrop)].copy()
print('After dropping DataFrame has {} rows'.format(df.shape[0]))
In [35]:
# Split mean_apn into three equal-frequency bins and keep the bin edges.
df['apn_bin'], bins = pd.qcut(
    df['mean_apn'],
    q=3,
    labels=['low', 'medium', 'high'],
    retbins=True,
)

# Contingency table of APN bin vs. flag_all_AI, with row/column totals, for display.
display(pd.crosstab(df['apn_bin'], df['flag_all_AI'], margins=True))

# Chi-square test on the same table without the totals.
apn_bin_counts = pd.crosstab(df['apn_bin'], df['flag_all_AI'])
pprint(chi2_contingency(apn_bin_counts))
In [10]:
# Show the bin edges captured from pd.qcut (retbins=True).
bins
Out[10]:
In [23]:
# Count rows falling in the first qcut bin.
# pd.qcut's lowest bin includes its left edge (the minimum of mean_apn), so use
# >= on bins[0]; a strict > would leave out the rows sitting exactly at the minimum.
((df['mean_apn'] >= bins[0]) & (df['mean_apn'] <= bins[1])).sum()
Out[23]:
In [24]:
# Count rows with bins[1] < mean_apn <= bins[2].
(df['mean_apn'].gt(bins[1]) & df['mean_apn'].le(bins[2])).sum()
Out[24]:
In [25]:
# Count rows with bins[2] < mean_apn <= bins[3].
(df['mean_apn'].gt(bins[2]) & df['mean_apn'].le(bins[3])).sum()
Out[25]:
Bin 1: 0.004 to 2.79; bin 2: 2.8 to 10.3; bin 3: 10.33 and above.
In [29]:
# Manual three-way binning of mean_apn with fixed cut points at 25 and 33.
# Rows matching none of the conditions below keep the False placeholder.
df['apn_bin2'] = False
df.loc[df['mean_apn'].le(25), 'apn_bin2'] = 'Low'
df.loc[df['mean_apn'].gt(25) & df['mean_apn'].le(33), 'apn_bin2'] = 'Medium'
df.loc[df['mean_apn'].gt(33), 'apn_bin2'] = 'High'
In [30]:
# Contingency table of the manual APN bins vs. flag_all_AI, with totals, for display.
display(pd.crosstab(df['apn_bin2'], df['flag_all_AI'], margins=True))

# Chi-square test on the same table without the totals.
apn_bin2_counts = pd.crosstab(df['apn_bin2'], df['flag_all_AI'])
pprint(chi2_contingency(apn_bin2_counts))
In [34]:
# Percent of rows with flag_all_AI == 1.0 within each manual APN bin.
# Computed from the crosstab instead of hand-copied counts so the figures
# cannot go stale when the upstream data or the drop list changes.
# NOTE(review): the previous literals (197347/1648181, 10770/74668, 42686/213366)
# look like the 1.0 and 'All' columns of this table -- confirm the output is unchanged.
apn_bin2_ct = pd.crosstab(df['apn_bin2'], df['flag_all_AI'], margins=True)
print('Low Percent: {}'.format(float(apn_bin2_ct.loc['Low', 1.0]) / float(apn_bin2_ct.loc['Low', 'All']) * 100))
print('Medium Percent: {}'.format(float(apn_bin2_ct.loc['Medium', 1.0]) / float(apn_bin2_ct.loc['Medium', 'All']) * 100))
print('High Percent: {}'.format(float(apn_bin2_ct.loc['High', 1.0]) / float(apn_bin2_ct.loc['High', 'All']) * 100))
In [40]:
# Split mean_apn into 20 equal-frequency bins; `bins` is rebound to the new edges.
df['apn_bin3'], bins = pd.qcut(
    df['mean_apn'],
    q=20,
    retbins=True,
)

# Contingency table with totals, plus the percent of rows with flag_all_AI == 1.0 per row.
CT = pd.crosstab(df['apn_bin3'], df['flag_all_AI'], margins=True)
CT['Percent'] = CT[1.0] / CT['All'] * 100
display(CT)

# Chi-square test on the table without totals (and without the Percent column).
apn_bin3_counts = pd.crosstab(df['apn_bin3'], df['flag_all_AI'])
pprint(chi2_contingency(apn_bin3_counts))
In [ ]: