ASE Genotype AI Summary



In [4]:

    
%run '../ipython_startup.py'

import cPickle as pkl
from sas7bdat import SAS7BDAT as SAS









    



Importing commonly used libraries: os, sys, numpy as np, scipy as sp, pandas as pd, matplotlib as mp, matplotlib.pyplot as plt, datetime as dt, mclib_Python/flagging as fg
Creating project level variables: MCLAB = /home/jfear/mclab, PROJ = /home/jfear/mclab/cegs_ase_paper, TODAY = 20150904
Adding ['scripts/mclib_Python', 'scripts/ase_Python'] to PYTHONPATH
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload



In [5]:

    
# Import Data
with SAS(os.path.join(PROJ, 'sas_data/clean_ase_sbs.sas7bdat')) as FH:
    df = FH.to_data_frame()

# Import drop list from 100 genome simulation
toDrop = pkl.load(open(os.path.join(PROJ, 'pipeline_output/100_genome_simulation/exonic_region_drop_list.pkl'), 'rb'))

# Drop exonic regions in drop list
print 'Original DataFrame has {} rows'.format(df.shape[0])
df = df[-df['fusion_id'].isin(toDrop)].copy()
print 'After dropping DataFrame has {} rows'.format(df.shape[0])









    



[clean_ase_sbs.sas7bdat] header length 65536 != 8192
[clean_ase_sbs.sas7bdat] [clean_ase_sbs.sas7bdat] header length 65536 != 8192
WARNING:/home/jfear/mclab/cegs_ase_paper/sas_data/clean_ase_sbs.sas7bdat:[clean_ase_sbs.sas7bdat] [clean_ase_sbs.sas7bdat] header length 65536 != 8192






    



Original DataFrame has 79967 rows
After dropping DataFrame has 79967 rows



In [6]:

    
# Get APN info for genotypes
with SAS(os.path.join(PROJ, 'sas_data/clean_ase_stack.sas7bdat')) as FH:
    apn = FH.to_data_frame()

apn_by_line = apn.groupby(['line', 'mating_status'])['mean_apn'].mean().unstack()
apn_by_line.columns = ['Mated APN', 'Virgin APN']









    



[clean_ase_stack.sas7bdat] header length 65536 != 8192
[clean_ase_stack.sas7bdat] [clean_ase_stack.sas7bdat] header length 65536 != 8192
WARNING:/home/jfear/mclab/cegs_ase_paper/sas_data/clean_ase_stack.sas7bdat:[clean_ase_stack.sas7bdat] [clean_ase_stack.sas7bdat] header length 65536 != 8192



In [7]:

    
# Make a bunch of flags for easy filtering
## Flag exons that have AI in both M and V 
df['flag_AI_both'] = 0
df.loc[(df['flag_AI_combined_m'] == 1) & (df['flag_AI_combined_v'] == 1), 'flag_AI_both'] = 1

## Flag exons that have AI in either M or V
df['flag_AI_either'] = 0
df.loc[(df['flag_AI_combined_m'] == 1) | (df['flag_AI_combined_v'] == 1), 'flag_AI_either'] = 1

## Flag AI in M only
df['m_only'] = 0
df.loc[(df['flag_AI_combined_m'] == 1) & (df['flag_AI_combined_v'] == 0), 'm_only'] = 1

## Flag AI in V only
df['v_only'] = 0
df.loc[(df['flag_AI_combined_m'] == 0) & (df['flag_AI_combined_v'] == 1), 'v_only'] = 1



In [8]:

    
# Get total number of exonic regions with AI
numFus = len(set(df[df['flag_AI_either'] == 1]['fusion_id'].values))
print('There were a total of {} exonic regions with AI'.format(numFus))









    



There were a total of 4090 exonic regions with AI



In [9]:

    
# Get total number of exonic regions with AI
numFusBoth = len(set(df[df['flag_AI_both'] == 1]['fusion_id'].values))
print('There were a total of {} exonic regions with AI in mated and virgin'.format(numFusBoth))









    



There were a total of 2712 exonic regions with AI in mated and virgin



In [10]:

    
# Summarize AI counts to the genotype level
AI = df.groupby('line')['flag_AI_either'].sum()
AI.name = 'flag_AI_either'

tested = df.groupby('line')['flag_AI_either'].count()
tested.name = 'num_tested'

pct = np.round(AI / tested * 100, decimals=2)
pct.name = 'percent'

AI_summary = pd.concat([AI, tested, pct], axis=1)


# Create Genotype Summary table with max and min counts and percents
## Do the calculations
genotype = list()
genotype.append(AI_summary.loc[AI_summary['flag_AI_either'].argmax()])
genotype.append(AI_summary.loc[AI_summary['flag_AI_either'].argmin()])
genotype.append(AI_summary.loc[AI_summary['percent'].argmax()])
genotype.append(AI_summary.loc[AI_summary['percent'].argmin()])

# Build the table
genotype_summary = pd.concat(genotype, axis=1).T
genotype_summary.reset_index(inplace=True)
genotype_summary.index = ['Max number of exons with AI', 'Min number of exons with AI', 'Max percent of exons with AI', 'Min percent of exons with AI']
genotype_summary.columns = ['Genotype', 'Number exons with AI', 'Number exons tested', 'Percent exons with AI']
genotype_summary.merge(np.round(apn_by_line, decimals=2), how='left', left_on='Genotype', right_index=True)









    Out[10]:






  
    
      
      Genotype
      Number exons with AI
      Number exons tested
      Percent exons with AI
      Mated APN
      Virgin APN
    
  
  
    
      Max number of exons with AI
      w47
      1244
      3003
      41.43
      234.12
      128.53
    
    
      Min number of exons with AI
      r21
      143
      608
      23.52
      116.74
      91.90
    
    
      Max percent of exons with AI
      r857
      486
      886
      54.85
      155.64
      222.24
    
    
      Min percent of exons with AI
      r380
      236
      2038
      11.58
      128.07
      120.69



In [11]:

    
# Distribution of percent AI
AI_summary['percent'].plot(kind='kde')
AI_summary['percent'].describe()









    Out[11]:





count    49.000000
mean     27.186327
std       9.633105
min      11.580000
25%      19.850000
50%      25.170000
75%      33.330000
max      54.850000
Name: percent, dtype: float64



In [12]:

    
# Genotype discordance
## Indicator up down M q5
df['ind_m_05'] = 0
df.loc[df['q5_mean_theta_m'] > 0.5, 'ind_m_05'] = 'Mated towards tester'
df.loc[df['q5_mean_theta_m'] < 0.5, 'ind_m_05'] = 'Mated towards line'

## Indicator up down V q5
df['ind_v_05'] = 0
df.loc[df['q5_mean_theta_v'] > 0.5, 'ind_v_05'] = 'Virgin towards tester'
df.loc[df['q5_mean_theta_v'] < 0.5, 'ind_v_05'] = 'Virgin towards line'

mated_discord = df[df['flag_AI_both'] == 1].groupby(['fusion_id', 'ind_m_05']).size().unstack().dropna()
mated_discord['Mated total'] = mated_discord.sum(axis=1)

virgin_discord = df[df['flag_AI_both'] == 1].groupby(['fusion_id', 'ind_v_05']).size().unstack().dropna()
virgin_discord['Virgin total'] = virgin_discord.sum(axis=1)

genotype_discord = mated_discord.join(virgin_discord, how='outer')
display(genotype_discord.head())
print('There is a total of {} exonic regions with some evidence of discordance.'.format(genotype_discord.shape[0]))
print('{} exonic regions have at least 10 genotypes in mated'.format((genotype_discord['Mated total'] >= 10).sum()))
print('{} exonic regions have at least 10 genotypes in virgin'.format((genotype_discord['Virgin total'] >= 10).sum()))

genotype_discord.to_excel(os.path.join(PROJ, 'pipeline_output/ase_summary/genotype_discordance.xls'))









    






  
    
      
      Mated towards line
      Mated towards tester
      Mated total
      Virgin towards line
      Virgin towards tester
      Virgin total
    
    
      fusion_id
      
      
      
      
      
      
    
  
  
    
      F10060_SI
      4
      3
      7
      4
      3
      7
    
    
      F10136_SI
      1
      3
      4
      1
      3
      4
    
    
      F10268_SI
      1
      3
      4
      1
      3
      4
    
    
      F10523_SI
      NaN
      NaN
      NaN
      1
      1
      2
    
    
      F11773_SI
      3
      2
      5
      3
      2
      5
    
  








    



There is a total of 548 exonic regions with some evidence of discordance.
112 exonic regions have at least 10 genotypes in mated
118 exonic regions have at least 10 genotypes in virgin



In [19]:

    
both_discord = genotype_discord[(genotype_discord['Mated total'] >= 10) & (genotype_discord['Virgin total'] >= 10)]
print both_discord.shape[0]
both_discord.sort('Mated total')









    



112






    Out[19]:






  
    
      
      Mated towards line
      Mated towards tester
      Mated total
      Virgin towards line
      Virgin towards tester
      Virgin total
    
    
      fusion_id
      
      
      
      
      
      
    
  
  
    
      F13385_SI
      5
      5
      10
      6
      4
      10
    
    
      S37763_SI
      2
      8
      10
      2
      8
      10
    
    
      F62997_SI
      2
      8
      10
      2
      8
      10
    
    
      S11514_SI
      8
      2
      10
      8
      2
      10
    
    
      S13372_SI
      4
      6
      10
      4
      6
      10
    
    
      F53196_SI
      9
      1
      10
      9
      1
      10
    
    
      F46859_SI
      8
      2
      10
      8
      2
      10
    
    
      F46432_SI
      4
      6
      10
      5
      5
      10
    
    
      S4199_SI
      1
      9
      10
      2
      8
      10
    
    
      S43484_SI
      6
      4
      10
      6
      4
      10
    
    
      S16644_SI
      2
      8
      10
      2
      8
      10
    
    
      F9836_SI
      4
      6
      10
      4
      6
      10
    
    
      F4373_SI
      2
      8
      10
      2
      8
      10
    
    
      S29178_SI
      2
      8
      10
      2
      8
      10
    
    
      S25923_SI
      1
      9
      10
      1
      9
      10
    
    
      S54321_SI
      1
      9
      10
      1
      9
      10
    
    
      S17629_SI
      1
      9
      10
      1
      9
      10
    
    
      S5525_SI
      8
      2
      10
      9
      1
      10
    
    
      F36300_SI
      7
      3
      10
      8
      2
      10
    
    
      F15820_SI
      8
      2
      10
      7
      3
      10
    
    
      F15821_SI
      3
      7
      10
      1
      9
      10
    
    
      F28147_SI
      7
      3
      10
      7
      3
      10
    
    
      F26668_SI
      3
      7
      10
      3
      7
      10
    
    
      F26657_SI
      2
      8
      10
      2
      8
      10
    
    
      S6467_SI
      6
      4
      10
      5
      5
      10
    
    
      F38056_SI
      6
      4
      10
      6
      4
      10
    
    
      S62457_SI
      2
      8
      10
      2
      8
      10
    
    
      S36492_SI
      1
      10
      11
      1
      10
      11
    
    
      S37758_SI
      2
      9
      11
      2
      9
      11
    
    
      S10190_SI
      1
      10
      11
      1
      10
      11
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      F37523_SI
      2
      15
      17
      2
      15
      17
    
    
      S44694_SI
      3
      14
      17
      3
      14
      17
    
    
      S18092_SI
      1
      17
      18
      1
      17
      18
    
    
      S4197_SI
      13
      5
      18
      13
      5
      18
    
    
      F22521_SI
      3
      16
      19
      4
      15
      19
    
    
      S37764_SI
      8
      11
      19
      7
      12
      19
    
    
      S29177_SI
      1
      18
      19
      1
      18
      19
    
    
      F44391_SI
      2
      18
      20
      2
      18
      20
    
    
      F16103_SI
      17
      3
      20
      19
      1
      20
    
    
      S17997_SI
      17
      4
      21
      17
      4
      21
    
    
      F63396_SI
      16
      5
      21
      15
      6
      21
    
    
      S42358_SI
      17
      4
      21
      17
      4
      21
    
    
      S39381_SI
      7
      15
      22
      7
      15
      22
    
    
      S3602_SI
      17
      5
      22
      18
      4
      22
    
    
      F672_SI
      19
      3
      22
      20
      2
      22
    
    
      S62122_SI
      17
      5
      22
      17
      5
      22
    
    
      S26386_SI
      5
      18
      23
      5
      18
      23
    
    
      S15823_SI
      3
      21
      24
      2
      22
      24
    
    
      F39345_SI
      12
      12
      24
      12
      12
      24
    
    
      S26631_SI
      14
      10
      24
      14
      10
      24
    
    
      S43600_SI
      15
      11
      26
      15
      11
      26
    
    
      S26385_SI
      21
      5
      26
      21
      5
      26
    
    
      S44768_SI
      25
      2
      27
      25
      2
      27
    
    
      S44389_SI
      26
      4
      30
      26
      4
      30
    
    
      S37765_SI
      12
      19
      31
      12
      19
      31
    
    
      S21256_SI
      31
      1
      32
      31
      1
      32
    
    
      S21257_SI
      28
      6
      34
      28
      6
      34
    
    
      S15131_SI
      1
      33
      34
      1
      33
      34
    
    
      F5339_SI
      29
      7
      36
      29
      7
      36
    
    
      F40168_SI
      31
      7
      38
      29
      9
      38
    
  

112 rows × 6 columns



In [13]:

    
# Flag AI in both, but theta in different directions Environment
df['diff_direction_env'] = 0
df.loc[(df['flag_AI_combined_m'] == 1) & (df['flag_AI_combined_v'] == 1) & 
       (((df['q5_mean_theta_m'] > 0.5) & (df['q5_mean_theta_v'] < 0.5)) | ((df['q5_mean_theta_m'] < 0.5) & (df['q5_mean_theta_v'] > 0.5))), 
       'diff_direction_env'] = 1

	Genotype	Number exons with AI	Number exons tested	Percent exons with AI	Mated APN	Virgin APN
Max number of exons with AI	w47	1244	3003	41.43	234.12	128.53
Min number of exons with AI	r21	143	608	23.52	116.74	91.90
Max percent of exons with AI	r857	486	886	54.85	155.64	222.24
Min percent of exons with AI	r380	236	2038	11.58	128.07	120.69

	Mated towards line	Mated towards tester	Mated total	Virgin towards line	Virgin towards tester	Virgin total
fusion_id
F10060_SI	4	3	7	4	3	7
F10136_SI	1	3	4	1	3	4
F10268_SI	1	3	4	1	3	4
F10523_SI	NaN	NaN	NaN	1	1	2
F11773_SI	3	2	5	3	2	5

	Mated towards line	Mated towards tester	Mated total	Virgin towards line	Virgin towards tester	Virgin total
fusion_id
F13385_SI	5	5	10	6	4	10
S37763_SI	2	8	10	2	8	10
F62997_SI	2	8	10	2	8	10
S11514_SI	8	2	10	8	2	10
S13372_SI	4	6	10	4	6	10
F53196_SI	9	1	10	9	1	10
F46859_SI	8	2	10	8	2	10
F46432_SI	4	6	10	5	5	10
S4199_SI	1	9	10	2	8	10
S43484_SI	6	4	10	6	4	10
S16644_SI	2	8	10	2	8	10
F9836_SI	4	6	10	4	6	10
F4373_SI	2	8	10	2	8	10
S29178_SI	2	8	10	2	8	10
S25923_SI	1	9	10	1	9	10
S54321_SI	1	9	10	1	9	10
S17629_SI	1	9	10	1	9	10
S5525_SI	8	2	10	9	1	10
F36300_SI	7	3	10	8	2	10
F15820_SI	8	2	10	7	3	10
F15821_SI	3	7	10	1	9	10
F28147_SI	7	3	10	7	3	10
F26668_SI	3	7	10	3	7	10
F26657_SI	2	8	10	2	8	10
S6467_SI	6	4	10	5	5	10
F38056_SI	6	4	10	6	4	10
S62457_SI	2	8	10	2	8	10
S36492_SI	1	10	11	1	10	11
S37758_SI	2	9	11	2	9	11
S10190_SI	1	10	11	1	10	11
...	...	...	...	...	...	...
F37523_SI	2	15	17	2	15	17
S44694_SI	3	14	17	3	14	17
S18092_SI	1	17	18	1	17	18
S4197_SI	13	5	18	13	5	18
F22521_SI	3	16	19	4	15	19
S37764_SI	8	11	19	7	12	19
S29177_SI	1	18	19	1	18	19
F44391_SI	2	18	20	2	18	20
F16103_SI	17	3	20	19	1	20
S17997_SI	17	4	21	17	4	21
F63396_SI	16	5	21	15	6	21
S42358_SI	17	4	21	17	4	21
S39381_SI	7	15	22	7	15	22
S3602_SI	17	5	22	18	4	22
F672_SI	19	3	22	20	2	22
S62122_SI	17	5	22	17	5	22
S26386_SI	5	18	23	5	18	23
S15823_SI	3	21	24	2	22	24
F39345_SI	12	12	24	12	12	24
S26631_SI	14	10	24	14	10	24
S43600_SI	15	11	26	15	11	26
S26385_SI	21	5	26	21	5	26
S44768_SI	25	2	27	25	2	27
S44389_SI	26	4	30	26	4	30
S37765_SI	12	19	31	12	19	31
S21256_SI	31	1	32	31	1	32
S21257_SI	28	6	34	28	6	34
S15131_SI	1	33	34	1	33	34
F5339_SI	29	7	36	29	7	36
F40168_SI	31	7	38	29	9	38