Which genes are turned on/off in some cancers but not others?



In [1]:

    
import NotebookImport
from DX_screen import *
from Figures.Regression import *









    




importing IPython notebook from DX_screen






    




importing IPython notebook from Imports






    




importing IPython notebook from /cellar/users/agross/anaconda2/lib/python2.7/site-packages/MethylTools/Probe_Annotations



In [2]:

    
def dx_group(split, matched_tn, fc=1.):
    tab_1 = binomial_test_screen(matched_tn.ix[:, ti(split==True)], fc=fc)
    tab_1 = tab_1.ix[(((tab_1.frac - .5).abs() > .25) & (tab_1.p > .001)) == False]
    
    tab_0 = binomial_test_screen(matched_tn.ix[:, ti(split==False)], fc=fc)
    tab_0 = tab_0.ix[(((tab_0.frac - .5).abs() > .25) & (tab_0.p > .001)) == False]
    
    df = matched_tn.ix[:, ti(split==True)]
    ttest_1 = df.apply(ttest_rel, axis=1)
    
    df = matched_tn.ix[:, ti(split==False)]
    ttest_0 = df.apply(ttest_rel, axis=1)
    
    t0 = pd.concat([tab_0, ttest_0], keys=['binom','ttest'], axis=1)
    t1 = pd.concat([tab_1, ttest_1], keys=['binom','ttest'], axis=1)
    t = pd.concat([t0, t1], keys=['miss','hit'], axis=1)
    return t

TP53



In [3]:

    
mut_all = pd.read_csv('/cellar/users/agross/TCGA_Code/TCGA/Data/MAFs_new_2/meta.csv', 
                      index_col=[1,2])
mut = mut_all['0'].unstack().fillna(0)
mut = FH.fix_barcode_columns(mut)
mut = mut.xs('01', axis=1, level=1)



In [5]:

    
p53_dx = dx_group(mut.ix['TP53'].dropna()>0, matched_rna, fc=1.)



In [6]:

    
gg = ti((p53_dx.hit.binom.frac - p53_dx.miss.binom.frac).dropna() > .25)
rm = p53_dx.miss.binom.frac 
rh = p53_dx.hit.binom.frac
r = ((rm < .75) & (rm > .25)) | ((rh < .75) & (rh > .25))
t = (p53_dx.miss.binom.num_dx > 50) & (p53_dx.hit.binom.num_dx > 50)
gg_up = gg.intersection(ti(r & t))
len(gg_up)









    Out[6]:





378



In [7]:

    
v = pd.Series(1, gg_up).ix[ti(t)].fillna(0)
mm = run_mgsa(v==1)
mm.sort('estimate', ascending=False).head()









    Out[7]:






  
    
      
      inPopulation
      inStudySet
      estimate
      std.error
    
  
  
    
      KEGG_STEROID_BIOSYNTHESIS
       17
       3
       0.20
       0.02
    
    
      KEGG_BETA_ALANINE_METABOLISM
       22
       3
       0.08
       0.01
    
    
      KEGG_ASCORBATE_AND_ALDARATE_METABOLISM
       24
       3
       0.05
       0.01
    
    
      REACTOME_FANCONI_ANEMIA_PATHWAY
       15
       2
       0.03
       0.01
    
    
      REACTOME_RNA_POLYMERASE_III_TRANSCRIPTION_INITIATION
       29
       3
       0.02
       0.01



In [8]:

    
gg = ti((p53_dx.hit.binom.frac - p53_dx.miss.binom.frac).dropna() < -.25)
rm = p53_dx.miss.binom.frac 
rh = p53_dx.hit.binom.frac
r = ((rm < .75) & (rm > .25)) | ((rh < .75) & (rh > .25))
t = (p53_dx.miss.binom.num_dx > 50) & (p53_dx.hit.binom.num_dx > 50)
gg_down = gg.intersection(ti(r & t))
len(gg_down)









    Out[8]:





188



In [9]:

    
v = pd.Series(1, gg_down).ix[ti(t)].fillna(0)
mm = run_mgsa(v==1)
mm.sort('estimate', ascending=False).head()









    Out[9]:






  
    
      
      inPopulation
      inStudySet
      estimate
      std.error
    
  
  
    
      REACTOME_TOLL_LIKE_RECEPTOR_4_CASCADE
       28
       4
       0.34
       0.02
    
    
      ST_GA12_PATHWAY
       22
       3
       0.17
       0.01
    
    
      REACTOME_PROSTANOID_HORMONES
       11
       2
       0.09
       0.00
    
    
      KEGG_P53_SIGNALING_PATHWAY
       68
       6
       0.08
       0.00
    
    
      REACTOME_P2Y_RECEPTORS
       13
       2
       0.06
       0.01



In [25]:

    
ax = None
s1, s2 = p53_dx.hit.binom.frac, p53_dx.miss.binom.frac.ix[ti(t)]
fig, ax = init_ax(ax, figsize=(4, 4))
ax.hexbin(*match_series(s1,s2), gridsize=80, bins='log', cmap=plt.cm.Greys)
ax.annotate('r = {0:.2}'.format(Tests.pearson_pandas(s1, s2)['rho']), (.95, .02),
             xycoords='axes fraction', ha='right', va='bottom', size=14)

series_scatter(s1.ix[gg_up], s2.ix[gg_up], ax=ax, s=10, alpha=.3,
               color=colors[4], ann=None)

series_scatter(s1.ix[gg_down], s2.ix[gg_down], ax=ax, s=10, alpha=.3,
               color=colors[3], ann=None)

#series_scatter(s1.ix[['MCM8']], s2.ix[['MCM8']], ax=ax, s=10, alpha=1,
#               color=colors[1], ann=None)

prettify_ax(ax)
ax.set_ybound(-.05,1.05)
ax.set_xbound(-.05,1.05)
ax.set_xlabel('TP53 mutant\nfraction overexpressed')
ax.set_ylabel('TP53 WT\nfraction overexpressed')
ax.set_xticks([0, .5, 1])
ax.set_yticks([0, .5, 1])
plt.savefig('/cellar/users/agross/figures/p53_dx.pdf', transparent=True)



In [11]:

    
fig, axs = subplots(1,2, figsize=(6,3))
ax = axs[0]
hit = p53_dx.hit.binom.frac.ix[ti(gs2['REACTOME_PROCESSING_OF_CAPPED_INTRON_CONTAINING_PRE_MRNA']>0)]
hit.name= 'TP53 Mut.'
miss = p53_dx.miss.binom.frac.ix[ti(gs2['REACTOME_PROCESSING_OF_CAPPED_INTRON_CONTAINING_PRE_MRNA']>0)]
miss.name = 'TP53 WT'
series_scatter(miss, hit, s=20, alpha=.6, ax=ax, ann=None)

ax = axs[1]
hit = p53_dx.hit.binom.frac.ix[ti(gs2['REACTOME_SIGNALING_IN_IMMUNE_SYSTEM']>0)]
hit.name= 'TP53 Mut.'
miss = p53_dx.miss.binom.frac.ix[ti(gs2['REACTOME_SIGNALING_IN_IMMUNE_SYSTEM']>0)]
miss.name = 'TP53 WT'
series_scatter(miss, hit, s=20, alpha=.6, ax=ax, ann=None)

for ax in axs:
    ax.plot([0,1],[0,1], color='grey')
    ax.set_xbound(0,1)
    ax.set_ybound(0,1)
    prettify_ax(ax)
fig.tight_layout()
fig.savefig('/cellar/users/agross/figures/capped_intron_p53.png', dpi=150)



In [33]:

    
p53_dx.ix['MCM8']









    Out[33]:





miss  binom  num_ox    1.90e+02
             num_dx    3.62e+02
             frac      5.25e-01
             p         3.72e-01
      ttest  p         8.17e-03
             t         2.66e+00
hit   binom  num_ox    1.41e+02
             num_dx    1.58e+02
             frac      8.92e-01
             p         1.70e-25
      ttest  p         4.07e-30
             t         1.43e+01
Name: MCM8, dtype: float64



In [13]:

    
v = matched_tn.ix['HSP90AA1'][:, '01'] - matched_tn.ix['HSP90AA1'][:, '11']
violin_plot_pandas(mut.ix['TP53']>0, v)



In [14]:

    
fig, axs = subplots(1,2,figsize=(6,3), sharey=True)
violin_plot_series(matched_tn.ix['HSP90AA1'].ix[ti(mut.ix['TP53']==0)], ax=axs[0])
violin_plot_series(matched_tn.ix['HSP90AA1'].ix[ti(mut.ix['TP53']>0)], ax=axs[1])
for ax in axs:
    prettify_ax(ax)
    #ax.set_ybound(8,13)
    ax.set_xticklabels(['Tumor','Normal'])
axs[0].set_xlabel('TP53 WT (n=362)')
axs[1].set_xlabel('TP53 Mut. (n=158)')









    Out[14]:





<matplotlib.text.Text at 0x7f5958a00dd0>



In [32]:

    
fig, axs = subplots(1,2,figsize=(6,3), sharey=True)
violin_plot_series(matched_tn.ix['MCM8'].ix[ti(mut.ix['TP53']==0)], ax=axs[0],
                   order=['11','01'])
violin_plot_series(matched_tn.ix['MCM8'].ix[ti(mut.ix['TP53']>0)], ax=axs[1],
                   order=['11','01'])
for ax in axs:
    prettify_ax(ax)
    ax.set_ybound(4,11)
    ax.set_xticklabels(['Normal','Tumor'])
axs[0].set_xlabel('TP53 WT (n=362)')
axs[1].set_xlabel('TP53 Mut. (n=158)')
fig.savefig('/cellar/users/agross/figures/MCM8.pdf', transparent=True)



In [30]:

    
fig, axs = subplots(1,2,figsize=(6,3), sharey=True)
violin_plot_series(matched_tn.ix['MDM2'].ix[ti(mut.ix['TP53']==0)], ax=axs[0],
                   order=['11','01'])
violin_plot_series(matched_tn.ix['MDM2'].ix[ti(mut.ix['TP53']>0)], ax=axs[1],
                   order=['11','01'])
for ax in axs:
    prettify_ax(ax)
    ax.set_ybound(8,13)
    ax.set_xticklabels(['Normal','Tumor'])
axs[0].set_xlabel('TP53 WT (n=362)')
axs[1].set_xlabel('TP53 Mut. (n=158)')
fig.savefig('/cellar/users/agross/figures/MDM2.pdf', transparent=True)



In [16]:

    
def paired_bp_tn_split(vec, assignment, ax=None, split_vals=('01', '11'),
                       data_type='gene expression', order=None):
    """
    Paired boxplot for a single Series, with splitting on the index,
    grouped by assignment. I.E. Tumor-Normal gene expression split by
    cancer.
    
    vec: 
        vector of values to plot.
    assignment: 
        vector mapping keys to group assignment
    ax (None):
        matplotlib axis to plot on or None
    split_vals ('01','11'):
        Values to split the boxplot pairing on. The default of 
        ('01','11') indicates tumor vs. normal in the standard 
        TCGA barcode nomenclature.  This should coorespond to values
        on the second level of the index for vec and assignment.
        
    **both vec and assignment should have an overlapping index with
    multiple levels**
    """
    _, ax = init_ax(ax, figsize=(8, 3))
    if vec.name != None:
        label = vec.name  # lose label in manipulation
    else:
        label = ''
    g1 = split_vals[0]
    g2 = split_vals[1]
    vec = pd.concat([vec[:, g1], vec[:, g2]], keys=[g1, g2],
                    axis=1)
    vec = vec.dropna().stack()

    counts = vec.unstack().groupby(assignment).size()
    if order is None:
        groups = list(true_index(counts > 5))
        groups = vec.unstack().groupby(assignment).median()[g1].ix[groups]
        groups = groups.order().index[::-1]
    else:
        groups = order

    l1 = [np.array(vec[:, g1].ix[true_index(assignment == c)].dropna())
          for c in groups]
    l2 = [np.array(vec[:, g2].ix[true_index(assignment == c)].dropna())
          for c in groups]
    boxes = [x for t in zip(l1, l2) for x in t if len(t[1]) > 5]

    ax, bp = paired_boxplot(boxes, ax)
    labels = ['{}\n({})'.format(c, counts[c]) for c in groups]
    ax.set_xticklabels(labels)
    prettify_ax(ax)
    ax.set_ylabel('{} {}'.format(label, data_type))



In [17]:

    
d = pd.concat([codes, (mut.ix['TP53']>0).map({True: 'mut', False:'wt'})], axis=1).dropna()
d = d.ix[matched_rna.columns.get_level_values(0).unique()].dropna()
d = d.apply(lambda s: s.ix[0] + '\n' + s.ix[1], 1)
o = matched_rna.ix['EDA2R'][:,'11'].groupby(codes).mean().order().index
o = [v for c in o for v in [c + '\nwt', c + '\nmut'] if v in list(d)]



In [18]:

    
len(cancers)









    Out[18]:





27



In [19]:

    
p53_mut = (mut.ix['TP53']>0).map({True: 'mut', False:'wt'})
pt = matched_rna.columns.get_level_values(0).unique()
ct = pd.crosstab(codes.ix[pt], p53_mut)
cc = ti(ct.min(1) > 10)
len(cc)









    Out[19]:





6



In [20]:

    
fig, axs = subplots(1,6, figsize=(10,3), sharey=False)
for i,c in enumerate(cc):
    ax = axs[i]
    paired_bp_tn_split(matched_rna.ix['MDM2'].ix[ti(codes==c)], p53_mut, ax=ax)
    if i > 0:
        ax.set_ylabel('')
    ax.set_xlabel(c)
fig.tight_layout()



In [21]:

    
paired_bp_tn_split(matched_rna.ix['MCM8'].ix[ti(mut.ix['TP53']==0)], codes)



In [22]:

    
fig, axs = subplots(1,2,figsize=(6,3), sharey=True)
violin_plot_series(matched_rna.ix['MCM8'].ix[ti(mut.ix['TP53']==0)], ax=axs[0])
violin_plot_series(matched_rna.ix['MCM8'].ix[ti(mut.ix['TP53']>0)], ax=axs[1])
for ax in axs:
    prettify_ax(ax)
    ax.set_xticklabels(['Tumor','Normal'])
axs[0].set_xlabel('TP53 WT (n=362)')
axs[1].set_xlabel('TP53 Mut. (n=158)')









    Out[22]:





<matplotlib.text.Text at 0x7f594143a310>



In [23]:

    
fig, axs = subplots(1,2,figsize=(6,3), sharey=True)
violin_plot_series(matched_rna.ix['MCM8'].ix[ti(mut.ix['TP53']==0)], ax=axs[0])
violin_plot_series(matched_rna.ix['MCM8'].ix[ti(mut.ix['TP53']>0)], ax=axs[1])
for ax in axs:
    prettify_ax(ax)
    ax.set_xticklabels(['Tumor','Normal'])
axs[0].set_xlabel('TP53 WT (n=362)')
axs[1].set_xlabel('TP53 Mut. (n=158)')









    Out[23]:





<matplotlib.text.Text at 0x7f5954d99690>



In [24]:

    
p53_dx.ix['EDA2R']









    Out[24]:





miss  binom  num_ox               206
             num_dx               323
             frac               0.638
             p               8.38e-07
      ttest  p               4.22e-13
             t          7.52563113172
hit   binom  num_ox                19
             num_dx               139
             frac               0.137
             p               4.01e-19
      ttest  p               3.76e-21
             t         -10.9727518238
Name: EDA2R, dtype: object



In [25]:

    
dr = (matched_rna.xs('01',1,1) > matched_rna.xs('11',1,1)) * 1.
dr = dr.ix[ti(dx_rna.num_dx > 200)].dropna()
cc = mut.ix['TP53']>0
odds_ratio, df_odds_ratio = odds_ratio_df(dr, cc)
log_odds = np.log2(odds_ratio).clip(-8,8)



In [26]:

    
violin_plot_pandas(gs2['REACTOME_SIGNALING_IN_IMMUNE_SYSTEM'], log_odds)



In [27]:

    
human_net = pd.read_csv('/cellar/users/agross/Data/networks/HumanNet/HumanNet_HUGO.csv',
                        usecols=['gene1','gene2','weight'])

p53_int1 = pd.Series(1, human_net[human_net.gene1 == 'TP53'].gene2).ix[log_odds.index].fillna(0)
p53_int2 = pd.Series(1, human_net[human_net.gene2 == 'TP53'].gene1).ix[log_odds.index].fillna(0)
p53_int = (p53_int1 + p53_int2) > 0



In [29]:

    
log_odds.order()









    Out[29]:





RNF125    -3.66
NOSTRIN   -3.14
EDA2R     -2.98
ABCA8     -2.75
ASPA      -2.72
HLF       -2.69
LRRK2     -2.68
RPS27L    -2.63
SPATA18   -2.61
TMEM220   -2.58
MSRA      -2.50
MYOC      -2.42
SDPR      -2.42
TMEM100   -2.40
ACSM1     -2.39
...
CCDC21      2.50
AARS        2.57
KIAA1524    2.57
SALL4       2.67
AURKA       2.69
TCFL5       2.71
FANCB       2.72
MCM8        2.91
UBE2S       2.91
TROAP       2.92
C1orf135    2.93
STIP1       3.00
CEP55       3.11
C20orf20    3.15
KIF14       3.19
Length: 18419, dtype: float64



In [30]:

    
%time p = df_odds_ratio[log_odds.abs() > 2].apply(fet, 1).order()









    



CPU times: user 2.19 s, sys: 4.08 ms, total: 2.19 s
Wall time: 2.19 s



In [31]:

    
lp = (-1 * np.log10(p) * np.sign(log_odds))



In [32]:

    
v = (log_odds.abs() > 2) * np.sign(log_odds)



In [33]:

    
sim = screen_feature(v[v.abs() > 0]>0, fisher_exact_test, dr.T)



In [34]:

    
o = sim.ix[ti(mut.ix['TP53'] == 0)].dropna()



In [35]:

    
d2 = mut.ix[:, o.index]
d2 = d2[d2.sum(1) > 15]
d2.shape









    Out[35]:





(369, 362)



In [36]:

    
rr = screen_feature(lp, rev_kruskal, gs2.T)
rr.head(10)









    Out[36]:






  
    
      
      H
      p
      q
    
  
  
    
      REACTOME_GENE_EXPRESSION
       146.39
       1.07e-33
       8.87e-31
    
    
      REACTOME_DOWNSTREAM_EVENTS_IN_GPCR_SIGNALING
       138.98
       4.45e-32
       1.85e-29
    
    
      REACTOME_GPCR_LIGAND_BINDING
       114.55
       9.85e-27
       2.73e-24
    
    
      REACTOME_CELL_CYCLE_MITOTIC
       106.10
       7.03e-25
       1.46e-22
    
    
      REACTOME_SIGNALING_IN_IMMUNE_SYSTEM
       104.17
       1.86e-24
       3.10e-22
    
    
      REACTOME_HEMOSTASIS
        87.10
       1.03e-20
       1.43e-18
    
    
      REACTOME_CLASS_A1_RHODOPSIN_LIKE_RECEPTORS
        81.74
       1.55e-19
       1.85e-17
    
    
      KEGG_MAPK_SIGNALING_PATHWAY
        79.43
       5.00e-19
       5.20e-17
    
    
      KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION
        76.03
       2.79e-18
       2.58e-16
    
    
      KEGG_NEUROACTIVE_LIGAND_RECEPTOR_INTERACTION
        73.68
       9.18e-18
       7.64e-16



In [37]:

    
proliferation = pd.read_csv('/cellar/users/agross/TCGA_Code/DX/rna_signature.csv',
                            header=None, index_col=[0,1], squeeze=True)



In [38]:

    
v = proliferation.ix[:, 1].dropna()
v = (v - codes.map(v.groupby(codes).mean())) / codes.map(v.groupby(codes).std())
violin_plot_pandas(p53_mut, v)

	inPopulation	inStudySet	estimate	std.error
KEGG_STEROID_BIOSYNTHESIS	17	3	0.20	0.02
KEGG_BETA_ALANINE_METABOLISM	22	3	0.08	0.01
KEGG_ASCORBATE_AND_ALDARATE_METABOLISM	24	3	0.05	0.01
REACTOME_FANCONI_ANEMIA_PATHWAY	15	2	0.03	0.01
REACTOME_RNA_POLYMERASE_III_TRANSCRIPTION_INITIATION	29	3	0.02	0.01

	inPopulation	inStudySet	estimate	std.error
REACTOME_TOLL_LIKE_RECEPTOR_4_CASCADE	28	4	0.34	0.02
ST_GA12_PATHWAY	22	3	0.17	0.01
REACTOME_PROSTANOID_HORMONES	11	2	0.09	0.00
KEGG_P53_SIGNALING_PATHWAY	68	6	0.08	0.00
REACTOME_P2Y_RECEPTORS	13	2	0.06	0.01

	H	p	q
REACTOME_GENE_EXPRESSION	146.39	1.07e-33	8.87e-31
REACTOME_DOWNSTREAM_EVENTS_IN_GPCR_SIGNALING	138.98	4.45e-32	1.85e-29
REACTOME_GPCR_LIGAND_BINDING	114.55	9.85e-27	2.73e-24
REACTOME_CELL_CYCLE_MITOTIC	106.10	7.03e-25	1.46e-22
REACTOME_SIGNALING_IN_IMMUNE_SYSTEM	104.17	1.86e-24	3.10e-22
REACTOME_HEMOSTASIS	87.10	1.03e-20	1.43e-18
REACTOME_CLASS_A1_RHODOPSIN_LIKE_RECEPTORS	81.74	1.55e-19	1.85e-17
KEGG_MAPK_SIGNALING_PATHWAY	79.43	5.00e-19	5.20e-17
KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION	76.03	2.79e-18	2.58e-16
KEGG_NEUROACTIVE_LIGAND_RECEPTOR_INTERACTION	73.68	9.18e-18	7.64e-16