Which genes are turned on/off in some cancers but not others?


In [1]:
import NotebookImport
from DX_screen import *
from Figures.Regression import *


importing IPython notebook from DX_screen
importing IPython notebook from Imports

In [2]:
def dx_group(split, matched_tn, fc=1.):
    tab_1 = binomial_test_screen(matched_tn.ix[:, ti(split==True)], fc=fc)
    tab_1 = tab_1.ix[(((tab_1.frac - .5).abs() > .25) & (tab_1.p > .001)) == False]
    
    tab_0 = binomial_test_screen(matched_tn.ix[:, ti(split==False)], fc=fc)
    tab_0 = tab_0.ix[(((tab_0.frac - .5).abs() > .25) & (tab_0.p > .001)) == False]
    
    df = matched_tn.ix[:, ti(split==True)]
    ttest_1 = df.apply(ttest_rel, axis=1)
    
    df = matched_tn.ix[:, ti(split==False)]
    ttest_0 = df.apply(ttest_rel, axis=1)
    
    t0 = pd.concat([tab_0, ttest_0], keys=['binom','ttest'], axis=1)
    t1 = pd.concat([tab_1, ttest_1], keys=['binom','ttest'], axis=1)
    t = pd.concat([t0, t1], keys=['miss','hit'], axis=1)
    return t

TP53


In [3]:
mut_all = pd.read_csv('/cellar/users/agross/TCGA_Code/TCGA/Data/MAFs_new_2/meta.csv', 
                      index_col=[1,2])
mut = mut_all['0'].unstack().fillna(0)
mut = FH.fix_barcode_columns(mut)
mut = mut.xs('01', axis=1, level=1)

In [5]:
p53_dx = dx_group(mut.ix['TP53'].dropna()>0, matched_rna, fc=1.)

In [6]:
gg = ti((p53_dx.hit.binom.frac - p53_dx.miss.binom.frac).dropna() > .25)
rm = p53_dx.miss.binom.frac 
rh = p53_dx.hit.binom.frac
r = ((rm < .75) & (rm > .25)) | ((rh < .75) & (rh > .25))
t = (p53_dx.miss.binom.num_dx > 50) & (p53_dx.hit.binom.num_dx > 50)
gg_up = gg.intersection(ti(r & t))
len(gg_up)


Out[6]:
378

In [7]:
v = pd.Series(1, gg_up).ix[ti(t)].fillna(0)
mm = run_mgsa(v==1)
mm.sort('estimate', ascending=False).head()


Out[7]:
inPopulation inStudySet estimate std.error
KEGG_STEROID_BIOSYNTHESIS 17 3 0.20 0.02
KEGG_BETA_ALANINE_METABOLISM 22 3 0.08 0.01
KEGG_ASCORBATE_AND_ALDARATE_METABOLISM 24 3 0.05 0.01
REACTOME_FANCONI_ANEMIA_PATHWAY 15 2 0.03 0.01
REACTOME_RNA_POLYMERASE_III_TRANSCRIPTION_INITIATION 29 3 0.02 0.01

In [8]:
gg = ti((p53_dx.hit.binom.frac - p53_dx.miss.binom.frac).dropna() < -.25)
rm = p53_dx.miss.binom.frac 
rh = p53_dx.hit.binom.frac
r = ((rm < .75) & (rm > .25)) | ((rh < .75) & (rh > .25))
t = (p53_dx.miss.binom.num_dx > 50) & (p53_dx.hit.binom.num_dx > 50)
gg_down = gg.intersection(ti(r & t))
len(gg_down)


Out[8]:
188

In [9]:
v = pd.Series(1, gg_down).ix[ti(t)].fillna(0)
mm = run_mgsa(v==1)
mm.sort('estimate', ascending=False).head()


Out[9]:
inPopulation inStudySet estimate std.error
REACTOME_TOLL_LIKE_RECEPTOR_4_CASCADE 28 4 0.34 0.02
ST_GA12_PATHWAY 22 3 0.17 0.01
REACTOME_PROSTANOID_HORMONES 11 2 0.09 0.00
KEGG_P53_SIGNALING_PATHWAY 68 6 0.08 0.00
REACTOME_P2Y_RECEPTORS 13 2 0.06 0.01

In [25]:
ax = None
s1, s2 = p53_dx.hit.binom.frac, p53_dx.miss.binom.frac.ix[ti(t)]
fig, ax = init_ax(ax, figsize=(4, 4))
ax.hexbin(*match_series(s1,s2), gridsize=80, bins='log', cmap=plt.cm.Greys)
ax.annotate('r = {0:.2}'.format(Tests.pearson_pandas(s1, s2)['rho']), (.95, .02),
             xycoords='axes fraction', ha='right', va='bottom', size=14)

series_scatter(s1.ix[gg_up], s2.ix[gg_up], ax=ax, s=10, alpha=.3,
               color=colors[4], ann=None)

series_scatter(s1.ix[gg_down], s2.ix[gg_down], ax=ax, s=10, alpha=.3,
               color=colors[3], ann=None)

#series_scatter(s1.ix[['MCM8']], s2.ix[['MCM8']], ax=ax, s=10, alpha=1,
#               color=colors[1], ann=None)

prettify_ax(ax)
ax.set_ybound(-.05,1.05)
ax.set_xbound(-.05,1.05)
ax.set_xlabel('TP53 mutant\nfraction overexpressed')
ax.set_ylabel('TP53 WT\nfraction overexpressed')
ax.set_xticks([0, .5, 1])
ax.set_yticks([0, .5, 1])
plt.savefig('/cellar/users/agross/figures/p53_dx.pdf', transparent=True)



In [11]:
fig, axs = subplots(1,2, figsize=(6,3))
ax = axs[0]
hit = p53_dx.hit.binom.frac.ix[ti(gs2['REACTOME_PROCESSING_OF_CAPPED_INTRON_CONTAINING_PRE_MRNA']>0)]
hit.name= 'TP53 Mut.'
miss = p53_dx.miss.binom.frac.ix[ti(gs2['REACTOME_PROCESSING_OF_CAPPED_INTRON_CONTAINING_PRE_MRNA']>0)]
miss.name = 'TP53 WT'
series_scatter(miss, hit, s=20, alpha=.6, ax=ax, ann=None)

ax = axs[1]
hit = p53_dx.hit.binom.frac.ix[ti(gs2['REACTOME_SIGNALING_IN_IMMUNE_SYSTEM']>0)]
hit.name= 'TP53 Mut.'
miss = p53_dx.miss.binom.frac.ix[ti(gs2['REACTOME_SIGNALING_IN_IMMUNE_SYSTEM']>0)]
miss.name = 'TP53 WT'
series_scatter(miss, hit, s=20, alpha=.6, ax=ax, ann=None)

for ax in axs:
    ax.plot([0,1],[0,1], color='grey')
    ax.set_xbound(0,1)
    ax.set_ybound(0,1)
    prettify_ax(ax)
fig.tight_layout()
fig.savefig('/cellar/users/agross/figures/capped_intron_p53.png', dpi=150)



In [33]:
p53_dx.ix['MCM8']


Out[33]:
miss  binom  num_ox    1.90e+02
             num_dx    3.62e+02
             frac      5.25e-01
             p         3.72e-01
      ttest  p         8.17e-03
             t         2.66e+00
hit   binom  num_ox    1.41e+02
             num_dx    1.58e+02
             frac      8.92e-01
             p         1.70e-25
      ttest  p         4.07e-30
             t         1.43e+01
Name: MCM8, dtype: float64

In [13]:
v = matched_tn.ix['HSP90AA1'][:, '01'] - matched_tn.ix['HSP90AA1'][:, '11']
violin_plot_pandas(mut.ix['TP53']>0, v)



In [14]:
fig, axs = subplots(1,2,figsize=(6,3), sharey=True)
violin_plot_series(matched_tn.ix['HSP90AA1'].ix[ti(mut.ix['TP53']==0)], ax=axs[0])
violin_plot_series(matched_tn.ix['HSP90AA1'].ix[ti(mut.ix['TP53']>0)], ax=axs[1])
for ax in axs:
    prettify_ax(ax)
    #ax.set_ybound(8,13)
    ax.set_xticklabels(['Tumor','Normal'])
axs[0].set_xlabel('TP53 WT (n=362)')
axs[1].set_xlabel('TP53 Mut. (n=158)')


Out[14]:
<matplotlib.text.Text at 0x7f5958a00dd0>

In [32]:
fig, axs = subplots(1,2,figsize=(6,3), sharey=True)
violin_plot_series(matched_tn.ix['MCM8'].ix[ti(mut.ix['TP53']==0)], ax=axs[0],
                   order=['11','01'])
violin_plot_series(matched_tn.ix['MCM8'].ix[ti(mut.ix['TP53']>0)], ax=axs[1],
                   order=['11','01'])
for ax in axs:
    prettify_ax(ax)
    ax.set_ybound(4,11)
    ax.set_xticklabels(['Normal','Tumor'])
axs[0].set_xlabel('TP53 WT (n=362)')
axs[1].set_xlabel('TP53 Mut. (n=158)')
fig.savefig('/cellar/users/agross/figures/MCM8.pdf', transparent=True)



In [30]:
fig, axs = subplots(1,2,figsize=(6,3), sharey=True)
violin_plot_series(matched_tn.ix['MDM2'].ix[ti(mut.ix['TP53']==0)], ax=axs[0],
                   order=['11','01'])
violin_plot_series(matched_tn.ix['MDM2'].ix[ti(mut.ix['TP53']>0)], ax=axs[1],
                   order=['11','01'])
for ax in axs:
    prettify_ax(ax)
    ax.set_ybound(8,13)
    ax.set_xticklabels(['Normal','Tumor'])
axs[0].set_xlabel('TP53 WT (n=362)')
axs[1].set_xlabel('TP53 Mut. (n=158)')
fig.savefig('/cellar/users/agross/figures/MDM2.pdf', transparent=True)



In [16]:
def paired_bp_tn_split(vec, assignment, ax=None, split_vals=('01', '11'),
                       data_type='gene expression', order=None):
    """
    Paired boxplot for a single Series, with splitting on the index,
    grouped by assignment. I.E. Tumor-Normal gene expression split by
    cancer.
    
    vec: 
        vector of values to plot.
    assignment: 
        vector mapping keys to group assignment
    ax (None):
        matplotlib axis to plot on or None
    split_vals ('01','11'):
        Values to split the boxplot pairing on. The default of 
        ('01','11') indicates tumor vs. normal in the standard 
        TCGA barcode nomenclature.  This should coorespond to values
        on the second level of the index for vec and assignment.
        
    **both vec and assignment should have an overlapping index with
    multiple levels**
    """
    _, ax = init_ax(ax, figsize=(8, 3))
    if vec.name != None:
        label = vec.name  # lose label in manipulation
    else:
        label = ''
    g1 = split_vals[0]
    g2 = split_vals[1]
    vec = pd.concat([vec[:, g1], vec[:, g2]], keys=[g1, g2],
                    axis=1)
    vec = vec.dropna().stack()

    counts = vec.unstack().groupby(assignment).size()
    if order is None:
        groups = list(true_index(counts > 5))
        groups = vec.unstack().groupby(assignment).median()[g1].ix[groups]
        groups = groups.order().index[::-1]
    else:
        groups = order

    l1 = [np.array(vec[:, g1].ix[true_index(assignment == c)].dropna())
          for c in groups]
    l2 = [np.array(vec[:, g2].ix[true_index(assignment == c)].dropna())
          for c in groups]
    boxes = [x for t in zip(l1, l2) for x in t if len(t[1]) > 5]

    ax, bp = paired_boxplot(boxes, ax)
    labels = ['{}\n({})'.format(c, counts[c]) for c in groups]
    ax.set_xticklabels(labels)
    prettify_ax(ax)
    ax.set_ylabel('{} {}'.format(label, data_type))

In [17]:
d = pd.concat([codes, (mut.ix['TP53']>0).map({True: 'mut', False:'wt'})], axis=1).dropna()
d = d.ix[matched_rna.columns.get_level_values(0).unique()].dropna()
d = d.apply(lambda s: s.ix[0] + '\n' + s.ix[1], 1)
o = matched_rna.ix['EDA2R'][:,'11'].groupby(codes).mean().order().index
o = [v for c in o for v in [c + '\nwt', c + '\nmut'] if v in list(d)]

In [18]:
len(cancers)


Out[18]:
27

In [19]:
p53_mut = (mut.ix['TP53']>0).map({True: 'mut', False:'wt'})
pt = matched_rna.columns.get_level_values(0).unique()
ct = pd.crosstab(codes.ix[pt], p53_mut)
cc = ti(ct.min(1) > 10)
len(cc)


Out[19]:
6

In [20]:
fig, axs = subplots(1,6, figsize=(10,3), sharey=False)
for i,c in enumerate(cc):
    ax = axs[i]
    paired_bp_tn_split(matched_rna.ix['MDM2'].ix[ti(codes==c)], p53_mut, ax=ax)
    if i > 0:
        ax.set_ylabel('')
    ax.set_xlabel(c)
fig.tight_layout()



In [21]:
paired_bp_tn_split(matched_rna.ix['MCM8'].ix[ti(mut.ix['TP53']==0)], codes)



In [22]:
fig, axs = subplots(1,2,figsize=(6,3), sharey=True)
violin_plot_series(matched_rna.ix['MCM8'].ix[ti(mut.ix['TP53']==0)], ax=axs[0])
violin_plot_series(matched_rna.ix['MCM8'].ix[ti(mut.ix['TP53']>0)], ax=axs[1])
for ax in axs:
    prettify_ax(ax)
    ax.set_xticklabels(['Tumor','Normal'])
axs[0].set_xlabel('TP53 WT (n=362)')
axs[1].set_xlabel('TP53 Mut. (n=158)')


Out[22]:
<matplotlib.text.Text at 0x7f594143a310>

In [23]:
fig, axs = subplots(1,2,figsize=(6,3), sharey=True)
violin_plot_series(matched_rna.ix['MCM8'].ix[ti(mut.ix['TP53']==0)], ax=axs[0])
violin_plot_series(matched_rna.ix['MCM8'].ix[ti(mut.ix['TP53']>0)], ax=axs[1])
for ax in axs:
    prettify_ax(ax)
    ax.set_xticklabels(['Tumor','Normal'])
axs[0].set_xlabel('TP53 WT (n=362)')
axs[1].set_xlabel('TP53 Mut. (n=158)')


Out[23]:
<matplotlib.text.Text at 0x7f5954d99690>

In [24]:
p53_dx.ix['EDA2R']


Out[24]:
miss  binom  num_ox               206
             num_dx               323
             frac               0.638
             p               8.38e-07
      ttest  p               4.22e-13
             t          7.52563113172
hit   binom  num_ox                19
             num_dx               139
             frac               0.137
             p               4.01e-19
      ttest  p               3.76e-21
             t         -10.9727518238
Name: EDA2R, dtype: object

In [25]:
dr = (matched_rna.xs('01',1,1) > matched_rna.xs('11',1,1)) * 1.
dr = dr.ix[ti(dx_rna.num_dx > 200)].dropna()
cc = mut.ix['TP53']>0
odds_ratio, df_odds_ratio = odds_ratio_df(dr, cc)
log_odds = np.log2(odds_ratio).clip(-8,8)

In [26]:
violin_plot_pandas(gs2['REACTOME_SIGNALING_IN_IMMUNE_SYSTEM'], log_odds)



In [27]:
human_net = pd.read_csv('/cellar/users/agross/Data/networks/HumanNet/HumanNet_HUGO.csv',
                        usecols=['gene1','gene2','weight'])

p53_int1 = pd.Series(1, human_net[human_net.gene1 == 'TP53'].gene2).ix[log_odds.index].fillna(0)
p53_int2 = pd.Series(1, human_net[human_net.gene2 == 'TP53'].gene1).ix[log_odds.index].fillna(0)
p53_int = (p53_int1 + p53_int2) > 0

In [29]:
log_odds.order()


Out[29]:
RNF125    -3.66
NOSTRIN   -3.14
EDA2R     -2.98
ABCA8     -2.75
ASPA      -2.72
HLF       -2.69
LRRK2     -2.68
RPS27L    -2.63
SPATA18   -2.61
TMEM220   -2.58
MSRA      -2.50
MYOC      -2.42
SDPR      -2.42
TMEM100   -2.40
ACSM1     -2.39
...
CCDC21      2.50
AARS        2.57
KIAA1524    2.57
SALL4       2.67
AURKA       2.69
TCFL5       2.71
FANCB       2.72
MCM8        2.91
UBE2S       2.91
TROAP       2.92
C1orf135    2.93
STIP1       3.00
CEP55       3.11
C20orf20    3.15
KIF14       3.19
Length: 18419, dtype: float64

In [30]:
%time p = df_odds_ratio[log_odds.abs() > 2].apply(fet, 1).order()


CPU times: user 2.19 s, sys: 4.08 ms, total: 2.19 s
Wall time: 2.19 s

In [31]:
lp = (-1 * np.log10(p) * np.sign(log_odds))

In [32]:
v = (log_odds.abs() > 2) * np.sign(log_odds)

In [33]:
sim = screen_feature(v[v.abs() > 0]>0, fisher_exact_test, dr.T)

In [34]:
o = sim.ix[ti(mut.ix['TP53'] == 0)].dropna()

In [35]:
d2 = mut.ix[:, o.index]
d2 = d2[d2.sum(1) > 15]
d2.shape


Out[35]:
(369, 362)

In [36]:
rr = screen_feature(lp, rev_kruskal, gs2.T)
rr.head(10)


Out[36]:
H p q
REACTOME_GENE_EXPRESSION 146.39 1.07e-33 8.87e-31
REACTOME_DOWNSTREAM_EVENTS_IN_GPCR_SIGNALING 138.98 4.45e-32 1.85e-29
REACTOME_GPCR_LIGAND_BINDING 114.55 9.85e-27 2.73e-24
REACTOME_CELL_CYCLE_MITOTIC 106.10 7.03e-25 1.46e-22
REACTOME_SIGNALING_IN_IMMUNE_SYSTEM 104.17 1.86e-24 3.10e-22
REACTOME_HEMOSTASIS 87.10 1.03e-20 1.43e-18
REACTOME_CLASS_A1_RHODOPSIN_LIKE_RECEPTORS 81.74 1.55e-19 1.85e-17
KEGG_MAPK_SIGNALING_PATHWAY 79.43 5.00e-19 5.20e-17
KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION 76.03 2.79e-18 2.58e-16
KEGG_NEUROACTIVE_LIGAND_RECEPTOR_INTERACTION 73.68 9.18e-18 7.64e-16

In [37]:
proliferation = pd.read_csv('/cellar/users/agross/TCGA_Code/DX/rna_signature.csv',
                            header=None, index_col=[0,1], squeeze=True)

In [38]:
v = proliferation.ix[:, 1].dropna()
v = (v - codes.map(v.groupby(codes).mean())) / codes.map(v.groupby(codes).std())
violin_plot_pandas(p53_mut, v)