Import everything from the imports notebook. This reads in all of the expression data as well as the functions needed to analyse differential expression data.
In [1]:
    
import NotebookImport
from Imports import *
    
    
In [2]:
    
import seaborn as sns
sns.set_context('paper',font_scale=1.5)
sns.set_style('white')
    
matched_meth is our matched methylation data.
In [3]:
    
store = '/data_ssd/TCGA_methylation_2014_04_16.h5'
matched_meth = pd.read_hdf(store, 'matched_tn')
matched_meth = matched_meth.groupby(axis=1, level=[0,1]).first()
    
In [4]:
    
matched_meth.T.head(10).T.head()
    
    Out[4]:
Read in matched Gene and miRNA expression data.
In [5]:
    
matched_rna = pd.read_hdf('/data_ssd/RNASeq_2014_07_15.h5', 'matched_tn')
matched_mir = pd.read_hdf('/data_ssd/miRNASeq_2014_07_15.h5', 'matched_tn')
    
In [6]:
    
dx_rna = binomial_test_screen(matched_rna, fc=1.)
dx_rna = dx_rna[dx_rna.num_dx > 300]
    
In [7]:
    
dx_rna.ix[['ADH1A','ADH1B','ADH1C']]
    
    Out[7]:
In [8]:
    
dx_rna.shape
    
    Out[8]:
In [9]:
    
dx_rna.p.rank().ix[['ADH1A','ADH1B','ADH1C']]
    
    Out[9]:
In [10]:
    
dx_rna.sort('p').head(10)
    
    Out[10]:
In [11]:
    
plt.rcParams['savefig.dpi'] = 150
    
In [12]:
    
fig, ax = subplots(figsize=(7.75,2.))
paired_bp_tn_split(matched_rna.ix['ADH1B'], codes, data_type='mRNA',
                   ax=ax)
fig.tight_layout()
fig.savefig('/cellar/users/agross/figures/ADH1B.pdf')
    
    
In [55]:
    
dx_mir = binomial_test_screen(matched_mir, fc=1.)
dx_mir = dx_mir[dx_mir.num_dx > 300]
    
In [56]:
    
dx_mir.sort('p').head()
    
    Out[56]:
In [57]:
    
paired_bp_tn_split(matched_mir.ix['hsa-mir-139'], codes, data_type='')
    
    
In [64]:
    
fig, ax = subplots(figsize=(6.5,2.))
paired_bp_tn_split(matched_mir.ix['hsa-mir-21'], codes, data_type='',
                   ax=ax)
fig.tight_layout()
fig.savefig('/cellar/users/agross/figures/mir21.pdf')
    
    
In [59]:
    
dx_meth = binomial_test_screen(matched_meth, fc=1.)
dx_meth = dx_meth[dx_meth.num_dx > 300]
    
In [60]:
    
dx_meth.sort('p').head()
    
    Out[60]:
In [61]:
    
paired_bp_tn_split(matched_meth.ix['cg10216717'], codes, data_type='Beta')
    
    
We are going to want to reuse this plot so here I'm wrapping it in a function.
In [62]:
    
def fig_1e(ax):
    draw_dist(dx_meth.frac, ax=ax, lw=2.5)
    draw_dist(dx_rna.frac, ax=ax, lw=2.5, bins=200)
    draw_dist(dx_mir.frac, ax=ax, lw=2.5, bins=100)
    ax.set_yticks([])
    ax.set_xticks([0,.5,1])
    ax.set_ylabel('Density')
    ax.set_xlabel('Fraction')
    ax.legend(('Methylation','mRNA','miRNA'), frameon=False)
    prettify_ax(ax)
    return ax
    
In [63]:
    
#Do not import
fig, ax = subplots(1,1, figsize=(5,3))
fig_1e(ax);
    
    
In [67]:
    
gs2 = gene_sets.ix[dx_rna.index].fillna(0)
    
In [68]:
    
rr = screen_feature(dx_rna.frac, rev_kruskal, gs2.T, 
                    align=False)
fp = (1.*gene_sets.T * dx_rna.frac).T.dropna().replace(0, np.nan).mean().order()
fp.name = 'mean frac'
    
Overexpressed pathways
In [69]:
    
rr.ix[ti(fp > .5)].join(fp).sort('p').head()
    
    Out[69]:
Underexpressed pathways
In [70]:
    
rr.ix[ti(fp < .5)].join(fp).sort('p').head()
    
    Out[70]:
I am folling up on Fatty Acid Metabolism as opposed to biological oxidations, because it has a larger effect size, although the smaller gene-set size gives it a less extreme p-value.
In [71]:
    
def fig_1f(ax):
    v = pd.concat([dx_rna.frac, 
                   dx_rna.frac.ix[ti(gs2['REACTOME_CELL_CYCLE_MITOTIC']>0)],
                   dx_rna.frac.ix[ti(gs2['KEGG_FATTY_ACID_METABOLISM']>0)]])
    v1 = pd.concat([pd.Series('All Genes', dx_rna.frac.index), 
                    pd.Series('Cell Cycle\nMitotic',
                              ti(gs2['REACTOME_CELL_CYCLE_MITOTIC']>0)),
                    pd.Series('Fatty Acid\nMetabolism', 
                              ti(gs2['KEGG_FATTY_ACID_METABOLISM']>0))])
    v1.name = ''
    v.name = 'Fraction Overexpressed'
    violin_plot_pandas(v1, v, ann=None, ax=ax)
    prettify_ax(ax)
    return ax
    
In [72]:
    
#Do not import
fig, ax = subplots(1,1, figsize=(4,3))
fig_1f(ax)
fig.tight_layout()
fig.savefig('/cellar/users/agross/figures/gsea.pdf')
    
    
In [73]:
    
os.chdir('../Methlation/')
    
In [74]:
    
import Setup.DX_Imports as DX
    
    
    
    
In [75]:
    
#Do not import
fig, axs = subplots(2,4, figsize=(12,5))
axs = axs.flatten()
for i,p in enumerate(DX.probe_sets.keys()):
    draw_dist(dx_meth.frac, DX.probe_sets[p], ax=axs[i])
    axs[i].legend().set_visible(False)
    axs[i].set_yticks([])
    axs[i].set_title(p)
    prettify_ax(axs[i])
f_odds = pd.DataFrame({f: fisher_exact_test((dx_meth.frac - .5).abs() > .25, v)
                      for f,v in DX.probe_sets.iteritems()}).T
np.log2(f_odds.odds_ratio).plot(kind='bar', ax=axs[-1])
prettify_ax(axs[-1])
fig.tight_layout()
    
    
In [76]:
    
def fig_1g(ax):
    lw = 2.5
    draw_dist(dx_meth.frac.ix[ti(DX.probe_sets['Promoter'])], ax=ax, lw=lw)
    draw_dist(dx_meth.frac.ix[ti(DX.probe_sets['CpG Island'])], ax=ax, lw=lw)
    draw_dist(dx_meth.frac.ix[ti(DX.probe_sets['PRC2'])], ax=ax, lw=lw)
    draw_dist(dx_meth.frac, ax=ax, colors='grey', lw=lw)
    ax.set_yticks([])
    ax.set_xticks([0,.5,1])
    ax.set_ylabel('Density')
    ax.set_xlabel('Fraction with Increased Methylation')
    ax.legend(('Promoter','CpG Island','PRC2','All Probes'))
    prettify_ax(ax)
    return ax
    
In [79]:
    
plt.rcParams['savefig.dpi'] = 150
    
In [81]:
    
#Do not import
fig, ax = subplots(1,1, figsize=(5,3))
fig_1g(ax);
fig.tight_layout()
fig.savefig('/cellar/users/agross/figures/fx_enrichment.pdf')
    
    
In [32]:
    
#Do not import
fig, axs = subplots(1,3, figsize=(15,3.5))
fig_1e(axs[0])
fig_1f(axs[1])
fig_1g(axs[2])
fig.tight_layout()
fig.savefig('/cellar/users/agross/figures/fig1_bottom.png', dpi=300)