In [1]:
import NotebookImport
from DX_screen import *
In [2]:
def odds_ratio_df(a,b):
a = a.astype(int)
b = b.astype(int)
flip = lambda v: (v == 0).astype(int)
a11 = (a.add(b) == 2).sum(axis=1)
a10 = (a.add(flip(b)) == 2).sum(axis=1)
a01 = (flip(a).add(b) == 2).sum(axis=1)
a00 = (flip(a).add(flip(b)) == 2).sum(axis=1)
odds_ratio = (1.*a11 * a00) / (1.*a10 * a01)
df = pd.concat([a00, a01, a10, a11], axis=1,
keys=['00','01','10','11'])
return odds_ratio, df
In [3]:
ann = DX.probe_annotations
ann.Genomic_Coordinate = ann.Genomic_Coordinate.astype(float)
ann = ann.ix[dx_meth.index.intersection(ann.index)]
ann = ann.sort(['Chromosome','Genomic_Coordinate'])
In [4]:
dr = matched_rna.xs('01',1,1) > matched_rna.xs('11',1,1)
dm = matched_meth.xs('01',1,1) > matched_meth.xs('11',1,1)
gene_lookup = ann.Gene_Symbol.dropna()
gene_lookup = gene_lookup[gene_lookup.isin(dr.index)]
dm = dm.ix[gene_lookup.index]
dr = dr.ix[:, dr.columns.intersection(dm.columns)]
dm = dm.ix[:, dr.columns.intersection(dm.columns)]
In [5]:
dr_x = dr.ix[gene_lookup] * 1.
dr_x.index = gene_lookup.index
dm_x = dm.ix[gene_lookup.index] * 1.
odds_ratio, df_odds_ratio = odds_ratio_df(dr_x, dm_x)
In [13]:
fig, ax = subplots(figsize=(6,4))
log_odds = np.log2(odds_ratio).clip(-3,3)
log_odds.hist(bins=100)
ax.set_xlabel('log odds-ratio')
ax.set_ylabel('# of probes')
Out[13]:
Odds ratio associations across genome annotations
In [7]:
fig, axs = subplots(2,4, figsize=(12,5))
axs = axs.flatten()
for i,p in enumerate(DX.probe_sets.keys()):
draw_dist(log_odds, DX.probe_sets[p], ax=axs[i])
axs[i].legend().set_visible(False)
axs[i].set_yticks([])
axs[i].set_title(p)
prettify_ax(axs[i])
f_odds = pd.DataFrame({f: fisher_exact_test(log_odds.abs().dropna() > 1, v)
for f,v in DX.probe_sets.iteritems()}).T
np.log2(f_odds.odds_ratio).plot(kind='bar', ax=axs[-1])
prettify_ax(axs[-1])
fig.tight_layout()
In [8]:
pd.Series({s: sum(v*1.) for s,v in DX.probe_sets.iteritems()})
Out[8]:
In [9]:
gl = {}
for p in DX.probe_sets:
ss = ti(DX.probe_sets[p])
m = log_odds.ix[ss].groupby(gene_lookup).mean().dropna().order()
s = log_odds.ix[ss].groupby(gene_lookup).size()
s = s[s.index.isin(dx_rna.index)]
g = m.ix[ti(s > 10)]
gl[p] = pd.concat([g,s], keys=['g','s'], axis=1).dropna().sort('g')
In [10]:
pd.concat(gl, axis=1).fillna(0).sort([('Gene Body','g')]).replace(0,'')
Out[10]:
Odds Ratio associations across individual gene annotations
In [91]:
m = log_odds.groupby(gene_lookup).mean().dropna().order()
s = log_odds.groupby(gene_lookup).size()
s = s[s.index.isin(dx_rna.index)]
g = m.ix[ti(s > 20)]
pd.concat([g,s], keys=['g','s'], axis=1).dropna().sort('g').head(10)
Out[91]:
In [141]:
fig, ax = subplots(figsize=(10,3))
a = ann[ann.Gene_Symbol == 'RIMBP2']
v = a.Genomic_Coordinate
v2 = ann[(ann.Chromosome == a.Chromosome.iloc[0]) &
(ann.Genomic_Coordinate > v.min() -1000000) &
(ann.Genomic_Coordinate < v.max() + 1000000)].Genomic_Coordinate
series_scatter(v2, log_odds, ax=ax, ann=None)
ax.set_ylabel('log odds ratio')
prettify_ax(ax)
In [107]:
fig, ax = subplots(figsize=(10,3))
v = ann[ann.Gene_Symbol == 'GATA3'].Genomic_Coordinate
series_scatter(v, log_odds, ax=ax, ann=None)
ax.set_ylabel('log odds ratio')
prettify_ax(ax)
In [11]:
m = log_odds.groupby(gene_lookup).mean().dropna().order()
s = log_odds.groupby(gene_lookup).size()
s = s[s.index.isin(dx_rna.index)]
g = m.ix[ti(s > 10)]
pd.concat([g,s], keys=['g','s'], axis=1).dropna().sort('g').tail(10)
Out[11]:
In [105]:
fig, ax = subplots(figsize=(12,3))
paired_bp_tn_split(matched_rna.ix['GATA3'], codes, ax=ax)
In [94]:
draw_dist?
In [104]:
fig, ax = subplots()
draw_dist(log_odds, ax=ax)
draw_dist(log_odds.ix[ti(gene_lookup == 'GATA3')], ax=ax, bins=50)
prettify_ax(ax)
ax.set_xlabel('log odds ratio')
ax.set_ylabel('probe density')
Out[104]:
In [17]:
log_odds.ix[ti(gene_loakup == 'PTPRN2')]
Out[17]:
In [12]:
pd.concat([g,s], keys=['g','s'], axis=1).dropna().sort('g').head(10)
Out[12]:
In [13]:
gs3 = pd.DataFrame({p: gene_lookup.isin(ti(v>0)) for p,v in gs2.iteritems()})
In [14]:
gs4 = gs3.ix[ti(DX.gene_body)].dropna()
In [15]:
rr_b = screen_feature(log_odds.ix[gs4.index].abs(), rev_kruskal, gs4.T,
align=False)
In [16]:
rr_b.head()
Out[16]:
In [17]:
gs5 = gs3.ix[ti(DX.dhs_site)].dropna()
In [18]:
rr_prc = screen_feature(log_odds.ix[gs5.index], rev_kruskal, gs5.T,
align=False)
In [19]:
rr_prc.head()
Out[19]:
In [20]:
v = gene_lookup.isin(ti(gs2['BIOCARTA_VIP_PATHWAY']>0))
draw_dist(log_odds, v.ix[ti(DX.dhs_site)])
In [21]:
rr_p = screen_feature(log_odds.ix[gs5.index], rev_kruskal, gs5.T,
align=False)
rr_p.head(3)
Out[21]:
In [22]:
v = gene_lookup.isin(ti(gs2['REACTOME_SIGNALING_IN_IMMUNE_SYSTEM']>0))
draw_dist(log_odds, v.ix[ti(DX.promoter)])
In [23]:
gs5 = gs3.ix[ti(DX.promoter)].dropna()
In [24]:
rr_f = screen_feature(log_odds.ix[gs3.index].abs(), rev_kruskal, gs3.T,
align=False)
In [25]:
fp = (1.*gs3.T * log_odds.abs()).T.dropna().replace(0, np.nan).mean().order()
fp.name = 'mean LOD'
In [26]:
rr_f.ix[ti(fp < fp.mean())].join(fp).sort('p').head(3)
Out[26]:
In [27]:
rr_f.ix[ti(fp > fp.mean())].join(fp).sort('p').head(5)
Out[27]:
In [28]:
v = gene_lookup.isin(ti(gs2['REACTOME_CELL_CYCLE_MITOTIC']>0))
draw_dist(log_odds, v)
In [29]:
v = gene_lookup.isin(ti(gs2['KEGG_CELL_ADHESION_MOLECULES_CAMS']>0))
draw_dist(log_odds, v)
In [30]:
pd.concat([g.ix[ti(gs2['KEGG_CELL_ADHESION_MOLECULES_CAMS']>0)],
s], keys=['g','s'], axis=1).dropna().sort('g').head(10)
Out[30]: