In [1]:
import NotebookImport
from DX_screen import *
from Figures.Regression import *
In [2]:
def dx_group(split, matched_tn, fc=1.):
tab_1 = binomial_test_screen(matched_tn.ix[:, ti(split==True)], fc=fc)
tab_1 = tab_1.ix[(((tab_1.frac - .5).abs() > .25) & (tab_1.p > .001)) == False]
tab_0 = binomial_test_screen(matched_tn.ix[:, ti(split==False)], fc=fc)
tab_0 = tab_0.ix[(((tab_0.frac - .5).abs() > .25) & (tab_0.p > .001)) == False]
df = matched_tn.ix[:, ti(split==True)]
ttest_1 = df.apply(ttest_rel, axis=1)
df = matched_tn.ix[:, ti(split==False)]
ttest_0 = df.apply(ttest_rel, axis=1)
t0 = pd.concat([tab_0, ttest_0], keys=['binom','ttest'], axis=1)
t1 = pd.concat([tab_1, ttest_1], keys=['binom','ttest'], axis=1)
t = pd.concat([t0, t1], keys=['miss','hit'], axis=1)
return t
In [3]:
mut_all = pd.read_csv('/cellar/users/agross/TCGA_Code/TCGA/Data/MAFs_new_2/meta.csv',
index_col=[1,2])
mut = mut_all['0'].unstack().fillna(0)
mut = FH.fix_barcode_columns(mut)
mut = mut.xs('01', axis=1, level=1)
In [5]:
p53_dx = dx_group(mut.ix['TP53'].dropna()>0, matched_rna, fc=1.)
In [6]:
gg = ti((p53_dx.hit.binom.frac - p53_dx.miss.binom.frac).dropna() > .25)
rm = p53_dx.miss.binom.frac
rh = p53_dx.hit.binom.frac
r = ((rm < .75) & (rm > .25)) | ((rh < .75) & (rh > .25))
t = (p53_dx.miss.binom.num_dx > 50) & (p53_dx.hit.binom.num_dx > 50)
gg_up = gg.intersection(ti(r & t))
len(gg_up)
Out[6]:
In [7]:
v = pd.Series(1, gg_up).ix[ti(t)].fillna(0)
mm = run_mgsa(v==1)
mm.sort('estimate', ascending=False).head()
Out[7]:
In [8]:
gg = ti((p53_dx.hit.binom.frac - p53_dx.miss.binom.frac).dropna() < -.25)
rm = p53_dx.miss.binom.frac
rh = p53_dx.hit.binom.frac
r = ((rm < .75) & (rm > .25)) | ((rh < .75) & (rh > .25))
t = (p53_dx.miss.binom.num_dx > 50) & (p53_dx.hit.binom.num_dx > 50)
gg_down = gg.intersection(ti(r & t))
len(gg_down)
Out[8]:
In [9]:
v = pd.Series(1, gg_down).ix[ti(t)].fillna(0)
mm = run_mgsa(v==1)
mm.sort('estimate', ascending=False).head()
Out[9]:
In [25]:
ax = None
s1, s2 = p53_dx.hit.binom.frac, p53_dx.miss.binom.frac.ix[ti(t)]
fig, ax = init_ax(ax, figsize=(4, 4))
ax.hexbin(*match_series(s1,s2), gridsize=80, bins='log', cmap=plt.cm.Greys)
ax.annotate('r = {0:.2}'.format(Tests.pearson_pandas(s1, s2)['rho']), (.95, .02),
xycoords='axes fraction', ha='right', va='bottom', size=14)
series_scatter(s1.ix[gg_up], s2.ix[gg_up], ax=ax, s=10, alpha=.3,
color=colors[4], ann=None)
series_scatter(s1.ix[gg_down], s2.ix[gg_down], ax=ax, s=10, alpha=.3,
color=colors[3], ann=None)
#series_scatter(s1.ix[['MCM8']], s2.ix[['MCM8']], ax=ax, s=10, alpha=1,
# color=colors[1], ann=None)
prettify_ax(ax)
ax.set_ybound(-.05,1.05)
ax.set_xbound(-.05,1.05)
ax.set_xlabel('TP53 mutant\nfraction overexpressed')
ax.set_ylabel('TP53 WT\nfraction overexpressed')
ax.set_xticks([0, .5, 1])
ax.set_yticks([0, .5, 1])
plt.savefig('/cellar/users/agross/figures/p53_dx.pdf', transparent=True)
In [11]:
fig, axs = subplots(1,2, figsize=(6,3))
ax = axs[0]
hit = p53_dx.hit.binom.frac.ix[ti(gs2['REACTOME_PROCESSING_OF_CAPPED_INTRON_CONTAINING_PRE_MRNA']>0)]
hit.name= 'TP53 Mut.'
miss = p53_dx.miss.binom.frac.ix[ti(gs2['REACTOME_PROCESSING_OF_CAPPED_INTRON_CONTAINING_PRE_MRNA']>0)]
miss.name = 'TP53 WT'
series_scatter(miss, hit, s=20, alpha=.6, ax=ax, ann=None)
ax = axs[1]
hit = p53_dx.hit.binom.frac.ix[ti(gs2['REACTOME_SIGNALING_IN_IMMUNE_SYSTEM']>0)]
hit.name= 'TP53 Mut.'
miss = p53_dx.miss.binom.frac.ix[ti(gs2['REACTOME_SIGNALING_IN_IMMUNE_SYSTEM']>0)]
miss.name = 'TP53 WT'
series_scatter(miss, hit, s=20, alpha=.6, ax=ax, ann=None)
for ax in axs:
ax.plot([0,1],[0,1], color='grey')
ax.set_xbound(0,1)
ax.set_ybound(0,1)
prettify_ax(ax)
fig.tight_layout()
fig.savefig('/cellar/users/agross/figures/capped_intron_p53.png', dpi=150)
In [33]:
p53_dx.ix['MCM8']
Out[33]:
In [13]:
v = matched_tn.ix['HSP90AA1'][:, '01'] - matched_tn.ix['HSP90AA1'][:, '11']
violin_plot_pandas(mut.ix['TP53']>0, v)
In [14]:
fig, axs = subplots(1,2,figsize=(6,3), sharey=True)
violin_plot_series(matched_tn.ix['HSP90AA1'].ix[ti(mut.ix['TP53']==0)], ax=axs[0])
violin_plot_series(matched_tn.ix['HSP90AA1'].ix[ti(mut.ix['TP53']>0)], ax=axs[1])
for ax in axs:
prettify_ax(ax)
#ax.set_ybound(8,13)
ax.set_xticklabels(['Tumor','Normal'])
axs[0].set_xlabel('TP53 WT (n=362)')
axs[1].set_xlabel('TP53 Mut. (n=158)')
Out[14]:
In [32]:
fig, axs = subplots(1,2,figsize=(6,3), sharey=True)
violin_plot_series(matched_tn.ix['MCM8'].ix[ti(mut.ix['TP53']==0)], ax=axs[0],
order=['11','01'])
violin_plot_series(matched_tn.ix['MCM8'].ix[ti(mut.ix['TP53']>0)], ax=axs[1],
order=['11','01'])
for ax in axs:
prettify_ax(ax)
ax.set_ybound(4,11)
ax.set_xticklabels(['Normal','Tumor'])
axs[0].set_xlabel('TP53 WT (n=362)')
axs[1].set_xlabel('TP53 Mut. (n=158)')
fig.savefig('/cellar/users/agross/figures/MCM8.pdf', transparent=True)
In [30]:
fig, axs = subplots(1,2,figsize=(6,3), sharey=True)
violin_plot_series(matched_tn.ix['MDM2'].ix[ti(mut.ix['TP53']==0)], ax=axs[0],
order=['11','01'])
violin_plot_series(matched_tn.ix['MDM2'].ix[ti(mut.ix['TP53']>0)], ax=axs[1],
order=['11','01'])
for ax in axs:
prettify_ax(ax)
ax.set_ybound(8,13)
ax.set_xticklabels(['Normal','Tumor'])
axs[0].set_xlabel('TP53 WT (n=362)')
axs[1].set_xlabel('TP53 Mut. (n=158)')
fig.savefig('/cellar/users/agross/figures/MDM2.pdf', transparent=True)
In [16]:
def paired_bp_tn_split(vec, assignment, ax=None, split_vals=('01', '11'),
data_type='gene expression', order=None):
"""
Paired boxplot for a single Series, with splitting on the index,
grouped by assignment. I.E. Tumor-Normal gene expression split by
cancer.
vec:
vector of values to plot.
assignment:
vector mapping keys to group assignment
ax (None):
matplotlib axis to plot on or None
split_vals ('01','11'):
Values to split the boxplot pairing on. The default of
('01','11') indicates tumor vs. normal in the standard
TCGA barcode nomenclature. This should coorespond to values
on the second level of the index for vec and assignment.
**both vec and assignment should have an overlapping index with
multiple levels**
"""
_, ax = init_ax(ax, figsize=(8, 3))
if vec.name != None:
label = vec.name # lose label in manipulation
else:
label = ''
g1 = split_vals[0]
g2 = split_vals[1]
vec = pd.concat([vec[:, g1], vec[:, g2]], keys=[g1, g2],
axis=1)
vec = vec.dropna().stack()
counts = vec.unstack().groupby(assignment).size()
if order is None:
groups = list(true_index(counts > 5))
groups = vec.unstack().groupby(assignment).median()[g1].ix[groups]
groups = groups.order().index[::-1]
else:
groups = order
l1 = [np.array(vec[:, g1].ix[true_index(assignment == c)].dropna())
for c in groups]
l2 = [np.array(vec[:, g2].ix[true_index(assignment == c)].dropna())
for c in groups]
boxes = [x for t in zip(l1, l2) for x in t if len(t[1]) > 5]
ax, bp = paired_boxplot(boxes, ax)
labels = ['{}\n({})'.format(c, counts[c]) for c in groups]
ax.set_xticklabels(labels)
prettify_ax(ax)
ax.set_ylabel('{} {}'.format(label, data_type))
In [17]:
d = pd.concat([codes, (mut.ix['TP53']>0).map({True: 'mut', False:'wt'})], axis=1).dropna()
d = d.ix[matched_rna.columns.get_level_values(0).unique()].dropna()
d = d.apply(lambda s: s.ix[0] + '\n' + s.ix[1], 1)
o = matched_rna.ix['EDA2R'][:,'11'].groupby(codes).mean().order().index
o = [v for c in o for v in [c + '\nwt', c + '\nmut'] if v in list(d)]
In [18]:
len(cancers)
Out[18]:
In [19]:
p53_mut = (mut.ix['TP53']>0).map({True: 'mut', False:'wt'})
pt = matched_rna.columns.get_level_values(0).unique()
ct = pd.crosstab(codes.ix[pt], p53_mut)
cc = ti(ct.min(1) > 10)
len(cc)
Out[19]:
In [20]:
fig, axs = subplots(1,6, figsize=(10,3), sharey=False)
for i,c in enumerate(cc):
ax = axs[i]
paired_bp_tn_split(matched_rna.ix['MDM2'].ix[ti(codes==c)], p53_mut, ax=ax)
if i > 0:
ax.set_ylabel('')
ax.set_xlabel(c)
fig.tight_layout()
In [21]:
paired_bp_tn_split(matched_rna.ix['MCM8'].ix[ti(mut.ix['TP53']==0)], codes)
In [22]:
fig, axs = subplots(1,2,figsize=(6,3), sharey=True)
violin_plot_series(matched_rna.ix['MCM8'].ix[ti(mut.ix['TP53']==0)], ax=axs[0])
violin_plot_series(matched_rna.ix['MCM8'].ix[ti(mut.ix['TP53']>0)], ax=axs[1])
for ax in axs:
prettify_ax(ax)
ax.set_xticklabels(['Tumor','Normal'])
axs[0].set_xlabel('TP53 WT (n=362)')
axs[1].set_xlabel('TP53 Mut. (n=158)')
Out[22]:
In [23]:
fig, axs = subplots(1,2,figsize=(6,3), sharey=True)
violin_plot_series(matched_rna.ix['MCM8'].ix[ti(mut.ix['TP53']==0)], ax=axs[0])
violin_plot_series(matched_rna.ix['MCM8'].ix[ti(mut.ix['TP53']>0)], ax=axs[1])
for ax in axs:
prettify_ax(ax)
ax.set_xticklabels(['Tumor','Normal'])
axs[0].set_xlabel('TP53 WT (n=362)')
axs[1].set_xlabel('TP53 Mut. (n=158)')
Out[23]:
In [24]:
p53_dx.ix['EDA2R']
Out[24]:
In [25]:
dr = (matched_rna.xs('01',1,1) > matched_rna.xs('11',1,1)) * 1.
dr = dr.ix[ti(dx_rna.num_dx > 200)].dropna()
cc = mut.ix['TP53']>0
odds_ratio, df_odds_ratio = odds_ratio_df(dr, cc)
log_odds = np.log2(odds_ratio).clip(-8,8)
In [26]:
violin_plot_pandas(gs2['REACTOME_SIGNALING_IN_IMMUNE_SYSTEM'], log_odds)
In [27]:
human_net = pd.read_csv('/cellar/users/agross/Data/networks/HumanNet/HumanNet_HUGO.csv',
usecols=['gene1','gene2','weight'])
p53_int1 = pd.Series(1, human_net[human_net.gene1 == 'TP53'].gene2).ix[log_odds.index].fillna(0)
p53_int2 = pd.Series(1, human_net[human_net.gene2 == 'TP53'].gene1).ix[log_odds.index].fillna(0)
p53_int = (p53_int1 + p53_int2) > 0
In [29]:
log_odds.order()
Out[29]:
In [30]:
%time p = df_odds_ratio[log_odds.abs() > 2].apply(fet, 1).order()
In [31]:
lp = (-1 * np.log10(p) * np.sign(log_odds))
In [32]:
v = (log_odds.abs() > 2) * np.sign(log_odds)
In [33]:
sim = screen_feature(v[v.abs() > 0]>0, fisher_exact_test, dr.T)
In [34]:
o = sim.ix[ti(mut.ix['TP53'] == 0)].dropna()
In [35]:
d2 = mut.ix[:, o.index]
d2 = d2[d2.sum(1) > 15]
d2.shape
Out[35]:
In [36]:
rr = screen_feature(lp, rev_kruskal, gs2.T)
rr.head(10)
Out[36]:
In [37]:
proliferation = pd.read_csv('/cellar/users/agross/TCGA_Code/DX/rna_signature.csv',
header=None, index_col=[0,1], squeeze=True)
In [38]:
v = proliferation.ix[:, 1].dropna()
v = (v - codes.map(v.groupby(codes).mean())) / codes.map(v.groupby(codes).std())
violin_plot_pandas(p53_mut, v)