In [1]:
cd ..
In [2]:
import NotebookImport
from DX_screen import *
In [3]:
cd ../DX/Notebooks/
In [4]:
def plot_HR(ci, colors=None, ax=None):
_, ax = init_ax(ax)
if colors is None:
colors = ['grey'] * ci.shape[0]
ci = np.log2(ci)
for i,(h,v) in enumerate(ci.iterrows()):
ax.scatter(v['exp(coef)'], i, marker='s', s=50, color=colors[i],
edgecolors=['black'], zorder=10)
ax.plot((v['lower .95'],v['upper .95']), (i,i),
lw=1.7, ls='-', marker='o', dash_joinstyle='bevel',
color=colors[i])
prettify_ax(ax)
ax.set_ybound(-.5, i + .5)
ax.set_yticks(range(i+1))
ax.set_yticklabels(ci.index)
ax.axvline(0, ls='--', lw=2, color='grey', zorder=-1)
ax.set_xticklabels(2.**ax.get_xticks())
In [5]:
from Imports import *
from Preprocessing.ClinicalDataFilters import *
Pull in log-hazard ratios from stratified pan-cancer survival screen
In [6]:
pancan_haz = pd.read_csv('r_pancan.csv', header=None,
index_col=0, squeeze=True)
In [35]:
meta_pcna = list({w for w in s.split() if w in rna_df.index})
meta_pcna = rna_df.ix[meta_pcna].median()[:,'01']
meta_pcna.name = 'meta_pcna'
len(meta_pcna)
Out[35]:
In [192]:
from Stats.Regression import *
In [230]:
process_factors??
In [245]:
def stratified_regression(target, feature, strata):
target, feature, strata = map(sanitize_for_r, [target, feature, strata])
fmla = '{} ~ {} + strata({})'.format(target.name, feature.name, strata.name)
fmla = robjects.Formula(fmla)
df_r = convert_to_r_dataframe(pd.concat([strata, target, feature], 1))
fit = lm(fmla, df_r)
tab = convert_robj(robjects.r.summary(fit)[3])
return tab.ix[feature.name]
In [251]:
df = rna_df.xs('01',1,1)
df = df.sub(df.mean(1), 0).div(df.std(1), 0)
idx = meta_pcna.index.intersection(codes.index)
df = df.ix[:, idx]
In [284]:
%%time
gg = rr.sort('p').index
pcna_genes = {}
for g in gg:
try:
pcna_genes[g] = stratified_regression(meta_pcna, df.ix[g], codes)
except:
pass
pcna_genes = pd.DataFrame(pcna_genes).T
pcna_genes.to_csv('pcga_stratified_correlation.csv')
In [297]:
plot_regression_density(pcna_genes['Estimate'], dx_rna.frac - .5, rad=.05)
In [36]:
rna_sig.name = 'f'
In [88]:
rr = screen_feature(meta_pcna, pearson_pandas, rna_df.xs('01',1,1))
In [373]:
standardize = lambda s: s.sub(s.median(1), axis=0).div(s.mad(1), axis=0)
df = rna_df.xs('01',1,1)
df_s = df.groupby(codes, axis=1).apply(standardize)
mp2 = meta_pcna.groupby(codes).apply(standardize)
In [374]:
pcna_corr = df_s.T.corrwith(mp2)
In [376]:
fig, ax = subplots()
f = pcna_corr
plot_regression(f, dx_rna.frac - .5, density=True, rad=.03, s=20,
rasterized=True, ax=ax)
ann = ax.get_children()[4]
ann.set_text(ann.get_text().split()[0])
ax.set_ylabel('Fraction Overexpressed')
ax.set_xlabel('Correlation with Meta-PCNA')
Out[376]:
In [377]:
def detrend(x,y):
x, y = match_series(x, y)
reg = linear_regression(x, y)
#adj = (x * reg['slope']) + reg['intercept']
adj = (y - reg['intercept']) / reg['slope']
return adj
In [378]:
f_adj = pcna_corr.ix[ti(pcna_corr < .4)].dropna()
f_adj = detrend(f_adj, dx_rna.frac)
f_adj = f_adj - pcna_genes.Estimate
f_adj = f_adj.dropna()
#f_adj = f_adj.ix[ti(pcna_genes.Estimate < .35)].dropna()
In [379]:
fig, ax = subplots()
plot_regression(pcna_corr, f_adj, density=True, rad=.03, s=20,
line_args={'lw':0}, rasterized=True, ax=ax)
ann = ax.get_children()[4]
ann.set_text(ann.get_text().split()[0])
ax.set_ylabel('Fraction Overexpressed')
ax.set_xlabel('Correlation with Meta-PCNA')
Out[379]:
In [400]:
dx_rna.ix['HIST2H2AA3']
Out[400]:
In [399]:
pcga_corr.ix['HIST2H2AA3']
Out[399]:
In [398]:
paired_bp_tn_split(rna_df.ix['HIST2H2AA3'], codes)
In [380]:
gs2 = gene_sets.ix[f_adj.index].fillna(0)
r2 = screen_feature(f_adj, rev_kruskal, gs2.T,
align=False)
fp = (1.*gene_sets.T * f_adj).T.dropna().replace(0, np.nan).mean().order()
fp.name = 'mean frac'
In [381]:
r2.ix[ti(fp > 0)].join(fp).sort('p').head(6)
Out[381]:
In [384]:
r2.ix[ti(fp < 0)].join(fp).sort('p').head()
Out[384]:
In [ ]:
patient_switch = df_s.corrwith(f_adj)
patient_switch = patient_switch.groupby(codes).apply(standardize)
In [605]:
mut_all = pd.read_csv('/cellar/users/agross/TCGA_Code/CancerData/Data/MAFs_new_4/meta.csv',
index_col=[1,2])
mut = mut_all['0'].unstack().fillna(0)
mut = FH.fix_barcode_columns(mut)
mut = mut.xs('01', axis=1, level=1)
In [ ]:
mut = mut.ix[rna_df.index]
mut_rate = (mut>0).mean().order()
mut = mut.ix[:, ti(mut_rate < .1)] #filter samples with lots of mutations
mut.shape
In [ ]:
df = mut[(mut > 0).sum(1) > 30]
df.shape
In [ ]:
c
In [521]:
rr = screen_feature(patient_switch, rev_kruskal, df>0, align=False)
In [522]:
rr.head(10)
Out[522]:
In [574]:
(rr.p < .005).value_counts()
Out[574]:
In [600]:
ptw = ti(gs2.ix[ti(rr.p < .01)].sum() > 4)
ptw.shape
Out[600]:
In [601]:
r2 = screen_feature(rr.p < .01, fisher_exact_test, gs2[ptw].T>0)
In [602]:
r2.head()
Out[602]:
In [603]:
pd.crosstab(gs2['REACTOME_SIGNALING_BY_FGFR']>0,
rr.p < .01)
Out[603]:
In [604]:
violin_plot_pandas(gs2['REACTOME_SIGNALING_BY_FGFR']>0, np.log(rr.p))
In [492]:
cn = FH.get_gistic_gene_matrix(run.data_path, 'HNSC')
In [493]:
bap1 = cn.xs('NCKAP1', level=2).iloc[0]
In [530]:
violin_plot_pandas((mut.ix[['BAP1','CDK17','MCTP1']] > 0).sum()>0, patient_switch)
In [435]:
anova(mut.ix['KRAS']>0, patient_switch)
Out[435]:
In [415]:
get_cox_ph_ms(surv_5y, patient_switch.ix[keepers], [meta_pcna, codes, age],
interactions=False)
Out[415]:
In [397]:
f_adj.ix[ti(gs2['REACTOME_PACKAGING_OF_TELOMERE_ENDS']>0)].order()
Out[397]:
In [388]:
violin_plot_pandas(gs2['REACTOME_PACKAGING_OF_TELOMERE_ENDS'], dx_rna.frac.dropna())
In [389]:
violin_plot_pandas(gs2['REACTOME_PACKAGING_OF_TELOMERE_ENDS'], f_adj.dropna())
In [390]:
violin_plot_pandas(gs2['REACTOME_PACKAGING_OF_TELOMERE_ENDS'], pcna_genes.Estimate.dropna())
In [64]:
fig, ax = subplots()
f = rr.rho
plot_regression(f, pancan_haz, density=True, rad=.03, s=20,
line_args={'lw':0}, rasterized=True, ax=ax)
ann = ax.get_children()[4]
ann.set_text(ann.get_text().split()[0])
ax.set_ybound(-.3,.55)
ax.set_yticks(np.log([.75, 1, 1.25, 1.5]))
ax.set_yticklabels(np.round(np.exp(ax.get_yticks()), 2))
ax.set_ylabel('Hazard Ratio')
ax.set_xlabel('Correlation with Meta-PCNA')
Out[64]:
In [56]:
series_scatter(meta_pcna, rna_sig)
In [46]:
f = 'Surv(days, event) ~ meta_pcna + f + codes'
In [49]:
get_cox_ph(surv_5y, covariates=[meta_pcna, rna_sig, codes], formula=f,
interactions=False, print_desc=True);
RNA proliferation signature
In [9]:
rna_sig = pd.read_csv('/cellar/users/agross/TCGA_Code/DX/rna_signature.csv',
index_col=[0,1], header=None, squeeze=True)
rna_sig = rna_sig[:,1]
rna_sig.index.name = ''
In [40]:
f = meta_pcna.copy()
f = (f - f.mean()) / f.std()
f.name = 'feature'
codes_f = codes.ix[(codes.index.intersection(f.index).intersection(stage.index).
intersection(age.index).intersection(old.index))]
codes_f.name = 'codes'
m = get_cox_ph(surv_5y, f, [age, old, stage, codes_f],
print_desc=False, interactions=False)
ci = convert_robj(robjects.r.summary(m)[7])
ci.index = ci.index.map(lambda s: s.replace('codes', ''))
ci.index = ci.index.map(lambda s: s.replace('stagestage', 'stage'))
pancan_ci = ci.ix['feature']
pancan_ci.name = 'PANCAN'
In [41]:
v = meta_pcna
ci = {}
for c in codes.unique():
try:
pts = ti(codes == c)
f = v.ix[pts].dropna()
cov = [b.ix[pts].dropna() for b in [old, age] if
b.ix[pts].dropna().unique().shape[0] > 1]
f = (f - f.mean()) / f.std()
f.name = 'featue'
ff = ' + '.join(['feature'] + [b.name for b in cov])
formula = robjects.Formula('Surv(days, event) ~ ' + ff)
m1 = get_cox_ph(surv_5y, f, covariates=cov, formula=formula,
print_desc=False, interactions=False);
ci[c] = convert_robj(robjects.r.summary(m1)[7])
except:
print c
cc = pd.concat(ci).xs('feature', level=1)
cc = cc.sort('exp(coef)', ascending=False)
cc = cc[(cc['upper .95'] - cc['lower .95']) < 4]
In [42]:
fig, ax = subplots(figsize=(8,4))
c2 = pd.DataFrame([pancan_ci]).append(cc)
colors = ['#E66101'] + (['grey'] * len(cc))
plot_HR(c2, ax=ax, colors=colors)
ax.set_xlim(-1.1, 2.5)
ax.set_xticks([-1,0,1,2])
ax.set_xticklabels(2.**ax.get_xticks())
codes_f = codes.ix[(codes.index.intersection(v.index).intersection(stage.index).
intersection(age.index).intersection(old.index))]
label = ['{} ({})'.format(i,c) for i,c in
codes_f.value_counts().ix[cc.index].iteritems()]
label = ['PANCAN ({})'.format(len(codes_f))] + label
ax.set_yticklabels(label, fontsize=7.5);
ax.set_xlabel('Hazard Ratio')
fig.tight_layout()
In [61]:
fig, axs = subplots(1,2, figsize=(8,3))
ax = axs[0]
f = dx_rna.frac
plot_regression(f - .5, pancan_haz, density=True, rad=.03, s=20,
line_args={'lw':0}, rasterized=True, ax=ax)
ann = ax.get_children()[4]
ann.set_text(ann.get_text().split()[0])
ax.set_ybound(-.3,.55)
ax.set_yticks(np.log([.75, 1, 1.25, 1.5]))
ax.set_yticklabels(np.round(np.exp(ax.get_yticks()), 2))
ax.set_ylabel('Hazard Ratio')
ax.set_xticklabels(ax.get_xticks() + .5)
ax.set_xlabel('Fraction Overexpressed')
ax = axs[1]
pts = ti(codes.str.startswith('K')).intersection(ti(age < 85))
draw_survival_curve(to_quants(rna_sig.ix[pts].dropna(), std=1, labels=True),
surv_5y, ax=ax)
prettify_ax(ax)
fig.tight_layout()
In [62]:
fig = plt.figure(figsize=(8, 7.5))
ax1 = plt.subplot2grid((7, 2), (0, 0), rowspan=3)
ax2 = plt.subplot2grid((7, 2), (0, 1), rowspan=3)
ax3 = plt.subplot2grid((7, 2), (3, 0), colspan=2, rowspan=4)
ax = ax1
f = dx_rna.frac
plot_regression(f - .5, pancan_haz, density=True, rad=.03, s=20,
line_args={'lw':0}, rasterized=True, ax=ax)
ann = ax.get_children()[4]
ann.set_text(ann.get_text().split()[0])
ax.set_ybound(-.3,.55)
ax.set_yticks(np.log([.75, 1, 1.25, 1.5]))
ax.set_yticklabels(np.round(np.exp(ax.get_yticks()), 2))
ax.set_ylabel('Hazard Ratio')
ax.set_xticklabels(ax.get_xticks() + .5)
ax.set_xlabel('Fraction Overexpressed')
ax = ax2
pts = ti(codes.str.startswith('K')).intersection(ti(age < 85))
draw_survival_curve(to_quants(rna_sig.ix[pts].dropna(), std=1, labels=True),
surv_5y, ax=ax)
prettify_ax(ax)
ax = ax3
c2 = pd.DataFrame([pancan_ci]).append(cc)
colors = ['#E66101'] + (['grey'] * len(cc))
plot_HR(c2, ax=ax, colors=colors)
ax.set_xlim(-1.1, 2.5)
ax.set_xticks([-1,0,1,2])
ax.set_xticklabels(2.**ax.get_xticks())
codes_f = codes.ix[(codes.index.intersection(v.index).intersection(stage.index).
intersection(age.index).intersection(old.index))]
label = ['{} ({})'.format(i,c) for i,c in
codes_f.value_counts().ix[cc.index].iteritems()]
label = ['PANCAN ({})'.format(len(codes_f))] + label
ax.set_yticklabels(label, fontsize=7.5);
ax.set_xlabel('Association of Tumorness with Survival (Hazard Ratio)')
letters = list(map(chr, range(97, 123)))[:6]
for i,ax in enumerate([ax1, ax2, ax3]):
ax.text(-0.1, 1.12, letters[i], transform=ax.transAxes,
fontsize=20, fontweight='bold', va='top', ha='right')
fig.tight_layout(pad=1.3, h_pad=3)
fig.savefig('/cellar/users/agross/Desktop/Figures/DX_fig2.png', dpi=300,
bbox_inches='tight')