Tables


In [1]:
cd ..


/cellar/users/agross/TCGA_Code/DX/Notebooks

In [2]:
import NotebookImport
from DX_screen import *


importing IPython notebook from DX_screen
importing IPython notebook from Imports
/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/rpy/__init__.py:11: FutureWarning: The pandas.rpy module is deprecated and will be removed in a future version. We refer to external packages like rpy2. 
See here for a guide on how to port your code to rpy2: http://pandas.pydata.org/pandas-docs/stable/r_interface.html
  FutureWarning)
importing IPython notebook from Global_Parameters

In [3]:
from metaPCNA import *


importing IPython notebook from metaPCNA

GSEA

Gene level fraction upregulated statistic


In [4]:
gs2 = gene_sets.ix[dx_rna.index].fillna(0)
rr = screen_feature(dx_rna.frac, rev_kruskal, gs2.T, 
                    align=False)
fp = (1.*gene_sets.T * dx_rna.frac).T.dropna().replace(0, np.nan).mean().order()
fp.name = 'mean frac'

In [6]:
rr.join(fp).to_csv(FIGDIR + 'f_up_gene_sets.csv')

In [7]:
ff_u = filter_pathway_hits(rr.ix[ti(fp>.5)].p.order(), gs2)
ff_p = filter_pathway_hits(rr.ix[ti(fp<.5)].p.order(), gs2)
ff = ff_u.append(ff_p)

selected = rr.ix[ff[ff < .00001].index].join(fp)
selected.sort('p')


Out[7]:
H p q mean frac
Gene_Set
REACTOME_CELL_CYCLE 343.67 1.01e-76 1.35e-73 0.67
REACTOME_METABOLISM_OF_PROTEINS 145.97 1.32e-33 1.46e-31 0.59
REACTOME_PROCESSING_OF_CAPPED_INTRON_CONTAINING_PRE_MRNA 126.77 2.09e-29 1.64e-27 0.66
REACTOME_SIGNALING_BY_GPCR 70.43 4.78e-17 1.10e-15 0.46
REACTOME_BIOLOGICAL_OXIDATIONS 54.94 1.24e-13 2.07e-12 0.41
REACTOME_TRNA_AMINOACYLATION 53.07 3.21e-13 5.21e-12 0.68
KEGG_PPAR_SIGNALING_PATHWAY 37.42 9.50e-10 1.17e-08 0.39
KEGG_RNA_DEGRADATION 32.92 9.59e-09 1.02e-07 0.61
NABA_ECM_GLYCOPROTEINS 31.19 2.34e-08 2.43e-07 0.44
REACTOME_METABOLISM_OF_NUCLEOTIDES 20.31 6.58e-06 4.81e-05 0.60

In [8]:
selected.to_csv(FIGDIR + 'f_up_gene_sets_selected.csv')

Looking for subsets of the cell-cycle pathway


In [9]:
d = pd.DataFrame({g: gs2['REACTOME_CELL_CYCLE'] for g in gs2.columns})
a,b = odds_ratio_df(d.T>0, gs2.T>0)

In [10]:
dd = rr.ix[ti((a > 100) & (rr.q < 10e-15))].join(fp).sort(fp.name, ascending=False)
filter_pathway_hits(dd, gs2)


Out[10]:
H p q mean frac
REACTOME_M_G1_TRANSITION 135.57 2.48e-31 2.20e-29 0.74
REACTOME_DEPOSITION_OF_NEW_CENPA_CONTAINING_NUCLEOSOMES_AT_THE_CENTROMERE 99.38 2.09e-23 1.11e-21 0.73

In [11]:
(combine(gs2['REACTOME_M_G1_TRANSITION']>0, 
            gs2['REACTOME_DEPOSITION_OF_NEW_CENPA_CONTAINING_NUCLEOSOMES_AT_THE_CENTROMERE']>0)
).value_counts()


Out[11]:
neither    18366
first         72
second        54
dtype: int64

In [12]:
f2 = fp.ix[ti(rr.q < .00001)]
ff_u = filter_pathway_hits(fp.ix[ti(f2>.5)].order()[::-1], gs2)
ff_p = filter_pathway_hits(fp.ix[ti(f2<.5)].order(), gs2)
ff = ff_u.append(ff_p)

selected = rr.ix[ff.index].join(f2)
selected.ix[(f2 - .5).abs().order().index[::-1]].dropna()


Out[12]:
H p q mean frac
REACTOME_UNWINDING_OF_DNA 28.61 8.86e-08 8.24e-07 0.81
REACTOME_EXTENSION_OF_TELOMERES 55.78 8.12e-14 1.40e-12 0.76
BIOCARTA_PROTEASOME_PATHWAY 56.05 7.07e-14 1.25e-12 0.74
REACTOME_DEPOSITION_OF_NEW_CENPA_CONTAINING_NUCLEOSOMES_AT_THE_CENTROMERE 99.38 2.09e-23 1.11e-21 0.73
REACTOME_FANCONI_ANEMIA_PATHWAY 30.87 2.76e-08 2.80e-07 0.71
PID_FOXM1_PATHWAY 35.10 3.13e-09 3.59e-08 0.70
REACTOME_CYTOSOLIC_TRNA_AMINOACYLATION 34.10 5.24e-09 5.91e-08 0.70
REACTOME_MRNA_SPLICING_MINOR_PATHWAY 52.40 4.53e-13 7.17e-12 0.69
REACTOME_NEP_NS2_INTERACTS_WITH_THE_CELLULAR_EXPORT_MACHINERY 33.27 8.02e-09 8.74e-08 0.67
KEGG_FATTY_ACID_METABOLISM 47.69 5.00e-12 7.15e-11 0.33
REACTOME_SRP_DEPENDENT_COTRANSLATIONAL_PROTEIN_TARGETING_TO_MEMBRANE 83.95 5.07e-20 1.82e-18 0.63
REACTOME_DEADENYLATION_DEPENDENT_MRNA_DECAY 27.93 1.26e-07 1.15e-06 0.63
REACTOME_ASPARAGINE_N_LINKED_GLYCOSYLATION 36.83 1.29e-09 1.55e-08 0.61
REACTOME_CYTOCHROME_P450_ARRANGED_BY_SUBSTRATE_TYPE 25.97 3.46e-07 2.95e-06 0.40
REACTOME_G_ALPHA_S_SIGNALLING_EVENTS 33.07 8.90e-09 9.54e-08 0.42
NABA_ECM_GLYCOPROTEINS 31.19 2.34e-08 2.43e-07 0.44
NABA_SECRETED_FACTORS 38.68 4.98e-10 6.37e-09 0.45

In [13]:
selected.to_csv(FIGDIR + 'f_up_gene_sets_selected_fc.csv')

Gene level proliferation statistic


In [14]:
gs2 = gene_sets.ix[pcna_corr.dropna().index].fillna(0)
rr = screen_feature(pcna_corr, rev_kruskal, gs2.T, 
                    align=False)
fp = (1.*gene_sets.T * pcna_corr).T.dropna().replace(0, np.nan).mean().order()
fp.name = 'mean score'

In [15]:
rr.join(fp).to_csv(FIGDIR + 'pcna_gene_sets.csv')

In [16]:
ff_u = filter_pathway_hits(rr.ix[ti(fp>0)].p.order(), gs2)
ff_p = filter_pathway_hits(rr.ix[ti(fp<0)].p.order(), gs2)
ff = ff_u.append(ff_p)
selected = rr.ix[ff[ff < .00001].index].join(fp)
selected.sort('p')


Out[16]:
H p q mean score
Gene_Set
REACTOME_CELL_CYCLE 415.34 2.52e-92 3.35e-89 4.18e-01
REACTOME_GPCR_DOWNSTREAM_SIGNALING 278.96 1.27e-62 3.37e-60 2.18e-02
NABA_MATRISOME_ASSOCIATED 227.53 2.06e-51 3.04e-49 2.02e-02
REACTOME_PROCESSING_OF_CAPPED_INTRON_CONTAINING_PRE_MRNA 216.76 4.60e-49 6.12e-47 4.32e-01
KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION 144.05 3.47e-33 2.56e-31 -1.41e-02
NABA_ECM_AFFILIATED 82.47 1.07e-19 3.24e-18 -1.56e-02
NABA_CORE_MATRISOME 79.20 5.62e-19 1.66e-17 2.27e-02
KEGG_COMPLEMENT_AND_COAGULATION_CASCADES 71.09 3.41e-17 8.39e-16 -1.02e-01
KEGG_CELL_ADHESION_MOLECULES_CAMS 65.29 6.46e-16 1.36e-14 -1.69e-02
KEGG_RNA_DEGRADATION 62.89 2.19e-15 4.22e-14 3.40e-01
REACTOME_TRNA_AMINOACYLATION 54.13 1.88e-13 2.77e-12 3.62e-01
REACTOME_PEPTIDE_CHAIN_ELONGATION 40.34 2.14e-10 2.24e-09 1.15e-02
REACTOME_PHASE1_FUNCTIONALIZATION_OF_COMPOUNDS 35.89 2.08e-09 1.98e-08 -1.71e-02
REACTOME_BIOLOGICAL_OXIDATIONS 33.62 6.70e-09 6.11e-08 2.95e-02
REACTOME_MITOCHONDRIAL_PROTEIN_IMPORT 33.46 7.29e-09 6.55e-08 2.94e-01
KEGG_BASAL_TRANSCRIPTION_FACTORS 32.91 9.67e-09 8.58e-08 3.24e-01
REACTOME_GENERIC_TRANSCRIPTION_PATHWAY 27.15 1.88e-07 1.39e-06 1.80e-01
REACTOME_INTERFERON_GAMMA_SIGNALING 25.04 5.61e-07 3.87e-06 3.69e-03
KEGG_DILATED_CARDIOMYOPATHY 22.09 2.60e-06 1.59e-05 3.25e-02
REACTOME_RESPIRATORY_ELECTRON_TRANSPORT 19.97 7.86e-06 4.52e-05 2.29e-01

In [17]:
selected.to_csv(FIGDIR + 'pcna_gene_sets_selected.csv')

In [18]:
f2 = fp.ix[ti(rr.q < .0001)]
ff_u = filter_pathway_hits(fp.ix[ti(f2>0)].order()[::-1], gs2)
ff_p = filter_pathway_hits(fp.ix[ti(f2<0)].order(), gs2)
ff = ff_u.append(ff_p)

selected = rr.ix[ff.index].join(f2)
selected.sort('p')


Out[18]:
H p q mean score
KEGG_OLFACTORY_TRANSDUCTION 162.64 3.00e-37 3.07e-35 2.21e-02
REACTOME_MITOTIC_PROMETAPHASE 142.12 9.16e-33 6.42e-31 5.37e-01
NABA_CORE_MATRISOME 79.20 5.62e-19 1.66e-17 2.27e-02
REACTOME_EXTENSION_OF_TELOMERES 61.80 3.80e-15 6.83e-14 5.69e-01
KEGG_GRAFT_VERSUS_HOST_DISEASE 60.74 6.51e-15 1.15e-13 -1.28e-01
BIOCARTA_PROTEASOME_PATHWAY 57.76 2.96e-14 4.63e-13 4.42e-01
REACTOME_PROCESSING_OF_CAPPED_INTRONLESS_PRE_MRNA 45.54 1.50e-11 1.79e-10 4.63e-01
REACTOME_DEADENYLATION_DEPENDENT_MRNA_DECAY 43.52 4.19e-11 4.85e-10 3.41e-01
PID_ATM_PATHWAY 41.50 1.18e-10 1.29e-09 4.11e-01
REACTOME_CHEMOKINE_RECEPTORS_BIND_CHEMOKINES 40.49 1.97e-10 2.08e-09 -4.27e-02
REACTOME_G0_AND_EARLY_G1 40.11 2.40e-10 2.50e-09 5.43e-01
KEGG_JAK_STAT_SIGNALING_PATHWAY 40.06 2.46e-10 2.54e-09 3.33e-02
REACTOME_RNA_POL_I_PROMOTER_OPENING 38.13 6.63e-10 6.68e-09 2.90e-01
KEGG_RIBOSOME 37.75 8.03e-10 8.03e-09 1.52e-02
NABA_ECM_REGULATORS 35.94 2.03e-09 1.95e-08 4.69e-02
REACTOME_BIOLOGICAL_OXIDATIONS 33.62 6.70e-09 6.11e-08 2.95e-02
REACTOME_MITOCHONDRIAL_PROTEIN_IMPORT 33.46 7.29e-09 6.55e-08 2.94e-01
REACTOME_UNWINDING_OF_DNA 32.07 1.48e-08 1.31e-07 7.53e-01
REACTOME_CYTOSOLIC_TRNA_AMINOACYLATION 31.57 1.92e-08 1.63e-07 3.66e-01
REACTOME_INITIAL_TRIGGERING_OF_COMPLEMENT 27.82 1.33e-07 9.98e-07 -2.15e-01
REACTOME_FORMATION_OF_FIBRIN_CLOT_CLOTTING_CASCADE 26.98 2.06e-07 1.51e-06 -7.95e-02
KEGG_PPAR_SIGNALING_PATHWAY 25.95 3.51e-07 2.45e-06 2.41e-03
REACTOME_RNA_POL_I_TRANSCRIPTION_INITIATION 24.91 6.01e-07 4.10e-06 3.41e-01
REACTOME_BILE_ACID_AND_BILE_SALT_METABOLISM 23.80 1.07e-06 7.03e-06 -4.40e-02
BIOCARTA_DC_PATHWAY 22.97 1.65e-06 1.06e-05 -6.99e-02
KEGG_DILATED_CARDIOMYOPATHY 22.09 2.60e-06 1.59e-05 3.25e-02
REACTOME_RESPIRATORY_ELECTRON_TRANSPORT 19.97 7.86e-06 4.52e-05 2.29e-01
REACTOME_NITRIC_OXIDE_STIMULATES_GUANYLATE_CYCLASE 18.78 1.47e-05 8.09e-05 -4.35e-02
REACTOME_GASTRIN_CREB_SIGNALLING_PATHWAY_VIA_PKC_AND_MAPK 18.77 1.47e-05 8.09e-05 6.64e-02
KEGG_LINOLEIC_ACID_METABOLISM 18.69 1.54e-05 8.38e-05 -3.85e-02

In [19]:
selected.to_csv(FIGDIR + 'pcna_gene_sets_selected_fc.csv')

Gene level detrended fraction upregulated statistic


In [31]:
gs2 = gene_sets.ix[f_win.dropna().index].fillna(0)
rr = screen_feature(f_win, rev_kruskal, gs2.T, 
                    align=False)
fp = (1.*gene_sets.T * f_win).T.dropna().replace(0, np.nan).mean().order()
fp.name = 'mean score'

In [35]:
(rr.q < .00001).value_counts()


Out[35]:
False    1271
True       59
dtype: int64

In [22]:
rr.join(fp).to_csv(FIGDIR + 'detrended_fup_sets.csv')

In [23]:
f2 = fp.ix[ti(rr.q < .0001)]
ff_u = filter_pathway_hits(rr.ix[ti(f2>0)].p.order(), gs2)
ff_p = filter_pathway_hits(rr.ix[ti(f2<0)].p.order(), gs2)
ff = ff_u.append(ff_p)

selected = rr.ix[ff[ff < .00001].index].join(fp)
selected.sort('p')


Out[23]:
H p q mean score
REACTOME_TRANSLATION 158.08 2.97e-36 3.86e-33 0.13
KEGG_SYSTEMIC_LUPUS_ERYTHEMATOSUS 82.42 1.10e-19 1.22e-17 0.10
KEGG_FATTY_ACID_METABOLISM 42.00 9.14e-11 5.79e-09 -0.13
REACTOME_INTERFERON_ALPHA_BETA_SIGNALING 41.59 1.13e-10 6.81e-09 0.11
KEGG_LYSOSOME 38.06 6.86e-10 3.51e-08 0.07
REACTOME_GENERIC_TRANSCRIPTION_PATHWAY 34.89 3.48e-09 1.36e-07 -0.03
REACTOME_VIF_MEDIATED_DEGRADATION_OF_APOBEC3G 34.13 5.16e-09 1.96e-07 0.09
REACTOME_ASPARAGINE_N_LINKED_GLYCOSYLATION 29.17 6.63e-08 1.76e-06 0.07
REACTOME_EXTRACELLULAR_MATRIX_ORGANIZATION 21.82 2.99e-06 5.45e-05 0.08

In [24]:
selected.to_csv(FIGDIR + 'detrended_fup_sets_selected.csv')

In [25]:
f2 = fp.ix[ti(rr.q < .0001)]
ff_u = filter_pathway_hits(fp.ix[ti(f2>0)].order()[::-1], gs2)
ff_p = filter_pathway_hits(fp.ix[ti(f2<0)].order(), gs2)
ff = ff_u.append(ff_p)

selected = rr.ix[ff.index].join(f2)
selected.sort('p')


Out[25]:
H p q mean score
KEGG_RIBOSOME 144.04 3.47e-33 1.16e-30 0.16
REACTOME_PACKAGING_OF_TELOMERE_ENDS 45.35 1.65e-11 1.15e-09 0.12
KEGG_FATTY_ACID_METABOLISM 42.00 9.14e-11 5.79e-09 -0.13
KEGG_LYSOSOME 38.06 6.86e-10 3.51e-08 0.07
KEGG_GRAFT_VERSUS_HOST_DISEASE 35.52 2.52e-09 1.05e-07 0.11
REACTOME_GENERIC_TRANSCRIPTION_PATHWAY 34.89 3.48e-09 1.36e-07 -0.03
KEGG_PROTEASOME 31.14 2.40e-08 7.43e-07 0.09
REACTOME_N_GLYCAN_TRIMMING_IN_THE_ER_AND_CALNEXIN_CALRETICULIN_CYCLE 24.76 6.48e-07 1.35e-05 0.18
REACTOME_EXTRACELLULAR_MATRIX_ORGANIZATION 21.82 2.99e-06 5.45e-05 0.08

In [26]:
selected.to_csv(FIGDIR + 'detrended_fup_sets_selected_fc.csv')

Diggining into the different telomere gene subsets


In [27]:
[g for g in gs2 if 'TELO' in g]


Out[27]:
['PID_TELOMERASE_PATHWAY',
 'REACTOME_EXTENSION_OF_TELOMERES',
 'REACTOME_PACKAGING_OF_TELOMERE_ENDS',
 'REACTOME_TELOMERE_MAINTENANCE']

In [28]:
a = gs2['REACTOME_PACKAGING_OF_TELOMERE_ENDS']>0
b = gs2['REACTOME_EXTENSION_OF_TELOMERES']>0
a.name = 'end packaging'
b.name = 'extension'
cc = combine(a,b).replace('neither', np.nan).dropna()

In [29]:
fig, ax = subplots(figsize=(4,4))
series_scatter(dx_rna.frac, pcna_corr.ix[ti(cc == 'end packaging')],
               ax=ax, color=colors[0], ann=None, alpha=1, s=30)
series_scatter(dx_rna.frac, pcna_corr.ix[ti(cc == 'extension')],
               ax=ax, color=colors[1], ann=None, alpha=1, s=30)
ax.set_xlabel('Fraction Upregulated')
ax.set_ylabel('Proliferation Score')
ax.legend(['end packaging','extension'], loc='upper left', frameon=True)
prettify_ax(ax)
fig.tight_layout()
fig.savefig(FIGDIR + 'S3_Fig.pdf')



In [30]:
violin_plot_pandas(cc, f_win)