Meta Analysis of the Datasets for the Epi² pilot project


Cross analysis of datasets


PYTHON 3 / R Notebook

Adrien Leger / EMBL EBI

Starting date 04/08/2016


Import general package and definition of specific functions


In [1]:
# Local lib import
from pycl import *
from pyBioPlot import *

# Standard lib imports
from glob import glob
from collections import OrderedDict
import colorsys
from IPython.core.display import display, HTML, Markdown, Image

# Third party import
import pandas as pd
import numpy as np
import pylab as pl
import seaborn as sns

# Pyplot tweaking
%matplotlib inline
pl.rcParams['figure.figsize'] = 30, 10  # that's default image size for this interactive session

# Jupyter display tweaking 
toogle_code()
larger_display(75)

# Simplify warning reporting to lighten the notebook style 
import warnings
warnings.formatwarning=(lambda message, *args: "{}\n".format(message))

# Allow to use R directly
%load_ext rpy2.ipython


Toggle on/off the raw code

Comparison RNA PTM datasets / RNA Localization at gene level

PTM datasets Gene level analysis


In [7]:
# parsing file to summarize the datasets I have
d = OrderedDict()
for fp in glob("./PTM/PTM_Annotated_Datasets/*_hg38_cleaned_gencode_v24_lncRNAs_uniq-gene.csv"):
    ls = supersplit(file_name(fp), ["_", "."])    
    author = ls[0]
    ptm = ls[1]
    cell = ls[2]
    file = "_".join(ls[7:])
    
    if ptm not in d:
        d[ptm] = OrderedDict()   
    if cell not in d[ptm]:
        d[ptm][cell] = OrderedDict()
    if author not in d[ptm][cell]:
        d[ptm][cell][author] = []
    d[ptm][cell][author].append(fastcount(fp))

display(Markdown("**List of the datasets I already analysed**"))
print(dict_to_report(d))


List of the datasets I already analysed

A>I
	Brain
		Sakurai:[319]
	Various
		DARNED:[1300]
		RADAR:[3343]
	YH
		Peng:[505]
Y
	HEK293
		Li:[44]
	HEK293:Fibroblast
		Schwartz:[10]
	HeLa
		Carlile:[3]
m1A
	HeLa:HEK293:HepG2
		Dominissini:[338]
m5C
	HEK293
		Hussain:[39]
	HeLa
		Squires:[112]
	MEF
		Khoddami:[36]
m6A
	HEK293
		Meyer:[16]
	HepG2
		Dominissini:[84]
m6A:m6Am
	HEK293
		Linder:[168]

RNA Modification centered analysis


In [3]:
FDR = 0.05
min_targets = 20

# Define the categories of lnc and coding RNA
lncRNA_types = ["lincRNA", "antisense", "TEC", "sense_intronic", "processed_transcript", "sense_overlapping", "3prime_overlapping_ncrna", "non_coding", "bidirectional_promoter_lncrna", "macro_lncRNA"]
coding_RNA_types = ["protein_coding", "transcribed_processed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene"]

# Read the main data
df = pd.read_table("./Localisation/Djebali-ENCODE/sleuth/gene_allRNA/ALL/localizationnucleus_wald_test.tsv")

# Deal with the 2 rnatypes
gene_df = pd.read_table( "../../Reference_Annotation/gencode_v24_gene.tsv")
# Subsetting coding genes
coding_RNA_types = ["protein_coding", "transcribed_processed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene"]
codingRNA_id = gene_df.ID[(gene_df.gene_type.isin(coding_RNA_types))]
# Subsetting lncRNA genes
lncRNA_types = ["lincRNA", "antisense", "TEC", "sense_intronic", "processed_transcript", "sense_overlapping", "3prime_overlapping_ncrna", "non_coding", "bidirectional_promoter_lncrna", "macro_lncRNA"]
lncRNA_id = gene_df.ID[(gene_df.gene_type.isin(lncRNA_types))]

hl_types = [
    {"target_id":lncRNA_id, "label":"All lncRNA", "linestyle":'--', "color":"black"},
    {"target_id":codingRNA_id, "label":"All Coding RNA", "linestyle":':', "color":"black"},]

for modif in ["m6A", "m5C", "_Y_", "A>I", "m1A"]:  
    PTM_dataset_list = sorted(glob("./PTM/PTM_Annotated_Datasets/*{}*_hg38_cleaned_gencode_v24_uniq-gene.csv".format(modif)))
    hl_lncRNA = []
    hl_codingRNA = []
    color = get_color_list(n=len(PTM_dataset_list), colormap="Set1")
    
    for fp in PTM_dataset_list:
        PTM_df = pd.read_table(fp, names=["target_id", "gene_name", "gene_type", "count"])
        c = next(color)
        
        # lncRNA
        hl_lncRNA.append({
                "target_id": PTM_df.target_id[(PTM_df.target_id.isin(lncRNA_id))], "label":"lnc RNA_{}".format(file_name(fp).split("_hg38_")[0]), "linestyle":'--', "color":c})
        
        # codingRNA
        hl_codingRNA.append({
                "target_id": PTM_df.target_id[(PTM_df.target_id.isin(codingRNA_id))], "label":"coding RNA_{}".format(file_name(fp).split("_hg38_")[0]), "linestyle":':', "color":c})
    
    plot_text(modif, align="left", fontsize=20, fontweight="bold")
    
    # lncRNA
    hl_lncRNA.sort(key=lambda x: len(x["target_id"]), reverse=True)
    plot_text("  Long non-coding RNAs", align="left", fontsize=15, fontweight="bold")  
    volcano_plot (df=df, highlight_list=hl_lncRNA, X="b", Y="qval", FDR=FDR, X_cutoff=0, figsize=[20,5], xlim=[-6,6],
             xlabel = "Cytosol <<< beta factor >>> Nucleus", ylabel = "-log10(qval)", 
             alpha=0.5, sig_neg_color="0.9", sig_pos_color="0.9", non_sig_color="0.95", highlight_FDR=FDR, highlight_min_targets=min_targets)
    
    MA_plot (df=df, highlight_list=hl_lncRNA, X="mean_obs", Y="b", FDR=FDR, FDR_col="qval", figsize=[20,5],
             xlabel="Mean expression", ylabel="Cytosol <<< beta factor >>> Nucleus",
             alpha=0.5, sig_neg_color="0.9", sig_pos_color="0.9", non_sig_color="0.95", highlight_FDR=FDR, highlight_min_targets=min_targets)
    
    # codingRNA
    hl_codingRNA.sort(key=lambda x: len(x["target_id"]), reverse=True)
    plot_text("  Coding RNAs", align="left", fontsize=15, fontweight="bold")
    volcano_plot (df=df, highlight_list=hl_codingRNA, X="b", Y="qval", FDR=FDR, X_cutoff=0, figsize=[20,5], xlim=[-6,6],
             xlabel = "Cytosol <<< beta factor >>> Nucleus", ylabel = "-log10(qval)", 
             alpha=0.5, sig_neg_color="0.9", sig_pos_color="0.9", non_sig_color="0.95", highlight_FDR=FDR, highlight_min_targets=min_targets)
    
    MA_plot (df=df, highlight_list=hl_codingRNA, X="mean_obs", Y="b", FDR=FDR, FDR_col="qval", figsize=[20,5],
             xlabel="Mean expression", ylabel="Cytosol <<< beta factor >>> Nucleus",
             alpha=0.5, sig_neg_color="0.9", sig_pos_color="0.9", non_sig_color="0.95", highlight_FDR=FDR, highlight_min_targets=min_targets)
    
    # Density plot with both RNA types
    HL = hl_types + hl_lncRNA + hl_codingRNA
    plot_text("  Density lncRNA + Coding RNAs", align="left", fontsize=15, fontweight="bold")
    density_plot(df, "b", FDR=FDR, FDR_col="qval", figsize=[15,5], ylabel="Cumulative Beta value", highlight_list= HL, cumulative = True, highlight_min_targets=min_targets)


using a non-integer number instead of an integer will result in an error in the future
More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).

Gene biotypes centered analysis


In [5]:
# Read the main data
df = pd.read_table("./Localisation/Djebali-ENCODE/sleuth/gene_allRNA/ALL/localizationnucleus_wald_test.tsv")
# Read the tables containing all gencode genes information 
gene_df = pd.read_table( "../../Reference_Annotation/gencode_v24_gene.tsv")
# Define the types of RNA
RNA_type_list= OrderedDict()
RNA_type_list["protein_coding"] = gene_df.ID[(gene_df.gene_type == "protein_coding")]
RNA_type_list["lincRNA"] = gene_df.ID[(gene_df.gene_type == "lincRNA")]
RNA_type_list["antisense"] = gene_df.ID[(gene_df.gene_type == "antisense")]
RNA_type_list["TEC"] = gene_df.ID[(gene_df.gene_type == "TEC")]
RNA_type_list["sense_intronic"] = gene_df.ID[(gene_df.gene_type == "sense_intronic")]
RNA_type_list["misc_RNA"] = gene_df.ID[(gene_df.gene_type == "misc_RNA")]
RNA_type_list["processed_transcript"] = gene_df.ID[(gene_df.gene_type == "processed_transcript")]
RNA_type_list["sense_overlapping"] = gene_df.ID[(gene_df.gene_type == "sense_overlapping")]
RNA_type_list["processed_pseudogene"] = gene_df.ID[(gene_df.gene_type == "processed_pseudogene")]
RNA_type_list["transcribed_unprocessed_pseudogene"] = gene_df.ID[(gene_df.gene_type == "transcribed_unprocessed_pseudogene")]
RNA_type_list["processed_pseudogene"] = gene_df.ID[(gene_df.gene_type == "processed_pseudogene")]
RNA_type_list["unprocessed_pseudogene"] = gene_df.ID[(gene_df.gene_type == "unprocessed_pseudogene")]
RNA_type_list["transcribed_processed_pseudogene"] = gene_df.ID[(gene_df.gene_type == "transcribed_processed_pseudogene")]
RNA_type_list["all pseudogenes"] = gene_df.ID[(gene_df.gene_type.isin(["processed_pseudogene","transcribed_unprocessed_pseudogene", "unprocessed_pseudogene", "transcribed_processed_pseudogene"]))]
#RNA_type_list["short_RNA"] = gene_df.ID[(gene_df.gene_type.isin(["miRNA","snoRNA","snRNA"]))]

display(Markdown("**List of gene types**"))
for RNA_type, gene_id_list in RNA_type_list.items():
    print("{:40}\tcount: {}".format(RNA_type, len(gene_id_list)))


List of gene types

protein_coding                          	count: 19815
lincRNA                                 	count: 7674
antisense                               	count: 5564
TEC                                     	count: 1053
sense_intronic                          	count: 919
misc_RNA                                	count: 2298
processed_transcript                    	count: 502
sense_overlapping                       	count: 193
processed_pseudogene                    	count: 10283
transcribed_unprocessed_pseudogene      	count: 682
unprocessed_pseudogene                  	count: 2612
transcribed_processed_pseudogene        	count: 445
all pseudogenes                         	count: 14022

In [10]:
FDR = 0.05
min_targets = 20

for RNA_type, gene_id_list in RNA_type_list.items():
    
    # Parse the dataset 
    hl=[]
    for fp in sorted(glob("./PTM/PTM_Annotated_Datasets/*_hg38_cleaned_gencode_v24_uniq-gene.csv")):
        PTM_df = pd.read_table(fp, names=["target_id", "gene_name", "gene_type", "count"])
        PTM_hl_list = PTM_df.target_id[(PTM_df.target_id.isin(gene_id_list))]
        hl.append({"target_id": PTM_hl_list, "label":file_name(fp).split("_hg38_")[0]})
    hl.sort(key=lambda x: len(x["target_id"]), reverse=True)
      
    plot_text(RNA_type, align="left", fontsize=20, fontweight="bold")
    
    HL = [{"target_id":gene_id_list, "label":RNA_type, "linestyle":'--', "color":"0.7"}]+hl
    volcano_plot (df=df, highlight_list=HL, X="b", Y="qval", FDR=FDR, X_cutoff=0, figsize=[20,5], xlim=[-6,6],
             xlabel = "Cytosol <<< beta factor >>> Nucleus", ylabel = "-log10(qval)", 
             alpha=0.5, sig_neg_color="0.9", sig_pos_color="0.9", non_sig_color="0.95", highlight_FDR=FDR, highlight_min_targets=min_targets)
    
    MA_plot (df=df, highlight_list=HL, X="mean_obs", Y="b", FDR=FDR, FDR_col="qval", figsize=[20,5],
             xlabel="Mean expression", ylabel="Cytosol <<< beta factor >>> Nucleus",
             alpha=0.5, sig_neg_color="0.9", sig_pos_color="0.9", non_sig_color="0.95", highlight_FDR=FDR, highlight_min_targets=min_targets)
    
    HL = hl+[{"target_id":gene_id_list, "label":RNA_type, "linestyle":'--', "color":"black"}]
    density_plot(df, "b", FDR=FDR, FDR_col="qval", figsize=[20,5], ylabel="Cumulative Beta value", cumulative = True,
                 highlight_list= HL, highlight_FDR=FDR, highlight_min_targets=min_targets)


using a non-integer number instead of an integer will result in an error in the future
More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).

Comparison lncRNA PTM datasets / lncRNA Localization at transcript level

Considering the methods used by most of the genome wide PTM investigation I don't know if it is possible to determine the transcript of origin of the modified RNA ?? Possibility to use Kallisto or other methods eventually perform a transcript level analysis.


In [ ]:


Comparison Localization datasets / RNA protein interaction

Summary of the datasets


In [7]:
df = pd.read_csv("./Protein_interaction/Li_ClipSeq/Protocol_info/Selected_datasets.csv", sep="\t", index_col=0)
df


Out[7]:
protein experiment cell_line replicate treatment localization author accession filepath
Dataset name
HPAR1 FXR1 PAR-CLIP HEK293 1 wildtype FXR1, 4-thiouridine, RNase T1 cytosol ascano GSE39682 /home/aleg/Data/Datasets/Epi2_pilot/Protein_in...
HPTA1 AGO1 PAR-CLIP HEK293 1 4-thiouridine, RNase T1 cytosol hafner GSE21918 /home/aleg/Data/Datasets/Epi2_pilot/Protein_in...
HPTA3 AGO3 PAR-CLIP HEK293 1 4-thiouridine, RNase T1 cytosol hafner GSE21918 /home/aleg/Data/Datasets/Epi2_pilot/Protein_in...
HPTA4 AGO4 PAR-CLIP HEK293 1 4-thiouridine, RNase T1 cytosol hafner GSE21918 /home/aleg/Data/Datasets/Epi2_pilot/Protein_in...
HPTPU PUM2 PAR-CLIP HEK293 1 4-thiouridine, RNase T1 cytosol hafner GSE21918 /home/aleg/Data/Datasets/Epi2_pilot/Protein_in...
HPTLB LIN28B PAR-CLIP HEK293 1 4-thiouridine, RNase T1 cytosol hafner GSE44615 /home/aleg/Data/Datasets/Epi2_pilot/Protein_in...
HPRO6 AGO1 PAR-CLIP HEK293 1 4-thiouridine, ML_MM_6 cytosol memczak GSE43573 /home/aleg/Data/Datasets/Epi2_pilot/Protein_in...
HPRO7 AGO1 PAR-CLIP HEK293 2 4-thiouridine, ML_MM_7 cytosol memczak GSE43573 /home/aleg/Data/Datasets/Epi2_pilot/Protein_in...
HPRO8 AGO1 PAR-CLIP HEK293 3 4-thiouridine, ML_MM_8 cytosol memczak GSE43573 /home/aleg/Data/Datasets/Epi2_pilot/Protein_in...
HPLAL ALKBH5 PAR-CLIP HEK293 1 4-thiouridine nucleus baltz GSE38201 /home/aleg/Data/Datasets/Epi2_pilot/Protein_in...
HPLZC ZC3H7B PAR-CLIP HEK293 1 4-thiouridine nucleus baltz GSE38201 /home/aleg/Data/Datasets/Epi2_pilot/Protein_in...
HPHFW FUS PAR-CLIP HEK293 1 4-thiouridine nucleus hoell SRA025082 /home/aleg/Data/Datasets/Epi2_pilot/Protein_in...
HPHTA TAF15 PAR-CLIP HEK293 1 4-thiouridine nucleus hoell SRA025082 /home/aleg/Data/Datasets/Epi2_pilot/Protein_in...
HIUHN hnRNPC iCLIP HeLa 1 - nucleus konig SupplementaryData /home/aleg/Data/Datasets/Epi2_pilot/Protein_in...
HHCD1 DGCR8 HITS-CLIP HEK293 1 D8 (endogenous DGCR8), replicate 1 nucleus macias GSE39086 /home/aleg/Data/Datasets/Epi2_pilot/Protein_in...
HHCD2 DGCR8 HITS-CLIP HEK293 2 D8 (endogenous DGCR8), replicate 2 nucleus macias GSE39086 /home/aleg/Data/Datasets/Epi2_pilot/Protein_in...
HHMF1 FUS HITS-CLIP Brain 1 replicate1, RNase T1 nucleus nakaya GSE43308 /home/aleg/Data/Datasets/Epi2_pilot/Protein_in...
HHMF2 FUS HITS-CLIP Brain 2 replicate2, RNase T1 nucleus nakaya GSE43308 /home/aleg/Data/Datasets/Epi2_pilot/Protein_in...
HHMF3 FUS HITS-CLIP Brain 3 replicate3, RNase T1 nucleus nakaya GSE43308 /home/aleg/Data/Datasets/Epi2_pilot/Protein_in...
HIUHC hnRNPC iCLIP HeLa 1 untreated nucleus zarnack E-MTAB-1371 /home/aleg/Data/Datasets/Epi2_pilot/Protein_in...

In [38]:
# Read the localization data
loc_df = pd.read_table("./Localisation/Djebali-ENCODE/sleuth/gene_allRNA/ALL/localizationnucleus_wald_test.tsv")
# Read the interaction data
protein_df = pd.read_table("./Protein_interaction/Li_ClipSeq/final_datasets/ALL_dataset_uniq_genes.tsv")

nuclearness_list=[]
    
for i in range(protein_df["nuclearness"].unique().max(), protein_df["nuclearness"].unique().min()-1, -1):
    nuclearness_list.append({
        "target_id": protein_df.gene_id[protein_df.nuclearness==i],
        "label":str(i), "alpha":abs(i)/protein_df.nuclear_protein.max(), "linestyle":'--'})

plot_text("Cytoplasmic/Nuclear proteins interactions compared with RNA enrichment in nucleus/cytosol (Nuclearness)", align="left", fontsize=20, fontweight="bold")
volcano_plot (df=loc_df, highlight_list=nuclearness_list, X="b", Y="qval", FDR=0.01, X_cutoff=0, figsize=[15,5], 
              xlabel = "Nucleus localization", sig_color="0.9", non_sig_color="1", xlim=[-6,6], alpha=0.5, highlight_palette="RdYlBu")
MA_plot (df=loc_df, highlight_list=nuclearness_list, X="mean_obs", Y="b", FDR=0.01, FDR_col="qval", figsize=[15,5], 
         xlabel="Mean expression", ylabel="Nucleus localization", sig_color="0.9", non_sig_color="1", alpha=0.5, highlight_palette="RdYlBu")



In [34]:
# Read the localization data
loc_df = pd.read_table("./Localisation/Djebali-ENCODE/sleuth/gene_allRNA/ALL/localizationnucleus_wald_test.tsv")
# Read the interaction data
protein_df = pd.read_table("./Protein_interaction/Li_ClipSeq/final_datasets/ALL_dataset_uniq_genes.tsv")

loc_list=[]
for i in range(protein_df.nuclear_protein.max(),3, -1):
    loc_list.append({
            "target_id": protein_df.gene_id[(protein_df.cytosol_protein==0) & (protein_df.nuclear_protein==i)],
            "label":"Cytoplasmic == 0 / Nuclear == {}".format(i), "alpha":i/protein_df.nuclear_protein.max()})

range_cytosol = range(2, protein_df.cytosol_protein.max())
for i in range_cytosol:
    loc_list.append({
            "target_id": protein_df.gene_id[(protein_df.cytosol_protein==i) & (protein_df.nuclear_protein==0)],
            "label":"Cytoplasmic == {} / Nuclear == 0 ".format(i), "alpha":i/protein_df.cytosol_protein.max()})

plot_text("Cytoplasmic/Nuclear proteins interactions compared with RNA enrichment in nucleus/cytosol (Excluding mixed nucleus/cytoplasm)", align="left", fontsize=20, fontweight="bold")
volcano_plot (df=loc_df, highlight_list=loc_list, X="b", Y="qval", FDR=0.01, X_cutoff=0, figsize=[15,5], 
              xlabel = "Nucleus localization", sig_color="0.9", non_sig_color="1", xlim=[-6,6], alpha=0.5, highlight_palette="RdYlBu")
MA_plot (df=loc_df, highlight_list=loc_list, X="mean_obs", Y="b", FDR=0.01, FDR_col="qval", figsize=[15,5], 
         xlabel="Mean expression", ylabel="Nucleus localization", sig_color="0.9", non_sig_color="1", alpha=0.5, highlight_palette="RdYlBu")


Hard to interpret because the nuclear protein seems to associate mainly with low expression genes (lncRNA ?) while cytoplasmic protein are found enriched in high expression proteins... THe nature of the selected proteins makes it hard to draw any clear conclusion. I don't think it can be used further on..