In [1]:
# Local lib import
from pycl import *
from pyBioPlot import *
# Standard lib imports
from glob import glob
from collections import OrderedDict
import colorsys
from IPython.core.display import display, HTML, Markdown, Image
# Third party import
import pandas as pd
import numpy as np
import pylab as pl
import seaborn as sns
# Pyplot tweaking
%matplotlib inline
pl.rcParams['figure.figsize'] = 30, 10 # that's default image size for this interactive session
# Jupyter display tweaking
toogle_code()
larger_display(75)
# Simplify warning reporting to lighten the notebook style
import warnings
warnings.formatwarning=(lambda message, *args: "{}\n".format(message))
# Allow to use R directly
%load_ext rpy2.ipython
In [7]:
# parsing file to summarize the datasets I have
d = OrderedDict()
for fp in glob("./PTM/PTM_Annotated_Datasets/*_hg38_cleaned_gencode_v24_lncRNAs_uniq-gene.csv"):
ls = supersplit(file_name(fp), ["_", "."])
author = ls[0]
ptm = ls[1]
cell = ls[2]
file = "_".join(ls[7:])
if ptm not in d:
d[ptm] = OrderedDict()
if cell not in d[ptm]:
d[ptm][cell] = OrderedDict()
if author not in d[ptm][cell]:
d[ptm][cell][author] = []
d[ptm][cell][author].append(fastcount(fp))
display(Markdown("**List of the datasets I already analysed**"))
print(dict_to_report(d))
In [3]:
FDR = 0.05
min_targets = 20
# Define the categories of lnc and coding RNA
lncRNA_types = ["lincRNA", "antisense", "TEC", "sense_intronic", "processed_transcript", "sense_overlapping", "3prime_overlapping_ncrna", "non_coding", "bidirectional_promoter_lncrna", "macro_lncRNA"]
coding_RNA_types = ["protein_coding", "transcribed_processed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene"]
# Read the main data
df = pd.read_table("./Localisation/Djebali-ENCODE/sleuth/gene_allRNA/ALL/localizationnucleus_wald_test.tsv")
# Deal with the 2 rnatypes
gene_df = pd.read_table( "../../Reference_Annotation/gencode_v24_gene.tsv")
# Subsetting coding genes
coding_RNA_types = ["protein_coding", "transcribed_processed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene"]
codingRNA_id = gene_df.ID[(gene_df.gene_type.isin(coding_RNA_types))]
# Subsetting lncRNA genes
lncRNA_types = ["lincRNA", "antisense", "TEC", "sense_intronic", "processed_transcript", "sense_overlapping", "3prime_overlapping_ncrna", "non_coding", "bidirectional_promoter_lncrna", "macro_lncRNA"]
lncRNA_id = gene_df.ID[(gene_df.gene_type.isin(lncRNA_types))]
hl_types = [
{"target_id":lncRNA_id, "label":"All lncRNA", "linestyle":'--', "color":"black"},
{"target_id":codingRNA_id, "label":"All Coding RNA", "linestyle":':', "color":"black"},]
for modif in ["m6A", "m5C", "_Y_", "A>I", "m1A"]:
PTM_dataset_list = sorted(glob("./PTM/PTM_Annotated_Datasets/*{}*_hg38_cleaned_gencode_v24_uniq-gene.csv".format(modif)))
hl_lncRNA = []
hl_codingRNA = []
color = get_color_list(n=len(PTM_dataset_list), colormap="Set1")
for fp in PTM_dataset_list:
PTM_df = pd.read_table(fp, names=["target_id", "gene_name", "gene_type", "count"])
c = next(color)
# lncRNA
hl_lncRNA.append({
"target_id": PTM_df.target_id[(PTM_df.target_id.isin(lncRNA_id))], "label":"lnc RNA_{}".format(file_name(fp).split("_hg38_")[0]), "linestyle":'--', "color":c})
# codingRNA
hl_codingRNA.append({
"target_id": PTM_df.target_id[(PTM_df.target_id.isin(codingRNA_id))], "label":"coding RNA_{}".format(file_name(fp).split("_hg38_")[0]), "linestyle":':', "color":c})
plot_text(modif, align="left", fontsize=20, fontweight="bold")
# lncRNA
hl_lncRNA.sort(key=lambda x: len(x["target_id"]), reverse=True)
plot_text(" Long non-coding RNAs", align="left", fontsize=15, fontweight="bold")
volcano_plot (df=df, highlight_list=hl_lncRNA, X="b", Y="qval", FDR=FDR, X_cutoff=0, figsize=[20,5], xlim=[-6,6],
xlabel = "Cytosol <<< beta factor >>> Nucleus", ylabel = "-log10(qval)",
alpha=0.5, sig_neg_color="0.9", sig_pos_color="0.9", non_sig_color="0.95", highlight_FDR=FDR, highlight_min_targets=min_targets)
MA_plot (df=df, highlight_list=hl_lncRNA, X="mean_obs", Y="b", FDR=FDR, FDR_col="qval", figsize=[20,5],
xlabel="Mean expression", ylabel="Cytosol <<< beta factor >>> Nucleus",
alpha=0.5, sig_neg_color="0.9", sig_pos_color="0.9", non_sig_color="0.95", highlight_FDR=FDR, highlight_min_targets=min_targets)
# codingRNA
hl_codingRNA.sort(key=lambda x: len(x["target_id"]), reverse=True)
plot_text(" Coding RNAs", align="left", fontsize=15, fontweight="bold")
volcano_plot (df=df, highlight_list=hl_codingRNA, X="b", Y="qval", FDR=FDR, X_cutoff=0, figsize=[20,5], xlim=[-6,6],
xlabel = "Cytosol <<< beta factor >>> Nucleus", ylabel = "-log10(qval)",
alpha=0.5, sig_neg_color="0.9", sig_pos_color="0.9", non_sig_color="0.95", highlight_FDR=FDR, highlight_min_targets=min_targets)
MA_plot (df=df, highlight_list=hl_codingRNA, X="mean_obs", Y="b", FDR=FDR, FDR_col="qval", figsize=[20,5],
xlabel="Mean expression", ylabel="Cytosol <<< beta factor >>> Nucleus",
alpha=0.5, sig_neg_color="0.9", sig_pos_color="0.9", non_sig_color="0.95", highlight_FDR=FDR, highlight_min_targets=min_targets)
# Density plot with both RNA types
HL = hl_types + hl_lncRNA + hl_codingRNA
plot_text(" Density lncRNA + Coding RNAs", align="left", fontsize=15, fontweight="bold")
density_plot(df, "b", FDR=FDR, FDR_col="qval", figsize=[15,5], ylabel="Cumulative Beta value", highlight_list= HL, cumulative = True, highlight_min_targets=min_targets)
In [5]:
# Read the main data
df = pd.read_table("./Localisation/Djebali-ENCODE/sleuth/gene_allRNA/ALL/localizationnucleus_wald_test.tsv")
# Read the tables containing all gencode genes information
gene_df = pd.read_table( "../../Reference_Annotation/gencode_v24_gene.tsv")
# Define the types of RNA
RNA_type_list= OrderedDict()
RNA_type_list["protein_coding"] = gene_df.ID[(gene_df.gene_type == "protein_coding")]
RNA_type_list["lincRNA"] = gene_df.ID[(gene_df.gene_type == "lincRNA")]
RNA_type_list["antisense"] = gene_df.ID[(gene_df.gene_type == "antisense")]
RNA_type_list["TEC"] = gene_df.ID[(gene_df.gene_type == "TEC")]
RNA_type_list["sense_intronic"] = gene_df.ID[(gene_df.gene_type == "sense_intronic")]
RNA_type_list["misc_RNA"] = gene_df.ID[(gene_df.gene_type == "misc_RNA")]
RNA_type_list["processed_transcript"] = gene_df.ID[(gene_df.gene_type == "processed_transcript")]
RNA_type_list["sense_overlapping"] = gene_df.ID[(gene_df.gene_type == "sense_overlapping")]
RNA_type_list["processed_pseudogene"] = gene_df.ID[(gene_df.gene_type == "processed_pseudogene")]
RNA_type_list["transcribed_unprocessed_pseudogene"] = gene_df.ID[(gene_df.gene_type == "transcribed_unprocessed_pseudogene")]
RNA_type_list["processed_pseudogene"] = gene_df.ID[(gene_df.gene_type == "processed_pseudogene")]
RNA_type_list["unprocessed_pseudogene"] = gene_df.ID[(gene_df.gene_type == "unprocessed_pseudogene")]
RNA_type_list["transcribed_processed_pseudogene"] = gene_df.ID[(gene_df.gene_type == "transcribed_processed_pseudogene")]
RNA_type_list["all pseudogenes"] = gene_df.ID[(gene_df.gene_type.isin(["processed_pseudogene","transcribed_unprocessed_pseudogene", "unprocessed_pseudogene", "transcribed_processed_pseudogene"]))]
#RNA_type_list["short_RNA"] = gene_df.ID[(gene_df.gene_type.isin(["miRNA","snoRNA","snRNA"]))]
display(Markdown("**List of gene types**"))
for RNA_type, gene_id_list in RNA_type_list.items():
print("{:40}\tcount: {}".format(RNA_type, len(gene_id_list)))
In [10]:
FDR = 0.05
min_targets = 20
for RNA_type, gene_id_list in RNA_type_list.items():
# Parse the dataset
hl=[]
for fp in sorted(glob("./PTM/PTM_Annotated_Datasets/*_hg38_cleaned_gencode_v24_uniq-gene.csv")):
PTM_df = pd.read_table(fp, names=["target_id", "gene_name", "gene_type", "count"])
PTM_hl_list = PTM_df.target_id[(PTM_df.target_id.isin(gene_id_list))]
hl.append({"target_id": PTM_hl_list, "label":file_name(fp).split("_hg38_")[0]})
hl.sort(key=lambda x: len(x["target_id"]), reverse=True)
plot_text(RNA_type, align="left", fontsize=20, fontweight="bold")
HL = [{"target_id":gene_id_list, "label":RNA_type, "linestyle":'--', "color":"0.7"}]+hl
volcano_plot (df=df, highlight_list=HL, X="b", Y="qval", FDR=FDR, X_cutoff=0, figsize=[20,5], xlim=[-6,6],
xlabel = "Cytosol <<< beta factor >>> Nucleus", ylabel = "-log10(qval)",
alpha=0.5, sig_neg_color="0.9", sig_pos_color="0.9", non_sig_color="0.95", highlight_FDR=FDR, highlight_min_targets=min_targets)
MA_plot (df=df, highlight_list=HL, X="mean_obs", Y="b", FDR=FDR, FDR_col="qval", figsize=[20,5],
xlabel="Mean expression", ylabel="Cytosol <<< beta factor >>> Nucleus",
alpha=0.5, sig_neg_color="0.9", sig_pos_color="0.9", non_sig_color="0.95", highlight_FDR=FDR, highlight_min_targets=min_targets)
HL = hl+[{"target_id":gene_id_list, "label":RNA_type, "linestyle":'--', "color":"black"}]
density_plot(df, "b", FDR=FDR, FDR_col="qval", figsize=[20,5], ylabel="Cumulative Beta value", cumulative = True,
highlight_list= HL, highlight_FDR=FDR, highlight_min_targets=min_targets)
Considering the methods used by most of the genome wide PTM investigation I don't know if it is possible to determine the transcript of origin of the modified RNA ?? Possibility to use Kallisto or other methods eventually perform a transcript level analysis.
In [ ]:
In [7]:
df = pd.read_csv("./Protein_interaction/Li_ClipSeq/Protocol_info/Selected_datasets.csv", sep="\t", index_col=0)
df
Out[7]:
In [38]:
# Read the localization data
loc_df = pd.read_table("./Localisation/Djebali-ENCODE/sleuth/gene_allRNA/ALL/localizationnucleus_wald_test.tsv")
# Read the interaction data
protein_df = pd.read_table("./Protein_interaction/Li_ClipSeq/final_datasets/ALL_dataset_uniq_genes.tsv")
nuclearness_list=[]
for i in range(protein_df["nuclearness"].unique().max(), protein_df["nuclearness"].unique().min()-1, -1):
nuclearness_list.append({
"target_id": protein_df.gene_id[protein_df.nuclearness==i],
"label":str(i), "alpha":abs(i)/protein_df.nuclear_protein.max(), "linestyle":'--'})
plot_text("Cytoplasmic/Nuclear proteins interactions compared with RNA enrichment in nucleus/cytosol (Nuclearness)", align="left", fontsize=20, fontweight="bold")
volcano_plot (df=loc_df, highlight_list=nuclearness_list, X="b", Y="qval", FDR=0.01, X_cutoff=0, figsize=[15,5],
xlabel = "Nucleus localization", sig_color="0.9", non_sig_color="1", xlim=[-6,6], alpha=0.5, highlight_palette="RdYlBu")
MA_plot (df=loc_df, highlight_list=nuclearness_list, X="mean_obs", Y="b", FDR=0.01, FDR_col="qval", figsize=[15,5],
xlabel="Mean expression", ylabel="Nucleus localization", sig_color="0.9", non_sig_color="1", alpha=0.5, highlight_palette="RdYlBu")
In [34]:
# Read the localization data
loc_df = pd.read_table("./Localisation/Djebali-ENCODE/sleuth/gene_allRNA/ALL/localizationnucleus_wald_test.tsv")
# Read the interaction data
protein_df = pd.read_table("./Protein_interaction/Li_ClipSeq/final_datasets/ALL_dataset_uniq_genes.tsv")
loc_list=[]
for i in range(protein_df.nuclear_protein.max(),3, -1):
loc_list.append({
"target_id": protein_df.gene_id[(protein_df.cytosol_protein==0) & (protein_df.nuclear_protein==i)],
"label":"Cytoplasmic == 0 / Nuclear == {}".format(i), "alpha":i/protein_df.nuclear_protein.max()})
range_cytosol = range(2, protein_df.cytosol_protein.max())
for i in range_cytosol:
loc_list.append({
"target_id": protein_df.gene_id[(protein_df.cytosol_protein==i) & (protein_df.nuclear_protein==0)],
"label":"Cytoplasmic == {} / Nuclear == 0 ".format(i), "alpha":i/protein_df.cytosol_protein.max()})
plot_text("Cytoplasmic/Nuclear proteins interactions compared with RNA enrichment in nucleus/cytosol (Excluding mixed nucleus/cytoplasm)", align="left", fontsize=20, fontweight="bold")
volcano_plot (df=loc_df, highlight_list=loc_list, X="b", Y="qval", FDR=0.01, X_cutoff=0, figsize=[15,5],
xlabel = "Nucleus localization", sig_color="0.9", non_sig_color="1", xlim=[-6,6], alpha=0.5, highlight_palette="RdYlBu")
MA_plot (df=loc_df, highlight_list=loc_list, X="mean_obs", Y="b", FDR=0.01, FDR_col="qval", figsize=[15,5],
xlabel="Mean expression", ylabel="Nucleus localization", sig_color="0.9", non_sig_color="1", alpha=0.5, highlight_palette="RdYlBu")
Hard to interpret because the nuclear protein seems to associate mainly with low expression genes (lncRNA ?) while cytoplasmic protein are found enriched in high expression proteins... THe nature of the selected proteins makes it hard to draw any clear conclusion. I don't think it can be used further on..