In [2]:
# Local lib import
from pycl import *
from pyBioPlot import *
# Standard lib imports
import datetime
from glob import glob
from pprint import pprint as pp
from os.path import basename
from os import listdir, remove, rename
from os.path import abspath, basename, isdir
from collections import OrderedDict, Counter
import random
import statistics as stat
from IPython.core.display import display, HTML, Markdown, Image
# Third party import
import numpy as np
import scipy.stats as stats
import pylab as pl
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from Bio import SeqIO
# Pyplot tweaking
%matplotlib inline
pl.rcParams['figure.figsize'] = 30, 10 # that's default image size for this interactive session
# Jupyter display tweaking
toogle_code()
larger_display(75)
# Simplify warning reporting to lighten the notebook style
import warnings
warnings.formatwarning=(lambda message, *args: "{}\n".format(message))
# Allow to use R directly
%load_ext rpy2.ipython
Cabili, M. N. et al. Localization and abundance analysis of human lncRNAs at single-cell and single-molecule resolution. Genome Biol 16, (2015)
The only data available are a large multitab excel file containing a lot of messy information. The annotation are based on different database (lncipedia or Noncode, or refSeq and rarely ensembl)... sometime the lncRNA is only described in one database and not in ensembl. First I obtained the common name for all lncRNA from lncipedia. However it was impossible to get the Ensembl equivalent in most of the cases.
Fortunatly they provide all the probe they used to fish the lncRNA. So when the information is not available I have to blast the probe in Ensemble to get the proper gene corresponding to the probes... It is a long and painfull work, but this dataset is going to be my reference for future localization experient so it has to be done properly
In [2]:
d = OrderedDict()
for line in open("./Localisation_Original_Datasets/Cabili_RNA_FISH/probes_seq/All_probes.csv", 'r'):
sequence, seq_name, gene_num = line.split("\t")
d[gene_num]=seq_name.rpartition("_")[0]
for k,v in d.items():
print ("{}\t{}".format(k.strip(),v))
In [7]:
# Extract the sequences of the probles for each gene in a separate file
def get_fasta_probes(infile, outfile, id_list=[]):
seq_dict = OrderedDict()
for line in open (infile, "r"):
sequence, seq_name, gene_num = line.split("\t")
gene_name, sep, seq_num = seq_name.rpartition("_")
for id in id_list:
if (type(id) == int and id == int(gene_num)) or (type(id) == str and id == gene_id):
seq_dict[seq_name] = sequence
with open (outfile, "w") as fp:
for key, val in seq_dict.items():
fp.write(">{}\n{}\n".format(key, val))
In [28]:
infile = "./Localisation_Original_Datasets/Cabili_RNA_FISH/probes_seq/All_probes.csv"
# Creating a file contaning the probe used for each 34 valid experimental designs
d ={"DANCR":[51, 8, 19],
"CDKN2B-AS1":[58],
"GAS5":[59],
"KCNQ1OT1":[60],
"FENDRR":[26, 47, 52, 95],
"LINC-PINT":[57],
"lnc-SFPQ-2":[53],
"MALAT1":[92],
"MEG3":[61],
"NEAT1":[64],
"LINC00998":[3, 14],
"PSMA3-AS1":[85],
"lnc-AC078802.1-2":[65],
"TUG1":[55],
"XIST":[67],
"lnc-NRXN1-2":[41],
"LINC01116":[40],
"DUBR":[73],
"lnc-GRXCR1-4":[39],
"TMEM161B-AS1":[5, 16],
"lnc-C5orf39-2":[37],
"lnc-RREB1-4":[36],
"LINC00472":[35],
"PVT1":[72],
"LINC01278":[34],
"RAB30-AS1":[2, 13],
"LINC00941":[30],
"lnc-MAGOHB-1":[77],
"GAS6-AS2":[82],
"lnc-SNURF-1":[84],
"OIP5-AS1":[68],
"CRNDE":[74],
"lnc-USP14-2":[76,87],
"MIR4435-2HG":[23]}
for gene_name, id_list in d.items():
outfile = "./Localisation_Original_Datasets/Cabili_RNA_FISH/probes_seq/Cabili_{}.fa".format(gene_name)
get_fasta_probes(infile, outfile, id_list)
print ("Created " + outfile)
Each file was blasted against ensembl human genomic sequence hgrh38. I manually checked the overlapping genes and the position of hits relative to exons. Finally I reported the corresponding ensembl ID. In 2 conditions I was nor able to find a gene in the ensembl build at the position indicated by the probes. Instead in fond genes only described in NCBI genes. From what I saw in the blast results, all of the genes identified are not lncRNA. I compared the ensembl gene ID found with the gencodev23 file containing all the genes. Results are summarized in the table thereafter
In [132]:
def parse_ensembl_annotation (annotation_file):
annotation_dict = OrderedDict()
# Ensembl gen decomposing template
template=[0,"\t",1,"\t",2,"\t",3,"\t",4,"\t",5,"\t",6,"\t",7,"\tID=",8,";gene_id=",9,";gene_type=",10,
";gene_status=",11,";gene_name=",12,";level=",13,";havana_gene=",14]
# Parsing annotation file in a dict
with open (annotation_file, "r") as fp:
for line in fp:
if line[0] != "#":
sl = _decompose_line(line, template)
if sl[2] == "gene":
id = sl[8].split(".")[0]
assert id not in annotation_dict, "Duplicated gene ID !! Fuck"
annotation_dict[id] = {
"seqid":sl[0],"source":sl[1],"type":sl[2],"start":sl[3],"end":sl[4],"score":sl[5],
"strand":sl[6],"phase":sl[7],"ID":sl[8],"gene_id":sl[9],"gene_type":sl[10],
"gene_status":sl[11],"gene_name":sl[12],"level":sl[13],"havana_gene":sl[14]}
return annotation_dict
def annotate_ensembl_ID (annotation_file, candidate_file, out_file):
print ("Parsing annotation file ...")
annotation_dict = parse_ensembl_annotation(annotation_file)
print ("Searching for candidates ...")
with open (candidate_file, "r") as cf, open (out_file, "w") as of:
of.write("#Gen_screenID\tGene_name\tEnsemb_ID\tseqid\tsource\ttype\tstart\tend\tscore\tstrand\tphase\tID\tgene_id\tgene_type\tgene_status\tgene_name\tlevel\thavana_gene\n")
for line in cf:
if line[0] != "#":
line = line.strip()
sl= line.split("\t")
if sl[2] in annotation_dict:
of.write ("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
line,
annotation_dict[sl[2]]["seqid"].strip(),
annotation_dict[sl[2]]["source"].strip(),
annotation_dict[sl[2]]["type"].strip(),
annotation_dict[sl[2]]["start"].strip(),
annotation_dict[sl[2]]["end"].strip(),
annotation_dict[sl[2]]["score"].strip(),
annotation_dict[sl[2]]["strand"].strip(),
annotation_dict[sl[2]]["phase"].strip(),
annotation_dict[sl[2]]["ID"].strip(),
annotation_dict[sl[2]]["gene_id"].strip(),
annotation_dict[sl[2]]["gene_type"].strip(),
annotation_dict[sl[2]]["gene_status"].strip(),
annotation_dict[sl[2]]["gene_name"].strip(),
annotation_dict[sl[2]]["level"].strip(),
annotation_dict[sl[2]]["havana_gene"].strip()))
else:
of.write ("{}\t{}\n".format(line, "NOT_FOUND"))
In [134]:
# Request annotation for all candidates in lncRNA Gencode database
annotation_file = "../Reference_Annotation/gencode_v24_lncRNAs_gene.gff3"
candidate_file = "./Localisation_Original_Datasets/Cabili_RNA_FISH/candidate_list.csv"
out_file = "./Localisation_Original_Datasets/Cabili_RNA_FISH/candidate_list_annotated.csv"
annotate_ensembl_ID (annotation_file, candidate_file, out_file)
head(out_file, 5)
In [138]:
# Request annotation for all candidates in all Gencode database
annotation_file = "../Reference_Annotation/gencode_v24_gene.gff3"
candidate_file = "./Localisation_Original_Datasets/Cabili_RNA_FISH/candidate_list.csv"
out_file = "./Localisation_Original_Datasets/Cabili_RNA_FISH/candidate_list_annotated.csv"
annotate_ensembl_ID (annotation_file, candidate_file, out_file)
head(out_file, 5)
The 2 non-Ensembl lncRNA were not matched. As for the other 29 belong to the lncRNA class (processed_transcript, antisense, lincRNA), 1 is protein_coding and 2 are transcribed_unprocessed_pseudogene that do no belong to the lncRNA class. I think I will still keep a track of the pseudogene, since thay could also have a particular interest later in the analysis. So I have a final number of 29 valid lncRNA + 3 pseudogenes in the final dataset
#Gen_screenID | Gene_name | Ensemb_ID | gene_type | lncRNA_class | seqid | start | end | strand |
---|---|---|---|---|---|---|---|---|
ANCR | DANCR | ENSG00000226950 | processed_transcript | YES | chr4 | 52712404 | 52720351 | + |
Anril | CDKN2B-AS1 | ENSG00000240498 | antisense | YES | chr9 | 21994778 | 22121097 | + |
GAS5 | GAS5 | ENSG00000234741 | processed_transcript | YES | chr1 | 173863900 | 173868882 | - |
Kcnq1ot1 | KCNQ1OT1 | ENSG00000269821 | antisense | YES | chr11 | 2608328 | 2699994 | - |
XLOC_012046 | FENDRR | ENSG00000268388 | lincRNA | YES | chr16 | 86474529 | 86509099 | - |
lincMKLN1_A1 | LINC-PINT | ENSG00000231721 | antisense | YES | chr7 | 130941760 | 131110176 | - |
lincSFPQ | SFPQ | ENSG00000116560 | protein_coding | NO | chr1 | 35176378 | 35193148 | - |
MALAT1 | MALAT1 | ENSG00000251562 | lincRNA | YES | chr11 | 65497762 | 65506516 | + |
Meg3 | MEG3 | ENSG00000214548 | lincRNA | YES | chr14 | 100779410 | 100861031 | + |
NEAT1 | NEAT1 | ENSG00000245532 | lincRNA | YES | chr11 | 65422774 | 65445540 | + |
NR_024412 | LINC00998 | ENSG00000214194 | lincRNA | YES | chr7 | 113116718 | 113118613 | - |
NR_029435 | PSMA3-AS1 | ENSG00000257621 | antisense | YES | chr14 | 58265365 | 58298134 | - |
TERC | TERC | ENSG00000270141 | lincRNA | YES | chr3 | 169764520 | 169765060 | - |
XLOC_014209 | TUG1 | ENSG00000253352 | antisense | YES | chr22 | 30970677 | 30979395 | + |
XLOC_008185 | XIST | ENSG00000229807 | lincRNA | YES | chrX | 73820651 | 73852753 | - |
XLOC_002094 | AC007682.1 | ENSG00000231918 | lincRNA | YES | chr2 | 51032601 | 52407917 | + |
XLOC_002408 | LINC01116 | ENSG00000163364 | lincRNA | YES | chr2 | 176629589 | 176637931 | - |
XLOC_002746 | DUBR | ENSG00000243701 | lincRNA | YES | chr3 | 107240692 | 107326964 | + |
XLOC_003526 | RP11-1E6.1 | ENSG00000250657 | processed_transcript | YES | chr4 | 43340875 | 43345600 | + |
XLOC_004456 | TMEM161B-AS1 | ENSG00000247828 | antisense | YES | chr5 | 88268895 | 88436685 | + |
XLOC_004803 | REFSEQ-FLJ32255 | LOC643977 | NA | NA | NA | NA | NA | NA |
XLOC_005151 | NCBI-105374902 | LOC105374902 | NA | NA | NA | NA | NA | NA |
XLOC_005764 | LINC00472 | ENSG00000233237 | lincRNA | YES | chr6 | 71344344 | 71420769 | - |
XLOC_006922 | PVT1 | ENSG00000249859 | lincRNA | YES | chr8 | 127794533 | 128101253 | + |
XLOC_008174 | LINC01278 | ENSG00000235437 | processed_transcript | YES | chrX | 63343227 | 63561071 | - |
XLOC_009233 | RAB30-AS1 | ENSG00000246067 | lincRNA | YES | chr11 | 83072066 | 83106719 | + |
XLOC_009702 | LINC00941 | ENSG00000235884 | lincRNA | YES | chr12 | 30795681 | 30802711 | + |
XLOC_010017 | KLRAP1 | ENSG00000256667 | transcribed_unprocessed_pseudogene | NO | chr12 | 10588063 | 10599669 | - |
XLOC_010514 | GAS6-AS2 | ENSG00000272695 | lincRNA | YES | chr13 | 113864168 | 113866833 | + |
XLOC_011185 | SNHG14 | ENSG00000224078 | processed_transcript | YES | chr15 | 24978583 | 25419462 | + |
XLOC_011226 | OIP5-AS1 | ENSG00000247556 | processed_transcript | YES | chr15 | 41283990 | 41309737 | + |
XLOC_011950 | CRNDE | ENSG00000245694 | lincRNA | YES | chr16 | 54918863 | 54929189 | - |
XLOC_012599 | ROCK1P1 | ENSG00000263006 | transcribed_unprocessed_pseudogene | NO | chr18 | 109065 | 122219 | + |
XLOC_L2_008203 | MIR4435-2HG | ENSG00000172965 | lincRNA | YES | chr2 | 111196350 | 111495100 | - |
I removed the data from the protein coding and non ensembl gene and crosmatch them with the experimental data in a simple 3 column csv file (Ensembl-ID : Cell type : localization code). I will create a bedfile containing the coordinate of the gene with additional information fetched from the gencodev23 annotation file. The idea is to obtain a standardize file similar to the bed file created previously for PTM.
In [156]:
def Bed_ensembl_ID (annotation_file, localization_file, out_file):
print ("Parsing annotation file ...")
annotation_dict = parse_ensembl_annotation(annotation_file)
h = "# Data cleaned, converted to BED6, standardized, hg38 coordinates\n"
h+= "# Gene annotation scrapped from gencodev23\n"
h+= "# Adrien Leger (aleg@ebi.ac.uk) {}\n".format(str (datetime.datetime.today()))
h+= "# Localization type is a numeric code from the original publication\n"
h+= "# 0 = Invalid detection\n"
h+= "# 1 = One or 2 large nuclear foci\n"
h+= "# 2 = Large nuclear foci and single molecules scattered through the nucleus\n"
h+= "# 3 = Predominantly nuclear, without foci\n"
h+= "# 4 = Cytoplasmic and nuclear\n"
h+= "# 5 = Predominantly cytoplasmic\n"
h+= "# chrom\tchromstart\tchromend\tLocalization_code|cell_type|method|PMID|ensembl_id|gene_type|gene_name\tscore\tstrand\n"
print ("Searching for candidates ...")
with open (localization_file, "r") as infile, open (out_file, "w") as outfile:
outfile.write(h)
total = invalid_loc_code = invalid_ensemble_id = 0
for line in infile:
if line[0] != "#":
total+=1
sl= line.split("\t")
ensemble_id = sl[0].strip()
cell_type = sl[1].strip()
loc_code = sl[2].strip()
if loc_code == '0':
invalid_loc_code+=1
continue
if ensemble_id not in annotation_dict:
invalid_ensemble_id+=1
continue
outfile.write ("{0}\t{1}\t{2}\t{3}|{4}|{5}|{6}|{7}|{8}|{9}\t{10}\t{11}\n".format(
annotation_dict[ensemble_id]["seqid"].strip(),
annotation_dict[ensemble_id]["start"].strip(),
annotation_dict[ensemble_id]["end"].strip(),
loc_code,
cell_type,
"RNA_FISH",
"25630241",
ensemble_id,
annotation_dict[ensemble_id]["gene_type"].strip(),
annotation_dict[ensemble_id]["gene_name"].strip(),
"-",
annotation_dict[ensemble_id]["strand"].strip()))
print("Total: {}\tInvalid Localization code: {}\tInvalid Ensembl ID: {}".format(total,invalid_loc_code,invalid_ensemble_id))
In [158]:
# Request annotation for all candidates in all Gencode database (pseudogenes + lncRNA)
annotation_file = "../Reference_Annotation/gencode_v24_gene.gff3"
localization_file = "./Localisation_Original_Datasets/Cabili_RNA_FISH/Localization_cell_ensembl.csv"
out_file = "./Localisation_Clean_Datasets/Cabili_RNA_FISH_hg38.bed"
Bed_ensembl_ID (annotation_file, localization_file, out_file)
head (out_file, 15)
This last step removed 24 lines out of 93 for genes that had a invalid localization code (0). The data are now stored in a Bed compatible dataset and ready to be used. I am done with this dataset.
It might be a good idea to do this analysis at transcript level. Indeed, sometime only a fraction of the transcripts were targeted by the probes... A solution would be to blast automatically all the probes, from the coordinates, extract the overlapping transcript by simple intersection with bedtools for example
Same thing than prevously but cleaner with an id
In [8]:
! rm ./Localisation_Original_Datasets/Cabili_RNA_FISH/probes_seq2/*
In [9]:
infile = "./Localisation_Original_Datasets/Cabili_RNA_FISH/probes_seq/All_probes.csv"
# Creating a file contaning the probe used for each 34 valid experimental designs
d ={"01_DANCR":[51, 8, 19],
"02_CDKN2B-AS1":[58],
"03_GAS5":[59],
"04_KCNQ1OT1":[60],
"05_FENDRR":[26, 47, 52, 95],
"06_LINC-PINT":[57],
"07_lnc-SFPQ-2":[53],
"08_MALAT1":[92],
"09_MEG3":[61],
"10_NEAT1":[64],
"11_LINC00998":[3, 14],
"12_PSMA3-AS1":[85],
"13_TERC":[65],
"14_TUG1":[55],
"15_XIST":[67],
"16_lnc-NRXN1-2":[41],
"17_LINC01116":[40],
"18_DUBR":[73],
"19_lnc-GRXCR1-4":[39],
"20_TMEM161B-AS1":[5, 16],
"21_lnc-C5orf39-2":[37],
"22_lnc-RREB1-4":[36],
"23_LINC00472":[35],
"24_PVT1":[72],
"25_LINC01278":[34],
"26_RAB30-AS1":[2, 13],
"27_LINC00941":[30],
"28_KLRAP1":[77],
"29_GAS6-AS2":[82],
"30_SNHG14":[84],
"31_OIP5-AS1":[68],
"32_CRNDE":[74],
"33_ROCK1P1":[76,87],
"34_MIR4435-2HG":[23]}
outdir = "./Localisation_Original_Datasets/Cabili_RNA_FISH/probes_seq2"
mkdir(outdir)
for gene_name, id_list in d.items():
outfile = "{}/{}.fa".format(outdir, gene_name)
get_fasta_probes(infile, outfile, id_list)
print ("Created " + outfile)
In [41]:
head("./Localisation_Original_Datasets/Cabili_RNA_FISH/localization_patern.csv")
In [121]:
# Use my python wrapper for blast+
from Blastn import Blastn
def parse_loc_file (loc_file):
### Import the sample localization file in a panda dfataframe and parse it in a dict
loc_dict = {}
df = pd.read_table(loc_file)
for n, v in df.iterrows():
if v["id"] not in loc_dict:
loc_dict[v["id"]]={}
# Define the localization of the sample for the dataset according to the validity of the assay
loc_dict[v["id"]][v["cell_type"]] = int(v["localization"]) if v["validInCell"] else np.nan
print ("Found {} sample id".format(len(loc_dict.keys())))
return (loc_dict)
def blast_probe (reference, subject_dir, min_hit_evalue, min_hit_len, min_count_valid, verbose=False):
### Blast the probe against the reference and find the transcript of origin
# Dict to collect all the resuts
G_dict = {}
T_dict = {}
print ("Create Blast database")
with Blastn(ref_path=reference) as blastn:
print ("Blasting subject sequences")
for subject in glob(subject_dir+"*.fa"):
# Get the name and id of the sample currently analysed
sample_name = supersplit(subject, [".","/","_"])[-2]
sample_id = supersplit(subject, [".","/","_"])[-3]
if verbose:
print("Blasting sample :", sample_id, sample_name)
# Init dict for these samples
g_dict = {}
t_dict = {}
# Blast the sequences of the probes against the reference and get a lis of hits
hit_list = blastn(query_path=subject, task="blastn-short")
# Parse the list of blast hits found
for hit in hit_list:
# Select only if inverse orientation good evalue and len match longer than 18
if hit.evalue <= min_hit_evalue and hit.length >=min_hit_len and hit.s_orient != hit.q_orient :
s = hit.s_id.split("|")
t_id = s[0]
g_id = s[1]
t_name = s[4]
g_name = s[5]
t_len = s[6]
t_type = s[7]
# Fill the transcript based dict
if t_id not in t_dict:
t_dict[t_id] = OrderedDict()
t_dict[t_id]["g_id"] = g_id
t_dict[t_id]["t_name"] = t_name
t_dict[t_id]["g_name"] = g_name
t_dict[t_id]["t_len"] = t_len
t_dict[t_id]["t_type"] = t_type
t_dict[t_id]["count"] = 0
t_dict[t_id]["count"]+=1
# Fill of the gene based dict
if g_id not in g_dict:
g_dict[g_id] = OrderedDict()
g_dict[g_id]["g_name"] = g_name
g_dict[g_id]["t_type"] = t_type
g_dict[g_id]["count"] = 0
g_dict[g_id]["count"]+=1
# Filtered the resuts and write
filt_g_dict = {k:v for k, v in g_dict.items() if v["count"] > min_count_valid}
if g_dict:
G_dict[sample_id] = filt_g_dict
if verbose:
print (sample_id, sample_name, "GENES")
print ("UNFILTERED")
for k,v in g_dict.items():
print (k, "\t".join(["{}:{}".format(i,j) for i,j in v.items()]))
print ("FILTERED")
for k,v in filt_g_dict.items():
print (k, "\t".join(["{}:{}".format(i,j) for i,j in v.items()]))
filt_t_dict = {k:v for k, v in t_dict.items() if v["count"] > min_count_valid}
if t_dict:
T_dict[sample_id] = filt_t_dict
if verbose:
print (sample_id, sample_name, "TRANSCRIPTS")
print ("UNFILTERED")
for k,v in t_dict.items():
print (k, "\t".join(["{:6}:{:<18}".format(i,j) for i,j in v.items()]))
print ("FILTERED")
for k,v in filt_t_dict.items():
print (k, "\t".join(["{:6}:{:<18}".format(i,j) for i,j in v.items()]))
return (G_dict, T_dict)
def write_loc_file (d, loc_dict, outdir="./out/", prefix=""):
# Create the output directory if needed
mkdir(outdir)
# Iterate over the 3 cell types in the dataset
for cell in ["HeLa", "IMR90", "CRL2097"]:
outfile = "{}{}_{}.csv".format(outdir, prefix, cell)
first_line = True
print (outfile)
with open (outfile, "w") as fp:
for sample_id, id_dict in d.items():
# Retrieve the localization for this sample and this cell line
localization = loc_dict[int(sample_id)][cell]
for seq_id, seq_val in id_dict.items():
# Write the file header
if first_line:
fp.write("target_id\tsample_id\tlocalization\t{}\n".format("\t".join(seq_val.keys())))
first_line=False
# Write the value fields
fp.write("{}\t{}\t{}\t{}\n".format(seq_id, sample_id, localization,"\t".join([str(i) for i in seq_val.values()])))
In [123]:
# Analysis with a reference containing all the human RNA
reference = "../../Reference_Genomes/gencode.v24.transcripts.fa"
subject_dir = "./Localisation_Original_Datasets/Cabili_RNA_FISH/probes_seq2/"
loc_file = "./Localisation_Original_Datasets/Cabili_RNA_FISH/localization_patern.csv"
outdir = "./Localisation_Original_Datasets/Cabili_RNA_FISH/results_allRNA/"
loc_dict = parse_loc_file (loc_file)
G_dict, T_dict = blast_probe (reference, subject_dir, min_hit_evalue=0.01, min_hit_len=18, min_count_valid=3, verbose=False)
write_loc_file(G_dict, loc_dict, outdir, prefix="gene")
write_loc_file(T_dict, loc_dict, outdir, prefix="transcript")
In [122]:
# Analysis with a reference containing the human lncRNA only
reference = "../../Reference_Genomes/gencode.v24.lncRNA_transcripts.fa"
subject_dir = "./Localisation_Original_Datasets/Cabili_RNA_FISH/probes_seq2/"
loc_file = "./Localisation_Original_Datasets/Cabili_RNA_FISH/localization_patern.csv"
outdir = "./Localisation_Original_Datasets/Cabili_RNA_FISH/results_lncRNA/"
loc_dict = parse_loc_file (loc_file)
G_dict, T_dict = blast_probe (reference, subject_dir, min_hit_evalue=0.01, min_hit_len=18, min_count_valid=3, verbose=False)
write_loc_file(G_dict, loc_dict, outdir, prefix="gene")
write_loc_file(T_dict, loc_dict, outdir, prefix="transcript")
I obtained several list containing the localization of RNAs at gene level and transcript level based on a conservative base pairing between the probes used for the FISH experiments and the human transcripts. The available datasets are summarized in the table below
cell_line | RNA set | RNA level |
---|---|---|
HeLa | lncRNA | transcript |
HeLa | lncRNA | gene |
HeLa | allRNA | transcript |
HeLa | allRNA | gene |
IMR90 | lncRNA | transcript |
IMR90 | lncRNA | gene |
IMR90 | allRNA | transcript |
IMR90 | allRNA | gene |
CRL2097 | lncRNA | transcript |
CRL2097 | lncRNA | gene |
CRL2097 | allRNA | transcript |
CRL2097 | allRNA | gene |
I just found this new extensive dataset from the ENCODE v2 consortium published in 2012 with RNA-Seq data from various human cell lines in the nucleus and in the cytoplasm. The data were analysed with the previous version of the human genome and would probably have to be remapped completly on the new version of the genome hg38 and using the last GENCODE annotations. I for each cell line I will compare the nuclear vs the cytoplasmic fractions to get the transcript significantly enriched in the nucleus vs the cytoplasm.
The content of the dataset is listed bellow
Cell types (include immortalized cell lines, primary cells and embryonic cells)
Localization
Additional informations
Summary of the dataset I will focus on
Cell Line | Localization | RNA Extract | ENCODE Accesion |
---|---|---|---|
A549 | cytosol | Long PolyA+ RNA | EH002624 |
A549 | nucleus | Long PolyA+ RNA | EH002625 |
GM12878 | cytosol | Long PolyA+ RNA | EH000147 |
GM12878 | nucleus | Long PolyA+ RNA | EH000170 |
H1-hESC | cytosol | Long PolyA+ RNA | EH000151 |
H1-hESC | nucleus | Long PolyA+ RNA | EH000152 |
HeLa-S3 | cytosol | Long PolyA+ RNA | EH000171 |
HeLa-S3 | nucleus | Long PolyA+ RNA | EH000172 |
HepG2 | cytosol | Long PolyA+ RNA | EH000161 |
HepG2 | nucleus | Long PolyA+ RNA | EH000158 |
HUVEC | cytosol | Long PolyA+ RNA | EH000156 |
HUVEC | nucleus | Long PolyA+ RNA | EH000157 |
IMR90 | cytosol | Long PolyA+ RNA | EH002626 |
IMR90 | nucleus | Long PolyA+ RNA | EH002628 |
K562 | cytosol | Long PolyA+ RNA | EH000140 |
K562 | nucleus | Long PolyA+ RNA | EH000174 |
MCF-7 | cytosol | Long PolyA+ RNA | EH002633 |
MCF-7 | nucleus | Long PolyA+ RNA | EH002627 |
NHEK | cytosol | Long PolyA+ RNA | EH000166 |
NHEK | nucleus | Long PolyA+ RNA | EH000165 |
SK-N-SH | cytosol | Long PolyA+ RNA | EH002630 |
SK-N-SH | nucleus | Long PolyA+ RNA | EH002631 |
It has been a while that I want to use Kalisto, the software designed by Lior Patcher's team to analyse RNA data. I will perform this analysis with the conventional pipeline (STAR) in parallel to compare the results. The data are quite massive, so I will just prototype on my laptop but run the final analysis on the cluster
I dl the last reference fasta transcriptome for all RNA and for lncRNA only from the gencode server (v23). I need to clean the fasta Reference sequence from Gencode since they contain a lot of information from the transcript names of each sequences that will hamper the readability of the results. In order to be able to use the gene names and information later; I also need to save all the information contained in the names in a separate file.
In [6]:
from gzip import open as gopen
def Gencode_fastaclean_listinfo (gencode_fasta, clean_fasta="gencode_clean.fa.gz", gencode_info="gencode_info.tsv"):
with gopen (gencode_fasta, "rt") as fasta_in, gopen (clean_fasta, "wt") as fasta_out, open (gencode_info, "wt") as info:
# write header in the info file
info.write ("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
"GENCODE_transcript_id", "GENCODE_gene_id", "HAVANA_gene_id", "HAVANA_transcript_id", "transcript_name",
"gene_name", "length", "RNA_type"))
nline = nseq = 0
for line in fasta_in:
nline += 1
if line[0] == ">":
nseq += 1
ls = [i.strip() for i in line[1:].split("|")]
fasta_out.write (">{}\n".format(ls[0]))
# in lncRNA file not RNA type at the end...
if not ls[7]:
ls[7]="NA"
info.write("\t".join(ls)+"\n")
else:
fasta_out.write(line)
print ("Parsed {} lines\t Found {} Sequences".format(nline, nseq))
In [7]:
gencode_fasta = "../../Reference_Genomes/gencode.v24.lncRNA_transcripts.fa.gz"
clean_fasta = "../../Reference_Genomes/gencode_v24_lncRNA_transcripts_clean.fa.gz"
gencode_info = "../../Reference_Genomes/gencode_v24_lncRNA_transcripts_info.tsv"
print ("Clean the file and generate the list from IDs\n")
Gencode_fastaclean_listinfo (gencode_fasta, clean_fasta, gencode_info)
print ("Visualize the result\n")
head(gencode_fasta, 2)
head(clean_fasta, 2)
head(gencode_info, 2)
In [6]:
gencode_fasta = "../../Reference_Genomes/gencode.v24.transcripts.fa.gz"
clean_fasta = "../../Reference_Genomes/gencode_v24_transcripts_clean.fa.gz"
gencode_info = "../../Reference_Genomes/gencode_v24_transcripts_info.tsv"
print ("Clean the file and generate the list from IDs\n")
Gencode_fastaclean_listinfo (gencode_fasta, clean_fasta, gencode_info)
print ("Visualize the result\n")
head(gencode_fasta, 2)
head(clean_fasta, 2)
head(gencode_info, 2)
To compare the different conditions it might be interesting to to use the internal spike in used in all ENCODE samples. The added the ERCC (External RNA Controls Consortium) RNA Spike-In Mix that is also sometime called NIST spike. This is a pool of 96 synthetic RNAs with various lengths, and GC content covering a 2^20 concentration range as spike-in controls to measure sensitivity, accuracy, and biases in RNA-seq experiments as well as to derive standard curves for quantifying the abundance of transcripts. In all the sample I have selected, they used the ERCC pool 14 and the provide the concentration for all RNAs in a tsv file. I also need to have a fasta file, a gtf annotation file and an info list to further use. I found the fasta and the gtf easily on the web but I need to generate the info file from the GTF
In [2]:
# define variables and reformat the gff to obtain a simple tsv table
input_file = gunzip_file("../../Reference_Annotation/ERCC.gtf.gz")
output_file = "../../Reference_Genomes/ERCC_info.tsv"
init_template = [0,"\t",1,"\t",2,"\t",3,"\t",4,"\t",5,"\t",6,"\t",7,"\tgene_id \"",8,"\"; transcript_id \"",9,"\";"]
final_template = [0,"\t",8,"\t",9,"\t",4]
header="transcript_name\tgene_id\ttranscript_id\tlength\n"
reformat_table(input_file, output_file, init_template, final_template, header)
linerange (input_file)
linerange (output_file)
To enable there analysis one should include the sequences of the oligo transcripts in the alignment step. I will generate a new reference mixing the human genome transcripts as well as the ERCC transcript directil on the computation farm
Create indexes for Kalisto from the human lncRNA and all RNA transcriptome gencode v23 including the ERCC spike-in transcripts that might be used for normalization later
In [8]:
program = "kallisto-0.43 index"
index_dir = "../../Index/kallisto/"
index_name = "gencode_v24_lncRNA_transcripts_ERCC.idx"
index_report = "gencode_v24_lncRNA_transcripts_ERCC.txt"
gencode_fasta = "../../Reference_Genomes/gencode.v24.lncRNA_transcripts.fa.gz"
gencode_ERCC = "../../Reference_Genomes/ERCC.fa.gz"
mkdir(index_dir)
cmd = "{} -i {}{} {} {}".format(program, index_dir, index_name, gencode_fasta, gencode_ERCC)
print (cmd)
stderr = bash (cmd, ret_stderr=True, ret_stdout=False)
print (stderr)
# write the report
with open (index_dir+index_report, "w") as fp:
fp.write(stderr)
In [9]:
program = "kallisto-0.43 index"
index_dir = "../../Index/kallisto/"
index_name = "gencode_v24_transcripts_ERCC.idx"
index_report = "gencode_v24_transcripts_ERCC.txt"
gencode_fasta = "../../Reference_Genomes/gencode.v24.transcripts.fa.gz"
gencode_ERCC = "../../Reference_Genomes/ERCC.fa.gz"
mkdir(index_dir)
cmd = "{} -i {}{} {} {}".format(program, index_dir, index_name, gencode_fasta, gencode_ERCC)
print (cmd)
stderr = bash (cmd, ret_stderr=True, ret_stdout=False)
print (stderr)
# write the report
with open (index_dir+index_report, "w") as fp:
fp.write(stderr)
To analyse the data I will have to get all the fastq files and I will use the Cluster because of the amount of data to be analysed. All the data are available on the ENCODE website https://www.encodeproject.org/ and I can acces them thanks to the ENCODE Accesion number per experimental conditions. For each conditions I retrieved manually a file containing download links for all the files they generated. see bellow
In [10]:
!cat ./Localisation_Original_Datasets/Djebali-ENCODE/ENCODE_sample_files/EH000140.txt
The first file contains a large table with detailled informations for all the files contained in the conditions including there download links I will parse each of the tables and extract only the information related to fastq files
In [16]:
from urllib.request import urlopen
def parse_ENCODE_info_files(file_dir, tsv_outfile):
# Handler to write the collect information
with open (tsv_outfile, "w") as outfp:
# Loop over the information files to extract the url of the tsv file
for f in sorted(glob(file_dir+"*.txt")):
print("Parsing {}".format(file_basename(f)))
with open(f) as infp:
URL = next(infp).strip()
# Parse the tsv url and extract the lines correponding to the fadtq files
with urlopen(URL) as tsv:
for line in tsv:
dl = line.decode('utf-8')
sl = dl.split("\t")
# Write the lines in the collecting file
if sl[1] == "fastq":
outfp.write(dl)
return tsv_outfile
In [17]:
tsv_outfile = "./Localisation_Original_Datasets/Djebali-ENCODE/fastq/fastq_details.tsv"
file_dir = "./Localisation_Original_Datasets/Djebali-ENCODE/ENCODE_sample_files/"
parse_ENCODE_info_files(file_dir, tsv_outfile)
print(fastcount(outfile))
head (outfile, 2)
From this file containing all the information I will write a small python script to download the data on the cluster, similar to the following. https://github.com/a-slide/pyScripts/blob/master/EncodeParser.py. I launched the jobs on the cluster farm to download all the fastq files corresponding to the ENCODE number of Nucleus and cytosolic fraction polyA+ RNA. All the data are now available in the ebi server. There are 42 paired ended datafile. I renamed some of the datasets because HUVEC cells were called endothelial-cell-of-umbilical-vein and NHEK were called keratinocyte
Test alignement with 1M reads extracted from one of the ENCODE datasets with 4 threads, 5 bootstraps and in rf stranded mode
In [5]:
program = "kallisto-0.43 quant"
index="../../Index/kallisto/gencode_v24_lncRNA_transcripts_ERCC.idx"
R1="./Localisation_Original_Datasets/Djebali-ENCODE/fastq/1M_test_R1.fastq.gz"
R2="./Localisation_Original_Datasets/Djebali-ENCODE/fastq/1M_test_R2.fastq.gz"
output_dir = "./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/test/"
align_report = output_dir+"kallisto_report.txt"
n_thread = 4
n_bootstraps = 10
mkdir(output_dir)
cmd = "{} -t {} -b {} --fr-stranded -i {} -o {} {} {}".format(program, n_thread, n_bootstraps, index, output_dir, R1, R2)
print (cmd)
stderr = bash (cmd, ret_stderr=True, ret_stdout=False)
print (stderr)
# write the report
with open (align_report, "w") as fp:
fp.write(stderr)
Kalisto quant is working correctly and we obtain a file with counts directly in a very short time. However due to space and parralelisation concerns I will perform the dataset wide analyses on the cluster
The alignments against the modified reference containing both human transcripts were performed on the cluster. I performed 4 different aligments:
According to the Sleuth paper it might be beneficial perform 100 boostrap although they say that they don't really knows which is the minimun to obtain reliable results
kallisto quant --bias -b 100 --rf-stranded -t 4 -i INDEX -o OUTPUT_FOLDER READ1 READ2
I then copied the results localy for further analysis with Sleuth, since the size is rather small.
In [8]:
# Parse the stderr files containing the number of read aligned with kallisto
sname_list=[]
tae=[] # total all RNA ERCC
mae=[] # mapped all RNA ERCC
tle=[] # total lncRNA ERCC
mle=[] # mapped lncRNA ERCC
ma=[] # mapped all RNA no ERCC
ml=[] # mapped lncRNA no ERCC
flen_list=[]
# results obtained with basic alignment using Kallisto quant
for file in glob("./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/allRNA_ERCC/*/stderr.txt"):
sname_list.append(file.split("/")[-2])
lines = !cat {file}
flen_list.append(float(lines[12].split()[-1].replace(",","")))
tae.append(int(lines[11].split()[2].replace(",","")))
mae.append(int(lines[11].split()[4].replace(",","")))
# results obtained with stranded alignment using Kallisto quant
for file in glob("./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/lncRNA_ERCC/*/stderr.txt"):
lines = !cat {file}
tle.append(int(lines[11].split()[2].replace(",","")))
mle.append(int(lines[11].split()[4].replace(",","")))
# results obtained with stranded alignment using Kallisto quant
for file in glob("./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/allRNA/*/stderr.txt"):
lines = !cat {file}
ma.append(int(lines[11].split()[4].replace(",","")))
# results obtained with stranded alignment using Kallisto quant
for file in glob("./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/lncRNA/*/stderr.txt"):
lines = !cat {file}
ml.append(int(lines[11].split()[4].replace(",","")))
In [24]:
# Plot the results
index = np.arange(len(sname_list))
bar_width = 0.35
p0 = pl.bar(index, tae, bar_width, color="LightGreen", alpha=0.7)
p1 = pl.bar(index, mae, bar_width, color="ForestGreen", alpha=0.7)
p2 = pl.bar(index, ma, bar_width, color="DarkGreen", alpha=0.7)
p3 = pl.bar(index+bar_width, tle, bar_width, color="SkyBlue", alpha=0.7)
p4 = pl.bar(index+bar_width, mle, bar_width, color="Blue", alpha=0.7)
p5 = pl.bar(index+bar_width, ml, bar_width, color="DarkBlue", alpha=0.7)
pl.xticks(index+bar_width, sname_list, rotation=45, horizontalalignment ="right")
pl.legend((p0[0], p1[0], p2[0], p3[0], p4[0], p5[0]), ('Total Reads', 'Mapped Reads','Mapped Reads no ERCC','Total Reads', 'Mapped Reads lncRNA','Mapped Reads lncRNA no ERCC'))
pl.title("Number of reads analysed by Kallisto")
pl.show()
In [23]:
# Plot the results
index = np.arange(len(sname_list))
bar_width = 0.35
percent_unstrand = [mapped/total*100.0 for mapped, total in zip(mapped_list, total_list)]
percent_strand = [mapped/total*100.0 for mapped, total in zip(mapped_lnc_list, total_lnc_list)]
p1 = pl.bar(index, percent_unstrand, bar_width, color="green", alpha=0.7)
p2 = pl.bar(index+bar_width, percent_strand, bar_width, color="blue", alpha=0.7)
pl.xticks(index+bar_width, sname_list, rotation=45, horizontalalignment ="right")
pl.legend((p1[0], p2[0]), ('% Mapped Reads stranded', '% Mapped Reads stranded lnc'))
pl.title("% of reads aligned by Kallisto")
pl.show()
I obtained between 4E7 and 2.3E8 reads aligned (ie 60% and 90%). The stranded alignment results in less aligned reads but it is surely more accurate, so I will use only these results. There are much fewer read aligned on lncRNA (< 10%) but this is consistant with previous knowledge
For the analysis with Sleuth I will use the basic pipeline recommanded by the documentation https://rawgit.com/pachterlab/sleuth/master/inst/doc/intro.html. Apparently it is not possible to use the ERCC to normalize the tpm and est, since the bootstrap would also have to be normalized. Instead I will perform the analysis with the ERCC transcripts and will verify that they do not appear significantly differentially expressed. If so, the Sleuth normalization is certainly enough, else I would have to find another way to analyse kallisto data
Normalize the name of the replicates in the folder containing the samples which are sometime called Rep1 Re2p Rep3 or Rep4
In [28]:
def normalize_repname(basedir):
subst_dict = {"Rep1": "1", "Rep2": "2","Rep3": "1","Rep4": "2"}
for subdir in [path for path in glob(basedir+"/*") if isdir(path)]:
ss = subdir.split("_")
if ss[-1] in subst_dict:
ss[-1] = subst_dict[ss[-1]]
newname = "_".join(ss)
rename (subdir, newname)
In [37]:
for basedir in [
"./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/lncRNA/",
"./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/allRNA/",
"./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/lncRNA_ERCC/",
"./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/allRNA_ERCC/"
]:
normalize_repname(basedir)
!ls {basedir} | head -2
Create a file containing the name of samples, the cell type, the localization and the path to the dir containing kallisto results
In [6]:
# Create a file with sample names, descr and path of the result data
def sleuth_sample_list (basedir):
sample_list = []
list_file = basedir+"/list_file.tsv"
# Iterate throught the result folder
for subdir in [path for path in glob(basedir+"/*") if isdir(path)]:
# Extract info
ss = basename(subdir).split("_")
cell_type = ss[1].strip().replace("-", "")
localization = ss[2].strip().replace("-", "")
replicate = ss[3].strip().replace("-", "")
sample_list.append ({
"sample" : "{}_{}_{}".format(cell_type, localization, replicate),
"cell_type" : cell_type,
"localization" : localization,
"replicate" : replicate,
"path" : abspath(subdir)
})
# Sort the list to work with grouped data in the next steps of the analysis
sample_list.sort(key = lambda x: (x['cell_type'], x['localization'], x['replicate']))
# Find the number of replicates of each sample
rep_dict={}
for d in sample_list:
uniq_sample = "{}_{}".format(d["cell_type"], d["localization"])
if uniq_sample not in rep_dict:
rep_dict[uniq_sample] = 0
rep_dict[uniq_sample] += 1
# Filter out samples with less than 2 replicates
filter_list = []
for d in sample_list:
uniq_sample = "{}_{}".format(d["cell_type"], d["localization"])
if rep_dict[uniq_sample] >= 2:
filter_list.append(d)
else:
print ("{} was filtered out\n".format(d["sample"]))
# Write the results in a file
with open (list_file, "w") as outfp:
outfp.write("sample\tcell_type\tlocalization\treplicate\tpath\n")
for d in filter_list:
outfp.write("{}\t{}\t{}\t{}\t{}\n".format (d["sample"], d["cell_type"], d["localization"], d["replicate"], d["path"]))
return (list_file)
In [8]:
for basedir in [
"./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/lncRNA",
"./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/allRNA",
"./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/lncRNA_ERCC",
"./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/allRNA_ERCC"
]:
list_file = sleuth_sample_list(basedir)
head (list_file, 20)
H1-hESC was not included in the results because there are not replicates
Define an R function to analyse data with Sleuth
In [1]:
################## R KERNEL ##################
sleuth_analysis <- function(sample_list, transcript_info, select_col, select_val, de_col, beta_factor_list, outdir=".", ncore=4, max_bootstrap=10) {
# Imports and setups
library("sleuth")
options(mc.cores = ncore)
# Create the output directory
outdir=file.path(outdir, select_val)
dir.create(outdir, showWarnings = FALSE, recursive = TRUE)
cat("IMPORT AND SELECT THE SAMPLES\n")
# Import the file in a table and select sample according to select val and col
s2c = read.table(sample_list, header = TRUE, stringsAsFactors=FALSE)
s2c = s2c[s2c[,select_col] == select_val,]
print (s2c$sample)
# Save the table in file for tracability
fname = file.path(outdir,"selected_datasets.tsv")
write.table(s2c, file = fname, sep="\t", col.names = TRUE)
cat("IMPORT THE GENE INFORMATIONS\n")
# Import the gene info in table and rename the fields for Sleuth compatibility
t2g = read.table(transcript_info, header = TRUE, stringsAsFactors=FALSE)
t2g = dplyr::rename(t2g, target_id = GENCODE_transcript_id, ens_gene = GENCODE_gene_id)
cat("PREPARE THE DATA FOR SLEUTH\n")
# Extract the data from the kallisto results, LIMIT to 3 because for 100 the kernel get killed !!
de_formula = as.formula(paste("~", de_col))
so = sleuth_prep(s2c, de_formula, max_bootstrap=max_bootstrap, target_mapping = t2g)
cat("FIT TO FULL MODEL\n")
# Estimate parameters for the sleuth response error measurement (full) model
so <- sleuth_fit(so)
cat("FIT TO REDUCED MODEL\n")
# Estimate parameters for the sleuth reduced model (shrinkage)
so <- sleuth_fit(so, ~1, 'reduced')
cat("PERFORM WALD TESTING\n")
# Performing Wald test
for (beta_factor in beta_factor_list) { so <- sleuth_wt(so, which_beta = beta_factor, 'full')}
cat("PERFORM LIKELYHOOD RATION TESTING\n")
# Perform likelihood ratio test
so <- sleuth_lrt(so, 'reduced', 'full')
cat("EXPORT RESULTS")
# Save all the raw abundance results in a table
fname = file.path(outdir, "abundance_raw.tsv")
write.table(so$obs_raw, file = fname, row.names=FALSE, na="",col.names=TRUE, sep="\t", quote=FALSE)
# Save all the normalised abundance results in a table
fname = file.path(outdir, "abundance_normed.tsv")
write.table(so$obs_norm, file = fname, row.names=FALSE, na="",col.names=TRUE, sep="\t", quote=FALSE)
# Save all the normalised and filtered abundance results in a table
fname = file.path(outdir, "abundance_normed_filtered.tsv")
write.table(so$obs_norm_filt, file = fname, row.names=FALSE, na="",col.names=TRUE, sep="\t", quote=FALSE)
# Save all the results in a table
results_table <- sleuth_results(so, 'reduced:full', test_type = 'lrt')
fname = file.path(outdir, "DE_Sleuth.tsv")
write.table(results_table, file = fname, row.names=TRUE, na="",col.names=TRUE, sep="\t", quote=FALSE)
# Save the Sleuth object for further analysis
fname = file.path(outdir, "DE_Sleuth.rds")
saveRDS(so, file = fname)
return (so)
}
Test with a sample
In [3]:
so = sleuth_analysis(
sample_list = "./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/lncRNA/list_file.tsv",
transcript_info = "../../Reference_Genomes/gencode_v24_lncRNA_transcripts_info.tsv",
select_col = "cell_type",
select_val = "HeLaS3",
de_col = "localization + replicate",
beta_factor_list = c('localizationnucleus', 'replicate'),
outdir = "./Localisation_Original_Datasets/Djebali-ENCODE/test",
max_bootstrap=2
)
I launched the analysis with all the datasets
In [2]:
transcript_info = "../../Reference_Genomes/gencode_v24_lncRNA_transcripts_info.tsv"
kallisto_dir = "./Localisation_Original_Datasets/Djebali-ENCODE/kallisto"
sleuth_dir = "./Localisation_Original_Datasets/Djebali-ENCODE/sleuth"
subdir_list = c("lncRNA_ERCC") # not with all RNA for the moment.
select_col = "cell_type"
de_col = "localization + replicate"
beta_factor_list = c('localizationnucleus', 'replicate')
select_val_list = c("IMR90","A549","GM12878","HeLaS3","HepG2","HUVEC","K562","MCF7","NHEK","SKNSH")
for (subdir in subdir_list) {
sample_list = file.path(kallisto_dir, subdir, "list_file.tsv") # With the modified ordered file list
outdir = file.path(sleuth_dir, subdir)
cat (paste(c("\n########## ANALYSING", subdir, "DATAGROUP ##########\n"), collapse=" "))
for (select_val in select_val_list){
cat (paste(c("\n### ANALYSING", select_val, "SAMPLE ###\n"), collapse=" "))
sleuth_analysis (sample_list, transcript_info, select_col, select_val, de_col, beta_factor_list, outdir, max_bootstrap=100)
}
}
In [ ]:
# start a shinny web server to explore the data
library("sleuth")
so = readRDS("./Localisation_Original_Datasets/Djebali-ENCODE/sleuth/lncRNA_ERCC/SK-N-SH/DE_Sleuth.rds")
sleuth_live(so)
This is working OK but I will need to find a way to normalize between sample using a method more advanced that the one offered by Sleuth. In addition the transcript for the ERCC spike need to be used for this normalisation but removed from the final analysis.Finally I have to include the gene level analysis if I want to compare with the other methods at some point. All the samples were analysed on my laptop within one day. All the rds files are available to further data exploration with sleuth_live shiny interface
I will first have a look to the correspondance between the concentration of the ERCC spike and the abundance of the ERCC before and after normalization to see if there is any biais during the alignment or the normalization with sleuth
To analyse this difference I extracted the real concentrations of the 96 ERCC spikes (as reported by ENCODE) as well as the raw, normalized and filtered abundance files from Kallisto/Sleuth
In [10]:
################## PYTHON3 KERNEL ##################
# Parse the file with the concentration of ERCC
def ERCC_concentration_parsing (ERCC_conc):
# Read the concentration data from the ERCC
ERCC_dict = {}
with open (ERCC_conc, "r") as fp:
for line in fp:
sl = line.split()
if sl[0] != "#":
ERCC_dict[sl[0]] = float(sl[1])
return ERCC_dict
def ERCC_kallisto_parsing (ERCC_dict, kallisto_basedir):
# Parse the kallisto tsv files containing the tpm results and extract the tpm and the est values
# Build a dict table containing all the data indexed by sample and gene name
ERCC_abundance_dict = OrderedDict ()
abundance_results = {
"raw":"abundance_raw.tsv",
"norm":"abundance_normed.tsv",
"filter":"abundance_normed_filtered.tsv"}
# Iterate throught the result folder
for subdir in sorted([path for path in glob(kallisto_basedir+"/*") if isdir(path)]):
# Iterate throught the abundance result files
for res_type, abundance_file_suffix in abundance_results.items():
abundance_file = "{}/{}".format(subdir, abundance_file_suffix)
with open (abundance_file, "r") as fp:
# read the first line and determine the table structure that is not always the same...
header_line = next(fp)
hd = {name:i for i, name in enumerate(header_line.split())}
# now iterate through the other lines and find the lines with ERCC transcripts
for line in fp:
sl = line.split()
transcript_id = sl[hd["target_id"]]
sample_name = sl[hd["sample"]]
if transcript_id in ERCC_dict:
key = "{}_{}".format(transcript_id, sample_name)
# When found for the first time = extract generic informations for this sample/gen combination
if key not in ERCC_abundance_dict:
cell, localization, replicate = sample_name.split("_")
ERCC_abundance_dict[key] = OrderedDict ()
ERCC_abundance_dict[key]["cell_loc"] = "{}_{}".format(cell, localization)
ERCC_abundance_dict[key]["cell"] = cell
ERCC_abundance_dict[key]["localization"] = localization
ERCC_abundance_dict[key]["replicate"] = replicate
ERCC_abundance_dict[key]["transcript_id"] = transcript_id
ERCC_abundance_dict[key]["log_concentration"] = np.log(ERCC_dict[transcript_id])
ERCC_abundance_dict[key]["eff_length"] = float(sl[hd["eff_len"]])
ERCC_abundance_dict[key]["length"] = float(sl[hd["len"]])
# Then extract the tpm and est_count
ERCC_abundance_dict[key]["log_est_"+res_type] = np.log(float(sl[hd["est_counts"]]))
ERCC_abundance_dict[key]["log_tpm_"+res_type] = np.log(float(sl[hd["tpm"]]))
# Transform into a panda dataframe
ERCC_abundance_df = pd.DataFrame.from_dict(ERCC_abundance_dict, orient='index')
return (ERCC_abundance_dict, ERCC_abundance_df)
def ERCC_table_parsing (ERCC_abundance_dict, countfile, colname="log_col"):
# import the tximport file into a dataframe
df = pd.read_table(countfile)
# Iterate through the dataframe to extract tximport normalized aggregated counts
for gene_id, val_list in df.iterrows():
for sample_name, val in val_list.iteritems():
key = "{}_{}".format(gene_id, sample_name)
if key in ERCC_abundance_dict:
ERCC_abundance_dict[key][colname] = np.log(float(val))
# Transform into a panda dataframe
ERCC_abundance_df = pd.DataFrame.from_dict(ERCC_abundance_dict, orient='index')
return (ERCC_abundance_dict, ERCC_abundance_df)
In [7]:
# Parse the concentration file
ERCC_conc = "./Localisation_Original_Datasets/Djebali-ENCODE/Protocol_info/NistPool14Concentrations.tsv"
ERCC_dict = ERCC_concentration_parsing (ERCC_conc)
# Create a dataframe by combining the kallisto results and the ERCC concentrations
kallisto_basedir = "./Localisation_Original_Datasets/Djebali-ENCODE/sleuth/lncRNA_ERCC"
ERCC_abundance_dict, ERCC_abundance_df = ERCC_kallisto_parsing (ERCC_dict, kallisto_basedir)
ERCC_abundance_df.head(5)
Out[7]:
In [8]:
for X, Y in [["log_concentration", "log_tpm_raw"], ["log_concentration", "log_est_raw"], ["log_concentration", "log_tpm_norm"], ["log_concentration", "log_est_norm"], ["log_tpm_raw", "log_tpm_norm"]] :
g = sns.lmplot(x=X, y=Y, hue="localization", col="cell", data=ERCC_abundance_df, lowess=True)
plt.subplots_adjust(top=0.8)
g.fig.suptitle("{} /{}".format(Y.upper(), X.upper()))
Apparently the normalization algorithm of Sleuth is not doing a good job with the datasets. Indeed, instead of improving the variability between the replicates and the proximity of both nucleus and cytoplasm it increases the difference
I will compare with the sesults obtained with other mappers including STAR and HISAT
Generate an index with lncRNA and ERCC only
STAR2.5.2a --runThreadN 8 --runMode genomeGenerate --genomeDir ./ --genomeFastaFiles ../../Reference_Genomes/GRCh38_primary_assembly_ERCC.fa --sjdbGTFfile ../../Reference_Annotation/gencode_v24_lncRNAs_ERCC.gtf
Align all the samples with basic options recommanded by ENCODE using a simple bash script
RNAseq_STAR_pipeline.sh ../../../../../Index/STAR/GRCh38_ERCC_gencode_v24_lncRNAs/ R1 R2 8 output_dir 0
2 samples failed because of improper fastq formating "EXITING because of FATAL ERROR in reads input: quality string length is not equal to sequence length"*
List the STAR Samples similar to kallisto
In [14]:
# Create a file with sample names, descr and path of the result data
def STAR_sample_list (basedir):
sample_list = []
list_file = basedir+"/list_file.tsv"
# Iterate throught the result folder
for subdir in [path for path in glob(basedir+"/*") if isdir(path)]:
# Extract info
ss = basename(subdir).split("_")
cell_type = ss[1].strip().replace("-", "")
localization = ss[2].strip().replace("-", "")
replicate = ss[3].strip().replace("-", "")
sample_list.append ({
"sample" : "{}_{}_{}".format(cell_type, localization, replicate),
"cell_type" : cell_type,
"localization" : localization,
"replicate" : replicate,
"path" : abspath(subdir)+"/ReadsPerGene.out.tab"
})
# Sort the list to work with grouped data in the next steps of the analysis
sample_list.sort(key = lambda x: (x['cell_type'], x['localization'], x['replicate']))
# Find the number of replicates of each sample
rep_dict={}
for d in sample_list:
uniq_sample = "{}_{}".format(d["cell_type"], d["localization"])
if uniq_sample not in rep_dict:
rep_dict[uniq_sample] = 0
rep_dict[uniq_sample] += 1
# Filter out samples with less than 2 replicates
filter_list = []
for d in sample_list:
uniq_sample = "{}_{}".format(d["cell_type"], d["localization"])
if rep_dict[uniq_sample] >= 2:
filter_list.append(d)
else:
print ("{} was filtered out\n".format(d["sample"]))
# Write the results in a file
with open (list_file, "w") as outfp:
outfp.write("sample\tcell_type\tlocalization\treplicate\tpath\n")
for d in filter_list:
outfp.write("{}\t{}\t{}\t{}\t{}\n".format (d["sample"], d["cell_type"], d["localization"], d["replicate"], d["path"]))
return (list_file)
In [16]:
fp = STAR_sample_list("./Localisation_Original_Datasets/Djebali-ENCODE/STAR/lncRNA_ERCC")
head(fp, 20)
Aggregating all the results in a panda dataframe and export in a table
In [17]:
head("/home/aleg/Data/Datasets/Epi2_pilot/Localisation_Original_Datasets/Djebali-ENCODE/STAR/lncRNA_ERCC/ENCSR000CTL_A549_cytosol_1/ReadsPerGene.out.tab")
In [18]:
def STAR_aggregate (list_file, outdir):
data_dict = OrderedDict()
# Read the STAR sample_list
with open (list_file, "r") as sample_list:
header = next(sample_list)
for sample_line in sample_list:
sample, cell_type, localization, replicate, path = sample_line.strip().split("\t")
# Read the STAR count files and collect the data
with open (path, "r") as count_file:
for count_line in count_file:
gene_id, unstrand_count, pos_strand_count, neg_strand_count = count_line.strip().split("\t")
if gene_id not in data_dict:
data_dict[gene_id] = OrderedDict()
if sample not in data_dict[gene_id]:
data_dict[gene_id][sample] = OrderedDict()
data_dict[gene_id][sample]["unstrand_count"] = unstrand_count
data_dict[gene_id][sample]["pos_strand_count"] = pos_strand_count
data_dict[gene_id][sample]["neg_strand_count"] = neg_strand_count
# Write the collected data in a dict separatly for unstranded, neg_strand and pos_strand
with \
open ( outdir+"/STAR_unstrand_count.tsv", "w") as unstrand_fp, \
open ( outdir+"/STAR_pos_strand_count.tsv", "w") as pos_strand_fp, \
open ( outdir+"/STAR_neg_strand_count.tsv", "w") as neg_strand_fp:
first_row = True
for gene_id, gene_dict in data_dict.items():
if first_row:
unstrand_fp.write("\t".join(gene_dict.keys())+"\n")
pos_strand_fp.write("\t".join(gene_dict.keys())+"\n")
neg_strand_fp.write("\t".join(gene_dict.keys())+"\n")
first_row = False
unstrand_fp.write(gene_id)
pos_strand_fp.write(gene_id)
neg_strand_fp.write(gene_id
)
for sample_dict in gene_dict.values():
unstrand_fp.write("\t{}".format(sample_dict["unstrand_count"]))
pos_strand_fp.write("\t{}".format(sample_dict["pos_strand_count"]))
neg_strand_fp.write("\t{}".format(sample_dict["neg_strand_count"]))
unstrand_fp.write("\n")
pos_strand_fp.write("\n")
neg_strand_fp.write("\n")
In [19]:
STAR_aggregate ("./Localisation_Original_Datasets/Djebali-ENCODE/STAR/lncRNA_ERCC/list_file.tsv", "./Localisation_Original_Datasets/Djebali-ENCODE/STAR/lncRNA_ERCC")
head("./Localisation_Original_Datasets/Djebali-ENCODE/STAR/lncRNA_ERCC/STAR_unstrand_count.tsv")
head("./Localisation_Original_Datasets/Djebali-ENCODE/STAR/lncRNA_ERCC/STAR_pos_strand_count.tsv")
head("./Localisation_Original_Datasets/Djebali-ENCODE/STAR/lncRNA_ERCC/STAR_neg_strand_count.tsv")
Adding the STAR information to the dict containing kallisto results
In [21]:
# Define the vasedir containing STAR count results
STAR_countfile = "./Localisation_Original_Datasets/Djebali-ENCODE/STAR/lncRNA_ERCC/STAR_neg_strand_count.tsv"
ERCC_abundance_dict, ERCC_abundance_df = ERCC_table_parsing (ERCC_abundance_dict, STAR_countfile, "log_STAR_count")
ERCC_abundance_df.head(20)
Out[21]:
In [22]:
for X, Y in [["log_concentration", "log_STAR_count"], ["log_STAR_count", "log_est_raw"], ["log_STAR_count", "log_est_norm"], ["log_STAR_count", "log_tpm_raw"], ["log_STAR_count", "log_tpm_norm"]] :
g = sns.lmplot(x=X, y=Y, hue="localization", col="cell", data=ERCC_abundance_df, lowess=True)
plt.subplots_adjust(top=0.8)
g.fig.suptitle("{} /{}".format(Y.upper(), X.upper()))
In [1]:
################## R KERNEL ##################
# Test of tximport with kallisto counts
vignette("tximport")
In [1]:
# Import libraries
library(dplyr)
library(tximport)
library(readr)
# Define a function to prepare data and analyse them with
tximport_analysis <- function(sample_list, transcript_info_gencode, transcript_info_ERCC, outname) {
# Import the sample name and path from the sample list file
s2p = read.table(sample_list, header = TRUE)
abundance_list = file.path(s2p$path, "abundance.tsv")
names(abundance_list) = s2p$sample
# Import the gene info from gencode genes in a dataframe and rename the fields for tximport compatibility
t2g_encode = read.table(transcript_info_gencode, header = TRUE, stringsAsFactors=FALSE)
t2g_encode = rename(t2g_encode, TXNAME = GENCODE_transcript_id, GENEID = GENCODE_gene_id)
t2g_encode = select(t2g_encode, c(TXNAME, GENEID))
# Same thing for ERCC gene info list
t2g_ERCC = read.table(transcript_info_ERCC, header = TRUE, stringsAsFactors=FALSE)
t2g_ERCC = rename(t2g_ERCC, TXNAME = transcript_name, GENEID = gene_id)
t2g_ERCC = select(t2g_ERCC, c(TXNAME, GENEID))
# concatenate the 2 dataframes
t2g = rbind(t2g_encode, t2g_ERCC)
# Now aggregate the results at the gene level with tximport with a tpm and transcript length normalization
txi = tximport(abundance_list, type="kallisto", tx2gene=t2g, reader=read_tsv, countsFromAbundance="lengthScaled") # Scaled by transcript lenght only
# Write the normalized aggregated counts in a tsv table file
write.table(txi$counts, file = outname, row.names=TRUE, na="",col.names=TRUE, sep="\t", quote=FALSE)
return(txi$counts)
}
In [2]:
# Define files
sample_list = "./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/lncRNA_ERCC/list_file.tsv"
transcript_info_gencode = "../../Reference_Genomes/gencode_v24_lncRNA_transcripts_info.tsv"
transcript_info_ERCC = "../../Reference_Genomes/ERCC_info.tsv"
outname = "./Localisation_Original_Datasets/Djebali-ENCODE/tximport/lncRNA_ERCC_gene_level_counts.tsv"
counts = tximport_analysis(sample_list, transcript_info_gencode, transcript_info_ERCC, outname)
head (counts)
In [4]:
################## PYTHON3 KERNEL ##################
# Rerun all the data collection pipeline and add the last data from tximport
ERCC_dict = ERCC_concentration_parsing ("./Localisation_Original_Datasets/Djebali-ENCODE/Protocol_info/NistPool14Concentrations.tsv")
ERCC_abundance_dict, ERCC_abundance_df = ERCC_kallisto_parsing (ERCC_dict, "./Localisation_Original_Datasets/Djebali-ENCODE/sleuth/lncRNA_ERCC")
ERCC_abundance_dict, ERCC_abundance_df = ERCC_table_parsing (ERCC_abundance_dict, "./Localisation_Original_Datasets/Djebali-ENCODE/STAR/lncRNA_ERCC/STAR_neg_strand_count.tsv", "log_STAR_count")
ERCC_abundance_dict, ERCC_abundance_df = ERCC_table_parsing (ERCC_abundance_dict, "./Localisation_Original_Datasets/Djebali-ENCODE/tximport/lncRNA_ERCC_gene_level_counts.tsv", "log_tximport_count")
ERCC_abundance_df.head()
Out[4]:
In [5]:
for X, Y in [["log_concentration", "log_tximport_count"], ["log_concentration", "log_STAR_count"], ["log_concentration", "log_est_raw"], ["log_tximport_count", "log_STAR_count"], ["log_tximport_count", "log_est_raw"]] :
g = sns.lmplot(x=X, y=Y, hue="localization", col="cell", data=ERCC_abundance_df, lowess=True)
plt.subplots_adjust(top=0.8)
g.fig.suptitle("{} /{}".format(Y.upper(), X.upper()))
Althought the purpose was only to aggregate the kallisto results at gene level, we can notice a slight improvement after gene level aggreggation, sequencing depth normalization and transcript lenght normalization using tximport. The concentration vs count normalizaton seems a little better for low concentrations and the correlation with Star is always good
In [5]:
%%R
vignette('RUVSeq')
In [6]:
%%R
# Import libraries
library(dplyr)
library(RUVSeq)
library(RColorBrewer)
library(repr)
# Function to run the RUV-Seq normalization
RUVSeq_analysis <- function(count_file, outdir, prefix, unwanted_variation_factor=1) {
cat("IMPORT THE DATA\n")
# Import the data in a dataframe
allcount = read.table(count_file, header = TRUE)
genes = rownames(allcount)[grep("^ENS", rownames(allcount), perl = TRUE)]
spikes = rownames(allcount)[grep("^ERCC", rownames(allcount), perl = TRUE)]
cat(paste("\tGenes initial ", length(genes), "\n"))
cat(paste("\tSpikes initial ", length(spikes), "\n"))
cat("FILTERING NON AND LOW EXPRESSED GENES\n")
# Filter out non-expressed genes, by requiring more than 5 reads in at least two samples for each gene.
filter = apply(allcount, 1, function(x) length(x[x>5])>=2)
filtered = allcount[filter,]
genes = rownames(filtered)[grep("^ENS", rownames(filtered), perl = TRUE)]
spikes = rownames(filtered)[grep("^ERCC", rownames(filtered), perl = TRUE)]
cat(paste("\tGenes after filtering ", length(genes), "\n"))
cat(paste("\tSpikes after filtering ", length(spikes), "\n"))
cat("PLOTING DATA BEFORE NORMALIZATION\n")
# Create a list of samples by grouping sample without the replicate number at the end of the sample name (_#)
cond = as.factor(sub('(.)_.$', '\\1', colnames(filtered)))
#cond = as.factor(sapply(strsplit(colnames(filtered),'_'), "[", 1-2))
cell = as.factor(sapply(strsplit(colnames(filtered),'_'), "[", 1))
localization = as.factor(sapply(strsplit(colnames(filtered),'_'), "[", 2))
replicate = as.factor(sapply(strsplit(colnames(filtered),'_'), "[", 3))
df = data.frame(cell, localization, replicate, row.names=colnames(filtered))
set1 = newSeqExpressionSet(as.matrix(filtered), phenoData = df)
cat("PERFORM UPPER QUARTILE NORMALIZATION\n")
# Normalize between lanes using EDASeq upper quartile method
set2 = betweenLaneNormalization(set1, which="upper")
cat("PERFORM RUV G NORMALIZATION\n")
# Perform RUV g normalization based on the ERCC spike in controls
set3 = RUVg(set2, spikes, k=unwanted_variation_factor)
cat("PERFORM RUV R NORMALIZATION\n")
# Perform RUVr normalization based on residuals of a first GLM pass
# compute the residuals from the GLM fit
design = model.matrix(~localization, data=pData(set2))
y = DGEList(counts=counts(set2), group=localization)
y = calcNormFactors(y, method="upperquartile")
y = estimateGLMCommonDisp(y, design)
y = estimateGLMTagwiseDisp(y, design)
fit = glmFit(y, design)
res = residuals(fit, type="deviance")
# estimate the factors of unwanted variation for all genes
set4 = RUVr(set2, genes, k=unwanted_variation_factor, res)
cat("PLOTING DATA\n")
# Define a color vector to differentiate groups of replicates
n = length(levels(cond))
getPalette = colorRampPalette(brewer.pal(12, "Set3"))
colors = getPalette(n)
# Plot the Relative log ratio of read count to median read count accross sample = RLE
par(xpd=T, mar=c(0.2,4,2,10))
options(repr.plot.width=10, repr.plot.height=4)
plotRLE(set1, outline=F, las=2, cex.axis=0.5, ylim=c(-4, 4), col=colors[cond], ylab="RLE", main = "RLE plot before normalization")
legend("right",inset=c(-0.4,0), legend=levels(cond), fill=getPalette(n), cex = 0.7, y.intersp=1.5, bty="n")
plotRLE(set2, outline=F, las = 2, cex.axis = 0.5, ylim=c(-4, 4), col=colors[cond], ylab="RLE", main = "RLE plot after upper-quartile normalization")
legend("right",inset=c(-0.4,0), legend=levels(cond), fill=getPalette(n), cex = 0.7, y.intersp=1.5, bty="n")
plotRLE(set3, outline=F, las = 2, cex.axis = 0.5, ylim=c(-4, 4), col=colors[cond], ylab="RLE", main = "RLE plot after RUVg normalization")
legend("right",inset=c(-0.4,0), legend=levels(cond), fill=getPalette(n), cex = 0.7, y.intersp=1.5, bty="n")
plotRLE(set4, outline=F, las = 2, cex.axis = 0.5, ylim=c(-4, 4), col=colors[cond], ylab="RLE", main = "RLE plot after RUVr normalization")
legend("right",inset=c(-0.4,0), legend=levels(cond), fill=getPalette(n), cex = 0.7, y.intersp=1.5, bty="n")
# Plot 3D PCA of the datasets
par(mar=c(4,4,4,4))
plotPCA(set1, k=2, col=colors[cond], cex=0.7, main = "PCA plot before normalization")
plotPCA(set2, k=2, col=colors[cond], cex=0.7, main = "PCA plot after upper-quartile normalization")
plotPCA(set3, k=2, col=colors[cond], cex=0.7, main = "PCA plot after RUVg normalization")
plotPCA(set4, k=2, col=colors[cond], cex=0.7, main = "PCA plot after RUVr normalization")
cat("EXPORTING THE RESULTS\n")
fname = paste(c(outdir,"/",prefix,"RUVg_counts.tsv"), sep = "", collapse="")
write.table(normCounts(set3), file = fname, row.names=TRUE, na="",col.names=TRUE, sep="\t", quote=FALSE)
fname = paste(c(outdir,"/",prefix,"RUVr_counts.tsv"), sep = "", collapse="")
write.table(normCounts(set4), file = fname, row.names=TRUE, na="",col.names=TRUE, sep="\t", quote=FALSE)
return (set3)
}
In [7]:
%%R
# Normalising Kallisto_tximport data with k = 1
count_file = "./Localisation_Original_Datasets/Djebali-ENCODE/tximport/lncRNA_ERCC_gene_level_counts.tsv"
unwanted_variation_factor = 1
outname = "./Localisation_Original_Datasets/Djebali-ENCODE/RUV-seq/lncRNA_ERCC"
prefix = "kallisto_k1_"
set = RUVSeq_analysis(count_file, outname, prefix, unwanted_variation_factor)
In [9]:
%%R
# Normalising STAR data
count_file = "./Localisation_Original_Datasets/Djebali-ENCODE/STAR/lncRNA_ERCC/STAR_neg_strand_count.tsv"
unwanted_variation_factor = 1
outname = "./Localisation_Original_Datasets/Djebali-ENCODE/RUV-seq/lncRNA_ERCC"
prefix = "STAR_k1_"
set = RUVSeq_analysis(count_file, outname, prefix, unwanted_variation_factor)
The normalisation reduced significantly the difference of RLE between sample (Upper quartile norm) and the intra sample variability for both STAR and kallisto datasets. The PCA also shows that the replicates generally clustered together. After the UQ norm we can still see a clear nucleus / cytoplasm difference, but everything seems more mixed up after the RUVSeq normalization
In [11]:
# Rerun all the data collection pipeline and add the last data from tximport
ERCC_dict = ERCC_concentration_parsing ("./Localisation_Original_Datasets/Djebali-ENCODE/Protocol_info/NistPool14Concentrations.tsv")
ERCC_abundance_dict, ERCC_abundance_df = ERCC_kallisto_parsing (ERCC_dict, "./Localisation_Original_Datasets/Djebali-ENCODE/sleuth/lncRNA_ERCC/")
ERCC_abundance_dict, ERCC_abundance_df = ERCC_table_parsing (ERCC_abundance_dict, "./Localisation_Original_Datasets/Djebali-ENCODE/STAR/lncRNA_ERCC/STAR_neg_strand_count.tsv", "log_STAR_count")
ERCC_abundance_dict, ERCC_abundance_df = ERCC_table_parsing (ERCC_abundance_dict, "./Localisation_Original_Datasets/Djebali-ENCODE/tximport/lncRNA_ERCC_gene_level_counts.tsv", "log_kallisto_tximport_count")
ERCC_abundance_dict, ERCC_abundance_df = ERCC_table_parsing (ERCC_abundance_dict, "./Localisation_Original_Datasets/Djebali-ENCODE/RUV-seq/lncRNA_ERCC/kallisto_k1_RUVg_counts.tsv", "log_kallisto_RUVg_count")
ERCC_abundance_dict, ERCC_abundance_df = ERCC_table_parsing (ERCC_abundance_dict, "./Localisation_Original_Datasets/Djebali-ENCODE/RUV-seq/lncRNA_ERCC/kallisto_k1_RUVr_counts.tsv", "log_kallisto_RUVr_count")
ERCC_abundance_dict, ERCC_abundance_df = ERCC_table_parsing (ERCC_abundance_dict, "./Localisation_Original_Datasets/Djebali-ENCODE/RUV-seq/lncRNA_ERCC/STAR_k1_RUVg_counts.tsv", "log_STAR_RUVg_count")
ERCC_abundance_dict, ERCC_abundance_df = ERCC_table_parsing (ERCC_abundance_dict, "./Localisation_Original_Datasets/Djebali-ENCODE/RUV-seq/lncRNA_ERCC/STAR_k1_RUVr_counts.tsv", "log_STAR_RUVr_count")
ERCC_abundance_df.head()
Out[11]:
In [12]:
for X, Y in [["log_concentration", "log_est_raw"], ["log_concentration", "log_kallisto_RUVg_count"], ["log_concentration", "log_kallisto_RUVr_count"], ["log_concentration", "log_STAR_count"], ["log_concentration", "log_STAR_RUVg_count"], ["log_concentration", "log_STAR_RUVr_count"]] :
g = sns.lmplot(x=X, y=Y, hue="localization", col="cell", data=ERCC_abundance_df, lowess=True)
plt.subplots_adjust(top=0.8)
g.fig.suptitle("{} /{}".format(Y.upper(), X.upper()))
The RUVseq normalization is considerably reducing the already little difference between the 2 cellular fractions for the ERCC. This was expected since the normalization is based on the spike-in controls.
RUVr normalization doesn't seem to improve the results. I need to see with Catalina, Mat or Tom
The normalized results from STAR and Kallisto are still highly correlated, although maybe a little bit more variable for low values
Plot the count for the nucleus vs the cytoplasmic fraction
It will break the initial assumption so we are unsure if DESeq2 or EdgeR can be used in this case, at least not for the p-Value calculation
In [60]:
head("./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/lncRNA_ERCC/ENCSR000COK_K562_cytosol_2/abundance.tsv", 3)
head("./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/lncRNA_ERCC/list_file.tsv", 3)
In [13]:
# File path
sample_file = "./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/lncRNA_ERCC/list_file.tsv"
gene_info_file = "../../Reference_Genomes/gencode_v24_lncRNA_transcripts_info.tsv"
fish_loc_file = "./Localisation_Original_Datasets/Cabili_RNA_FISH/Localization_cell_ensembl_valid_HeLa.csv"
# Read the list of sample
sample_df = pd.read_table(sample_file, index_col=0)
count_dict = OrderedDict()
for sample, sval in sample_df.iterrows():
# Read the kallisto count file
count_df = pd.read_table(sval["path"]+"/abundance.tsv", index_col=0)
print ("Parsing sample: {}".format(sample))
for target_id, cval in count_df.iterrows():
# key = transcript + cell type to agregate results from the same cell type together
key = "{}_{}".format(target_id, sval["cell_type"])
if key not in count_dict :
count_dict[key] = OrderedDict()
count_dict[key]["target_id"] = target_id
count_dict[key]["cell_type"] = sval["cell_type"]
count_dict[key]["tpm_nucleus"] = []
count_dict[key]["tpm_cytosol"] = []
count_dict[key]["est_nucleus"] = []
count_dict[key]["est_cytosol"] = []
count_dict[key]["tpm_"+sval["localization"]].append(float(cval["tpm"]))
count_dict[key]["est_"+sval["localization"]].append(float(cval["est_counts"]))
# Calculate the log of the mean for each tpm and est list
for key in count_dict.keys():
mean_tpm_nucleus = stat.mean(count_dict[key]["tpm_nucleus"])
mean_tpm_cytosol = stat.mean(count_dict[key]["tpm_cytosol"])
if mean_tpm_nucleus or mean_tpm_cytosol:
count_dict[key]["mean_log2_tpm_nucleus"] = np.log2(0.5+mean_tpm_nucleus)
count_dict[key]["mean_log2_tpm_cytosol"] = np.log2(0.5+mean_tpm_cytosol)
mean_est_nucleus = stat.mean(count_dict[key]["est_nucleus"])
mean_est_cytosol = stat.mean(count_dict[key]["est_cytosol"])
if mean_est_nucleus or mean_est_cytosol:
count_dict[key]["mean_log2_est_nucleus"] = np.log2(0.5+mean_est_nucleus)
count_dict[key]["mean_log2_est_cytosol"] = np.log2(0.5+mean_est_cytosol)
# Add an additional column based on the gene localization by FISH for Hela cells only
# Import the gene info file and select only the columns with gene ID and transcript ID
t2g_df = pd.read_table(gene_info_file)[[0,1]]
# Remove the number at the end of the info file to be compatible with the numberless IDs in the loc fish file
t2g_df.GENCODE_gene_id = t2g_df.GENCODE_gene_id.str.rsplit('.').str[0]
# Import the columns geneID and localization number of the fish loc file
loc_df = pd.read_table(fish_loc_file)[[0,2]]
# Merge the 2 dataframe to retain only the intersection, ie association of transcript with a localization index
t2l_df = pd.merge(loc_df, t2g_df, how='inner', left_on='gene_ID', right_on='GENCODE_gene_id')
# Select the important columns and save in a dictionnary
t2l_dict = {val["GENCODE_transcript_id"]:int(val["localization pattern"]) for i, val in t2l_df.iterrows()}
for key in count_dict.keys():
if count_dict[key]["cell_type"] == "HeLaS3" and count_dict[key]["target_id"] in t2l_dict:
count_dict[key]["real_localization"] = t2l_dict[count_dict[key]["target_id"]]
else:
count_dict[key]["real_localization"] = 0
count_df = pd.DataFrame.from_dict(count_dict, orient='index')
count_df.head()
Out[13]:
In [16]:
# Extensive data ploting for HeLaS3 cell line
sns.set(style="white", color_codes=True)
df = count_df[(count_df.cell_type == "HeLaS3")]
# Plot the results for Hela Cells EST counts
g=sns.jointplot("mean_log2_est_nucleus", "mean_log2_est_cytosol", data=df, kind="reg", size=7, xlim=[-2,20], ylim=[-2,20], color="0.8", line_kws={"color":"green", "linewidth":1})
# Highlight the data from the FISH dataset
for loc, color in [[1,"darkred"], [2,"red"], [3,"orange"], [4,"gold"], [5,"white"]]:
sub_df = df[(df.real_localization == loc)]
plt.scatter(sub_df.mean_log2_est_nucleus, sub_df.mean_log2_est_cytosol, c=color)
# Add the ERCC genes to see where they align
sns.regplot("mean_log2_est_nucleus", "mean_log2_est_cytosol", data=df[count_df['target_id'].str.contains("ERCC")], color="blue", line_kws={"color":"darkblue", "linewidth":1})
# Add a diagonal line
plt.plot([-2,20],[-2,20], ls="--", c="black", linewidth =1)
# Add title and legend
g.fig.suptitle("HELA S3 - EST RESULTS")
plt.legend(bbox_to_anchor=(1.3, 1), loc=2,
labels=["Regression line all RNA","Regression line ERCC","Diagonal","All RNA","Distribution All RNA", "1 or 2 large nuclear foci", "Large nuclear foci + scattered in nucleus", "Predominantly nuclear no foci", "Cytoplasmic and nuclear", "Predominantly cytoplasmic", "ERCC"])
# Plot the results for Hela Cells TPM counts
g=sns.jointplot("mean_log2_tpm_nucleus", "mean_log2_tpm_cytosol", data=df, kind="reg", size=7, xlim=[-2,20], ylim=[-2,20], color="0.8", line_kws={"color":"green", "linewidth":1})
# Highlight the data from the FISH dataset
for loc, color in [[1,"darkred"], [2,"red"], [3,"orange"], [4,"gold"], [5,"white"]]:
sub_df = df[(df.real_localization == loc)]
plt.scatter(sub_df.mean_log2_tpm_nucleus, sub_df.mean_log2_tpm_cytosol, c=color)
# Add the ERCC genes to see where they align
sns.regplot("mean_log2_tpm_nucleus", "mean_log2_tpm_cytosol", data=df[count_df['target_id'].str.contains("ERCC")], color="blue", line_kws={"color":"darkblue", "linewidth":1})
# Add a diagonal line
plt.plot([-2,20],[-2,20], ls="--", c="black", linewidth =1)
# Add title and legend
g.fig.suptitle("HELA S3 - TPM RESULTS")
plt.legend(bbox_to_anchor=(1.3, 1), loc=2,
labels=["Regression line all RNA","Regression line ERCC","Diagonal","All RNA","Distribution All RNA", "1 or 2 large nuclear foci", "Large nuclear foci + scattered in nucleus", "Predominantly nuclear no foci", "Cytoplasmic and nuclear", "Predominantly cytoplasmic", "ERCC"])
Out[16]:
In [17]:
# Plot the results for the other cell lines
sns.set(style="white")
for cell_type in ["IMR90","A549","GM12878","HepG2","HUVEC","K562","MCF7","NHEK","SKNSH"]:
df = count_df[(count_df.cell_type == cell_type)]
g=sns.jointplot("mean_log2_tpm_nucleus", "mean_log2_tpm_cytosol", data=df, kind="reg", size=5, xlim=[-2,20], ylim=[-2,20], color="0.8", line_kws={"color":"green", "linewidth":1})
# Add the ERCC genes to see where they align
sns.regplot("mean_log2_tpm_nucleus", "mean_log2_tpm_cytosol", data=df[count_df['target_id'].str.contains("ERCC")], color="blue", line_kws={"color":"darkblue", "linewidth":1})
# Add a diagonal line
plt.plot([-2,20],[-2,20], ls="--", c="black", linewidth =1)
# Add title and legend
g.fig.suptitle(cell_type+" - TPM RESULTS")
plt.legend(bbox_to_anchor=(1.3, 1), loc=2, labels=["Regression line all RNA","Regression line ERCC","Diagonal","All RNA","Distribution All RNA", "ERCC"])
The distribution is actually more balanced than expected between the 2 fractions, althougt there are clearly a lot of difference in the enrichment of the genes. A significant subset of the transcript are found only in the nucleus or only in the cytoplasm. They might be the more interesting candidates but one should take into account that they might be undetected because of the depth of sequencing of the relative abundance of the other genes
The ERCC reg line is quite perfectly fitted on the diagonal (ie balance between the 2 fractions) after tpm normalization.
Given these results I doesn't seems to be neccessary to perform any additional normalization step, it might not break the assumptions of DESeq2 or Sleuth
RUVseq normalization might sligthly improve the data but probably not very significantly
=> Analyse at gene level with DESeq2 including TPM normalization by tximport +- RUV seq normalization but without the geometric mean normalization
=> Analyse at transcript and gene level with Sleuth but I need to tweak the normalization function to deactivate it
In [21]:
%%R
# Imports and setups
library("sleuth")
library("dplyr")
sleuth_analysis = function(
sample_list,
transcript_info=NULL,
select_col=NULL,
select_val=NULL,
full_model=~1,
outdir="./out/",
ncore=1,
max_bootstrap=NULL,
norm_fun=norm_factors,
filter_fun=basic_filter,
aggregation_column=NULL) {
# Multicore option
options(mc.cores = ncore)
# Create the output directory
dir.create(outdir, showWarnings=F, recursive=T)
cat("IMPORT AND SELECT THE SAMPLES\n")
# Import the file in a table and select sample according to select val and col
s2c = read.table(sample_list, header=T, stringsAsFactors=F)
if (!is.null(select_col)) {s2c = s2c[s2c[,select_col] == select_val,]}
print (s2c$sample)
cat("IMPORT THE GENE INFORMATIONS\n")
# Import the gene info in table and rename the fields for Sleuth compatibility
t2g = read.table(transcript_info, header=T, stringsAsFactors=F)
#t2g = rename(t2g, target_id = GENCODE_transcript_id, ens_gene = GENCODE_gene_id)
t2g = select(t2g, target_id, ens_gene, transcript_name, gene_name)
cat("PREPARE THE DATA FOR SLEUTH\n")
# Extract the data from the kallisto results.
so = sleuth_prep(s2c, full_model,filter_fun, t2g, max_bootstrap, norm_fun, norm_fun, aggregation_column)
cat("FIT TO FULL AND REDUCED MODEL\n")
# Estimate parameters for the sleuth response error measurement (full) model
so <- sleuth_fit(so)
# Estimate parameters for the sleuth reduced model (shrinkage)
so <- sleuth_fit(so, ~1, 'reduced')
cat("PERFORM WALD TESTING\n")
# Performing Wald test for all possible cofactors
for (beta_factor in colnames(so$design_matrix)) {so <- sleuth_wt(so, which_beta = beta_factor, 'full')}
cat("PERFORM LIKELYHOOD RATION TESTING\n")
# Perform likelihood ratio test
so <- sleuth_lrt(so, 'reduced', 'full')
cat("EXPORT RESULTS")
# Save all the results in a table
results_table <- sleuth_results(so, 'reduced:full', test_type = 'lrt')
fname = paste0(outdir, "DE_Sleuth.tsv")
write.table(results_table, file = fname, row.names=TRUE, na="",col.names=TRUE, sep="\t", quote=FALSE)
# Save the Sleuth object for further analysis
fname = paste0(outdir, "DE_Sleuth.rds")
saveRDS(so, file = fname)
return(so)
}
In [23]:
%%R
# test with one dataset
so = sleuth_analysis (
sample_list="./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/lncRNA_ERCC/list_file.tsv",
transcript_info="./Localisation_Original_Datasets/Djebali-ENCODE/Protocol_info/gencode_v24_transcripts_info_sleuth.tsv",
ncore=4,
max_bootstrap=3,
select_col="cell_type",
select_val="HeLaS3",
full_model=~localization+replicate,
outdir="./Localisation_Original_Datasets/Djebali-ENCODE/test/",
norm_fun = function(mat) {setNames(rep(1.0,ncol(mat)),colnames(mat))},
filter_fun = basic_filter)
In [ ]:
%%R
sleuth_live(so)
In [4]:
# Define generic arguments
sample_list = "./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/lncRNA_ERCC/list_file.tsv"
transcript_info = "../../Reference_Genomes/gencode_v24_lncRNA_transcripts_info.tsv"
outdir = "./Localisation_Original_Datasets/Djebali-ENCODE/sleuth/lncRNA_ERCC/"
ncore=4
max_bootstrap=100
norm_fun = function(mat) {setNames(rep(1.0,ncol(mat)),colnames(mat))}
filter_fun = function(row,min_reads=10,min_prop=0.47) {mean(row>=min_reads)>=min_prop}
# Analysis for all samples together at transcript level
cat ("\n\n### ANALYSING ALL SAMPLES ###\n")
so = sleuth_analysis (sample_list=sample_list, transcript_info=transcript_info, ncore=ncore, max_bootstrap=max_bootstrap,
full_model=~localization+replicate+cell_type,
outdir = paste0(outdir, "transcript_all","/"))
# Analysis cell line per cell line at transcript level
for (cell in c("IMR90","A549","GM12878","HeLaS3","HepG2","HUVEC","K562","MCF7","NHEK","SKNSH")) {
cat (paste0("\n\n### ANALYSING SAMPLE ", cell, "###\n"))
so = sleuth_analysis (sample_list=sample_list, transcript_info=transcript_info, ncore=ncore, max_bootstrap=max_bootstrap,
full_model=~localization+replicate,
outdir = paste0(outdir, "transcript_", cell,"/"),
select_col="cell_type",
select_val=cell)
}
In [2]:
!SleuthDEA.R
In [4]:
# Generic arguments
t2g = "./Localisation_Original_Datasets/Djebali-ENCODE/Protocol_info/gencode_v24_transcripts_info_sleuth.tsv"
select_col = "cell_type"
full_model = "localization+replicate"
min_est = 10
sig_level = 0.01
for gene_set in ["lncRNA_ERCC", "lncRNA"]:
for cell_line in ["IMR90","A549","GM12878","HeLaS3","HepG2","HUVEC","K562","MCF7","NHEK","SKNSH"]:
# Specific arguments
s2c = "./Localisation_Original_Datasets/Djebali-ENCODE/kallisto/{}/list_file.tsv".format(gene_set)
outdir = "./Localisation_Original_Datasets/Djebali-ENCODE/sleuth/transcript_{}/{}/".format(gene_set, cell_line)
# Build command line and launch it
cl = "SleuthDEA.R --s2c {} --t2g {} --outdir {} --select_col {} --select_val {} --full_model {} --min_est {} --sig_level {} --no_norm".format(s2c, t2g, outdir, select_col, cell_line, full_model, min_est, sig_level)
!{cl}
My laptop doesn't have enought RAM to analyse the larger datasets and the gene level aggregation mode. I used a similar python script than the one above to run all the conditions:
Extract the gene info from gencode datasets and identify with type of RNA is present in lncRNA and allRNA at GENE level
In [6]:
# Transform the gff3 format in a simple tsv format for simple parsing
# Common arguments
final_template=["{seqid}","\t","{type}","\t","{start}","\t","{end}","\t","{strand}","\t","{ID}","\t","{gene_type}","\t","{gene_name}"]
standard_template="gff3_ens_gene"
file_list = [
["../../Reference_Annotation/gencode_v24_lncRNAs.gff3", "../../Reference_Annotation/gencode_v24_lncRNAs_gene.tsv"],
["../../Reference_Annotation/gencode_v24.gff3", "../../Reference_Annotation/gencode_v24_gene.tsv"]]
for input_file, output_file in file_list:
# Parse and simplify the gff3 tables
reformat_table ( input_file=input_file, output_file=output_file, standard_template=standard_template,
final_template=final_template, keep_original_header=False, header_from_final_template= True)
# Analyse column 7 containing the type of the genes
print(colsum(output_file, colrange=[6], ret_type='report', max_items=0, header=True))
Extract the gene info from gencode datasets and identify with type of RNA is present in lncRNA and allRNA at TRANSCRIPT level
In [7]:
# Transform the gff3 format in a simple tsv format for simple parsing
# Common arguments
final_template=["{seqid}","\t","{type}","\t","{start}","\t","{end}","\t","{strand}","\t","{ID}","\t","{transcript_type}","\t","{transcript_name}","\t","{gene_id}","\t","{gene_type}","\t","{gene_name}"]
standard_template="gff3_ens_transcript"
file_list = [
["../../Reference_Annotation/gencode_v24_lncRNAs.gff3", "../../Reference_Annotation/gencode_v24_lncRNAs_transcript.tsv"],
["../../Reference_Annotation/gencode_v24.gff3", "../../Reference_Annotation/gencode_v24_transcript.tsv"]]
for input_file, output_file in file_list:
# Parse and simplify the gff3 tables
reformat_table ( input_file=input_file, output_file=output_file, standard_template=standard_template,
final_template=final_template, keep_original_header=False, header_from_final_template= True)
# Analyse column 7 containing the type of the genes
print(colsum(output_file, colrange=[6,9], ret_type='report', max_items=0, header=True))
In [21]:
# Find the count of the diffreent gene types in the localization data
df1 = pd.read_table("./Localisation/Djebali-ENCODE/sleuth/gene_allRNA/ALL/localizationnucleus_wald_test.tsv")
df2 = pd.read_table("../../Reference_Annotation/gencode_v24_gene.tsv")
df3 = pd.merge(df1,df2, how='inner', left_on="target_id", right_on="ID")
d = {gene_type:len(df) for gene_type, df in df3.groupby("gene_type")}
print(dict_to_report(d))
In [10]:
gene_file = "../../Reference_Annotation/gencode_v24_transcript.tsv"
DE_file_dir = "./Localisation/Djebali-ENCODE/sleuth/transcript_allRNA"
In [2]:
def lnc_coding_DE_plots (DE_file_dir, gene_file, FDR=1):
print_arg()
lncRNA_types = ["lincRNA", "antisense", "TEC", "sense_intronic", "processed_transcript", "sense_overlapping", "3prime_overlapping_ncrna", "non_coding", "bidirectional_promoter_lncrna", "macro_lncRNA"]
coding_RNA_types = ["protein_coding", "transcribed_processed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene"]
# Import the df with all gene IDs and create the lists of target id to highlight
gene_df = pd.read_table(gene_file)
hl = [
{"target_id": gene_df.ID[(gene_df.gene_type.isin(coding_RNA_types))], "label": "Coding RNA", "alpha":0.2, "color":"lightskyblue"},
{"target_id": gene_df.ID[(gene_df.gene_type.isin(lncRNA_types))], "label": "lnc RNA", "alpha":0.2, "color":"coral"}]
#for condition in condition_list:
for condition in ["ALL","IMR90","A549","GM12878","HeLaS3","HepG2","HUVEC","K562","MCF7","NHEK","SKNSH"]:
# read the appropriate file
DE_file = "{}/{}/localizationnucleus_wald_test.tsv".format(DE_file_dir, condition)
df= pd.read_table(DE_file)
df = df.dropna()
# Plot the graphs
plot_text(condition, align="left", fontsize=20, fontweight="bold")
volcano_plot (df=df, highlight_list=hl, X="b", Y="qval", FDR=FDR, X_cutoff=0, figsize=[15,7], alpha=0.5, xlim=[-8,8], ylim=[0,150],
sig_neg_color="0.8", sig_pos_color="0.8", non_sig_color="0.9",
xlabel = "Cytosol <<< beta factor >>> Nucleus", ylabel = "-log10(qval)", highlight_FDR=FDR)
MA_plot (df=df, highlight_list=hl, X="mean_obs", Y="b", FDR=FDR, FDR_col="qval", figsize=[15,7], alpha=0.5, xlim=[0,14], ylim=[-10,10] ,
sig_neg_color="0.8", sig_pos_color="0.8", non_sig_color="0.9",
xlabel="Mean expression", ylabel="Cytosol <<< beta factor >>> Nucleus", highlight_FDR=FDR)
In [3]:
lnc_coding_DE_plots(
gene_file = "../../Reference_Annotation/gencode_v24_gene.tsv",
DE_file_dir = "./Localisation/Djebali-ENCODE/sleuth/gene_allRNA",
FDR=0.05
)
In [4]:
lnc_coding_DE_plots(
gene_file = "../../Reference_Annotation/gencode_v24_transcript.tsv",
DE_file_dir = "./Localisation/Djebali-ENCODE/sleuth/transcript_allRNA",
FDR=0.05
)
Further analyses at transcript level for All cells together
In [6]:
# Importing data and plotting all RNA
df = pd.read_table( "./Localisation_Original_Datasets/Djebali-ENCODE/sleuth/transcript_allRNA/ALL/localizationnucleus_wald_test.tsv")
df = df.dropna()
sns.kdeplot(df.mean_obs, df.b, cmap="Greys", shade=True, shade_lowest=False, alpha=0.5)
# Defining RNA categories and importing the list of genes in gencode v24
lncRNA_types = ["lincRNA", "antisense", "TEC", "sense_intronic", "processed_transcript", "sense_overlapping", "3prime_overlapping_ncrna", "non_coding", "bidirectional_promoter_lncrna", "macro_lncRNA"]
coding_RNA_types = ["protein_coding", "transcribed_processed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene"]
gene_df = pd.read_table( "../../Reference_Annotation/gencode_v24_transcript.tsv")
# Subsetting coding genes and ploting
codingRNA_id = gene_df.ID[(gene_df.gene_type.isin(coding_RNA_types))]
codingRNA_df = df[df.target_id.isin(codingRNA_id)]
sns.kdeplot(codingRNA_df.mean_obs, codingRNA_df.b, cmap="Blues", shade=True, shade_lowest=False, alpha=0.5)
# Subsetting lncRNA genes and ploting
lncRNA_id = gene_df.ID[(gene_df.gene_type.isin(lncRNA_types))]
lncRNA_df = df[df.target_id.isin(lncRNA_id)]
sns.kdeplot(lncRNA_df.mean_obs, lncRNA_df.b, cmap="Reds", shade=True, shade_lowest=False, alpha=0.5)
# Draw a horizontal separation line
pl.hlines(0, 0,12, linestyle=":", color="0.75")
Out[6]:
all RNA and all cell line at transcript level depending of the transcript type
In [10]:
def transcript_level_DE_plots (DE_file, gene_file, min_transcript_per_class=20, max_RNA_classes=8, FDR=1):
print_arg()
# Import the data for all RNA and all cell line at transcript level
DE_df = pd.read_table(DE_file , skipinitialspace=True)
# Import transcript information
gene_df = pd.read_table(gene_file)
# Merge the 2 dataframes together
unique_col = DE_df.columns.difference(gene_df.columns)
df = pd.merge(gene_df, DE_df[unique_col], how="inner", left_on="ID", right_on="target_id")
# Index by gene name and transcript name and clean
df.drop(["ID", "type", "ens_gene"], axis=1, inplace=True)
df.dropna(inplace=True)
# Create list of RNA id per classes
hl=[]
for transcript_type, transcript_df in df.groupby("transcript_type"):
# filter by min_transcript_per_class
if len(transcript_df) >= min_transcript_per_class:
hl.append({"target_id": transcript_df.target_id, "label": transcript_type, "alpha":0.75, "linestyle":"--"})
# Filter by max_RNA_classes
hl.sort(key = lambda x: len(x["target_id"]), reverse=True)
hl = hl[:max_RNA_classes]
# Plot graphs
plot_text("Volcano plot Cytosol/ nuclear localization for each transcript biotypes", fontsize=15, align="center", fontweight="bold")
volcano_plot (df=df, highlight_list=hl, X="b", Y="qval", FDR=FDR, X_cutoff=0, figsize=[30,10], ylim=[0,100], alpha=0.5,
xlabel = "Cytosol <<< beta factor >>> Nucleus", ylabel = "-log10(qval)", highlight_FDR=FDR)
plot_text("MA plot Cytosol/ nuclear localization for each transcript biotypes", fontsize=15, align="center", fontweight="bold")
MA_plot (df=df, highlight_list=hl, X="mean_obs", Y="b", FDR=FDR, FDR_col="qval", figsize=[30,10], alpha=0.5, ylim=[-10,10],
xlabel="Mean expression", ylabel="Cytosol <<< beta factor >>> Nucleus", highlight_FDR=FDR)
plot_text("Cumulative distribution Cytosol/ nuclear localization for each transcript biotypes", fontsize=15, align="center", fontweight="bold")
density_plot(df[(df.qval <= FDR)], "b", figsize=[30,10], ylabel="Cumulative distribution", highlight_list= hl, cumulative = True)
plot_text("Mean expression distribution for each transcript biotypes", fontsize=15, align="center", fontweight="bold")
density_plot(df[(df.qval <= FDR)], "mean_obs", figsize=[30,10], ylabel="Mean expression distribution", highlight_list= hl, cumulative = False)
In [11]:
DE_file = "./Localisation/Djebali-ENCODE/sleuth/transcript_allRNA/ALL/localizationnucleus_wald_test.tsv"
gene_file = "../../Reference_Annotation/gencode_v24_transcript.tsv"
min_transcript_per_class = 20
max_RNA_classes = 10
FDR=1
transcript_level_DE_plots(DE_file, gene_file, min_transcript_per_class, max_RNA_classes, FDR)
Same analysis but with FDR thresholding to remove transcripts with uncertain localization
In [12]:
DE_file = "./Localisation/Djebali-ENCODE/sleuth/transcript_allRNA/ALL/localizationnucleus_wald_test.tsv"
gene_file = "../../Reference_Annotation/gencode_v24_transcript.tsv"
min_transcript_per_class = 20
max_RNA_classes = 8
FDR = 0.05
transcript_level_DE_plots(DE_file, gene_file, min_transcript_per_class, max_RNA_classes, FDR)
Most of the lncRNA classes (linc, processed transcripts, antisense pseudogene...) are clearly enriched in the nuclear fraction and their mean expression is rather low.
The "intron retained" class is the most nuclear class with an intermediate expression pattern. There nuclear localization was already reported in several studies. => good control of the analysis
The non-sense mediated decay class is equally distributed in cytosol and nucleus, also with an intermediate expression pattern
The protein coding transcripts are enriched apparently the only class enriched in the cytoplasm, which is also a good control because they are rrxpected to be in this compartment
HeLa cell and IMR90 are common between the 2 datasets. For the raw counts the correlation was not really good (see above). In addition I plotted the transcripts whose genes were reported in Cabili paper. This was maybe not a really good way to do it.
I reanalysed the Cabili dataset more extensively and I now have more precise results at both gene and transcript level.
The DE results from the Sleuth analysis are more representative of the nuclear or cytosol enrichment than the raw counts
In [9]:
condition_list = [[
"HeLa Gene Level",
"./Localisation/Djebali-ENCODE/sleuth/gene_lncRNA/HeLaS3/localizationnucleus_wald_test.tsv",
"./Localisation/Cabili_RNA_FISH/results_lncRNA/gene_HeLa.csv"],
[
"IMR90 Gene Level",
"./Localisation/Djebali-ENCODE/sleuth/gene_lncRNA/IMR90/localizationnucleus_wald_test.tsv",
"./Localisation/Cabili_RNA_FISH/results_lncRNA/gene_IMR90.csv"],
[
"HeLa Transcript Level",
"./Localisation/Djebali-ENCODE/sleuth/transcript_lncRNA/HeLaS3/localizationnucleus_wald_test.tsv",
"./Localisation/Cabili_RNA_FISH/results_lncRNA/transcript_HeLa.csv"],
[
"IMR90 Transcript Level",
"./Localisation/Djebali-ENCODE/sleuth/transcript_lncRNA/IMR90/localizationnucleus_wald_test.tsv",
"./Localisation/Cabili_RNA_FISH/results_lncRNA/transcript_IMR90.csv"]]
for condition, DE_file, FISH_file in condition_list:
# Load the Diff expression file from Sleuth
df= pd.read_table(DE_file)
df = df.dropna()
# Load the FISH localization results
loc_df = pd.read_table(FISH_file)
hl =[]
# Find the localization for each category of pattern from Cabili paper
for loc in [1,2,3,4,5]:
hl.append(
{"target_id": loc_df.target_id[(loc_df.localization == loc)], "label":"Localization pattern {}".format(loc)})
# Plot the graphs
plot_text(condition, align="left", fontsize=30, fontweight="bold")
volcano_plot (df=df, highlight_list=hl, X="b", Y="qval", FDR=0.05, X_cutoff=1, figsize=[15,7], alpha=0.5, ylim=[-1,30],
xlabel = "beta factor Nucleus localization", ylabel = "-log10(qval)")
MA_plot (df=df, highlight_list=hl, X="mean_obs", Y="b", FDR=0.05, FDR_col="qval", figsize=[15,7], alpha=0.5,
xlabel="Mean expression", ylabel="Beta factor Nucleus localization")
In [31]:
head("../../Reference_Annotation/gencode_v24_transcript.tsv")
Import the Sleuth DE data for transcript and merge with the transcript information from gencode
In [3]:
# Import the data for all RNA and all cell line at transcript level
DE_df = pd.read_table( "./Localisation_Original_Datasets/Djebali-ENCODE/sleuth/transcript_allRNA/ALL/localizationnucleus_wald_test.tsv", skipinitialspace=True)
# Import transcript information
gene_df = pd.read_table( "../../Reference_Annotation/gencode_v24_transcript.tsv")
# Merge the 2 dataframes together
unique_col = DE_df.columns.difference(gene_df.columns)
df = pd.merge(gene_df, DE_df[unique_col], how="inner", left_on="ID", right_on="target_id")
# Index by gene name and transcript name and clean
df.drop(["target_id", "type", "ens_gene"], axis=1, inplace=True)
df.rename(columns={'ID': 'transcript_id'}, inplace=True)
df.sort_values(by=['gene_id', 'transcript_id'], ascending=[True, True], inplace=True)
df.dropna(inplace=True)
df.head()
# Save to file for latter
df.to_csv("./Localisation_Original_Datasets/Djebali-ENCODE/All-RNA_All-cells_Sleuth-DE-Nuclear-loc_Gencodev24.tsv", sep="\t", index=False)
Plot the number of transcripts per gene depending of the types of RNAs (lncRNA or coding)
In [82]:
# Init counters
count_all=Counter()
count_codingRNA=Counter()
count_lncRNA=Counter()
w=0.25
# Define types of RNA
coding_RNA_types = ["protein_coding", "transcribed_processed_pseudogene", "transcribed_unitary_pseudogene", "translated_unprocessed_pseudogene"]
lncRNA_types = ["lincRNA", "antisense", "TEC", "sense_intronic", "processed_transcript", "sense_overlapping", "3prime_overlapping_ncrna", "non_coding", "bidirectional_promoter_lncrna", "macro_lncRNA"]
# Loop and count the number of transcript per genes
for gene_id, transcript_df in df.groupby("gene_id"):
count_all[len(transcript_df)]+=1
if transcript_df["gene_type"].all() in coding_RNA_types:
count_codingRNA[len(transcript_df)]+=1
elif transcript_df["gene_type"].all() in lncRNA_types:
count_lncRNA[len(transcript_df)]+=1
# Ploting the distributions
c= get_color_list(4, "Reds_r")
pl.figure(figsize=(20,7))
sns.set(style="whitegrid")
p1 = pl.bar(np.array([float(i) for i in count_all.keys()]), count_all.values(), label="ALL RNA transcript per gene distribution", color=next(c), width=w)
p2 = pl.bar(np.array([float(i) for i in count_codingRNA.keys()])+w, count_codingRNA.values(), label="Coding RNA transcript per gene distribution", color=next(c), width=w)
p3 = pl.bar(np.array([float(i) for i in count_lncRNA.keys()])+w*2, count_lncRNA.values(), label="LncRNA transcript per gene distribution", color=next(c), width=w)
pl.xlabel("Number of expressed isoforms per gene")
pl.ylabel("Count")
pl.title("")
pl.legend(bbox_to_anchor=(1, 1), loc=2,frameon=False)
Out[82]:
The majority of the genes only have 1 isoform detected expressed. It is particularly true for lncRNAs.
Select only the genes with transcripts highly nuclear and transcripts highly cytoplasmic
In [69]:
min_beta_range = 2
min_beta = 10
max_beta = -10
min_n_transcript = 3
max_genes_per_type = 100
# General collectors
count_dict = OrderedDict()
count_dict["all"]=[]
genetype_df_list=[]
for genetype, genetype_df in df.groupby("gene_type"):
# geneid level collectors
geneid_df_list = []
count_dict[genetype]=[]
for geneid, geneid_df in genetype_df.groupby("gene_id"):
# Select gene with at least 2 transcripts
if len(geneid_df)>=2:
# Store the range of beta nuclear loc variation for this gene ID
count_dict["all"].append(geneid_df.b.max()-geneid_df.b.min())
count_dict[genetype].append(geneid_df.b.max()-geneid_df.b.min())
# Store the geneid df in a list
geneid_df_list.append(geneid_df)
# For this type of gene select the most variable genes
geneid_df_list.sort(key = lambda df: df.b.max()-df.b.min(), reverse=True)
geneid_df_list = [df for df in geneid_df_list if len(df)>=min_n_transcript and df.b.max()-df.b.min()>=min_beta_range and df.b.min()<=min_beta and df.b.max() >= max_beta]
geneid_df_list = geneid_df_list[0:max_genes_per_type]
# If there are still genes after filtering concat the res in a single df and add it to the df list
if geneid_df_list:
geneid_df = pd.concat(geneid_df_list)
genetype_df_list.append(geneid_df)
n_genes = len(geneid_df.gene_id.unique())
print("Gene type {}: {} genes selected".format(genetype, n_genes))
# Concat everything in a single dataframe
df2 = pd.concat(genetype_df_list)
n_genes = len(df2.gene_id.unique())
print("Total: {} genes selected".format(n_genes))
Plot the results per genes
In [62]:
# Graph distribution of beta value delta per genes with at least 2 transcripts
pl.figure(figsize=(20, 10))
for genetype, val_list in count_dict.items():
if len(val_list)>=20:
sns.kdeplot(pd.Series(val_list), label=genetype)
pl.xlabel("Delta beta value between transcripts")
pl.ylabel("Count")
pl.title("Delta between beta values of nuclear/cytoplasmic localization of transcript per gene")
pl.legend(bbox_to_anchor=(1, 1), loc=2,frameon=False)
pl.xlim(-2,10)
Out[62]:
In [70]:
sns.set(style="whitegrid")
# Graph plotting and tweaking
for genetype, genetype_df in df2.groupby("gene_type"):
plot_text("{} - Distribution of preferential Nuclear/cytoplasmic localization of transcript per gene".format(genetype), fontsize=15, fontweight="bold", align="center", plot_len=30)
n_genes = len(genetype_df.gene_id.unique())
pl.figure(figsize=(30, n_genes/6), frameon=False)
pl.axes(frameon=False)
g= sns.boxplot(x="b", y="gene_id", data=genetype_df, whis=np.inf, width=0)
g= sns.stripplot(x="b", y="gene_id", hue="transcript_type", data=genetype_df, palette="Set1", size=(df2.mean_obs*2))
pl.grid(b=False, which="both", axis="y")
pl.grid(b=True, which="both", axis="x", linestyle=":", color="grey", linewidth=2)
pl.legend(bbox_to_anchor=(1, 1), loc=2,frameon=False)
pl.xlim(-10,10)
In [ ]: