In [2]:
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<b>The raw code for this IPython notebook is by default hidden for easier reading.To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a></b>''')
Out[2]:
In [1]:
# Standard lib imports
from glob import glob
from os import remove
# Local lib import
import pycl
import pyBioPlot as pbl
# Third party import
import pandas as pd
import numpy as np
import pylab as pl
import seaborn as sns
import scipy.stats as stats
# Pyplot tweaking
%matplotlib inline
pl.rcParams['figure.figsize'] = 30, 10 # that's default image size for this interactive session
# Larger display
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))
# Allow to use R directly
%load_ext rpy2.ipython
# Simplify warning reporting to lighten the notebook style
import warnings
warnings.formatwarning=(lambda message, *args: "{}\n".format(message))
Hendrickson D G, DR Kelley, D Tenen, B Bernstein, JL Rinn. Widespread RNA binding by chromatin-associated proteins.. Genome Biol 17, 28 (2016).
=> The data provided will be hard to analyse and cannnot simply be lifovered
=> I would have to re-analign the fastq files at transcript level, compare bound and unbound fractions, call the peaks...
I am not sure that this is worth to do it
JH Li, S Liu, LL Zheng, J Wu, WJ Sun, ZL Wang, H Zhou, LH Qu, JH Yang. Discovery of Protein-lncRNA Interactions by Integrating Large-Scale CLIP-Seq and RNA-Seq Datasets.. Front Bioeng Biotechnol 2, 88 (2014).
JH Li, S Liu, H Zhou, LH Qu, JH Yang. starBase v2.0: decoding miRNA-ceRNA, miRNA-ncRNA and protein-RNA interaction networks from large-scale CLIP-Seq data.. Nucleic Acids Res 42, D92-7 (2014).
Cross-match Protein location Uniprot with data protein/lncRNA database from Starbase
Selection of a limited number of candidate CLIP-Seq datasets based on their localization and the experimental conditions:
In [70]:
df = pd.read_csv("./Protein_interaction/Li_ClipSeq/Protocol_info/Selected_datasets.csv", sep="\t", index_col=0)
df
Out[70]:
In [2]:
def liftover (input_file, output_file, unmap_file, liftover_chainfile):
# Conversion of coordinates with Crossmap/liftover
with open (output_file, "w") as output, open(unmap_file, "w") as unmap:
# Build the command line
cmd = "CrossMap.py bed {} {}".format(liftover_chainfile, input_file)
# Write Liftovers coordinates
for line in pycl.bash(cmd, ret_stdout=True).split("\n"):
ls = line.split("\t")
try:
output.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(ls[7], ls[8], ls[9], ls[10], ls[11], ls[12]))
except IndexError as E:
unmap.write(line+"\n")
# Test with one file
pycl.mkdir("./Protein_interaction/Li_ClipSeq/test")
liftover_chainfile = "../../LiftOver_chain_files/hg19ToHg38.over.chain.gz"
input_file = "./Protein_interaction/Li_ClipSeq/Human/HHCD1.bed"
output_file = "./Protein_interaction/Li_ClipSeq/test/HHCD1_converted.bed"
unmap_file = "./Protein_interaction/Li_ClipSeq/test/HHCD1_unmap.bed"
liftover(input_file, output_file, unmap_file, liftover_chainfile)
pycl.head(input_file, 3)
pycl.head(output_file, 3)
pycl.head(unmap_file, 3)
In [3]:
import pybedtools
def intersect_bed (annotation_file, peak_file, output_file):
# Parse file with pybedtools and intersect the coordinate ranges
peak = pybedtools.BedTool(peak_file)
annotation = pybedtools.BedTool(annotation_file)
intersection = peak.intersect(annotation, wo=True, s=True)
# Reformat the file generated by pybedtools with the merged intervals to a simple Bed format
init_template=["{chrom}","\t","{start}","\t","{end}","\t","{info}","\t","{score}","\t","{strand}","\t","{chrom2}","\t","{source}","\t","{feature}","\t","{start2}",
"\t","{end2}","\t","{score2}","\t","{strand2}","\t","{frame}","\tID=","{ID}",";gene_id=","{gene_id}",";gene_type=","{gene_type}",
";gene_status=","{gene_status}",";gene_name=","{gene_name}",";level=","{level}",";havana_gene=","{havana_gene}","\t","{len}"]
final_template=["{chrom}","\t","{start}","\t","{end}","\t","{gene_id}","|","{gene_name}","|","{gene_type}","|","{len}","\t","{score}","\t","{strand}"]
pycl.reformat_table(
input_file=intersection.fn, output_file=output_file, init_template=init_template, final_template=final_template,
replace_internal_space='_', replace_null_val="-", keep_original_header=False, predicate=lambda v: v["feature"] == "gene", header_from_final_template=True)
# Test with one file
annotation_file = "../../Reference_Annotation/gencode_v24.gff3"
peak_file = "./Protein_interaction/Li_ClipSeq/test/HHCD1_converted.bed"
output_file = "./Protein_interaction/Li_ClipSeq/test/HHCD1_annotated.bed"
intersect_bed (annotation_file, peak_file, output_file)
pycl.linerange(peak_file)
pycl.linerange(output_file)
In [4]:
def unique_genes_list (input_file_list, output_file):
gene_dict = {}
# iterate on replicates of the same experiment
for input_file in input_file_list:
df = pd.read_csv(input_file, sep="[\t|]", engine='python')
# iterate on lines of a replicate bed file
for gene_id, gene_df in df.groupby("gene_id"):
if gene_id not in gene_dict:
gene_dict[gene_id] = {
"gene_name":gene_df["gene_name"].unique()[0],
"gene_type":gene_df["gene_type"].unique()[0],
"all_count":0, "rep_count":0}
gene_dict[gene_id]["all_count"]+=len(gene_df)
gene_dict[gene_id]["rep_count"]+=1
# Write the valid gene entries in a file
with open (output_file, "w") as output:
output.write("gene_id\tgene_name\tgene_type\tall_count\trep_count\n")
for gene_id, val in gene_dict.items():
if val["rep_count"] == len(input_file_list):
output.write("{}\t{}\t{}\t{}\t{}\n".format(gene_id, val["gene_name"], val["gene_type"], val["all_count"], val["rep_count"]))
# Test with one file
input_file = ["./Protein_interaction/Li_ClipSeq/test/HCTA1_annotated.bed"]
output_file = "./Protein_interaction/Li_ClipSeq/test/HCTA1_uniq_gene.tsv"
unique_genes_list(input_file, output_file)
pycl.linerange(output_file)
# Test with a list of files
input_file = [ "./Protein_interaction/Li_ClipSeq/test/HCTA1_annotated.bed", "./Protein_interaction/Li_ClipSeq/test/HHCD1_annotated.bed"]
output_file = "./Protein_interaction/Li_ClipSeq/test/HHMF1-3_uniq_gene.tsv"
unique_genes_list(input_file, output_file)
pycl.linerange(output_file)
In [20]:
liftover_chainfile = "../../LiftOver_chain_files/hg19ToHg38.over.chain.gz"
annotation_file = "../../Reference_Annotation/gencode_v24.gff3"
intermediate_folder = "./Protein_interaction/Li_ClipSeq/selected_datasets/"
final_folder = "./Protein_interaction/Li_ClipSeq/final_datasets/"
pycl.mkdir(intermediate_folder)
pycl.mkdir(final_folder)
# df containing the sample list details
df = pd.read_csv("./Protein_interaction/Li_ClipSeq/Protocol_info/Selected_datasets.csv", sep="\t", index_col=0)
# dict to count the file of the different dataset and experiments
dataset_dict = {}
exp_dict = {}
# Iterate by all sample of the same esperiment (include replicates if there are some replicates)
for (prot,exp, cl, loc, aut), exp_df in df.groupby(["protein","experiment", "cell_line", "localization", "author"]):
exp_id = "{}_{}_{}_{}_{}".format(prot,exp, cl, loc, aut)
print("Analysis experiment ", exp_id)
intersect_file_list=[]
# Iterate on the replicates of the same experiment
for dataset_id, line in exp_df.iterrows():
print("\tAnalysis dataset {}".format(dataset_id))
input_file = line["filepath"]
print ("\t\tConverting coordinates with liftover")
converted_file = "{}{}_coord_hg38.bed".format(intermediate_folder, dataset_id)
unmap_file = "{}{}_unmap.bed".format(intermediate_folder, dataset_id)
liftover(input_file, converted_file, unmap_file, liftover_chainfile)
print ("\t\tIntersect with annotation file")
intersect_file = "{}{}_coord_hg38_annot_gencode24.bed".format(intermediate_folder, dataset_id)
intersect_file_list.append (intersect_file)
intersect_bed (annotation_file, converted_file, intersect_file)
# Fill the counter
dataset_dict[dataset_id] = {
"Init_sites":pycl.simplecount(input_file),
"Converted_coord_sites": pycl.simplecount(converted_file),
"Unmap_sites": pycl.simplecount(unmap_file),
"Annotated_sites": pycl.simplecount(intersect_file)}
# list uniq genes for each experiment. Replicate experiment and inner-joined before
print ("\tList uniq genes")
unique_gene_file = "{}{}_uniq_genes.tsv".format(final_folder, exp_id)
unique_genes_list (intersect_file_list, unique_gene_file)
exp_dict[exp_id] = {
"Uniq_genes": pycl.simplecount(unique_gene_file),
"Replicate_Number": len(intersect_file_list)}
In [21]:
pycl.dict_to_html(dataset_dict)
Out[21]:
In [22]:
pycl.dict_to_html(exp_dict)
Out[22]:
It might also be interesting to have a set of core generic RNA associated with nuclear or cytoplasmic candidates. To do so, I need to intersect the data irrespective of their cell line I will create several datasets
In [8]:
pycl.linerange("./Protein_interaction/Li_ClipSeq/final_datasets/AGO1_PAR-CLIP_HEK293_cytosol_hafner_uniq_genes.tsv")
In [14]:
input_folder = "./Protein_interaction/Li_ClipSeq/final_datasets/"
output_file = "./Protein_interaction/Li_ClipSeq/final_datasets/ALL_dataset_uniq_genes.tsv"
remove(output_file)
# dict to count the file of the different categorie
gene_loc_dict = {}
for input_file in glob(input_folder+"*tsv"):
sample_name = pycl.file_basename(input_file)
print("Analysis dataset ", sample_name)
localization = sample_name.split("_")[3]
unique_gene_df = pd.read_csv(input_file, sep="\t", index_col=0)
for gene_id, gene_line in unique_gene_df.iterrows():
if gene_id not in gene_loc_dict:
gene_loc_dict[gene_id] = {"gene_name": gene_line["gene_name"] ,"gene_type":gene_line["gene_type"] ,"cytosol":0, "nucleus":0}
gene_loc_dict[gene_id][localization] += 1
# Write results in a file
with open (output_file, "w") as output:
output.write("gene_id\tgene_name\tgene_type\tcytosol_protein\tnuclear_protein\tnuclearness\n")
for gene_id, val in gene_loc_dict.items():
output.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(gene_id,val["gene_name"],val["gene_type"],val["cytosol"],val["nucleus"],val["nucleus"]-val["cytosol"]))
pycl.linerange(output_file)
In [6]:
# Create a distribution graph to identify the number of pure nuclear interacting or pure cytosol interacting RNAs
distrib_nucleus = []
distrib_cytosol = []
for gene_id, val in gene_loc_dict.items():
if not val["nucleus"]:
distrib_cytosol.append(val["cytosol"])
elif not val["cytosol"]:
distrib_nucleus.append(val["nucleus"])
distrib_nucleus.sort()
distrib_cytosol.sort()
p= pl.hist(distrib_nucleus,align="mid", histtype="stepfilled")
p= pl.hist(distrib_cytosol,align="mid", histtype="stepfilled")
In [16]:
# Create a distribution graph to identify the number of pure nuclear interacting or pure cytosol interacting RNAs
distrib_nuclearness = sorted([val["nucleus"]-val["cytosol"] for val in gene_loc_dict.values()])
p= pl.hist(distrib_nuclearness)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: