In [1]:
# For ipython inline ploting ploting
%matplotlib inline
# Larger display
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
# Import of required packages
import pandas as pd
import pylab as pl
import pysam
from time import time
# Other third party imports
from pycl.pycl import jhelp, jprint, head, tail, cat
# Import functions from JGV
from JGV import JGV
from JGV_Reference import Reference
from JGV_Annotation import Annotation
from JGV_Alignment import Alignment
from JGV_Level import Level
In [4]:
!mkdir -p "./downloaded_data/"
!wget "ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_25/GRCh38.primary_assembly.genome.fa.gz" -O "./downloaded_data/GRCh38_primary.fa.gz"
!wget "ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_25/gencode.v25.primary_assembly.annotation.gff3.gz" -O "./downloaded_data/gencode_v25_primary.gff3.gz"
!wget "ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_25/gencode.v25.primary_assembly.annotation.gtf.gz" -O "./downloaded_data/gencode_v25_primary.gtf.gz"
!wget "ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_25/gencode.v25.lncRNA_transcripts.fa.gz" -O "./downloaded_data/gencode_v25_lncRNA_transcripts.fa.gz"
!wget "ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_25/gencode.v25.long_noncoding_RNAs.gff3.gz" -O "./downloaded_data/gencode_v25_lncRNA.gff3.gz"
!wget "http://fantom.gsc.riken.jp/5/suppl/Hon_et_al_2016/data/assembly/lv2_permissive/FANTOM_CAT.lv2_permissive.all_lncRNA.bed.gz" -O "./downloaded_data/FANTOM_5_all_lncRNA.bed.gz"
!wget "http://fantom.gsc.riken.jp/5/suppl/Hon_et_al_2016/data/assembly/lv2_permissive/FANTOM_CAT.lv2_permissive.all_lncRNA.gtf.gz" -O "./downloaded_data/FANTOM_5_all_lncRNA.gtf.gz"
!wget -v "http://www.ebi.ac.uk/~aleg/data/share/1M.bam" -O "./downloaded_data/1M.bam"
Expand BAM to SAM using samtools
In [5]:
!samtools view -h "./downloaded_data/1M.bam" > "./downloaded_data/1M.sam"
!samtools view -h "./downloaded_data/1M.bam" | head -n 100197 > "./downloaded_data/100k.sam"
!samtools view "./downloaded_data/1M.bam" > "./downloaded_data/1M_no_header.sam"
In [6]:
!ls -l "./downloaded_data/"
In [2]:
jhelp(Reference.__init__, full=True)
Test the instanciation of the Reference class from a fasta files
In [11]:
jprint(Reference ("./data/yeast.fa.gz", output_index=True, verbose=True))
In [12]:
jprint(Reference ("./data/yeast.fa.gz", refid_list="chr1", output_index=True, verbose=True)) #Xfail
In [10]:
l = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM']
r = Reference ("./downloaded_data/GRCh38_primary.fa.gz", refid_list=l, output_index=True, verbose=True)
jprint(r)
Test the instanciation of the Reference class from a fasta index
In [13]:
r = Reference ("./downloaded_data/GRCh38_primary.tsv", verbose=True)
jprint(r)
In [14]:
r = Reference ("./downloaded_data/GRCh38_primary.tsv")
jprint ("Number of refid: ", r.refid_count)
jprint ("List of refid:\n", r.refid_list)
In [15]:
r = Reference ("./data/yeast.tsv")
jprint ("Number of refid: ", r.refid_count)
jprint ("List of refid:\n", r.refid_list)
In [3]:
jhelp(Reference.get_refid_len, full=True)
In [20]:
r = Reference ("./data/yeast.tsv")
r.get_refid_len("I", verbose=True)
Out[20]:
In [22]:
r = Reference ("./downloaded_data/GRCh38_primary.tsv")
r.get_refid_len("chr8", verbose=True)
Out[22]:
In [21]:
r = Reference ("./downloaded_data/GRCh38_primary.tsv")
r.get_refid_len("chrINVALID", verbose=True)
In [4]:
jhelp(Level.__init__, full=True)
In [16]:
l = Level (max_depth=3, offset=10, filter_pos=False, filter_neg=False, filter_unstrand=True)
jprint(l)
In [5]:
jhelp(Level.__call__, full=True)
Test object calling with various values of features
All should be valid except 4 and 10
In [56]:
l = Level (max_depth=3, offset=10, filter_pos=False, filter_neg=False, filter_unstrand=True)
jprint(l(ID="1", start=10, end=20, strand="+"))
jprint(l(ID="2", start=12, end=22, strand="+"))
jprint(l(ID="3", start=14, end=24, strand="+"))
jprint(l(ID="4", start=23, end=26, strand="+"))
jprint(l(ID="5", start=14, end=24, strand="-"))
jprint(l(ID="6", start=27, end=43, strand="-"))
jprint(l(ID="7", start=27, end=48, strand="-"))
jprint(l(ID="8", start=54, end=76, strand="-"))
jprint(l(ID="9", start=54, end=76, strand="+"))
jprint(l(ID="10", start=54, end=76, strand="."))
jprint(l)
Alternative calling
All should be valid except 3,5,6,7 and 8
In [60]:
l = Level (max_depth=2, offset=2, filter_pos=False, filter_neg=True, filter_unstrand=False)
jprint(l(ID="1", start=10, end=20, strand="+"))
jprint(l(ID="2", start=12, end=22, strand="+"))
jprint(l(ID="3", start=14, end=24, strand="+"))
jprint(l(ID="4", start=23, end=26, strand="+"))
jprint(l(ID="5", start=14, end=24, strand="-"))
jprint(l(ID="6", start=27, end=43, strand="-"))
jprint(l(ID="7", start=27, end=48, strand="-"))
jprint(l(ID="8", start=54, end=76, strand="-"))
jprint(l(ID="9", start=54, end=76, strand="."))
jprint(l(ID="10", start=54, end=76, strand="."))
jprint(l)
In [61]:
l.max_level
Out[61]:
In [62]:
l.min_level
Out[62]:
In [63]:
l.n_level
Out[63]:
In [6]:
jhelp(Annotation.__init__, full=True)
In [8]:
# Instantiation from bed, gtf and gff3
file_list = ["./downloaded_data/FANTOM_5_all_lncRNA.bed.gz", "./downloaded_data/gencode_v25_lncRNA.gff3.gz", "./downloaded_data/gencode_v25_primary.gtf.gz" ]
refid_list = ["chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chrX", "chrY", "chrM"]
type_list = "exon"
min_len = 25
max_len = 5000
for fp in file_list:
jprint(fp, bold=True)
jprint (Annotation (fp, refid_list=refid_list, type_list=type_list, min_len=min_len, max_len=max_len, verbose=True))
In [8]:
jhelp(Annotation.to_pickle, full=True)
In [9]:
file_list = ["./downloaded_data/FANTOM_5_all_lncRNA.bed.gz","./downloaded_data/FANTOM_5_all_lncRNA.gtf.gz", "./downloaded_data/gencode_v25_lncRNA.gff3.gz", "./downloaded_data/gencode_v25_primary.gtf.gz" , "./downloaded_data/gencode_v25_primary.gff3.gz"]
for fp in file_list:
jprint(fp, bold=True)
a = Annotation (fp)
pickle_fp = a.to_pickle(verbose=True)
a = Annotation(pickle_fp, verbose=True)
In [10]:
a = Annotation ("./downloaded_data/FANTOM_5_all_lncRNA.bed.pkl", refid_list = ["chr21", "chrX", "chrY", "chrM"])
jprint ("Feature count:", a.feature_count)
jprint ("Refid count:", a.refid_count)
jprint ("Type count:", a.type_count)
jprint("Refid list:", a.refid_list)
jprint("Type list:", a.type_list)
display(a.refid_count_uniq.head())
display(a.type_count_uniq.head())
In [13]:
a = Annotation ("./downloaded_data/gencode_v25_primary.gff3.pkl", ref_list=["chr21", "chrX", "chrY", "chrM"], type_list=["exon", "transcript", "CDS"] )
jprint ("Feature count:", a.feature_count)
jprint ("Refid count:", a.refid_count)
jprint ("Type count:", a.type_count)
jprint("Refid list:", a.refid_list)
jprint("Type list:", a.type_list)
display(a.refid_count_uniq.head())
display(a.type_count_uniq.head())
In [7]:
jhelp(Annotation.interval_features, full=True)
In [8]:
a = Annotation ("./downloaded_data/FANTOM_5_all_lncRNA.bed.pkl")
display(a.interval_features("chrX", start=70723, end=1456774).head())
display(a.interval_features("chrX", start=71011, end=200000, feature_types=["exon"]).head()) ## Xempty
display(a.interval_features("chr1", start=71011, end=200000).head())
In [9]:
a = Annotation ("./downloaded_data/FANTOM_5_all_lncRNA.gtf.pkl")
display(a.interval_features("chr2", start=100000, end=200000, feature_types="exon").head())
display(a.interval_features("chr2", start=100000, end=200000, feature_types=["exon", "transcript"]).head())
display(a.interval_features("chr2", start=0, end=200000000, max_features_per_type=2))
display(a.interval_features("chr2", start=0, end=10000, feature_types=["exon", "transcript"]).head()) ## Xempty
display(a.interval_features("chrX", start=71011, end=200000, feature_types="CDS").head()) ## Xempty
display(a.interval_features("chrINVALID", start=71011, end=200000, feature_types=["exon"]).head()) ## Xempty
display(a.interval_features("chrX", start=200000, end=10000).head()) ## Xempty
In [3]:
jhelp(Alignment.__init__, full=True)
Test instantiation from BAM, SAM and BED files
In [8]:
file_list = ["./downloaded_data/1M.bam", "./downloaded_data/1M.sam","./downloaded_data/100k.sam", "./downloaded_data/1M.bed.gz", "./downloaded_data/100k.bed.gz"]
l = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM']
for fp in file_list:
jprint(fp, bold=True)
%time jprint(Alignment (fp, refid_list=l, min_coverage=5, output_bed=True, verbose=True))
Suposed to fail if no header in the sam/bam file
In [9]:
a = Alignment (fp="./downloaded_data/1M_no_header.sam", verbose=True)
In [11]:
a = Alignment (fp="./downloaded_data/1M.bed.gz", verbose=True)
jprint ("Number of refid: ", a.refid_count)
jprint ("List of refid:\n", a.refid_list)
jprint ("Base coverage of the reference sequence:\n", a.refid_nbases)
In [13]:
l = ['chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM']
a = Alignment (fp="./downloaded_data/1M.bed.gz", verbose=True, refid_list=l)
jprint ("Number of refid: ", a.refid_count)
jprint ("List of refid:\n", a.refid_list)
jprint ("Base coverage of the reference sequence:\n", a.refid_nbases)
In [14]:
jhelp(Alignment.interval_coverage, full=True)
In [15]:
a = Alignment (fp="./downloaded_data/1M.bed.gz", verbose=True)
In [16]:
df = a.interval_coverage(refid="chrM", start=500, end=10000, bins=200, bin_repr_fun='max')
display(df.head(5))
df.plot.area(figsize=(30, 4), color=("dodgerblue", "lightskyblue"))
Out[16]:
In [17]:
df = a.interval_coverage(refid="chr1", start=0, end=300000000, bins=500, bin_repr_fun='mean')
display(df.head(5))
df.plot.area(figsize=(30, 4), color=("dodgerblue", "lightskyblue"))
Out[17]:
In [18]:
df = a.interval_coverage(refid="chr47", start=0, end=300000000)
display(df.head(5))
df.plot.area(figsize=(30, 4), color=("dodgerblue", "lightskyblue"))
Out[18]:
In [19]:
df = a.interval_coverage(refid="chr7", start=1997, end=97495, bins=100, bin_repr_fun='max')
display(df.head(5))
df.plot.area(figsize=(30, 4))
Out[19]:
In [20]:
jhelp(JGV.__init__, full= True)
In [21]:
# Init with a fasta file
fp = "./downloaded_data/GRCh38_primary.fa.gz"
l = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']
j = JGV(fp=fp, refid_list=l, output_index=True, verbose=True)
In [22]:
# Init with a fasta index file
fp = "./downloaded_data/GRCh38_primary.tsv"
j = JGV(fp=fp, verbose=True)
In [13]:
jhelp(JGV.add_annotation, full=True)
In [23]:
j.add_annotation("./downloaded_data/gencode_v25_primary.gff3.pkl", refid_list= ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10'], type_list=["exon", "transcript", "gene"], verbose=True)
In [24]:
j.add_annotation("./downloaded_data/FANTOM_5_all_lncRNA.gtf.pkl", min_len=50, max_len=5000, verbose=True)
In [25]:
jhelp(JGV.add_alignment, full=True)
In [26]:
j.add_alignment("./downloaded_data/1M.bed.gz", min_coverage=10, verbose=True)
In [27]:
j.add_alignment("./downloaded_data/100k.sam", refid_list= ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10'], verbose=True)
In [28]:
jhelp(JGV.alignment_summary, full=True)
without and with annotation track
In [29]:
# Empty JGV object
j = JGV("./downloaded_data/GRCh38_primary.tsv", ref_list=l)
j.annotation_summary()
In [30]:
# JGV object with 3 annotations
l = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10']
j = JGV("./downloaded_data/GRCh38_primary.tsv", ref_list=l)
j.add_annotation("./downloaded_data/FANTOM_5_all_lncRNA.gtf.pkl", refid_list=l)
j.add_annotation("./downloaded_data/gencode_v25_lncRNA.gff3.pkl", refid_list=l)
j.add_annotation("./downloaded_data/gencode_v25_primary.gff3.pkl", refid_list=l)
j.annotation_summary()
In [31]:
jhelp(JGV.alignment_summary, full=True)
without and with alignment track
In [32]:
# Empty JGV object
j = JGV("./downloaded_data/GRCh38_primary.tsv", ref_list=l)
j.alignment_summary()
In [33]:
# JGV object with 2 alignment tracks
l = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10']
j = JGV("./downloaded_data/GRCh38_primary.tsv", refid_list=l)
j.add_alignment("./downloaded_data/1M.bed.gz", refid_list=l)
j.add_alignment("./downloaded_data/100k.bed.gz", refid_list=l)
j.alignment_summary()
In [34]:
jhelp(JGV.refid_coverage_plot, full=True)
In [36]:
# JGV object with 2 annotations and 2 alignment tracks
l = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']
j = JGV("./downloaded_data/GRCh38_primary.tsv", refid_list=l)
j.add_annotation("./downloaded_data/FANTOM_5_all_lncRNA.gtf.pkl", refid_list=l)
j.add_annotation("./downloaded_data/gencode_v25_lncRNA.gff3.pkl", refid_list=l)
j.add_alignment("./downloaded_data/1M.bed.gz", refid_list=l)
j.add_alignment("./downloaded_data/100k.bed.gz", refid_list=l)
In [42]:
df = j.refid_coverage_plot(norm_depth=True, norm_len=False)
display(df)
In [44]:
r = j.refid_coverage_plot(
norm_depth = True,
norm_len = True,
plot_style="ggplot",
figwidth = 30,
figheight = 10,
log = True,
refid_list = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20'],
color=("dodgerblue", "green"),
alpha=0.5,
fontsize=16)
In [45]:
jhelp(JGV.interval_plot, full=True)
In [54]:
# Create an instances of IGV
l = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']
j = JGV(fp = "./downloaded_data/GRCh38_primary.tsv", ref_list=l)
# Add annotation track only
j.add_annotation("./downloaded_data/FANTOM_5_all_lncRNA.gtf.pkl", type_list="exon", ref_list=l)
j.add_annotation("./downloaded_data/gencode_v25_lncRNA.gff3.pkl", ref_list=l)
j.add_annotation("./downloaded_data/gencode_v25_lncRNA.gff3.pkl", name="gencode_lnc_gene_min_2000" ,min_len=2000, ref_list=l, type_list="gene")
In [55]:
j.interval_plot(refid="chrM") #Xempty
In [56]:
j.interval_plot(refid="chrY")
In [59]:
# Create an instances of IGV
l = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']
j = JGV(fp = "./downloaded_data/GRCh38_primary.tsv", ref_list=l)
# 3 alignment tracks only
j.add_alignment("./downloaded_data/1M.bed.gz", ref_list=l)
j.add_alignment("./downloaded_data/1M.bed.gz", name="1M_min_cov=50",ref_list=l, min_coverage=50)
j.add_alignment("./downloaded_data/100k.bed.gz", ref_list=l)
In [60]:
j.interval_plot(refid="chrM")
In [61]:
j.interval_plot(refid="chrY")
In [62]:
# Create an instances of IGV
l = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']
j = JGV(fp = "./downloaded_data/GRCh38_primary.tsv", ref_list=l)
# 1 alignment track
j.add_alignment("./downloaded_data/1M.bed.gz", ref_list=l)
# 1 annotation track
j.add_annotation("./downloaded_data/gencode_v25_lncRNA.gff3.pkl", ref_list=l)
In [64]:
j.interval_plot(refid="chrM")
In [65]:
j.interval_plot(refid="chrY")
In [66]:
# Plot a specific windows and testing some of the options
j.interval_plot(refid="chr21", start=45400000, end=45600000, alignment_log= True, annotation_label=True, max_features_per_type=100, feature_types=["gene", "transcript"], max_label_size=15)
In [67]:
j.interval_plot(refid="chr12", start=56000000, end=58000000, alignment_log= False, annotation_track_height=1)
In [68]:
# Testing color customization
j.interval_plot(refid="chr19", alignment_color=("red", "violet"), alignment_alpha=0.75, annotation_color="green")
Test with a large annotation file
In [69]:
# Create fast instances of IGV
fp = "./downloaded_data/GRCh38_primary.tsv"
l = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM']
# JGV object with 2 annotations and 2 alignment tracks
j = JGV(fp=fp, ref_list=l)
j.add_annotation("./downloaded_data/gencode_v25_primary.gff3.pkl")
j.add_alignment("./downloaded_data/1M.bed.gz")
In [71]:
# Plot
%time j.interval_plot(refid="chr1")