TEST JGV package

This notebook contains tests for all functions contained in JGV package

Imports


In [1]:
# For ipython inline ploting ploting
%matplotlib inline

# Larger display 
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Import of required packages
import pandas as pd
import pylab as pl
import pysam
from time import time

# Other third party imports
from pycl.pycl import jhelp, jprint, head, tail, cat

# Import functions from JGV
from JGV import JGV
from JGV_Reference import Reference
from JGV_Annotation import Annotation
from JGV_Alignment import Alignment
from JGV_Level import Level


Folder setup and download test files


In [4]:
!mkdir -p "./downloaded_data/"
!wget "ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_25/GRCh38.primary_assembly.genome.fa.gz" -O "./downloaded_data/GRCh38_primary.fa.gz"
!wget "ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_25/gencode.v25.primary_assembly.annotation.gff3.gz" -O "./downloaded_data/gencode_v25_primary.gff3.gz"
!wget "ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_25/gencode.v25.primary_assembly.annotation.gtf.gz" -O "./downloaded_data/gencode_v25_primary.gtf.gz"
!wget "ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_25/gencode.v25.lncRNA_transcripts.fa.gz" -O "./downloaded_data/gencode_v25_lncRNA_transcripts.fa.gz"
!wget "ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_25/gencode.v25.long_noncoding_RNAs.gff3.gz" -O "./downloaded_data/gencode_v25_lncRNA.gff3.gz"
!wget "http://fantom.gsc.riken.jp/5/suppl/Hon_et_al_2016/data/assembly/lv2_permissive/FANTOM_CAT.lv2_permissive.all_lncRNA.bed.gz" -O "./downloaded_data/FANTOM_5_all_lncRNA.bed.gz"
!wget "http://fantom.gsc.riken.jp/5/suppl/Hon_et_al_2016/data/assembly/lv2_permissive/FANTOM_CAT.lv2_permissive.all_lncRNA.gtf.gz" -O "./downloaded_data/FANTOM_5_all_lncRNA.gtf.gz"
!wget -v "http://www.ebi.ac.uk/~aleg/data/share/1M.bam" -O "./downloaded_data/1M.bam"


--2017-07-26 14:55:02--  ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_25/GRCh38.primary_assembly.genome.fa.gz
           => ‘./downloaded_data/GRCh38_primary.fa.gz’
Resolving ftp.sanger.ac.uk (ftp.sanger.ac.uk)... 193.62.203.17
Connecting to ftp.sanger.ac.uk (ftp.sanger.ac.uk)|193.62.203.17|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/gencode/Gencode_human/release_25 ... done.
==> SIZE GRCh38.primary_assembly.genome.fa.gz ... 844691642
==> PASV ... done.    ==> RETR GRCh38.primary_assembly.genome.fa.gz ... done.
Length: 844691642 (806M) (unauthoritative)

GRCh38.primary_asse 100%[===================>] 805.56M  6.54MB/s    in 2m 16s  

2017-07-26 14:57:19 (5.91 MB/s) - ‘./downloaded_data/GRCh38_primary.fa.gz’ saved [844691642]

--2017-07-26 14:57:19--  ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_25/gencode.v25.primary_assembly.annotation.gff3.gz
           => ‘./downloaded_data/gencode_v25_primary.gff3.gz’
Resolving ftp.sanger.ac.uk (ftp.sanger.ac.uk)... 193.62.203.17
Connecting to ftp.sanger.ac.uk (ftp.sanger.ac.uk)|193.62.203.17|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/gencode/Gencode_human/release_25 ... done.
==> SIZE gencode.v25.primary_assembly.annotation.gff3.gz ... 46752876
==> PASV ... done.    ==> RETR gencode.v25.primary_assembly.annotation.gff3.gz ... done.
Length: 46752876 (45M) (unauthoritative)

gencode.v25.primary 100%[===================>]  44.59M  6.54MB/s    in 7.3s    

2017-07-26 14:57:27 (6.08 MB/s) - ‘./downloaded_data/gencode_v25_primary.gff3.gz’ saved [46752876]

--2017-07-26 14:57:27--  ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_25/gencode.v25.primary_assembly.annotation.gtf.gz
           => ‘./downloaded_data/gencode_v25_primary.gtf.gz’
Resolving ftp.sanger.ac.uk (ftp.sanger.ac.uk)... 193.62.203.17
Connecting to ftp.sanger.ac.uk (ftp.sanger.ac.uk)|193.62.203.17|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/gencode/Gencode_human/release_25 ... done.
==> SIZE gencode.v25.primary_assembly.annotation.gtf.gz ... 38827267
==> PASV ... done.    ==> RETR gencode.v25.primary_assembly.annotation.gtf.gz ... done.
Length: 38827267 (37M) (unauthoritative)

gencode.v25.primary 100%[===================>]  37.03M  5.91MB/s    in 6.3s    

2017-07-26 14:57:34 (5.87 MB/s) - ‘./downloaded_data/gencode_v25_primary.gtf.gz’ saved [38827267]

--2017-07-26 14:57:34--  ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_25/gencode.v25.lncRNA_transcripts.fa.gz
           => ‘./downloaded_data/gencode_v25_lncRNA_transcripts.fa.gz’
Resolving ftp.sanger.ac.uk (ftp.sanger.ac.uk)... 193.62.203.17
Connecting to ftp.sanger.ac.uk (ftp.sanger.ac.uk)|193.62.203.17|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/gencode/Gencode_human/release_25 ... done.
==> SIZE gencode.v25.lncRNA_transcripts.fa.gz ... 7230335
==> PASV ... done.    ==> RETR gencode.v25.lncRNA_transcripts.fa.gz ... done.
Length: 7230335 (6.9M) (unauthoritative)

gencode.v25.lncRNA_ 100%[===================>]   6.89M  4.81MB/s    in 1.4s    

2017-07-26 14:57:36 (4.81 MB/s) - ‘./downloaded_data/gencode_v25_lncRNA_transcripts.fa.gz’ saved [7230335]

--2017-07-26 14:57:36--  ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_25/gencode.v25.long_noncoding_RNAs.gff3.gz
           => ‘./downloaded_data/gencode_v25_lncRNA.gff3.gz’
Resolving ftp.sanger.ac.uk (ftp.sanger.ac.uk)... 193.62.203.17
Connecting to ftp.sanger.ac.uk (ftp.sanger.ac.uk)|193.62.203.17|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /pub/gencode/Gencode_human/release_25 ... done.
==> SIZE gencode.v25.long_noncoding_RNAs.gff3.gz ... 3255620
==> PASV ... done.    ==> RETR gencode.v25.long_noncoding_RNAs.gff3.gz ... done.
Length: 3255620 (3.1M) (unauthoritative)

gencode.v25.long_no 100%[===================>]   3.10M  3.51MB/s    in 0.9s    

2017-07-26 14:57:37 (3.51 MB/s) - ‘./downloaded_data/gencode_v25_lncRNA.gff3.gz’ saved [3255620]

--2017-07-26 14:57:38--  http://fantom.gsc.riken.jp/5/suppl/Hon_et_al_2016/data/assembly/lv2_permissive/FANTOM_CAT.lv2_permissive.all_lncRNA.bed.gz
Resolving fantom.gsc.riken.jp (fantom.gsc.riken.jp)... 134.160.84.66
Connecting to fantom.gsc.riken.jp (fantom.gsc.riken.jp)|134.160.84.66|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4311175 (4.1M) [application/x-gzip]
Saving to: ‘./downloaded_data/FANTOM_5_all_lncRNA.bed.gz’

./downloaded_data/F 100%[===================>]   4.11M   524KB/s    in 13s     

2017-07-26 14:57:51 (327 KB/s) - ‘./downloaded_data/FANTOM_5_all_lncRNA.bed.gz’ saved [4311175/4311175]

--2017-07-26 14:57:51--  http://fantom.gsc.riken.jp/5/suppl/Hon_et_al_2016/data/assembly/lv2_permissive/FANTOM_CAT.lv2_permissive.all_lncRNA.gtf.gz
Resolving fantom.gsc.riken.jp (fantom.gsc.riken.jp)... 134.160.84.66
Connecting to fantom.gsc.riken.jp (fantom.gsc.riken.jp)|134.160.84.66|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7530977 (7.2M) [application/x-gzip]
Saving to: ‘./downloaded_data/FANTOM_5_all_lncRNA.gtf.gz’

./downloaded_data/F 100%[===================>]   7.18M  1.14MB/s    in 7.4s    

2017-07-26 14:58:00 (994 KB/s) - ‘./downloaded_data/FANTOM_5_all_lncRNA.gtf.gz’ saved [7530977/7530977]

--2017-07-26 14:58:00--  http://www.ebi.ac.uk/~aleg/data/share/1M.bam
Resolving www.ebi.ac.uk (www.ebi.ac.uk)... 193.62.193.80
Connecting to www.ebi.ac.uk (www.ebi.ac.uk)|193.62.193.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 98262031 (94M) [text/plain]
Saving to: ‘./downloaded_data/1M.bam’

./downloaded_data/1 100%[===================>]  93.71M  6.38MB/s    in 15s     

2017-07-26 14:58:15 (6.22 MB/s) - ‘./downloaded_data/1M.bam’ saved [98262031/98262031]

Expand BAM to SAM using samtools


In [5]:
!samtools view -h "./downloaded_data/1M.bam" > "./downloaded_data/1M.sam"
!samtools view -h "./downloaded_data/1M.bam" | head -n 100197 > "./downloaded_data/100k.sam"
!samtools view "./downloaded_data/1M.bam" > "./downloaded_data/1M_no_header.sam"


samtools view: writing to standard output failed: Broken pipe
samtools view: error closing standard output: -1

In [6]:
!ls -l "./downloaded_data/"


total 1565444
-rw-rw-r-- 1 aleg aleg    535815 Apr 13 17:02 100k.bed.gz
-rw-rw-r-- 1 aleg aleg  25775335 Jul 26 14:58 100k.sam
-rw-rw-r-- 1 aleg aleg  98262031 Feb  6 16:58 1M.bam
-rw-rw-r-- 1 aleg aleg  10403247 Apr 13 17:02 1M.bed.gz
-rw-rw-r-- 1 aleg aleg 257695746 Jul 26 14:58 1M_no_header.sam
-rw-rw-r-- 1 aleg aleg 257702066 Jul 26 14:58 1M.sam
-rw-rw-r-- 1 aleg aleg   4311175 Mar 22 08:21 FANTOM_5_all_lncRNA.bed.gz
-rw-rw-r-- 1 aleg aleg   7530977 Mar 22 08:21 FANTOM_5_all_lncRNA.gtf.gz
-rw-rw-r-- 1 aleg aleg   3255620 Jul 26 14:57 gencode_v25_lncRNA.gff3.gz
-rw-rw-r-- 1 aleg aleg   7230335 Jul 26 14:57 gencode_v25_lncRNA_transcripts.fa.gz
-rw-rw-r-- 1 aleg aleg  46752876 Jul 26 14:57 gencode_v25_primary.gff3.gz
-rw-rw-r-- 1 aleg aleg  38827267 Jul 26 14:57 gencode_v25_primary.gtf.gz
-rw-rw-r-- 1 aleg aleg 844691642 Jul 26 14:57 GRCh38_primary.fa.gz
-rw-rw-r-- 1 aleg aleg       376 Apr 13 16:55 GRCh38_primary.tsv

Reference

Instanciation


In [2]:
jhelp(Reference.__init__, full=True)


__init__ (self, fp, name=None, refid_list=[], output_index=False, verbose=False, **kwargs)
  • fp

A fasta file containing the reference sequences OR an tab separated index file containing at least 2 columns

with the refid and the length in bases (like a .fa.fai file generated by samtools faidx, or with the

output_index option of this function)

The fasta option will take more time as the file has to be parsed to get the refid and length of sequences.

Both fasta and infex file can be gziped

  • name

Name of the data file that will be used as track name for plotting. If not given, will be deduced from fp

file name

  • refid_list

list of reference sequence id to select from the data file, by default all [ DEFAULT: [] ]

  • output_index

If True will write a simple A 2 column index tsv file containing the Reference sequence ids and their

lengths [ DEFAULT: False ]

Test the instanciation of the Reference class from a fasta files


In [11]:
jprint(Reference ("./data/yeast.fa.gz", output_index=True, verbose=True))


Parsing fasta file

Write a fasta index file: ./data/yeast.tsv

 Found 17 reference sequences

Reference instance
  IV length: 1531933
  XV length: 1091291
  VII length: 1090940
  XII length: 1078177
  XVI length: 948066
  XIII length: 924431
  II length: 813184
  XIV length: 784333
  X length: 745751
  XI length: 666816
  V length: 576874
  VIII length: 562643
  IX length: 439888
  III length: 316620
  VI length: 270161
  I length: 230218
  Mito length: 85779
 ext fa
 fp ./data/yeast.fa.gz
 name yeast


In [12]:
jprint(Reference ("./data/yeast.fa.gz", refid_list="chr1", output_index=True, verbose=True)) #Xfail


Parsing fasta file

---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
<ipython-input-12-e17022b0aefe> in <module>()
----> 1 jprint(Reference ("./data/yeast.fa.gz", refid_list="chr1", output_index=True, verbose=True)) #Xfail

~/Programming/Python3/JupyterGenoViewer/JGV/JGV_Reference.py in __init__(self, fp, name, refid_list, output_index, verbose, **kwargs)
     69 
     70             # Check if sequences found
---> 71             assert d, "No Sequence found"
     72 
     73             # Transform the counter in a Dataframe and sort by length

AssertionError: No Sequence found

In [10]:
l = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM']
r = Reference ("./downloaded_data/GRCh38_primary.fa.gz", refid_list=l, output_index=True, verbose=True)
jprint(r)


Parsing fasta file

Write a fasta index file: ./downloaded_data/GRCh38_primary.tsv

 Found 25 reference sequences

Reference instance
  chr1 length: 248956422
  chr2 length: 242193529
  chr3 length: 198295559
  chr4 length: 190214555
  chr5 length: 181538259
  chr6 length: 170805979
  chr7 length: 159345973
  chrX length: 156040895
  chr8 length: 145138636
  chr9 length: 138394717
  chr11 length: 135086622
  chr10 length: 133797422
  chr12 length: 133275309
  chr13 length: 114364328
  chr14 length: 107043718
  chr15 length: 101991189
  chr16 length: 90338345
  chr17 length: 83257441
  chr18 length: 80373285
  chr20 length: 64444167
  chr19 length: 58617616
  chrY length: 57227415
  chr22 length: 50818468
  chr21 length: 46709983
  chrM length: 16569
 ext fa
 fp ./downloaded_data/GRCh38_primary.fa.gz
 name GRCh38_primary

Test the instanciation of the Reference class from a fasta index


In [13]:
r = Reference ("./downloaded_data/GRCh38_primary.tsv", verbose=True)
jprint(r)


Assume the file is a fasta index

 Found 25 reference sequences

Reference instance
  chr1 length: 248956422
  chr2 length: 242193529
  chr3 length: 198295559
  chr4 length: 190214555
  chr5 length: 181538259
  chr6 length: 170805979
  chr7 length: 159345973
  chrX length: 156040895
  chr8 length: 145138636
  chr9 length: 138394717
  chr11 length: 135086622
  chr10 length: 133797422
  chr12 length: 133275309
  chr13 length: 114364328
  chr14 length: 107043718
  chr15 length: 101991189
  chr16 length: 90338345
  chr17 length: 83257441
  chr18 length: 80373285
  chr20 length: 64444167
  chr19 length: 58617616
  chrY length: 57227415
  chr22 length: 50818468
  chr21 length: 46709983
  chrM length: 16569
 ext tsv
 fp ./downloaded_data/GRCh38_primary.tsv
 name GRCh38_primary

Properties


In [14]:
r = Reference ("./downloaded_data/GRCh38_primary.tsv")

jprint ("Number of refid: ", r.refid_count)
jprint ("List of refid:\n", r.refid_list)


Number of refid: 25

List of refid:
['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chrX', 'chr8', 'chr9', 'chr11', 'chr10', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr20', 'chr19', 'chrY', 'chr22', 'chr21', 'chrM']


In [15]:
r = Reference ("./data/yeast.tsv")

jprint ("Number of refid: ", r.refid_count)
jprint ("List of refid:\n", r.refid_list)


Number of refid: 17

List of refid:
['IV', 'XV', 'VII', 'XII', 'XVI', 'XIII', 'II', 'XIV', 'X', 'XI', 'V', 'VIII', 'IX', 'III', 'VI', 'I', 'Mito']

get_refid_len


In [3]:
jhelp(Reference.get_refid_len, full=True)


get_refid_len (self, refid, verbose=False, **kwargs)

Return the length of a given refid, If the reference is not found return None


In [20]:
r = Reference ("./data/yeast.tsv")
r.get_refid_len("I", verbose=True)


Out[20]:
230218

In [22]:
r = Reference ("./downloaded_data/GRCh38_primary.tsv")
r.get_refid_len("chr8", verbose=True)


Out[22]:
145138636

In [21]:
r = Reference ("./downloaded_data/GRCh38_primary.tsv")
r.get_refid_len("chrINVALID", verbose=True)


The reference sequence chrINVALID was not found in the reference list

Level

Instanciation


In [4]:
jhelp(Level.__init__, full=True)


__init__ (self, max_depth=100, offset=10, filter_pos=False, filter_neg=False, filter_unstrand=False)

Define general options for Level class

  • max_depth

Maximal total number of positive or negative levels. Safeguard value in case of highly

overlapped annotations [ DEFAULT: 100 ]

  • offset

Minimal distance between 2 contigous annotation features on the same level [ DEFAULT: 10 ]

  • filter_pos

Filter-out annotation features on the positive strand [ DEFAULT: False ]

  • filter_neg

Filter-out annotation features on the negative strand [ DEFAULT: False ]

  • filter_unstrand

Filter-out annotation features with no strand specified [ DEFAULT: False ]


In [16]:
l = Level (max_depth=3, offset=10, filter_pos=False, filter_neg=False, filter_unstrand=True)
jprint(l)


Level instance
 count Counter()
 enhanced_feature 
 filter_neg False
 filter_pos False
 filter_unstrand True
 level_dict {}
 max_depth 3
 neg_arrowstyle <|-,head_width=1,head_length=2
 offset 10
 pos_arrowstyle -|>,head_width=1,head_length=2
 unstrand_arrowstyle -

Calling


In [5]:
jhelp(Level.__call__, full=True)


__call__ (self, ID, start, end, strand)

Compute the level of an annnotation feature based on the instance options and the other features previously

analysed to avoid overlapping. Iterative call of the function has to be done with annotation features sorted

by start coordinates.

  • ID

Name of the feature to fit in a level

  • start

Start coordinate of the feature to fit in a level, on the positive strand

  • end

End coordinate of the feature to fit in a level, on the positive strand

  • strand

Strand of the feature. Can be + - or . if unknown

Test object calling with various values of features

All should be valid except 4 and 10


In [56]:
l = Level (max_depth=3, offset=10, filter_pos=False, filter_neg=False, filter_unstrand=True)

jprint(l(ID="1", start=10, end=20, strand="+"))
jprint(l(ID="2", start=12, end=22, strand="+"))
jprint(l(ID="3", start=14, end=24, strand="+"))
jprint(l(ID="4", start=23, end=26, strand="+"))
jprint(l(ID="5", start=14, end=24, strand="-"))
jprint(l(ID="6", start=27, end=43, strand="-"))
jprint(l(ID="7", start=27, end=48, strand="-"))
jprint(l(ID="8", start=54, end=76, strand="-"))
jprint(l(ID="9", start=54, end=76, strand="+"))
jprint(l(ID="10", start=54, end=76, strand="."))

jprint(l)


enhanced_feature(ID='1', start=10, end=20, arrowstyle='-|>,head_width=1,head_length=2', level=1)

enhanced_feature(ID='2', start=12, end=22, arrowstyle='-|>,head_width=1,head_length=2', level=2)

enhanced_feature(ID='3', start=14, end=24, arrowstyle='-|>,head_width=1,head_length=2', level=3)

None

enhanced_feature(ID='5', start=14, end=24, arrowstyle='<|-,head_width=1,head_length=2', level=-1)

enhanced_feature(ID='6', start=27, end=43, arrowstyle='<|-,head_width=1,head_length=2', level=-2)

enhanced_feature(ID='7', start=27, end=48, arrowstyle='<|-,head_width=1,head_length=2', level=-3)

enhanced_feature(ID='8', start=54, end=76, arrowstyle='<|-,head_width=1,head_length=2', level=-1)

enhanced_feature(ID='9', start=54, end=76, arrowstyle='-|>,head_width=1,head_length=2', level=1)

None

Level instance
 count Counter({'all_features': 10, 'positive_features': 5, 'negative_features': 4})
 enhanced_feature 
 filter_neg False
 filter_pos False
 filter_unstrand True
 level_dict {1: 76, 2: 22, 3: 24, -1: 76, -3: 48, -2: 43}
 max_depth 3
 neg_arrowstyle <|-,head_width=1,head_length=2
 offset 10
 pos_arrowstyle -|>,head_width=1,head_length=2
 unstrand_arrowstyle -

Alternative calling

All should be valid except 3,5,6,7 and 8


In [60]:
l = Level (max_depth=2, offset=2, filter_pos=False, filter_neg=True, filter_unstrand=False)

jprint(l(ID="1", start=10, end=20, strand="+"))
jprint(l(ID="2", start=12, end=22, strand="+"))
jprint(l(ID="3", start=14, end=24, strand="+"))
jprint(l(ID="4", start=23, end=26, strand="+"))
jprint(l(ID="5", start=14, end=24, strand="-"))
jprint(l(ID="6", start=27, end=43, strand="-"))
jprint(l(ID="7", start=27, end=48, strand="-"))
jprint(l(ID="8", start=54, end=76, strand="-"))
jprint(l(ID="9", start=54, end=76, strand="."))
jprint(l(ID="10", start=54, end=76, strand="."))

jprint(l)


enhanced_feature(ID='1', start=10, end=20, arrowstyle='-|>,head_width=1,head_length=2', level=1)

enhanced_feature(ID='2', start=12, end=22, arrowstyle='-|>,head_width=1,head_length=2', level=2)

None

enhanced_feature(ID='4', start=23, end=26, arrowstyle='-|>,head_width=1,head_length=2', level=1)

None

None

None

None

enhanced_feature(ID='9', start=54, end=76, arrowstyle='-', level=0)

enhanced_feature(ID='10', start=54, end=76, arrowstyle='-', level=0)

Level instance
 count Counter({'all_features': 10, 'positive_features': 4, 'unstranded_features': 2})
 enhanced_feature 
 filter_neg True
 filter_pos False
 filter_unstrand False
 level_dict {0: 76, 1: 26, 2: 22}
 max_depth 2
 neg_arrowstyle <|-,head_width=1,head_length=2
 offset 2
 pos_arrowstyle -|>,head_width=1,head_length=2
 unstrand_arrowstyle -

Properties


In [61]:
l.max_level


Out[61]:
2

In [62]:
l.min_level


Out[62]:
0

In [63]:
l.n_level


Out[63]:
3

Annotation

Instantiation


In [6]:
jhelp(Annotation.__init__, full=True)


__init__ (self, fp, name=None, min_len=None, max_len=None, refid_list=None, type_list=None, verbose=False, **kwargs)
  • fp

A path to a standard genomic file containing features annotations among the following format

Alternatively, one can use a python pickle file (.pkl) generated during a previous run.

The file can eventually be compressed in ‘gzip’ format

  • min_len

Minimal size (start to end) of a feature to be selected [default None]

  • max_len

Maximal size (start to end) of a feature to be selected [default None]

  • refid_list

List of reference id to select. Example: ["chr1", "chr2", "chr3"] [default None]

  • type_list

List of feature type to select. Example: ["exon", "gene"] [default None]


In [8]:
# Instantiation from bed, gtf and gff3 
file_list = ["./downloaded_data/FANTOM_5_all_lncRNA.bed.gz", "./downloaded_data/gencode_v25_lncRNA.gff3.gz", "./downloaded_data/gencode_v25_primary.gtf.gz" ]
refid_list = ["chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chrX", "chrY", "chrM"]
type_list =  "exon"
min_len = 25
max_len = 5000

for fp in file_list:
    jprint(fp, bold=True)
    jprint (Annotation (fp, refid_list=refid_list, type_list=type_list, min_len=min_len, max_len=max_len, verbose=True))


./downloaded_data/FANTOM_5_all_lncRNA.bed.gz

Parse Annotation file

 File is gziped

 Use BED parser to parse annotations

 Successfully imported as a bed12 file

 Remove null values

 Removed 0 invalid lines

 Cast coordinates to integer

Selecting features based on length

 Features before filtering: 175031

 Features after minimal length filtering: 174805

 Features after maximal length filtering: 99802

Selecting features based on reference id

 Features before filtering: 99802

 Features after filtering: 98253

Selecting features based on type

 Features before filtering: 98253

 Features after filtering: 0

Sorting and final cleanup

 Number of features imported: 0

Annotation: FANTOM_5_all_lncRNA - Feature count 0

./downloaded_data/gencode_v25_lncRNA.gff3.gz

Parse Annotation file

 File is gziped

 Use GFF3 parser to parse annotations

 Successfully imported as a gff3 file

 Remove null values

 Removed 0 invalid lines

 Cast coordinates to integer

Selecting features based on length

 Features before filtering: 126383

 Features after minimal length filtering: 125538

 Features after maximal length filtering: 102876

Selecting features based on reference id

 Features before filtering: 102876

 Features after filtering: 100609

Selecting features based on type

 Features before filtering: 100609

 Features after filtering: 80132

Sorting and final cleanup

 Number of features imported: 80132

Annotation: gencode_v25_lncRNA - Feature count 80132

./downloaded_data/gencode_v25_primary.gtf.gz

Parse Annotation file

 File is gziped

 Use GTF parser to parse annotations

 Successfully imported as a gtf file

 Remove null values

 Removed 0 invalid lines

 Cast coordinates to integer

Selecting features based on length

 Features before filtering: 2580534

 Features after minimal length filtering: 2351911

 Features after maximal length filtering: 2197057

Selecting features based on reference id

 Features before filtering: 2197057

 Features after filtering: 2148670

Selecting features based on type

 Features before filtering: 2148670

 Features after filtering: 1140997

Sorting and final cleanup

 Number of features imported: 1140997

Annotation: gencode_v25_primary - Feature count 1140997

to_pickle and load from pickle


In [8]:
jhelp(Annotation.to_pickle, full=True)


to_pickle (self, fp=None, verbose=False, **kwargs)

Store the parsed file in a pickle file for further use.

  • fp

Path to save the pickle file. By default original annotation file (- .gz/.tgz) + .pkl


In [9]:
file_list = ["./downloaded_data/FANTOM_5_all_lncRNA.bed.gz","./downloaded_data/FANTOM_5_all_lncRNA.gtf.gz", "./downloaded_data/gencode_v25_lncRNA.gff3.gz", "./downloaded_data/gencode_v25_primary.gtf.gz" , "./downloaded_data/gencode_v25_primary.gff3.gz"]
for fp in file_list:
    jprint(fp, bold=True)
    a = Annotation (fp)
    pickle_fp = a.to_pickle(verbose=True)
    a = Annotation(pickle_fp, verbose=True)


./downloaded_data/FANTOM_5_all_lncRNA.bed.gz

Pickle dataframe in file ./downloaded_data/FANTOM_5_all_lncRNA.bed.pkl

Parse Annotation file

 File is not compressed

 Try to load as a pickle file

Sorting and final cleanup

 Number of features imported: 175031

./downloaded_data/FANTOM_5_all_lncRNA.gtf.gz

Pickle dataframe in file ./downloaded_data/FANTOM_5_all_lncRNA.gtf.pkl

Parse Annotation file

 File is not compressed

 Try to load as a pickle file

Sorting and final cleanup

 Number of features imported: 639324

./downloaded_data/gencode_v25_lncRNA.gff3.gz

Pickle dataframe in file ./downloaded_data/gencode_v25_lncRNA.gff3.pkl

Parse Annotation file

 File is not compressed

 Try to load as a pickle file

Sorting and final cleanup

 Number of features imported: 126383

./downloaded_data/gencode_v25_primary.gtf.gz

Pickle dataframe in file ./downloaded_data/gencode_v25_primary.gtf.pkl

Parse Annotation file

 File is not compressed

 Try to load as a pickle file

Sorting and final cleanup

 Number of features imported: 2580534

./downloaded_data/gencode_v25_primary.gff3.gz

Pickle dataframe in file ./downloaded_data/gencode_v25_primary.gff3.pkl

Parse Annotation file

 File is not compressed

 Try to load as a pickle file

Sorting and final cleanup

 Number of features imported: 2577950

Properties


In [10]:
a = Annotation ("./downloaded_data/FANTOM_5_all_lncRNA.bed.pkl", refid_list = ["chr21", "chrX", "chrY", "chrM"])

jprint ("Feature count:", a.feature_count)
jprint ("Refid count:", a.refid_count)
jprint ("Type count:", a.type_count)
jprint("Refid list:", a.refid_list)
jprint("Type list:", a.type_list)
display(a.refid_count_uniq.head())
display(a.type_count_uniq.head())


Feature count: 8084

Refid count: 3

Type count: 1

Refid list: {'chrY', 'chr21', 'chrX'}

Type list: {'.'}

count
refid
chrX 4943
chr21 2765
chrY 376
count
type
. 8084

In [13]:
a = Annotation ("./downloaded_data/gencode_v25_primary.gff3.pkl", ref_list=["chr21", "chrX", "chrY", "chrM"], type_list=["exon", "transcript", "CDS"] )

jprint ("Feature count:", a.feature_count)
jprint ("Refid count:", a.refid_count)
jprint ("Type count:", a.type_count)
jprint("Refid list:", a.refid_list)
jprint("Type list:", a.type_list)
display(a.refid_count_uniq.head())
display(a.type_count_uniq.head())


Feature count: 2086243

Refid count: 47

Type count: 3

Refid list: {'chrY', 'KI270726.1', 'GL000218.1', 'chr19', 'GL000216.2', 'chr6', 'chr7', 'KI270744.1', 'GL000009.2', 'KI270721.1', 'KI270713.1', 'chr12', 'chr18', 'chr16', 'chr10', 'GL000213.1', 'GL000194.1', 'chr3', 'GL000225.1', 'KI270750.1', 'GL000205.2', 'KI270733.1', 'chr13', 'chrM', 'KI270711.1', 'chr17', 'KI270727.1', 'chr2', 'KI270731.1', 'chr9', 'KI270734.1', 'chr21', 'chrX', 'KI270728.1', 'GL000220.1', 'chr5', 'KI270442.1', 'chr15', 'chr20', 'chr4', 'GL000219.1', 'chr1', 'chr22', 'chr11', 'chr8', 'chr14', 'GL000195.1'}

Type list: {'CDS', 'exon', 'transcript'}

count
refid
chr1 190737
chr2 159035
chr17 132275
chr3 129701
chr19 128072
count
type
exon 1183020
CDS 705063
transcript 198160

interval_features


In [7]:
jhelp(Annotation.interval_features, full=True)


interval_features (self, refid, start, end, feature_types=None, max_features_per_type=None, verbose=False, **kwargs)

Parse the annotation file for the given refid and interval and return a dataframe containing all the features

found for each original line. Features are identified by their ID field for gff3 files, by the entire

attribute field for the bed files and by the first element in the attribute field for the gtf files

  • refid

Name of the sequence from the original fasta file to display

  • start

Start of the window to display. The coordinate is not verified, if outside of the range it will

return an empty dataframe

  • end

End of the window to display. The coordinate is not verified, if outside of the range it will

return an empty dataframe

  • feature_types

Name of a valid feature type ( "exon"|"transcript"|"gene"|"CDS"...) or list of names of feature type for

which a row will be returned. The option is not available for bed files. If not given, all features type

found in the interval will be returned [ DEFAULT: None ]

  • max_features_per_type

Maximal total number of features for a particular feature type. If more are found, a random sampling will

be performed. If None, all the features will be returned [ DEFAULT: None ]


In [8]:
a = Annotation ("./downloaded_data/FANTOM_5_all_lncRNA.bed.pkl")

display(a.interval_features("chrX", start=70723, end=1456774).head())
display(a.interval_features("chrX", start=71011, end=200000, feature_types=["exon"]).head()) ## Xempty
display(a.interval_features("chr1", start=71011, end=200000).head())


refid start end ID score strand type
0 chrX 70225 70724 CATG00000110193.1|HBMT00001528801.1 1 + .
1 chrX 70225 71012 CATG00000110193.1|ENCT00000463832.1 1 + .
2 chrX 70225 71610 CATG00000110193.1|ENCT00000463833.1 1 + .
3 chrX 184125 192877 CATG00000112915.1|ENCT00000473979.1 1 - .
4 chrX 192544 192877 CATG00000112915.1|FTMT29000000005.1 1 - .
refid start end strand ID type
refid start end ID score strand type
0 chr1 91420 762886 ENSG00000225880.4|FTMT20100027365.1 1 - .
1 chr1 91420 762886 ENSG00000240453.1|FTMT20100027364.1 1 - .

In [9]:
a = Annotation ("./downloaded_data/FANTOM_5_all_lncRNA.gtf.pkl")

display(a.interval_features("chr2", start=100000, end=200000, feature_types="exon").head())
display(a.interval_features("chr2", start=100000, end=200000, feature_types=["exon", "transcript"]).head())
display(a.interval_features("chr2", start=0, end=200000000, max_features_per_type=2))
display(a.interval_features("chr2", start=0, end=10000, feature_types=["exon", "transcript"]).head()) ## Xempty
display(a.interval_features("chrX", start=71011, end=200000, feature_types="CDS").head()) ## Xempty
display(a.interval_features("chrINVALID", start=71011, end=200000, feature_types=["exon"]).head()) ## Xempty
display(a.interval_features("chrX", start=200000, end=10000).head()) ## Xempty


refid start end ID score strand type
0 chr2 101012 101164 CATG00000041546.1 . + exon
1 chr2 130910 131453 CATG00000041547.1 . + exon
2 chr2 159154 160407 CATG00000041548.1 . + exon
3 chr2 162495 162673 CATG00000041548.1 . + exon
4 chr2 167584 170667 CATG00000041548.1 . + exon
refid start end ID score strand type
0 chr2 80646 101164 CATG00000041546.1 . + transcript
1 chr2 101012 101164 CATG00000041546.1 . + exon
2 chr2 130910 131453 CATG00000041547.1 . + exon
3 chr2 130910 131453 CATG00000041547.1 . + transcript
4 chr2 159154 160407 CATG00000041548.1 . + exon
refid start end ID score strand type
0 chr2 64984044 64984697 ENSG00000234572.1 . - exon
1 chr2 110379029 110379291 CATG00000044127.1 . + exon
2 chr2 112186889 112252628 ENSG00000172965.10 . - transcript
3 chr2 182304818 182305863 CATG00000045729.1 . + transcript
4 chr2 194578338 194586977 CATG00000046066.1 . + gene
5 chr2 199608699 199617221 CATG00000046168.1 . + gene
refid start end strand ID type
refid start end strand ID type
refid start end strand ID type
refid start end strand ID type

Alignment

Instantiation


In [3]:
jhelp(Alignment.__init__, full=True)


__init__ (self, fp, name=None, min_coverage=5, refid_list=[], output_bed=False, verbose=False, **kwargs)
  • fp

A standard BAM or SAM (http://samtools.sourceforge.net/SAM1.pdf) containing aligned reads and a standard

header. The files do not need to be sorted or indexed.

One can also use a 6 fields bed (chrom, chromStart, chromEnd, name, score, strand, where score is the

coverage value (Much faster than from a Bam/Sam file, can be gzipped). http://www.ensembl.org/info/website/upload/bed.html

  • name

Name of the data file that will be used as track name for plotting. If not given, will be deduced from fp

file name [ DEFAULT: None ]

  • min_coverage

Minimal coverage to compute the data. If less, the coverage will be considered null. Not used for

if fp is a bed coverage file [ DEFAULT: 5 ]

  • refid_list

list of reference sequence id to select from the data file, by default all, Not used for if fp is a bed

coverage file [ DEFAULT: [] ]

  • output_bed

If True will be write a 6 columns compressed bed file containing the coverage values for + and - strand

excluding positions with coverage lesser than min_coverage.the option will apply only is the input file is

BAM or SAM. [ DEFAULT: False ]

Test instantiation from BAM, SAM and BED files


In [8]:
file_list = ["./downloaded_data/1M.bam", "./downloaded_data/1M.sam","./downloaded_data/100k.sam", "./downloaded_data/1M.bed.gz", "./downloaded_data/100k.bed.gz"]
l = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM']

for fp in file_list:
    jprint(fp, bold=True)
    %time jprint(Alignment (fp, refid_list=l, min_coverage=5, output_bed=True, verbose=True))


./downloaded_data/1M.bam

Compute coverage from bam/sam file ./downloaded_data/1M.bam

Write coverage data in file ./downloaded_data/1M.bed.gz

 Total base coverage 24786158 in 25 reference sequences

Alignment instance
 chr1 nbases:1672759
 chr14 nbases:969619
 chr2 nbases:1684153
 chr16 nbases:1905311
 chr21 nbases:4551459
 chr9 nbases:534310
 chr3 nbases:1656105
 chr18 nbases:39978
 chr15 nbases:299921
 chr20 nbases:1115702
 chr7 nbases:372747
 chr8 nbases:373550
 chr22 nbases:332211
 chr10 nbases:354604
 chr11 nbases:963085
 chr5 nbases:253869
 chr13 nbases:91130
 chr17 nbases:1317502
 chr19 nbases:1027703
 chrX nbases:717252
 chr12 nbases:888997
 chr4 nbases:491060
 chr6 nbases:1265690
 chrM nbases:1849737
 chrY nbases:57704
 ext bam
 fp ./downloaded_data/1M.bam
 name 1M
 nbases 24786158
 outfp ./downloaded_data/1M.bed.gz

CPU times: user 1min 11s, sys: 780 ms, total: 1min 11s
Wall time: 1min 11s

./downloaded_data/1M.sam

Compute coverage from bam/sam file ./downloaded_data/1M.sam

Write coverage data in file ./downloaded_data/1M.bed.gz

 Total base coverage 24786158 in 25 reference sequences

Alignment instance
 chr1 nbases:1672759
 chr14 nbases:969619
 chr2 nbases:1684153
 chr16 nbases:1905311
 chr21 nbases:4551459
 chr9 nbases:534310
 chr3 nbases:1656105
 chr18 nbases:39978
 chr15 nbases:299921
 chr20 nbases:1115702
 chr7 nbases:372747
 chr8 nbases:373550
 chr22 nbases:332211
 chr10 nbases:354604
 chr11 nbases:963085
 chr5 nbases:253869
 chr13 nbases:91130
 chr17 nbases:1317502
 chr19 nbases:1027703
 chrX nbases:717252
 chr12 nbases:888997
 chr4 nbases:491060
 chr6 nbases:1265690
 chrM nbases:1849737
 chrY nbases:57704
 ext sam
 fp ./downloaded_data/1M.sam
 name 1M
 nbases 24786158
 outfp ./downloaded_data/1M.bed.gz

CPU times: user 1min 9s, sys: 620 ms, total: 1min 9s
Wall time: 1min 9s

./downloaded_data/100k.sam

Compute coverage from bam/sam file ./downloaded_data/100k.sam

Write coverage data in file ./downloaded_data/100k.bed.gz

 Total base coverage 989837 in 25 reference sequences

Alignment instance
 chr1 nbases:75763
 chr14 nbases:32674
 chr2 nbases:51270
 chr16 nbases:27312
 chr21 nbases:351150
 chr9 nbases:7747
 chr3 nbases:63794
 chr18 nbases:60
 chr15 nbases:1703
 chr20 nbases:26905
 chr7 nbases:5132
 chr8 nbases:4186
 chr22 nbases:2625
 chr10 nbases:7164
 chr11 nbases:27229
 chr5 nbases:1525
 chr13 nbases:931
 chr17 nbases:13445
 chr19 nbases:23847
 chrX nbases:4491
 chr12 nbases:36391
 chr4 nbases:20578
 chr6 nbases:44164
 chrM nbases:159751
 chrY nbases:0
 ext sam
 fp ./downloaded_data/100k.sam
 name 100k
 nbases 989837
 outfp ./downloaded_data/100k.bed.gz

CPU times: user 5.69 s, sys: 72 ms, total: 5.76 s
Wall time: 5.76 s

./downloaded_data/1M.bed.gz

Extract coverage from bed file ./downloaded_data/1M.bed.gz

 Total base coverage 49572316 in 25 reference sequences

Alignment instance
 chr1 nbases:3345518
 chr14 nbases:1939238
 chr2 nbases:3368306
 chr16 nbases:3810622
 chr21 nbases:9102918
 chr9 nbases:1068620
 chr3 nbases:3312210
 chr18 nbases:79956
 chr15 nbases:599842
 chr20 nbases:2231404
 chr7 nbases:745494
 chr8 nbases:747100
 chr22 nbases:664422
 chr10 nbases:709208
 chr11 nbases:1926170
 chr5 nbases:507738
 chr13 nbases:182260
 chr17 nbases:2635004
 chr19 nbases:2055406
 chrX nbases:1434504
 chr12 nbases:1777994
 chr4 nbases:982120
 chr6 nbases:2531380
 chrM nbases:3699474
 chrY nbases:115408
 ext bed
 fp ./downloaded_data/1M.bed.gz
 name 1M
 nbases 49572316

CPU times: user 4.92 s, sys: 24 ms, total: 4.94 s
Wall time: 4.94 s

./downloaded_data/100k.bed.gz

Extract coverage from bed file ./downloaded_data/100k.bed.gz

 Total base coverage 1979674 in 24 reference sequences

Alignment instance
 chr1 nbases:151526
 chr14 nbases:65348
 chr2 nbases:102540
 chr16 nbases:54624
 chr21 nbases:702300
 chr9 nbases:15494
 chr3 nbases:127588
 chr18 nbases:120
 chr15 nbases:3406
 chr20 nbases:53810
 chr7 nbases:10264
 chr8 nbases:8372
 chr22 nbases:5250
 chr10 nbases:14328
 chr11 nbases:54458
 chr5 nbases:3050
 chr13 nbases:1862
 chr17 nbases:26890
 chr19 nbases:47694
 chrX nbases:8982
 chr12 nbases:72782
 chr4 nbases:41156
 chr6 nbases:88328
 chrM nbases:319502
 ext bed
 fp ./downloaded_data/100k.bed.gz
 name 100k
 nbases 1979674

CPU times: user 292 ms, sys: 0 ns, total: 292 ms
Wall time: 291 ms

Suposed to fail if no header in the sam/bam file


In [9]:
a = Alignment (fp="./downloaded_data/1M_no_header.sam", verbose=True)


Compute coverage from bam/sam file ./downloaded_data/1M_no_header.sam

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-9-72c52bdcf58c> in <module>()
----> 1 a = Alignment (fp="./downloaded_data/1M_no_header.sam", verbose=True)

~/Programming/Python3/JupyterGenoViewer/JGV/JGV_Alignment.py in __init__(self, fp, name, min_coverage, refid_list, output_bed, verbose, **kwargs)
     51         if self.ext in ["bam","sam"]:
     52             if verbose: jprint ("Compute coverage from bam/sam file ", self.fp)
---> 53             self.d = self._bam_parser(fp, min_coverage, refid_list)
     54             if output_bed:
     55                 outfp = "{}/{}.bed.gz".format(dir_path(fp), file_basename(fp))

~/Programming/Python3/JupyterGenoViewer/JGV/JGV_Alignment.py in _bam_parser(self, fp, min_coverage, refid_list, verbose, **kwargs)
    110         """
    111         d = OrderedDict()
--> 112         with pysam.AlignmentFile(fp) as bam:
    113             # Compute the genomic coverage for each reads
    114             if verbose: jprint ("\tTally coverage for each base")

pysam/libcalignmentfile.pyx in pysam.libcalignmentfile.AlignmentFile.__cinit__ (pysam/libcalignmentfile.c:5835)()

pysam/libcalignmentfile.pyx in pysam.libcalignmentfile.AlignmentFile._open (pysam/libcalignmentfile.c:8072)()

ValueError: file has no sequences defined (mode='r') - is it SAM/BAM format? Consider opening with check_sq=False

Properties


In [11]:
a = Alignment (fp="./downloaded_data/1M.bed.gz", verbose=True)
jprint ("Number of refid: ", a.refid_count)
jprint ("List of refid:\n", a.refid_list)
jprint ("Base coverage of the reference sequence:\n", a.refid_nbases)


Extract coverage from bed file ./downloaded_data/1M.bed.gz

 Total base coverage 49572316 in 25 reference sequences

Number of refid: 25

List of refid:
['chr1', 'chr14', 'chr2', 'chr16', 'chr21', 'chr9', 'chr3', 'chr18', 'chr15', 'chr20', 'chr7', 'chr8', 'chr22', 'chr10', 'chr11', 'chr5', 'chr13', 'chr17', 'chr19', 'chrX', 'chr12', 'chr4', 'chr6', 'chrM', 'chrY']

Base coverage of the reference sequence:
chr21 9102918
chr16 3810622
chrM 3699474
chr2 3368306
chr1 3345518
chr3 3312210
chr17 2635004
chr6 2531380
chr20 2231404
chr19 2055406
chr14 1939238
chr11 1926170
chr12 1777994
chrX 1434504
chr9 1068620
chr4 982120
chr8 747100
chr7 745494
chr10 709208
chr22 664422
chr15 599842
chr5 507738
chr13 182260
chrY 115408
chr18 79956
Name: nbases, dtype: int64


In [13]:
l = ['chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM']
a = Alignment (fp="./downloaded_data/1M.bed.gz", verbose=True, refid_list=l)
jprint ("Number of refid: ", a.refid_count)
jprint ("List of refid:\n", a.refid_list)
jprint ("Base coverage of the reference sequence:\n", a.refid_nbases)


Extract coverage from bed file ./downloaded_data/1M.bed.gz

 Total base coverage 25829118 in 10 reference sequences

Number of refid: 10

List of refid:
['chr16', 'chr21', 'chr18', 'chr20', 'chr22', 'chr17', 'chr19', 'chrX', 'chrM', 'chrY']

Base coverage of the reference sequence:
chr21 9102918
chr16 3810622
chrM 3699474
chr17 2635004
chr20 2231404
chr19 2055406
chrX 1434504
chr22 664422
chrY 115408
chr18 79956
Name: nbases, dtype: int64

interval_coverage


In [14]:
jhelp(Alignment.interval_coverage, full=True)


interval_coverage (self, refid, start, end, bins=500, bin_repr_fun='max', verbose=False, **kwargs)

Parse the alignment file for a given refid and interval. The interval is splited in a number of windows equal to

bins, for which the coverage in computed. The method return a dataframe containing the starting positions of

the windows and the coverage for the + and - strands. If the refid or the coordinates are invalid a zero filled

dataframe will be returned.

  • refid

Name of the sequence from the original fasta file to display

  • start

Start of the window to display. The coordinate is not verified, if outside of the range it will

return empty bins

  • end

End of the window to display. The coordinate is not verified, if outside of the range it will

return empty bins

  • bins

Number of alignment count bins to divide the displayed window. Low number will result in low resolution

high value could result in a long ploting time. The value is automatically adjusted if lower than base

resolution, ie if the requested interval is lower than the number of bins [ DEFAULT: 500 ]

  • bin_repr_fun

Function to represent each bin ("max", "mean" and "sum") [ DEFAULT: "max" ]


In [15]:
a = Alignment (fp="./downloaded_data/1M.bed.gz", verbose=True)


Extract coverage from bed file ./downloaded_data/1M.bed.gz

 Total base coverage 49572316 in 25 reference sequences


In [16]:
df = a.interval_coverage(refid="chrM", start=500, end=10000, bins=200, bin_repr_fun='max')
display(df.head(5))
df.plot.area(figsize=(30, 4), color=("dodgerblue", "lightskyblue"))


+ -
500 0 0
547 0 0
595 0 0
642 165 0
690 228 8
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f724b80b278>

In [17]:
df = a.interval_coverage(refid="chr1", start=0, end=300000000, bins=500, bin_repr_fun='mean')
display(df.head(5))
df.plot.area(figsize=(30, 4), color=("dodgerblue", "lightskyblue"))


+ -
0 0.03275 0.0320333
600000 0.194345 0.196758
1200000 0.167758 0.156655
1800000 0.000535 0.000838333
2400000 0.00483333 0.00194667
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f724ca9b7b8>

In [18]:
df = a.interval_coverage(refid="chr47", start=0, end=300000000)
display(df.head(5))
df.plot.area(figsize=(30, 4), color=("dodgerblue", "lightskyblue"))


+ -
0 0 0
600000 0 0
1200000 0 0
1800000 0 0
2400000 0 0
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f72437bccf8>

In [19]:
df = a.interval_coverage(refid="chr7", start=1997, end=97495, bins=100, bin_repr_fun='max')
display(df.head(5))
df.plot.area(figsize=(30, 4))


+ -
1997 0 0
2951 0 0
3906 0 0
4861 0 0
5816 0 0
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f724c2093c8>

JGV

Test Instantiation


In [20]:
jhelp(JGV.__init__, full= True)


__init__ (self, fp, name=None, refid_list=[], output_index=False, verbose=False, **kwargs)
  • fp

A fasta file containing the reference sequences OR an tab separated index file containing at least 2 columns

with the refid and the length in bases (like a .fa.fai file generated by samtools faidx).

The fasta option will take more time as the file has to be parsed to get the refid and length of sequences.

A 2 column index tsv file will be automatically generated for latter usage as an index file.

Both fasta and infex file can be gziped

  • name

Name of the data file that will be used as track name for plotting. If not given, will be deduced from fp

file name

  • refid_list

list of reference sequence id to select from the data file, by default all [ DEFAULT: [] ]

  • output_index

If True will write a simple A 2 column index tsv file containing the Reference sequence ids and their

lengths [ DEFAULT: False ]


In [21]:
# Init with a fasta file
fp = "./downloaded_data/GRCh38_primary.fa.gz"
l = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']
j = JGV(fp=fp, refid_list=l, output_index=True, verbose=True)


Add reference genome file

Parsing fasta file

Write a fasta index file: ./downloaded_data/GRCh38_primary.tsv

 Found 25 reference sequences


In [22]:
# Init with a fasta index file
fp = "./downloaded_data/GRCh38_primary.tsv"
j = JGV(fp=fp, verbose=True)


Add reference genome file

Assume the file is a fasta index

 Found 25 reference sequences

add_annotation


In [13]:
jhelp(JGV.add_annotation, full=True)


add_annotation (self, fp, name=None, min_len=None, max_len=None, refid_list=None, type_list=None, verbose=False, **kwargs)
  • fp

An URL to a standard genomic file containing features annotations among the following format:

Valid URL schemes include http, ftp, s3, and file.

The file can eventually be compressed in ‘gzip’, ‘bz2’, ‘zip’ or ‘xz’

  • name

Name of the data file that will be used as track name for plotting. If not given, will be deduced from fp

file name [ DEFAULT: None ]

  • min_len

Minimal size (start to end) of a feature to be selected [default None]

  • max_len

Maximal size (start to end) of a feature to be selected [default None]

  • refid_list

List of reference id to select. Example: ["chr1", "chr2", "chr3"] [default None]

  • type_list

List of feature type to select. Example: ["exon", "gene"] [default None]


In [23]:
j.add_annotation("./downloaded_data/gencode_v25_primary.gff3.pkl", refid_list= ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10'], type_list=["exon", "transcript", "gene"], verbose=True)


Add annotation file

Parse Annotation file

 File is not compressed

 Try to load as a pickle file

Selecting features based on reference id

 Features before filtering: 2577950

 Features after filtering: 1311600

Selecting features based on type

 Features before filtering: 1311600

 Features after filtering: 737728

Sorting and final cleanup

 Number of features imported: 737728

/home/aleg/Programming/Python3/JupyterGenoViewer/JGV/JGV.py:130: UserWarning: No annotation found for chr19,chr17,chr14,chr16,chr15,chr18,chr22,chrY,chr12,chr11,chr21,chr13,chrM,chr20,chrX
  warnings.warn("No annotation found for {}".format(",".join(not_found)))

In [24]:
j.add_annotation("./downloaded_data/FANTOM_5_all_lncRNA.gtf.pkl", min_len=50, max_len=5000, verbose=True)


Add annotation file

Parse Annotation file

 File is not compressed

 Try to load as a pickle file

Selecting features based on length

 Features before filtering: 639324

 Features after minimal length filtering: 620680

 Features after maximal length filtering: 504267

Sorting and final cleanup

 Number of features imported: 504267

/home/aleg/Programming/Python3/JupyterGenoViewer/JGV/JGV.py:130: UserWarning: No annotation found for chrM
  warnings.warn("No annotation found for {}".format(",".join(not_found)))

add_alignment


In [25]:
jhelp(JGV.add_alignment, full=True)


add_alignment (self, fp, name=None, min_coverage=5, refid_list=[], output_bed=False, verbose=False, **kwargs)
  • fp

A standard BAM or SAM (http://samtools.sourceforge.net/SAM1.pdf) containing aligned reads and a standard

header. The files do not need to be sorted or indexed.

One can also use a 6 fields bed (chrom, chromStart, chromEnd, name, score, strand) file with a hastaged

commented header listing the reference sequences id and length, similar to the format generated by the

output_bed option (Much faster than from a Bam/Sam file, can be gzipped). http://www.ensembl.org/info/website/upload/bed.html

  • name

Name of the data file that will be used as track name for plotting. If not given, will be deduced from fp

file name [ DEFAULT: None ]

  • min_coverage

Minimal coverage to compute the data. If less, the coverage will be considered null. Not used for

if fp is a bed coverage file [ DEFAULT: 5 ]

  • output_bed

If True will be write a 6 columns compressed bed file containing the coverage values for + and - strand

excluding positions with coverage lesser than min_coverage.the option will apply only is the input file is

BAM or SAM. The file starts with a header consisting of a list of the ID of the reference sequences and

their length [ DEFAULT: False ]. Example:

chr20 64444167

chr21 46709983

chr20 276516 276516 pos1 5 +

chr20 276517 276517 pos2 5 +


In [26]:
j.add_alignment("./downloaded_data/1M.bed.gz", min_coverage=10, verbose=True)


Add alignment file

Extract coverage from bed file ./downloaded_data/1M.bed.gz

 Total base coverage 42508895 in 25 reference sequences


In [27]:
j.add_alignment("./downloaded_data/100k.sam", refid_list= ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10'], verbose=True)


Add alignment file

Compute coverage from bam/sam file ./downloaded_data/100k.sam

 Total base coverage 281323 in 10 reference sequences

/home/aleg/Programming/Python3/JupyterGenoViewer/JGV/JGV.py:164: UserWarning: No coverage found for chr19,chr17,chr14,chr16,chr15,chr18,chr22,chrY,chr12,chr11,chr21,chr13,chrM,chr20,chrX
  warnings.warn("No coverage found for {}".format(",".join(not_found)))

Annotation_summary


In [28]:
jhelp(JGV.alignment_summary, full=True)


alignment_summary (self, verbose=False, **kwargs)

Display table summarizing annotation file information

without and with annotation track


In [29]:
# Empty JGV object
j = JGV("./downloaded_data/GRCh38_primary.tsv", ref_list=l)
j.annotation_summary()


/home/aleg/Programming/Python3/JupyterGenoViewer/JGV/JGV.py:171: UserWarning: No annotation track loaded
  warnings.warn("No annotation track loaded")

In [30]:
# JGV object with 3 annotations
l = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10']
j = JGV("./downloaded_data/GRCh38_primary.tsv", ref_list=l)
j.add_annotation("./downloaded_data/FANTOM_5_all_lncRNA.gtf.pkl", refid_list=l)
j.add_annotation("./downloaded_data/gencode_v25_lncRNA.gff3.pkl", refid_list=l)
j.add_annotation("./downloaded_data/gencode_v25_primary.gff3.pkl", refid_list=l)
j.annotation_summary()


Counts per Annotation file

Feature count Refid count Feature type count
FANTOM_5_all_lncRNA 377830 10 3
gencode_v25_lncRNA 70372 10 3
gencode_v25_primary 1311600 10 9

Counts per Reference sequence

FANTOM_5_all_lncRNA gencode_v25_lncRNA gencode_v25_primary
refid
chr1 53694 10896 231881
chr10 28426 5201 90525
chr2 55904 13051 192231
chr3 39353 8302 159916
chr4 33650 5568 99668
chr5 36594 6628 114550
chr6 40392 5487 115882
chr7 33329 5589 122040
chr8 32179 5530 94109
chr9 24309 4120 90798

Counts per feature types

FANTOM_5_all_lncRNA gencode_v25_lncRNA gencode_v25_primary
type
CDS NaN NaN 363100
exon 236297.0 47411.0 608187
five_prime_UTR NaN NaN 68824
gene 37652.0 7862.0 30077
start_codon NaN NaN 40226
stop_codon NaN NaN 36431
stop_codon_redefined_as_selenocysteine NaN NaN 61
three_prime_UTR NaN NaN 65230
transcript 103881.0 15099.0 99464

alignment_summary


In [31]:
jhelp(JGV.alignment_summary, full=True)


alignment_summary (self, verbose=False, **kwargs)

Display table summarizing annotation file information

without and with alignment track


In [32]:
# Empty JGV object
j = JGV("./downloaded_data/GRCh38_primary.tsv", ref_list=l)
j.alignment_summary()


/home/aleg/Programming/Python3/JupyterGenoViewer/JGV/JGV.py:199: UserWarning: No alignment track loaded
  warnings.warn("No alignment track loaded")

In [33]:
# JGV object with 2 alignment tracks
l = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10']
j = JGV("./downloaded_data/GRCh38_primary.tsv", refid_list=l)
j.add_alignment("./downloaded_data/1M.bed.gz", refid_list=l)
j.add_alignment("./downloaded_data/100k.bed.gz", refid_list=l)
j.alignment_summary()


Counts per Alignment file

Refid count Base coverage
1M 10 17317694
100k 10 562646

Counts per Reference sequence

1M 100k
chr1 3345518 151526
chr10 709208 14328
chr2 3368306 102540
chr3 3312210 127588
chr4 982120 41156
chr5 507738 3050
chr6 2531380 88328
chr7 745494 10264
chr8 747100 8372
chr9 1068620 15494

refid_coverage_plot


In [34]:
jhelp(JGV.refid_coverage_plot, full=True)


refid_coverage_plot (self, norm_depth=True, norm_len=True, plot_style='ggplot', figwidth=20, figheight=5, log=False, refid_list=[], verbose=False, **kwargs)
  • norm_len

If True, for each refid, the base counts are normalised by the length of refid in bases

  • norm_depth [ DEFAULT: True ]

If True, for each track, the base counts are normalised by the overall number of bases mapped

  • plot_style [ DEFAULT: True ]

Default plot style for pyplot ('grayscale'|'bmh'|'ggplot'|'dark_background'|'classic'|'fivethirtyeight'...)

[ DEFAULT: "ggplot" ]

  • figwidth

Width of the ploting area in inches [ DEFAULT: 20 ]

  • figheight

height of the ploting area in inches [ DEFAULT: 5 ]

  • log

if True the yscale will be log10 else it will be linear [ DEFAULT: True ]

  • refid_list

list of reference sequence id to display, by default all. The list is also used to reorder the reference in

the dataframe and in the graph [ DEFAULT: [] ]

  • kwargs

Additional parameters for plot appearance derived from pylab basic plot arguments such as: color, alpha,

fontsize...


In [36]:
# JGV object with 2 annotations and 2 alignment tracks
l = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']
j = JGV("./downloaded_data/GRCh38_primary.tsv", refid_list=l)
j.add_annotation("./downloaded_data/FANTOM_5_all_lncRNA.gtf.pkl", refid_list=l)
j.add_annotation("./downloaded_data/gencode_v25_lncRNA.gff3.pkl", refid_list=l)
j.add_alignment("./downloaded_data/1M.bed.gz", refid_list=l)
j.add_alignment("./downloaded_data/100k.bed.gz", refid_list=l)

In [42]:
df = j.refid_coverage_plot(norm_depth=True, norm_len=False)
display(df)


1M 100k
chr1 67.487628 76.540885
chr10 14.306534 7.237555
chr11 38.855760 27.508570
chr12 35.866672 36.764639
chr13 3.676649 0.940559
chr14 39.119375 33.009475
chr15 12.100342 1.720485
chr16 76.869961 27.592422
chr17 53.154749 13.583044
chr18 1.612916 0.060616
chr19 41.462779 24.091845
chr2 67.947320 51.796407
chr20 45.013108 27.181243
chr21 183.629064 354.755379
chr22 13.403086 2.651952
chr3 66.815720 64.448995
chr4 19.811864 20.789281
chr5 10.242370 1.540658
chr6 51.064388 44.617447
chr7 15.038515 5.184692
chr8 15.070912 4.228979
chr9 21.556790 7.826541
chrM 74.627823 161.391219
chrX 28.937603 4.537111
chrY 2.328074 NaN

In [44]:
r = j.refid_coverage_plot(
    norm_depth = True,
    norm_len =  True,
    plot_style="ggplot",
    figwidth = 30,
    figheight = 10, 
    log = True,
    refid_list = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20'],
    color=("dodgerblue", "green"),
    alpha=0.5,
    fontsize=16)


Interval_plot


In [45]:
jhelp(JGV.interval_plot, full=True)


interval_plot (self, refid, start=None, end=None, plot_style='ggplot', figwidth=30, alignment_track_height=5, annotation_track_height=2, alignment_bins=500, alignment_bin_repr_fun='max', alignment_log=True, alignment_color=('dodgerblue', 'darkorange'), alignment_alpha=0.5, feature_types=[], max_features_per_type=500, annotation_offset=None, annotation_label=False, max_label_size=50, annotation_color='grey', verbose=False, **kwargs)
  • refid

Name of the sequence from the original fasta file to display

  • start

Start of the window to display. If not given, will be set to 0 [ DEFAULT: None ]

  • end

End of the window to display. If not given, will be set to the length of refid [ DEFAULT: None ]

  • plot_style [ DEFAULT: True ]

Default plot style for pyplot ('grayscale'|'bmh'|'ggplot'|'dark_background'|'classic'|'fivethirtyeight'...)

[ DEFAULT: "ggplot" ]

  • figwidth

Width of the ploting area in inches [ DEFAULT: 20 ]

  • alignment_track_height

Height of individual aligment tracks [DEFAULT : 5 ]

  • annotation_track_height

Height of individual annotation tracks for each feature types [DEFAULT : 2 ]

  • alignment_bins

Number of alignment count bins to divide the displayed window. Low number will result in low resolution

high value could result in a long ploting time. The value is automatically adjusted if lower than base

resolution, ie if the requested interval is lower than the number of bins [ DEFAULT: 500 ]

  • alignment_bin_repr_fun

Function to represent each bin ("max", "mean" and "sum") [ DEFAULT: "max" ]

  • alignment_log

if True the yscale will be log10 else it will be linear [ DEFAULT: True ]

  • alignment_color

Tuple of 2 color for the alignment + and - tracks [DEFAULT : ("dodgerblue", "darkorange") ]

  • alignment_alpha

Transparency of the alignment coverage area between 0 and 1 [ DEFAULT: 0.5 ]

  • feature_types

Name of a valid feature type ( "exon"|"transcript"|"gene"|"CDS"...) or list of names of feature type for

which a row will be returned. The option is not available for bed files. If not given, all features type

found in the interval will be returned [ DEFAULT: None ]

  • max_features_per_type

Maximal total number of features for a particular feature type. If more are found, a random sampling will

be performed. If None, all the features will be returned [ DEFAULT: 500 ]

  • annotation_offset

Minimal distance between 2 contigous annotation features on the same level. If not given, will be

automatically set to 1/400 of the windows to display [DEFAULT : None ]

  • annotation_label

If True, labels of features will be plotted. To be avoid when expecting many features [DEFAULT : False ]

  • max_label_size

limit the size of the label text for each feature [DEFAULT : "50" ]

  • annotation_color

Color of the annotation arrows [DEFAULT : "grey" ]

  • kwargs

In [54]:
# Create an instances of IGV
l = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']
j = JGV(fp = "./downloaded_data/GRCh38_primary.tsv", ref_list=l)

# Add annotation track only
j.add_annotation("./downloaded_data/FANTOM_5_all_lncRNA.gtf.pkl", type_list="exon", ref_list=l)
j.add_annotation("./downloaded_data/gencode_v25_lncRNA.gff3.pkl", ref_list=l)
j.add_annotation("./downloaded_data/gencode_v25_lncRNA.gff3.pkl", name="gencode_lnc_gene_min_2000" ,min_len=2000, ref_list=l, type_list="gene")

In [55]:
j.interval_plot(refid="chrM") #Xempty



In [56]:
j.interval_plot(refid="chrY")



In [59]:
# Create an instances of IGV
l = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']
j = JGV(fp = "./downloaded_data/GRCh38_primary.tsv", ref_list=l)

# 3 alignment tracks only
j.add_alignment("./downloaded_data/1M.bed.gz", ref_list=l)
j.add_alignment("./downloaded_data/1M.bed.gz", name="1M_min_cov=50",ref_list=l, min_coverage=50)
j.add_alignment("./downloaded_data/100k.bed.gz", ref_list=l)

In [60]:
j.interval_plot(refid="chrM")



In [61]:
j.interval_plot(refid="chrY")



In [62]:
# Create an instances of IGV
l = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15','chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY','chrM']
j = JGV(fp = "./downloaded_data/GRCh38_primary.tsv", ref_list=l)

# 1 alignment track
j.add_alignment("./downloaded_data/1M.bed.gz", ref_list=l)

# 1 annotation track
j.add_annotation("./downloaded_data/gencode_v25_lncRNA.gff3.pkl", ref_list=l)

In [64]:
j.interval_plot(refid="chrM")



In [65]:
j.interval_plot(refid="chrY")



In [66]:
# Plot a specific windows and testing some of the options
j.interval_plot(refid="chr21", start=45400000, end=45600000, alignment_log= True, annotation_label=True, max_features_per_type=100, feature_types=["gene", "transcript"], max_label_size=15)



In [67]:
j.interval_plot(refid="chr12", start=56000000, end=58000000, alignment_log= False, annotation_track_height=1)



In [68]:
# Testing color customization
j.interval_plot(refid="chr19",  alignment_color=("red", "violet"), alignment_alpha=0.75, annotation_color="green")


Test with a large annotation file


In [69]:
# Create fast instances of IGV
fp = "./downloaded_data/GRCh38_primary.tsv"
l = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM']

# JGV object with 2 annotations and 2 alignment tracks
j = JGV(fp=fp, ref_list=l)
j.add_annotation("./downloaded_data/gencode_v25_primary.gff3.pkl")
j.add_alignment("./downloaded_data/1M.bed.gz")

In [71]:
# Plot 
%time j.interval_plot(refid="chr1")


CPU times: user 6.43 s, sys: 1.68 s, total: 8.1 s
Wall time: 6.05 s