biocLite()
### try http:// if https:// URLs are not supported
source("https://bioconductor.org/biocLite.R")
biocLite("BSgenome.Hsapiens.UCSC.hg19")
In [1]:
suppressWarnings(suppressMessages(library(gUtils)))
In [2]:
library(BSgenome.Hsapiens.UCSC.hg19)
Sys.setenv(DEFAULT_BSGENOME = 'BSgenome.Hsapiens.UCSC.hg19::Hsapiens')
In [3]:
## instantiate an example GRanges objects, gr
gr = GRanges(1, IRanges(c(3, 9, 13), c(9, 13, 20)), strand=c('+','-','-'), name=c("A","B","C"))
print(gr)
In [4]:
## instantiate multiple GRanges objects (to be used in examples below)
gr1 = suppressWarnings(GRanges(
seqnames = Rle(c(1, 2, 1, 3), c(1, 3, 2, 4)),
ranges = IRanges(101:110, end = 111:120, names = head(letters, 10)),
strand = Rle(strand(c("-", "+", "*", "+", "-")), c(1, 2, 2, 3, 2)),
score = 1:10,
GC = seq(1, 0, length=10)))
gr2 = GRanges(1, IRanges(c(103, 107, 113), c(105, 109, 116)),
strand=c('+','-','-'),
name=c("A","B","C"))
print(gr1)
print(gr2)
In [5]:
## Motivation:
## Adding integer N to a GRanges object with IRanges(start, end)
## will result in a GRanges with IRanges(start - N, end + N)
gr + 5
## Subtracting integer N to a GRanges object with IRanges(start, end)
## will result in a GRanges with IRanges(start + N, end - N)
gr - 2
## The functions '%+%' and '%-%' are designed to easily manipulate a GRanges object to add/substract bases
## to 'start' and 'end'
In [6]:
## Function:
## '%+%' is an operator which shifts GRanges by N bases to the right, i.e. IRanges(start + N, end + N)
## Use:
## gr %+% N = original GRanges 'gr' shifted N bases to the right.
In [7]:
gr %+% 5
In [8]:
## Function:
## '%-%' is an operator which shifts GRanges by N bases to the left, , i.e. IRanges(start - N, end - N)
## Use:
## gr %-% N = original GRanges 'gr' shifted N bases to the right.
In [9]:
gr %-% 5
In [10]:
## Function:
## '%Q%' subsets the GRanges 'gr' based on conditionals of the metadata columns in 'gr'
## Use:
## gr %Q% (gene_names == 'KRAS')
## gr %Q% ((mapq >= 40) && (GC < 0.5))
## gr %Q% (width < 100)
In [11]:
gr %Q% (name != 'B')
In [12]:
## Function:
## '%*%' Performs "natural join" or merge of metadata columns of grA and grB
## using interval overlap as a "primary key", outputs a new GRanges whose maximum length is length(grA)*length(grB).
## (See please see documentation for function 'gr.findoverlaps()' for more complex queries)
## Use:
## grA %*% grB # strand-agnostic merging
## grA %**% grB # strand-specific merging
In [13]:
## grA %*% grB # strand-agnostic merging
gr1 %*% gr2 ## gr2 %*% gr1 will have metadata columns in a different order
In [14]:
## grA %**% grB # strand-specific merging
gr1 %**% gr2
In [15]:
## Function:
## '%$%' aggregates the metadata in grB across the territory of each range in grA
## Aggregates the metadata in grB across the territory of each range in grA.
## This returns grA appended with additional metadata columns of grA with values aggregated over where grA and grB overlap.
## For character or factor-valued metadata columns of grB, aggregation will return a comma collapsed character value of all grB values that overlap grA[i].
## For numeric columns of grB it will return the width-weighted mean value of that column across the grA[i] and grB overlap.
## Use:
## grA %$% grB # strand-agnostic aggregation
## grA %$$% grB # strand-specific aggregation
In [16]:
## grA %$% grB # strand-agnostic aggregation
gr1 %$% gr2
In [17]:
gr2 %$% gr1
In [18]:
## grA %$$% grB # strand-specific aggregation
gr1 %$$% gr2
In [19]:
gr2 %$$% gr1
In [20]:
## Function:
## '%&%' return the subset of ranges in grA that overlap with at least one range in grB.
## Use:
## grA %&% grB # strand-agnostic
## grA %&&% grB # strand-specific
In [21]:
## grA %&% grB # strand-agnostic
gr1 %&% gr2
In [22]:
gr2 %&% gr1
In [23]:
## grA %&&% grB # strand-specific
gr1 %&&% gr2
In [24]:
gr2 %&&% gr1
In [25]:
## Function:
## '%O%'
## Returns a length(grA) numeric vector whose item i is the *fraction of the width* of grA[i]
## which overlaps at least one range in grB.
## Use:
## grA %O% grB # strand-agnostic
## grA %OO% grB # strand-specific
In [26]:
## grA %O% grB # strand-agnostic
print('gr1 %O% gr2 outputs')
print(gr1 %O% gr2)
print('')
print('gr2 %O% gr1 outputs')
print(gr2 %O% gr1)
In [27]:
## grA %OO% grB # strand-specific
print('gr1 %OO% gr2 outputs')
print(gr1 %OO% gr2)
print('')
print('gr2 %OO% gr1 outputs')
print(gr2 %OO% gr1)
In [28]:
# %o%
## Function:
## '%o%'
## Returns a length(grA) numeric vector whose item i is the *number of bases* in grA[i]
## which overlaps at least one range in grB.
## Use:
## grA %o% grB # strand-agnostic
## grA %oo% grB # strand-specific
In [29]:
## grA %o% grB # strand-agnostic
print('gr1 %o% gr2 outputs')
print(gr1 %o% gr2)
print('')
print('gr2 %o% gr1 outputs')
print(gr2 %o% gr1)
In [30]:
## grA %oo% grB # strand-specific
print('gr1 %oo% gr2 outputs')
print(gr1 %oo% gr2)
print('')
print('gr2 %oo% gr1 outputs')
print(gr2 %oo% gr1)
In [31]:
## Function:
## '%N%'
## Returns a length(grA) numeric vector whose item i is the total number of ranges in grB that overlap with grA[i]
## Use:
## grA %N% grB # strand-agnostic
## grA %NN% grB # strand-specific
In [32]:
## grA %N% grB # strand-agnostic
print('gr1 %N% gr2 outputs')
print(gr1 %N% gr2)
print('')
print('gr2 %N% gr1 outputs')
print(gr2 %N% gr1)
In [33]:
## grA %NN% grB # strand-agnostic
print('gr1 %NN% gr2 outputs')
print(gr1 %NN% gr2)
print('')
print('gr2 %N% gr1 outputs')
print(gr2 %NN% gr1)
In [34]:
## Function:
## '%^%'
## Returns a length(grA) logical vector whose item i TRUE if the grA[i] overlap at least on range in grB
## Use:
## grA %^% grB # strand-agnostic
## grA %^^% grB # strand-specific
In [35]:
## grA %^% grB # strand-agnostic
print('gr1 %^% gr2 outputs')
print(gr1 %^% gr2)
print('')
print('gr2 %^% gr1 outputs')
print(gr2 %^% gr1)
In [36]:
## grA %^^% grB # strand-specific
print('gr1 %^^% gr2 outputs')
print(gr1 %^^% gr2)
print('')
print('gr2 %^^% gr1 outputs')
print(gr2 %^^% gr1)
In [37]:
## Function:
## '%_%'
## Similar to the function BiocGenerics::setdiff(), but strand-agnostic (cf. https://github.com/Bioconductor/BiocGenerics)
## Use:
## grA %_% grB
In [38]:
print('gr1 %_% gr2 outputs')
print(suppressWarnings(gr1 %_% gr2))
print('')
print('gr2 %_% gr1 outputs')
print(suppressWarnings(gr2 %_% gr1))
In [39]:
# hg_seqlengths()
## Function: 'hg_seqlengths()'
## Details:
## Outputs standard hg19 seqlengths as an integer vector
In [40]:
hg_seqlengths()[1:6]
In [41]:
# example_genes
## Data: 'example_genes', GRanges
## Details:
## GRanges of RefSeq genes with exon count and names, using hg19
In [42]:
head(example_genes)
length(example_genes)
In [43]:
# example_dnase
## Data: 'example_dnase', GRanges
## Details:
## DNAaseI hypersensitivity sites from UCSC Table Browser hg19,
## subsampled to 10,000 sites
In [44]:
head(example_dnase)
length(example_dnase)
In [45]:
# grl1
## Data: 'grl1', GRangesList
## Details:
## simulated SV rearrangements, first dataset
In [46]:
head(grl1, 3)
In [47]:
# grl2
## Data: 'grl2', GRangesList
## Details:
## simulated SV rearrangements, second dataset
In [48]:
head(grl2, 3)
In [49]:
# si
## Data: si, Seqinfo object
## Details:
## Seqinfo object for hg19
si
In [50]:
# grl.hiC
## Data: 'grl.hiC', GRangesList
## Details:
## HiC data for chr14 from Lieberman-Aiden 2009 (in hg19), subsampled
## to 10,000 interactions
head(grl.hiC)
In [ ]:
In [ ]: