In [1]:
suppressWarnings(suppressMessages(library(gUtils)))
suppressWarnings(suppressMessages(library(BSgenome.Hsapiens.UCSC.hg19)))
Sys.setenv(DEFAULT_BSGENOME = 'BSgenome.Hsapiens.UCSC.hg19::Hsapiens')
In [2]:
## instantiate GRanges objects
gr1 = suppressWarnings(GRanges(
seqnames = Rle(c(1, 2, 1, 3), c(1, 3, 2, 4)),
ranges = IRanges(101:110, end = 111:120, names = head(letters, 10)),
strand = Rle(strand(c("-", "+", "*", "+", "-")), c(1, 2, 2, 3, 2)),
score = 1:10,
GC = seq(1, 0, length=10)))
gr2 = GRanges(1, IRanges(c(103, 107, 113), c(105, 109, 116)),
strand=c('+','-','-'),
name=c("A","B","C"))
print('print gr1:')
print(gr1)
print('print gr2 :')
print(gr2)
In [3]:
### Function:
### gr.match()
### gr.match(grA, grB) returns a length(grA) integer vector whose item i contains the first index in grB overlapping grA[i]
### The function has been implemented to be multithreaded via `parallel::mc.cores`
###
### Args:
### 'query' GRanges
### 'subject' GRanges
### 'ignore.strand' if FALSE, gr.match() will be strand specific. if TRUE, it is strand-agnostic
### 'max.slice' max slice of query to match at a time (default = Inf)
### 'verbose' boolean flag to set whether to give verbose output (default = FALSE)
### 'mc.cores' the number of cores to use (default = 1)
###
### Use:
### gr.match(query, subject)
###
In [4]:
gr.match(gr1, gr2, ignore.strand=TRUE)
In [5]:
gr.match(gr1, gr2, ignore.strand=FALSE)
In [6]:
gr.match(gr2, gr1, ignore.strand=TRUE)
In [7]:
gr.match(gr2, gr1, ignore.strand=FALSE)
In [8]:
## Function:
## gr.tile.map()
## gr.tile.map(gr_query, gr_subject) returns a length(gr_query) list whose items are integer vectors of indices in gr_subject
## which overlap gr_query
##
## gr.tile.map() is strand-agnostic (it ignores the strand)
##
## 'gr.tile.map()' assumes that input query and subject have no gaps (including at end) or overlaps, i.e. ignores end()
#' coordinates and only uses "starts"
##
## Use:
## gr.tile.map(query, subject, verbose = FALSE)
##
## Refer to function 'gr.tile()'
In [9]:
gr.tile.map(gr2, gr1)
In [10]:
## Function:
## grbind() returns a concatenated GRanges
##
## Use:
## grbind(input_GRanges1, input_GRanges2, ...)
##
## See function 'grl.bind()'
In [11]:
grbind(gr1, gr2)
In [12]:
grbind(gr2, gr1) ## order-specific
In [13]:
## Function:
## grl.bind() returns a concatenated GRangesList
##
## Use:
## grl.bind(input_GRangesList1, input_GRangesList2, ...)
##
## See function 'grbind()'
In [14]:
grlist1 = grl1[1:5] ## create two small GRangeLists as an example
grlist2 = grl2[1:5]
grl.bind(grlist1, grlist2)
In [15]:
## Function:
## gr.dist() returns a concatenated GRangesList
##
## Args:
## 'ignore.strand' If TRUE, will ignore strand. If FALSE, operation is strand-specific.
##
## Use:
## gr.dist(input_GRanges1, input_GRanges2)
##
In [16]:
gr.dist(gr2, gr1)
In [17]:
gr.dist(gr2, gr1, ignore.strand = TRUE)
In [18]:
## Function:
## gr.in()
## gr.in(query, subject) returns boolean vector TRUE if query range i is found in any range in subject.
##
## Motivation:
## modified implementation of GenomicRanges::findOverlaps()
##
## Args:
## 'query' GRanges Set of GRanges to query. Refer to gr.findoverlaps() and GenomicRanges::findOverlaps()
## 'subject' GRanges Set of GRanges as 'subject' in query. Refer to gr.findoverlaps() and GenomicRanges::findOverlaps()
##
## Use:
## gr.in(query, subject)
##
## See function 'grl.in()'
In [19]:
gr.in(gr1, gr2)
In [20]:
## Function:
## grl.in() calculates intersection of \code{GRangesList} with windows on genome, returning a boolean vector of match status
##
## Motivation:
## modified implementation of %in% for GRangesList
##
## Args:
## 'windows' GRanges pile of windows
## 'some' If TRUE, will return TRUE for GRangesList elements that intersect at least on window range (default = FALSE)
## 'only' If TRUE, will return TRUE for GRangesList elements only if there are no elements of query that fail to intersect with windows (default = FALSE)
## 'logical' If TRUE, will return logical otherwise will return numeric vector of number of windows overlapping each GRangeslist (default = TRUE)
## 'exact' If TRUE, will return exact intersection
##
## Use:
## gr.in(query, subject)
##
## See function 'gr.in()'
In [21]:
grl.in(grl.hiC[1:10], gr2)
In [22]:
## Function:
## gr.findoverlaps()
## gr.findoverlaps(query, subject) returns intersecting GRanges, with query.id and subject.id marking sources
##
## Motivation:
## modification of {GenomicRanges::findOverlaps() with added funtionality
##
## Args:
## 'query' GRanges as query
## 'subject' GRanges as subject
## 'ignore.strand' Don't consider strand information during overlaps (default = TRUE)
## 'first' If TRUEm restricts to only the first match of the subject. If FALSE will return all matches. (default = FALSE)
## 'qcol' \vector of query meta-data columns to add to results (default = NULL)
## 'scol' vector of subject meta-data columns to add to results (default = NULL)
## 'type' 'type' argument as defined by \code{IRanges::findOverlaps} (\code{"any"}, \code{"start"}, \code{"end"}, \code{"within"}, \code{"equal"}). (default = 'any')
## 'by' Meta-data column to consider when performing overlaps (default = NULL)
## 'return.type'Select data format to return (supplied as character): \code{"same"}, \code{"data.table"}, \code{"GRanges"}. (default = 'same')
## 'max.chunk' Maximum number of \code{query*subject} ranges to consider at once. Lower number increases runtime but decreased memory. If \code{length(query)*length(subject)} is less than \code{max.chunk}, overlaps will run in one batch. (default = 1e3)
## 'verbose' If TRUE, will increase the verbosity. (default = FALSE)
## 'mc.cores' Number of cores to use when running in chunked mode (default = 1)
##
## Use:
## gr.findoverlaps(query, subject, by='column_example')
##
In [23]:
gr.findoverlaps(gr1, gr2)
In [24]:
gr1$name = c('A', 'A', 'A', 'A', 'B', 'B', 'B', 'C', 'D', 'D') ## add meta data column 'name'
gr.findoverlaps(gr1, gr2, by='name')
In [25]:
## Function:
## gr.setdiff() returns indices of query in subject (modified from GenomicRanges::setdiff())
##
## Motivation:
## modification of GenomicRanges::setdiff()
## 'gr.setdiff()' is robust to common edge cases of setdiff(gr1, gr2) where gr2 ranges are contained inside gr1's (yieldings
## setdiffs yield two output ranges for some of the input gr1 intervals.
##
##
## Args:
## 'query' GRanges as query
## 'subject' GRanges as subject
## 'ignore.strand' If TRUE, operations will ignore strand. Refer to 'gr.findoverlaps()'. (default = TRUE)
## 'by' Meta-data column to consider when performing overlaps. Refer to 'gr.findoverlaps()' documentation (default = NULL)
##
## Use:
## gr.setdiff(query, subject)
##
In [26]:
gr.setdiff(gr1, gr2)
In [27]:
## Function:
## gr.merge() merges GRanges by using coordinates as primary key
##
## Args:
## 'query' Set of GRanges to query.
## 'subject' \Set of GRanges as 'subject'
## 'by' Additional metadata fields to join on
## 'all' If TRUE, executes left and right joins
## 'all.query' If TRUE, executes left join (default = all)
## 'all.subject' If TRUE, executes right join (default = all)
##
## Use:
## gr.merge(query, subject)
##
In [28]:
gr1 = GRanges(
seqnames = Rle(c(1, 2, 1, 3), c(1, 3, 2, 4)),
ranges = IRanges(101:110, end = 111:120, names = head(letters, 10)),
strand = Rle(strand(c("-", "+", "*", "+", "-")), c(1, 2, 2, 3, 2)),
score = 1:10,
GC = seq(1, 0, length=10))
gr.merge(gr1, gr2)
In [29]:
gr1$name = c('A', 'A', 'A', 'A', 'B', 'B', 'B', 'C', 'D', 'D') ## add meta data column 'name'
gr.merge(gr1, gr2, by='name')
In [30]:
## Function:
## gr.val() annotates input GRanges with values from another GRanges
##
## 'query' and 'target' can be GRangesList, in which case 'val' will refer to GRangeslist level values fields
##
## Args:
## 'query' GRanges of query ranges whose 'val' column we will populate with aggregated values of 'target'
## 'target' GRanges of target ranges that already have "val" column populated
## 'val' If a character field: then aggregation will paste together the (unique) overlapping values, collapsing by comma. (default = NULL)
## 'mean' If FALSE, then will return sum instead of mean, only applies if target 'val' column is numeric. (default = TRUE)
## 'weighted' Calculate a weighted mean. If FALSE, calculates unweighted mean. (default = 'mean')
## 'na.rm' If TRUE, removes NA values when calulating means. Only applies if val column of target is numeric (default = FALSE)
## 'by' specifies additional "by" column of query AND target that will be used to match up query and target pairs (i.e. in addition to pure GRanges overlap). (default = NULL)
## 'by.prefix' Choose a set of 'val'fields by a shared prefix. (default = 'val')
## 'merge' If merge = FALSE then will cross every range in query with every level of "by" in target (and create data matrix), otherwise will assume query has "by" and merge only ranges that have matching "by" values in both query and target (default = FALSE)
## 'FUN' Optional different function to call than mean. Takes two arguments (value, na.rm = TRUE) if weighted = FALSE, and three (value, width, na.rm = TRUE) if weighted = TRUE. (default = NULL)
## 'default.val' If no hit in 'target' found in 'query', fill output 'val' field with this value. (default = NA)
## 'max.slice' Maximum number of query ranges to consider in one memory chunk. (default = Inf)
## 'mc.cores' Number of cores to use when running in chunked mode (default = 1)
## 'sep' Specifies character to use as separator when aggregating character "vals" from target, only applies if target is character (default = ', ')
## 'verbose'If TRUE, increase the verbosity of the output (default = FALSE)
##
##
## Use:
## gr.val(query, target)
##
In [31]:
gr1 = GRanges(
seqnames = Rle(c(1, 2, 1, 3), c(1, 3, 2, 4)),
ranges = IRanges(101:110, end = 111:120, names = head(letters, 10)),
strand = Rle(strand(c("-", "+", "*", "+", "-")), c(1, 2, 2, 3, 2)),
score = 1:10,
GC = seq(1, 0, length=10))
gr.val(gr2, gr1)
In [32]:
gr1$name = c('A', 'A', 'A', 'A', 'B', 'B', 'B', 'C', 'D', 'D') ## add meta data column 'name'
gr.merge(gr2, gr1, by='name')
In [33]:
## Function:
## anchorlift()
##
## "lifts" all queries with respect to subject in coordinates that are within "pad"
## i.e. puts the queries into subject-centric coordinates, which is a new genome with label "Anchor" (default)
##
## Respects strand of subject (i.e. if subject strand gr is "-" then will lift all queries to the left of it
## into positive subject-centric coordinates). Keeps track of subject and query id for later deconvolution if need be.
##
## Args:
## 'query' GRanges that will be lifted around the subject
## 'subject' GRanges around which the queries will be lifted
## 'window' integer specifying how far around each subject to gather query intervals to lift (default = 1e9)
## 'by' character vector specifying additional columms (e.g. sample id) around which to restrict overlaps (via gr.findoverlaps()). Refer to `gr.findoverlaps()` documentation. (default = NULL)
## 'seqname' String specifying the name of the output sequence around which to anchor (default = "Anchor")
## 'include.values' If TRUE, include values from query and subject (default = TRUE)
##
## Use:
## anchorlift(query, subject)
##
In [34]:
anchorlift(gr1, gr2, window=100)
In [35]:
## Function:
## rrbind()
##
## Motivation:
## Implementation of'rbind()', but for calculating intersecting/union columns of data.frame/data.table
##
## Args:
## 'union' If TRUE, takes union of columns (default = TRUE)
#' 'as.data.table' If TRUE, returns the binded data as a data.table (default = FALSE)
##
## Use:
## rrbind(dt1, dt2)
##
In [36]:
dt1 = gr2dt(gr1) ## example data.table dt1
dt2 = gr2dt(gr2) ## example data.table dt2
In [37]:
rrbind(dt1, dt2, as.data.table=TRUE)
In [38]:
rrbind(dt2, dt1, as.data.table=TRUE)
In [ ]:
In [ ]: