In [1]:
suppressWarnings(suppressMessages(library(gUtils)))
suppressWarnings(suppressMessages(library(BSgenome.Hsapiens.UCSC.hg19)))
Sys.setenv(DEFAULT_BSGENOME = 'BSgenome.Hsapiens.UCSC.hg19::Hsapiens')
In [2]:
## instantiate an example GRanges objects, 'gr' and data.table 'dt'
grexample = GRanges(1, IRanges(c(3, 9, 13), c(9, 13, 20)), strand=c('+','-','-'), name=c("A","B","C"))
dt = data.table(seqnames=1, start=c(2,5,10), end=c(3,8,15))
print('grexample')
print(grexample)
print('dt')
print(dt)
In [3]:
## Function:
## 'gr.start()' returns a GRanges of the first coordinate (or first k coordinates) in each interval
##
## Args:
## 'width' integer Specify subranges of greater width including the start of the range. (default = 1)
## 'force' boolean If TRUE, allows returned GRanges to have ranges outside of its 'Seqinfo' bounds. (default = FALSE)
## 'ignore.strand' If FALSE, gr.start() will extend '-' strands from the other direction (default = TRUE)
## 'clip' boolean If TRUE, trims returned GRanges so that it does not extend beyond bounds of the input (default = TRUE)
##
## Use:
## gr.start(grexample) = returns start of each genomic range in the input 'grexample'
##
In [4]:
gr.start(grexample)
In [5]:
gr.start(grexample, width=3)
In [6]:
## if argument 'width' larger than width(grexample), then the corresponding output will have subranges specified to the range end
gr.start(grexample, width=50)
In [7]:
gr.start(grexample, ignore.strand=FALSE)
In [8]:
## Function:
## 'gr.end()' returns a GRanges of the last coordinate (or last k coordinates) in each interval
##
## Args:
## 'width' integer Specify subranges of greater width including the start of the range. (default = 1)
## 'force' boolean If TRUE, allows returned GRanges to have ranges outside of its 'Seqinfo' bounds. (default = FALSE)
## 'ignore.strand' If FALSE, gr.start() will extend '-' strands from the other direction (default = TRUE)
## 'clip' boolean If TRUE, trims returned GRanges so that it does not extend beyond bounds of the input (default = TRUE)
##
## Use:
## gr.end(grexample) = returns end of each genomic range in the input 'grexample'
##
In [9]:
gr.end(grexample)
In [10]:
gr.end(grexample, width=3)
In [11]:
## if argument 'width' larger than width(grexample), subranges specified to the range start
gr.end(grexample, width=50)
In [12]:
gr.end(grexample, ignore.strand=FALSE)
In [13]:
## Function:
## 'gr.mid()' returns a GRanges of the midpoint coordinate in each interval
##
## Use:
## gr.mid(grexample) = returns midpoints of each genomic range in the input 'grexample'
##
In [14]:
gr.mid(grexample)
In [15]:
## midpoint between (100, 200)
gr.mid(GRanges(1, IRanges(100, 200), seqinfo=Seqinfo("1", 200)))
In [16]:
## Function:
## 'dt2gr()' inputs an R data.table/data.frame and outputs a GRanges, if columns 'start', 'end', 'seqnames' exist
##
## Args:
## 'seqlengths' named integer vector representing genome (default = hg_seqlengths())
## 'seqinfo' Seqinfo of output GRanges object
##
## Use:
## dt2gr(dt)
##
In [17]:
dt2gr(dt) ## data.table converted to GRanges
In [18]:
## Function:
## 'gr2dt()' returns GRanges converted into an R data.table/data.frame
##
## Use:
## gr2dt(grexample)
##
In [19]:
gr2dt(grexample) ## GRanges converted to data.table
print(is(gr2dt(grexample), 'data.table'))
In [20]:
## Function:
## 'gr.tile()' tiles the input GRanges 'input_gr' with non-overlapping bins of width K.
##
## Args:
## 'width' the width of each tile (default = 1e3)
##
## Use:
## gr.tile(input_gr, width = K)
##
## Refer to function 'gr.tile.map()'
In [21]:
gr.tile(grexample, 2)
In [22]:
## create 10 tiles of width 10 from start=1 to end=100
gr.tile(GRanges(1, IRanges(1,100)), width=10)
In [23]:
## Function:
## 'gr.chr()' prepends 'chr' to GRanges seqlevels()
##
## Use:
## gr.chr(grexample)
##
In [24]:
gr.chr(grexample)
In [25]:
## Function:
## 'gr.nochr()' removes the prefix 'chr' from GRanges seqlevels()
##
## Use:
## gr.nochr(grexample)
##
In [26]:
gr.nochr(gr.chr(grexample))
In [27]:
## Function:
## gr.stripstrand() sets input GRanges strand to '*'
##
## Use:
## gr.stripstrand(grexample)
##
In [28]:
gr.stripstrand(grexample)
In [29]:
## Function:
## gr.flipstrand() inverts GRanges strand, '+' <-> '-'
##
## Use:
## gr.flipstrand(grexample)
##
In [30]:
gr.flipstrand(grexample)
In [31]:
## Function:
## gr.flatten() returns input GRanges as data.frame with input ranges superimposed
## onto a single "flattened" chromosome, with ranges laid end-to-end
##
## Args:
## 'gap' number of bases between ranges on the new chromosome (default = 0)
##
## Use:
## gr.flatten(grexample)
##
In [32]:
gr.flatten(grexample)
In [33]:
gr.flatten(grexample, gap=500)
In [34]:
## Function:
## gr.rand(w, genome) returns randomly-generated GRanges from input genome
##
## Args:
## 'w' vector of widths, whereby 'length(w)' determines length of output
## 'genome' GRanges, GRangesList, or Seqinfo genome
##
## Use:
## gr.rand(w, genome)
##
In [35]:
## Generate 5 non-overlapping regions of width 10 on hg19
## set seed for reproducibility
set.seed(123)
gr.rand(rep(10,5), BSgenome.Hsapiens.UCSC.hg19::Hsapiens)
In [36]:
## Function:
## 'gr.trim()' trims input GRanges relative to the specified <local> coordinates of each range
## The motivation behind this function is to fill the role of current GRanges functionality
## e.g. shift(), reduce(), restrict(), shift(), resize(), flank()
##
## Args:
## 'start' vector of widths, whereby 'length(w)' determines length of output
## 'end' GRanges, GRangesList, or Seqinfo genome
##
## Use:
## gr.trim(input_gr, starts, ends)
##
In [37]:
GRanges(1, IRanges(1e6, width=1000))
In [38]:
## Given GRangeswith genomic coordinates 1:1,000,000-1,001,000,
## gr.trim() to "trim" the first 20 bases and last 50 bases
gr.trim(GRanges(1, IRanges(1e6, width=1500)), starts=20, ends=950)
In [39]:
## Function:
## gr.sample() samples input GRanges intervals within a given territory
##
## Args:
## 'k' number of ranges to sample
## 'wid' length of the GRanges element to produce (default = 100)
## 'replace' If TRUE, will bootstrap. If FALSE, otherwise will sample without replacement. (default = TRUE)
##
## 'k' integer Number of ranges to sample
## 'wid' integer Length of the \code{GRanges} element to produce (default = 100)
## 'replace' boolean If TRUE, will bootstrap, otherwise will sample without replacement. (default = TRUE)
##
## If 'k' is a scalar then 'gr.rand()' will (uniformly) select 'k' intervals from the summed territory of GRanges
## If 'k' is a vector of 'length(gr)' then will uniformly select 'k' intervals from each.
##
## Note: This is different from the function GenomicRanges::sample(), which samples from a pile of GRanges
##
## Use:
## gr.sample(gr, k, wid, replace = TRUE)
##
In [40]:
## sample five GRanges of length 10 each from territory of RefSeq genes
## set seed for reproducibility
set.seed(42)
gr.sample(reduce(example_genes), k=5, wid=10)
In [41]:
## Function:
## gr.fix() "repairs" genome seqlevels; returns input with a repaired Seqinfo
##
## If "genome" not specified will replace NA 'seqlengths' in GRanges to reflect largest coordinate per 'seqlevel'
## and removes all 'NA seqlevels' after this fix.
##
## If "genome" defined (i.e. as Seqinfo object, or a BSgenome, GRanges, GRangesList object with populated \code{seqlengths}),
## then will replace \code{seqlengths} in \code{gr} with those for that genome
##
## Args:
## 'genome' Genome to fix to: Seqinfo, BSgenome, GRanges (w/seqlengths), GRangesList (w/seqlengths) (default = NULL)
## 'gname' Name of the genome (optional) (default = NULL)
## 'drop' If TRUE, remove ranges that are not present in the supplied genome (default = FALSE)
##
## Use:
## gr.fix(genome, gname, drop)
##
In [42]:
## before gr.fix()
seqlengths(grexample)
## after gr.fix()
seqlengths(gr.fix(grexample))
In [43]:
## Function:
## 'streduce' returns a reduced, sorted GRanges with no strand information, i.e. a GRanges with a minimal data footprint
##
## 'pad' Expand the input data before reducing (default = 0)
## 'sort' If TRUE, sorts the input (default = TRUE)
##
## Use:
## streduce(input_gr, pad, sort)
##
In [44]:
streduce(grexample)
In [45]:
## Function:
## 'gr.string' returns UCSC-style interval string corresponding to GRanges pile (i.e. chr:start-end)
##
##
## 'add.chr' If TRUE, prepends seqnames with "chr" (default = FALSE)
## 'mb' If TRUE, rounds to the nearest megabase (default = FALSE)
## 'round' If 'mb', 'round' sets the number of digits to round to. (default = 3)
## 'other.cols' Names of additional 'mcols' fields to add to the string (seperated by ";")
## 'pretty' If TRUE, outputs interval string in more readable format
##
## Use:
## gr.string(example_dnase)
##
In [46]:
gr.string(grexample)
In [47]:
## Function:
## 'grl.string' generates a string representation of an input GRangesList
##
## 'mb' If TRUE, returns as MB and round to "round" (default = FALSE)
## 'sep' Character to separate single \code{GRanges} ranges (default = ',')
##
## Use:
## grl.string(input_GRangesList)
##
In [48]:
## create input GRangeList 'grl_size_five'
grl_size_five = grl1[1:5]
grl.string(grl_size_five)
In [49]:
grl.string(grl_size_five, sep = ':', mb=TRUE)
In [50]:
## Function:
## 'grl.reduce' returns GRangesList with GRanges of intervals of the input GRanges +/- padding
##
## 'pad' padding to add to ranges inside GRangesList before reducing (default = 0)
## 'clip' If TRUE, add padding to ranges inside GRangesList before reducing (default = FALSE)
##
## Use:
## grl.reduce(input_GRangesList)
##
In [51]:
## create input GRangeList 'grl_size_five'
grl_size_five = grl1[1:5]
print(grl_size_five)
print('')
print('apply grl.reduce(), with padding = 50')
grl.reduce(grl_size_five, pad = 50, clip=TRUE)
In [52]:
## Function:
## 'gr.duplicated' allows to restrict duplicates using "by" columns and allows inexact matching
## It will return a boolean vector of the "match status"
##
## 'by' column used to restrict duplicates. See the 'by' argument for function gr.match()
## 'type' string 'type' used. See the 'type' argument for function gr.match()
##
## Use:
## gr.duplicated(input_GRangesList)
##
In [53]:
first_10_genes = example_genes[1:10]
gr.duplicated(first_10_genes)
In [54]:
## Function:
## 'gr.dice' "dices up" GRanges, returning a width=1 GRanges spanning the input
##
## Use:
## gr.dice(input_gr)
##
In [55]:
gr.dice(grexample)
In [56]:
## Function:
## 'grl.unlist' executes a robust unlisting of GRangesList that keeps track of origin
##
## Motivation:
## 'grl.unlist()' does a "nice" unlist of a GRangesList object adding a field 'grl.ix' denoting
## which element of the GRangesList each GRanges corresponds to, and a field 'grl.iix' which
## saves the (local) index that that gr was in its corresponding GRangesList item
##
## In this way, grl.unlist() is reversible, while BiocGenerics::unlist() is not.
##
## Use:
## grl.unlist(input_GRangesList)
##
In [57]:
head(grl.unlist(grl2)) ## using 'head()' as the resulting GRanges has 502 ranges
In [58]:
## Function:
## 'gr.collapse' executes a robust unlisting of GRanges that keeps track of origin
##
## Motivation:
## Similar to GenomicRanges::reduce() except only collapses *adjacent* ranges in the input
## In this way, grl.unlist() is reversible, while BiocGenerics::unlist() is not.
##
## Args:
## 'pad' Padding for not quite adjacent elements to be considered overlapping (default = 1)
##
## Use:
## gr.collapse(input_GRanges)
##
In [59]:
## gr.collapse(grexample)
In [60]:
## Function:
## 'rle.query' queries an 'RleList' representing genomic data, returning an
## 'Rle' representing the (concatenated) vector of data
## This function has been implemeneted to be multithreaded via 'parallel:mc.cores()'
##
## Motivation:
## Recall: An Rle (run-length-encoded) vector is a specific representation of a vector
## An Rle is an efficient data structure if (1) the vector is quite large and/or
## (2) there are many consecutive values of the same identity.
##
## Args:
## 'subject.rle' Rle
## 'query.gr' GRangeslist or GRanges
## 'chunksize' integer Number of \code{query.gr} ranges to consider in one memory chunk. (default = 1e9)
## 'mc.cores' Number of cores to apply when doing chunked operation (default = 1)
## 'verbose' Set the verbosity of the output (default = FALSE)
##
## Use:
## rle.query(subject.rle, query.gr)
##
In [61]:
## an example RleList
chr1_Rle = Rle(10:1, 1:10)
chr2_Rle = Rle(10:1, 1:10)
example_rlelist = RleList( chr1=chr1_Rle, chr2=chr2_Rle)
print('example_rlelist')
print(example_rlelist)
print('rle.query()')
rle.query(example_rlelist, grexample)
In [62]:
## Function:
## 'grl.pivot' pivots GRangesList object 'x' by returning a new GRangesList 'y' whose
## kth item is GRanges object of ranges x[[i]][k] for all i in 1:length(x)
##
## e.g. If the length of a GRangesList `grl is 50, `length(grl)=50`, and the length of each GRanges element inside is 2,
## then 'grl.pivot(grl' will produce a length 3 GRangesList with 50 elements per GRanges
##
## Note: Assumes all GRanges in "x" are of equal length
##
##
## Use:
## grl.pivot(input_GRangesList)
##
In [63]:
head(grl.pivot(grl1))
In [64]:
## Function:
## 'gr.sub' applies gsub to seqlevels of input GRanges
## by default, removes 'chr', and "0.1" suffixes, and replaces "MT" with "M"
## kth item is GRanges object of ranges x[[i]][k] for all i in 1:length(x)
##
## 'a' vector of regular expressions of things to substitute out (default = c("(^chr)(\\.1$)", "MT"))
## 'b' vector of values to substitute in (default = c("", "M"))
##
## Use:
## gr.sub(input_GRanges)
##
In [65]:
seqlevels(grexample) = 'chr1'
seqlevels(gr.sub(grexample))
In [66]:
## Function:
## 'seg2gr' converts "GRanges-like" data.frame into GRanges
##
## There are a number of acceptable columns to specify chromsomes, segment starts, segment ends, and strand
## Please refer to 'standardize_segs()' for full details.
##
## Use:
## seg2gr(input_data.frame)
##
In [67]:
dt[, strand := '*'] ## define strand column in data.table dt
## similar to defining columns in data.frame, i.e.'dt$strand = "*"'
print('print(dt) outputs')
print(dt)
print('seg2gr(dt) outputs')
seg2gr(dt) ## outputs converted data.table
In [68]:
## segments_dt = data.table(patient = c('patient11235813', 'patient3141592', 'patient42'),
## chromosome = c('chr3', 'chr5', 'chrX'),
## start_position = c(1000, 2000, 3000),
## end_position = c(5000, 6000, 7000),
## strand = c('-', '+', '*'))
##
##
## seg2gr(segments_dt)
##
In [69]:
## Function:
## 'grl.eval' evaluates and aggregates an R expression on GRanges column in GRangesList
##
## Args:
## 'expr' Any syntactically valid R expression, on columns of GRanges or GRangesList
##
## Use:
## grl.eval(input_GRangesList, expr=valid_R_expression)
##
In [70]:
## NOTE: only printing first 5 rows
## calculate 'bin ** 2' whereby bin is the metadata column in 'grl1'
grl.eval(grl1, bin**2)[1:5]
## calculate 'width() - 5'
grl.eval(grl1, width-5)[1:5]
In [71]:
## Function:
## 'gr.simplify' calculates the pairwise-distance for SV rearrangements represented by GRangesList
##
## Args:
## 'field' character scalar, corresponding to value field of input GRanges. (default = NULL)
## val character scalar (default = NULL)
## include.val If TRUE, will include in out input GRanges the values field of first matching record in said input GRanges. (default = TRUE)
## split If TRUE, will split the output into GRangesList split by "field". (default = FALSE)
## pad integer Pad ranges by this amount before doing merge. [1], which merges contiguous but non-overlapping ranges. (default = 1)
##
##
## Use:
## gr.simplify(input_GRanges, field = NULL, val = NULL, include.val = TRUE, split = FALSE, pad = 1)
##
In [72]:
gr.simplify(grexample, pad=4, field="name", split = TRUE)
In [73]:
## Function:
## 'gr.parse' returns GRanges parsed from input IGV-/UCSC-style strings
##
## Use:
## gr.parse(input_IGV_UCSC_strings)
##
In [74]:
parse.gr(c('1:1e6-5e6+', '2:2e6-5e6-'))
In [75]:
## Function:
## 'grl.parse' returns GRangesList parsed from input IGV-/UCSC-style strings
##
## Use:
## grl.parse(input_IGV_UCSC_strings)
##
In [76]:
parse.grl(c('1:1e6-5e6+;5:10-2000', '2:2e6-5e6-;10:100231321-100231399'))
In [77]:
## Function:
## gr.sum() sums either by doing coverage and either weighting them equally
## or using a field "weight". Will return either sum or average.
##
## Args:
## 'field' field from input GRanges to use as a weight
## 'mean' if TRUE, calculates mena. Flag specifies whether to divide the output at each interval but the total number of intervals overlapping it (only applies if field == NULL) (default FALSE)
##
## gr.sum(input_GRanges)
In [78]:
gr.sum(grexample)
In [79]:
grexample$GC = c(0.75, 1.0, 0.25) ## add 'GC' column
gr.sum(grexample, field='GC')
In [ ]:
In [ ]:
In [ ]: