Load packages


In [1]:
suppressWarnings(suppressMessages(library(gUtils)))   
suppressWarnings(suppressMessages(library(BSgenome.Hsapiens.UCSC.hg19)))
Sys.setenv(DEFAULT_BSGENOME = 'BSgenome.Hsapiens.UCSC.hg19::Hsapiens')

Create example GRanges and data.table objects


In [2]:
## instantiate an example GRanges objects, 'gr' and data.table 'dt'

grexample = GRanges(1, IRanges(c(3, 9, 13), c(9, 13, 20)), strand=c('+','-','-'), name=c("A","B","C"))

dt = data.table(seqnames=1, start=c(2,5,10), end=c(3,8,15))


print('grexample')
print(grexample)
print('dt')
print(dt)


[1] "grexample"
GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 3,  9]      + |           A
  [2]        1  [ 9, 13]      - |           B
  [3]        1  [13, 20]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths
[1] "dt"
   seqnames start end
1:        1     2   3
2:        1     5   8
3:        1    10  15

gr.start()

gr.start() returns a GRanges of the first coordinate (or first k coordinates) in each interval


In [3]:
## Function: 
## 'gr.start()' returns a GRanges of the first coordinate (or first k coordinates) in each interval 
## 
## Args:
## 'width' integer Specify subranges of greater width including the start of the range. (default = 1)
## 'force' boolean If TRUE, allows returned GRanges to have ranges outside of its 'Seqinfo' bounds. (default = FALSE)
## 'ignore.strand' If FALSE, gr.start() will extend '-' strands from the other direction (default = TRUE)
## 'clip' boolean  If TRUE, trims returned GRanges so that it does not extend beyond bounds of the input (default = TRUE)
## 
## Use:
## gr.start(grexample) = returns start of each genomic range in the input 'grexample'
##

In [4]:
gr.start(grexample)


Warning message in gr.start(grexample):
“Warning: Check or fix seqlengths, some are equal 0 or NA, may lead to negative widths”
GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 3,  3]      + |           A
  [2]        1  [ 9,  9]      - |           B
  [3]        1  [13, 13]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

In [5]:
gr.start(grexample, width=3)


Warning message in gr.start(grexample, width = 3):
“Warning: Check or fix seqlengths, some are equal 0 or NA, may lead to negative widths”
GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 3,  5]      + |           A
  [2]        1  [ 9, 11]      - |           B
  [3]        1  [13, 15]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

In [6]:
## if argument 'width' larger than width(grexample), then the corresponding output will have subranges specified to the range end 
gr.start(grexample, width=50)


Warning message in gr.start(grexample, width = 50):
“Warning: Check or fix seqlengths, some are equal 0 or NA, may lead to negative widths”
GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 3,  9]      + |           A
  [2]        1  [ 9, 13]      - |           B
  [3]        1  [13, 20]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

In [7]:
gr.start(grexample, ignore.strand=FALSE)


Warning message in gr.start(grexample, ignore.strand = FALSE):
“Warning: Check or fix seqlengths, some are equal 0 or NA, may lead to negative widths”
GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 3,  3]      + |           A
  [2]        1  [13, 13]      - |           B
  [3]        1  [20, 20]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

gr.end()

gr.end() returns a GRanges of the last coordinate (or last k coordinates) in each interval


In [8]:
## Function: 
## 'gr.end()' returns a GRanges of the last coordinate (or last k coordinates) in each interval 
## 
## Args:
## 'width' integer Specify subranges of greater width including the start of the range. (default = 1)
## 'force' boolean If TRUE, allows returned GRanges to have ranges outside of its 'Seqinfo' bounds. (default = FALSE)
## 'ignore.strand' If FALSE, gr.start() will extend '-' strands from the other direction (default = TRUE)
## 'clip' boolean  If TRUE, trims returned GRanges so that it does not extend beyond bounds of the input (default = TRUE)
## 
## Use:
## gr.end(grexample) = returns end of each genomic range in the input 'grexample'
##

In [9]:
gr.end(grexample)


Warning message in gr.end(grexample):
“Warning: Check or fix seqlengths, some are equal 0 or NA, may lead to negative widths”
GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 9,  9]      + |           A
  [2]        1  [13, 13]      - |           B
  [3]        1  [20, 20]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

In [10]:
gr.end(grexample, width=3)


Warning message in gr.end(grexample, width = 3):
“Warning: Check or fix seqlengths, some are equal 0 or NA, may lead to negative widths”
GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 7,  9]      + |           A
  [2]        1  [11, 13]      - |           B
  [3]        1  [18, 20]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

In [11]:
## if argument 'width' larger than width(grexample), subranges specified to the range start
gr.end(grexample, width=50)


Warning message in gr.end(grexample, width = 50):
“Warning: Check or fix seqlengths, some are equal 0 or NA, may lead to negative widths”
GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 3,  9]      + |           A
  [2]        1  [ 9, 13]      - |           B
  [3]        1  [13, 20]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

In [12]:
gr.end(grexample, ignore.strand=FALSE)


Warning message in gr.end(grexample, ignore.strand = FALSE):
“Warning: Check or fix seqlengths, some are equal 0 or NA, may lead to negative widths”
GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 9,  9]      + |           A
  [2]        1  [ 9,  9]      - |           B
  [3]        1  [13, 13]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

gr.mid()

gr.mid() returns a GRanges of the last coordinate (or last k coordinates) in each interval


In [13]:
## Function: 
## 'gr.mid()' returns a GRanges of the midpoint coordinate in each interval 
## 
## Use:
## gr.mid(grexample) = returns midpoints of each genomic range in the input 'grexample'
##

In [14]:
gr.mid(grexample)


GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 6,  6]      + |           A
  [2]        1  [11, 11]      - |           B
  [3]        1  [16, 16]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

In [15]:
## midpoint between (100, 200)

gr.mid(GRanges(1, IRanges(100, 200), seqinfo=Seqinfo("1", 200)))


GRanges object with 1 range and 0 metadata columns:
      seqnames     ranges strand
         <Rle>  <IRanges>  <Rle>
  [1]        1 [150, 150]      *
  -------
  seqinfo: 1 sequence from an unspecified genome

dt2gr()

dt2gr() inputs an R data.table/data.frame and outputs a GRanges, if columns 'start', 'end', 'seqnames' exist


In [16]:
## Function: 
## 'dt2gr()' inputs an R data.table/data.frame and outputs a GRanges, if columns 'start', 'end', 'seqnames' exist
## 
## Args:
## 'seqlengths' named integer vector representing genome (default = hg_seqlengths())
## 'seqinfo' Seqinfo of output GRanges object
##
## Use:
## dt2gr(dt)
##

In [17]:
dt2gr(dt)  ## data.table converted to GRanges


GRanges object with 3 ranges and 0 metadata columns:
      seqnames    ranges strand
         <Rle> <IRanges>  <Rle>
  [1]        1  [ 2,  3]      *
  [2]        1  [ 5,  8]      *
  [3]        1  [10, 15]      *
  -------
  seqinfo: 25 sequences from an unspecified genome

gr2dt()

gr2dt() returns GRanges 'input_gr' converted into an R data.table/data.frame


In [18]:
## Function: 
## 'gr2dt()' returns GRanges converted into an R data.table/data.frame 
##
## Use:
## gr2dt(grexample)
##

In [19]:
gr2dt(grexample)   ## GRanges converted to data.table

print(is(gr2dt(grexample), 'data.table'))


seqnamesstartendstrandwidthname
1 3 9+ 7 A
1 913- 5 B
1 1320- 8 C
[1] TRUE

gr.tile()

gr.tile() tiles the input GRanges 'input_gr' with non-overlapping bins of width K.


In [20]:
## Function: 
## 'gr.tile()' tiles the input GRanges 'input_gr' with non-overlapping bins of width K.
## 
## Args:
## 'width' the width of each tile (default = 1e3)
##
## Use:
## gr.tile(input_gr, width = K)
##
## Refer to function 'gr.tile.map()'

In [21]:
gr.tile(grexample, 2)


GRanges object with 11 ranges and 2 metadata columns:
       seqnames    ranges strand |  query.id   tile.id
          <Rle> <IRanges>  <Rle> | <integer> <integer>
   [1]        1  [ 3,  4]      + |         1         1
   [2]        1  [ 5,  6]      + |         1         2
   [3]        1  [ 7,  8]      + |         1         3
   [4]        1  [ 9,  9]      + |         1         4
   [5]        1  [ 9, 10]      - |         2         5
   [6]        1  [11, 12]      - |         2         6
   [7]        1  [13, 13]      - |         2         7
   [8]        1  [13, 14]      - |         3         8
   [9]        1  [15, 16]      - |         3         9
  [10]        1  [17, 18]      - |         3        10
  [11]        1  [19, 20]      - |         3        11
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

In [22]:
## create 10 tiles of width 10 from start=1 to end=100
gr.tile(GRanges(1, IRanges(1,100)), width=10)


GRanges object with 10 ranges and 2 metadata columns:
       seqnames    ranges strand |  query.id   tile.id
          <Rle> <IRanges>  <Rle> | <integer> <integer>
   [1]        1 [ 1,  10]      * |         1         1
   [2]        1 [11,  20]      * |         1         2
   [3]        1 [21,  30]      * |         1         3
   [4]        1 [31,  40]      * |         1         4
   [5]        1 [41,  50]      * |         1         5
   [6]        1 [51,  60]      * |         1         6
   [7]        1 [61,  70]      * |         1         7
   [8]        1 [71,  80]      * |         1         8
   [9]        1 [81,  90]      * |         1         9
  [10]        1 [91, 100]      * |         1        10
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

gr.chr()

gr.chr() prepends 'chr' to GRanges seqlevels()


In [23]:
## Function: 
## 'gr.chr()' prepends 'chr' to GRanges seqlevels()
## 
## Use:
## gr.chr(grexample)
##

In [24]:
gr.chr(grexample)


GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]     chr1  [ 3,  9]      + |           A
  [2]     chr1  [ 9, 13]      - |           B
  [3]     chr1  [13, 20]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

gr.nochr()

gr.nochr() removes the prefix 'chr' from GRanges seqlevels()


In [25]:
## Function: 
## 'gr.nochr()' removes the prefix 'chr' from GRanges seqlevels()
## 
## Use:
## gr.nochr(grexample)
##

In [26]:
gr.nochr(gr.chr(grexample))


GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 3,  9]      + |           A
  [2]        1  [ 9, 13]      - |           B
  [3]        1  [13, 20]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

gr.stripstrand()

gr.stripstrand() sets input GRanges strand to '*'


In [27]:
## Function: 
## gr.stripstrand() sets input GRanges strand to '*'
## 
## Use:
## gr.stripstrand(grexample)
##

In [28]:
gr.stripstrand(grexample)


GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 3,  9]      * |           A
  [2]        1  [ 9, 13]      * |           B
  [3]        1  [13, 20]      * |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

gr.strandflip()

gr.strandflip() reverses GRanges strand signs


In [29]:
## Function: 
## gr.flipstrand() inverts GRanges strand, '+' <-> '-'
## 
## Use:
## gr.flipstrand(grexample)
##

In [30]:
gr.flipstrand(grexample)


GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 3,  9]      - |           A
  [2]        1  [ 9, 13]      + |           B
  [3]        1  [13, 20]      + |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

gr.flatten()

gr.flatten() returns input GRanges as data.frame with input ranges superimposed onto a single "flattened" chromosome, with ranges laid end-to-end


In [31]:
## Function: 
## gr.flatten() returns input GRanges as data.frame with input ranges superimposed 
## onto a single "flattened" chromosome, with ranges laid end-to-end
## 
## Args:
## 'gap' number of bases between ranges on the new chromosome (default = 0) 
## 
## Use:
## gr.flatten(grexample)
##

In [32]:
gr.flatten(grexample)


startendname
1 7A
812B
1320C

In [33]:
gr.flatten(grexample, gap=500)


startendname
1 7A
508 512B
10131020C

gr.rand()

gr.rand(w, genome) returns randomly-generated GRanges from input genome


In [34]:
## Function: 
## gr.rand(w, genome) returns randomly-generated GRanges from input genome
## 
## Args:
## 'w' vector of widths, whereby 'length(w)' determines length of output
## 'genome' GRanges, GRangesList, or Seqinfo genome
## 
## Use:
## gr.rand(w, genome)
##

In [35]:
## Generate 5 non-overlapping regions of width 10 on hg19

## set seed for reproducibility 

set.seed(123)

gr.rand(rep(10,5), BSgenome.Hsapiens.UCSC.hg19::Hsapiens)


GRanges object with 5 ranges and 0 metadata columns:
      seqnames               ranges strand
         <Rle>            <IRanges>  <Rle>
  [1]     chr5 [20550152, 20550161]      *
  [2]    chr16 [63222704, 63222713]      *
  [3]     chr7 [49369246, 49369255]      *
  [4]    chr20 [51594129, 51594138]      *
  [5]     chrX [69363664, 69363673]      *
  -------
  seqinfo: 93 sequences from an unspecified genome

gr.trim()

gr.trim() "trims" input GRanges relative to the specified coordinates of each range


In [36]:
## Function: 
## 'gr.trim()' trims input GRanges relative to the specified <local> coordinates of each range
## The motivation behind this function is to fill the role of current GRanges functionality
## e.g. shift(), reduce(), restrict(), shift(), resize(), flank()
## 
## Args:
## 'start' vector of widths, whereby 'length(w)' determines length of output
## 'end' GRanges, GRangesList, or Seqinfo genome
## 
## Use:
## gr.trim(input_gr, starts, ends)
##

In [37]:
GRanges(1, IRanges(1e6, width=1000))


GRanges object with 1 range and 0 metadata columns:
      seqnames             ranges strand
         <Rle>          <IRanges>  <Rle>
  [1]        1 [1000000, 1000999]      *
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

In [38]:
## Given GRangeswith genomic coordinates 1:1,000,000-1,001,000, 
## gr.trim() to "trim" the first 20 bases and last 50 bases 

gr.trim(GRanges(1, IRanges(1e6, width=1500)), starts=20, ends=950)


GRanges object with 1 range and 0 metadata columns:
      seqnames             ranges strand
         <Rle>          <IRanges>  <Rle>
  [1]        1 [1000019, 1000949]      *
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

gr.sample()

gr.sample(gr, k, wid) samples input GRanges intervals within a given territory


In [39]:
## Function: 
## gr.sample() samples input GRanges intervals within a given territory
## 
## Args:
## 'k'  number of ranges to sample
## 'wid' length of the GRanges element to produce (default = 100)
## 'replace' If TRUE, will bootstrap. If FALSE, otherwise will sample without replacement. (default = TRUE)
## 
## 'k'  integer Number of ranges to sample
## 'wid' integer Length of the \code{GRanges} element to produce (default = 100)
## 'replace' boolean If TRUE, will bootstrap, otherwise will sample without replacement. (default = TRUE)
## 
## If 'k' is a scalar then 'gr.rand()' will (uniformly) select 'k' intervals from the summed territory of GRanges
## If 'k' is a vector of 'length(gr)' then will uniformly select 'k' intervals from each.
##
## Note: This is different from the function GenomicRanges::sample(), which samples from a pile of GRanges
##
## Use:
## gr.sample(gr, k, wid, replace = TRUE)
##

In [40]:
## sample five GRanges of length 10 each from territory of RefSeq genes

## set seed for reproducibility 

set.seed(42)

gr.sample(reduce(example_genes), k=5, wid=10)


GRanges object with 5 ranges and 1 metadata column:
      seqnames               ranges strand |  query.id
         <Rle>            <IRanges>  <Rle> | <integer>
  [1]        4 [90662968, 90662977]      * |      4619
  [2]       11 [27367873, 27367882]      * |     10217
  [3]       16 [87924776, 87924785]      * |     13795
  [4]       19 [52504313, 52504322]      * |     16412
  [5]       21 [30486843, 30486852]      * |     17028
  -------
  seqinfo: 25 sequences from an unspecified genome

gr.fix()

gr.fix() "repairs" genome seqlevels; returns input with a repaired Seqinfo


In [41]:
## Function: 
## gr.fix() "repairs" genome seqlevels; returns input with a repaired Seqinfo
##
## If "genome" not specified will replace NA 'seqlengths' in GRanges to reflect largest coordinate per 'seqlevel'
## and removes all 'NA seqlevels' after this fix.
##
## If "genome" defined (i.e. as Seqinfo object, or a BSgenome, GRanges, GRangesList object with populated \code{seqlengths}),
## then will replace \code{seqlengths} in \code{gr} with those for that genome
## 
## Args:
## 'genome' Genome to fix to: Seqinfo, BSgenome, GRanges (w/seqlengths), GRangesList (w/seqlengths) (default = NULL)
## 'gname'  Name of the genome (optional) (default = NULL)
## 'drop'   If TRUE, remove ranges that are not present in the supplied genome (default = FALSE)
##
## Use:
## gr.fix(genome, gname, drop)
##

In [42]:
## before gr.fix()
seqlengths(grexample)

## after gr.fix()
seqlengths(gr.fix(grexample))


1: <NA>
1: 20

streduce()

streduce() returns a reduced, sorted GRanges with no strand information


In [43]:
## Function: 
## 'streduce' returns a reduced, sorted GRanges with no strand information, i.e. a GRanges with a minimal data footprint
##
## 'pad' Expand the input data before reducing (default = 0)
## 'sort' If TRUE, sorts the input (default = TRUE)
##
## Use:
## streduce(input_gr, pad, sort)
##

In [44]:
streduce(grexample)


GRanges object with 1 range and 0 metadata columns:
      seqnames    ranges strand
         <Rle> <IRanges>  <Rle>
  [1]        1   [3, 20]      *
  -------
  seqinfo: 1 sequence from an unspecified genome

gr.string()

gr.string() returns UCSC-style interval string corresponding to GRanges pile (i.e. chr:start-end)`


In [45]:
## Function: 
## 'gr.string' returns UCSC-style interval string corresponding to GRanges pile (i.e. chr:start-end)
##
##
## 'add.chr' If TRUE, prepends seqnames with "chr" (default = FALSE)
## 'mb' If TRUE, rounds to the nearest megabase (default = FALSE)
## 'round' If 'mb', 'round' sets the number of digits to round to. (default = 3)
## 'other.cols' Names of additional 'mcols' fields to add to the string (seperated by ";")
## 'pretty' If TRUE, outputs interval string in more readable format 
##
## Use:
## gr.string(example_dnase)
##

In [46]:
gr.string(grexample)


  1. '1:3-9+'
  2. '1:9-13-'
  3. '1:13-20-'

grl.string()

grl.string() generates a string representation of an input GRangesList


In [47]:
## Function: 
## 'grl.string' generates a string representation of an input GRangesList
##
## 'mb' If TRUE, returns as MB and round to "round" (default = FALSE)
## 'sep' Character to separate single \code{GRanges} ranges (default = ',')
##
## Use:
## grl.string(input_GRangesList)
##

In [48]:
## create input GRangeList 'grl_size_five'

grl_size_five = grl1[1:5]


grl.string(grl_size_five)


10
'5:146981323-146981323-,14:104605059-104605059+'
26
'9:140100229-140100229-,19:24309057-24309057-'
49
'5:132216750-132216750-,17:4803057-4803057+'
50
'11:9492855-9492855-,11:69514029-69514029-'
88
'2:228337137-228337137+,7:97736489-97736489+'

In [49]:
grl.string(grl_size_five, sep = ':', mb=TRUE)


10
'5:146.981-146.981-:14:104.605-104.605+'
26
'9:140.1-140.1-:19:24.309-24.309-'
49
'5:132.217-132.217-:17:4.803-4.803+'
50
'11:9.493-9.493-:11:69.514-69.514-'
88
'2:228.337-228.337+:7:97.736-97.736+'

grl.reduce()

grl.reduce() returns GRangesList with GRanges of intervals of the input GRanges +/- padding


In [50]:
## Function: 
## 'grl.reduce' returns GRangesList with GRanges of intervals of the input GRanges +/- padding
##
## 'pad' padding to add to ranges inside GRangesList before reducing (default = 0)
## 'clip' If TRUE, add padding to ranges inside GRangesList before reducing (default = FALSE)
##
## Use:
## grl.reduce(input_GRangesList)
##

In [51]:
## create input GRangeList 'grl_size_five'

grl_size_five = grl1[1:5]

print(grl_size_five)
print('')
print('apply grl.reduce(), with padding = 50')

grl.reduce(grl_size_five, pad = 50, clip=TRUE)


GRangesList object of length 5:
$10 
GRanges object with 2 ranges and 1 metadata column:
      seqnames                 ranges strand |       bin
         <Rle>              <IRanges>  <Rle> | <integer>
  [1]        5 [146981323, 146981323]      - |        10
  [2]       14 [104605059, 104605059]      + |        10

$26 
GRanges object with 2 ranges and 1 metadata column:
      seqnames                 ranges strand | bin
  [1]        9 [140100229, 140100229]      - |  26
  [2]       19 [ 24309057,  24309057]      - |  26

$49 
GRanges object with 2 ranges and 1 metadata column:
      seqnames                 ranges strand | bin
  [1]        5 [132216750, 132216750]      - |  49
  [2]       17 [  4803057,   4803057]      + |  49

...
<2 more elements>
-------
seqinfo: 25 sequences from an unspecified genome
[1] ""
[1] "apply grl.reduce(), with padding = 50"
Warning message in dt2gr(out):
“Warning: Coercing to GRanges via non-standard columns”
GRangesList object of length 5:
$10 
GRanges object with 2 ranges and 2 metadata columns:
      seqnames                 ranges strand |         run     group
         <Rle>              <IRanges>  <Rle> | <character> <integer>
  [1]       14 [104605009, 104605060]      * |         1 1         1
  [2]        5 [146981273, 146981324]      * |         1 2         1

$26 
GRanges object with 2 ranges and 2 metadata columns:
      seqnames                 ranges strand | run group
  [1]       19 [ 24309007,  24309058]      * | 2 1     2
  [2]        9 [140100179, 140100230]      * | 2 2     2

$49 
GRanges object with 2 ranges and 2 metadata columns:
      seqnames                 ranges strand | run group
  [1]       17 [  4803007,   4803058]      * | 3 1     3
  [2]        5 [132216700, 132216751]      * | 3 2     3

...
<2 more elements>
-------
seqinfo: 25 sequences from an unspecified genome

gr.duplicated()

gr.duplicated allows to restrict duplicates using "by" columns; returns boolean vector of match status


In [52]:
## Function: 
## 'gr.duplicated' allows to restrict duplicates using "by" columns and allows inexact matching
## It will return a boolean vector of the "match status"
##
## 'by' column used to restrict duplicates. See the 'by' argument for function gr.match()
## 'type' string 'type' used. See the 'type' argument for function gr.match()
##
## Use:
## gr.duplicated(input_GRangesList)
##

In [53]:
first_10_genes = example_genes[1:10]

gr.duplicated(first_10_genes)


Warning message:
“"ranges" method for Hits objects is deprecated. Please use
  overlapsRanges() instead.”
  1. FALSE
  2. FALSE
  3. TRUE
  4. TRUE
  5. FALSE
  6. FALSE
  7. FALSE
  8. FALSE
  9. FALSE
  10. FALSE

gr.dice()

gr.dice() "dices up" GRanges, returning a width=1 GRanges spanning the input


In [54]:
## Function: 
## 'gr.dice' "dices up" GRanges, returning a width=1 GRanges spanning the input
##
## Use:
## gr.dice(input_gr)
##

In [55]:
gr.dice(grexample)


GRangesList object of length 3:
$1 
GRanges object with 7 ranges and 1 metadata column:
      seqnames    ranges strand |           X
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1    [3, 3]      + |           A
  [2]        1    [4, 4]      + |           A
  [3]        1    [5, 5]      + |           A
  [4]        1    [6, 6]      + |           A
  [5]        1    [7, 7]      + |           A
  [6]        1    [8, 8]      + |           A
  [7]        1    [9, 9]      + |           A

$2 
GRanges object with 5 ranges and 1 metadata column:
      seqnames   ranges strand | X
  [1]        1 [13, 13]      - | B
  [2]        1 [12, 12]      - | B
  [3]        1 [11, 11]      - | B
  [4]        1 [10, 10]      - | B
  [5]        1 [ 9,  9]      - | B

$3 
GRanges object with 8 ranges and 1 metadata column:
      seqnames   ranges strand | X
  [1]        1 [20, 20]      - | C
  [2]        1 [19, 19]      - | C
  [3]        1 [18, 18]      - | C
  [4]        1 [17, 17]      - | C
  [5]        1 [16, 16]      - | C
  [6]        1 [15, 15]      - | C
  [7]        1 [14, 14]      - | C
  [8]        1 [13, 13]      - | C

-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths

grl.unlist()

grl.unlists executes a robust unlisting of GRangesList that keeps track of origin


In [56]:
## Function: 
## 'grl.unlist' executes a robust unlisting of GRangesList that keeps track of origin
##
## Motivation:
## 'grl.unlist()' does a "nice" unlist of a GRangesList object adding a field 'grl.ix' denoting 
## which element of the GRangesList each GRanges corresponds to,  and a field 'grl.iix' which 
## saves the (local) index that that gr was in its corresponding GRangesList item
##
## In this way, grl.unlist() is reversible, while BiocGenerics::unlist() is not.
##
## Use:
## grl.unlist(input_GRangesList)
##

In [57]:
head(grl.unlist(grl2))   ## using 'head()' as the resulting GRanges has 502 ranges


GRanges object with 6 ranges and 3 metadata columns:
      seqnames                 ranges strand |       bin    grl.ix   grl.iix
         <Rle>              <IRanges>  <Rle> | <integer> <integer> <integer>
  [1]        8 [110374809, 110374809]      - |      4678         1         1
  [2]       17 [ 72874459,  72874459]      + |      4678         1         2
  [3]        7 [116166641, 116166641]      + |      4713         2         1
  [4]       22 [ 21738147,  21738147]      - |      4713         2         2
  [5]        6 [ 32822006,  32822006]      + |      4723         3         1
  [6]        9 [136529010, 136529010]      - |      4723         3         2
  -------
  seqinfo: 25 sequences from an unspecified genome

gr.collapse()

gr.collapse() executes a robust unlisting of GRanges that keeps track of origin


In [58]:
## Function: 
## 'gr.collapse' executes a robust unlisting of GRanges that keeps track of origin
##
## Motivation:
## Similar to GenomicRanges::reduce() except only collapses *adjacent* ranges in the input
## In this way, grl.unlist() is reversible, while BiocGenerics::unlist() is not.
##
## Args:
## 'pad' Padding for not quite adjacent elements to be considered overlapping (default = 1)
##
## Use:
## gr.collapse(input_GRanges)
##

In [59]:
## gr.collapse(grexample)

rle.query()

rle.query() queries an 'RleList' representing genomic data, returning an Rle representing the (concatenated) vector of data


In [60]:
## Function: 
## 'rle.query' queries an 'RleList' representing genomic data, returning an  
## 'Rle' representing the (concatenated) vector of data
## This function has been implemeneted to be multithreaded via 'parallel:mc.cores()'
##
## Motivation:
## Recall: An Rle (run-length-encoded) vector is a specific representation of a vector
## An Rle is an efficient data structure if (1) the vector is quite large and/or
## (2) there are many consecutive values of the same identity. 
##
## Args:
## 'subject.rle' Rle
## 'query.gr' GRangeslist or GRanges
## 'chunksize' integer Number of \code{query.gr} ranges to consider in one memory chunk. (default = 1e9)
## 'mc.cores' Number of cores to apply when doing chunked operation (default = 1)
## 'verbose' Set the verbosity of the output (default = FALSE)
## 
## Use:
## rle.query(subject.rle, query.gr)
##

In [61]:
## an example RleList
chr1_Rle = Rle(10:1, 1:10)
chr2_Rle = Rle(10:1, 1:10)
example_rlelist = RleList( chr1=chr1_Rle, chr2=chr2_Rle)

print('example_rlelist')
print(example_rlelist)


print('rle.query()')
rle.query(example_rlelist, grexample)


[1] "example_rlelist"
RleList of length 2
$chr1
integer-Rle of length 55 with 10 runs
  Lengths:  1  2  3  4  5  6  7  8  9 10
  Values : 10  9  8  7  6  5  4  3  2  1

$chr2
integer-Rle of length 55 with 10 runs
  Lengths:  1  2  3  4  5  6  7  8  9 10
  Values : 10  9  8  7  6  5  4  3  2  1

[1] "rle.query()"
logical-Rle of length 20 with 1 run
  Lengths: 20
  Values : NA

grl.pivot()

grl.pivot() pivots GRangesList object 'x' by returning a new GRangesList 'y' whose kth item is GRanges object of ranges x[[i]][k] for all i in 1:length(x)


In [62]:
## Function: 
## 'grl.pivot' pivots GRangesList object 'x' by returning a new GRangesList 'y' whose
## kth item is GRanges object of ranges x[[i]][k] for all i in 1:length(x) 
## 
## e.g. If the length of a GRangesList `grl is 50, `length(grl)=50`, and the length of each GRanges element inside is 2, 
## then 'grl.pivot(grl' will produce a length 3 GRangesList with 50 elements per GRanges
##
## Note: Assumes all GRanges in "x" are of equal length
##
##
## Use:
## grl.pivot(input_GRangesList)
##

In [63]:
head(grl.pivot(grl1))


GRangesList object of length 2:
$1 
GRanges object with 250 ranges and 1 metadata column:
       seqnames                 ranges strand |       bin
          <Rle>              <IRanges>  <Rle> | <integer>
    10        5 [146981323, 146981323]      - |        10
    26        9 [140100229, 140100229]      - |        26
    49        5 [132216750, 132216750]      - |        49
    50       11 [  9492855,   9492855]      - |        50
    88        2 [228337137, 228337137]      + |        88
   ...      ...                    ...    ... .       ...
  4601        2 [ 69002312,  69002312]      + |      4601
  4604        6 [136173136, 136173136]      + |      4604
  4643        8 [ 22005672,  22005672]      - |      4643
  4673        4 [ 71554394,  71554394]      - |      4673
  4678        8 [110374809, 110374809]      - |      4678

...
<1 more element>
-------
seqinfo: 25 sequences from an unspecified genome

gr.sub()

gr.sub() applies gsub to seqlevels of input GRanges


In [64]:
## Function: 
## 'gr.sub' applies gsub to seqlevels of input GRanges
## by default, removes 'chr', and "0.1" suffixes, and replaces "MT" with "M"
## kth item is GRanges object of ranges x[[i]][k] for all i in 1:length(x) 
## 
## 'a' vector of regular expressions of things to substitute out (default = c("(^chr)(\\.1$)", "MT"))
## 'b' vector of values to substitute in (default = c("", "M"))
## 
## Use:
## gr.sub(input_GRanges)
##

In [65]:
seqlevels(grexample) = 'chr1'
seqlevels(gr.sub(grexample))


'chr1'

seg2gr()

seg2gr() converts "GRanges-like" data.frame into GRanges


In [66]:
## Function: 
## 'seg2gr' converts "GRanges-like" data.frame into GRanges
##
## There are a number of acceptable columns to specify chromsomes, segment starts, segment ends, and strand
## Please refer to 'standardize_segs()' for full details.
## 
## Use:
## seg2gr(input_data.frame)
##

In [67]:
dt[, strand := '*']  ## define strand column in data.table dt
## similar to defining columns in data.frame, i.e.'dt$strand = "*"'

print('print(dt) outputs')
print(dt)

print('seg2gr(dt) outputs')
seg2gr(dt)   ## outputs converted data.table


[1] "print(dt) outputs"
   seqnames start end strand
1:        1     2   3      *
2:        1     5   8      *
3:        1    10  15      *
[1] "seg2gr(dt) outputs"
GRanges object with 3 ranges and 0 metadata columns:
      seqnames    ranges strand
         <Rle> <IRanges>  <Rle>
  [1]        1  [ 2,  3]      *
  [2]        1  [ 5,  8]      *
  [3]        1  [10, 15]      *
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

In [68]:
## segments_dt = data.table(patient = c('patient11235813', 'patient3141592', 'patient42'), 
##                         chromosome = c('chr3', 'chr5', 'chrX'), 
##                         start_position = c(1000, 2000, 3000), 
##                         end_position = c(5000, 6000, 7000), 
##                         strand = c('-', '+', '*'))
##                         
## 
## seg2gr(segments_dt)
##

grl.eval()

grl.eval() evaluates and aggregates an R expression on GRanges column in GRangesList


In [69]:
## Function: 
## 'grl.eval' evaluates and aggregates an R expression on GRanges column in GRangesList
##
## Args:
## 'expr' Any syntactically valid R expression, on columns of GRanges or GRangesList
## 
## Use:
## grl.eval(input_GRangesList, expr=valid_R_expression)
##

In [70]:
## NOTE: only printing first 5 rows

## calculate 'bin ** 2' whereby bin is the metadata column in 'grl1'
grl.eval(grl1, bin**2)[1:5]  

## calculate 'width() - 5' 
grl.eval(grl1, width-5)[1:5]


  1. 100
  2. 100
  3. 676
  4. 676
  5. 2401
  1. -4
  2. -4
  3. -4
  4. -4
  5. -4

gr.simplify()

gr.simplify() calculates the pairwise-distance for SV rearrangements represented by GRangesList


In [71]:
## Function: 
## 'gr.simplify' calculates the pairwise-distance for SV rearrangements represented by GRangesList
##
## Args:
## 'field' character scalar, corresponding to value field of input GRanges. (default = NULL)
## val character scalar (default = NULL)
## include.val If TRUE, will include in out input GRanges the values field of first matching record in said input GRanges. (default = TRUE)
## split If TRUE, will split the output into GRangesList split by "field". (default = FALSE)
## pad integer Pad ranges by this amount before doing merge. [1], which merges contiguous but non-overlapping ranges. (default = 1)
##
## 
## Use:
## gr.simplify(input_GRanges, field = NULL, val = NULL, include.val = TRUE, split = FALSE, pad = 1)
##

In [72]:
gr.simplify(grexample, pad=4, field="name", split = TRUE)


GRangesList object of length 3:
$A 
GRanges object with 1 range and 1 metadata column:
      seqnames    ranges strand |           X
         <Rle> <IRanges>  <Rle> | <character>
  [1]     chr1    [3, 9]      + |           A

$B 
GRanges object with 1 range and 1 metadata column:
      seqnames  ranges strand | X
  [1]     chr1 [9, 13]      - | B

$C 
GRanges object with 1 range and 1 metadata column:
      seqnames   ranges strand | X
  [1]     chr1 [13, 20]      - | C

-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths

gr.parse()

gr.parse() returns GRanges parsed from input IGV-/UCSC-style strings


In [73]:
## Function: 
## 'gr.parse' returns GRanges parsed from input IGV-/UCSC-style strings
## 
## Use:
## gr.parse(input_IGV_UCSC_strings)
##

In [74]:
parse.gr(c('1:1e6-5e6+', '2:2e6-5e6-'))


GRanges object with 2 ranges and 0 metadata columns:
      seqnames             ranges strand
         <Rle>          <IRanges>  <Rle>
  [1]        1 [1000000, 5000000]      +
  [2]        2 [2000000, 5000000]      -
  -------
  seqinfo: 25 sequences from an unspecified genome

grl.parse()

grl.parse() eturns GRangesList parsed from input IGV-/UCSC-style strings


In [75]:
## Function: 
## 'grl.parse' returns GRangesList parsed from input IGV-/UCSC-style strings
## 
## Use:
## grl.parse(input_IGV_UCSC_strings)
##

In [76]:
parse.grl(c('1:1e6-5e6+;5:10-2000', '2:2e6-5e6-;10:100231321-100231399'))


GRangesList object of length 2:
[[1]] 
GRanges object with 2 ranges and 0 metadata columns:
      seqnames             ranges strand
         <Rle>          <IRanges>  <Rle>
  [1]        1 [1000000, 5000000]      +
  [2]        5 [     10,    2000]      *

[[2]] 
GRanges object with 2 ranges and 0 metadata columns:
      seqnames                 ranges strand
  [1]        2 [  2000000,   5000000]      -
  [2]       10 [100231321, 100231399]      *

-------
seqinfo: 25 sequences from an unspecified genome

gr.sum()

gr.sum() sums either by doing coverage and either weighting them equally or using a field "weight".


In [77]:
## Function: 
## gr.sum() sums either by doing coverage and either weighting them equally
## or using a field "weight".  Will return either sum or average.
##
## Args: 
## 'field' field from input GRanges to use as a weight
## 'mean' if TRUE, calculates mena. Flag specifies whether to divide the output at each interval but the total number of intervals overlapping it (only applies if field == NULL) (default FALSE)
##
## gr.sum(input_GRanges)

In [78]:
gr.sum(grexample)


GRanges object with 6 ranges and 1 metadata column:
      seqnames    ranges strand |     score
         <Rle> <IRanges>  <Rle> | <numeric>
  [1]     chr1  [ 1,  2]      * |         0
  [2]     chr1  [ 3,  8]      * |         1
  [3]     chr1  [ 9,  9]      * |         2
  [4]     chr1  [10, 12]      * |         1
  [5]     chr1  [13, 13]      * |         2
  [6]     chr1  [14, 20]      * |         1
  -------
  seqinfo: 1 sequence from an unspecified genome

In [79]:
grexample$GC = c(0.75, 1.0, 0.25)  ## add 'GC' column

gr.sum(grexample, field='GC')


GRanges object with 6 ranges and 1 metadata column:
      seqnames    ranges strand |        GC
         <Rle> <IRanges>  <Rle> | <numeric>
  [1]     chr1  [ 1,  2]      * |         0
  [2]     chr1  [ 3,  8]      * |      0.75
  [3]     chr1  [ 9,  9]      * |      1.75
  [4]     chr1  [10, 12]      * |         1
  [5]     chr1  [13, 13]      * |      1.25
  [6]     chr1  [14, 20]      * |      0.25
  -------
  seqinfo: 1 sequence from an unspecified genome

In [ ]:


In [ ]:


In [ ]: