Load packages



In [1]:

    
suppressWarnings(suppressMessages(library(gUtils)))   
suppressWarnings(suppressMessages(library(BSgenome.Hsapiens.UCSC.hg19)))
Sys.setenv(DEFAULT_BSGENOME = 'BSgenome.Hsapiens.UCSC.hg19::Hsapiens')

Create example GRanges and data.table objects



In [2]:

    
## instantiate an example GRanges objects, 'gr' and data.table 'dt'

grexample = GRanges(1, IRanges(c(3, 9, 13), c(9, 13, 20)), strand=c('+','-','-'), name=c("A","B","C"))

dt = data.table(seqnames=1, start=c(2,5,10), end=c(3,8,15))


print('grexample')
print(grexample)
print('dt')
print(dt)









    



[1] "grexample"
GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 3,  9]      + |           A
  [2]        1  [ 9, 13]      - |           B
  [3]        1  [13, 20]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths
[1] "dt"
   seqnames start end
1:        1     2   3
2:        1     5   8
3:        1    10  15

gr.start()

gr.start() returns a GRanges of the first coordinate (or first k coordinates) in each interval



In [3]:

    
## Function: 
## 'gr.start()' returns a GRanges of the first coordinate (or first k coordinates) in each interval 
## 
## Args:
## 'width' integer Specify subranges of greater width including the start of the range. (default = 1)
## 'force' boolean If TRUE, allows returned GRanges to have ranges outside of its 'Seqinfo' bounds. (default = FALSE)
## 'ignore.strand' If FALSE, gr.start() will extend '-' strands from the other direction (default = TRUE)
## 'clip' boolean  If TRUE, trims returned GRanges so that it does not extend beyond bounds of the input (default = TRUE)
## 
## Use:
## gr.start(grexample) = returns start of each genomic range in the input 'grexample'
##



In [4]:

    
gr.start(grexample)









    



Warning message in gr.start(grexample):
“Warning: Check or fix seqlengths, some are equal 0 or NA, may lead to negative widths”





    





GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 3,  3]      + |           A
  [2]        1  [ 9,  9]      - |           B
  [3]        1  [13, 13]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths



In [5]:

    
gr.start(grexample, width=3)









    



Warning message in gr.start(grexample, width = 3):
“Warning: Check or fix seqlengths, some are equal 0 or NA, may lead to negative widths”





    





GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 3,  5]      + |           A
  [2]        1  [ 9, 11]      - |           B
  [3]        1  [13, 15]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths



In [6]:

    
## if argument 'width' larger than width(grexample), then the corresponding output will have subranges specified to the range end 
gr.start(grexample, width=50)









    



Warning message in gr.start(grexample, width = 50):
“Warning: Check or fix seqlengths, some are equal 0 or NA, may lead to negative widths”





    





GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 3,  9]      + |           A
  [2]        1  [ 9, 13]      - |           B
  [3]        1  [13, 20]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths



In [7]:

    
gr.start(grexample, ignore.strand=FALSE)









    



Warning message in gr.start(grexample, ignore.strand = FALSE):
“Warning: Check or fix seqlengths, some are equal 0 or NA, may lead to negative widths”





    





GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 3,  3]      + |           A
  [2]        1  [13, 13]      - |           B
  [3]        1  [20, 20]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

gr.end()

gr.end() returns a GRanges of the last coordinate (or last k coordinates) in each interval



In [8]:

    
## Function: 
## 'gr.end()' returns a GRanges of the last coordinate (or last k coordinates) in each interval 
## 
## Args:
## 'width' integer Specify subranges of greater width including the start of the range. (default = 1)
## 'force' boolean If TRUE, allows returned GRanges to have ranges outside of its 'Seqinfo' bounds. (default = FALSE)
## 'ignore.strand' If FALSE, gr.start() will extend '-' strands from the other direction (default = TRUE)
## 'clip' boolean  If TRUE, trims returned GRanges so that it does not extend beyond bounds of the input (default = TRUE)
## 
## Use:
## gr.end(grexample) = returns end of each genomic range in the input 'grexample'
##



In [9]:

    
gr.end(grexample)









    



Warning message in gr.end(grexample):
“Warning: Check or fix seqlengths, some are equal 0 or NA, may lead to negative widths”





    





GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 9,  9]      + |           A
  [2]        1  [13, 13]      - |           B
  [3]        1  [20, 20]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths



In [10]:

    
gr.end(grexample, width=3)









    



Warning message in gr.end(grexample, width = 3):
“Warning: Check or fix seqlengths, some are equal 0 or NA, may lead to negative widths”





    





GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 7,  9]      + |           A
  [2]        1  [11, 13]      - |           B
  [3]        1  [18, 20]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths



In [11]:

    
## if argument 'width' larger than width(grexample), subranges specified to the range start
gr.end(grexample, width=50)









    



Warning message in gr.end(grexample, width = 50):
“Warning: Check or fix seqlengths, some are equal 0 or NA, may lead to negative widths”





    





GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 3,  9]      + |           A
  [2]        1  [ 9, 13]      - |           B
  [3]        1  [13, 20]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths



In [12]:

    
gr.end(grexample, ignore.strand=FALSE)









    



Warning message in gr.end(grexample, ignore.strand = FALSE):
“Warning: Check or fix seqlengths, some are equal 0 or NA, may lead to negative widths”





    





GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 9,  9]      + |           A
  [2]        1  [ 9,  9]      - |           B
  [3]        1  [13, 13]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

gr.mid()

gr.mid() returns a GRanges of the last coordinate (or last k coordinates) in each interval



In [13]:

    
## Function: 
## 'gr.mid()' returns a GRanges of the midpoint coordinate in each interval 
## 
## Use:
## gr.mid(grexample) = returns midpoints of each genomic range in the input 'grexample'
##



In [14]:

    
gr.mid(grexample)









    





GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 6,  6]      + |           A
  [2]        1  [11, 11]      - |           B
  [3]        1  [16, 16]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths



In [15]:

    
## midpoint between (100, 200)

gr.mid(GRanges(1, IRanges(100, 200), seqinfo=Seqinfo("1", 200)))









    





GRanges object with 1 range and 0 metadata columns:
      seqnames     ranges strand
         <Rle>  <IRanges>  <Rle>
  [1]        1 [150, 150]      *
  -------
  seqinfo: 1 sequence from an unspecified genome

dt2gr()

dt2gr() inputs an R data.table/data.frame and outputs a GRanges, if columns 'start', 'end', 'seqnames' exist



In [16]:

    
## Function: 
## 'dt2gr()' inputs an R data.table/data.frame and outputs a GRanges, if columns 'start', 'end', 'seqnames' exist
## 
## Args:
## 'seqlengths' named integer vector representing genome (default = hg_seqlengths())
## 'seqinfo' Seqinfo of output GRanges object
##
## Use:
## dt2gr(dt)
##



In [17]:

    
dt2gr(dt)  ## data.table converted to GRanges









    





GRanges object with 3 ranges and 0 metadata columns:
      seqnames    ranges strand
         <Rle> <IRanges>  <Rle>
  [1]        1  [ 2,  3]      *
  [2]        1  [ 5,  8]      *
  [3]        1  [10, 15]      *
  -------
  seqinfo: 25 sequences from an unspecified genome

gr2dt()

gr2dt() returns GRanges 'input_gr' converted into an R data.table/data.frame



In [18]:

    
## Function: 
## 'gr2dt()' returns GRanges converted into an R data.table/data.frame 
##
## Use:
## gr2dt(grexample)
##



In [19]:

    
gr2dt(grexample)   ## GRanges converted to data.table

print(is(gr2dt(grexample), 'data.table'))









    





seqnames start end strand width name

	1  3  9 + 7 A 
	1  9 13 - 5 B 
	1 13 20 - 8 C 









    



[1] TRUE

gr.tile()

gr.tile() tiles the input GRanges 'input_gr' with non-overlapping bins of width K.



In [20]:

    
## Function: 
## 'gr.tile()' tiles the input GRanges 'input_gr' with non-overlapping bins of width K.
## 
## Args:
## 'width' the width of each tile (default = 1e3)
##
## Use:
## gr.tile(input_gr, width = K)
##
## Refer to function 'gr.tile.map()'



In [21]:

    
gr.tile(grexample, 2)









    





GRanges object with 11 ranges and 2 metadata columns:
       seqnames    ranges strand |  query.id   tile.id
          <Rle> <IRanges>  <Rle> | <integer> <integer>
   [1]        1  [ 3,  4]      + |         1         1
   [2]        1  [ 5,  6]      + |         1         2
   [3]        1  [ 7,  8]      + |         1         3
   [4]        1  [ 9,  9]      + |         1         4
   [5]        1  [ 9, 10]      - |         2         5
   [6]        1  [11, 12]      - |         2         6
   [7]        1  [13, 13]      - |         2         7
   [8]        1  [13, 14]      - |         3         8
   [9]        1  [15, 16]      - |         3         9
  [10]        1  [17, 18]      - |         3        10
  [11]        1  [19, 20]      - |         3        11
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths



In [22]:

    
## create 10 tiles of width 10 from start=1 to end=100
gr.tile(GRanges(1, IRanges(1,100)), width=10)









    





GRanges object with 10 ranges and 2 metadata columns:
       seqnames    ranges strand |  query.id   tile.id
          <Rle> <IRanges>  <Rle> | <integer> <integer>
   [1]        1 [ 1,  10]      * |         1         1
   [2]        1 [11,  20]      * |         1         2
   [3]        1 [21,  30]      * |         1         3
   [4]        1 [31,  40]      * |         1         4
   [5]        1 [41,  50]      * |         1         5
   [6]        1 [51,  60]      * |         1         6
   [7]        1 [61,  70]      * |         1         7
   [8]        1 [71,  80]      * |         1         8
   [9]        1 [81,  90]      * |         1         9
  [10]        1 [91, 100]      * |         1        10
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

gr.chr()

gr.chr() prepends 'chr' to GRanges seqlevels()



In [23]:

    
## Function: 
## 'gr.chr()' prepends 'chr' to GRanges seqlevels()
## 
## Use:
## gr.chr(grexample)
##



In [24]:

    
gr.chr(grexample)









    





GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]     chr1  [ 3,  9]      + |           A
  [2]     chr1  [ 9, 13]      - |           B
  [3]     chr1  [13, 20]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

gr.nochr()

gr.nochr() removes the prefix 'chr' from GRanges seqlevels()



In [25]:

    
## Function: 
## 'gr.nochr()' removes the prefix 'chr' from GRanges seqlevels()
## 
## Use:
## gr.nochr(grexample)
##



In [26]:

    
gr.nochr(gr.chr(grexample))









    





GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 3,  9]      + |           A
  [2]        1  [ 9, 13]      - |           B
  [3]        1  [13, 20]      - |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

gr.stripstrand()

gr.stripstrand() sets input GRanges strand to '*'



In [27]:

    
## Function: 
## gr.stripstrand() sets input GRanges strand to '*'
## 
## Use:
## gr.stripstrand(grexample)
##



In [28]:

    
gr.stripstrand(grexample)









    





GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 3,  9]      * |           A
  [2]        1  [ 9, 13]      * |           B
  [3]        1  [13, 20]      * |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

gr.strandflip()

gr.strandflip() reverses GRanges strand signs



In [29]:

    
## Function: 
## gr.flipstrand() inverts GRanges strand, '+' <-> '-'
## 
## Use:
## gr.flipstrand(grexample)
##



In [30]:

    
gr.flipstrand(grexample)









    





GRanges object with 3 ranges and 1 metadata column:
      seqnames    ranges strand |        name
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1  [ 3,  9]      - |           A
  [2]        1  [ 9, 13]      + |           B
  [3]        1  [13, 20]      + |           C
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

gr.flatten()

gr.flatten() returns input GRanges as data.frame with input ranges superimposed onto a single "flattened" chromosome, with ranges laid end-to-end



In [31]:

    
## Function: 
## gr.flatten() returns input GRanges as data.frame with input ranges superimposed 
## onto a single "flattened" chromosome, with ranges laid end-to-end
## 
## Args:
## 'gap' number of bases between ranges on the new chromosome (default = 0) 
## 
## Use:
## gr.flatten(grexample)
##



In [32]:

    
gr.flatten(grexample)









    





start end name

	 1  7 A 
	 8 12 B 
	13 20 C



In [33]:

    
gr.flatten(grexample, gap=500)









    





start end name

	   1    7 A   
	 508  512 B   
	1013 1020 C

gr.rand()

gr.rand(w, genome) returns randomly-generated GRanges from input genome



In [34]:

    
## Function: 
## gr.rand(w, genome) returns randomly-generated GRanges from input genome
## 
## Args:
## 'w' vector of widths, whereby 'length(w)' determines length of output
## 'genome' GRanges, GRangesList, or Seqinfo genome
## 
## Use:
## gr.rand(w, genome)
##



In [35]:

    
## Generate 5 non-overlapping regions of width 10 on hg19

## set seed for reproducibility 

set.seed(123)

gr.rand(rep(10,5), BSgenome.Hsapiens.UCSC.hg19::Hsapiens)









    





GRanges object with 5 ranges and 0 metadata columns:
      seqnames               ranges strand
         <Rle>            <IRanges>  <Rle>
  [1]     chr5 [20550152, 20550161]      *
  [2]    chr16 [63222704, 63222713]      *
  [3]     chr7 [49369246, 49369255]      *
  [4]    chr20 [51594129, 51594138]      *
  [5]     chrX [69363664, 69363673]      *
  -------
  seqinfo: 93 sequences from an unspecified genome

gr.trim()

gr.trim() "trims" input GRanges relative to the specified coordinates of each range



In [36]:

    
## Function: 
## 'gr.trim()' trims input GRanges relative to the specified <local> coordinates of each range
## The motivation behind this function is to fill the role of current GRanges functionality
## e.g. shift(), reduce(), restrict(), shift(), resize(), flank()
## 
## Args:
## 'start' vector of widths, whereby 'length(w)' determines length of output
## 'end' GRanges, GRangesList, or Seqinfo genome
## 
## Use:
## gr.trim(input_gr, starts, ends)
##



In [37]:

    
GRanges(1, IRanges(1e6, width=1000))









    





GRanges object with 1 range and 0 metadata columns:
      seqnames             ranges strand
         <Rle>          <IRanges>  <Rle>
  [1]        1 [1000000, 1000999]      *
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths



In [38]:

    
## Given GRangeswith genomic coordinates 1:1,000,000-1,001,000, 
## gr.trim() to "trim" the first 20 bases and last 50 bases 

gr.trim(GRanges(1, IRanges(1e6, width=1500)), starts=20, ends=950)









    





GRanges object with 1 range and 0 metadata columns:
      seqnames             ranges strand
         <Rle>          <IRanges>  <Rle>
  [1]        1 [1000019, 1000949]      *
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths

gr.sample()

gr.sample(gr, k, wid) samples input GRanges intervals within a given territory



In [39]:

    
## Function: 
## gr.sample() samples input GRanges intervals within a given territory
## 
## Args:
## 'k'  number of ranges to sample
## 'wid' length of the GRanges element to produce (default = 100)
## 'replace' If TRUE, will bootstrap. If FALSE, otherwise will sample without replacement. (default = TRUE)
## 
## 'k'  integer Number of ranges to sample
## 'wid' integer Length of the \code{GRanges} element to produce (default = 100)
## 'replace' boolean If TRUE, will bootstrap, otherwise will sample without replacement. (default = TRUE)
## 
## If 'k' is a scalar then 'gr.rand()' will (uniformly) select 'k' intervals from the summed territory of GRanges
## If 'k' is a vector of 'length(gr)' then will uniformly select 'k' intervals from each.
##
## Note: This is different from the function GenomicRanges::sample(), which samples from a pile of GRanges
##
## Use:
## gr.sample(gr, k, wid, replace = TRUE)
##



In [40]:

    
## sample five GRanges of length 10 each from territory of RefSeq genes

## set seed for reproducibility 

set.seed(42)

gr.sample(reduce(example_genes), k=5, wid=10)









    





GRanges object with 5 ranges and 1 metadata column:
      seqnames               ranges strand |  query.id
         <Rle>            <IRanges>  <Rle> | <integer>
  [1]        4 [90662968, 90662977]      * |      4619
  [2]       11 [27367873, 27367882]      * |     10217
  [3]       16 [87924776, 87924785]      * |     13795
  [4]       19 [52504313, 52504322]      * |     16412
  [5]       21 [30486843, 30486852]      * |     17028
  -------
  seqinfo: 25 sequences from an unspecified genome

gr.fix()

gr.fix() "repairs" genome seqlevels; returns input with a repaired Seqinfo



In [41]:

    
## Function: 
## gr.fix() "repairs" genome seqlevels; returns input with a repaired Seqinfo
##
## If "genome" not specified will replace NA 'seqlengths' in GRanges to reflect largest coordinate per 'seqlevel'
## and removes all 'NA seqlevels' after this fix.
##
## If "genome" defined (i.e. as Seqinfo object, or a BSgenome, GRanges, GRangesList object with populated \code{seqlengths}),
## then will replace \code{seqlengths} in \code{gr} with those for that genome
## 
## Args:
## 'genome' Genome to fix to: Seqinfo, BSgenome, GRanges (w/seqlengths), GRangesList (w/seqlengths) (default = NULL)
## 'gname'  Name of the genome (optional) (default = NULL)
## 'drop'   If TRUE, remove ranges that are not present in the supplied genome (default = FALSE)
##
## Use:
## gr.fix(genome, gname, drop)
##



In [42]:

    
## before gr.fix()
seqlengths(grexample)

## after gr.fix()
seqlengths(gr.fix(grexample))

streduce()

streduce() returns a reduced, sorted GRanges with no strand information



In [43]:

    
## Function: 
## 'streduce' returns a reduced, sorted GRanges with no strand information, i.e. a GRanges with a minimal data footprint
##
## 'pad' Expand the input data before reducing (default = 0)
## 'sort' If TRUE, sorts the input (default = TRUE)
##
## Use:
## streduce(input_gr, pad, sort)
##



In [44]:

    
streduce(grexample)









    





GRanges object with 1 range and 0 metadata columns:
      seqnames    ranges strand
         <Rle> <IRanges>  <Rle>
  [1]        1   [3, 20]      *
  -------
  seqinfo: 1 sequence from an unspecified genome

gr.string()

gr.string() returns UCSC-style interval string corresponding to GRanges pile (i.e. chr:start-end)`



In [45]:

    
## Function: 
## 'gr.string' returns UCSC-style interval string corresponding to GRanges pile (i.e. chr:start-end)
##
##
## 'add.chr' If TRUE, prepends seqnames with "chr" (default = FALSE)
## 'mb' If TRUE, rounds to the nearest megabase (default = FALSE)
## 'round' If 'mb', 'round' sets the number of digits to round to. (default = 3)
## 'other.cols' Names of additional 'mcols' fields to add to the string (seperated by ";")
## 'pretty' If TRUE, outputs interval string in more readable format 
##
## Use:
## gr.string(example_dnase)
##



In [46]:

    
gr.string(grexample)









    





	'1:3-9+'
	'1:9-13-'
	'1:13-20-'

grl.string()

grl.string() generates a string representation of an input GRangesList



In [47]:

    
## Function: 
## 'grl.string' generates a string representation of an input GRangesList
##
## 'mb' If TRUE, returns as MB and round to "round" (default = FALSE)
## 'sep' Character to separate single \code{GRanges} ranges (default = ',')
##
## Use:
## grl.string(input_GRangesList)
##



In [48]:

    
## create input GRangeList 'grl_size_five'

grl_size_five = grl1[1:5]


grl.string(grl_size_five)









    





	10
		'5:146981323-146981323-,14:104605059-104605059+'
	26
		'9:140100229-140100229-,19:24309057-24309057-'
	49
		'5:132216750-132216750-,17:4803057-4803057+'
	50
		'11:9492855-9492855-,11:69514029-69514029-'
	88
		'2:228337137-228337137+,7:97736489-97736489+'



In [49]:

    
grl.string(grl_size_five, sep = ':', mb=TRUE)









    





	10
		'5:146.981-146.981-:14:104.605-104.605+'
	26
		'9:140.1-140.1-:19:24.309-24.309-'
	49
		'5:132.217-132.217-:17:4.803-4.803+'
	50
		'11:9.493-9.493-:11:69.514-69.514-'
	88
		'2:228.337-228.337+:7:97.736-97.736+'

grl.reduce()

grl.reduce() returns GRangesList with GRanges of intervals of the input GRanges +/- padding



In [50]:

    
## Function: 
## 'grl.reduce' returns GRangesList with GRanges of intervals of the input GRanges +/- padding
##
## 'pad' padding to add to ranges inside GRangesList before reducing (default = 0)
## 'clip' If TRUE, add padding to ranges inside GRangesList before reducing (default = FALSE)
##
## Use:
## grl.reduce(input_GRangesList)
##



In [51]:

    
## create input GRangeList 'grl_size_five'

grl_size_five = grl1[1:5]

print(grl_size_five)
print('')
print('apply grl.reduce(), with padding = 50')

grl.reduce(grl_size_five, pad = 50, clip=TRUE)









    



GRangesList object of length 5:
$10 
GRanges object with 2 ranges and 1 metadata column:
      seqnames                 ranges strand |       bin
         <Rle>              <IRanges>  <Rle> | <integer>
  [1]        5 [146981323, 146981323]      - |        10
  [2]       14 [104605059, 104605059]      + |        10

$26 
GRanges object with 2 ranges and 1 metadata column:
      seqnames                 ranges strand | bin
  [1]        9 [140100229, 140100229]      - |  26
  [2]       19 [ 24309057,  24309057]      - |  26

$49 
GRanges object with 2 ranges and 1 metadata column:
      seqnames                 ranges strand | bin
  [1]        5 [132216750, 132216750]      - |  49
  [2]       17 [  4803057,   4803057]      + |  49

...
<2 more elements>
-------
seqinfo: 25 sequences from an unspecified genome
[1] ""
[1] "apply grl.reduce(), with padding = 50"






    



Warning message in dt2gr(out):
“Warning: Coercing to GRanges via non-standard columns”





    





GRangesList object of length 5:
$10 
GRanges object with 2 ranges and 2 metadata columns:
      seqnames                 ranges strand |         run     group
         <Rle>              <IRanges>  <Rle> | <character> <integer>
  [1]       14 [104605009, 104605060]      * |         1 1         1
  [2]        5 [146981273, 146981324]      * |         1 2         1

$26 
GRanges object with 2 ranges and 2 metadata columns:
      seqnames                 ranges strand | run group
  [1]       19 [ 24309007,  24309058]      * | 2 1     2
  [2]        9 [140100179, 140100230]      * | 2 2     2

$49 
GRanges object with 2 ranges and 2 metadata columns:
      seqnames                 ranges strand | run group
  [1]       17 [  4803007,   4803058]      * | 3 1     3
  [2]        5 [132216700, 132216751]      * | 3 2     3

...
<2 more elements>
-------
seqinfo: 25 sequences from an unspecified genome

gr.duplicated()

gr.duplicated allows to restrict duplicates using "by" columns; returns boolean vector of match status



In [52]:

    
## Function: 
## 'gr.duplicated' allows to restrict duplicates using "by" columns and allows inexact matching
## It will return a boolean vector of the "match status"
##
## 'by' column used to restrict duplicates. See the 'by' argument for function gr.match()
## 'type' string 'type' used. See the 'type' argument for function gr.match()
##
## Use:
## gr.duplicated(input_GRangesList)
##



In [53]:

    
first_10_genes = example_genes[1:10]

gr.duplicated(first_10_genes)









    



Warning message:
“"ranges" method for Hits objects is deprecated. Please use
  overlapsRanges() instead.”





    





	FALSE
	FALSE
	TRUE
	TRUE
	FALSE
	FALSE
	FALSE
	FALSE
	FALSE
	FALSE

gr.dice()

gr.dice() "dices up" GRanges, returning a width=1 GRanges spanning the input



In [54]:

    
## Function: 
## 'gr.dice' "dices up" GRanges, returning a width=1 GRanges spanning the input
##
## Use:
## gr.dice(input_gr)
##



In [55]:

    
gr.dice(grexample)









    





GRangesList object of length 3:
$1 
GRanges object with 7 ranges and 1 metadata column:
      seqnames    ranges strand |           X
         <Rle> <IRanges>  <Rle> | <character>
  [1]        1    [3, 3]      + |           A
  [2]        1    [4, 4]      + |           A
  [3]        1    [5, 5]      + |           A
  [4]        1    [6, 6]      + |           A
  [5]        1    [7, 7]      + |           A
  [6]        1    [8, 8]      + |           A
  [7]        1    [9, 9]      + |           A

$2 
GRanges object with 5 ranges and 1 metadata column:
      seqnames   ranges strand | X
  [1]        1 [13, 13]      - | B
  [2]        1 [12, 12]      - | B
  [3]        1 [11, 11]      - | B
  [4]        1 [10, 10]      - | B
  [5]        1 [ 9,  9]      - | B

$3 
GRanges object with 8 ranges and 1 metadata column:
      seqnames   ranges strand | X
  [1]        1 [20, 20]      - | C
  [2]        1 [19, 19]      - | C
  [3]        1 [18, 18]      - | C
  [4]        1 [17, 17]      - | C
  [5]        1 [16, 16]      - | C
  [6]        1 [15, 15]      - | C
  [7]        1 [14, 14]      - | C
  [8]        1 [13, 13]      - | C

-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths

grl.unlist()

grl.unlists executes a robust unlisting of GRangesList that keeps track of origin



In [56]:

    
## Function: 
## 'grl.unlist' executes a robust unlisting of GRangesList that keeps track of origin
##
## Motivation:
## 'grl.unlist()' does a "nice" unlist of a GRangesList object adding a field 'grl.ix' denoting 
## which element of the GRangesList each GRanges corresponds to,  and a field 'grl.iix' which 
## saves the (local) index that that gr was in its corresponding GRangesList item
##
## In this way, grl.unlist() is reversible, while BiocGenerics::unlist() is not.
##
## Use:
## grl.unlist(input_GRangesList)
##



In [57]:

    
head(grl.unlist(grl2))   ## using 'head()' as the resulting GRanges has 502 ranges









    





GRanges object with 6 ranges and 3 metadata columns:
      seqnames                 ranges strand |       bin    grl.ix   grl.iix
         <Rle>              <IRanges>  <Rle> | <integer> <integer> <integer>
  [1]        8 [110374809, 110374809]      - |      4678         1         1
  [2]       17 [ 72874459,  72874459]      + |      4678         1         2
  [3]        7 [116166641, 116166641]      + |      4713         2         1
  [4]       22 [ 21738147,  21738147]      - |      4713         2         2
  [5]        6 [ 32822006,  32822006]      + |      4723         3         1
  [6]        9 [136529010, 136529010]      - |      4723         3         2
  -------
  seqinfo: 25 sequences from an unspecified genome

gr.collapse()

gr.collapse() executes a robust unlisting of GRanges that keeps track of origin



In [58]:

    
## Function: 
## 'gr.collapse' executes a robust unlisting of GRanges that keeps track of origin
##
## Motivation:
## Similar to GenomicRanges::reduce() except only collapses *adjacent* ranges in the input
## In this way, grl.unlist() is reversible, while BiocGenerics::unlist() is not.
##
## Args:
## 'pad' Padding for not quite adjacent elements to be considered overlapping (default = 1)
##
## Use:
## gr.collapse(input_GRanges)
##



In [59]:

    
## gr.collapse(grexample)

rle.query()

rle.query() queries an 'RleList' representing genomic data, returning an Rle representing the (concatenated) vector of data



In [60]:

    
## Function: 
## 'rle.query' queries an 'RleList' representing genomic data, returning an  
## 'Rle' representing the (concatenated) vector of data
## This function has been implemeneted to be multithreaded via 'parallel:mc.cores()'
##
## Motivation:
## Recall: An Rle (run-length-encoded) vector is a specific representation of a vector
## An Rle is an efficient data structure if (1) the vector is quite large and/or
## (2) there are many consecutive values of the same identity. 
##
## Args:
## 'subject.rle' Rle
## 'query.gr' GRangeslist or GRanges
## 'chunksize' integer Number of \code{query.gr} ranges to consider in one memory chunk. (default = 1e9)
## 'mc.cores' Number of cores to apply when doing chunked operation (default = 1)
## 'verbose' Set the verbosity of the output (default = FALSE)
## 
## Use:
## rle.query(subject.rle, query.gr)
##



In [61]:

    
## an example RleList
chr1_Rle = Rle(10:1, 1:10)
chr2_Rle = Rle(10:1, 1:10)
example_rlelist = RleList( chr1=chr1_Rle, chr2=chr2_Rle)

print('example_rlelist')
print(example_rlelist)


print('rle.query()')
rle.query(example_rlelist, grexample)









    



[1] "example_rlelist"
RleList of length 2
$chr1
integer-Rle of length 55 with 10 runs
  Lengths:  1  2  3  4  5  6  7  8  9 10
  Values : 10  9  8  7  6  5  4  3  2  1

$chr2
integer-Rle of length 55 with 10 runs
  Lengths:  1  2  3  4  5  6  7  8  9 10
  Values : 10  9  8  7  6  5  4  3  2  1

[1] "rle.query()"






    





logical-Rle of length 20 with 1 run
  Lengths: 20
  Values : NA

grl.pivot()

grl.pivot() pivots GRangesList object 'x' by returning a new GRangesList 'y' whose kth item is GRanges object of ranges x[[i]][k] for all i in 1:length(x)



In [62]:

    
## Function: 
## 'grl.pivot' pivots GRangesList object 'x' by returning a new GRangesList 'y' whose
## kth item is GRanges object of ranges x[[i]][k] for all i in 1:length(x) 
## 
## e.g. If the length of a GRangesList `grl is 50, `length(grl)=50`, and the length of each GRanges element inside is 2, 
## then 'grl.pivot(grl' will produce a length 3 GRangesList with 50 elements per GRanges
##
## Note: Assumes all GRanges in "x" are of equal length
##
##
## Use:
## grl.pivot(input_GRangesList)
##



In [63]:

    
head(grl.pivot(grl1))









    





GRangesList object of length 2:
$1 
GRanges object with 250 ranges and 1 metadata column:
       seqnames                 ranges strand |       bin
          <Rle>              <IRanges>  <Rle> | <integer>
    10        5 [146981323, 146981323]      - |        10
    26        9 [140100229, 140100229]      - |        26
    49        5 [132216750, 132216750]      - |        49
    50       11 [  9492855,   9492855]      - |        50
    88        2 [228337137, 228337137]      + |        88
   ...      ...                    ...    ... .       ...
  4601        2 [ 69002312,  69002312]      + |      4601
  4604        6 [136173136, 136173136]      + |      4604
  4643        8 [ 22005672,  22005672]      - |      4643
  4673        4 [ 71554394,  71554394]      - |      4673
  4678        8 [110374809, 110374809]      - |      4678

...
<1 more element>
-------
seqinfo: 25 sequences from an unspecified genome

gr.sub()

gr.sub() applies gsub to seqlevels of input GRanges



In [64]:

    
## Function: 
## 'gr.sub' applies gsub to seqlevels of input GRanges
## by default, removes 'chr', and "0.1" suffixes, and replaces "MT" with "M"
## kth item is GRanges object of ranges x[[i]][k] for all i in 1:length(x) 
## 
## 'a' vector of regular expressions of things to substitute out (default = c("(^chr)(\\.1$)", "MT"))
## 'b' vector of values to substitute in (default = c("", "M"))
## 
## Use:
## gr.sub(input_GRanges)
##



In [65]:

    
seqlevels(grexample) = 'chr1'
seqlevels(gr.sub(grexample))









    




'chr1'

seg2gr()

seg2gr() converts "GRanges-like" data.frame into GRanges



In [66]:

    
## Function: 
## 'seg2gr' converts "GRanges-like" data.frame into GRanges
##
## There are a number of acceptable columns to specify chromsomes, segment starts, segment ends, and strand
## Please refer to 'standardize_segs()' for full details.
## 
## Use:
## seg2gr(input_data.frame)
##



In [67]:

    
dt[, strand := '*']  ## define strand column in data.table dt
## similar to defining columns in data.frame, i.e.'dt$strand = "*"'

print('print(dt) outputs')
print(dt)

print('seg2gr(dt) outputs')
seg2gr(dt)   ## outputs converted data.table









    









    



[1] "print(dt) outputs"
   seqnames start end strand
1:        1     2   3      *
2:        1     5   8      *
3:        1    10  15      *
[1] "seg2gr(dt) outputs"






    





GRanges object with 3 ranges and 0 metadata columns:
      seqnames    ranges strand
         <Rle> <IRanges>  <Rle>
  [1]        1  [ 2,  3]      *
  [2]        1  [ 5,  8]      *
  [3]        1  [10, 15]      *
  -------
  seqinfo: 1 sequence from an unspecified genome; no seqlengths



In [68]:

    
## segments_dt = data.table(patient = c('patient11235813', 'patient3141592', 'patient42'), 
##                         chromosome = c('chr3', 'chr5', 'chrX'), 
##                         start_position = c(1000, 2000, 3000), 
##                         end_position = c(5000, 6000, 7000), 
##                         strand = c('-', '+', '*'))
##                         
## 
## seg2gr(segments_dt)
##

grl.eval()

grl.eval() evaluates and aggregates an R expression on GRanges column in GRangesList



In [69]:

    
## Function: 
## 'grl.eval' evaluates and aggregates an R expression on GRanges column in GRangesList
##
## Args:
## 'expr' Any syntactically valid R expression, on columns of GRanges or GRangesList
## 
## Use:
## grl.eval(input_GRangesList, expr=valid_R_expression)
##



In [70]:

    
## NOTE: only printing first 5 rows

## calculate 'bin ** 2' whereby bin is the metadata column in 'grl1'
grl.eval(grl1, bin**2)[1:5]  

## calculate 'width() - 5' 
grl.eval(grl1, width-5)[1:5]

gr.simplify()

gr.simplify() calculates the pairwise-distance for SV rearrangements represented by GRangesList



In [71]:

    
## Function: 
## 'gr.simplify' calculates the pairwise-distance for SV rearrangements represented by GRangesList
##
## Args:
## 'field' character scalar, corresponding to value field of input GRanges. (default = NULL)
## val character scalar (default = NULL)
## include.val If TRUE, will include in out input GRanges the values field of first matching record in said input GRanges. (default = TRUE)
## split If TRUE, will split the output into GRangesList split by "field". (default = FALSE)
## pad integer Pad ranges by this amount before doing merge. [1], which merges contiguous but non-overlapping ranges. (default = 1)
##
## 
## Use:
## gr.simplify(input_GRanges, field = NULL, val = NULL, include.val = TRUE, split = FALSE, pad = 1)
##



In [72]:

    
gr.simplify(grexample, pad=4, field="name", split = TRUE)









    





GRangesList object of length 3:
$A 
GRanges object with 1 range and 1 metadata column:
      seqnames    ranges strand |           X
         <Rle> <IRanges>  <Rle> | <character>
  [1]     chr1    [3, 9]      + |           A

$B 
GRanges object with 1 range and 1 metadata column:
      seqnames  ranges strand | X
  [1]     chr1 [9, 13]      - | B

$C 
GRanges object with 1 range and 1 metadata column:
      seqnames   ranges strand | X
  [1]     chr1 [13, 20]      - | C

-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths

gr.parse()

gr.parse() returns GRanges parsed from input IGV-/UCSC-style strings



In [73]:

    
## Function: 
## 'gr.parse' returns GRanges parsed from input IGV-/UCSC-style strings
## 
## Use:
## gr.parse(input_IGV_UCSC_strings)
##



In [74]:

    
parse.gr(c('1:1e6-5e6+', '2:2e6-5e6-'))









    





GRanges object with 2 ranges and 0 metadata columns:
      seqnames             ranges strand
         <Rle>          <IRanges>  <Rle>
  [1]        1 [1000000, 5000000]      +
  [2]        2 [2000000, 5000000]      -
  -------
  seqinfo: 25 sequences from an unspecified genome

grl.parse()

grl.parse() eturns GRangesList parsed from input IGV-/UCSC-style strings



In [75]:

    
## Function: 
## 'grl.parse' returns GRangesList parsed from input IGV-/UCSC-style strings
## 
## Use:
## grl.parse(input_IGV_UCSC_strings)
##



In [76]:

    
parse.grl(c('1:1e6-5e6+;5:10-2000', '2:2e6-5e6-;10:100231321-100231399'))









    





GRangesList object of length 2:
[[1]] 
GRanges object with 2 ranges and 0 metadata columns:
      seqnames             ranges strand
         <Rle>          <IRanges>  <Rle>
  [1]        1 [1000000, 5000000]      +
  [2]        5 [     10,    2000]      *

[[2]] 
GRanges object with 2 ranges and 0 metadata columns:
      seqnames                 ranges strand
  [1]        2 [  2000000,   5000000]      -
  [2]       10 [100231321, 100231399]      *

-------
seqinfo: 25 sequences from an unspecified genome

gr.sum()

gr.sum() sums either by doing coverage and either weighting them equally or using a field "weight".



In [77]:

    
## Function: 
## gr.sum() sums either by doing coverage and either weighting them equally
## or using a field "weight".  Will return either sum or average.
##
## Args: 
## 'field' field from input GRanges to use as a weight
## 'mean' if TRUE, calculates mena. Flag specifies whether to divide the output at each interval but the total number of intervals overlapping it (only applies if field == NULL) (default FALSE)
##
## gr.sum(input_GRanges)



In [78]:

    
gr.sum(grexample)









    





GRanges object with 6 ranges and 1 metadata column:
      seqnames    ranges strand |     score
         <Rle> <IRanges>  <Rle> | <numeric>
  [1]     chr1  [ 1,  2]      * |         0
  [2]     chr1  [ 3,  8]      * |         1
  [3]     chr1  [ 9,  9]      * |         2
  [4]     chr1  [10, 12]      * |         1
  [5]     chr1  [13, 13]      * |         2
  [6]     chr1  [14, 20]      * |         1
  -------
  seqinfo: 1 sequence from an unspecified genome



In [79]:

    
grexample$GC = c(0.75, 1.0, 0.25)  ## add 'GC' column

gr.sum(grexample, field='GC')









    





GRanges object with 6 ranges and 1 metadata column:
      seqnames    ranges strand |        GC
         <Rle> <IRanges>  <Rle> | <numeric>
  [1]     chr1  [ 1,  2]      * |         0
  [2]     chr1  [ 3,  8]      * |      0.75
  [3]     chr1  [ 9,  9]      * |      1.75
  [4]     chr1  [10, 12]      * |         1
  [5]     chr1  [13, 13]      * |      1.25
  [6]     chr1  [14, 20]      * |      0.25
  -------
  seqinfo: 1 sequence from an unspecified genome



In [ ]:



In [ ]:



In [ ]: