TEST pycltools package

This notebook contains tests for the functions contained in pycltools package



In [167]:

    
# Jupyter specific imports
from IPython.core.display import display, HTML, Markdown
# Import of required packages
from os import remove
# import all the functions from pyCL
from pycltools.pycltools import *

JUPYTER NOTEBOOK SPECIFIC TOOLS

jhelp



In [168]:

    
help(jhelp)









    



Help on function jhelp in module pycltools.pycltools:

jhelp(function, full=True, print_private=False, **kwargs)
    Print a nice looking help string based on the name of a declared function. By default print the function
    definition and description
    * function
        Name of a declared function or class method
    * full
        If True, the help string will included a description of all arguments



In [169]:

    
jhelp(jhelp, full=True)









    




jhelp (function, full=True, print_private=False, **kwargs)

    Print a nice looking help string based on the name of a declared function. By default print the function
    definition and description
    * function
        Name of a declared function or class method
    * full
        If True, the help string will included a description of all arguments

jprint



In [170]:

    
jhelp(jprint, full=True)









    




jprint (*args, **kwargs)

    FOR JUPYTER NOTEBOOK ONLY
    Format a string in HTML and print the output. Equivalent of print, but highly customizable. Many options can be
    passed to the function.
    * *args
        One or several objects that can be cast in str
    * **kwargs
        Formatting options to tweak the html rendering
        Boolean options : bold, italic, highlight, underlined, striked, subscripted, superscripted
        String oprions: font, color, size, align, background_color, line_height



In [45]:

    
txt="Lorem ipsum condimentum elementum sapien nam eleifend quisque sapien curae"
jprint(txt,font="sans", color="purple", size=200, bold=True)









    




Lorem ipsum condimentum elementum sapien nam eleifend quisque sapien curae



In [46]:

    
txt="Lorem ipsum\n\tcondimentum elementum\n\t\tsapien nam eleifend quisque\n\t\t\tsapien curae"
jprint(txt,font="sans", color="powderblue", size=200, bold=True, line_height=50)









    




Lorem ipsum
 condimentum elementum
  sapien nam eleifend quisque
   sapien curae



In [47]:

    
jprint("Lorem","ipsum","condimentum","elementum", 1, True, bold=False, italic=False, highlight=False, underlined=True, striked=False, subscripted=False, superscripted=False, font="calibri", color="grey", size=250, align="center")









    




Lorem ipsum condimentum elementum 1 True

toogle_code



In [171]:

    
jhelp(toogle_code, full=True)









    




toogle_code (**kwargs)

    FOR JUPYTER NOTEBOOK ONLY
    Hide code with a clickable link in a j
    upyter notebook



In [49]:

    
#toogle_code()

larger_display



In [172]:

    
jhelp(larger_display, full=True)









    




larger_display (percent=100, **kwargs)

    FOR JUPYTER NOTEBOOK ONLY
    Resize the area of the screen containing the notebook according to a given percentage of the available width
    *  percent percentage of the width of the screen to use [DEFAULT:100]



In [173]:

    
larger_display(100)

PREDICATES

is_readable_file



In [174]:

    
jhelp(is_readable_file, full=True)









    




is_readable_file (fp, raise_exception=True, **kwargs)

    Verify the readability of a file or list of file



In [53]:

    
try:
    is_readable_file("./data/KJHYTGYUJ")
    print ("OK")
except OSError as E:
    print(E)









    



./data/KJHYTGYUJ is not a valid file



In [54]:

    
try:
    is_readable_file("./data/RADAR_Secondary.txt")
    print ("OK")
except OSError as E:
    print(E)

OK

is_gziped



In [175]:

    
jhelp(is_gziped, full=True)









    




is_gziped (fp, **kwargs)

    Return True if the file is Gziped else False



In [56]:

    
is_gziped("./data/RADAR_Secondary.txt")









    Out[56]:





False



In [57]:

    
is_gziped("./data/RADAR_Secondary.txt.gz")









    Out[57]:





True

has_extension



In [176]:

    
jhelp(has_extension, full=True)









    




has_extension (fp, ext, pos=-1, raise_exception=False, **kwargs)

    Test presence of extension in a file path
    * ext
        Single extension name or list of extension names  without dot. Example ["gz, "fa"]
    * pos
        Postition of the extension in the file path. -1 for the last, -2 for the penultimate and so on [DEFAULT -1 = Last position]



In [59]:

    
has_extension("./data/test/RADAR_Secondary.txt.gz", "gz")









    Out[59]:





True



In [60]:

    
has_extension("./data/test/RADAR_Secondary.txt.gz", "fa")









    Out[60]:





False



In [61]:

    
has_extension("./data/test/RADAR_Secondary.txt.gz", "txt", -2)









    Out[61]:





True

PATH MANIPULATION

file_basename



In [177]:

    
jhelp(file_basename, full=True)









    




file_basename (fp, **kwargs)

    Return the basename of a file without folder location and extension



In [63]:

    
file_basename("./data/RADAR_Secondary.txt.gz")









    Out[63]:





'RADAR_Secondary'

extensions



In [178]:

    
jhelp(extensions, full=True)









    




extensions (fp, comp_ext_list=['gz', 'tgz', 'zip', 'xz', 'bz2'], **kwargs)

    Return The extension of a file in lower-case. If archived file ("gz", "tgz", "zip", "xz", "bz2")
    the method will output the base extension + the archive extension as a string



In [65]:

    
print(extensions("./data/RADAR_Secondary.txt.gz"))
print(extensions("./data/RADAR_Secondary.txt"))
print(extensions("./data/RADAR_Secondary"))









    



.txt.gz
.txt

extensions_list



In [66]:

    
jhelp(extensions_list, full=True)









    




extensions_list (fp, comp_ext_list=['gz', 'tgz', 'zip', 'xz', 'bz2'], **kwargs)

    Return The extension of a file in lower-case. If archived file ("gz", "tgz", "zip", "xz", "bz2")
    the method will output the base extension + the archive extension as a list



In [67]:

    
print(extensions_list("./data/RADAR_Secondary.txt.gz"))
print(extensions_list("./data/RADAR_Secondary.txt"))
print(extensions_list("./data/RADAR_Secondary"))









    



['txt', 'gz']
['txt']
[]

file_name



In [179]:

    
jhelp(file_name, full=True)









    




file_name (fp, **kwargs)

    Return The complete name of a file with the extension but without folder location



In [69]:

    
file_name("./data/test/RADAR_Secondary.txt.gz")









    Out[69]:





'RADAR_Secondary.txt.gz'

dir_name



In [180]:

    
jhelp(dir_name, full=True)









    




dir_name (fp, **kwargs)

    Return the name of the directory where the file is located



In [71]:

    
print(dir_name("./data/test/RADAR_Secondary.txt.gz"))
print(dir_name("./__init__.py"))
print(dir_name("/bin/bash"))









    



test
.
bin

dir_path



In [72]:

    
jhelp(dir_path, full=True)









    




dir_path (fp, **kwargs)

    Return the directory path of a file



In [73]:

    
print(dir_path("./data/test/RADAR_Secondary.txt.gz"))
print(dir_path("./__init__.py"))
print(dir_path("/bin/bash"))









    



./data/test
.
/bin

STRING FORMATTING

supersplit



In [181]:

    
jhelp(supersplit, full=True)









    




supersplit (string, separator='', **kwargs)

    like split but can take a list of separators instead of a simple separator



In [75]:

    
a = "chr7\t74138\t774138\tA>I|LOC100129917|LUNG:LYMPHOBLASTOID_CELL_LINE|15342557:15258596:22327324\t0"

print(supersplit(a, ["\t","|"]))

print(supersplit(a))

print(supersplit(a, "|"))









    



['chr7', '74138', '774138', 'A>I', 'LOC100129917', 'LUNG:LYMPHOBLASTOID_CELL_LINE', '15342557:15258596:22327324', '0']
['chr7', '74138', '774138', 'A>I|LOC100129917|LUNG:LYMPHOBLASTOID_CELL_LINE|15342557:15258596:22327324', '0']
['chr7\t74138\t774138\tA>I', 'LOC100129917', 'LUNG:LYMPHOBLASTOID_CELL_LINE', '15342557:15258596:22327324\t0']

rm_blank



In [182]:

    
jhelp(rm_blank, full=True)









    




rm_blank (name, replace='', **kwargs)
 Replace blank spaces in a name by a given character (default = remove)
    Blanks at extremities are always removed and nor replaced



In [77]:

    
a = "chr\t\t17|LU NG:LYMPHOBLAST    OID_CELL_LINE|15342557:152585     96:22327324\t0"

print(rm_blank(a))

print(rm_blank(a, replace="*"))









    



chr17|LUNG:LYMPHOBLASTOID_CELL_LINE|15342557:15258596:223273240
chr*17|LU*NG:LYMPHOBLAST*OID_CELL_LINE|15342557:152585*96:22327324*0

FILE MANIPULATION

copyFile



In [183]:

    
jhelp(copyFile, full=True)









    




copyFile (src, dest, **kwargs)

    Copy a single file to a destination file or folder (with error handling/reporting)
    * src
        Source file path
    * dest
        Path of the folder where to copy the source file



In [79]:

    
copyFile(src="./data/RADAR_Secondary.txt", dest="./data/")









    



Error: './data/RADAR_Secondary.txt' and './data/RADAR_Secondary.txt' are the same file



In [80]:

    
copyFile(src="./data/RADAR_Secondary.txt", dest="./data/RADAR_Secondary_copy.txt")

gzip_file



In [184]:

    
jhelp(gzip_file, full=True)









    




gzip_file (fpin, fpout=None, **kwargs)

    gzip a file
    * fpin
        Path of the input uncompressed file
    * fpout
        Path of the output compressed file (facultative)



In [82]:

    
gzip_file("./data/RADAR_Secondary.txt")









    



Compressing ./data/RADAR_Secondary.txt






    Out[82]:





'/home/aleg/Programming/pycltools/docs/data/RADAR_Secondary.txt.gz'

gunzip_file



In [185]:

    
jhelp(gunzip_file, full=True)









    




gunzip_file (fpin, fpout=None, **kwargs)

    ungzip a file
    * fpin
        Path of the input compressed file
    * fpout
        Path of the output uncompressed file (facultative)



In [84]:

    
gunzip_file("./data/RADAR_Secondary.txt.gz")









    



Uncompressing ./data/RADAR_Secondary.txt.gz






    Out[84]:





'/home/aleg/Programming/pycltools/docs/data/RADAR_Secondary.txt'

FILE INFORMATION

linerange



In [186]:

    
jhelp(linerange, full=True)









    




linerange (fp, range_list=[], line_numbering=True, max_char_line=150, **kwargs)

    Print a range of lines in a file according to a list of start end lists. Handle gziped files
    * fp
        Path to the file to be parsed
    * range_list
        list of start, end coordinates lists or tuples
    * line_numbering
        If True the number of the line will be indicated in front of the line
    * max_char_line
        Maximal number of character to print per line



In [86]:

    
file = "./data/RADAR_Secondary.txt"
linerange (file)









    



0	#location	reference	tissue	coverage	editing_level(%)
1	chr1:1037916	Peng et al 2012	Lymphoblastoid cell line	9	66.67
2	chr1:1156882	Peng et al 2012	Lymphoblastoid cell line	42	36.59
...
97	chr1:10560773	Peng et al 2012	Lymphoblastoid cell line	20	40.00
98	chr1:10602697	Peng et al 2012	Lymphoblastoid cell line	5	60.00
99	chr1:11138237	Peng et al 2012	Lymphoblastoid cell line	14	42.86



In [87]:

    
file = "./data/gencode_sample.gff3"
linerange (file, [[2,5],[10,12],[98,100]], max_char_line=100)









    



...
2	#provider: GENCODE
3	#contact: gencode-help@sanger.ac.uk
4	#format: gff3
5	#date: 2015-12-03
...
10	chr1	HAVANA	exon	30564	30667	.	+	.	ID=exon:ENST00000473358.1:2;Parent=ENST00000473358.1;gene_id=E...
11	chr1	HAVANA	exon	30976	31097	.	+	.	ID=exon:ENST00000473358.1:3;Parent=ENST00000473358.1;gene_id=E...
12	chr1	HAVANA	transcript	30267	31109	.	+	.	ID=ENST00000469289.1;Parent=ENSG00000243485.3;gene_id=EN...
...
98	chr1	HAVANA	exon	287517	287921	.	-	.	ID=exon:ENST00000335577.4:2;Parent=ENST00000335577.4;gene_id...
99	chr1	HAVANA	gene	357383	359681	.	-	.	ID=ENSG00000236743.1;gene_id=ENSG00000236743.1;gene_type=lin...
100	chr1	HAVANA	transcript	357383	359681	.	-	.	ID=ENST00000441866.1;Parent=ENSG00000236743.1;gene_id...
...



In [88]:

    
file = "./data/RADAR_Secondary.txt.gz"
linerange (file, line_numbering=False)









    



#location	reference	tissue	coverage	editing_level(%)
chr1:1037916	Peng et al 2012	Lymphoblastoid cell line	9	66.67
chr1:1156882	Peng et al 2012	Lymphoblastoid cell line	42	36.59
...
chr1:10560773	Peng et al 2012	Lymphoblastoid cell line	20	40.00
chr1:10602697	Peng et al 2012	Lymphoblastoid cell line	5	60.00
chr1:11138237	Peng et al 2012	Lymphoblastoid cell line	14	42.86

cat



In [187]:

    
jhelp(cat, full=True)









    




cat (fp, max_lines=100, line_numbering=False, max_char_line=150, **kwargs)

    Emulate linux cat cmd but with line cap protection. Handle gziped files
    * fp
        Path to the file to be parsed
    * max_lines
        Maximal number of lines to print
    * line_numbering
        If True the number of the line will be indicated in front of the line
    * max_char_line
        Maximal number of character to print per line



In [90]:

    
file = "./data/RADAR_Secondary.txt.gz"
cat (file, max_lines=10)









    



#location	reference	tissue	coverage	editing_level(%)
chr1:1037916	Peng et al 2012	Lymphoblastoid cell line	9	66.67
chr1:1156882	Peng et al 2012	Lymphoblastoid cell line	42	36.59
chr1:1157460	Peng et al 2012	Lymphoblastoid cell line	66	22.73
chr1:1252441	Peng et al 2012	Lymphoblastoid cell line	11	72.73
...
chr1:10521237	Peng et al 2012	Lymphoblastoid cell line	34	17.65
chr1:10521238	Peng et al 2012	Lymphoblastoid cell line	35	37.14
chr1:10560773	Peng et al 2012	Lymphoblastoid cell line	20	40.00
chr1:10602697	Peng et al 2012	Lymphoblastoid cell line	5	60.00
chr1:11138237	Peng et al 2012	Lymphoblastoid cell line	14	42.86



In [91]:

    
file="./data/gencode_sample.gff3"
cat (file, max_lines=20, line_numbering=True, max_char_line=100)









    



0	##gff-version 3
1	#description: evidence-based annotation of the human genome (GRCh38), version 24 (Ensembl 83) - lo...
2	#provider: GENCODE
3	#contact: gencode-help@sanger.ac.uk
4	#format: gff3
5	#date: 2015-12-03
6	##sequence-region chr1 1 248956422
7	chr1	HAVANA	gene	29554	31109	.	+	.	ID=ENSG00000243485.3;gene_id=ENSG00000243485.3;gene_type=lincRN...
8	chr1	HAVANA	transcript	29554	31097	.	+	.	ID=ENST00000473358.1;Parent=ENSG00000243485.3;gene_id=ENS...
9	chr1	HAVANA	exon	29554	30039	.	+	.	ID=exon:ENST00000473358.1:1;Parent=ENST00000473358.1;gene_id=EN...
...
9990	chr1	HAVANA	exon	221983000	221983143	.	+	.	ID=exon:ENST00000421147.5:3;Parent=ENST00000421147.5...
9991	chr1	HAVANA	transcript	221966410	221984964	.	+	.	ID=ENST00000441160.1;Parent=ENSG00000228437.5;...
9992	chr1	HAVANA	exon	221966410	221966502	.	+	.	ID=exon:ENST00000441160.1:1;Parent=ENST00000441160.1...
9993	chr1	HAVANA	exon	221983000	221983143	.	+	.	ID=exon:ENST00000441160.1:2;Parent=ENST00000441160.1...
9994	chr1	HAVANA	exon	221984054	221984964	.	+	.	ID=exon:ENST00000441160.1:3;Parent=ENST00000441160.1...
9995	chr1	HAVANA	gene	222041705	222064763	.	-	.	ID=ENSG00000232679.1;gene_id=ENSG00000232679.1;gene_...
9996	chr1	HAVANA	transcript	222041705	222064763	.	-	.	ID=ENST00000438158.1;Parent=ENSG00000232679.1;...
9997	chr1	HAVANA	exon	222064685	222064763	.	-	.	ID=exon:ENST00000438158.1:1;Parent=ENST00000438158.1...
9998	chr1	HAVANA	exon	222058414	222058678	.	-	.	ID=exon:ENST00000438158.1:2;Parent=ENST00000438158.1...
9999	chr1	HAVANA	exon	222041705	222041922	.	-	.	ID=exon:ENST00000438158.1:3;Parent=ENST00000438158.1...

tail



In [188]:

    
jhelp(tail, full=True)









    




tail (fp, n=10, line_numbering=False, max_char_line=150, **kwargs)

    Emulate linux tail cmd. Handle gziped files
    * fp
        Path to the file to be parsed
    * n
        Number of lines to print starting from the end of the file
    * line_numbering
        If True the number of the line will be indicated in front of the line
    * max_char_line
        Maximal number of character to print per line



In [93]:

    
file = "./data/RADAR_clean.txt"
tail (file, n = 4)









    



...
chr1	225974581	225974581	A>I|SRP9|YH|22327324	28.89	+
chr1	225974735	225974735	A>I|SRP9|YH|22327324	23.88	+
chr1	225974746	225974746	A>I|SRP9|YH|22327324	71.19	+



In [94]:

    
file = "./data/RADAR_Secondary.txt.gz"
tail (file, n = 4, line_numbering=True)









    



...
97	chr1:10560773	Peng et al 2012	Lymphoblastoid cell line	20	40.00
98	chr1:10602697	Peng et al 2012	Lymphoblastoid cell line	5	60.00
99	chr1:11138237	Peng et al 2012	Lymphoblastoid cell line	14	42.86



In [95]:

    
file="./data/gencode_sample.gff3"
tail (file, n = 5, max_char_line=100)









    



...
chr1	HAVANA	transcript	222041705	222064763	.	-	.	ID=ENST00000438158.1;Parent=ENSG00000232679.1;gene_...
chr1	HAVANA	exon	222064685	222064763	.	-	.	ID=exon:ENST00000438158.1:1;Parent=ENST00000438158.1;gene...
chr1	HAVANA	exon	222058414	222058678	.	-	.	ID=exon:ENST00000438158.1:2;Parent=ENST00000438158.1;gene...
chr1	HAVANA	exon	222041705	222041922	.	-	.	ID=exon:ENST00000438158.1:3;Parent=ENST00000438158.1;gene...

head



In [189]:

    
jhelp(head, full=True)









    




head (fp, n=10, ignore_comment_line=False, comment_char='#', max_char_line=200, sep='\t', max_char_col=50, **kwargs)

    Emulate linux head cmd. Handle gziped files and bam files
    * fp
        Path to the file to be parsed. Works with text, gunziped and binary bam/sam files
    * n
        Number of lines to print starting from the begining of the file (Default 10)
    * ignore_comment_line
        Skip initial lines starting with a specific character. Pointless for bam files(Default False)
    * comment_char
        Character or string for ignore_comment_line argument (Default "#")
    * max_char_line
        Maximal number of character to print per line (Default 150)



In [97]:

    
head("./data/RADAR_Main.txt", n= 3)









    



#chromosome position  gene       strand annot1     annot2     alu? non_alu_repetitive? conservation_chimp conservation_rhesus conservation_mouse 
chr1        206256301 C1orf186   -      intronic   intronic   no   no                  N                  N                   N                  
chr6        116991832 intergenic -      intergenic intergenic no   no                  N                  N                   N



In [98]:

    
head("./data/RADAR_Main.txt", ignore_comment_line=True,n= 3)









    



chr1 206256301 C1orf186   - intronic   intronic   no no N N N 
chr6 116991832 intergenic - intergenic intergenic no no N N N 
chr7 30504355  NOD1       - intronic   intronic   no no N N N



In [99]:

    
head("./data/RADAR_Main.txt", n=5, max_char_line=110)









    



#chromosome position  gene       strand annot1     annot2     alu? non_alu_repetitive? conservation_chimp cons...
chr1        206256301 C1orf186   -      intronic   intronic   no   no                  N                  N   ...
chr6        116991832 intergenic -      intergenic intergenic no   no                  N                  N   ...
chr7        30504355  NOD1       -      intronic   intronic   no   no                  N                  N   ...
chr1        85127959  SSX2IP     -      Syn        Gln->Gln   no   no                  N                  N   ...



In [100]:

    
head("./data/RADAR_Secondary.txt.gz", n=6, ignore_comment_line=True)









    



chr1:1037916 Peng et al 2012 Lymphoblastoid cell line 9  66.67 
chr1:1156882 Peng et al 2012 Lymphoblastoid cell line 42 36.59 
chr1:1157460 Peng et al 2012 Lymphoblastoid cell line 66 22.73 
chr1:1252441 Peng et al 2012 Lymphoblastoid cell line 11 72.73 
chr1:1252443 Peng et al 2012 Lymphoblastoid cell line 11 45.45 
chr1:1253357 Peng et al 2012 Lymphoblastoid cell line 31 32.26



In [101]:

    
head("./data/sample.sam", n=6, ignore_comment_line=True)









    



chr1|35235|35295|-|5.1   272 chr12 37283     0 61M * 0 0 *                                                  *                                                  
chr1|90965|91025|-|7.57  256 chr16 90215899  0 61M * 0 0 *                                                  *                                                  
chr1|91055|91115|-|7.60  256 chr2  168290980 0 61M * 0 0 *                                                  *                                                  
chr1|92081|92141|-|8.1   272 chr1  268657    0 61M * 0 0 *                                                  *                                                  
chr1|92111|92171|-|8.2   256 chr5  181462264 0 61M * 0 0 *                                                  *                                                  
chr1|110943|111003|-|9.1 0   chrY  24307299  0 61M * 0 0 AATGAAAGATATGTGTTTTTCATATTACCAGGTAGATGATAAGGAGATTT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII



In [102]:

    
head ("./data/sample_100.bam", n=6)









    



chr1|1736694|1736754|-|168.51      256 chr6  108404793 0  32M29H   * 0 0 *                                                  *                                                  
chr1|20158612|20158672|+|508.32    0   chr1  20158612  60 61M      * 0 0 CTCAGAGGCTTGAAAAGTAGCATCCACCCCCTTCTGGGCATCAATCACAG IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII 
chr1|47096793|47096853|-|1008.6    272 chr1  156061950 0  2H54M5H  * 0 0 *                                                  *                                                  
chr1|65003940|65004000|-|1364.17   256 chr13 107349700 0  16M1I44M * 0 0 *                                                  *                                                  
chr1|108202106|108202166|+|1958.74 0   chr1  108202106 60 61M      * 0 0 GGACAGAAAACAAATCAGTAGTTACCAGTTGTGACTAGCGGGAAGGGAAT IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII 
chr1|147173091|147173151|+|2353.12 272 chr2  74122749  0  22H39M   * 0 0 *                                                  *

linesample



In [103]:

    
jhelp(linesample, full=True)









    




linesample (fp, n_lines=100, line_numbering=True, max_char_line=150, **kwargs)

    Randomly sample lines in a file and print them. Handle gziped files
    * fp
        Path to the file to be parsed
    * n_lines
        Number of lines to sample in the file
    * line_numbering
        If True the number of the line will be indicated in front of the line
    * max_char_line
        Maximal number of character to print per line



In [104]:

    
linesample("./data/RADAR_clean.txt", n_lines=10, line_numbering=True)









    



77	chr1	6710595	6710595	A>I|DNAJC11|YH|22327324	50.00	-
96	chr1	10521237	10521237	A>I|DFFA|YH|22327324	17.65	-
266	chr1	32737172	32737172	A>I|LCK|YH|22327324	35.71	+
342	chr1	40205396	40205396	A>I|PPIE|YH|22327324	63.64	+
448	chr1	52875019	52875019	A>I|PRPF38A|YH|22327324	38.89	+
533	chr1	85449497	85449497	A>I|MCOLN2|YH|22327324	20.00	-
610	chr1	114296188	114296188	A>I|PHTF1|YH|22327324	20.59	-
767	chr1	155444343	155444343	A>I|ASH1L|YH|22327324	42.86	-
824	chr1	157516004	157516004	A>I|FCRL5|YH|22327324	21.88	-
946	chr1	204526795	204526795	A>I|MDM4|YH|22327324	32.29	+



In [105]:

    
linesample("./data/RADAR_Secondary.txt.gz", n_lines=10, line_numbering=True)









    



4	chr1:1252441	Peng et al 2012	Lymphoblastoid cell line	11	72.73
8	chr1:1418532	Peng et al 2012	Lymphoblastoid cell line	5	60.00
51	chr1:6608345	Peng et al 2012	Lymphoblastoid cell line	13	46.15
56	chr1:6707305	Peng et al 2012	Lymphoblastoid cell line	33	39.39
61	chr1:6708354	Peng et al 2012	Lymphoblastoid cell line	15	40.00
62	chr1:6708680	Peng et al 2012	Lymphoblastoid cell line	24	25.00
63	chr1:6708681	Peng et al 2012	Lymphoblastoid cell line	24	20.83
75	chr1:6710585	Peng et al 2012	Lymphoblastoid cell line	30	65.52
90	chr1:10520702	Peng et al 2012	Lymphoblastoid cell line	98	11.22
93	chr1:10520751	Peng et al 2012	Lymphoblastoid cell line	166	28.92

count_uniq



In [106]:

    
jhelp(count_uniq, full=True)









    




count_uniq (fp, colnum, select_values=None, drop_values=None, skip_comment='#', sep='\t', **kwargs)

    Count unique occurences in a specific column of a tabulated file
    * fp
        Path to the file to be parsed (gzipped or not)
    * colnum
        Index number of the column to summarize
    * select_values
        Select specific lines in the file based on a dictionary containing column index(es) and valu(es) or list
        of values to select. Exemple {2:["exon", "transcript"], 4:"lincRNA"}. DEFAULT=None
    * drop_values
        Same think that select_value but will drop the lines instead. DEFAULT=None
    * skip_comment
        Drop any comment lines starting with this character. DEFAULT="#"
    * sep
        Character or list of characters to use in order to split the lines. Exemple [" ",";"]. DEFAULT=" "



In [107]:

    
count_uniq("./data/Small_editing_Peng_hg38.bed", colnum=17, sep=['\t',"|"])









    Out[107]:





17
intergenic    110
intron         55
3-UTR          17
unknown        12
dtype: int64



In [108]:

    
count_uniq("./data/gencode_sample.gff3", colnum=17, sep=["\t","=", ";"], select_values={2:["transcript", "exon"], 6:"+"})









    Out[108]:





17
lincRNA                     2031
antisense                   1600
processed_transcript         686
sense_intronic               105
TEC                           36
sense_overlapping             11
3prime_overlapping_ncrna       2
dtype: int64

colsum



In [190]:

    
jhelp(colsum, full=True)









    




colsum (fp, colrange=None, separator='', header=False, ignore_hashtag_line=False, max_items=10, ret_type='md', **kwargs)

    Create a summary of selected columns of a file
    * fp
        Path to the file to be parsed
    * colrange
        A list of column index to parse
    * separator
        A character or a list of characters to split the lines
    * ignore_hashtag_line
        skip line starting with a # symbol
    * max_items
        maximum item per line
    * ret_type
        Possible return types:
        md = markdown formatted table,
        dict = raw parsing dict,
        report = Indented_text_report



In [110]:

    
display(Markdown(colsum("./data/RADAR_Main.txt", header=True, colrange=[0,2,6], max_items=15)))









    






#chromosome
chr1
chr17
chr9
chr15
chr6
chr14
chr18
chr2
chrY
chr4
chr7




Count
4
3
2
2
2
1
1
1
1
1
1





gene
RABEP1
NUP133
JUB
GREB1L
SPHKAP
NLGN4Y
CELSR2
RBPJ
TLE4
SOCS7
ADPGK
UBE2O
TSC1
GRIK2
MEF2A
...




Count
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
...





alu?
no




Count
19



In [111]:

    
colsum("./data/RADAR_Main.txt", header=True, ret_type="dict", colrange=[0,3])









    Out[111]:





OrderedDict([(0,
              OrderedDict([('chr1', 4),
                           ('chr6', 2),
                           ('chr7', 1),
                           ('chr15', 2),
                           ('chr9', 2),
                           ('chr17', 3),
                           ('chr4', 1),
                           ('chrY', 1),
                           ('chr2', 1),
                           ('chr18', 1),
                           ('chr14', 1)])),
             (3, OrderedDict([('-', 10), ('+', 9)]))])



In [112]:

    
print(colsum(
        "./data/RADAR_clean.txt",
        header=True,
        ignore_hashtag_line=True,
        ret_type="report",
        separator=["\t","|"],
        max_items=5))









    



0
	chr1	997
1
	225974746	1
	225974735	1
	225974581	1
	224599486	1
	224584888	1
	...	...
2
	225974746	1
	225974735	1
	225974581	1
	224599486	1
	224584888	1
	...	...
3
	A>I	997
4
	FDPS	34
	MDM4	31
	CTSS	28
	DNAJC11	25
	S100PBP	24
	...	...
5
	YH	997
6
	22327324	997
7
	33.33	31
	66.67	31
	50.00	23
	57.14	22
	60.00	22
	...	...
8
	-	527
	+	470

fastcount



In [191]:

    
jhelp(fastcount, full=True)









    




fastcount (fp, **kwargs)

    Efficient way to count the number of lines in a file. Handle gziped files



In [114]:

    
fastcount("./data/RADAR_Secondary.txt")









    Out[114]:





100



In [115]:

    
fastcount("./data/RADAR_Secondary.txt.gz")









    Out[115]:





100

simplecount



In [192]:

    
jhelp(simplecount, full=True)









    




simplecount (fp, ignore_hashtag_line=False, **kwargs)

    Simple way to count the number of lines in a file with more options



In [117]:

    
simplecount("./data/Small_m5C_Squires_hg38.bed", ignore_hashtag_line=True)









    Out[117]:





194



In [118]:

    
simplecount("./data/RADAR_Secondary.txt.gz")









    Out[118]:





100

DIRECTORY MANIPULATION

mkdir



In [193]:

    
jhelp(mkdir, full=True)









    




mkdir (fp, level=1, **kwargs)

    Reproduce the ability of UNIX "mkdir -p" command
    (ie if the path already exits no exception will be raised).
    Can create nested directories by recursivity
    * fp
        path name where the folder should be created
    * level
        level in the path where to start to create the directories. Used by the program for the recursive creation of
        directories



In [120]:

    
mkdir("./data/test_dir")



In [121]:

    
mkdir ("./test/test/test")
!rm -rf ./test









    



Creating /home/aleg/Programming/pycltools/docs/test
Creating /home/aleg/Programming/pycltools/docs/test/test
Creating /home/aleg/Programming/pycltools/docs/test/test/test

SHELL MANIPULATION

make_cmd_str



In [194]:

    
jhelp(make_cmd_str, full=True)









    




make_cmd_str (prog_name, opt_dict={}, opt_list=[], **kwargs)

    Create a Unix like command line string from the prog name, a dict named arguments and a list of unmammed arguments
    exemple make_cmd_str("bwa", {"b":None, t":6, "i":"../idx/seq.fa"}, ["../read1", "../read2"])
    * prog_name
        Name (if added to the system path) or path of the program
    * opt_dict
        Dictionary of option arguments such as "-t 5". The option flag have to be the key (without "-") and the the
        option value in the dictionary value. If no value is requested after the option flag "None" had to be assigned
        to the value field.
    * opt_list
        List of simple command line arguments



In [123]:

    
make_cmd_str("bwa", {"-b":None, "-t":6, "-i":"../idx/seq.fa"}, ["../read1", "../read2"])









    Out[123]:





'bwa -b -t 6 -i ../idx/seq.fa ../read1 ../read2 '

bash_basic



In [195]:

    
jhelp(bash_basic, full=True)









    




bash_basic (cmd, virtualenv=None, **kwargs)

    Sent basic bash command
    * cmd
        A command line string formatted as a string
    * virtualenv
        If specified will try to load a virtualenvwrapper environment before runing the command



In [125]:

    
print(bash_basic("ls -l"))
print(bash_basic("echo TTTT"))
print(bash_basic("grep ./data/RADAR_Secondary.txt"))









    



total 136
drwxrwxr-x 3 aleg aleg  4096 Dec 10 10:54 data
-rw-rw-r-- 1 aleg aleg 39582 Dec 10 12:05 pycltools_functions_list.ipynb
-rw-rw-r-- 1 aleg aleg 93686 Dec 10 12:07 pycltools_tests.ipynb


None
TTTT


None


None

bash



In [196]:

    
jhelp(bash, full=True)









    




bash (cmd, virtualenv=None, live='stdout', print_stdout=True, ret_stdout=False, log_stdout=None, print_stderr=True, ret_stderr=False, log_stderr=None, print_cmd=False, dry=False, **kwargs)

    More advanced version of bash calling with live printing of the standard output and possibilities to log the
    redirect the output and error as a string return or directly in files. If ret_stderr and ret_stdout are True a
    tuple will be returned and if both are False None will be returned
    * cmd
        A command line string formatted as a string
    * virtualenv
        If specified will try to load a virtualenvwrapper environment before runing the command
    * print_stdout
        If True the standard output will be LIVE printed through the system standard output stream
    * ret_stdout
        If True the standard output will be returned as a string
    * log_stdout
        If a filename is given, the standard output will logged in this file
    * print_stderr
        If True the standard error will be printed through the system standard error stream
    * ret_stderr
        If True the standard error will be returned as a string
    * log_stderr
        If a filename is given, the standard error will logged in this file



In [127]:

    
bash("ls", print_stdout=True, ret_stdout=True,)









    



data
pycltools_functions_list.ipynb
pycltools_tests.ipynb






    Out[127]:





'data\npycltools_functions_list.ipynb\npycltools_tests.ipynb\n'



In [128]:

    
bash("for i in 1 2 3 4; do echo $i && sleep 1 && ls error ;done", live="stderr",  print_stdout=True, ret_stdout=True, print_stderr=True)









    



ls: cannot access 'error': No such file or directory
ls: cannot access 'error': No such file or directory
ls: cannot access 'error': No such file or directory
ls: cannot access 'error': No such file or directory
Error code #2 during execution of the command : for i in 1 2 3 4; do echo $i && sleep 1 && ls error ;done



In [129]:

    
bash("ls", print_stdout=False, ret_stdout=False, log_stdout="./data/stdout.txt")
head("./data/stdout.txt")









    



Only 3 lines in the file
data                           
pycltools_functions_list.ipynb 
pycltools_tests.ipynb

bash_update



In [197]:

    
jhelp(bash_update, full=True)









    




bash_update (cmd, update_freq=1, **kwargs)

    FOR JUPYTER NOTEBOOK
    Run a bash command and print the output in the cell. The output is updated each time until the output is None.
    This is suitable for monitoring tasks that log events until there is nothing else to print such as bjobs or bpeeks.
    * cmd
        A command line string formatted as a string
    * update_freq
        The frequency of output updating in seconds [DEFAULT: 1]



In [131]:

    
#bash_update("htop")

DICTIONNARY FORMATTING

dict_to_md



In [198]:

    
jhelp(dict_to_md, full=True)









    




dict_to_md (d, key_label='', value_label='', transpose=False, sort_by_key=False, sort_by_val=True, max_items=None, **kwargs)

    Transform a dict into a markdown formated table



In [133]:

    
d = {"a":12,"b":14,"c":8,"d":56,"e":76}
display(Markdown(dict_to_md(d, "Letter", "Number", sort_by_val=True)))
display(Markdown(dict_to_md(d, "Letter", "Number", transpose=True, max_items=3)))









    






Letter
Number




e
76


d
56


b
14


a
12


c
8










    






Letter
e
d
b
...




Number
76
56
14
...

dict_to_report



In [199]:

    
jhelp(dict_to_report, full=True)









    




dict_to_report (d, tab='\t', ntab=0, sep=':', sort_dict=True, max_items=None, **kwargs)

    Recursive function to return a text report from nested dict or OrderedDict objects



In [135]:

    
d = {"a":12,"b":14,"c":{"c1":12,"c2":{"c2.1":33221,"c2.2":765},"c3":32,"c4":443},"d":56,"e":76}
print(dict_to_report(d, tab=" | "))

d = {"a":12,"b":14,"c":{"c1":12,"c2":{"c2.1":33221,"c2.2":765, "c2.3":7533,"c2.4":76433,"c2.5":876543,"c2.6":89765,"c2.7":8654},"c3":32,"c4":443},"d":56,"e":76}
print(dict_to_report(d, tab="--", max_items=4, sort_dict=True))









    



a:12
b:14
c
 | c1:12
 | c2
 |  | c2.1:33221
 |  | c2.2:765
 | c3:32
 | c4:443
d:56
e:76

a:12
b:14
c
--c1:12
--c2
----c2.5:876543
----c2.6:89765
----c2.4:76433
----c2.1:33221
----...:...
--c3:32
--c4:443
d:56
e:76

TABLE FORMATTING

reformat_table



In [200]:

    
jhelp(reformat_table, full = True)









    




reformat_table (input_file, output_file='', return_df=False, init_template=[], final_template=[], header='', keep_original_header=True, header_from_final_template=False, replace_internal_space='_', replace_null_val='*', subst_dict={}, filter_dict=[], predicate=None, standard_template=None, verbose=False, **kwargs)

    Reformat a table given an initial and a final line templates indicated as a list where numbers
    indicate the data column and strings the formatting characters

    *  input_file
        A file with a structured text formatting (gzipped or not)
    *  output_file
        A file path to output the reformatted table (if empty will not write in a file)
    *  return_df
        If true will return a pandas dataframe containing the reformated table (Third party pandas package required)
        by default the columns will be names after the final template [DEFAULT:False]
    *  init_template
        A list of indexes and separators describing the structure of the input file
            Example initial line = "chr1    631539    631540    Squires|id1    0    +"
            Initial template = [0," ",1," ",2," ",3,"|",4," ",5," ",6]
            Alternatively, instead of the numbers, string indexes can be used, but they need to be enclosed in curly
            brackets to differentiate them from the separators. This greatly simplify the writing of the final template.
            Example initial line = "chr1    631539    631540    Squires|id1    0    +"
            Initial template = ["{chrom}"," ","{start}"," ","{end}","|","{name}"," ","{score}"," ","{strand}"]
    *  final_template
        A list of indexes and separators describing the required structure of the output file. Name indexes need to
        match indexes of the init_template and have to follow the same synthax  [DEFAULT:Same that init template]
            Example final line = "chr1    631539    631540    m5C|-|HeLa|22344696    -    -"
            Final template = [0," ",1," ",2," m5C|-|HeLa|22344696 - ",6]
    *  header
        A string to write as a file header at the beginning of the file
    *  keep_original_header
        If True the original header of the input file will be copied at the beginning of the output file [DEFAULT:True]
    *  header_from_final_template
        Generate a header according to the name or number of the fields given in the final_template [DEFAULT:True]
    *  replace_internal_space
        All internal blank space will be replaced by this character [DEFAULT:"_"]
    *  replace_null_val
        Field with no value will be replaced by this character [DEFAULT:"*"]
    *  subst_dict
        Nested dictionary of substitution per position to replace specific values by others [DEFAULT:None]
            Example: { 0:{"chr1":"1","chr2":"2"}, 3:{"Squires":"5376774764","Li":"27664684"}}
    *  filter_dict
        A dictionary of list per position  to filter out lines  with specific values [DEFAULT:None]
            Example: { 0:["chr2", "chr4"], 1:["46767", "87765"], 5:["76559", "77543"]}
    *  predicate
        A lambda predicate function for more advance filtering operations [DEFAULT:None]
            Example:  lambda val_dict: abs(int(val_dict[1])-int(val_dict[2])) <= 2000
    *  standard_template
        Existing standard template to parse the file  instead of providing one manually. List of saved templates:
        - "gff3_ens_gene" = Template for ensembl gff3 fields. Select only the genes lines and decompose to individual elements.
        - "gff3_ens_transcript" = Template for ensembl gff3 fields. Select only the transcript lines and decompose to individual elements.
        - "gtf_ens_gene" = Template for ensembl gft fields. Select only the genes lines and decompose to individual elements
    * verbose
        If True will print detailed information [DEFAULT:False]



In [137]:

    
# With numeric index
reformat_table(
    input_file="./data/Small_m5C_Squires_hg38.bed",
    output_file="./data/Small_m5C_Squires_hg38_reformat.bed",
    init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"\t",5,"\t",6],
    final_template=[0,"\t",1,"\t",2,"\tm5C|*|HeLa|22344696\t-\t",6],
    replace_internal_space='_',
    replace_null_val="*",
    keep_original_header=False,
    header="# New header\n"
    )

linerange ("./data/Small_m5C_Squires_hg38.bed")
linerange ("./data/Small_m5C_Squires_hg38_reformat.bed")









    



0	# Transcriptome-wide map of m5C [hg38 coordinates]
1	# Reference: Squires et al., Nucleic Acids Res. 40, 5023 (2012) [PMID 22344696, DOI 10.1093/nar/gks144]
2	#
...
197	chr1	19311959	19311960	Squires|id185	0	-
198	chr1	19608342	19608343	Squires|id186	0	+
199	chr1	19608343	19608344	Squires|id187	0	+
0	# New header
1	chr1	631539	631540	m5C|*|HeLa|22344696	-	+
2	chr1	631540	631541	m5C|*|HeLa|22344696	-	+
...
192	chr1	19311959	19311960	m5C|*|HeLa|22344696	-	-
193	chr1	19608342	19608343	m5C|*|HeLa|22344696	-	+
194	chr1	19608343	19608344	m5C|*|HeLa|22344696	-	+



In [138]:

    
# With str index
reformat_table(
    input_file="./data/Small_m5C_Squires_hg38.bed",
    output_file="./data/Small_m5C_Squires_hg38_reformat.bed",
    init_template=["{chrom}","\t","{start}","\t","{end}","|","{name}","\t","{score}","\t","{strand}"],
    final_template=["{start}","\t","{end}","\tadditional_informations\t","{name}"],
    replace_internal_space='_',
    replace_null_val="*",
    keep_original_header=False,
    header="# New header\n",
    verbose=True
    )

linerange ("./data/Small_m5C_Squires_hg38.bed")
linerange ("./data/Small_m5C_Squires_hg38_reformat.bed")









    



Enumerated named argument list:
	verbose: True
	standard_template: None
	predicate: None
	filter_dict: []
	subst_dict: {}
	replace_null_val: *
	replace_internal_space: _
	header_from_final_template: False
	keep_original_header: False
	header: # New header

	final_template: ['{start}', '\t', '{end}', '\tadditional_informations\t', '{name}']
	init_template: ['{chrom}', '\t', '{start}', '\t', '{end}', '|', '{name}', '\t', '{score}', '\t', '{strand}']
	return_df: False
	output_file: ./data/Small_m5C_Squires_hg38_reformat.bed
	input_file: ./data/Small_m5C_Squires_hg38.bed
Unenumerated named arguments list:
Initial template values
chrom	start	end|name	score	strand
Final template values
start	end	additional_informations	name
194 Lines processed	194 Lines pass	0 Lines filtered out	0 Lines fail
0	# Transcriptome-wide map of m5C [hg38 coordinates]
1	# Reference: Squires et al., Nucleic Acids Res. 40, 5023 (2012) [PMID 22344696, DOI 10.1093/nar/gks144]
2	#
...
197	chr1	19311959	19311960	Squires|id185	0	-
198	chr1	19608342	19608343	Squires|id186	0	+
199	chr1	19608343	19608344	Squires|id187	0	+
0	# New header
1	631539	631540	Squires	additional_informations	id1
2	631540	631541	Squires	additional_informations	id2
...
192	19311959	19311960	Squires	additional_informations	id185
193	19608342	19608343	Squires	additional_informations	id186
194	19608343	19608344	Squires	additional_informations	id187



In [139]:

    
subst_dict = {0:{"chr1":"1", "chr2":"2"}, 3:{"Peng":"22344696"}}
filter_dict = {18:["intron"]}
input_file="./data/Small_editing_Peng_hg38.bed"
output_file="./data/Small_editing_Peng_hg38_reformat.bed"

reformat_table(
    input_file, output_file,
    init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"|",5,"|",6,"|",7,"|",8,"|",9,"->",10,"|",11,"%|",12,"|",13,"|",14,"|",15,"|",16,"|",17,"|",18,"|",19,"\t",20,"\t",21],
    final_template=[0,"\t",1,"\t",2,"\t",9,">",10,"|",3,"|HeLa|",19,"\t",11,"\t",21],
    replace_internal_space='_',
    replace_null_val="*",
    subst_dict = subst_dict,
    filter_dict = filter_dict,
    verbose=True
    )

linerange (input_file)
linerange (output_file)









    



Enumerated named argument list:
	verbose: True
	standard_template: None
	predicate: None
	filter_dict: {18: ['intron']}
	subst_dict: {0: {'chr1': '1', 'chr2': '2'}, 3: {'Peng': '22344696'}}
	replace_null_val: *
	replace_internal_space: _
	header_from_final_template: False
	keep_original_header: True
	header: 
	final_template: [0, '\t', 1, '\t', 2, '\t', 9, '>', 10, '|', 3, '|HeLa|', 19, '\t', 11, '\t', 21]
	init_template: [0, '\t', 1, '\t', 2, '\t', 3, '|', 4, '|', 5, '|', 6, '|', 7, '|', 8, '|', 9, '->', 10, '|', 11, '%|', 12, '|', 13, '|', 14, '|', 15, '|', 16, '|', 17, '|', 18, '|', 19, '\t', 20, '\t', 21]
	return_df: False
	output_file: ./data/Small_editing_Peng_hg38_reformat.bed
	input_file: ./data/Small_editing_Peng_hg38.bed
Unenumerated named arguments list:
Initial template values
0	1	2	3|4|5|6|7|8|9->10|11%|12|13|14|15|16|17|18|19	20	21
Final template values
0	1	2	9>10|3|HeLa|19	11	21
194 Lines processed	139 Lines pass	55 Lines filtered out	0 Lines fail
0	# Transcriptome-wide map of editing sites [hg38 coordinates]
1	# Reference: Peng et al., Nat. Biotechnol. 30, 253 (2012) [PMID 22327324, DOI 10.1038/nbt.2122]
2	#
...
197	chr1	9173454	9173455	Peng|chr1|9156101|-|T|Y|A->G|35.14%|99|T|24|C|13|37|intergenic|-	0	-
198	chr1	9173533	9173534	Peng|chr1|9156180|-|T|Y|A->G|24.10%|61|T|148|C|47|195|intergenic|-	0	-
199	chr1	9173535	9173536	Peng|chr1|9156182|-|T|Y|A->G|66.15%|99|C|129|T|66|195|intergenic|-	0	-
0	# Transcriptome-wide map of editing sites [hg38 coordinates]
1	# Reference: Peng et al., Nat. Biotechnol. 30, 253 (2012) [PMID 22327324, DOI 10.1038/nbt.2122]
2	#
...
142	1	9173454	9173455	A>G|22344696|HeLa|-	35.14	-
143	1	9173533	9173534	A>G|22344696|HeLa|-	24.10	-
144	1	9173535	9173536	A>G|22344696|HeLa|-	66.15	-



In [140]:

    
input_file="./data/Small_editing_Peng_hg38.bed"

df = reformat_table(
    input_file,
    return_df=True,
    init_template=[0,"\t",1,"\t",2,"\t",3,"|",4,"|",5,"|",6,"|",7,"|",8,"|",9,"->",10,"|",11,"%|",12,"|",13,"|",14,"|",15,"|",16,"|",17,"|",18,"|",19,"\t",20,"\t",21],
    replace_internal_space='_',
    replace_null_val="*",
    verbose=True)

print(head(input_file, 11))

df.head()









    



Enumerated named argument list:
	verbose: True
	standard_template: None
	predicate: None
	filter_dict: []
	subst_dict: {}
	replace_null_val: *
	replace_internal_space: _
	header_from_final_template: False
	keep_original_header: True
	header: 
	final_template: []
	init_template: [0, '\t', 1, '\t', 2, '\t', 3, '|', 4, '|', 5, '|', 6, '|', 7, '|', 8, '|', 9, '->', 10, '|', 11, '%|', 12, '|', 13, '|', 14, '|', 15, '|', 16, '|', 17, '|', 18, '|', 19, '\t', 20, '\t', 21]
	return_df: True
	output_file: 
	input_file: ./data/Small_editing_Peng_hg38.bed
Unenumerated named arguments list:
No final template given. Create final template from init template
Initial template values
0	1	2	3|4|5|6|7|8|9->10|11%|12|13|14|15|16|17|18|19	20	21
Final template values
0	1	2	3|4|5|6|7|8|9->10|11%|12|13|14|15|16|17|18|19	20	21
# Transcriptome-wide map of editing sites [hg38 coordinates]
# Reference: Peng et al., Nat. Biotechnol. 30, 253 (2012) [PMID 22327324, DOI 10.1038/nbt.2122]
#
# Data cleaned and converted to BED6, coordinate conversion to hg38 using liftOver.
# Maintainer: Maurits Evers (maurits.evers@anu.edu.au)
#
chr1	1102535	1102536	Peng|chr1|1027779|-|T|Y|A->G|66.67%|37|C|6|T|3|9|intron|C1orf159	0	-
chr1	1221501	1221502	Peng|chr1|1146745|-|T|Y|A->G|36.59%|99|T|26|C|15|42|intron|SDF4	0	-
chr1	1222079	1222080	Peng|chr1|1147323|-|T|Y|A->G|22.73%|94|T|51|C|15|66|intron|SDF4	0	-
chr1	1251840	1251841	Peng|chr1|1177084|-|T|Y|A->G|56.25%|99|C|9|T|7|16|intergenic|-	0	-
chr1	1252243	1252244	Peng|chr1|1177487|-|T|Y|A->G|19.44%|30|T|29|C|7|36|intergenic|-	0	-

None






    Out[140]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      12
      13
      14
      15
      16
      17
      18
      19
      20
      21
    
  
  
    
      0
      chr1
      1102535
      1102536
      Peng
      chr1
      1027779
      -
      T
      Y
      A
      ...
      37
      C
      6
      T
      3
      9
      intron
      C1orf159
      0
      -
    
    
      1
      chr1
      1221501
      1221502
      Peng
      chr1
      1146745
      -
      T
      Y
      A
      ...
      99
      T
      26
      C
      15
      42
      intron
      SDF4
      0
      -
    
    
      2
      chr1
      1222079
      1222080
      Peng
      chr1
      1147323
      -
      T
      Y
      A
      ...
      94
      T
      51
      C
      15
      66
      intron
      SDF4
      0
      -
    
    
      3
      chr1
      1251840
      1251841
      Peng
      chr1
      1177084
      -
      T
      Y
      A
      ...
      99
      C
      9
      T
      7
      16
      intergenic
      -
      0
      -
    
    
      4
      chr1
      1252243
      1252244
      Peng
      chr1
      1177487
      -
      T
      Y
      A
      ...
      30
      T
      29
      C
      7
      36
      intergenic
      -
      0
      -
    
  

5 rows × 22 columns



In [141]:

    
input_file = "./data/gencode_sample.gff3"

df = reformat_table(
    input_file,
    return_df=True,
    standard_template="gff3_ens_transcript", 
    keep_original_header=False,
    header_from_final_template= True,
    verbose=True
    )

print(head(input_file, 11))
df.head()









    



Enumerated named argument list:
	verbose: True
	standard_template: gff3_ens_transcript
	predicate: None
	filter_dict: []
	subst_dict: {}
	replace_null_val: *
	replace_internal_space: _
	header_from_final_template: True
	keep_original_header: False
	header: 
	final_template: []
	init_template: []
	return_df: True
	output_file: 
	input_file: ./data/gencode_sample.gff3
Unenumerated named arguments list:
Using gff3 ensembl transcript template. Non-transcript features will be filtered out
No final template given. Create final template from init template
Initial template values
seqid	source	type	start	end	score	strand	phase	ID=ID;Parent=Parent;gene_id=gene_id;transcript_id=transcript_id;gene_type=gene_type;gene_status=gene_status;gene_name=gene_name;transcript_type=transcript_type;transcript_status=transcript_status;transcript_name=transcript_name;level=level;transcript_support_level=transcript_support_level;tag=tag;havana_gene=havana_gene;havana_transcript=havana_transcript
Final template values
seqid	source	type	start	end	score	strand	phase	ID=ID;Parent=Parent;gene_id=gene_id;transcript_id=transcript_id;gene_type=gene_type;gene_status=gene_status;gene_name=gene_name;transcript_type=transcript_type;transcript_status=transcript_status;transcript_name=transcript_name;level=level;transcript_support_level=transcript_support_level;tag=tag;havana_gene=havana_gene;havana_transcript=havana_transcript
##gff-version 3
#description: evidence-based annotation of the human genome (GRCh38), version 24 (Ensembl 83) - long non-coding RNAs
#provider: GENCODE
#contact: gencode-help@sanger.ac.uk
#format: gff3
#date: 2015-12-03
##sequence-region chr1 1 248956422
chr1	HAVANA	gene	29554	31109	.	+	.	ID=ENSG00000243485.3;gene_id=ENSG00000243485.3;gene_type=lincRNA;gene_status=KNOWN;gene_name=RP11-34P13.3;level=2;tag=ncRNA_host;havana_gene=OTTHUMG00000000959.2
chr1	HAVANA	transcript	29554	31097	.	+	.	ID=ENST00000473358.1;Parent=ENSG00000243485.3;gene_id=ENSG00000243485.3;transcript_id=ENST00000473358.1;gene_type=lincRNA;gene_status=KNOWN;gene_name=RP11-34P1...
chr1	HAVANA	exon	29554	30039	.	+	.	ID=exon:ENST00000473358.1:1;Parent=ENST00000473358.1;gene_id=ENSG00000243485.3;transcript_id=ENST00000473358.1;gene_type=lincRNA;gene_status=KNOWN;gene_name=RP11-34P...
chr1	HAVANA	exon	30564	30667	.	+	.	ID=exon:ENST00000473358.1:2;Parent=ENST00000473358.1;gene_id=ENSG00000243485.3;transcript_id=ENST00000473358.1;gene_type=lincRNA;gene_status=KNOWN;gene_name=RP11-34P...

None






    Out[141]:







  
    
      
      seqid
      source
      type
      start
      end
      score
      strand
      phase
      ID
      Parent
      ...
      gene_status
      gene_name
      transcript_type
      transcript_status
      transcript_name
      level
      transcript_support_level
      tag
      havana_gene
      havana_transcript
    
  
  
    
      0
      chr1
      HAVANA
      transcript
      29554
      31097
      .
      +
      .
      ENST00000473358.1
      ENSG00000243485.3
      ...
      KNOWN
      RP11-34P13.3
      lincRNA
      KNOWN
      RP11-34P13.3-001
      2
      5
      not_best_in_genome_evidence,dotter_confirmed,b...
      OTTHUMG00000000959.2
      OTTHUMT00000002840.1
    
    
      1
      chr1
      HAVANA
      transcript
      30267
      31109
      .
      +
      .
      ENST00000469289.1
      ENSG00000243485.3
      ...
      KNOWN
      RP11-34P13.3
      lincRNA
      KNOWN
      RP11-34P13.3-002
      2
      5
      not_best_in_genome_evidence,basic
      OTTHUMG00000000959.2
      OTTHUMT00000002841.2
    
    
      2
      chr1
      HAVANA
      transcript
      34554
      36081
      .
      -
      .
      ENST00000417324.1
      ENSG00000237613.2
      ...
      KNOWN
      FAM138A
      lincRNA
      KNOWN
      FAM138A-001
      2
      1
      basic
      OTTHUMG00000000960.1
      OTTHUMT00000002842.1
    
    
      3
      chr1
      HAVANA
      transcript
      35245
      36073
      .
      -
      .
      ENST00000461467.1
      ENSG00000237613.2
      ...
      KNOWN
      FAM138A
      lincRNA
      KNOWN
      FAM138A-002
      2
      3;havana_gene=OTTHUMG00000000960.1;havana_tran...
      *
      *
      *
    
    
      4
      chr1
      HAVANA
      transcript
      89295
      120932
      .
      -
      .
      ENST00000466430.5
      ENSG00000238009.6
      ...
      KNOWN
      RP11-34P13.7
      lincRNA
      KNOWN
      RP11-34P13.7-001
      2
      5
      not_best_in_genome_evidence,basic
      OTTHUMG00000001096.2
      OTTHUMT00000003225.1
    
  

5 rows × 23 columns

WEB TOOLS

url_exist



In [201]:

    
jhelp(url_exist, full=True)









    




url_exist (url, **kwargs)

    Predicate verifying if an url exist without downloading all the link



In [143]:

    
url_exist("http://www.google.com") # When this one will be False it will probably be the end of the world









    Out[143]:





True



In [144]:

    
url_exist("http://www.JUYGKUYHGJHFJ.com")









    Out[144]:





True

wget



In [202]:

    
jhelp(wget, full=True)









    




wget (url, out_name='', progress_block=100000000, **kwargs)

    Download a file from an URL to a local storage.
    *  url
        A internet URL pointing to the file to download
    *  outname
        Name of the outfile where (facultative)
    *  progress_block
        size of the byte block for the progression of the download



In [146]:

    
outfile = wget("")
if outfile:
    print(outfile)
    remove(outfile)









    



unknown url type: ''



In [147]:

    
outfile = wget("https://www.encodeproject.org/files/ENCFF000HJC/@@download/ENCFF000HJC.bigWig", "test.bigWig", 50000000)
if outfile:
    print(outfile)
    remove(outfile)









    



Downloading: https://www.encodeproject.org/files/ENCFF000HJC/@@download/ENCFF000HJC.bigWig	Bytes: 258930225
50.0 MB Downloaded	[19.31 %]
100.0 MB Downloaded	[38.62 %]
150.0 MB Downloaded	[57.93 %]
200.0 MB Downloaded	[77.24 %]
250.0 MB Downloaded	[96.55 %]
258.9 MB Downloaded	[100 %]
test.bigWig

FUNCTION TOOLS

print_arg



In [203]:

    
jhelp(print_arg, full=True)









    




print_arg (**kwargs)

    Print calling function named and unnamed arguments



In [149]:

    
def test (A,B,C=7,*args, **kwarg):
    print_arg()

test(1,2,3,5, z=65, x=100)









    



Enumerated named argument list:
	C: 3
	B: 2
	A: 1
Unenumerated named arguments list:
	z: 65
	x: 100
Unnamed positional arguments list:
	5

SSH TOOLS

scp



In [204]:

    
jhelp(scp, full=True)









    




scp (hostname, local_file, remote_dir, username=None, rsa_private_key=None, ssh_config='~/.ssh/config', verbose=False, **kwargs)

    Copy a file over ssh in a target remote directory
    * hostname
        Name of the host ssh server
    * username
        name of the user
    * rsa_private_key
        path to the rsa private key
    * local_file
        path to the local file
    * remote_dir
        path to the target directory
    * ssh_config
        use as an alternative method instead of giving the username and rsa_private_key. Will fetch them from the config file directly



In [151]:

    
#scp(hostname="ebi-cli-001.ebi.ac.uk", local_file="../README.md", remote_dir="~/test", username="aleg", rsa_private_key="/home/aleg/.ssh/ebi_rsa")



In [152]:

    
#scp(hostname="ebi", local_file="../README.md", remote_dir="~/test")

Package Tools

get_package_file



In [153]:

    
jhelp(get_package_file, full=True)









    




get_package_file (package, fp='', **kwargs)

    Verify the existence of a file from the package data and return a file path
    * package
        Name of the package
    * fp
        Relative path to the file in the package. Usually package_name/data/file_name
        if the path points to a directory the directory arborescence will be printed



In [154]:

    
get_package_file("pyCL", "pyCL/")









    



/home/aleg/Programming/pycltools/pycltools/pycltools.py:1947: UserWarning: The 'pyCL' distribution was not found and is required by the application
  warnings.warn(str(E))

SAM/BAM TOOLS

bam_sample



In [155]:

    
jhelp(bam_sample, full=True)









    




bam_sample (fp_in, fp_out, n_reads, verbose=False, **kwargs)

    Sample reads from a SAM/BAM file and write in a new file
    * fp_in
        Path to the input file in .bam/.sam/.cram (the format will be infered from extension)
    * fp_out
        Path to the output file in .bam/.sam/.cram (the format will be infered from extension)
    * n_reads
        number of reads to sample



In [156]:

    
bam_sample("./data/sample.sam", fp_out="./data/sample_100.sam", n_reads=100, verbose=True)
linesample("./data/sample_100.sam", n_lines=10, max_char_line=100)









    



Found 5000 reads in input file
Wrote 100 reads in output file
20	@SQ	SN:chr21	LN:46709983
44	@SQ	SN:KI270305.1	LN:1472
111	@SQ	SN:KI270508.1	LN:1951
146	@SQ	SN:KI270710.1	LN:40176
170	@SQ	SN:KI270734.1	LN:165050
171	@SQ	SN:KI270735.1	LN:42811
217	chr14|61657775|61657835|+|13447.7	272	chr7	127489894	0	61M	*	0	0	*	*	NM:i:3	MD:Z:39A14A4A1	AS:i:49
234	chr17|43159683|43159737|-|19991.10	272	chr9	131908717	0	55M	*	0	0	*	*	NM:i:0	MD:Z:55	AS:i:55
239	chr18|14010134|14010194|+|21568.4	272	chr5	4925139	0	61M	*	0	0	*	*	NM:i:0	MD:Z:61	AS:i:61
266	chr3|138485055|138485115|+|33361.101	256	chr12	6132886	0	61M	*	0	0	*	*	NM:i:5	MD:Z:16G4C0A3C25G8	AS:...



In [157]:

    
bam_sample("./data/sample.sam", fp_out="./data/sample_100.bam", n_reads=100, verbose=True)
!samtools view "./data/sample_100.bam" | head









    



Found 5000 reads in input file
Wrote 100 reads in output file
chr1|805036|805096|+|89.10	272	chr8	436410	0	61M	*	0	0	*	*	NM:i:3	MD:Z:7A19C0A32	AS:i:46
chr1|110408997|110409057|+|2013.22	272	chr15	35143322	0	13H48M	*	0	0	*	*	NM:i:3	MD:Z:37G0A2T6	AS:i:37
chr1|121462469|121462529|+|2240.83	0	chr1	121462469	48	61M	*	0	0	AATCTATTTATTTATTTTTCTTCAGTGTTACAATGAAACAACATTGCTTTATTTAAATTTT	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	NM:i:0	MD:Z:61	AS:i:61	XS:i:46
chr1|205386423|205386483|+|3446.41	272	KI270750.1	53599	0	7H47M7H	*	0	0	*	*	NM:i:3	MD:Z:27C0A6A11	AS:i:32
chr1|221508699|221508759|+|3731.6	272	chrX	69857918	0	37M24H	*	0	0	*	*	NM:i:0	MD:Z:37	AS:i:37
chr1|246607871|246607931|+|4121.10	256	chr19	29557507	0	19H42M	*	0	0	*	*	NM:i:0	MD:Z:42	AS:i:42
chr10|14878128|14878188|-|4488.16	256	chr15	84959120	0	17H44M	*	0	0	*	*	NM:i:0	MD:Z:44	AS:i:44
chr10|65751058|65751118|+|5083.9	272	chr2	222774610	0	18H43M	*	0	0	*	*	NM:i:1	MD:Z:6G36	AS:i:38
chr10|106187699|106187759|+|5744.7	272	chr10	73831305	0	61M	*	0	0	*	*	NM:i:5	MD:Z:0T44C4T3A5C0	AS:i:44
chr10|125698897|125698957|+|5980.5	0	chr10	125698897	60	61M	*	0	0	AGGTGGGCTCCATTTGGCCTCCTTCCTTGGTCCATTCTCATCTTCCTGGGCCCTGCGGATG	IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII	NM:i:0	MD:Z:61	AS:i:61	XS:i:0



In [158]:

    
bam_sample("./data/sample.txt", fp_out="./data/sample_100.bam", n_reads=100, verbose=True)









    



/home/aleg/Programming/pycltools/pycltools/pycltools.py:1990: UserWarning: Invalid input file format (.bam/.sam/.cram)
  warnings.warn ("Invalid input file format (.bam/.sam/.cram)")



In [159]:

    
bam_sample("./data/sample.sam", fp_out="./data/sample_100.txt", n_reads=100, verbose=True)









    



/home/aleg/Programming/pycltools/pycltools/pycltools.py:1999: UserWarning: Invalid output file format (.bam/.sam/.cram)
  warnings.warn ("Invalid output file format (.bam/.sam/.cram)")

DNA SEQUENCE TOOLS

base_generator



In [205]:

    
jhelp(base_generator, full = True)









    




base_generator (bases=['A', 'T', 'C', 'G'], weights=[0.280788, 0.281691, 0.193973, 0.194773], **kwargs)

    Generator returning DNA/RNA bases according to a probability weightning
    * bases: list (default ["A","T","C","G"])
        DNA RNA bases allowed
    * weights: list (default [0.280788,0.281691,0.193973,0.194773])
        Probability of each base to be returned. Should match the index of bases. The sum does not need to be equal to 1.
        If the list is empty bases will be returned with a flat probability. The default values represent the frequency in the human
        genome (excluding N).



In [161]:

    
bg = base_generator()
for i in range(10):
    print (next(bg))









    



A
T
C
A
T
C
G
G
T
T



In [162]:

    
bg = base_generator(bases=['A', 'T', 'C', 'G', 'N'], weights=[0.8, 0.8, 0.2, 0.2, 0.1])
for i in range(10):
    print (next(bg))









    



G
T
A
G
G
G
G
A
T
G

make_sequence



In [206]:

    
jhelp(make_sequence, full=True)









    




make_sequence (bases=['A', 'T', 'C', 'G'], weights=[0.280788, 0.281691, 0.193973, 0.194773], length=1000, **kwargs)

    return a sequence of DNA/RNA bases according to a probability weightning
    * bases: list (default ["A","T","C","G"])
        DNA RNA bases allowed in the sequence
    * weights: list (default [0.280788,0.281691,0.193973,0.194773])
        Probability of each base to be returned. Should match the index of bases. The sum does not need to be equal to 1.
        If the list is empty bases will be returned with a flat probability. The default values represent the frequency in the human
        genome (excluding N).
    * length: int (default 1000)
        length of the sequence to be returned



In [164]:

    
make_sequence()









    Out[164]:





'ACTGGCGTCGGATCGTGAGGTACTGATATTTCCGGCTCGCTGCCTATACCTATCAGTCCAAGTATGATGACTAGGAAGAACGCTAGTAATAGTGGGCGTTCACGGTTGAGAACCTCTTATTCATGGAAATAAATATTGAGTCTTGTGGGTCTGATAAGCGTTCCCCAAGTAAGTACGAAAAATCTGAGAGCCAAAGGAACTACCGTTATGAGGATCTCTGTTTAAATTCTGATAATATGTATTTGGATCCGAAATACGCGGTGATGGTGTGTAGTTACCTTAGGCTGATCGGTAAGCACTGCATCTACAGTTATAGTCCCCACTTTTCGTTTGCAAGCAAAAGTTGATCTATGTCACCCTCAATCTCGTAAAGGTGTTGCTATGGTTAAAGTAAGTGTCTCCTAGTGCTGATCAGAGCAAACGCTAAGGGAAAGGGGAGCTAAGCCCTTATGATCAAAGAGACAGATGGCTTAGCGCCCAATTCAGCTATTATGTGAAATACATGTACGGGAAAAATTCTTCACTTGGAAGAAACAATGGTGAGTCTTTATCCAGGAACATGTAAGGAATTTGTAGTTCCAAATTCGGTCTATGTCCAATGATGACAGAAGCTAACGTATTGCGTTATGAATCAGGTGTACTTGTGTTTGATTTTAGTAATCCTTCGACTGAATTTGCATCTGTGGACGAGATATCACGGAGATTTGGGTGTCTCTACTTGAACATCATAGTTTGTCATAGGGCTAGTTCTTGGCATTTAATAAAATTAATAATATTGACTAATAACAACGCGACTGTTCGTCGCTAAATTGAAAACCATACAATGATCTATTTCAATACCTATTTGTCCCCACAGTAATCGATTTGCTTTATTTATAAGAGAAGATTATCAATATTTTAAGTTCTATGAATTCCTAGCACTCATAGGTCTGTGTCCCGGTGTTCCAATCTGGTGTCAACGTCGATCAGCCTTTGTCTAGTTCTTAATCTAGAGTTTAGT'



In [165]:

    
make_sequence(bases=['A', 'T', 'C', 'G', 'N'], weights=[], length=100)









    Out[165]:





'TATNGGATTNANGGCGTNGAATGNATNANCGTTGNNCCAAATTGANCGNTGTNNTTNGATNNTNAGGCTTGCCCTCNCGCAAAACCNGNCAACTTNNNNG'



In [166]:

    
make_sequence(bases=['A', 'T', 'C', 'G', 'N'], weights=[0.8, 0.8, 0.2, 0.2, 0.1], length=100)









    Out[166]:





'ATCATGATCGNTTTTAATCAAAATTATCTTAATAAATTAATTTCTATTTTANGNAANAGATATCTNTCTTCCTNATACNCAATATAAGTTAAAACTAGGG'

	0	1	2	3	4	5	6	7	8	9	...	12	13	14	15	16	17	18	19	21
0	chr1	1102535	1102536	Peng	chr1	1027779	-	T	Y	A	...	37	C	6	T	3	9	intron	C1orf159	-
1	chr1	1221501	1221502	Peng	chr1	1146745	-	T	Y	A	...	99	T	26	C	15	42	intron	SDF4	-
2	chr1	1222079	1222080	Peng	chr1	1147323	-	T	Y	A	...	94	T	51	C	15	66	intron	SDF4	-
3	chr1	1251840	1251841	Peng	chr1	1177084	-	T	Y	A	...	99	C	9	T	7	16	intergenic	-	-
4	chr1	1252243	1252244	Peng	chr1	1177487	-	T	Y	A	...	30	T	29	C	7	36	intergenic	-	-

	seqid	source	type	start	end	score	strand	phase	ID	Parent	...	gene_status	gene_name	transcript_type	transcript_status	transcript_name	level	transcript_support_level	tag	havana_gene	havana_transcript
0	chr1	HAVANA	transcript	29554	31097	.	+	.	ENST00000473358.1	ENSG00000243485.3	...	KNOWN	RP11-34P13.3	lincRNA	KNOWN	RP11-34P13.3-001	2	5	not_best_in_genome_evidence,dotter_confirmed,b...	OTTHUMG00000000959.2	OTTHUMT00000002840.1
1	chr1	HAVANA	transcript	30267	31109	.	+	.	ENST00000469289.1	ENSG00000243485.3	...	KNOWN	RP11-34P13.3	lincRNA	KNOWN	RP11-34P13.3-002	2	5	not_best_in_genome_evidence,basic	OTTHUMG00000000959.2	OTTHUMT00000002841.2
2	chr1	HAVANA	transcript	34554	36081	.	-	.	ENST00000417324.1	ENSG00000237613.2	...	KNOWN	FAM138A	lincRNA	KNOWN	FAM138A-001	2	1	basic	OTTHUMG00000000960.1	OTTHUMT00000002842.1
3	chr1	HAVANA	transcript	35245	36073	.	-	.	ENST00000461467.1	ENSG00000237613.2	...	KNOWN	FAM138A	lincRNA	KNOWN	FAM138A-002	2	3;havana_gene=OTTHUMG00000000960.1;havana_tran...	*	*	*
4	chr1	HAVANA	transcript	89295	120932	.	-	.	ENST00000466430.5	ENSG00000238009.6	...	KNOWN	RP11-34P13.7	lincRNA	KNOWN	RP11-34P13.7-001	2	5	not_best_in_genome_evidence,basic	OTTHUMG00000001096.2	OTTHUMT00000003225.1