In [2]:
from inspect import getmembers, getsource
from pycltools import pycltools

for line in (getsource(pycltools)).split("\n"):
    if line.startswith("##"):
        pycltools.jprint(line, bold=True, size=130)
    if line.startswith("def"):
        name = line.split()[1]
        method = getattr(pycltools, name)
        pycltools.jhelp(method)
##~~~~~~~ JUPYTER NOTEBOOK SPECIFIC TOOLS ~~~~~~~#
jhelp (function, full=True, print_private=False, **kwargs)
Print a nicely formatted help string based on the name of a declared function. By default, print the function
definition and description
* function
Name of a declared function or class method
* full
If True, the help string will include a description of all arguments
stdout_print (*args)
Emulate print but use sys.stdout instead. It can be useful in specific situations where print
is not behaving optimally (for example with tqdm)
jprint (*args, **kwargs)
FOR JUPYTER NOTEBOOK ONLY
Format a string in HTML and print the output. Equivalent of print, but highly customizable. Many options can be
passed to the function.
* *args
One or several objects that can be cast to str
* **kwargs
Formatting options to tweak the html rendering
Boolean options : bold, italic, highlight, underlined, striked, subscripted, superscripted
String options: font, color, size, align, background_color, line_height
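As a minimal usage sketch (to be run in a Jupyter notebook with pycltools installed; the messages are hypothetical), jprint combines any of the boolean and string options listed above:

from pycltools import pycltools
pycltools.jprint("Alignment finished", bold=True, color="green", size=110)
pycltools.jprint("Low mapping quality", italic=True, highlight=True, background_color="yellow")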
toogle_code (**kwargs)
FOR JUPYTER NOTEBOOK ONLY
Hide code with a clickable link in a Jupyter notebook
larger_display (percent=100, **kwargs)
FOR JUPYTER NOTEBOOK ONLY
Resize the area of the screen containing the notebook according to a given percentage of the available width
* percent
Percentage of the width of the screen to use [DEFAULT:100]
hide_traceback ()
FOR JUPYTER NOTEBOOK ONLY
Remove the traceback of exceptions and return only the exception message and type
is_readable_file (fp, raise_exception=True, **kwargs)
Verify the readability of a file or list of files
is_gziped (fp, **kwargs)
Return True if the file is gzipped, else False
has_extension (fp, ext, pos=-1, raise_exception=False, **kwargs)
Test presence of extension in a file path
* ext
Single extension name or list of extension names without the dot. Example ["gz", "fa"]
* pos
Position of the extension in the file path. -1 for the last, -2 for the penultimate and so on [DEFAULT -1 = last position]
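A minimal sketch of the file-checking helpers, assuming pycltools is installed; the path is hypothetical:

from pycltools import pycltools
fp = "./data/sample.fastq.gz"                           # hypothetical file
pycltools.is_readable_file(fp)                          # raises an exception if the file is not readable
print(pycltools.is_gziped(fp))                          # True if the file is gzipped
print(pycltools.has_extension(fp, ext=["gz", "fa"]))    # True if the last extension is "gz" or "fa"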
file_basename (fp, **kwargs)
Return the basename of a file without folder location and extension
extensions (fp, comp_ext_list=['gz', 'tgz', 'zip', 'xz', 'bz2'], **kwargs)
Return the extension of a file in lower case. For archived files ("gz", "tgz", "zip", "xz", "bz2"),
the method will output the base extension + the archive extension as a string
extensions_list (fp, comp_ext_list=['gz', 'tgz', 'zip', 'xz', 'bz2'], **kwargs)
Return the extension of a file in lower case. For archived files ("gz", "tgz", "zip", "xz", "bz2"),
the method will output the base extension + the archive extension as a list
file_name (fp, **kwargs)
Return the complete name of a file with the extension but without folder location
dir_name (fp, **kwargs)
Return the name of the directory where the file is located
dir_path (fp, **kwargs)
Return the directory path of a file
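A short sketch of the path-parsing helpers on a hypothetical path; the comments restate what each call is documented to return:

from pycltools import pycltools
fp = "/home/user/data/sample.fastq.gz"      # hypothetical path
print(pycltools.file_basename(fp))          # basename without folder and extension
print(pycltools.extensions(fp))             # base + archive extension as a string
print(pycltools.extensions_list(fp))        # base + archive extension as a list
print(pycltools.file_name(fp))              # file name with extension, without folder
print(pycltools.dir_name(fp))               # name of the containing directory
print(pycltools.dir_path(fp))               # directory path of the file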
##~~~~~~~ STRING FORMATTING ~~~~~~~#
supersplit (string, separator='', **kwargs)
Like split, but can take a list of separators instead of a single separator
rm_blank (name, replace='', **kwargs)
Replace blank spaces in a name with a given character (default = remove)
Blanks at the extremities are always removed, not replaced
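A minimal sketch of the two string helpers, assuming pycltools is installed; the input strings are arbitrary examples:

from pycltools import pycltools
print(pycltools.supersplit("chr1;631539 631540|id1", separator=[";", " ", "|"]))
print(pycltools.rm_blank("  sample name 1 ", replace="_"))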
concatenate (src_list, dest, **kwargs)
Concatenate a list of src files into a single output file. Handles gzipped files (mixed input and output)
copyFile (src, dest, **kwargs)
Copy a single file to a destination file or folder (with error handling/reporting)
* src
Source file path
* dest
Path of the destination file or folder where the source file will be copied
gzip_file (fpin, fpout=None, **kwargs)
gzip a file
* fpin
Path of the input uncompressed file
* fpout
Path of the output compressed file (optional)
gunzip_file (fpin, fpout=None, **kwargs)
Gunzip a file
* fpin
Path of the input compressed file
* fpout
Path of the output uncompressed file (optional)
remove_file (fp, exception_if_exist=False)
Try to remove a file from disk.
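A sketch chaining the file-manipulation helpers, assuming pycltools is installed; all paths are hypothetical:

from pycltools import pycltools
pycltools.concatenate(src_list=["part1.txt", "part2.txt.gz"], dest="merged.txt.gz")  # mixed plain/gzipped input
pycltools.copyFile(src="merged.txt.gz", dest="./backup/")
pycltools.gunzip_file("merged.txt.gz", fpout="merged.txt")    # fpout is optional
pycltools.gzip_file("merged.txt")                             # fpout omitted (optional)
pycltools.remove_file("merged.txt")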
linerange (fp, range_list=[], line_numbering=True, max_char_line=150, **kwargs)
Print a range of lines in a file according to a list of (start, end) lists. Handles gzipped files
* fp
Path to the file to be parsed
* range_list
list of start, end coordinates lists or tuples
* line_numbering
If True the number of the line will be indicated in front of the line
* max_char_line
Maximal number of characters to print per line
cat (fp, max_lines=100, line_numbering=False, max_char_line=150, **kwargs)
Emulate the Linux cat command but with line cap protection. Handles gzipped files
* fp
Path to the file to be parsed
* max_lines
Maximal number of lines to print
* line_numbering
If True the number of the line will be indicated in front of the line
* max_char_line
Maximal number of characters to print per line
tail (fp, n=10, line_numbering=False, max_char_line=150, **kwargs)
Emulate the Linux tail command. Handles gzipped files
* fp
Path to the file to be parsed
* n
Number of lines to print starting from the end of the file
* line_numbering
If True the number of the line will be indicated in front of the line
* max_char_line
Maximal number of characters to print per line
head (fp, n=10, ignore_comment_line=False, comment_char='#', max_char_line=200, sep='\t', max_char_col=50, **kwargs)
Emulate the Linux head command. Handles gzipped files and bam files
* fp
Path to the file to be parsed. Works with text, gzipped and binary bam/sam files
* n
Number of lines to print starting from the beginning of the file (Default 10)
* ignore_comment_line
Skip initial lines starting with a specific character. Pointless for bam files (Default False)
* comment_char
Character or string for the ignore_comment_line argument (Default "#")
* max_char_line
Maximal number of characters to print per line (Default 200)
linesample (fp, n_lines=100, line_numbering=True, max_char_line=150, **kwargs)
Randomly sample lines in a file and print them. Handles gzipped files
* fp
Path to the file to be parsed
* n_lines
Number of lines to sample in the file
* line_numbering
If True the number of the line will be indicated in front of the line
* max_char_line
Maximal number of characters to print per line
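A sketch illustrating the file-inspection helpers above on a hypothetical gzipped annotation file:

from pycltools import pycltools
fp = "./data/annotations.gtf.gz"                             # hypothetical gzipped file
pycltools.head(fp, n=5, ignore_comment_line=True, comment_char="#")
pycltools.tail(fp, n=5, line_numbering=True)
pycltools.linerange(fp, range_list=[[0, 10], [500, 510]])    # two (start, end) ranges
pycltools.linesample(fp, n_lines=20)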
count_uniq (fp, colnum, select_values=None, drop_values=None, skip_comment='#', sep='\t', **kwargs)
Count unique occurrences in a specific column of a tabulated file
* fp
Path to the file to be parsed (gzipped or not)
* colnum
Index number of the column to summarize
* select_values
Select specific lines in the file based on a dictionary containing column index(es) and value(s) or list
of values to select. Example {2:["exon", "transcript"], 4:"lincRNA"}. DEFAULT=None
* drop_values
Same as select_values, but the matching lines will be dropped instead. DEFAULT=None
* skip_comment
Drop any comment lines starting with this character. DEFAULT="#"
* sep
Character or list of characters to use in order to split the lines. Example [" ",";"]. DEFAULT="\t"
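A sketch of a count_uniq call mirroring the documented arguments, on a hypothetical tabulated GTF file:

from pycltools import pycltools
pycltools.count_uniq("./data/annotations.gtf.gz", colnum=2,
                     select_values={2: ["exon", "transcript"]},   # keep only exon/transcript lines
                     drop_values={0: "chrM"},                     # drop mitochondrial lines
                     sep="\t")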
colsum (fp, colrange=None, separator='', header=False, ignore_hashtag_line=False, max_items=10, ret_type='md', **kwargs)
Create a summary of selected columns of a file
* fp
Path to the file to be parsed
* colrange
A list of column indexes to parse
* separator
A character or a list of characters to split the lines
* ignore_hashtag_line
Skip lines starting with a # symbol
* max_items
Maximum number of items per line
* ret_type
Possible return types:
md = markdown formatted table,
dict = raw parsing dict,
report = Indented_text_report
fastcount (fp, **kwargs)
Efficient way to count the number of lines in a file. Handles gzipped files
simplecount (fp, ignore_hashtag_line=False, **kwargs)
Simple way to count the number of lines in a file with more options
mkdir (fp, level=1, **kwargs)
Reproduce the behaviour of the UNIX "mkdir -p" command
(i.e. if the path already exists no exception will be raised).
Can create nested directories recursively
* fp
path name where the folder should be created
* level
level in the path where to start to create the directories. Used by the program for the recursive creation of
directories
dir_walk (fp)
Print a directory tree
make_cmd_str (prog_name, opt_dict={}, opt_list=[], **kwargs)
Create a Unix-like command line string from the program name, a dict of named arguments and a list of unnamed arguments.
Example: make_cmd_str("bwa", {"b":None, "t":6, "i":"../idx/seq.fa"}, ["../read1", "../read2"])
* prog_name
Name (if added to the system path) or path of the program
* opt_dict
Dictionary of option arguments such as "-t 5". The option flag has to be the key (without "-") and the
option value the dictionary value. If no value is requested after the option flag, None has to be assigned
to the value field.
* opt_list
List of simple command line arguments
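The documented example can be run as follows (the bwa options and paths are illustrative only):

from pycltools import pycltools
cmd = pycltools.make_cmd_str("bwa",
                             opt_dict={"b": None, "t": 6, "i": "../idx/seq.fa"},
                             opt_list=["../read1", "../read2"])
print(cmd)   # e.g. "bwa -b -t 6 -i ../idx/seq.fa ../read1 ../read2"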
bash_basic (cmd, virtualenv=None, **kwargs)
Send a basic bash command
* cmd
A command line string formatted as a string
* virtualenv
If specified, will try to load a virtualenvwrapper environment before running the command
bash (cmd, virtualenv=None, live='stdout', print_stdout=True, ret_stdout=False, log_stdout=None, print_stderr=True, ret_stderr=False, log_stderr=None, print_cmd=False, dry=False, **kwargs)
More advanced version of bash calling, with live printing of the standard output and the possibility to log or
redirect the output and error, either as returned strings or directly to files. If ret_stderr and ret_stdout are both True a
tuple will be returned, and if both are False None will be returned
* cmd
A command line string formatted as a string
* virtualenv
If specified, will try to load a virtualenvwrapper environment before running the command
* print_stdout
If True the standard output will be LIVE printed through the system standard output stream
* ret_stdout
If True the standard output will be returned as a string
* log_stdout
If a filename is given, the standard output will be logged in this file
* print_stderr
If True the standard error will be printed through the system standard error stream
* ret_stderr
If True the standard error will be returned as a string
* log_stderr
If a filename is given, the standard error will be logged in this file
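A sketch of a bash call that captures stdout as a string and logs stderr to a file, assuming pycltools is installed; the command and paths are hypothetical:

from pycltools import pycltools
stdout = pycltools.bash("samtools flagstat sample.bam",
                        ret_stdout=True, print_stdout=False,
                        log_stderr="./logs/flagstat.err")
print(stdout)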
bash_update (cmd, update_freq=1, **kwargs)
FOR JUPYTER NOTEBOOK
Run a bash command and print the output in the cell. The output is updated each time until the output is None.
This is suitable for monitoring tasks that log events until there is nothing else to print such as bjobs or bpeeks.
* cmd
A command line string formatted as a string
* update_freq
The frequency of output updating in seconds [DEFAULT: 1]
bsub (cmd=None, virtualenv=None, mem=None, threads=None, queue=None, wait_jobid=None, stdout_fp=None, stderr_fp=None, send_email=False, print_cmd=True, dry=False, **kwargs)
FOR JUPYTER NOTEBOOK IN LSF environment
Send an LSF bsub command through bash and return the JOBID
For more information read the bsub documentation
* cmd
A command line string formatted as a string
* virtualenv
If specified will try to load a virtualenvwrapper environment before runing the command
* mem
Memory to reserve (-M and -R 'rusage[mem=...]')
* threads
Number of threads to reserve (-n)
* queue
Name of the LSF queue to be used (-q)
* wait_jobid
jobid or list of jobids to wait for before executing this command (-w 'post_done(jobid)')
* stdout_fp
Path of the file where to write the standard output of the command (-oo)
* stderr_fp
Path of the file where to write the standard error of the command (-eo)
* send_email
If True, will force LSF to send an email even if stdout_fp and/or stderr_fp is given
bjobs ()
FOR JUPYTER NOTEBOOK IN LSF environment
Emulate LSF bjobs command. Return a DataFrame of jobs
bjobs_update (update_freq=5)
FOR JUPYTER NOTEBOOK IN LSF environment
Emulate LSF bjobs command but update the cell every x seconds
Cell is locked
bjobs_lock (jobid=None, update_freq=2, final_delay=2)
FOR JUPYTER NOTEBOOK IN LSF environment
Check if bjobs has running or pending jobs until all are done
* jobid
List of jobids to check
* update_freq
The frequency of output updating in seconds [DEFAULT: 2]
* final_delay
Final delay in seconds at the end of all jobs to prevent IO errors [DEFAULT: 2]
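A sketch of a typical LSF workflow chaining these helpers in a notebook; the job command, queue name and paths are hypothetical:

from pycltools import pycltools
jobid = pycltools.bsub("bwa mem ../idx/seq.fa ../read1 > aln.sam",
                       mem=8000, threads=4, queue="normal",
                       stdout_fp="./logs/aln.out", stderr_fp="./logs/aln.err")
pycltools.bjobs_lock(jobid=[jobid], update_freq=5)   # block until the job is done
pycltools.bjobs()                                    # DataFrame of current jobs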
##~~~~~~~ DICTIONARY FORMATTING ~~~~~~~#
dict_to_md (d, key_label='', value_label='', transpose=False, sort_by_key=False, sort_by_val=True, max_items=None, **kwargs)
Transform a dict into a markdown formatted table
dict_to_report (d, tab='\t', ntab=0, sep=':', sort_dict=True, max_items=None, **kwargs)
Recursive function to return a text report from nested dict or OrderedDict objects
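A sketch of the two dict formatters on small, made-up dictionaries:

from pycltools import pycltools
d = {"exon": 120, "intron": 45, "UTR": 12}
print(pycltools.dict_to_md(d, key_label="feature", value_label="count", sort_by_val=True))
nested = {"sample1": {"reads": 1000, "mapped": 950}, "sample2": {"reads": 800, "mapped": 720}}
print(pycltools.dict_to_report(nested, tab="  "))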
##~~~~~~~ TABLE FORMATTING ~~~~~~~#
reformat_table (input_file, output_file='', return_df=False, init_template=[], final_template=[], header='', keep_original_header=True, header_from_final_template=False, replace_internal_space='_', replace_null_val='*', subst_dict={}, filter_dict=[], predicate=None, standard_template=None, verbose=False, **kwargs)
Reformat a table given an initial and a final line templates indicated as a list where numbers
indicate the data column and strings the formatting characters
* input_file
A file with a structured text formatting (gzipped or not)
* output_file
A file path to output the reformatted table (if empty will not write in a file)
* return_df
If True, will return a pandas DataFrame containing the reformatted table (third-party pandas package required);
by default the columns will be named after the final template [DEFAULT:False]
* init_template
A list of indexes and separators describing the structure of the input file
Example initial line = "chr1 631539 631540 Squires|id1 0 +"
Initial template = [0," ",1," ",2," ",3,"|",4," ",5," ",6]
Alternatively, instead of the numbers, string indexes can be used, but they need to be enclosed in curly
brackets to differentiate them from the separators. This greatly simplifies the writing of the final template.
Example initial line = "chr1 631539 631540 Squires|id1 0 +"
Initial template = ["{chrom}"," ","{start}"," ","{end}","|","{name}"," ","{score}"," ","{strand}"]
* final_template
A list of indexes and separators describing the required structure of the output file. Name indexes need to
match indexes of the init_template and have to follow the same syntax [DEFAULT: same as init_template]
Example final line = "chr1 631539 631540 m5C|-|HeLa|22344696 - -"
Final template = [0," ",1," ",2," m5C|-|HeLa|22344696 - ",6]
* header
A string to write as a file header at the beginning of the file
* keep_original_header
If True the original header of the input file will be copied at the beginning of the output file [DEFAULT:True]
* header_from_final_template
Generate a header according to the name or number of the fields given in the final_template [DEFAULT:False]
* replace_internal_space
All internal blank space will be replaced by this character [DEFAULT:"_"]
* replace_null_val
Field with no value will be replaced by this character [DEFAULT:"*"]
* subst_dict
Nested dictionary of substitution per position to replace specific values by others [DEFAULT:None]
Example: { 0:{"chr1":"1","chr2":"2"}, 3:{"Squires":"5376774764","Li":"27664684"}}
* filter_dict
A dictionary of list per position to filter out lines with specific values [DEFAULT:None]
Example: { 0:["chr2", "chr4"], 1:["46767", "87765"], 5:["76559", "77543"]}
* predicate
A lambda predicate function for more advanced filtering operations [DEFAULT:None]
Example: lambda val_dict: abs(int(val_dict[1])-int(val_dict[2])) <= 2000
* standard_template
Existing standard template to parse the file instead of providing one manually. List of saved templates:
- "gff3_ens_gene" = Template for ensembl gff3 fields. Select only the genes lines and decompose to individual elements.
- "gff3_ens_transcript" = Template for ensembl gff3 fields. Select only the transcript lines and decompose to individual elements.
- "gtf_ens_gene" = Template for ensembl gft fields. Select only the genes lines and decompose to individual elements
* verbose
If True will print detailed information [DEFAULT:False]
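A sketch applying reformat_table with the initial and final templates from the example above; the input and output paths are hypothetical:

from pycltools import pycltools
pycltools.reformat_table(
    input_file="./data/sites.bed",
    output_file="./data/sites_reformatted.bed",
    init_template=[0, " ", 1, " ", 2, " ", 3, "|", 4, " ", 5, " ", 6],
    final_template=[0, " ", 1, " ", 2, " m5C|-|HeLa|22344696 - ", 6],
    replace_internal_space="_",
    replace_null_val="*")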
##~~~~~~~ WEB TOOLS ~~~~~~~#
url_exist (url, **kwargs)
Predicate verifying that a URL exists, without downloading the linked file
wget (url, out_name='', progress_block=100000000, **kwargs)
Download a file from a URL to local storage.
* url
An internet URL pointing to the file to download
* out_name
Name of the output file (optional)
* progress_block
Size of the byte block used to report the progress of the download
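A sketch combining the two web helpers; the URL and output path are placeholders:

from pycltools import pycltools
url = "http://example.com/annotations.gtf.gz"      # hypothetical URL
if pycltools.url_exist(url):
    pycltools.wget(url, out_name="./data/annotations.gtf.gz")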
##~~~~~~~ FUNCTIONS TOOLS ~~~~~~~#
print_arg (**kwargs)
Print the named and unnamed arguments of the calling function
##~~~~~~~ SSH TOOLS ~~~~~~~#
scp (hostname, local_file, remote_dir, username=None, rsa_private_key=None, ssh_config='~/.ssh/config', verbose=False, **kwargs)
Copy a file over ssh in a target remote directory
* hostname
Name of the host ssh server
* username
Name of the user
* rsa_private_key
Path to the RSA private key
* local_file
Path to the local file
* remote_dir
Path to the target remote directory
* ssh_config
Use as an alternative method instead of giving the username and rsa_private_key; they will be fetched from the config file directly
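A sketch of an scp call relying on an ~/.ssh/config entry rather than explicit credentials; the host name and paths are hypothetical:

from pycltools import pycltools
pycltools.scp(hostname="my-cluster",
              local_file="./results/summary.tsv",
              remote_dir="/home/user/results",
              ssh_config="~/.ssh/config")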
##~~~~~~~ PACKAGE TOOLS ~~~~~~~#
get_package_file (package, fp='', **kwargs)
Verify the existence of a file from the package data and return a file path
* package
Name of the package
* fp
Relative path to the file in the package. Usually package_name/data/file_name
If the path points to a directory, the directory tree will be printed
##~~~~~~~ SAM/BAM TOOLS ~~~~~~~#
bam_sample (fp_in, fp_out, n_reads, verbose=False, **kwargs)
Sample reads from a SAM/BAM file and write in a new file
* fp_in
Path to the input file in .bam/.sam/.cram (the format will be inferred from the extension)
* fp_out
Path to the output file in .bam/.sam/.cram (the format will be inferred from the extension)
* n_reads
Number of reads to sample
##~~~~~~~ DNA SEQUENCE TOOLS ~~~~~~~#
base_generator (bases=['A', 'T', 'C', 'G'], weights=[0.280788, 0.281691, 0.193973, 0.194773], **kwargs)
Generator returning DNA/RNA bases according to a probability weighting
* bases: list (default ["A","T","C","G"])
DNA RNA bases allowed
* weights: list (default [0.280788,0.281691,0.193973,0.194773])
Probability of each base to be returned. Should match the index of bases. The sum does not need to be equal to 1.
If the list is empty bases will be returned with a flat probability. The default values represent the frequency in the human
genome (excluding N).
make_sequence (bases=['A', 'T', 'C', 'G'], weights=[0.280788, 0.281691, 0.193973, 0.194773], length=1000, **kwargs)
Return a sequence of DNA/RNA bases according to a probability weighting
* bases: list (default ["A","T","C","G"])
DNA RNA bases allowed in the sequence
* weights: list (default [0.280788,0.281691,0.193973,0.194773])
Probability of each base to be returned. Should match the index of bases. The sum does not need to be equal to 1.
If the list is empty bases will be returned with a flat probability. The default values represent the frequency in the human
genome (excluding N).
* length: int (default 1000)
Length of the sequence to be returned
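A sketch of the two sequence helpers with their default base weights, assuming (as the description suggests) that base_generator() returns a Python generator:

from pycltools import pycltools
gen = pycltools.base_generator()                 # weighted generator of single bases
print([next(gen) for _ in range(10)])            # draw 10 bases
print(pycltools.make_sequence(length=50))        # 50-base random sequence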
Content source: a-slide/pycl