In [2]:
from inspect import getmembers, getsource
from pycltools import pycltools

for line in (getsource(pycltools)).split("\n"):
    if line.startswith("##"):
        pycltools.jprint(line, bold=True, size=130)
    if line.startswith("def"):
        name = line.split()[1]
        method = getattr(pycltools, name)
        pycltools.jhelp(method)
##~~~~~~~ JUPYTER NOTEBOOK SPECIFIC TOOLS ~~~~~~~#
jhelp (function, full=True, print_private=False, **kwargs)
Print a nicely formatted help string based on the name of a declared function. By default, print the function
definition and description
* function
Name of a declared function or class method
* full
If True, the help string will include a description of all arguments
stdout_print (*args)
Emulate print but use sys.stdout instead. It can be useful in specific situations where print
is not behaving optimally (for example with tqdm)
jprint (*args, **kwargs)
FOR JUPYTER NOTEBOOK ONLY
Format a string in HTML and print the output. Equivalent of print, but highly customizable. Many options can be
passed to the function.
* *args
One or several objects that can be cast to str
* **kwargs
Formatting options to tweak the html rendering
Boolean options : bold, italic, highlight, underlined, striked, subscripted, superscripted
String options: font, color, size, align, background_color, line_height
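As a minimal usage sketch (to be run in a Jupyter notebook with pycltools installed; the messages are hypothetical), jprint combines any of the boolean and string options listed above:

from pycltools import pycltools
pycltools.jprint("Alignment finished", bold=True, color="green", size=110)
pycltools.jprint("Low mapping quality", italic=True, highlight=True, background_color="yellow")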
toogle_code (**kwargs)
FOR JUPYTER NOTEBOOK ONLY
Hide code with a clickable link in a Jupyter notebook
larger_display (percent=100, **kwargs)
FOR JUPYTER NOTEBOOK ONLY
Resize the area of the screen containing the notebook according to a given percentage of the available width
* percent
Percentage of the width of the screen to use [DEFAULT:100]
hide_traceback ()
FOR JUPYTER NOTEBOOK ONLY
Remove the traceback of exceptions and return only the exception message and type
is_readable_file (fp, raise_exception=True, **kwargs)
Verify the readability of a file or list of files
is_gziped (fp, **kwargs)
Return True if the file is gzipped, else False
has_extension (fp, ext, pos=-1, raise_exception=False, **kwargs)
Test presence of extension in a file path
* ext
Single extension name or list of extension names without the dot. Example ["gz", "fa"]
* pos
Position of the extension in the file path. -1 for the last, -2 for the penultimate and so on [DEFAULT -1 = last position]
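A minimal sketch of the file-checking helpers, assuming pycltools is installed; the path is hypothetical:

from pycltools import pycltools
fp = "./data/sample.fastq.gz"                           # hypothetical file
pycltools.is_readable_file(fp)                          # raises an exception if the file is not readable
print(pycltools.is_gziped(fp))                          # True if the file is gzipped
print(pycltools.has_extension(fp, ext=["gz", "fa"]))    # True if the last extension is "gz" or "fa"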
file_basename (fp, **kwargs)
Return the basename of a file without folder location and extension
extensions (fp, comp_ext_list=['gz', 'tgz', 'zip', 'xz', 'bz2'], **kwargs)
Return the extension of a file in lower case. For archived files ("gz", "tgz", "zip", "xz", "bz2"),
the method will output the base extension + the archive extension as a string
extensions_list (fp, comp_ext_list=['gz', 'tgz', 'zip', 'xz', 'bz2'], **kwargs)
Return the extension of a file in lower case. For archived files ("gz", "tgz", "zip", "xz", "bz2"),
the method will output the base extension + the archive extension as a list
file_name (fp, **kwargs)
Return the complete name of a file with the extension but without folder location
dir_name (fp, **kwargs)
Return the name of the directory where the file is located
dir_path (fp, **kwargs)
Return the directory path of a file
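A short sketch of the path-parsing helpers on a hypothetical path; the comments restate what each call is documented to return:

from pycltools import pycltools
fp = "/home/user/data/sample.fastq.gz"      # hypothetical path
print(pycltools.file_basename(fp))          # basename without folder and extension
print(pycltools.extensions(fp))             # base + archive extension as a string
print(pycltools.extensions_list(fp))        # base + archive extension as a list
print(pycltools.file_name(fp))              # file name with extension, without folder
print(pycltools.dir_name(fp))               # name of the containing directory
print(pycltools.dir_path(fp))               # directory path of the file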
##~~~~~~~ STRING FORMATTING ~~~~~~~#
supersplit (string, separator='', **kwargs)
Like split, but can take a list of separators instead of a single separator
rm_blank (name, replace='', **kwargs)
Replace blank spaces in a name with a given character (default = remove)
Blanks at the extremities are always removed, not replaced
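A minimal sketch of the two string helpers, assuming pycltools is installed; the input strings are arbitrary examples:

from pycltools import pycltools
print(pycltools.supersplit("chr1;631539 631540|id1", separator=[";", " ", "|"]))
print(pycltools.rm_blank("  sample name 1 ", replace="_"))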
concatenate (src_list, dest, **kwargs)
Concatenate a list of src files into a single output file. Handles gzipped files (mixed input and output)
copyFile (src, dest, **kwargs)
Copy a single file to a destination file or folder (with error handling/reporting)
* src
Source file path
* dest
Path of the destination file or folder where the source file will be copied
gzip_file (fpin, fpout=None, **kwargs)
gzip a file
* fpin
Path of the input uncompressed file
* fpout
Path of the output compressed file (optional)
gunzip_file (fpin, fpout=None, **kwargs)
Gunzip a file
* fpin
Path of the input compressed file
* fpout
Path of the output uncompressed file (optional)
remove_file (fp, exception_if_exist=False)
Try to remove a file from disk.
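A sketch chaining the file-manipulation helpers, assuming pycltools is installed; all paths are hypothetical:

from pycltools import pycltools
pycltools.concatenate(src_list=["part1.txt", "part2.txt.gz"], dest="merged.txt.gz")  # mixed plain/gzipped input
pycltools.copyFile(src="merged.txt.gz", dest="./backup/")
pycltools.gunzip_file("merged.txt.gz", fpout="merged.txt")    # fpout is optional
pycltools.gzip_file("merged.txt")                             # fpout omitted (optional)
pycltools.remove_file("merged.txt")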
linerange (fp, range_list=[], line_numbering=True, max_char_line=150, **kwargs)
Print a range of lines in a file according to a list of (start, end) lists. Handles gzipped files
* fp
Path to the file to be parsed
* range_list
list of start, end coordinates lists or tuples
* line_numbering
If True the number of the line will be indicated in front of the line
* max_char_line
Maximal number of characters to print per line
cat (fp, max_lines=100, line_numbering=False, max_char_line=150, **kwargs)
Emulate the Linux cat command but with line cap protection. Handles gzipped files
* fp
Path to the file to be parsed
* max_lines
Maximal number of lines to print
* line_numbering
If True the number of the line will be indicated in front of the line
* max_char_line
Maximal number of characters to print per line
tail (fp, n=10, line_numbering=False, max_char_line=150, **kwargs)
Emulate the Linux tail command. Handles gzipped files
* fp
Path to the file to be parsed
* n
Number of lines to print starting from the end of the file
* line_numbering
If True the number of the line will be indicated in front of the line
* max_char_line
Maximal number of characters to print per line
head (fp, n=10, ignore_comment_line=False, comment_char='#', max_char_line=200, sep='\t', max_char_col=50, **kwargs)
Emulate the Linux head command. Handles gzipped files and bam files
* fp
Path to the file to be parsed. Works with text, gzipped and binary bam/sam files
* n
Number of lines to print starting from the beginning of the file (Default 10)
* ignore_comment_line
Skip initial lines starting with a specific character. Pointless for bam files (Default False)
* comment_char
Character or string for the ignore_comment_line argument (Default "#")
* max_char_line
Maximal number of characters to print per line (Default 200)
linesample (fp, n_lines=100, line_numbering=True, max_char_line=150, **kwargs)
Randomly sample lines in a file and print them. Handles gzipped files
* fp
Path to the file to be parsed
* n_lines
Number of lines to sample in the file
* line_numbering
If True the number of the line will be indicated in front of the line
* max_char_line
Maximal number of characters to print per line
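A sketch illustrating the file-inspection helpers above on a hypothetical gzipped annotation file:

from pycltools import pycltools
fp = "./data/annotations.gtf.gz"                             # hypothetical gzipped file
pycltools.head(fp, n=5, ignore_comment_line=True, comment_char="#")
pycltools.tail(fp, n=5, line_numbering=True)
pycltools.linerange(fp, range_list=[[0, 10], [500, 510]])    # two (start, end) ranges
pycltools.linesample(fp, n_lines=20)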
count_uniq (fp, colnum, select_values=None, drop_values=None, skip_comment='#', sep='\t', **kwargs)
Count unique occurrences in a specific column of a tabulated file
* fp
Path to the file to be parsed (gzipped or not)
* colnum
Index number of the column to summarize
* select_values
Select specific lines in the file based on a dictionary containing column index(es) and value(s) or list
of values to select. Example {2:["exon", "transcript"], 4:"lincRNA"}. DEFAULT=None
* drop_values
Same as select_values, but the matching lines will be dropped instead. DEFAULT=None
* skip_comment
Drop any comment lines starting with this character. DEFAULT="#"
* sep
Character or list of characters to use in order to split the lines. Example [" ",";"]. DEFAULT="\t"
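A sketch of a count_uniq call mirroring the documented arguments, on a hypothetical tabulated GTF file:

from pycltools import pycltools
pycltools.count_uniq("./data/annotations.gtf.gz", colnum=2,
                     select_values={2: ["exon", "transcript"]},   # keep only exon/transcript lines
                     drop_values={0: "chrM"},                     # drop mitochondrial lines
                     sep="\t")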
colsum (fp, colrange=None, separator='', header=False, ignore_hashtag_line=False, max_items=10, ret_type='md', **kwargs)
Create a summary of selected columns of a file
* fp
Path to the file to be parsed
* colrange
A list of column indexes to parse
* separator
A character or a list of characters to split the lines
* ignore_hashtag_line
Skip lines starting with a # symbol
* max_items
Maximum number of items per line
* ret_type
Possible return types:
md = markdown formatted table,
dict = raw parsing dict,
report = Indented_text_report
fastcount (fp, **kwargs)
Efficient way to count the number of lines in a file. Handles gzipped files
simplecount (fp, ignore_hashtag_line=False, **kwargs)
Simple way to count the number of lines in a file with more options
mkdir (fp, level=1, **kwargs)
Reproduce the behaviour of the UNIX "mkdir -p" command
(i.e. if the path already exists no exception will be raised).
Can create nested directories recursively
* fp
path name where the folder should be created
* level
level in the path where to start to create the directories. Used by the program for the recursive creation of
directories
dir_walk (fp)
Print a directory tree
make_cmd_str (prog_name, opt_dict={}, opt_list=[], **kwargs)
Create a Unix-like command line string from the program name, a dict of named arguments and a list of unnamed arguments.
Example: make_cmd_str("bwa", {"b":None, "t":6, "i":"../idx/seq.fa"}, ["../read1", "../read2"])
* prog_name
Name (if added to the system path) or path of the program
* opt_dict
Dictionary of option arguments such as "-t 5". The option flag has to be the key (without "-") and the
option value the dictionary value. If no value is requested after the option flag, None has to be assigned
to the value field.
* opt_list
List of simple command line arguments
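The documented example can be run as follows (the bwa options and paths are illustrative only):

from pycltools import pycltools
cmd = pycltools.make_cmd_str("bwa",
                             opt_dict={"b": None, "t": 6, "i": "../idx/seq.fa"},
                             opt_list=["../read1", "../read2"])
print(cmd)   # e.g. "bwa -b -t 6 -i ../idx/seq.fa ../read1 ../read2"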
bash_basic (cmd, virtualenv=None, **kwargs)
Send a basic bash command
* cmd
A command line string formatted as a string
* virtualenv
If specified, will try to load a virtualenvwrapper environment before running the command
bash (cmd, virtualenv=None, live='stdout', print_stdout=True, ret_stdout=False, log_stdout=None, print_stderr=True, ret_stderr=False, log_stderr=None, print_cmd=False, dry=False, **kwargs)
More advanced version of bash calling, with live printing of the standard output and the possibility to log or
redirect the output and error, either as returned strings or directly to files. If ret_stderr and ret_stdout are both True a
tuple will be returned, and if both are False None will be returned
* cmd
A command line string formatted as a string
* virtualenv
If specified, will try to load a virtualenvwrapper environment before running the command
* print_stdout
If True the standard output will be LIVE printed through the system standard output stream
* ret_stdout
If True the standard output will be returned as a string
* log_stdout
If a filename is given, the standard output will be logged in this file
* print_stderr
If True the standard error will be printed through the system standard error stream
* ret_stderr
If True the standard error will be returned as a string
* log_stderr
If a filename is given, the standard error will be logged in this file
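A sketch of a bash call that captures stdout as a string and logs stderr to a file, assuming pycltools is installed; the command and paths are hypothetical:

from pycltools import pycltools
stdout = pycltools.bash("samtools flagstat sample.bam",
                        ret_stdout=True, print_stdout=False,
                        log_stderr="./logs/flagstat.err")
print(stdout)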
bash_update (cmd, update_freq=1, **kwargs)
FOR JUPYTER NOTEBOOK
Run a bash command and print the output in the cell. The output is updated each time until the output is None.
This is suitable for monitoring tasks that log events until there is nothing else to print such as bjobs or bpeeks.
* cmd
A command line string formatted as a string
* update_freq
The frequency of output updating in seconds [DEFAULT: 1]
bsub (cmd=None, virtualenv=None, mem=None, threads=None, queue=None, wait_jobid=None, stdout_fp=None, stderr_fp=None, send_email=False, print_cmd=True, dry=False, **kwargs)
FOR JUPYTER NOTEBOOK IN LSF environment
Send an LSF bsub command through bash and return the JOBID
For more information read the bsub documentation
* cmd
A command line string formatted as a string
* virtualenv
If specified will try to load a virtualenvwrapper environment before runing the command
* mem
Memory to reserve (-M and -R 'rusage[mem=...]')
* threads
Number of threads to reserve (-n)
* queue
Name of the LSF queue to be used (-q)
* wait_jobid
jobid or list of jobids to wait for before executing this command (-w 'post_done(jobid)')
* stdout_fp
Path of the file where to write the standard output of the command (-oo)
* stderr_fp
Path of the file where to write the standard error of the command (-eo)
* send_email
If True, will force LSF to send an email even if stdout_fp and/or stderr_fp is given
bjobs ()
FOR JUPYTER NOTEBOOK IN LSF environment
Emulate LSF bjobs command. Return a DataFrame of jobs
bjobs_update (update_freq=5)
FOR JUPYTER NOTEBOOK IN LSF environment
Emulate LSF bjobs command but update the cell every x seconds
Cell is locked
bjobs_lock (jobid=None, update_freq=2, final_delay=2)
FOR JUPYTER NOTEBOOK IN LSF environment
Check if bjobs has running or pending jobs until all are done
* jobid
List of jobids to check
* update_freq
The frequency of output updating in seconds [DEFAULT: 2]
* final_delay
Final delay in seconds at the end of all jobs to prevent IO errors [DEFAULT: 2]
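A sketch of a typical LSF workflow chaining these helpers in a notebook; the job command, queue name and paths are hypothetical:

from pycltools import pycltools
jobid = pycltools.bsub("bwa mem ../idx/seq.fa ../read1 > aln.sam",
                       mem=8000, threads=4, queue="normal",
                       stdout_fp="./logs/aln.out", stderr_fp="./logs/aln.err")
pycltools.bjobs_lock(jobid=[jobid], update_freq=5)   # block until the job is done
pycltools.bjobs()                                    # DataFrame of current jobs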
##~~~~~~~ DICTIONARY FORMATTING ~~~~~~~#
dict_to_md (d, key_label='', value_label='', transpose=False, sort_by_key=False, sort_by_val=True, max_items=None, **kwargs)
Transform a dict into a markdown formatted table
dict_to_report (d, tab='\t', ntab=0, sep=':', sort_dict=True, max_items=None, **kwargs)
Recursive function to return a text report from nested dict or OrderedDict objects
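A sketch of the two dict formatters on small, made-up dictionaries:

from pycltools import pycltools
d = {"exon": 120, "intron": 45, "UTR": 12}
print(pycltools.dict_to_md(d, key_label="feature", value_label="count", sort_by_val=True))
nested = {"sample1": {"reads": 1000, "mapped": 950}, "sample2": {"reads": 800, "mapped": 720}}
print(pycltools.dict_to_report(nested, tab="  "))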
##~~~~~~~ TABLE FORMATTING ~~~~~~~#
reformat_table (input_file, output_file='', return_df=False, init_template=[], final_template=[], header='', keep_original_header=True, header_from_final_template=False, replace_internal_space='_', replace_null_val='*', subst_dict={}, filter_dict=[], predicate=None, standard_template=None, verbose=False, **kwargs)
Reformat a table given an initial and a final line templates indicated as a list where numbers
indicate the data column and strings the formatting characters
* input_file
A file with a structured text formatting (gzipped or not)
* output_file
A file path to output the reformatted table (if empty will not write in a file)
* return_df
If True, will return a pandas DataFrame containing the reformatted table (third-party pandas package required);
by default the columns will be named after the final template [DEFAULT:False]
* init_template
A list of indexes and separators describing the structure of the input file
Example initial line = "chr1 631539 631540 Squires|id1 0 +"
Initial template = [0," ",1," ",2," ",3,"|",4," ",5," ",6]
Alternatively, instead of the numbers, string indexes can be used, but they need to be enclosed in curly
brackets to differentiate them from the separators. This greatly simplifies the writing of the final template.
Example initial line = "chr1 631539 631540 Squires|id1 0 +"
Initial template = ["{chrom}"," ","{start}"," ","{end}","|","{name}"," ","{score}"," ","{strand}"]
* final_template
A list of indexes and separators describing the required structure of the output file. Name indexes need to
match indexes of the init_template and have to follow the same syntax [DEFAULT: same as init_template]
Example final line = "chr1 631539 631540 m5C|-|HeLa|22344696 - -"
Final template = [0," ",1," ",2," m5C|-|HeLa|22344696 - ",6]
* header
A string to write as a file header at the beginning of the file
* keep_original_header
If True the original header of the input file will be copied at the beginning of the output file [DEFAULT:True]
* header_from_final_template
Generate a header according to the name or number of the fields given in the final_template [DEFAULT:False]
* replace_internal_space
All internal blank space will be replaced by this character [DEFAULT:"_"]
* replace_null_val
Field with no value will be replaced by this character [DEFAULT:"*"]
* subst_dict
Nested dictionary of substitution per position to replace specific values by others [DEFAULT:None]
Example: { 0:{"chr1":"1","chr2":"2"}, 3:{"Squires":"5376774764","Li":"27664684"}}
* filter_dict
A dictionary of list per position to filter out lines with specific values [DEFAULT:None]
Example: { 0:["chr2", "chr4"], 1:["46767", "87765"], 5:["76559", "77543"]}
* predicate
A lambda predicate function for more advanced filtering operations [DEFAULT:None]
Example: lambda val_dict: abs(int(val_dict[1])-int(val_dict[2])) <= 2000
* standard_template
Existing standard template to parse the file instead of providing one manually. List of saved templates:
- "gff3_ens_gene" = Template for ensembl gff3 fields. Select only the genes lines and decompose to individual elements.
- "gff3_ens_transcript" = Template for ensembl gff3 fields. Select only the transcript lines and decompose to individual elements.
- "gtf_ens_gene" = Template for ensembl gft fields. Select only the genes lines and decompose to individual elements
* verbose
If True will print detailed information [DEFAULT:False]
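A sketch applying reformat_table with the initial and final templates from the example above; the input and output paths are hypothetical:

from pycltools import pycltools
pycltools.reformat_table(
    input_file="./data/sites.bed",
    output_file="./data/sites_reformatted.bed",
    init_template=[0, " ", 1, " ", 2, " ", 3, "|", 4, " ", 5, " ", 6],
    final_template=[0, " ", 1, " ", 2, " m5C|-|HeLa|22344696 - ", 6],
    replace_internal_space="_",
    replace_null_val="*")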
##~~~~~~~ WEB TOOLS ~~~~~~~#
url_exist (url, **kwargs)
Predicate verifying that a URL exists, without downloading the linked file
wget (url, out_name='', progress_block=100000000, **kwargs)
Download a file from a URL to local storage.
* url
An internet URL pointing to the file to download
* out_name
Name of the output file (optional)
* progress_block
Size of the byte block used to report the progress of the download
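A sketch combining the two web helpers; the URL and output path are placeholders:

from pycltools import pycltools
url = "http://example.com/annotations.gtf.gz"      # hypothetical URL
if pycltools.url_exist(url):
    pycltools.wget(url, out_name="./data/annotations.gtf.gz")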
##~~~~~~~ FUNCTIONS TOOLS ~~~~~~~#
print_arg (**kwargs)
Print the named and unnamed arguments of the calling function
##~~~~~~~ SSH TOOLS ~~~~~~~#
scp (hostname, local_file, remote_dir, username=None, rsa_private_key=None, ssh_config='~/.ssh/config', verbose=False, **kwargs)
Copy a file over ssh in a target remote directory
* hostname
Name of the host ssh server
* username
Name of the user
* rsa_private_key
Path to the RSA private key
* local_file
Path to the local file
* remote_dir
Path to the target remote directory
* ssh_config
Use as an alternative method instead of giving the username and rsa_private_key; they will be fetched from the config file directly
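A sketch of an scp call relying on an ~/.ssh/config entry rather than explicit credentials; the host name and paths are hypothetical:

from pycltools import pycltools
pycltools.scp(hostname="my-cluster",
              local_file="./results/summary.tsv",
              remote_dir="/home/user/results",
              ssh_config="~/.ssh/config")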
##~~~~~~~ PACKAGE TOOLS ~~~~~~~#
get_package_file (package, fp='', **kwargs)
Verify the existence of a file from the package data and return a file path
* package
Name of the package
* fp
Relative path to the file in the package. Usually package_name/data/file_name
If the path points to a directory, the directory tree will be printed
##~~~~~~~ SAM/BAM TOOLS ~~~~~~~#
bam_sample (fp_in, fp_out, n_reads, verbose=False, **kwargs)
Sample reads from a SAM/BAM file and write in a new file
* fp_in
Path to the input file in .bam/.sam/.cram (the format will be inferred from the extension)
* fp_out
Path to the output file in .bam/.sam/.cram (the format will be inferred from the extension)
* n_reads
Number of reads to sample
##~~~~~~~ DNA SEQUENCE TOOLS ~~~~~~~#
base_generator (bases=['A', 'T', 'C', 'G'], weights=[0.280788, 0.281691, 0.193973, 0.194773], **kwargs)
Generator returning DNA/RNA bases according to a probability weighting
* bases: list (default ["A","T","C","G"])
DNA RNA bases allowed
* weights: list (default [0.280788,0.281691,0.193973,0.194773])
Probability of each base to be returned. Should match the index of bases. The sum does not need to be equal to 1.
If the list is empty bases will be returned with a flat probability. The default values represent the frequency in the human
genome (excluding N).
make_sequence (bases=['A', 'T', 'C', 'G'], weights=[0.280788, 0.281691, 0.193973, 0.194773], length=1000, **kwargs)
Return a sequence of DNA/RNA bases according to a probability weighting
* bases: list (default ["A","T","C","G"])
DNA RNA bases allowed in the sequence
* weights: list (default [0.280788,0.281691,0.193973,0.194773])
Probability of each base to be returned. Should match the index of bases. The sum does not need to be equal to 1.
If the list is empty bases will be returned with a flat probability. The default values represent the frequency in the human
genome (excluding N).
* length: int (default 1000)
Length of the sequence to be returned
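A sketch of the two sequence helpers with their default base weights, assuming (as the description suggests) that base_generator() returns a Python generator:

from pycltools import pycltools
gen = pycltools.base_generator()                 # weighted generator of single bases
print([next(gen) for _ in range(10)])            # draw 10 bases
print(pycltools.make_sequence(length=50))        # 50-base random sequence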
Content source: a-slide/pycl