Parameters


In [391]:
CONFIG_LOCAL = False
#CONFIG_LOCAL = True

In [392]:
if CONFIG_LOCAL:
    config = {
        'data_folder'   : 'probes',
        'RUN'           : 'run_BWA_JKL_23_JU_1_orig',
        'chromosome'    : 'SL2.40ch12',
        'scaffold'      : 'SL2.40sc05611',
        'out_folder'    : 'reports',
        'out_extensions': ['eps', 'png']
    }

    if True:
        config['BAC'            ] = 'JBPP0904'
        config['BAC_coord_start'] = 967164
        config['BAC_coord_end'  ] = 970727
        config['BAC_coord'      ] = '%012d-%012d' % ( config['BAC_coord_start'], config['BAC_coord_end'  ] )
        config['RUN'            ] = 'run_BWA_JKL_23_JU_1_orig_PROBES'

In [393]:
if CONFIG_LOCAL:
    %run -i probes_cfg.ipynb

In [394]:
if CONFIG_LOCAL:
    %run -i probes_cfg_header.ipynb


Singularity Report

Run: run_BWA_JKL_23_JU_1_orig_PROBES

Chromosome: SL2.40ch12

Scaffold: SL2.40sc05611

BAC: JBPP0904

BAC coord start: 967164

BAC coord end: 970727

Config
	BAC            : JBPP0904
	BAC_coord      : 000000967164-000000970727
	BAC_coord_end  : 970727
	BAC_coord_start: 967164
	RUN            : run_BWA_JKL_23_JU_1_orig_PROBES
	chromosome     : SL2.40ch12
	data_folder    : probes
	out_extensions : ['eps', 'png', 'pdf']
	out_folder     : reports
	scaffold       : SL2.40sc05611
Max Rows     : 10000
Column Names
	K-mer Coverage
	Sequencing Coverage
	BLAST Coverage
	Ns
	AGP Contig
	AGP Gap
	K-mer Coverage averaged: 500 bp
	K-mer Coverage averaged: 5 Kbp
	K-mer Coverage averaged: 1 Mbp

Imports


In [395]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
import os
import operator

#import pylab
#pylab.show()

%pylab inline

pd.set_option('display.notebook_repr_html', True)
#pd.set_option('display.max_columns', 20)
#pd.set_option('display.max_rows', 25)

from IPython.display import HTML, display
def addHeader(level, text):
    display( HTML('''<h%(level)d>%(text)s</h%(level)d>''' % {'level':level, 'text':text}) )


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['axes', 'f', 'axis']
`%matplotlib` prevents importing * from pylab and numpy

Files

Input Files


In [396]:
addHeader(1,'Files')
addHeader(2,'Input Files')
"""
S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.sam.SL2.40sc06147.sam.cov
S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.sam.SL2.40sc06147.sam.cov.prop.cov
S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.sam.SL2.40sc06147.sam.pos
"""
    
KmerCoverageFile       = "%(data_folder)s/%(RUN)s/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.%(chromosome)s.sam.%(scaffold)s.sam.cov.prop.cov.gz" % config
SequencingCoverageFile = "%(data_folder)s/mapping/out/S_lycopersicum_chromosomes.fa_S_lycopersicum_chromosomes.pos.%(chromosome)s.pos.cov.%(scaffold)s.cov.gz"    % config
AgpContigFile          = "%(data_folder)s/agp/S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.contig.agp.cov.gz"  % config
AgpGapFile             = "%(data_folder)s/agp/S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.gap.agp.cov.gz"     % config
AgpOtherFile           = "%(data_folder)s/agp/S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.other.agp.cov.gz"   % config
AgpUnknownFile         = "%(data_folder)s/agp/S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.unknown.agp.cov.gz" % config
NsFile                 = "%(data_folder)s/Ns/S_lycopersicum_scaffolds.fa_NONE.tab.%(chromosome)s.tab.%(scaffold)s.tab.cov.gz"     % config
infiles                = [KmerCoverageFile, SequencingCoverageFile, 
                          AgpContigFile, AgpGapFile, 
                          AgpUnknownFile, AgpOtherFile, NsFile]

if BAC_MODE:
    """

                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_chromosomes.fa_S_lycopersicum_chromosomes.pos.SL2.40ch12.pos.cov.SL2.40sc05611.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.sam.SL2.40sc05611.sam.cov.prop.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727_Product.fasta.blast.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds_from_contigs.2.40.agp.SL2.40sc05611.agp.contig.agp.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds_from_contigs.2.40.agp.SL2.40sc05611.agp.gap.agp.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds_from_contigs.2.40.agp.SL2.40sc05611.agp.other.agp.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds_from_contigs.2.40.agp.SL2.40sc05611.agp.unknown.agp.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds.fa_NONE.tab.SL2.40ch12.tab.SL2.40sc05611.tab.cov
    """
                                                      
    config['in_base_name'] = "%(data_folder)s/%(RUN)s/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.%(chromosome)s.%(scaffold)s.%(BAC)s_primer.%(BAC_coord)s" % config
    KmerCoverageFile       =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.%(chromosome)s.sam.%(scaffold)s.sam.cov.prop.cov.gz" % config
    SequencingCoverageFile =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_chromosomes.fa_S_lycopersicum_chromosomes.pos.%(chromosome)s.pos.cov.%(scaffold)s.cov.gz"        % config
    BlastCoverageFile      =                                                                                                                                      "%(in_base_name)s_Product.fasta.blast.cov.gz"                                                         % config
    AgpContigFile          =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.contig.agp.cov.gz"  % config
    AgpGapFile             =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.gap.agp.cov.gz"     % config
    AgpOtherFile           =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.other.agp.cov.gz"   % config
    AgpUnknownFile         =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.unknown.agp.cov.gz" % config
    NsFile                 =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_scaffolds.fa_NONE.tab.%(chromosome)s.tab.%(scaffold)s.tab.cov.gz"    % config
    
    infiles                = [KmerCoverageFile , SequencingCoverageFile, 
                              BlastCoverageFile, AgpContigFile, 
                              AgpGapFile       , AgpUnknownFile, 
                              AgpOtherFile     , NsFile]


print "%-22s: %-5s %s" % ( "KmerCoverageFile"      , os.path.exists(KmerCoverageFile)      , KmerCoverageFile       )
print "%-22s: %-5s %s" % ( "SequencingCoverageFile", os.path.exists(SequencingCoverageFile), SequencingCoverageFile )
print "%-22s: %-5s %s" % ( "AgpContigFile"         , os.path.exists(AgpContigFile)         , AgpContigFile          )
print "%-22s: %-5s %s" % ( "AgpGapFile"            , os.path.exists(AgpGapFile)            , AgpGapFile             )
print "%-22s: %-5s %s" % ( "AgpOtherFile"          , os.path.exists(AgpOtherFile)          , AgpOtherFile           )
print "%-22s: %-5s %s" % ( "AgpUnknownFile"        , os.path.exists(AgpUnknownFile)        , AgpUnknownFile         )
print "%-22s: %-5s %s" % ( "NsFile"                , os.path.exists(NsFile)                , NsFile                 )



if BAC_MODE:
    print "%-22s: %-5s %s" % ( "BlastCoverageFile" , os.path.exists(BlastCoverageFile)     , BlastCoverageFile      )

print



if not all([os.path.exists(x) for x in infiles]):
    print "missing file"

else:
    print "all files present"


Files

Input Files

KmerCoverageFile      : True  probes/run_BWA_JKL_23_JU_1_orig_PROBES/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.sam.SL2.40sc05611.sam.cov.prop.cov.gz
SequencingCoverageFile: True  probes/run_BWA_JKL_23_JU_1_orig_PROBES/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_chromosomes.fa_S_lycopersicum_chromosomes.pos.SL2.40ch12.pos.cov.SL2.40sc05611.cov.gz
AgpContigFile         : True  probes/run_BWA_JKL_23_JU_1_orig_PROBES/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds_from_contigs.2.40.agp.SL2.40sc05611.agp.contig.agp.cov.gz
AgpGapFile            : True  probes/run_BWA_JKL_23_JU_1_orig_PROBES/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds_from_contigs.2.40.agp.SL2.40sc05611.agp.gap.agp.cov.gz
AgpOtherFile          : True  probes/run_BWA_JKL_23_JU_1_orig_PROBES/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds_from_contigs.2.40.agp.SL2.40sc05611.agp.other.agp.cov.gz
AgpUnknownFile        : True  probes/run_BWA_JKL_23_JU_1_orig_PROBES/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds_from_contigs.2.40.agp.SL2.40sc05611.agp.unknown.agp.cov.gz
NsFile                : True  probes/run_BWA_JKL_23_JU_1_orig_PROBES/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds.fa_NONE.tab.SL2.40ch12.tab.SL2.40sc05611.tab.cov.gz
BlastCoverageFile     : True  probes/run_BWA_JKL_23_JU_1_orig_PROBES/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727_Product.fasta.blast.cov.gz

all files present

Output Files


In [397]:
addHeader(2,'Output Files')

config['out_bn'        ] = "%(out_folder)s/%%s/%(RUN)s_%(chromosome)s_%(scaffold)s_prop" % config
config['out_bn_img'    ] = "%(out_bn)s.%%s.%%s"                                            % config

if BAC_MODE:
    config['out_bn'       ] = "%(out_folder)s/%%s/%(RUN)s_%(chromosome)s_%(scaffold)s_%(BAC)s_%(BAC_coord)s" % config
    config['out_bn_img'   ] = "%(out_bn)s.%%s.%%s"                                                             % config

    
output_files = {
    'all_data'                        : [ config['out_bn_img']  % (ext, 'raw_data'                        , 'csv'     ) for ext in config['out_extensions'] ],
    'all_data_full'                   : [ config['out_bn_img']  % (ext, 'K-raw_data'                      , 'full.csv') for ext in config['out_extensions'] ],
    'K-mer Coverage Stats'            : [ config['out_bn_img']  % (ext, 'K-mer_Coverage_Stats'            , ext       ) for ext in config['out_extensions'] ],
    'Sequencing Coverage Stats'       : [ config['out_bn_img']  % (ext, 'Sequencing_Coverage_Stats'       , ext       ) for ext in config['out_extensions'] ],
    'K-mer Coverage Distribution'     : [ config['out_bn_img']  % (ext, 'K-mer_Coverage_Distribution'     , ext       ) for ext in config['out_extensions'] ],
    'Sequencing Coverage Distribution': [ config['out_bn_img']  % (ext, 'Sequencing_Coverage_Distribution', ext       ) for ext in config['out_extensions'] ],
    'Gaps Distribution'               : [ config['out_bn_img']  % (ext, 'Gaps_Distribution'               , ext       ) for ext in config['out_extensions'] ],
    'Ns Distribution'                 : [ config['out_bn_img']  % (ext, 'Ns_Distribution'                 , ext       ) for ext in config['out_extensions'] ],
    'Combined graph'                  : [ config['out_bn_img']  % (ext, 'Combined_graph'                  , ext       ) for ext in config['out_extensions'] ]
}


if BAC_MODE:
    output_files['BLAST Coverage Stats'       ] = [ config['out_bn_img'] % (ext, 'BLAST_Coverage_Stats'       , ext) for ext in config['out_extensions'] ]
    output_files['BLAST Coverage Distribution'] = [ config['out_bn_img'] % (ext, 'BLAST_Coverage_Distribution', ext) for ext in config['out_extensions'] ]

    
for of in sorted(output_files.keys()):
    print "%-32s:\n - %s" % ( of, "\n - ".join( output_files[of] ) )


Output Files

BLAST Coverage Distribution     :
 - reports/eps/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.BLAST_Coverage_Distribution.eps
 - reports/png/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.BLAST_Coverage_Distribution.png
 - reports/pdf/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.BLAST_Coverage_Distribution.pdf
BLAST Coverage Stats            :
 - reports/eps/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.BLAST_Coverage_Stats.eps
 - reports/png/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.BLAST_Coverage_Stats.png
 - reports/pdf/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.BLAST_Coverage_Stats.pdf
Combined graph                  :
 - reports/eps/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Combined_graph.eps
 - reports/png/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Combined_graph.png
 - reports/pdf/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Combined_graph.pdf
Gaps Distribution               :
 - reports/eps/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Gaps_Distribution.eps
 - reports/png/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Gaps_Distribution.png
 - reports/pdf/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Gaps_Distribution.pdf
K-mer Coverage Distribution     :
 - reports/eps/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.K-mer_Coverage_Distribution.eps
 - reports/png/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.K-mer_Coverage_Distribution.png
 - reports/pdf/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.K-mer_Coverage_Distribution.pdf
K-mer Coverage Stats            :
 - reports/eps/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.K-mer_Coverage_Stats.eps
 - reports/png/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.K-mer_Coverage_Stats.png
 - reports/pdf/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.K-mer_Coverage_Stats.pdf
Ns Distribution                 :
 - reports/eps/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Ns_Distribution.eps
 - reports/png/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Ns_Distribution.png
 - reports/pdf/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Ns_Distribution.pdf
Sequencing Coverage Distribution:
 - reports/eps/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Sequencing_Coverage_Distribution.eps
 - reports/png/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Sequencing_Coverage_Distribution.png
 - reports/pdf/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Sequencing_Coverage_Distribution.pdf
Sequencing Coverage Stats       :
 - reports/eps/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Sequencing_Coverage_Stats.eps
 - reports/png/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Sequencing_Coverage_Stats.png
 - reports/pdf/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Sequencing_Coverage_Stats.pdf
all_data                        :
 - reports/eps/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.raw_data.csv
 - reports/png/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.raw_data.csv
 - reports/pdf/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.raw_data.csv
all_data_full                   :
 - reports/eps/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.K-raw_data.full.csv
 - reports/png/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.K-raw_data.full.csv
 - reports/pdf/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.K-raw_data.full.csv

In [398]:
class size_controller(object):
    def __init__(self, w, h):
        self.w = w
        self.h = h
        
    def __enter__(self):
        self.o = rcParams['figure.figsize']
        rcParams['figure.figsize'] = self.w, self.h
        return None
    
    def __exit__(self, type, value, traceback):
        rcParams['figure.figsize'] = self.o
        
col_type_int  = np.int64
col_type_flo  = np.float64
col_type_str  = np.object
col_type_bol  = np.int8

Read Files

K-mer Coverage File


In [399]:
addHeader(1,'Read Files')
addHeader(2,'K-mer Coverage File')

col_info = [
    [ "Position"                             , col_type_int ], 
    [ "K-mer Coverage"                       , col_type_flo ], 
    [ "K-mer Coverage averaged: 500 bp"      , col_type_flo ], 
    [ "K-mer Coverage averaged: 2.5 Kbp"     , col_type_flo ], 
    [ "K-mer Coverage averaged: 5 Kbp"       , col_type_flo ], 
    [ "K-mer Coverage averaged: 50 Kbp"      , col_type_flo ], 
    [ "K-mer Coverage averaged: 1 Mbp"       , col_type_flo ], 
    [ "K-mer Coverage averaged: 5 Kbp before", col_type_flo ], 
    [ "K-mer Coverage averaged: 5 Kbp after" , col_type_flo ]
]

col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))

if PARSE_VERBOSE:
    print "\n".join( col_names )


Read Files

K-mer Coverage File


In [400]:
SKIP_ROWS = 1

print KmerCoverageFile

KmerData  = pd.read_csv(KmerCoverageFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#", compression="gzip")
    
print "Loaded %d rows and %d columns" % ( KmerData.shape[0], KmerData.shape[1] )

if PARSE_VERBOSE:
    print KmerData.head()


probes/run_BWA_JKL_23_JU_1_orig_PROBES/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.sam.SL2.40sc05611.sam.cov.prop.cov.gz
Loaded 3564 rows and 9 columns

Sequencing Coverage File


In [401]:
addHeader(2,'Sequencing Coverage File')

col_info = [
    [ "Position"                             , col_type_int ], 
    [ "Sequencing Coverage"                  , col_type_int ]
]

col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))

if PARSE_VERBOSE:
    print col_names
    col_types


Sequencing Coverage File


In [402]:
SKIP_ROWS = 0

print SequencingCoverageFile

SequencingCoverageData  = pd.read_csv(SequencingCoverageFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#", compression="gzip")

print "Loaded %d rows and %d columns" % ( SequencingCoverageData.shape[0], SequencingCoverageData.shape[1] )

if PARSE_VERBOSE:
    print SequencingCoverageData.head()


probes/run_BWA_JKL_23_JU_1_orig_PROBES/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_chromosomes.fa_S_lycopersicum_chromosomes.pos.SL2.40ch12.pos.cov.SL2.40sc05611.cov.gz
Loaded 3564 rows and 2 columns

In [403]:
if BAC_MODE:
    addHeader(2,'BLAST')

    col_info = [
        [ "Position"                             , col_type_int ], 
        [ "BLAST Coverage"                       , col_type_flo ]
    ]

    col_names=[cf[0] for cf in col_info]
    col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))

    if PARSE_VERBOSE:
        print col_names
        col_types


BLAST


In [404]:
if BAC_MODE:
    SKIP_ROWS = 0
    
    print BlastCoverageFile
    
    BlastCoverageData  = pd.read_csv(BlastCoverageFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                            skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#", compression="gzip")
        
    print "Loaded %d rows and %d columns" % ( BlastCoverageData.shape[0], BlastCoverageData.shape[1] )
    
    if PARSE_VERBOSE:
        print BlastCoverageData.head()


probes/run_BWA_JKL_23_JU_1_orig_PROBES/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727_Product.fasta.blast.cov.gz
Loaded 3564 rows and 2 columns

AGP


In [405]:
addHeader(2,'AGP')
SKIP_ROWS = 0


AGP

Contig


In [406]:
addHeader(3,'Contig')

col_info = [
    [ "Position"                    , col_type_int ], 
    [ "AGP Contig"                  , col_type_bol ]
]

col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))

if PARSE_VERBOSE:
    print col_names
    col_types


Contig


In [407]:
print AgpContigFile

AgpContigData  = pd.read_csv(AgpContigFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#", compression="gzip")

print "Loaded %d rows and %d columns" % ( AgpContigData.shape[0], AgpContigData.shape[1] )

if PARSE_VERBOSE:
    print AgpContigData.head()


probes/run_BWA_JKL_23_JU_1_orig_PROBES/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds_from_contigs.2.40.agp.SL2.40sc05611.agp.contig.agp.cov.gz
Loaded 3564 rows and 2 columns

Gap


In [408]:
addHeader(3,'Gap')

col_info = [
    [ "Position"                    , col_type_int ], 
    [ "AGP Gap"                     , col_type_bol ]
]

col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))

if PARSE_VERBOSE:
    print col_names
    col_types


Gap


In [409]:
print AgpGapFile

AgpGapData  = pd.read_csv(AgpGapFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#", compression="gzip")
    
print "Loaded %d rows and %d columns" % ( AgpGapData.shape[0], AgpGapData.shape[1] )

if PARSE_VERBOSE:
    print AgpGapData.head()


probes/run_BWA_JKL_23_JU_1_orig_PROBES/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds_from_contigs.2.40.agp.SL2.40sc05611.agp.gap.agp.cov.gz
Loaded 3564 rows and 2 columns

Unknown


In [410]:
addHeader(3,'Unknown')

col_info = [
    [ "Position"                    , col_type_int ], 
    [ "AGP Unknown"                 , col_type_bol ]
]

col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))

if PARSE_VERBOSE:
    print col_names
    col_types


Unknown


In [411]:
print AgpUnknownFile

AgpUnknownData  = pd.read_csv(AgpUnknownFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#", compression="gzip")
    
print "Loaded %d rows and %d columns" % ( AgpUnknownData.shape[0], AgpUnknownData.shape[1] )

if PARSE_VERBOSE:
    print AgpUnknownData.head()


probes/run_BWA_JKL_23_JU_1_orig_PROBES/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds_from_contigs.2.40.agp.SL2.40sc05611.agp.unknown.agp.cov.gz
Loaded 3564 rows and 2 columns

Other


In [412]:
addHeader(3,'Other')

col_info = [
    [ "Position"                    , col_type_int ], 
    [ "AGP Other"                   , col_type_bol ]
]

col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))

if PARSE_VERBOSE:
    print col_names
    col_types


Other


In [413]:
print AgpOtherFile

AgpOtherData  = pd.read_csv(AgpOtherFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#", compression="gzip")

print "Loaded %d rows and %d columns" % ( AgpOtherData.shape[0], AgpOtherData.shape[1] )

if PARSE_VERBOSE:
    print AgpOtherData.head()


probes/run_BWA_JKL_23_JU_1_orig_PROBES/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds_from_contigs.2.40.agp.SL2.40sc05611.agp.other.agp.cov.gz
Loaded 3564 rows and 2 columns

Ns


In [414]:
addHeader(2,'Ns')

col_info = [
    [ "Position"                    , col_type_int ], 
    [ "Ns"                          , col_type_bol ]
]

col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))

if PARSE_VERBOSE:
    print col_names
    col_types


Ns


In [415]:
print NsFile

NsData  = pd.read_csv(NsFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#", compression="gzip")
    
print "Loaded %d rows and %d columns" % ( NsData.shape[0], NsData.shape[1] )

if PARSE_VERBOSE:
    print NsData.head()


probes/run_BWA_JKL_23_JU_1_orig_PROBES/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds.fa_NONE.tab.SL2.40ch12.tab.SL2.40sc05611.tab.cov.gz
Loaded 3564 rows and 2 columns

Merge


In [416]:
addHeader(1,'Merge')

Data = pd.DataFrame(KmerData, copy=True)

Data.combine_first( KmerData )
Data[ "AGP Contig"          ] = AgpContigData[          "AGP Contig"          ]
Data[ "AGP Gap"             ] = AgpGapData[             "AGP Gap"             ]
Data[ "AGP Unknown"         ] = AgpUnknownData[         "AGP Unknown"         ]
Data[ "AGP Other"           ] = AgpOtherData[           "AGP Other"           ]
Data[ "Ns"                  ] = NsData[                 "Ns"                  ]
Data[ "Sequencing Coverage" ] = SequencingCoverageData[ "Sequencing Coverage" ]
if BAC_MODE:
    Data[ "BLAST Coverage"  ] = BlastCoverageData[      "BLAST Coverage"      ]

print "Saved %d rows and %d columns" % ( Data.shape[0], Data.shape[1] )

if PARSE_VERBOSE:
    print Data.head()


Merge

Saved 3564 rows and 16 columns

Stats


In [417]:
addHeader(1,'Stats')
addHeader(2,'Describe')

print Data.describe()


Stats

Describe

          Position  K-mer Coverage  K-mer Coverage averaged: 500 bp  \
count  3564.000000     3564.000000                      3564.000000   
mean   1781.500000        0.987118                         0.979846   
std    1028.982507        0.042654                         0.023863   
min       0.000000        0.652174                         0.868090   
25%     890.750000        1.000000                         0.976048   
50%    1781.500000        1.000000                         0.988024   
75%    2672.250000        1.000000                         0.994012   
max    3563.000000        1.000000                         0.998004   

       K-mer Coverage averaged: 2.5 Kbp  K-mer Coverage averaged: 5 Kbp  \
count                       3564.000000                     3564.000000   
mean                           0.972331                        0.950346   
std                            0.015624                        0.020099   
min                            0.921127                        0.900498   
25%                            0.960016                        0.937013   
50%                            0.977739                        0.952210   
75%                            0.987205                        0.969478   
max                            0.990404                        0.976005   

       K-mer Coverage averaged: 50 Kbp  K-mer Coverage averaged: 1 Mbp  \
count                      3564.000000                     3564.000000   
mean                          0.644977                        0.759933   
std                           0.015788                        0.000203   
min                           0.622268                        0.759625   
25%                           0.629966                        0.759754   
50%                           0.645013                        0.759918   
75%                           0.659184                        0.760043   
max                           0.671427                        0.760458   

       K-mer Coverage averaged: 5 Kbp before  \
count                            3564.000000   
mean                                0.862887   
std                                 0.052024   
min                                 0.796571   
25%                                 0.808638   
50%                                 0.883462   
75%                                 0.913017   
max                                 0.950810   

       K-mer Coverage averaged: 5 Kbp after  AGP Contig  AGP Gap  AGP Unknown  \
count                           3564.000000        3564     3564         3564   
mean                               0.932908           1        0            0   
std                                0.018178           0        0            0   
min                                0.917816           1        0            0   
25%                                0.921216           1        0            0   
50%                                0.922416           1        0            0   
75%                                0.948210           1        0            0   
max                                0.976005           1        0            0   

       AGP Other    Ns  Sequencing Coverage  BLAST Coverage  
count       3564  3564          3564.000000     3564.000000  
mean           0     0           133.513187        1.024270  
std            0     0            39.100518        0.162556  
min            0     0            56.000000       -1.000000  
25%            0     0           102.000000        1.000000  
50%            0     0           132.000000        1.000000  
75%            0     0           159.000000        1.000000  
max            0     0           299.000000        2.000000  

Quantiles


In [418]:
addHeader(2,'Quantiles')

print Data['K-mer Coverage'].quantile([.01, .025, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .975, .99])

addHeader(2,'Percentiles')
print "count     ", Data['K-mer Coverage'].count()
print "count == 1", Data['K-mer Coverage'][ Data['K-mer Coverage'] == 1].count()
print "prop  == 1", Data['K-mer Coverage'][ Data['K-mer Coverage'] == 1].count() * 1.0 / Data['K-mer Coverage'].count()


Quantiles

0.010    0.782609
0.025    0.869565
0.050    0.913043
0.100    0.956522
0.200    1.000000
0.300    1.000000
0.400    1.000000
0.500    1.000000
0.600    1.000000
0.700    1.000000
0.800    1.000000
0.900    1.000000
0.950    1.000000
0.975    1.000000
0.990    1.000000
dtype: float64

Percentiles

count      3564
count == 1 3033
prop  == 1 0.85101010101

In [419]:
addHeader(2,'Median')

print Data.median()


Median

Position                                 1781.500000
K-mer Coverage                              1.000000
K-mer Coverage averaged: 500 bp             0.988024
K-mer Coverage averaged: 2.5 Kbp            0.977739
K-mer Coverage averaged: 5 Kbp              0.952210
K-mer Coverage averaged: 50 Kbp             0.645013
K-mer Coverage averaged: 1 Mbp              0.759918
K-mer Coverage averaged: 5 Kbp before       0.883462
K-mer Coverage averaged: 5 Kbp after        0.922416
AGP Contig                                  1.000000
AGP Gap                                     0.000000
AGP Unknown                                 0.000000
AGP Other                                   0.000000
Ns                                          0.000000
Sequencing Coverage                       132.000000
BLAST Coverage                              1.000000
dtype: float64

In [420]:
addHeader(2,'MAD')
print Data.mad()


MAD

Position                                 891.000000
K-mer Coverage                             0.021926
K-mer Coverage averaged: 500 bp            0.014734
K-mer Coverage averaged: 2.5 Kbp           0.013925
K-mer Coverage averaged: 5 Kbp             0.016092
K-mer Coverage averaged: 50 Kbp            0.013832
K-mer Coverage averaged: 1 Mbp             0.000161
K-mer Coverage averaged: 5 Kbp before      0.048236
K-mer Coverage averaged: 5 Kbp after       0.015645
AGP Contig                                 0.000000
AGP Gap                                    0.000000
AGP Unknown                                0.000000
AGP Other                                  0.000000
Ns                                         0.000000
Sequencing Coverage                       30.920078
BLAST Coverage                             0.050039
dtype: float64

Plot

K-mer Coverage Stats


In [421]:
addHeader(1,'Plot')
addHeader(2,'K-mer Coverage Stats')

with size_controller(FULL_FIG_W, FULL_FIG_H):
    bqc = Data.boxplot(column=["K-mer Coverage", 
                               "K-mer Coverage averaged: 500 bp",
                               "K-mer Coverage averaged: 2.5 Kbp",
                               "K-mer Coverage averaged: 5 Kbp",
                               "K-mer Coverage averaged: 50 Kbp",
                               "K-mer Coverage averaged: 1 Mbp",
                               "K-mer Coverage averaged: 5 Kbp before",
                               "K-mer Coverage averaged: 5 Kbp after"], return_type='dict', rot=45)
    
    plt.setp(bqc['boxes'   ], color='black')
    plt.setp(bqc['medians' ], color='black')
    plt.setp(bqc['whiskers'], color='black')
    plt.setp(bqc['fliers'  ], color='black')
    
    for of in output_files['K-mer Coverage Stats']:
        print "Saving Image:", of
        savefig(of)


Plot

K-mer Coverage Stats

Saving Image: reports/eps/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.K-mer_Coverage_Stats.eps
Saving Image: reports/png/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.K-mer_Coverage_Stats.png
Saving Image: reports/pdf/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.K-mer_Coverage_Stats.pdf

Sequencing Coverage Stats


In [422]:
addHeader(2,'Sequencing Coverage Stats')

with size_controller(FULL_FIG_W, FULL_FIG_H):
    bqc = Data.boxplot(column=['Sequencing Coverage'], return_type='dict')
    
    plt.setp(bqc['boxes'   ], color='black')
    plt.setp(bqc['medians' ], color='black')
    plt.setp(bqc['whiskers'], color='black')
    plt.setp(bqc['fliers'  ], color='black')
    
    for of in output_files['Sequencing Coverage Stats']:
        print "Saving Image:", of
        savefig(of)


Sequencing Coverage Stats

Saving Image: reports/eps/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Sequencing_Coverage_Stats.eps
Saving Image: reports/png/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Sequencing_Coverage_Stats.png
Saving Image: reports/pdf/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Sequencing_Coverage_Stats.pdf

In [423]:
if BAC_MODE:
    addHeader(2,'BLAST Coverage Stats')
    
    with size_controller(FULL_FIG_W, FULL_FIG_H):
        bqc = Data.boxplot(column=['BLAST Coverage'], return_type='dict')
        plt.setp(bqc['boxes'   ], color='black')
        plt.setp(bqc['medians' ], color='black')
        plt.setp(bqc['whiskers'], color='black')
        plt.setp(bqc['fliers'  ], color='black')

        for of in output_files['BLAST Coverage Stats']:
            print "Saving Image:", of
            savefig(of)


BLAST Coverage Stats

Saving Image: reports/eps/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.BLAST_Coverage_Stats.eps
Saving Image: reports/png/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.BLAST_Coverage_Stats.png
Saving Image: reports/pdf/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.BLAST_Coverage_Stats.pdf

K-mer Coverage Distribution


In [424]:
addHeader(2,'K-mer Coverage Distribution')

with size_controller(FULL_FIG_W, FULL_FIG_H):
    hf = Data[ Data["K-mer Coverage"] > 0 ]['Position']
    
    print "Number of rows:", hf.size

    if hf.size > 0:
        hs = hf.hist(color=HISTOGRAM_COLOR)

        for of in output_files['K-mer Coverage Distribution']:
            print "Saving Image:", of
            savefig(of)


K-mer Coverage Distribution

Number of rows: 3564
Saving Image: reports/eps/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.K-mer_Coverage_Distribution.eps
Saving Image: reports/png/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.K-mer_Coverage_Distribution.png
Saving Image: reports/pdf/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.K-mer_Coverage_Distribution.pdf

Sequencing Coverage Distribution


In [425]:
addHeader(2,'Sequencing Coverage Distribution')

with size_controller(FULL_FIG_W, FULL_FIG_H):
    hf = Data[ Data["Sequencing Coverage"] > 0 ]['Position']
    
    print "Number of rows:", hf.size

    if hf.size > 0:
        hs = hf.hist(color=HISTOGRAM_COLOR)

        for of in output_files['Sequencing Coverage Distribution']:
            print "Saving Image:", of
            savefig(of)


Sequencing Coverage Distribution

Number of rows: 3564
Saving Image: reports/eps/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Sequencing_Coverage_Distribution.eps
Saving Image: reports/png/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Sequencing_Coverage_Distribution.png
Saving Image: reports/pdf/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Sequencing_Coverage_Distribution.pdf

In [426]:
if BAC_MODE:
    addHeader(2,'BLAST Coverage Distribution')
    
    with size_controller(FULL_FIG_W, FULL_FIG_H):
        hf = Data[ Data["BLAST Coverage"] > 0 ]['Position']
    
        print "Number of rows:", hf.size

        if hf.size > 0:
            hs = hf.hist(color=HISTOGRAM_COLOR)
            
            for of in output_files['BLAST Coverage Distribution']:
                print "Saving Image:", of
                savefig(of)


BLAST Coverage Distribution

Number of rows: 3560
Saving Image: reports/eps/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.BLAST_Coverage_Distribution.eps
Saving Image: reports/png/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.BLAST_Coverage_Distribution.png
Saving Image: reports/pdf/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.BLAST_Coverage_Distribution.pdf

Gaps Distribution


In [427]:
addHeader(2,'Gaps Distribution')

with size_controller(FULL_FIG_W, FULL_FIG_H):
    hf = Data[ Data["AGP Gap"] > 0 ]['Position']
    
    print "Number of rows:", hf.size

    if hf.size > 0:
        hs = hf.hist(color=HISTOGRAM_COLOR)
        
        for of in output_files['Gaps Distribution']:
            print "Saving Image:", of
            savefig(of)


Gaps Distribution

Number of rows: 0

Ns Distribution


In [428]:
addHeader(2,'Ns Distribution')

with size_controller(FULL_FIG_W, FULL_FIG_H):
    hf = Data[ Data["Ns"] > 0 ]['Position']
    
    print "Number of rows:", hf.size

    if hf.size > 0:
        hs = hf.hist(color=HISTOGRAM_COLOR)
        
        for of in output_files['Ns Distribution']:
            print "Saving Image:", of
            savefig(of)


Ns Distribution

Number of rows: 0

CSV Output


In [429]:
addHeader(2,'CSV Output')

SAMPLE_EVERY = 1

while ( Data.shape[0] / SAMPLE_EVERY ) > MAX_ROWS:
    SAMPLE_EVERY += 1
    
print "Original Size %d rows and %d columns" % ( Data.shape[0], Data.shape[1] )

if SAMPLE_EVERY != 1:
    print "SAMPLING EVERY %d ROWS" % SAMPLE_EVERY
    
    for of in output_files['all_data_full']:
        print "Saving full data to: ", of
        Data.to_csv(of, sep='\t', index=False)
    
    DataSampled = Data[::SAMPLE_EVERY]

    print "New Size %d rows and %d columns" % ( DataSampled.shape[0], DataSampled.shape[1] )
    
    if PARSE_VERBOSE:
        print DataSampled.head()

else:
    print "no need to sample"
    DataSampled = Data


for of in output_files['all_data']:
    print "Saving data to     :", of
    DataSampled.to_csv(of, sep='\t', index=False)


CSV Output

Original Size 3564 rows and 16 columns
no need to sample
Saving data to     : reports/eps/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.raw_data.csv
Saving data to     : reports/png/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.raw_data.csv
Saving data to     : reports/pdf/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.raw_data.csv

Combined Graph


In [436]:
addHeader(2,'Combined Graph')

num_cols = sum([1 if x is not None else 0 for x in cols_to_plot])
#print "num_cols:", num_cols, " :: ", ", ".join(sorted(list(set([x[0] if x is not None else None for x in cols_to_plot])-set([None]))))
#print "\n".join( [ str( (x, y[0]) ) if y is not None else str( (x, "None") ) for x,y in enumerate(cols_to_plot) ] )

with size_controller(CHROM_FIG_W, CHROM_FIG_H):
    if True:
        #fig, axes = plt.subplots(nrows=len(cols_to_plot)+2, ncols=1)
        f = plt.figure()
        
        plt.subplots_adjust(wspace=0.5, hspace=0.5)

        num_extra_rows = 3
        num_row_span   = 3
        
        num_cols_e     = num_cols + (num_extra_rows*num_row_span)
        axes           = []

        axes.append( [ plt.subplot2grid((num_cols_e,1), (0           , 0), rowspan=num_row_span) ] + cols_to_plot[0] )
        axes.append( [ plt.subplot2grid((num_cols_e,1), (num_row_span, 0), rowspan=num_row_span) ] + cols_to_plot[1] )
        
        #print "num_extra_rows", num_extra_rows
        #print "num_row_span  ", num_row_span
        #print "num_cols_e    ", num_cols_e
        
        if BAC_MODE:
            axes.append( [ plt.subplot2grid((num_cols_e,1), (num_row_span*2, 0), rowspan=num_row_span) ] + cols_to_plot[2] )
        else:
            axes.append( None )
            
        col_num  = 3
        plot_num = 0
        for i in xrange( col_num, len(cols_to_plot) ):
            y = (num_extra_rows*num_row_span)+plot_num
            
            try:
                cold = cols_to_plot[col_num]
            except:
                continue
                
            while ( cold is None ) and ( col_num < len(cols_to_plot)-1 ):
                col_num += 1
                #print " cn", col_num,"l",len(cols_to_plot)
                cold     = cols_to_plot[col_num]
            
            if cold is not None:
                #print 'i', i, 'y', y, 'col', col_num, 'num cols', num_cols, 'col name', cold[0]
                axes.append( [ plt.subplot2grid((num_cols_e,1), (y, 0) ) ] + cold )
                col_num  += 1
                plot_num += 1

        #print "num_cols", num_cols
        #print "axes"    , len(axes)

        axis_j = 0
        for axis_i, axis_data in enumerate(axes):
            if axis_data is None:
                #print " skip"
                continue
            
            axis, col_to_plot, col_ylim, col_yticks, col_color = axis_data
            
            #print "For axis", axis_i, "col name", col_to_plot
            if axis is None:
                #print " skip"
                continue
                
            p = DataSampled[col_to_plot].plot(ax=axis, kind='area', fontsize=14, stacked=False, color=col_color)
            
            axis.set_title(col_to_plot, fontdict={'fontsize': 16, 'fontweight': 'bold'})
            
            if col_ylim is not None:
                col_ylim_min, col_ylim_max = col_ylim
                
                if col_ylim_min is not None:
                    axis.set_ylim( bottom = col_ylim_min )
                
                if col_ylim_max is not None:
                    axis.set_ylim( top    = col_ylim_max )
            
            
            if col_yticks is not None:
                if col_yticks == 0:
                    axis.set_yticks([])
                    
                else:
                    ylim_min, ylim_max = axis.get_ylim()
                    ylim_diff = ylim_max - ylim_min
                    ylim_step = ylim_diff / (col_yticks*1.0)
                    #print col_to_plot, ylim_min, ylim_max, ylim_diff, ylim_step
                    axis.set_yticks(np.arange(ylim_min,ylim_max+ylim_step,ylim_step))
            
    plt.tight_layout()
    curr_fig = plt.gcf()
    #curr_fig.set_size_inches(CHROM_FIG_W/5.0, CHROM_FIG_H/5.0)
    for of in output_files['Combined graph']:
        print "Saving Image:", of
        curr_fig.savefig(of, dpi=300)
    #curr_fig.set_size_inches(CHROM_FIG_W, CHROM_FIG_H)


Combined Graph

i 3 y 9 col 3 num cols 9 col name Ns
i 4 y 10 col 4 num cols 9 col name AGP Contig
i 5 y 11 col 5 num cols 9 col name AGP Gap
i 6 y 12 col 8 num cols 9 col name K-mer Coverage averaged: 500 bp
i 7 y 13 col 10 num cols 9 col name K-mer Coverage averaged: 5 Kbp
i 8 y 14 col 12 num cols 9 col name K-mer Coverage averaged: 1 Mbp
For axis 0 col name K-mer Coverage
K-mer Coverage 0.0 1.0 1.0 0.2
For axis 1 col name Sequencing Coverage
Sequencing Coverage 0.0 300.0 300.0 60.0
For axis 2 col name BLAST Coverage
BLAST Coverage 0.0 2.0 2.0 0.4
For axis 3 col name Ns
For axis 4 col name AGP Contig
For axis 5 col name AGP Gap
For axis 6 col name K-mer Coverage averaged: 500 bp
K-mer Coverage averaged: 500 bp 0.0 1.0 1.0 1.0
For axis 7 col name K-mer Coverage averaged: 5 Kbp
K-mer Coverage averaged: 5 Kbp 0.0 1.0 1.0 1.0
For axis 8 col name K-mer Coverage averaged: 1 Mbp
K-mer Coverage averaged: 1 Mbp 0.0 0.8 0.8 0.8
Saving Image: reports/eps/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Combined_graph.eps
Saving Image: reports/png/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Combined_graph.png
Saving Image: reports/pdf/run_BWA_JKL_23_JU_1_orig_PROBES_SL2.40ch12_SL2.40sc05611_JBPP0904_000000967164-000000970727.Combined_graph.pdf

In [431]:
if CONFIG_LOCAL:
    %run -i probes_cfg_footer.ipynb


Saulo Aflitos Last updated: 30/06/2015 

CPython 2.7.9
IPython 3.0.0

numpy 1.9.2
scipy 0.15.1
matplotlib 1.4.3
pandas 0.16.0
IPython 3.0.0

compiler   : GCC 4.4.7 20120313 (Red Hat 4.4.7-1)
system     : Linux
release    : 3.13.0-46-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 80
interpreter: 64bit
host name  : assembly
Git hash   : 5f46e34ab0d66890231b7cebb0d492969ea23a90