In [391]:
    
CONFIG_LOCAL = False
#CONFIG_LOCAL = True
    
In [392]:
    
if CONFIG_LOCAL:
    config = {
        'data_folder'   : 'probes',
        'RUN'           : 'run_BWA_JKL_23_JU_1_orig',
        'chromosome'    : 'SL2.40ch12',
        'scaffold'      : 'SL2.40sc05611',
        'out_folder'    : 'reports',
        'out_extensions': ['eps', 'png']
    }
    if True:
        config['BAC'            ] = 'JBPP0904'
        config['BAC_coord_start'] = 967164
        config['BAC_coord_end'  ] = 970727
        config['BAC_coord'      ] = '%012d-%012d' % ( config['BAC_coord_start'], config['BAC_coord_end'  ] )
        config['RUN'            ] = 'run_BWA_JKL_23_JU_1_orig_PROBES'
    
In [393]:
    
if CONFIG_LOCAL:
    %run -i probes_cfg.ipynb
    
In [394]:
    
if CONFIG_LOCAL:
    %run -i probes_cfg_header.ipynb
    
    
    
    
    
    
    
    
    
In [395]:
    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
import os
import operator
#import pylab
#pylab.show()
%pylab inline
pd.set_option('display.notebook_repr_html', True)
#pd.set_option('display.max_columns', 20)
#pd.set_option('display.max_rows', 25)
from IPython.display import HTML, display
def addHeader(level, text):
    display( HTML('''<h%(level)d>%(text)s</h%(level)d>''' % {'level':level, 'text':text}) )
    
    
    
In [396]:
    
addHeader(1,'Files')
addHeader(2,'Input Files')
"""
S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.sam.SL2.40sc06147.sam.cov
S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.sam.SL2.40sc06147.sam.cov.prop.cov
S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.sam.SL2.40sc06147.sam.pos
"""
    
KmerCoverageFile       = "%(data_folder)s/%(RUN)s/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.%(chromosome)s.sam.%(scaffold)s.sam.cov.prop.cov.gz" % config
SequencingCoverageFile = "%(data_folder)s/mapping/out/S_lycopersicum_chromosomes.fa_S_lycopersicum_chromosomes.pos.%(chromosome)s.pos.cov.%(scaffold)s.cov.gz"    % config
AgpContigFile          = "%(data_folder)s/agp/S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.contig.agp.cov.gz"  % config
AgpGapFile             = "%(data_folder)s/agp/S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.gap.agp.cov.gz"     % config
AgpOtherFile           = "%(data_folder)s/agp/S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.other.agp.cov.gz"   % config
AgpUnknownFile         = "%(data_folder)s/agp/S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.unknown.agp.cov.gz" % config
NsFile                 = "%(data_folder)s/Ns/S_lycopersicum_scaffolds.fa_NONE.tab.%(chromosome)s.tab.%(scaffold)s.tab.cov.gz"     % config
infiles                = [KmerCoverageFile, SequencingCoverageFile, 
                          AgpContigFile, AgpGapFile, 
                          AgpUnknownFile, AgpOtherFile, NsFile]
if BAC_MODE:
    """
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_chromosomes.fa_S_lycopersicum_chromosomes.pos.SL2.40ch12.pos.cov.SL2.40sc05611.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.sam.SL2.40sc05611.sam.cov.prop.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727_Product.fasta.blast.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds_from_contigs.2.40.agp.SL2.40sc05611.agp.contig.agp.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds_from_contigs.2.40.agp.SL2.40sc05611.agp.gap.agp.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds_from_contigs.2.40.agp.SL2.40sc05611.agp.other.agp.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds_from_contigs.2.40.agp.SL2.40sc05611.agp.unknown.agp.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds.fa_NONE.tab.SL2.40ch12.tab.SL2.40sc05611.tab.cov
    """
                                                      
    config['in_base_name'] = "%(data_folder)s/%(RUN)s/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.%(chromosome)s.%(scaffold)s.%(BAC)s_primer.%(BAC_coord)s" % config
    KmerCoverageFile       =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.%(chromosome)s.sam.%(scaffold)s.sam.cov.prop.cov.gz" % config
    SequencingCoverageFile =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_chromosomes.fa_S_lycopersicum_chromosomes.pos.%(chromosome)s.pos.cov.%(scaffold)s.cov.gz"        % config
    BlastCoverageFile      =                                                                                                                                      "%(in_base_name)s_Product.fasta.blast.cov.gz"                                                         % config
    AgpContigFile          =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.contig.agp.cov.gz"  % config
    AgpGapFile             =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.gap.agp.cov.gz"     % config
    AgpOtherFile           =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.other.agp.cov.gz"   % config
    AgpUnknownFile         =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.unknown.agp.cov.gz" % config
    NsFile                 =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_scaffolds.fa_NONE.tab.%(chromosome)s.tab.%(scaffold)s.tab.cov.gz"    % config
    
    infiles                = [KmerCoverageFile , SequencingCoverageFile, 
                              BlastCoverageFile, AgpContigFile, 
                              AgpGapFile       , AgpUnknownFile, 
                              AgpOtherFile     , NsFile]
print "%-22s: %-5s %s" % ( "KmerCoverageFile"      , os.path.exists(KmerCoverageFile)      , KmerCoverageFile       )
print "%-22s: %-5s %s" % ( "SequencingCoverageFile", os.path.exists(SequencingCoverageFile), SequencingCoverageFile )
print "%-22s: %-5s %s" % ( "AgpContigFile"         , os.path.exists(AgpContigFile)         , AgpContigFile          )
print "%-22s: %-5s %s" % ( "AgpGapFile"            , os.path.exists(AgpGapFile)            , AgpGapFile             )
print "%-22s: %-5s %s" % ( "AgpOtherFile"          , os.path.exists(AgpOtherFile)          , AgpOtherFile           )
print "%-22s: %-5s %s" % ( "AgpUnknownFile"        , os.path.exists(AgpUnknownFile)        , AgpUnknownFile         )
print "%-22s: %-5s %s" % ( "NsFile"                , os.path.exists(NsFile)                , NsFile                 )
if BAC_MODE:
    print "%-22s: %-5s %s" % ( "BlastCoverageFile" , os.path.exists(BlastCoverageFile)     , BlastCoverageFile      )
print
if not all([os.path.exists(x) for x in infiles]):
    print "missing file"
else:
    print "all files present"
    
    
    
    
In [397]:
    
addHeader(2,'Output Files')
config['out_bn'        ] = "%(out_folder)s/%%s/%(RUN)s_%(chromosome)s_%(scaffold)s_prop" % config
config['out_bn_img'    ] = "%(out_bn)s.%%s.%%s"                                            % config
if BAC_MODE:
    config['out_bn'       ] = "%(out_folder)s/%%s/%(RUN)s_%(chromosome)s_%(scaffold)s_%(BAC)s_%(BAC_coord)s" % config
    config['out_bn_img'   ] = "%(out_bn)s.%%s.%%s"                                                             % config
    
output_files = {
    'all_data'                        : [ config['out_bn_img']  % (ext, 'raw_data'                        , 'csv'     ) for ext in config['out_extensions'] ],
    'all_data_full'                   : [ config['out_bn_img']  % (ext, 'K-raw_data'                      , 'full.csv') for ext in config['out_extensions'] ],
    'K-mer Coverage Stats'            : [ config['out_bn_img']  % (ext, 'K-mer_Coverage_Stats'            , ext       ) for ext in config['out_extensions'] ],
    'Sequencing Coverage Stats'       : [ config['out_bn_img']  % (ext, 'Sequencing_Coverage_Stats'       , ext       ) for ext in config['out_extensions'] ],
    'K-mer Coverage Distribution'     : [ config['out_bn_img']  % (ext, 'K-mer_Coverage_Distribution'     , ext       ) for ext in config['out_extensions'] ],
    'Sequencing Coverage Distribution': [ config['out_bn_img']  % (ext, 'Sequencing_Coverage_Distribution', ext       ) for ext in config['out_extensions'] ],
    'Gaps Distribution'               : [ config['out_bn_img']  % (ext, 'Gaps_Distribution'               , ext       ) for ext in config['out_extensions'] ],
    'Ns Distribution'                 : [ config['out_bn_img']  % (ext, 'Ns_Distribution'                 , ext       ) for ext in config['out_extensions'] ],
    'Combined graph'                  : [ config['out_bn_img']  % (ext, 'Combined_graph'                  , ext       ) for ext in config['out_extensions'] ]
}
if BAC_MODE:
    output_files['BLAST Coverage Stats'       ] = [ config['out_bn_img'] % (ext, 'BLAST_Coverage_Stats'       , ext) for ext in config['out_extensions'] ]
    output_files['BLAST Coverage Distribution'] = [ config['out_bn_img'] % (ext, 'BLAST_Coverage_Distribution', ext) for ext in config['out_extensions'] ]
    
for of in sorted(output_files.keys()):
    print "%-32s:\n - %s" % ( of, "\n - ".join( output_files[of] ) )
    
    
    
In [398]:
    
class size_controller(object):
    def __init__(self, w, h):
        self.w = w
        self.h = h
        
    def __enter__(self):
        self.o = rcParams['figure.figsize']
        rcParams['figure.figsize'] = self.w, self.h
        return None
    
    def __exit__(self, type, value, traceback):
        rcParams['figure.figsize'] = self.o
        
col_type_int  = np.int64
col_type_flo  = np.float64
col_type_str  = np.object
col_type_bol  = np.int8
    
In [399]:
    
addHeader(1,'Read Files')
addHeader(2,'K-mer Coverage File')
col_info = [
    [ "Position"                             , col_type_int ], 
    [ "K-mer Coverage"                       , col_type_flo ], 
    [ "K-mer Coverage averaged: 500 bp"      , col_type_flo ], 
    [ "K-mer Coverage averaged: 2.5 Kbp"     , col_type_flo ], 
    [ "K-mer Coverage averaged: 5 Kbp"       , col_type_flo ], 
    [ "K-mer Coverage averaged: 50 Kbp"      , col_type_flo ], 
    [ "K-mer Coverage averaged: 1 Mbp"       , col_type_flo ], 
    [ "K-mer Coverage averaged: 5 Kbp before", col_type_flo ], 
    [ "K-mer Coverage averaged: 5 Kbp after" , col_type_flo ]
]
col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))
if PARSE_VERBOSE:
    print "\n".join( col_names )
    
    
    
In [400]:
    
SKIP_ROWS = 1
print KmerCoverageFile
KmerData  = pd.read_csv(KmerCoverageFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#", compression="gzip")
    
print "Loaded %d rows and %d columns" % ( KmerData.shape[0], KmerData.shape[1] )
if PARSE_VERBOSE:
    print KmerData.head()
    
    
In [401]:
    
addHeader(2,'Sequencing Coverage File')
col_info = [
    [ "Position"                             , col_type_int ], 
    [ "Sequencing Coverage"                  , col_type_int ]
]
col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))
if PARSE_VERBOSE:
    print col_names
    col_types
    
    
In [402]:
    
SKIP_ROWS = 0
print SequencingCoverageFile
SequencingCoverageData  = pd.read_csv(SequencingCoverageFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#", compression="gzip")
print "Loaded %d rows and %d columns" % ( SequencingCoverageData.shape[0], SequencingCoverageData.shape[1] )
if PARSE_VERBOSE:
    print SequencingCoverageData.head()
    
    
In [403]:
    
if BAC_MODE:
    addHeader(2,'BLAST')
    col_info = [
        [ "Position"                             , col_type_int ], 
        [ "BLAST Coverage"                       , col_type_flo ]
    ]
    col_names=[cf[0] for cf in col_info]
    col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))
    if PARSE_VERBOSE:
        print col_names
        col_types
    
    
In [404]:
    
if BAC_MODE:
    SKIP_ROWS = 0
    
    print BlastCoverageFile
    
    BlastCoverageData  = pd.read_csv(BlastCoverageFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                            skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#", compression="gzip")
        
    print "Loaded %d rows and %d columns" % ( BlastCoverageData.shape[0], BlastCoverageData.shape[1] )
    
    if PARSE_VERBOSE:
        print BlastCoverageData.head()
    
    
In [405]:
    
addHeader(2,'AGP')
SKIP_ROWS = 0
    
    
In [406]:
    
addHeader(3,'Contig')
col_info = [
    [ "Position"                    , col_type_int ], 
    [ "AGP Contig"                  , col_type_bol ]
]
col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))
if PARSE_VERBOSE:
    print col_names
    col_types
    
    
In [407]:
    
print AgpContigFile
AgpContigData  = pd.read_csv(AgpContigFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#", compression="gzip")
print "Loaded %d rows and %d columns" % ( AgpContigData.shape[0], AgpContigData.shape[1] )
if PARSE_VERBOSE:
    print AgpContigData.head()
    
    
In [408]:
    
addHeader(3,'Gap')
col_info = [
    [ "Position"                    , col_type_int ], 
    [ "AGP Gap"                     , col_type_bol ]
]
col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))
if PARSE_VERBOSE:
    print col_names
    col_types
    
    
In [409]:
    
print AgpGapFile
AgpGapData  = pd.read_csv(AgpGapFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#", compression="gzip")
    
print "Loaded %d rows and %d columns" % ( AgpGapData.shape[0], AgpGapData.shape[1] )
if PARSE_VERBOSE:
    print AgpGapData.head()
    
    
In [410]:
    
addHeader(3,'Unknown')
col_info = [
    [ "Position"                    , col_type_int ], 
    [ "AGP Unknown"                 , col_type_bol ]
]
col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))
if PARSE_VERBOSE:
    print col_names
    col_types
    
    
In [411]:
    
print AgpUnknownFile
AgpUnknownData  = pd.read_csv(AgpUnknownFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#", compression="gzip")
    
print "Loaded %d rows and %d columns" % ( AgpUnknownData.shape[0], AgpUnknownData.shape[1] )
if PARSE_VERBOSE:
    print AgpUnknownData.head()
    
    
In [412]:
    
addHeader(3,'Other')
col_info = [
    [ "Position"                    , col_type_int ], 
    [ "AGP Other"                   , col_type_bol ]
]
col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))
if PARSE_VERBOSE:
    print col_names
    col_types
    
    
In [413]:
    
print AgpOtherFile
AgpOtherData  = pd.read_csv(AgpOtherFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#", compression="gzip")
print "Loaded %d rows and %d columns" % ( AgpOtherData.shape[0], AgpOtherData.shape[1] )
if PARSE_VERBOSE:
    print AgpOtherData.head()
    
    
In [414]:
    
addHeader(2,'Ns')
col_info = [
    [ "Position"                    , col_type_int ], 
    [ "Ns"                          , col_type_bol ]
]
col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))
if PARSE_VERBOSE:
    print col_names
    col_types
    
    
In [415]:
    
print NsFile
NsData  = pd.read_csv(NsFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#", compression="gzip")
    
print "Loaded %d rows and %d columns" % ( NsData.shape[0], NsData.shape[1] )
if PARSE_VERBOSE:
    print NsData.head()
    
    
In [416]:
    
addHeader(1,'Merge')
Data = pd.DataFrame(KmerData, copy=True)
Data.combine_first( KmerData )
Data[ "AGP Contig"          ] = AgpContigData[          "AGP Contig"          ]
Data[ "AGP Gap"             ] = AgpGapData[             "AGP Gap"             ]
Data[ "AGP Unknown"         ] = AgpUnknownData[         "AGP Unknown"         ]
Data[ "AGP Other"           ] = AgpOtherData[           "AGP Other"           ]
Data[ "Ns"                  ] = NsData[                 "Ns"                  ]
Data[ "Sequencing Coverage" ] = SequencingCoverageData[ "Sequencing Coverage" ]
if BAC_MODE:
    Data[ "BLAST Coverage"  ] = BlastCoverageData[      "BLAST Coverage"      ]
print "Saved %d rows and %d columns" % ( Data.shape[0], Data.shape[1] )
if PARSE_VERBOSE:
    print Data.head()
    
    
    
In [417]:
    
addHeader(1,'Stats')
addHeader(2,'Describe')
print Data.describe()
    
    
    
    
In [418]:
    
addHeader(2,'Quantiles')
print Data['K-mer Coverage'].quantile([.01, .025, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .975, .99])
addHeader(2,'Percentiles')
print "count     ", Data['K-mer Coverage'].count()
print "count == 1", Data['K-mer Coverage'][ Data['K-mer Coverage'] == 1].count()
print "prop  == 1", Data['K-mer Coverage'][ Data['K-mer Coverage'] == 1].count() * 1.0 / Data['K-mer Coverage'].count()
    
    
    
    
    
In [419]:
    
addHeader(2,'Median')
print Data.median()
    
    
    
In [420]:
    
addHeader(2,'MAD')
print Data.mad()
    
    
    
In [421]:
    
addHeader(1,'Plot')
addHeader(2,'K-mer Coverage Stats')
with size_controller(FULL_FIG_W, FULL_FIG_H):
    bqc = Data.boxplot(column=["K-mer Coverage", 
                               "K-mer Coverage averaged: 500 bp",
                               "K-mer Coverage averaged: 2.5 Kbp",
                               "K-mer Coverage averaged: 5 Kbp",
                               "K-mer Coverage averaged: 50 Kbp",
                               "K-mer Coverage averaged: 1 Mbp",
                               "K-mer Coverage averaged: 5 Kbp before",
                               "K-mer Coverage averaged: 5 Kbp after"], return_type='dict', rot=45)
    
    plt.setp(bqc['boxes'   ], color='black')
    plt.setp(bqc['medians' ], color='black')
    plt.setp(bqc['whiskers'], color='black')
    plt.setp(bqc['fliers'  ], color='black')
    
    for of in output_files['K-mer Coverage Stats']:
        print "Saving Image:", of
        savefig(of)
    
    
    
    
    
In [422]:
    
addHeader(2,'Sequencing Coverage Stats')
with size_controller(FULL_FIG_W, FULL_FIG_H):
    bqc = Data.boxplot(column=['Sequencing Coverage'], return_type='dict')
    
    plt.setp(bqc['boxes'   ], color='black')
    plt.setp(bqc['medians' ], color='black')
    plt.setp(bqc['whiskers'], color='black')
    plt.setp(bqc['fliers'  ], color='black')
    
    for of in output_files['Sequencing Coverage Stats']:
        print "Saving Image:", of
        savefig(of)
    
    
    
    
In [423]:
    
if BAC_MODE:
    addHeader(2,'BLAST Coverage Stats')
    
    with size_controller(FULL_FIG_W, FULL_FIG_H):
        bqc = Data.boxplot(column=['BLAST Coverage'], return_type='dict')
        plt.setp(bqc['boxes'   ], color='black')
        plt.setp(bqc['medians' ], color='black')
        plt.setp(bqc['whiskers'], color='black')
        plt.setp(bqc['fliers'  ], color='black')
        for of in output_files['BLAST Coverage Stats']:
            print "Saving Image:", of
            savefig(of)
    
    
    
    
In [424]:
    
addHeader(2,'K-mer Coverage Distribution')
with size_controller(FULL_FIG_W, FULL_FIG_H):
    hf = Data[ Data["K-mer Coverage"] > 0 ]['Position']
    
    print "Number of rows:", hf.size
    if hf.size > 0:
        hs = hf.hist(color=HISTOGRAM_COLOR)
        for of in output_files['K-mer Coverage Distribution']:
            print "Saving Image:", of
            savefig(of)
    
    
    
    
In [425]:
    
addHeader(2,'Sequencing Coverage Distribution')
with size_controller(FULL_FIG_W, FULL_FIG_H):
    hf = Data[ Data["Sequencing Coverage"] > 0 ]['Position']
    
    print "Number of rows:", hf.size
    if hf.size > 0:
        hs = hf.hist(color=HISTOGRAM_COLOR)
        for of in output_files['Sequencing Coverage Distribution']:
            print "Saving Image:", of
            savefig(of)
    
    
    
    
In [426]:
    
if BAC_MODE:
    addHeader(2,'BLAST Coverage Distribution')
    
    with size_controller(FULL_FIG_W, FULL_FIG_H):
        hf = Data[ Data["BLAST Coverage"] > 0 ]['Position']
    
        print "Number of rows:", hf.size
        if hf.size > 0:
            hs = hf.hist(color=HISTOGRAM_COLOR)
            
            for of in output_files['BLAST Coverage Distribution']:
                print "Saving Image:", of
                savefig(of)
    
    
    
    
In [427]:
    
addHeader(2,'Gaps Distribution')
with size_controller(FULL_FIG_W, FULL_FIG_H):
    hf = Data[ Data["AGP Gap"] > 0 ]['Position']
    
    print "Number of rows:", hf.size
    if hf.size > 0:
        hs = hf.hist(color=HISTOGRAM_COLOR)
        
        for of in output_files['Gaps Distribution']:
            print "Saving Image:", of
            savefig(of)
    
    
    
In [428]:
    
addHeader(2,'Ns Distribution')
with size_controller(FULL_FIG_W, FULL_FIG_H):
    hf = Data[ Data["Ns"] > 0 ]['Position']
    
    print "Number of rows:", hf.size
    if hf.size > 0:
        hs = hf.hist(color=HISTOGRAM_COLOR)
        
        for of in output_files['Ns Distribution']:
            print "Saving Image:", of
            savefig(of)
    
    
    
In [429]:
    
addHeader(2,'CSV Output')
SAMPLE_EVERY = 1
while ( Data.shape[0] / SAMPLE_EVERY ) > MAX_ROWS:
    SAMPLE_EVERY += 1
    
print "Original Size %d rows and %d columns" % ( Data.shape[0], Data.shape[1] )
if SAMPLE_EVERY != 1:
    print "SAMPLING EVERY %d ROWS" % SAMPLE_EVERY
    
    for of in output_files['all_data_full']:
        print "Saving full data to: ", of
        Data.to_csv(of, sep='\t', index=False)
    
    DataSampled = Data[::SAMPLE_EVERY]
    print "New Size %d rows and %d columns" % ( DataSampled.shape[0], DataSampled.shape[1] )
    
    if PARSE_VERBOSE:
        print DataSampled.head()
else:
    print "no need to sample"
    DataSampled = Data
for of in output_files['all_data']:
    print "Saving data to     :", of
    DataSampled.to_csv(of, sep='\t', index=False)
    
    
    
In [436]:
    
addHeader(2,'Combined Graph')
num_cols = sum([1 if x is not None else 0 for x in cols_to_plot])
#print "num_cols:", num_cols, " :: ", ", ".join(sorted(list(set([x[0] if x is not None else None for x in cols_to_plot])-set([None]))))
#print "\n".join( [ str( (x, y[0]) ) if y is not None else str( (x, "None") ) for x,y in enumerate(cols_to_plot) ] )
with size_controller(CHROM_FIG_W, CHROM_FIG_H):
    if True:
        #fig, axes = plt.subplots(nrows=len(cols_to_plot)+2, ncols=1)
        f = plt.figure()
        
        plt.subplots_adjust(wspace=0.5, hspace=0.5)
        num_extra_rows = 3
        num_row_span   = 3
        
        num_cols_e     = num_cols + (num_extra_rows*num_row_span)
        axes           = []
        axes.append( [ plt.subplot2grid((num_cols_e,1), (0           , 0), rowspan=num_row_span) ] + cols_to_plot[0] )
        axes.append( [ plt.subplot2grid((num_cols_e,1), (num_row_span, 0), rowspan=num_row_span) ] + cols_to_plot[1] )
        
        #print "num_extra_rows", num_extra_rows
        #print "num_row_span  ", num_row_span
        #print "num_cols_e    ", num_cols_e
        
        if BAC_MODE:
            axes.append( [ plt.subplot2grid((num_cols_e,1), (num_row_span*2, 0), rowspan=num_row_span) ] + cols_to_plot[2] )
        else:
            axes.append( None )
            
        col_num  = 3
        plot_num = 0
        for i in xrange( col_num, len(cols_to_plot) ):
            y = (num_extra_rows*num_row_span)+plot_num
            
            try:
                cold = cols_to_plot[col_num]
            except:
                continue
                
            while ( cold is None ) and ( col_num < len(cols_to_plot)-1 ):
                col_num += 1
                #print " cn", col_num,"l",len(cols_to_plot)
                cold     = cols_to_plot[col_num]
            
            if cold is not None:
                #print 'i', i, 'y', y, 'col', col_num, 'num cols', num_cols, 'col name', cold[0]
                axes.append( [ plt.subplot2grid((num_cols_e,1), (y, 0) ) ] + cold )
                col_num  += 1
                plot_num += 1
        #print "num_cols", num_cols
        #print "axes"    , len(axes)
        axis_j = 0
        for axis_i, axis_data in enumerate(axes):
            if axis_data is None:
                #print " skip"
                continue
            
            axis, col_to_plot, col_ylim, col_yticks, col_color = axis_data
            
            #print "For axis", axis_i, "col name", col_to_plot
            if axis is None:
                #print " skip"
                continue
                
            p = DataSampled[col_to_plot].plot(ax=axis, kind='area', fontsize=14, stacked=False, color=col_color)
            
            axis.set_title(col_to_plot, fontdict={'fontsize': 16, 'fontweight': 'bold'})
            
            if col_ylim is not None:
                col_ylim_min, col_ylim_max = col_ylim
                
                if col_ylim_min is not None:
                    axis.set_ylim( bottom = col_ylim_min )
                
                if col_ylim_max is not None:
                    axis.set_ylim( top    = col_ylim_max )
            
            
            if col_yticks is not None:
                if col_yticks == 0:
                    axis.set_yticks([])
                    
                else:
                    ylim_min, ylim_max = axis.get_ylim()
                    ylim_diff = ylim_max - ylim_min
                    ylim_step = ylim_diff / (col_yticks*1.0)
                    #print col_to_plot, ylim_min, ylim_max, ylim_diff, ylim_step
                    axis.set_yticks(np.arange(ylim_min,ylim_max+ylim_step,ylim_step))
            
    plt.tight_layout()
    curr_fig = plt.gcf()
    #curr_fig.set_size_inches(CHROM_FIG_W/5.0, CHROM_FIG_H/5.0)
    for of in output_files['Combined graph']:
        print "Saving Image:", of
        curr_fig.savefig(of, dpi=300)
    #curr_fig.set_size_inches(CHROM_FIG_W, CHROM_FIG_H)
    
    
    
    
In [431]:
    
if CONFIG_LOCAL:
    %run -i probes_cfg_footer.ipynb