In [28]:
    
# toggle: the second assignment wins; comment it out to run without the local config
CONFIG_LOCAL = False
CONFIG_LOCAL = True
    
In [29]:
    
if CONFIG_LOCAL:
    config = {
        'data_folder'   : 'probes',
        'RUN'           : 'run_BWA_JKL_23_JU_1_orig',
        'chromosome'    : 'SL2.40ch12',
        'scaffold'      : 'SL2.40sc05611',
        'out_folder'    : 'reports',
        'out_extensions': ['eps', 'png']
    }
    if True:  # BAC-specific settings; set to False to analyse the whole scaffold
        config['BAC'            ] = 'JBPP0904'
        config['BAC_coord_start'] = 967164
        config['BAC_coord_end'  ] = 970727
        config['BAC_coord'      ] = '%012d-%012d' % ( config['BAC_coord_start'], config['BAC_coord_end'  ] )
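        # e.g. 967164, 970727 -> '000000967164-000000970727' (the 12-digit padding used in the BAC file names below)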
        config['RUN'            ] = 'run_BWA_JKL_23_JU_1_orig_PROBES'
    
In [30]:
    
if CONFIG_LOCAL:
    %run -i probes_cfg.ipynb
    
In [31]:
    
if CONFIG_LOCAL:
    %run -i probes_cfg_header.ipynb
    
In [32]:
    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
import os
import operator
#import pylab
#pylab.show()
# %pylab injects numpy/matplotlib names (e.g. rcParams, savefig, show) used throughout below
%pylab inline
pd.set_option('display.notebook_repr_html', True)
#pd.set_option('display.max_columns', 20)
#pd.set_option('display.max_rows', 25)
from IPython.display import HTML, display
def addHeader(level, text):
    display( HTML('''<h%(level)d>%(text)s</h%(level)d>''' % {'level':level, 'text':text}) )
    
    
In [33]:
    
addHeader(1,'Files')
addHeader(2,'Input Files')
"""
S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.sam.SL2.40sc06147.sam.cov
S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.sam.SL2.40sc06147.sam.cov.prop.cov
S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.sam.SL2.40sc06147.sam.pos
"""
config['agp_source_scaff_name'  ] = 'S_lycopersicum_chromosomes_from_scaffolds.2.40.agp'
config['agp_source_contig_name' ] = 'S_lycopersicum_chromosomes_from_contigs.2.40.agp'
config['fasta_name'             ] = 'S_lycopersicum_chromosomes.2.40.fa'
config['fasta_name2'            ] = 'S_lycopersicum_chromosomes'
config['cdna_alignments_name'   ] = 'ITAG2.3_cdna_alignments.gff3'
config['gene_models_name'       ] = 'ITAG2.3_gene_models.gff3'
config['genomic_reagents_name'  ] = 'ITAG2.3_genomic_reagents.gff3'
config['repeats_aggressive_name'] = 'ITAG2.3_repeats_aggressive.gff3'
config['repeats_name'           ] = 'ITAG2.3_repeats.gff3'
AgpSourceScaff          = "%(data_folder)s/extraGff/%(agp_source_scaff_name)s"   % config
AgpSourceContig         = "%(data_folder)s/extraGff/%(agp_source_contig_name)s"  % config
AgpContigFile           = "%(data_folder)s/agp/S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.contig.agp.cov"  % config
AgpGapFile              = "%(data_folder)s/agp/S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.gap.agp.cov"     % config
AgpOtherFile            = "%(data_folder)s/agp/S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.other.agp.cov"   % config
AgpUnknownFile          = "%(data_folder)s/agp/S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.unknown.agp.cov" % config
FastaFile               = "%(data_folder)s/extraGff/%(fasta_name)s"              % config
cdna_alignments_name    = "%(data_folder)s/extraGff/%(cdna_alignments_name)s"    % config
gene_models_name        = "%(data_folder)s/extraGff/%(gene_models_name)s"        % config
genomic_reagents_name   = "%(data_folder)s/extraGff/%(genomic_reagents_name)s"   % config
repeats_aggressive_name = "%(data_folder)s/extraGff/%(repeats_aggressive_name)s" % config
repeats_name            = "%(data_folder)s/extraGff/%(repeats_name)s"            % config
KmerCoverageFile        = "%(data_folder)s/%(RUN)s/%(fasta_name2)s_S_lycopersicum_scaffolds.sam.%(chromosome)s.sam.%(scaffold)s.sam.cov.prop.cov" % config
SequencingCoverageFile  = "%(data_folder)s/mapping/out/%(fasta_name2)s_S_lycopersicum_chromosomes.pos.%(chromosome)s.pos.cov.%(scaffold)s.cov"    % config
SamFile                 = "%(data_folder)s/mapping/out/%(fasta_name2)s.fa_%(fasta_name2)s.sam.gz" % config
NsFile                  = "%(data_folder)s/Ns/S_lycopersicum_scaffolds.fa_NONE.tab.%(chromosome)s.tab.%(scaffold)s.tab.cov"     % config
if BAC_MODE:
    """
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_chromosomes.fa_S_lycopersicum_chromosomes.pos.SL2.40ch12.pos.cov.SL2.40sc05611.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.sam.SL2.40sc05611.sam.cov.prop.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727_Product.fasta.blast.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds_from_contigs.2.40.agp.SL2.40sc05611.agp.contig.agp.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds_from_contigs.2.40.agp.SL2.40sc05611.agp.gap.agp.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds_from_contigs.2.40.agp.SL2.40sc05611.agp.other.agp.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds_from_contigs.2.40.agp.SL2.40sc05611.agp.unknown.agp.cov
                                                      S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.SL2.40ch12.SL2.40sc05611.JBPP0904_primer.000000967164-000000970727.S_lycopersicum_scaffolds.fa_NONE.tab.SL2.40ch12.tab.SL2.40sc05611.tab.cov
    """
                                                      
    config['in_base_name'] = "%(data_folder)s/%(RUN)s/S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.%(chromosome)s.%(scaffold)s.%(BAC)s_primer.%(BAC_coord)s" % config
    KmerCoverageFile       =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_chromosomes.fa_S_lycopersicum_scaffolds.sam.%(chromosome)s.sam.%(scaffold)s.sam.cov.prop.cov" % config
    SequencingCoverageFile =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_chromosomes.fa_S_lycopersicum_chromosomes.pos.%(chromosome)s.pos.cov.%(scaffold)s.cov"        % config
    BlastCoverageFile      =                                                                                                                                      "%(in_base_name)s_Product.fasta.blast.cov"                                                         % config
    AgpContigFile          =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.contig.agp.cov"  % config
    AgpGapFile             =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.gap.agp.cov"     % config
    AgpOtherFile           =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.other.agp.cov"   % config
    AgpUnknownFile         =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_scaffolds_from_contigs.2.40.agp.%(scaffold)s.agp.unknown.agp.cov" % config
    NsFile                 =                                                                                                                                      "%(in_base_name)s.S_lycopersicum_scaffolds.fa_NONE.tab.%(chromosome)s.tab.%(scaffold)s.tab.cov"    % config
    
infiles                 = [
    AgpSourceScaff          , AgpSourceContig         ,
    AgpContigFile           , AgpGapFile              ,
    AgpOtherFile            , AgpUnknownFile          ,
    FastaFile               , cdna_alignments_name    ,
    gene_models_name        , genomic_reagents_name   ,
    repeats_aggressive_name , repeats_name            ,
    KmerCoverageFile        , SequencingCoverageFile  ,
    SamFile                 , NsFile                  
]
print "%-23s: %-5s %s" % ( "AgpSourceScaff"         , os.path.exists(AgpSourceScaff)         , AgpSourceScaff          )
print "%-23s: %-5s %s" % ( "AgpSourceContig"        , os.path.exists(AgpSourceContig)        , AgpSourceContig         )
print "%-23s: %-5s %s" % ( "AgpContigFile"          , os.path.exists(AgpContigFile)          , AgpContigFile           )
print "%-23s: %-5s %s" % ( "AgpGapFile"             , os.path.exists(AgpGapFile)             , AgpGapFile              )
print "%-23s: %-5s %s" % ( "AgpOtherFile"           , os.path.exists(AgpOtherFile)           , AgpOtherFile            )
print "%-23s: %-5s %s" % ( "AgpUnknownFile"         , os.path.exists(AgpUnknownFile)         , AgpUnknownFile          )
print "%-23s: %-5s %s" % ( "FastaFile"              , os.path.exists(FastaFile)              , FastaFile               )
print "%-23s: %-5s %s" % ( "cdna_alignments_name"   , os.path.exists(cdna_alignments_name)   , cdna_alignments_name    )
print "%-23s: %-5s %s" % ( "gene_models_name"       , os.path.exists(gene_models_name)       , gene_models_name        )
print "%-23s: %-5s %s" % ( "genomic_reagents_name"  , os.path.exists(genomic_reagents_name)  , genomic_reagents_name   )
print "%-23s: %-5s %s" % ( "repeats_aggressive_name", os.path.exists(repeats_aggressive_name), repeats_aggressive_name )
print "%-23s: %-5s %s" % ( "repeats_name"           , os.path.exists(repeats_name)           , repeats_name            )
print "%-23s: %-5s %s" % ( "KmerCoverageFile"       , os.path.exists(KmerCoverageFile)       , KmerCoverageFile        )
print "%-23s: %-5s %s" % ( "SequencingCoverageFile" , os.path.exists(SequencingCoverageFile) , SequencingCoverageFile  )
print "%-23s: %-5s %s" % ( "SamFile"                , os.path.exists(SamFile)                , SamFile                 )
print "%-23s: %-5s %s" % ( "NsFile"                 , os.path.exists(NsFile)                 , NsFile                  )
if BAC_MODE:
    print "%-23s: %-5s %s" % ( "BlastCoverageFile" , os.path.exists(BlastCoverageFile)     , BlastCoverageFile      )
    infiles.append(BlastCoverageFile)
    
print
if not all([os.path.exists(x) for x in infiles]):
    print "missing file(s):"
    for x in infiles:
        if not os.path.exists(x):
            print " -", x
else:
    print "all files present"
    
    
    
    
In [34]:
    
addHeader(2,'Output Files')
config['out_bn'        ] = "%(out_folder)s/%%s/%(RUN)s_%(chromosome)s_%(scaffold)s_prop" % config
config['out_bn_img'    ] = "%(out_bn)s.%%s.%%s"                                            % config
if BAC_MODE:
    config['out_bn'       ] = "%(out_folder)s/%%s/%(RUN)s_%(chromosome)s_%(scaffold)s_%(BAC)s_%(BAC_coord)s" % config
    config['out_bn_img'   ] = "%(out_bn)s.%%s.%%s"                                                             % config
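# Note: the doubled %% survives the first %-substitution as a literal %s, so out_bn_img
# ends up a template with three slots, filled per (extension, plot name, extension) below.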
    
output_files = {
    'all_data'                        : [ config['out_bn_img']  % (ext, 'raw_data'                        , 'csv'     ) for ext in config['out_extensions'] ],
    'all_data_full'                   : [ config['out_bn_img']  % (ext, 'K-raw_data'                      , 'full.csv') for ext in config['out_extensions'] ],
    'K-mer Coverage Stats'            : [ config['out_bn_img']  % (ext, 'K-mer_Coverage_Stats'            , ext       ) for ext in config['out_extensions'] ],
    'Sequencing Coverage Stats'       : [ config['out_bn_img']  % (ext, 'Sequencing_Coverage_Stats'       , ext       ) for ext in config['out_extensions'] ],
    'K-mer Coverage Distribution'     : [ config['out_bn_img']  % (ext, 'K-mer_Coverage_Distribution'     , ext       ) for ext in config['out_extensions'] ],
    'Sequencing Coverage Distribution': [ config['out_bn_img']  % (ext, 'Sequencing_Coverage_Distribution', ext       ) for ext in config['out_extensions'] ],
    'Gaps Distribution'               : [ config['out_bn_img']  % (ext, 'Gaps_Distribution'               , ext       ) for ext in config['out_extensions'] ],
    'Ns Distribution'                 : [ config['out_bn_img']  % (ext, 'Ns_Distribution'                 , ext       ) for ext in config['out_extensions'] ],
    'Combined graph'                  : [ config['out_bn_img']  % (ext, 'Combined_graph'                  , ext       ) for ext in config['out_extensions'] ]
}
if BAC_MODE:
    output_files['BLAST Coverage Stats'       ] = [ config['out_bn_img'] % (ext, 'BLAST_Coverage_Stats'       , ext) for ext in config['out_extensions'] ]
    output_files['BLAST Coverage Distribution'] = [ config['out_bn_img'] % (ext, 'BLAST_Coverage_Distribution', ext) for ext in config['out_extensions'] ]
    
for of in sorted(output_files.keys()):
    print "%-32s:\n - %s" % ( of, "\n - ".join( output_files[of] ) )
    
    
    
In [35]:
    
col_type_int16 = np.int16
col_type_int32 = np.int32
col_type_int   = np.int64
col_type_flo   = np.float64
col_type_flo32 = np.float32
col_type_str   = np.string_      # fixed-width byte strings
col_type_bol   = np.int8         # "boolean" tracks stored as int8 so overlaps can be summed
col_type_char  = np.dtype('a1')  # single-byte character
    
In [36]:
    
import gzip
def open_file(infile, mode):
    """open() that transparently handles .gz files (forcing binary mode for gzip)."""
    if infile.endswith('.gz'):
        if 'b' not in mode:
            mode = mode + 'b'
        return gzip.open(infile, mode)
    else:
        return open(infile, mode)
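# Usage sketch (path hypothetical): plain and gzipped files open the same way.
# with open_file('mapping/out/example.sam.gz', 'r') as fhd:
#     first = fhd.readline()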
    
In [37]:
    
class size_controller(object):
    """Context manager that temporarily overrides rcParams['figure.figsize']."""
    def __init__(self, w, h):
        self.w = w
        self.h = h
        
    def __enter__(self):
        self.o = rcParams['figure.figsize']
        rcParams['figure.figsize'] = self.w, self.h
        return None
    
    def __exit__(self, type, value, traceback):
        rcParams['figure.figsize'] = self.o
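# Usage sketch (sizes hypothetical): figures inside the with-block are drawn at 16x4
# inches, after which the previous rcParams['figure.figsize'] is restored.
# with size_controller(16, 4):
#     plt.plot(range(10))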
    
In [38]:
    
class coortroller(object):
    """Coordinate controller: writes (chrom, start, end, value) intervals onto a flat genome-wide array."""
    def __init__(self, arr, keys):
        self.arr    = arr
        self.keys   = keys
    
    def add(self, chrom, start, end, value):
        chrom_start = self.keys[chrom]['cumm']
        add_start   = chrom_start + start
        add_end     = chrom_start + end
        self.arr[add_start-1:add_end] += value  # weighted increment: 1 for counts, identity fraction for the BLAST 'SYM' track
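# Worked example (numbers hypothetical): with chroms = {'c1': {'size': 1000, 'cumm': 1},
# 'c2': {'size': 500, 'cumm': 1001}}, add('c2', 5, 10, 1) increments arr[1005:1011],
# i.e. the six flat positions covering bases 5..10 of c2.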
    
In [39]:
    
def parse_CSV( infile, col_info, delimiter="\t", comment="#", header=None, 
              skiprows=0, nrows=None, verbose=False, converters=None, print_log=True ):
    
    """Load a delimited file into a DataFrame using (name, dtype) pairs from col_info."""
    col_names = [cf[0] for cf in col_info]
    col_types = dict((cf[0], cf[1]) for cf in col_info)
    
    if print_log:
        print "parsing", infile
    
    if verbose or print_log:
        # echo the first five lines so the raw format is visible in the log
        with open_file(infile, 'r') as fhd:
            for f, line in enumerate(fhd):
                print line,
                if f == 4:
                    break
    
    data  = pd.read_csv(infile, header=header, names=col_names, dtype=col_types, nrows=nrows,
                        skiprows=skiprows, verbose=verbose, delimiter=delimiter, comment=comment, 
                        converters=converters)
    if print_log:
        print "Loaded %d rows and %d columns" % ( data.shape[0], data.shape[1] )
    if verbose or print_log:
        print data.head(1)
        print data.tail(1)
    
    return data
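# Usage sketch (file name hypothetical): a two-column, tab-separated coverage table.
# cov_tab = parse_CSV('example.cov', [['Position', col_type_int], ['Coverage', col_type_flo]])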
    
In [40]:
    
def to_coverage(tab_data, col_name, start_name, end_name, chromTotalLen, chroms, dtype=col_type_bol, print_log=True, cov_data=None):
    if cov_data is None:  # identity check; '==' would broadcast over a numpy array
        cov_data    = np.zeros(chromTotalLen, dtype=dtype)
        
    data_controller   = coortroller(cov_data, chroms)
    total_coord_count = 0
    total_pos_count   = 0
    
    for col in tab_data[col_name].unique():
        if print_log:
            print col_name, col,
            print 'filtering data',
        
        cold = tab_data[ tab_data[col_name] == col ]
        if print_log:
            #print 'chromd', chromd.head()
            print 'adding coordinates',
        
        coord_count = 0
        pos_count   = 0
        
        for v in cold[[start_name, end_name]].iterrows():
            start        = min([v[1][0], v[1][1]])
            end          = max([v[1][0], v[1][1]])
            coord_count += 1
            pos_count   += end - start
            #print "Start:'" + str(start) + "' End: '" + str(end) + "'"
            data_controller.add(col, start, end, 1)
            #break
        #break
        total_coord_count += coord_count
        total_pos_count   += pos_count
        if print_log:
            print 'added %10d coordinates in a total of %10d positions' % ( coord_count, pos_count )
    
    if print_log:
        print 'added %10d coordinates in a total of %10d positions' % ( total_coord_count, total_pos_count )
    return cov_data
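# Usage sketch (assuming a parsed GFF table named gff_data): flatten feature intervals
# into a per-base track, mirroring the converter_parameters used by the processes below.
# gene_cov = to_coverage(gff_data, 'Chromosome', 'Start', 'End', chromTotalLen, chroms)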
    
In [41]:
    
def process_process( proc ):
    if not proc['enabled']:
        return
        
    addHeader(3, proc['title'])
    
    
    of = None
    if 'filter' in proc:
        of = proc['filter_out_file']
    else:
        of = proc['parser_parameters'][0][0]
    
    if 'cov' not in proc:
        if 'data' not in proc or 'data' not in proc['data']:
            if 'filter' in proc and 'filtered' not in proc:
                print "filtering"
                if   len( proc['filter_parameters'][0] ) == 2:
                    proc['filter_parameters'][0].append( proc['filter_in_file' ] )
                    proc['filter_parameters'][0].append( proc['filter_out_file'] )
                elif len( proc['filter_parameters'][0] ) == 4:
                    proc['filter_parameters'][0][2] = proc['filter_in_file' ]
                    proc['filter_parameters'][0][3] = proc['filter_out_file']
                proc['filter'           ]( *proc['filter_parameters'][0], **proc['filter_parameters'][1] )
                proc['parser_parameters'][0][0] = proc['filter_out_file']
                proc['filtered'         ] = True
        
            if os.path.exists(of + '.df.h5') and False:  # 'and False' keeps the HDF5 cache path disabled
                print "loading data from h5"
                proc['data']         = pd.HDFStore( of + '.df.h5' )
            else:
                print "loading data from raw"
                #proc['data']         = pd.HDFStore( of + '.df.h5' )
                #proc['data']['data'] = proc['parser'   ]( *proc['parser_parameters'   ][0], **proc['parser_parameters'   ][1] )
                proc['data']         = {}
                proc['data']['data'] = proc['parser'   ]( *proc['parser_parameters'   ][0], **proc['parser_parameters'   ][1] )
                
        converter_parameters = proc['converter_parameters'][0]
        converter_parameters.insert(0, proc['data']['data'])
        
        if os.path.exists(of + '.cov.npy'):
            print "loading coverage from pickle"
            proc['cov'] = np.load( open(of + '.cov.npy', 'rb') )
        else:
            print "loading coverage from raw"
            proc['cov'] = proc['converter']( *converter_parameters, **proc['converter_parameters'][1])
            np.save( open(of + '.cov.npy', 'wb'), proc['cov'] )
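# Cache contract of process_process: the input is filtered once (to filter_out_file),
# parsed into a DataFrame, converted to a flat coverage array, and the array memoised
# as '<out>.cov.npy' so reruns skip straight to the pickled coverage.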
    
In [42]:
    
#http://www.ncbi.nlm.nih.gov/projects/genome/assembly/agp/AGP_Specification.shtml
# A Active Finishing
# D Draft HTG (often phase1 and phase2 are called Draft, whether or not they have the draft keyword).
# F Finished HTG (phase3)
# G Whole Genome Finishing
# O Other sequence (typically means no HTG keyword)
# P Pre Draft
# W WGS contig
# N gap with specified size
# U gap of unknown size, defaulting to 100 bases.
#!touch {AgpSource}.tst
import os
def filter_col(col_num, col_val, infile, outfile, contains=None, misses=None, skip=None):
    if os.path.exists(outfile):
        print "output file %s exists" % outfile
        return
    
    ext_str  = ""
    ext_str += " Contains: %s" % ", ".join(contains) if contains is not None else ""
    ext_str += " Misses: %s"   % ", ".join(misses  ) if misses   is not None else ""
    ext_str += " Skip: %s"     % ", ".join(skip    ) if skip     is not None else ""
    print "Filtering %s for %s. saving in %s.%s" % ( infile, ", ".join(col_val), outfile, ext_str )
    
    with open(outfile, 'w') as ofhd:
        with open(infile, 'r') as ifhd:
            for line in ifhd:
                lines = line.strip()
                if len(lines) == 0:
                    ofhd.write(line)
                    continue
                
                if skip is not None:
                    if any([c in line for c in skip]):
                        continue
                
                if lines[0] == "#":
                    ofhd.write(line)
                    continue
                cols = lines.split("\t")
                if len(cols) <= col_num:
                    continue
                    
                if cols[col_num] in col_val:
                    cts = True
                    if contains is not None:
                        if not all([c in line for c in contains]):
                            cts = False
                    if misses is not None:
                        if any([c in line for c in misses]):
                            cts = False
                    if cts:
                        ofhd.write(line)
    
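# Usage sketch (file names hypothetical): keep only type-'W' (WGS contig) AGP rows,
# matching how filter_parameters [4, ['W']] plus in/out files is applied below.
# filter_col(4, ['W'], 'chromosomes.agp', 'chromosomes.agp.W.agp')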
In [43]:
    
addHeader(1,'Read Files')
    
    
In [44]:
    
addHeader(2,'Fasta')
    
    
In [45]:
    
from collections import OrderedDict
import cPickle as pickle
chroms          = OrderedDict()
FastaFilePickle = FastaFile + '.pkl'
if os.path.exists(FastaFilePickle):
    print "loading fasta pickle"
    chroms = pickle.load(open(FastaFilePickle, 'rb'))
    
else:
    print "loading fasta"
    lastChrom = None
    with open(FastaFile, 'r') as fhd:
        cumm = 0
        for line in fhd:
            line = line.strip()
            if len(line) == 0:
                continue
            if line[0] == ">":
                lastChrom  = line[1:]
                chroms[lastChrom] = { 'size': 0, 'cumm': cumm+1 }
                continue
            ll = len(line)
            chroms[lastChrom]['size'] += ll
            cumm += ll
    pickle.dump(chroms, open(FastaFilePickle,'wb'), pickle.HIGHEST_PROTOCOL)
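# chroms maps each sequence name to its length and its 1-based offset in the
# concatenated genome, e.g. (values illustrative) {'SL2.40ch00': {'size': 21805821, 'cumm': 1}, ...}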
    
print "\n".join( [ "%-15s size %12d cummulative size %12d" % ( x, chroms[x]['size'], chroms[x]['cumm'] ) for x in chroms ] )
    
    
In [46]:
    
chromTotalLen = sum([chroms[x]['size'] for x in chroms])
print "chromTotalLen", chromTotalLen
    
    
In [47]:
    
addHeader(2,'Setup')
    
    
In [48]:
    
col_info_agp_w = [
    [ "Chromosome"                           , col_type_str  ], 
    [ "Start"                                , col_type_int  ], 
    [ "End"                                  , col_type_int  ], 
    [ "Seq"                                  , col_type_int  ], 
    [ "Type"                                 , col_type_char ], 
    [ "Scaffold"                             , col_type_str  ], 
    [ "Begin"                                , col_type_int  ],
    [ "Finish"                               , col_type_int  ],
    [ "Orientation"                          , col_type_char ]
]
col_info_agp_u = [
    [ "Chromosome"                           , col_type_str ], 
    [ "Start"                                , col_type_int ], 
    [ "End"                                  , col_type_int ], 
    [ "Seq"                                  , col_type_int ], 
    [ "Type"                                 , col_type_char], 
    [ "GapLength"                            , col_type_int ], 
    [ "GapType"                              , col_type_str ], 
    [ "Linkage"                              , col_type_str ]
]
col_info_agp_n = [
    [ "Chromosome"                           , col_type_str ], 
    [ "Start"                                , col_type_int ], 
    [ "End"                                  , col_type_int ], 
    [ "Seq"                                  , col_type_int ], 
    [ "Type"                                 , col_type_char], 
    [ "GapLength"                            , col_type_int ], 
    [ "GapType"                              , col_type_str ], 
    [ "Linkage"                              , col_type_str ],
    [ "Empty"                                , col_type_str ]
]
col_info_gff = [
    [ "Chromosome"                           , col_type_str ], 
    [ "Name"                                 , col_type_str ],
    [ "Type"                                 , col_type_str ],
    [ "Start"                                , col_type_int ], 
    [ "End"                                  , col_type_int ], 
    #[ "Qual"                                 , col_type_flo ], 
    [ "Qual"                                 , col_type_str ], 
    [ "Orientation"                          , col_type_char], 
    [ "Filter"                               , col_type_str ],
    [ "Info"                                 , col_type_str ]   
]
    
In [49]:
    
processes   = []
PARSE_AGP   = True
PARSE_GFF   = True
PARSE_BLAST = True
    
In [50]:
    
if PARSE_AGP:
    agp_to_parse = [ 
        #f_file, f_type, f_filter, f_enabled, f_info in 
        [AgpSourceScaff , 'Scaffold', 'A', False, col_info_agp_w, col_type_bol], 
        [AgpSourceScaff , 'Scaffold', 'D', False, col_info_agp_w, col_type_bol], 
        [AgpSourceScaff , 'Scaffold', 'F', False, col_info_agp_w, col_type_bol], 
        [AgpSourceScaff , 'Scaffold', 'G', False, col_info_agp_w, col_type_bol], 
        [AgpSourceScaff , 'Scaffold', 'O', False, col_info_agp_w, col_type_bol], 
        [AgpSourceScaff , 'Scaffold', 'P', False, col_info_agp_w, col_type_bol], 
        [AgpSourceScaff , 'Scaffold', 'W', True , col_info_agp_w, col_type_bol], 
        [AgpSourceScaff , 'Scaffold', 'N', False, col_info_agp_n, col_type_bol], 
        [AgpSourceScaff , 'Scaffold', 'U', True , col_info_agp_u, col_type_bol],
        [AgpSourceContig, 'Contig'  , 'A', False, col_info_agp_w, col_type_bol], 
        [AgpSourceContig, 'Contig'  , 'D', False, col_info_agp_w, col_type_bol], 
        [AgpSourceContig, 'Contig'  , 'F', False, col_info_agp_w, col_type_bol], 
        [AgpSourceContig, 'Contig'  , 'G', False, col_info_agp_w, col_type_bol], 
        [AgpSourceContig, 'Contig'  , 'O', False, col_info_agp_w, col_type_bol], 
        [AgpSourceContig, 'Contig'  , 'P', False, col_info_agp_w, col_type_bol], 
        [AgpSourceContig, 'Contig'  , 'W', True , col_info_agp_w, col_type_bol], 
        [AgpSourceContig, 'Contig'  , 'N', True , col_info_agp_n, col_type_bol], 
        [AgpSourceContig, 'Contig'  , 'U', True , col_info_agp_u, col_type_bol]
    ]
    
In [51]:
    
if PARSE_AGP:
    for f_file, f_type, f_filter, f_enabled, f_info, f_dtype in agp_to_parse:  # f_dtype: coverage array dtype for this track
        if not f_enabled:
            continue
        cfg = { 
                'title'            : 'AGP %s :: %s' % (f_filter, f_type),
                'name'             : 'AGP %s %s'    % (f_filter, f_type),
                'enabled'          : f_enabled,
                'filter'           : filter_col,
                'filter_in_file'   : f_file,
                'filter_out_file'  : f_file + '.%s.agp' % f_filter,
                'filter_parameters': [
                    [ 4, [f_filter            ] ],
                    {}
                ],
                'parser'  : parse_CSV,
                'parser_parameters': [
                    [
                        None, 
                        f_info
                    ],
                    {
                        'skiprows': 0, 
                        'nrows'   : NROWS,
                        'verbose' : PARSE_VERBOSE
                    }
                ],
                'converter': to_coverage,
                'converter_parameters': [
                    ["Chromosome", "Start", "End", chromTotalLen, chroms],
                    {'dtype'   : f_dtype}
                ]
        }
        processes.append(cfg)
    
In [52]:
    
if PARSE_GFF:
    f_kwargs0={}
    f_kwargs1={ 'skip':['###']}
    
    p_kwargs0={ 'skiprows': 0, 'nrows': NROWS, 'verbose': PARSE_VERBOSE }
    # p_kwargs1..3 are currently identical; kept separate so individual tracks can be retuned
    p_kwargs1={ 'skiprows': 1, 'nrows': NROWS, 'verbose': PARSE_VERBOSE }
    p_kwargs2={ 'skiprows': 1, 'nrows': NROWS, 'verbose': PARSE_VERBOSE }
    p_kwargs3={ 'skiprows': 1, 'nrows': NROWS, 'verbose': PARSE_VERBOSE }
    c_kwargs0 = {}
    c_kwargs1 = {'dtype': col_type_int16}
    #    f_file                 , f_type      , f_filter       , f_name          , f_enabled, f_info      , p_kwargs , f_kwargs , c_kwargs
    gff_to_parse = [ 
        [cdna_alignments_name   , 'cDNA'      , 'match'        , 'match'         , True     , col_info_gff, p_kwargs3, f_kwargs0, c_kwargs1], 
        [cdna_alignments_name   , 'cDNA'      , 'match_part'   , 'match_part'    , True     , col_info_gff, p_kwargs3, f_kwargs0, c_kwargs1], 
        [gene_models_name       , 'Gene Model', 'gene'         , 'gene'          , True     , col_info_gff, p_kwargs3, f_kwargs1, c_kwargs0], 
        [gene_models_name       , 'Gene Model', 'mRNA'         , 'mRNA'          , True     , col_info_gff, p_kwargs3, f_kwargs1, c_kwargs0], 
        [gene_models_name       , 'Gene Model', 'exon'         , 'exon'          , True     , col_info_gff, p_kwargs3, f_kwargs1, c_kwargs0], 
        [gene_models_name       , 'Gene Model', 'CDS'          , 'CDS'           , True     , col_info_gff, p_kwargs3, f_kwargs1, c_kwargs0], 
        [gene_models_name       , 'Gene Model', 'intron'       , 'intron'        , True     , col_info_gff, p_kwargs3, f_kwargs1, c_kwargs0], 
        [genomic_reagents_name  , 'BAC'       , 'BAC_clone'    , 'BAC_clone'     , True     , col_info_gff, p_kwargs3, f_kwargs0, c_kwargs0], 
        [genomic_reagents_name  , 'BAC'       , 'BAC_end'      , 'BAC_end'       , True     , col_info_gff, p_kwargs3, f_kwargs0, c_kwargs0], 
        [repeats_aggressive_name, 'Repeat'    , 'repeat_region', 'Copia'         , True     , col_info_gff, p_kwargs1, {'contains':['repeat_class=LTR/Copia;'     ], 'skip':['###']}, c_kwargs0], 
        [repeats_aggressive_name, 'Repeat'    , 'repeat_region', 'LTR'           , True     , col_info_gff, p_kwargs1, {'contains':['repeat_class=LTR;'           ], 'skip':['###']}, c_kwargs0],
        [repeats_aggressive_name, 'Repeat'    , 'repeat_region', 'Low_complexity', True     , col_info_gff, p_kwargs1, {'contains':['repeat_class=Low_complexity;'], 'skip':['###']}, c_kwargs0],
        [repeats_aggressive_name, 'Repeat'    , 'repeat_region', 'Gypsy'         , True     , col_info_gff, p_kwargs1, {'contains':['repeat_class=LTR/Gypsy;'     ], 'skip':['###']}, c_kwargs0],
        [repeats_aggressive_name, 'Repeat'    , 'repeat_region', 'Simple_repeat' , True     , col_info_gff, p_kwargs1, {'contains':['repeat_class=Simple_repeat;' ], 'skip':['###']}, c_kwargs0],
        [repeats_aggressive_name, 'Repeat'    , 'repeat_region', 'SINE'          , True     , col_info_gff, p_kwargs1, {'contains':['repeat_class=SINE;'          ], 'skip':['###']}, c_kwargs0],
        [repeats_name           , 'Repeat'    , 'repeat_region', 'Copia'         , False    , col_info_gff, p_kwargs1, {'contains':['repeat_class=LTR/Copia;'     ], 'skip':['###']}, c_kwargs0], 
        [repeats_name           , 'Repeat'    , 'repeat_region', 'LTR'           , False    , col_info_gff, p_kwargs1, {'contains':['repeat_class=LTR;'           ], 'skip':['###']}, c_kwargs0],
        [repeats_name           , 'Repeat'    , 'repeat_region', 'Low_complexity', False    , col_info_gff, p_kwargs1, {'contains':['repeat_class=Low_complexity;'], 'skip':['###']}, c_kwargs0],
        [repeats_name           , 'Repeat'    , 'repeat_region', 'Gypsy'         , False    , col_info_gff, p_kwargs1, {'contains':['repeat_class=LTR/Gypsy;'     ], 'skip':['###']}, c_kwargs0],
        [repeats_name           , 'Repeat'    , 'repeat_region', 'Simple_repeat' , False    , col_info_gff, p_kwargs1, {'contains':['repeat_class=Simple_repeat;' ], 'skip':['###']}, c_kwargs0],
        [repeats_name           , 'Repeat'    , 'repeat_region', 'SINE'          , False    , col_info_gff, p_kwargs1, {'contains':['repeat_class=SINE;'          ], 'skip':['###']}, c_kwargs0],
    ]
    
In [53]:
    
if PARSE_GFF:
    for  f_file                 , f_type      , f_filter       , f_name          , f_enabled, f_info      , p_kwargs , f_kwargs, c_kwargs       in gff_to_parse:
        if not f_enabled:
            continue
        processes.append(
            { 
                'title'            : 'GFF %s :: %s :: %s' % (f_filter, f_type, f_name),
                'name'             : 'GFF %s %s %s'       % (f_filter, f_type, f_name),
                'enabled'          : f_enabled,
                'filter'           : filter_col,
                'filter_in_file'   : f_file,
                'filter_out_file'  : f_file + '.%s.agp' % f_name,
                'filter_parameters': [
                    [ 2, [f_filter            ] ],
                    f_kwargs
                ],
                'parser'           : parse_CSV,
                'parser_parameters': [
                    [
                        None, 
                        f_info
                    ],
                    p_kwargs
                ],
                'converter'           : to_coverage,
                'converter_parameters': [
                    ["Chromosome", "Start", "End", chromTotalLen, chroms],
                    c_kwargs
                ]
            }
        )
    
In [54]:
    
for proc in processes:
    process_process( proc )
print 'done'
    
In [55]:
    
for proc in processes:
    title = proc['title']
    print title
    if not proc['enabled']:
        continue
    data  = proc['cov'  ]
    with size_controller(FULL_FIG_W, FULL_FIG_H):
        plt.plot(data, markevery=10, label=title)
        show()
    
In [56]:
    
#S_lycopersicum_chromosomes.2.40.fa_SL2.40ch12_000065480000_000065486252.fasta SL2.40ch12 92.80 625               42         3            5629   6252 703152 703774 0.0    744
#query                                                                         subject    %id   alignment length  mismatches gap openings qstart qend sstart send   evalue bitscore 
import decimal
def decimal_float(v):
    return float(decimal.Decimal(v))
    
col_info_blast = [
    ["query_name"       , str           ],
    ["subject_name"     , str           ],
    ["perc_identity"    , float         ],
    ["alignment length" , int           ],
    ["mismatches"       , int           ],
    ["gap openings"     , int           ],
    ["qstart"           , int           ],
    ["qend"             , int           ],
    ["sstart"           , int           ],
    ["send"             , int           ],
    ["evalue"           , float         ],
    ["bitscore"         , float         ]
]
col_info_blast_k = dict( zip( [x[0] for x in col_info_blast], range(len(col_info_blast)) ) )
print col_info_blast_k
    
    
In [57]:
    
from IPython.parallel import Client
# requires a running IPython cluster (e.g. started with 'ipcluster start')
pc = Client()
print pc.ids
print pc[:].apply_sync(lambda : "Hello, World")
dview = pc[:]
parallel_result = dview.map_sync(lambda x: x**10, range(32))
print parallel_result
    
    
In [58]:
    
def load_blast_chrom_db(BLAST_VAL_TYPE, BLAST_COL_TYPE, chromTotalLen, chroms, chrom_folder, f_folder):
    import numpy  as np
        
    chrom_coord_count = 0
    chrom_pos_count   = 0
    chrom             = os.path.basename( chrom_folder )
    print '  chrom %s' % chrom
    file_path    = chrom_folder + '.blast'
    file_path_py = file_path + '.' + BLAST_VAL_TYPE + '.cov.npy'
    print '   file %s' % file_path
    chrom_blast_data = None
    if os.path.exists( file_path_py ):
        print "   loading coverage from pickle"
        sys.stdout.flush()
        chrom_blast_data    = np.load( open(file_path_py, 'rb') )
        print "   loaded coverage from pickle"
        print '    chrom  average %6.3f' % ( np.mean(chrom_blast_data ) )
        return 0, 0, chrom_blast_data
    
    else:
        print "   loading coverage from raw"
        sys.stdout.flush()
        sys.stdout.write('   ')
        chrom_blast_data      = np.zeros(chromTotalLen, dtype=BLAST_COL_TYPE)
        chrom_data_controller = coortroller(chrom_blast_data, chroms)
        with open( file_path, 'r' ) as fhd:
            for line in fhd:
                line = line.strip()
                cols = line.split("\t")
                #print cols
                cols = [ col_info_blast[i][1](v) for i,v in enumerate(cols) ]
                #print cols
                fields = dict(zip( col_info_blast_k.keys(), [ cols[v] for k,v in col_info_blast_k.items() ]))
                #print keys
                start               = min([fields['sstart'], fields['send']])
                end                 = max([fields['sstart'], fields['send']])
                chrom_coord_count  += 1
                chrom_pos_count    += end - start
                if chrom_coord_count % 1000000 == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()
                #print "Start:'" + str(start) + "' End: '" + str(end) + "'"
                if BLAST_VAL_TYPE == 'COUNT':
                    value = 1
                elif BLAST_VAL_TYPE == 'SYM':
                    value = fields['perc_identity'] / 100.0
                chrom_data_controller.add( fields['subject_name'], start, end, value )
                #break
        print
        print "   loaded coverage from raw"
        print "   saving coverage to pickle"
        sys.stdout.flush()
        np.save( open(file_path_py, 'wb'), chrom_blast_data )
        print '   chromosome %s added %10d coordinates in a total of %10d positions. average %6.3f' % ( chrom, chrom_coord_count, chrom_pos_count, np.mean(chrom_blast_data) )
        
        return chrom_coord_count, chrom_pos_count, chrom_blast_data
pc[:]['load_blast_chrom_db'] = load_blast_chrom_db
pc[:]['coortroller'        ] = coortroller
with pc[:].sync_imports():
    import os
    import sys
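# load_blast_chrom_db and its coortroller dependency are pushed to every engine above;
# sync_imports makes os/sys available both locally and on the engines.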
#dview.push(load_blast_chrom_db)
@dview.parallel(block=True)
def p_load_blast_chrom_db(params):
    return load_blast_chrom_db(*params)
    
    
In [59]:
    
def load_blast_db(f_method, f_fasta, f_folder, BLAST_VAL_TYPE):
    if   BLAST_VAL_TYPE == 'COUNT':
        BLAST_COL_TYPE = col_type_int32
        
    elif BLAST_VAL_TYPE == 'SYM':
        BLAST_COL_TYPE = col_type_flo32
    
    else:
        print "unknown blast val type %s" % BLAST_VAL_TYPE
        raise ValueError("unknown blast val type %s" % BLAST_VAL_TYPE)
    
    
    
    print "blast method %s folder %s" % ( f_method, f_folder )
    f_folder_py = f_folder + '.' + BLAST_VAL_TYPE + '.cov.npy'
    if os.path.exists(f_folder_py):
        print " getting blast method from pickle"
        sys.stdout.flush()
        method_blast_data  = np.load( open(f_folder_py, 'rb') )
        # the caller appends the returned array to blast_to_parse, so no append is needed here
        print " got blast method from pickle"
        print ' method average %6.3f' % ( np.mean(method_blast_data) )
        return 0, 0, method_blast_data
    else:
        print " getting blast method from raw"
        method_blast_data  = np.zeros(chromTotalLen, dtype=BLAST_COL_TYPE)
        method_coord_count = 0
        method_pos_count   = 0
        cov                = None
        chroms_to_process  = []
        for chrom_folder in sorted(glob(os.path.join(os.path.abspath(f_folder), '*'))):
            if not os.path.isdir( chrom_folder ):
                continue
            chroms_to_process.append( [BLAST_VAL_TYPE, BLAST_COL_TYPE, chromTotalLen, chroms, chrom_folder, f_folder] )
            
            chrom_coord_count, chrom_pos_count, chrom_blast_data = load_blast_chrom_db(BLAST_VAL_TYPE, BLAST_COL_TYPE, chromTotalLen, chroms, chrom_folder, f_folder)
            print
            method_coord_count += chrom_coord_count
            method_pos_count   += chrom_pos_count
            method_blast_data  += chrom_blast_data
            print '    method average %6.3f' % ( np.mean(method_blast_data) )
            #break
        
        #print chroms_to_process
        sys.stdout.flush()
        
        #chrom_res = p_load_blast_chrom_db.map(chroms_to_process)
        #print chrom_res
        #sys.stdout.flush()
        #raise
        
        print '  method %s added %10d coordinates in a total of %10d positions. average %6.3f' % ( f_method, method_coord_count, method_pos_count, np.mean(method_blast_data) )
        np.save( open(f_folder_py, 'wb'), method_blast_data )
        
        return method_coord_count, method_pos_count, method_blast_data
    
In [60]:
    
if PARSE_BLAST:
    if1    = "probes/blast/out" #/SL2.40ch12/S_lycopersicum_chromosomes.2.40.fa_SL2.40ch12_000065480000_000065486252.fasta.blast"
    fasta  = 'S_lycopersicum_chromosomes.2.40.fa'
    #
    blast_to_parse = [ 
        #f_method  , f_enabled, f_fasta, f_folder
        ['standard', True     , fasta  , if1], 
    ]
    print "%d blast experiments" % len(blast_to_parse)
    
    
In [61]:
    
import fnmatch
import os
from glob import glob
if PARSE_BLAST:
    total_coord_count = 0
    total_pos_count   = 0
    
    BLAST_VAL_TYPE    = 'COUNT'
    BLAST_VAL_TYPE    = 'SYM'
    
    for btp in xrange(len(blast_to_parse)):
        f_method, f_enabled, f_fasta, f_folder = blast_to_parse[btp]
        
        if not f_enabled:
            continue
        
        method_coord_count, method_pos_count, method_blast_data = load_blast_db(f_method, f_fasta, f_folder, BLAST_VAL_TYPE)
        total_coord_count += method_coord_count
        total_pos_count   += method_pos_count
        blast_to_parse[btp].append(method_blast_data)
            
    print 'total added %10d coordinates in a total of %10d positions' % ( total_coord_count, total_pos_count )
    
    
In [62]:
    
with size_controller(FULL_FIG_W, FULL_FIG_H):
    for btp in blast_to_parse:
        plt.plot(btp[-1], markevery=10)
    
    
In [63]:
    
addHeader(2,'Coverage')
    
    
In [ ]:
    
DO_COVERAGE = True
if DO_COVERAGE:
    COVERAGE_COL_TYPE = col_type_int32
    sam_coverage_data = None
    SamFile_py       = SamFile + '.cov.npy'
    if os.path.exists(SamFile_py):
        print " getting sam coverage from pickle"
        sys.stdout.flush()
        sam_coverage_data  = np.load( open(SamFile_py, 'rb') )
        print " got sam coverage from pickle"
        print ' sam coverage average %6.3f' % ( np.mean(sam_coverage_data) )
    else:
        print " getting sam coverage from raw"
        sys.stdout.flush()
        sam_coverage_data            = np.zeros(chromTotalLen, dtype=COVERAGE_COL_TYPE)
        sam_coverage_data_controller = coortroller(sam_coverage_data, chroms)
        sam_coverage_coord_count     = 0
        sam_coverage_pos_count       = 0
    
        with open_file(SamFile, 'r') as fhd:
            lc = 0
            for line in fhd:
                line = line.strip()
                if len(line) == 0:
                    continue
                if line[0] == '@':
                    continue
                #print line
                cols = line.split('\t')
                if len(cols) < 11:
                    print cols
                    continue
                if cols[2] == '*':
                    continue
                #print cols
                chrom  =     cols[2]
                start  = int(cols[3])
                seq    =     cols[9]
                seqLen = len(cols[9])
                end    = start + seqLen  # approximate span: read length, CIGAR not parsed
                #print "chrom %s start %12d end %12d seq %s len %4d" % (chrom, start, end, seq, seqLen)
                
                sam_coverage_coord_count  += 1
                sam_coverage_pos_count    += end - start
                if sam_coverage_coord_count % 1000000 == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()
                sam_coverage_data_controller.add( chrom, start, end, 1 )
                
        print '    sam coverage average %6.3f' % ( np.mean(sam_coverage_data) )
        np.save( open(SamFile_py, 'wb'), sam_coverage_data )
    
In [ ]:
    
with size_controller(FULL_FIG_W, FULL_FIG_H):
    plt.plot(sam_coverage_data, markevery=10)
    
In [ ]:
    
# scratch: prototype for pairing chromosome labels with per-base positions (cf. CovChroms/CovPos below)
rep = np.repeat(['00', '01', '02'], [1, 2, 3])
print rep
ran = np.concatenate([ np.arange(1, x+1) for x in (1,2,3)])
print ran
con = np.concatenate([rep, ran])
print con
res = con.reshape(2,6)
print res
tra = np.transpose(res)
print tra
    
In [ ]:
    
# OrderedDict([('SL2.40ch00', 21805821)
#CovChroms = np.repeat([x for x in chroms], [chroms[x] for x in chroms])
#print CovChroms
    
In [ ]:
    
#CovPos = np.concatenate([ np.arange(1, x+1) for x in [chroms[x] for x in chroms]])
#print CovPos
#con = np.concatenate([rep, ran])
#print con
#res = con.reshape(2,6)
#print res
#tra = np.transpose(res)
#print tra
    
In [ ]:
    
#del AgpCov
#AgpCov = pd.DataFrame(OrderedDict([ ('AgpW', AgpCovW), ('AgpU', AgpCovU) ]), dtype='int8')
#AgpCov.head()
    
In [ ]:
    
#AgpCov.dtypes
    
In [ ]:
    
#AgpCov.shape
    
In [ ]:
    
#AgpCov.memory_usage(index=True)
    
In [ ]:
    
#AgpCov.memory_usage(index=True).sum()
    
In [ ]:
    
#AgpCov.index
    
In [ ]:
    
addHeader(2,'K-mer Coverage File')
col_info = [
    [ "Position"                             , col_type_int ], 
    [ "K-mer Coverage"                       , col_type_flo ], 
    [ "K-mer Coverage averaged: 500 bp"      , col_type_flo ], 
    [ "K-mer Coverage averaged: 2.5 Kbp"     , col_type_flo ], 
    [ "K-mer Coverage averaged: 5 Kbp"       , col_type_flo ], 
    [ "K-mer Coverage averaged: 50 Kbp"      , col_type_flo ], 
    [ "K-mer Coverage averaged: 1 Mbp"       , col_type_flo ], 
    [ "K-mer Coverage averaged: 5 Kbp before", col_type_flo ], 
    [ "K-mer Coverage averaged: 5 Kbp after" , col_type_flo ]
]
col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))
if PARSE_VERBOSE:
    print "\n".join( col_names )
    
In [ ]:
    
SKIP_ROWS = 1
print KmerCoverageFile
KmerData  = pd.read_csv(KmerCoverageFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#")
    
print "Loaded %d rows and %d columns" % ( KmerData.shape[0], KmerData.shape[1] )
if PARSE_VERBOSE:
    print KmerData.head()
    
In [ ]:
    
addHeader(2,'Sequencing Coverage File')
col_info = [
    [ "Position"                             , col_type_int ], 
    [ "Sequencing Coverage"                  , col_type_int ]
]
col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))
if PARSE_VERBOSE:
    print col_names
    print col_types
    
In [ ]:
    
SKIP_ROWS = 0
print SequencingCoverageFile
SequencingCoverageData  = pd.read_csv(SequencingCoverageFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#")
print "Loaded %d rows and %d columns" % ( SequencingCoverageData.shape[0], SequencingCoverageData.shape[1] )
if PARSE_VERBOSE:
    print SequencingCoverageData.head()
    
In [ ]:
    
if BAC_MODE:
    addHeader(2,'BLAST')
    col_info = [
        [ "Position"                             , col_type_int ], 
        [ "BLAST Coverage"                       , col_type_flo ]
    ]
    col_names=[cf[0] for cf in col_info]
    col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))
    if PARSE_VERBOSE:
        print col_names
        print col_types
    
In [ ]:
    
if BAC_MODE:
    SKIP_ROWS = 0
    
    print BlastCoverageFile
    
    BlastCoverageData  = pd.read_csv(BlastCoverageFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                            skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#")
        
    print "Loaded %d rows and %d columns" % ( BlastCoverageData.shape[0], BlastCoverageData.shape[1] )
    
    if PARSE_VERBOSE:
        print BlastCoverageData.head()
    
In [ ]:
    
addHeader(2,'AGP')
SKIP_ROWS = 0
    
In [ ]:
    
addHeader(3,'Contig')
col_info = [
    [ "Position"                    , col_type_int ], 
    [ "AGP Contig"                  , col_type_bol ]
]
col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))
if PARSE_VERBOSE:
    print col_names
    print col_types
    
In [ ]:
    
print AgpContigFile
AgpContigData  = pd.read_csv(AgpContigFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#")
print "Loaded %d rows and %d columns" % ( AgpContigData.shape[0], AgpContigData.shape[1] )
if PARSE_VERBOSE:
    print AgpContigData.head()
    
In [ ]:
    
addHeader(3,'Gap')
col_info = [
    [ "Position"                    , col_type_int ], 
    [ "AGP Gap"                     , col_type_bol ]
]
col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))
if PARSE_VERBOSE:
    print col_names
    print col_types
    
In [ ]:
    
print AgpGapFile
AgpGapData  = pd.read_csv(AgpGapFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#")
    
print "Loaded %d rows and %d columns" % ( AgpGapData.shape[0], AgpGapData.shape[1] )
if PARSE_VERBOSE:
    print AgpGapData.head()
    
In [ ]:
    
addHeader(3,'Unknown')
col_info = [
    [ "Position"                    , col_type_int ], 
    [ "AGP Unknown"                 , col_type_bol ]
]
col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))
if PARSE_VERBOSE:
    print col_names
    print col_types
    
In [ ]:
    
print AgpUnknownFile
AgpUnknownData  = pd.read_csv(AgpUnknownFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#")
    
print "Loaded %d rows and %d columns" % ( AgpUnknownData.shape[0], AgpUnknownData.shape[1] )
if PARSE_VERBOSE:
    print AgpUnknownData.head()
    
In [ ]:
    
addHeader(3,'Other')
col_info = [
    [ "Position"                    , col_type_int ], 
    [ "AGP Other"                   , col_type_bol ]
]
col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))
if PARSE_VERBOSE:
    print col_names
    print col_types
    
In [ ]:
    
print AgpOtherFile
AgpOtherData  = pd.read_csv(AgpOtherFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#")
print "Loaded %d rows and %d columns" % ( AgpOtherData.shape[0], AgpOtherData.shape[1] )
if PARSE_VERBOSE:
    print AgpOtherData.head()
    
In [ ]:
    
addHeader(2,'Ns')
col_info = [
    [ "Position"                    , col_type_int ], 
    [ "Ns"                          , col_type_bol ]
]
col_names=[cf[0] for cf in col_info]
col_types=dict(zip([c[0] for c in col_info], [c[1] for c in col_info]))
if PARSE_VERBOSE:
    print col_names
    print col_types
    
In [ ]:
    
print NsFile
NsData  = pd.read_csv(NsFile, header=None, names=col_names, dtype=col_types, nrows=NROWS, \
                        skiprows=SKIP_ROWS, verbose=PARSE_VERBOSE, delimiter="\t", comment="#")
    
print "Loaded %d rows and %d columns" % ( NsData.shape[0], NsData.shape[1] )
if PARSE_VERBOSE:
    print NsData.head()
    
In [ ]:
    
addHeader(1,'Merge')
Data = pd.DataFrame(KmerData, copy=True)
Data[ "AGP Contig"          ] = AgpContigData[          "AGP Contig"          ]
Data[ "AGP Gap"             ] = AgpGapData[             "AGP Gap"             ]
Data[ "AGP Unknown"         ] = AgpUnknownData[         "AGP Unknown"         ]
Data[ "AGP Other"           ] = AgpOtherData[           "AGP Other"           ]
Data[ "Ns"                  ] = NsData[                 "Ns"                  ]
Data[ "Sequencing Coverage" ] = SequencingCoverageData[ "Sequencing Coverage" ]
if BAC_MODE:
    Data[ "BLAST Coverage"  ] = BlastCoverageData[      "BLAST Coverage"      ]
print "Saved %d rows and %d columns" % ( Data.shape[0], Data.shape[1] )
if PARSE_VERBOSE:
    print Data.head()
    
In [ ]:
    
addHeader(1,'Plot')
addHeader(2,'K-mer Coverage Stats')
with size_controller(FULL_FIG_W, FULL_FIG_H):
    bqc = Data.boxplot(column=["K-mer Coverage", 
                               "K-mer Coverage averaged: 500 bp",
                               "K-mer Coverage averaged: 2.5 Kbp",
                               "K-mer Coverage averaged: 5 Kbp",
                               "K-mer Coverage averaged: 50 Kbp",
                               "K-mer Coverage averaged: 1 Mbp",
                               "K-mer Coverage averaged: 5 Kbp before",
                               "K-mer Coverage averaged: 5 Kbp after"], return_type='dict', rot=45)
    
    plt.setp(bqc['boxes'   ], color='black')
    plt.setp(bqc['medians' ], color='black')
    plt.setp(bqc['whiskers'], color='black')
    plt.setp(bqc['fliers'  ], color='black')
    
    for of in output_files['K-mer Coverage Stats']:
        print "Saving Image:", of
        savefig(of)
    
In [ ]:
    
addHeader(2,'Sequencing Coverage Stats')
with size_controller(FULL_FIG_W, FULL_FIG_H):
    bqc = Data.boxplot(column=['Sequencing Coverage'], return_type='dict')
    
    plt.setp(bqc['boxes'   ], color='black')
    plt.setp(bqc['medians' ], color='black')
    plt.setp(bqc['whiskers'], color='black')
    plt.setp(bqc['fliers'  ], color='black')
    
    for of in output_files['Sequencing Coverage Stats']:
        print "Saving Image:", of
        savefig(of)
    
In [ ]:
    
if BAC_MODE:
    addHeader(2,'BLAST Coverage Stats')
    
    with size_controller(FULL_FIG_W, FULL_FIG_H):
        bqc = Data.boxplot(column=['BLAST Coverage'], return_type='dict')
        plt.setp(bqc['boxes'   ], color='black')
        plt.setp(bqc['medians' ], color='black')
        plt.setp(bqc['whiskers'], color='black')
        plt.setp(bqc['fliers'  ], color='black')
        for of in output_files['BLAST Coverage Stats']:
            print "Saving Image:", of
            savefig(of)
    
In [ ]:
    
addHeader(2,'K-mer Coverage Distribution')
with size_controller(FULL_FIG_W, FULL_FIG_H):
    hf = Data[ Data["K-mer Coverage"] > 0 ]['Position']
    
    print "Number of rows:", hf.size
    if hf.size > 0:
        hs = hf.hist(color=HISTOGRAM_COLOR)
        for of in output_files['K-mer Coverage Distribution']:
            print "Saving Image:", of
            savefig(of)
    
In [ ]:
    
addHeader(2,'Sequencing Coverage Distribution')
with size_controller(FULL_FIG_W, FULL_FIG_H):
    hf = Data[ Data["Sequencing Coverage"] > 0 ]['Position']
    
    print "Number of rows:", hf.size
    if hf.size > 0:
        hs = hf.hist(color=HISTOGRAM_COLOR)
        for of in output_files['Sequencing Coverage Distribution']:
            print "Saving Image:", of
            savefig(of)
    
In [ ]:
    
if BAC_MODE:
    addHeader(2,'BLAST Coverage Distribution')
    
    with size_controller(FULL_FIG_W, FULL_FIG_H):
        hf = Data[ Data["BLAST Coverage"] > 0 ]['Position']
    
        print "Number of rows:", hf.size
        if hf.size > 0:
            hs = hf.hist(color=HISTOGRAM_COLOR)
            
            for of in output_files['BLAST Coverage Distribution']:
                print "Saving Image:", of
                savefig(of)
    
In [ ]:
    
addHeader(2,'Gaps Distribution')
with size_controller(FULL_FIG_W, FULL_FIG_H):
    hf = Data[ Data["AGP Gap"] > 0 ]['Position']
    
    print "Number of rows:", hf.size
    if hf.size > 0:
        hs = hf.hist(color=HISTOGRAM_COLOR)
        
        for of in output_files['Gaps Distribution']:
            print "Saving Image:", of
            savefig(of)
    
In [ ]:
    
addHeader(2,'Ns Distribution')
with size_controller(FULL_FIG_W, FULL_FIG_H):
    hf = Data[ Data["Ns"] > 0 ]['Position']
    
    print "Number of rows:", hf.size
    if hf.size > 0:
        hs = hf.hist(color=HISTOGRAM_COLOR)
        
        for of in output_files['Ns Distribution']:
            print "Saving Image:", of
            savefig(of)
    
In [ ]:
    
addHeader(2,'CSV Output')
SAMPLE_EVERY = 1
while ( Data.shape[0] / SAMPLE_EVERY ) > MAX_ROWS:
    SAMPLE_EVERY += 1
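# e.g. (numbers hypothetical) 5,000,000 rows with MAX_ROWS = 2,000,000 leaves
# SAMPLE_EVERY = 3, i.e. every 3rd row (~1.67M rows) is kept.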
    
print "Original Size %d rows and %d columns" % ( Data.shape[0], Data.shape[1] )
if SAMPLE_EVERY != 1:
    print "SAMPLING EVERY %d ROWS" % SAMPLE_EVERY
    
    for of in output_files['all_data_full']:
        print "Saving full data to: ", of
        Data.to_csv(of, sep='\t', index=False)
    
    DataSampled = Data[::SAMPLE_EVERY]
    print "New Size %d rows and %d columns" % ( DataSampled.shape[0], DataSampled.shape[1] )
    
    if PARSE_VERBOSE:
        print DataSampled.head()
else:
    print "no need to sample"
    DataSampled = Data
for of in output_files['all_data']:
    print "Saving data to     :", of
    DataSampled.to_csv(of, sep='\t', index=False)
    
In [ ]:
    
addHeader(2,'Combined Graph')
num_cols = len(cols_to_plot)
#print "\n".join( [ str( (x, y[0]) ) if y is not None else str( (x, "None") ) for x,y in enumerate(cols_to_plot) ] )
with size_controller(CHROM_FIG_W, CHROM_FIG_H):
    if True:
        #fig, axes = plt.subplots(nrows=len(cols_to_plot)+2, ncols=1)
        f = plt.figure()
        
        plt.subplots_adjust(wspace=0.5, hspace=0.5)
        num_extra_rows = 3
        num_row_span   = 3
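        # Layout: the first two tracks (three in BAC mode) get tall panels spanning
        # num_row_span grid rows each; every remaining column gets a single-row panel.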
        
        num_cols_e     = num_cols + (num_extra_rows*num_row_span)
        axes           = []
        axes.append( plt.subplot2grid((num_cols_e,1), (0           , 0), rowspan=num_row_span) )
        axes.append( plt.subplot2grid((num_cols_e,1), (num_row_span, 0), rowspan=num_row_span) )
        
        #print "num_extra_rows", num_extra_rows
        #print "num_row_span  ", num_row_span
        #print "num_cols_e    ", num_cols_e
        
        if BAC_MODE:
            axes.append( plt.subplot2grid((num_cols_e,1), (num_row_span*2, 0), rowspan=num_row_span) )
        else:
            axes.append( None )
            
        for i in xrange ( num_cols - num_extra_rows ):
            #print 'i', i, 'y', (num_extra_rows*num_row_span)+i
            axes.append( plt.subplot2grid((num_cols_e,1), ((num_extra_rows*num_row_span)+i, 0) ) )
        #print "num_cols", num_cols
        #print "axes"    , len(axes)
        for axis_i, axis in enumerate(axes):
            #print "For axis", axis_i, "Plotting col", col_to_plot_i,
            
            if axis is None:
                #print "skip"
                continue
                
            #print "ok"
            #for col_to_plot_i, col_to_plot_info in enumerate(cols_to_plot):
            col_to_plot, col_ylim, col_yticks = cols_to_plot[axis_i]
            
            p = DataSampled[col_to_plot].plot(ax=axis, kind='area', stacked=False, color=ALL_GRAPH_COLOR)
            
            axis.set_title(col_to_plot)
            
            if col_ylim is not None:
                col_ylim_min, col_ylim_max = col_ylim
                
                if col_ylim_min is not None:
                    axis.set_ylim( bottom = col_ylim_min )
                
                if col_ylim_max is not None:
                    axis.set_ylim( top    = col_ylim_max )
            
            
            if col_yticks is not None:
                if col_yticks == 0:
                    axis.set_yticks([])
                    
                else:
                    ylim_min, ylim_max = axis.get_ylim()
                    ylim_diff = ylim_max - ylim_min
                    ylim_step = ylim_diff / (col_yticks*1.0)
                    #print col_to_plot, ylim_min, ylim_max, ylim_diff, ylim_step
                    axis.set_yticks(np.arange(ylim_min,ylim_max+ylim_step,ylim_step))
            
    plt.tight_layout()
    curr_fig = plt.gcf()
    #curr_fig.set_size_inches(CHROM_FIG_W/5.0, CHROM_FIG_H/5.0)
    for of in output_files['Combined graph']:
        print "Saving Image:", of
        curr_fig.savefig(of, dpi=300)
    #curr_fig.set_size_inches(CHROM_FIG_W, CHROM_FIG_H)
    
In [ ]:
    
if CONFIG_LOCAL:
    %run -i probes_cfg_footer.ipynb