In [1]:
import biokbase.data_api
import biokbase.data_api.genome_annotation
import biokbase.data_api.display

import bokeh.charts
import bokeh.plotting
import bokeh.io
bokeh.io.output_notebook(hide_banner=True)

import ssl

import numpy as np

import jinja2
import IPython.display

import pprint
import datetime


/Users/marcin/Documents/KBase/modules/data_api/venv6/lib/python2.7/site-packages/IPython/html.py:14: ShimWarning: The `IPython.html` package has been deprecated. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.
  "`IPython.html.widgets` has moved to `ipywidgets`.", ShimWarning)

In [2]:
def get_genome_summary(genome_annotation=None):
    if genome_annotation == None:
        raise TypeError("No GenomeAnnotation object given.")
    elif genome_annotation.get_typestring().split('-')[0] not in biokbase.data_api.genome_annotation.TYPES:
        raise TypeError("{0} is not a recognized GenomeAnnotation type.".format(type(genome_annotation)))
        
    taxon = genome_annotation.get_taxon()
    
    print "genome_annotation.get_taxon"
    assembly = genome_annotation.get_assembly()
    print "genome_annotation.get_assembly"
    
    overview = dict()
    
    #get tax info
    taxon_details = dict()
    taxon_details["taxonomic_id"] = taxon.get_taxonomic_id()
    print "taxon.get_taxonomic_id"
    taxon_details["kingdom"] = taxon.get_kingdom()
    print "taxon.get_kingdom"
    taxon_details["domain"] = taxon.get_domain()
    print "taxon.get_domain"
    taxon_details["genetic_code"] = taxon.get_genetic_code()
    print "taxon.get_genetic_code"
    taxon_details["scientific_name"] = taxon.get_scientific_name()
    print "taxon.get_scientific_name"
    taxon_details["aliases"] = taxon.get_aliases()
    print "taxon.get_aliases"
    taxon_details["scientific_lineage"] = taxon.get_scientific_lineage()
    print "taxon.get_scientific_lineage"

    overview["taxon"] = taxon_details
    
    #get assembly info
    assembly_details = dict()
    assembly_details["number_of_contigs"] = assembly.get_number_contigs()
    print "assembly.get_number_contigs"
    assembly_details["total_length"] = assembly.get_dna_size()
    print "assembly.get_dna_size"
    assembly_details["total_gc_content"] = assembly.get_gc_content()
    print "assembly.get_gc_content"
    assembly_details["contig_length"] = assembly.get_contig_lengths()
    print "assembly.get_contig_lengths"
    assembly_details["contig_gc_content"] = assembly.get_contig_gc_content()
    print "assembly.get_contig_gc_content"
    
    overview["assembly"] = assembly_details
    
    #get annotation info
    annotation_details = dict()
    annotation_details["feature_types"] = genome_annotation.get_feature_types()
    print "genome_annotation.get_feature_types"
    annotation_details["feature_type_descriptions"] = genome_annotation.get_feature_type_descriptions(annotation_details["feature_types"])
    print "genome_annotation.get_feature_type_descriptions"
    annotation_details["feature_type_counts"] = genome_annotation.get_feature_type_counts(annotation_details["feature_types"])
    print "genome_annotation.get_feature_type_counts"
    
    overview["annotation"] = annotation_details
    
    return overview

In [3]:
b = biokbase.data_api.browse(1011)

object_list = b.ls()

test = b["kb|g.3157"]

annotations = {test.name: test.object}

start = datetime.datetime.utcnow()
for n in annotations:
    print n

    overview = get_genome_summary(annotations[n])
    
    contig_lengths = overview["assembly"]["contig_length"].values()
    bins = 20

    p2 = bokeh.charts.Histogram(contig_lengths,
                                bins=bins,
                                density=False,
                                title="{0} Contig Lengths from {1} to {2}".format(len(contig_lengths), min(contig_lengths),max(contig_lengths)),
                                xlabel="Sequence length (bp)",
                                ylabel="Count",
                                palette=bokeh.palettes.Greens3,
                                width=500,
                                height=400)

    contig_gc_values = [x*100.0 for x in overview["assembly"]["contig_gc_content"].values()]

    p3 = bokeh.charts.Line(contig_gc_values,
                           title="Contig GC % from {0:.2f} to {1:.2f}".format(min(contig_gc_values),max(contig_gc_values)),
                           xlabel="Contigs",
                           ylabel="GC %",
                           palette=bokeh.palettes.Reds3,
                           width=500,
                           height=400)
    bokeh.io.show(bokeh.io.hplot(p2,p3))

    
end = datetime.datetime.utcnow()

print "Total time : {0}".format(end - start)


/Users/marcin/Documents/KBase/modules/data_api/venv6/lib/python2.7/site-packages/requests/packages/urllib3/util/ssl_.py:90: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning
/Users/marcin/Documents/KBase/modules/data_api/venv6/lib/python2.7/site-packages/requests/packages/urllib3/util/ssl_.py:90: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning
/Users/marcin/Documents/KBase/modules/data_api/venv6/lib/python2.7/site-packages/requests/packages/urllib3/util/ssl_.py:90: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning
/Users/marcin/Documents/KBase/modules/data_api/venv6/lib/python2.7/site-packages/requests/packages/urllib3/util/ssl_.py:90: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning
kb|g.3157
genome_annotation.get_taxon
/Users/marcin/Documents/KBase/modules/data_api/venv6/lib/python2.7/site-packages/requests/packages/urllib3/util/ssl_.py:90: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning
/Users/marcin/Documents/KBase/modules/data_api/venv6/lib/python2.7/site-packages/requests/packages/urllib3/util/ssl_.py:90: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning
/Users/marcin/Documents/KBase/modules/data_api/venv6/lib/python2.7/site-packages/requests/packages/urllib3/util/ssl_.py:90: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning
/Users/marcin/Documents/KBase/modules/data_api/venv6/lib/python2.7/site-packages/requests/packages/urllib3/util/ssl_.py:90: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning
genome_annotation.get_assembly
taxon.get_taxonomic_id
/Users/marcin/Documents/KBase/modules/data_api/venv6/lib/python2.7/site-packages/requests/packages/urllib3/util/ssl_.py:90: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning
/Users/marcin/Documents/KBase/modules/data_api/venv6/lib/python2.7/site-packages/requests/packages/urllib3/util/ssl_.py:90: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning
taxon.get_kingdom
taxon.get_domain
taxon.get_genetic_code
/Users/marcin/Documents/KBase/modules/data_api/venv6/lib/python2.7/site-packages/requests/packages/urllib3/util/ssl_.py:90: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning
/Users/marcin/Documents/KBase/modules/data_api/venv6/lib/python2.7/site-packages/requests/packages/urllib3/util/ssl_.py:90: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning
taxon.get_scientific_name
taxon.get_aliases
taxon.get_scientific_lineage
/Users/marcin/Documents/KBase/modules/data_api/venv6/lib/python2.7/site-packages/requests/packages/urllib3/util/ssl_.py:90: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning
/Users/marcin/Documents/KBase/modules/data_api/venv6/lib/python2.7/site-packages/requests/packages/urllib3/util/ssl_.py:90: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning
assembly.get_number_contigs
assembly.get_dna_size
assembly.get_gc_content
assembly.get_contig_lengths
assembly.get_contig_gc_content
genome_annotation.get_feature_types
/Users/marcin/Documents/KBase/modules/data_api/venv6/lib/python2.7/site-packages/requests/packages/urllib3/util/ssl_.py:90: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning
/Users/marcin/Documents/KBase/modules/data_api/venv6/lib/python2.7/site-packages/requests/packages/urllib3/util/ssl_.py:90: InsecurePlatformWarning: A true SSLContext object is not available. This prevents urllib3 from configuring SSL appropriately and may cause certain SSL connections to fail. For more information, see https://urllib3.readthedocs.org/en/latest/security.html#insecureplatformwarning.
  InsecurePlatformWarning
genome_annotation.get_feature_type_descriptions
genome_annotation.get_feature_type_counts
ERROR:/Users/marcin/Documents/KBase/modules/data_api/venv6/lib/python2.7/site-packages/bokeh/validation/check.pyc:E-1000 (COLUMN_LENGTHS): ColumnDataSource column lengths are not all the same: ColumnDataSource, ViewModel:ColumnDataSource, ref _id: 88f5d22b-7d7e-4e84-9600-4a092e1599a7

Total time : 0:01:57.167610

In [4]:
contig_gc_values = [x*100.0 for x in overview["assembly"]["contig_gc_content"].values()]

p3 = bokeh.charts.Line(contig_gc_values,
                           title="Contig GC % from {0:.2f} to {1:.2f}".format(min(contig_gc_values),max(contig_gc_values)),
                           xlabel="Contigs",
                           ylabel="GC %",
                           palette=bokeh.palettes.Reds3,
                           width=500,
                           height=400)
bokeh.io.show(p3)



In [5]:
from bokeh.plotting import figure, show, output_file

p = figure(title="Contig GC% by contig index")

p.scatter(np.arange(1,len(contig_gc_values)), contig_gc_values, marker="circle",
            line_color="#6666ee", fill_color="#ee6666", fill_alpha=0.5, size=12)

show(p)


ERROR:/Users/marcin/Documents/KBase/modules/data_api/venv6/lib/python2.7/site-packages/bokeh/validation/check.pyc:E-1000 (COLUMN_LENGTHS): ColumnDataSource column lengths are not all the same: ColumnDataSource, ViewModel:ColumnDataSource, ref _id: 0cf5534e-235a-44fe-ba45-790ff4c24cec

In [6]:
from bokeh.plotting import figure, show, output_file

p = figure(title="Cumulative GC% distribution")

p.scatter(np.cumsum(np.sort(contig_gc_values))/np.sum(contig_gc_values), np.sort(contig_gc_values), marker="circle",
            line_color="#6666ee", fill_color="#ee6666", fill_alpha=0.5, size=7)

show(p)



In [7]:
from bokeh.plotting import figure, show, output_file

p = figure(title="Contig GC% by contig length")

p.scatter(overview["assembly"]["contig_length"].values(), contig_gc_values, marker="circle",
            line_color="#6666ee", fill_color="#ee6666", fill_alpha=0.5, size=12)

show(p)



In [8]:
from bokeh.plotting import figure, show, output_file

p = figure(title="Contig GC% by log10 contig length")

p.scatter(np.log10(overview["assembly"]["contig_length"].values()), contig_gc_values, marker="circle",
            line_color="#6666ee", fill_color="#ee6666", fill_alpha=0.5, size=12)

show(p)



In [ ]: