In [1]:
import biokbase.data_api
import biokbase.data_api.genome_annotation
import biokbase.data_api.display

import bokeh.charts
import bokeh.plotting
import bokeh.io
bokeh.io.output_notebook(hide_banner=True)

import ssl

import numpy as np

import jinja2
import IPython.display

import pprint
import datetime


/Users/marcin/Documents/KBase/modules/data_api/venv6/lib/python2.7/site-packages/IPython/html.py:14: ShimWarning: The `IPython.html` package has been deprecated. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.
  "`IPython.html.widgets` has moved to `ipywidgets`.", ShimWarning)

In [11]:
def get_genome_summary(genome_annotation=None):
    if genome_annotation == None:
        raise TypeError("No GenomeAnnotation object given.")
    elif genome_annotation.get_typestring().split('-')[0] not in biokbase.data_api.genome_annotation.TYPES:
        raise TypeError("{0} is not a recognized GenomeAnnotation type.".format(type(genome_annotation)))
        
    taxon = genome_annotation.get_taxon()
    
    print "genome_annotation.get_taxon"
    assembly = genome_annotation.get_assembly()
    print "genome_annotation.get_assembly"
    
    overview = dict()
    
    #get assembly info
    assembly_details = dict()
    assembly_details["number_of_contigs"] = assembly.get_number_contigs()
    print "assembly.get_number_contigs"
    assembly_details["contig_length"] = assembly.get_contig_lengths()
    print "assembly.get_contig_lengths"
    assembly_details["contig_gc_content"] = assembly.get_contig_gc_content()
    print "assembly.get_contig_gc_content"
    
    overview["assembly"] = assembly_details
    
    #get annotation info
    annotation_details = dict()
    annotation_details["feature_types"] = genome_annotation.get_feature_types()
    print "genome_annotation.get_feature_types"
    annotation_details["feature_type_descriptions"] = genome_annotation.get_feature_type_descriptions(annotation_details["feature_types"])
    print "genome_annotation.get_feature_type_descriptions"
    annotation_details["feature_type_counts"] = genome_annotation.get_feature_type_counts(annotation_details["feature_types"])
    print "genome_annotation.get_feature_type_counts"
    
    overview["annotation"] = annotation_details
    
    return overview

In [112]:
def plot_genome_quality(contig_gc_values=None,contig_len_values=None,feature_counts=None):
    from bokeh.plotting import figure, show, gridplot
    import numpy as np

    log10_feature_counts = np.log10(feature_counts)    
    log10_contig_len_values = np.log10(contig_len_values)
    log10_contig_gc_values = np.log10(contig_gc_values)     

        
    global_scale = 130.0
        
    ### 
    xscale1 = len(contig_gc_values)
    radius_scale1 = xscale1/global_scale
    p1 = figure(title="Contig %GC by contig index",background_fill="#FFF5E0",plot_width=400, plot_height=400,title_text_font_size='14pt')

    p1.scatter(np.arange(1,len(contig_gc_values)), contig_gc_values, marker="circle",radius=log10_feature_counts*radius_scale1,
                line_color="#6666ee", fill_color="#ee6666", fill_alpha=0.3, size=7, legend="Number of genes")
    p1.xaxis.axis_label="Contig index"
    p1.yaxis.axis_label="%GC"
        
    global_scale1 = xscale1/radius_scale1
    #115.5 (old)
    
    ###
    from scipy.interpolate import spline

    p2 = figure(title="Cumulative %GC distribution",background_fill="#FFF5E0",plot_width=400, plot_height=400,title_text_font_size='14pt')
    x = np.cumsum(np.sort(contig_gc_values))/np.sum(contig_gc_values)
    y = np.sort(contig_gc_values)

    x_smooth = np.linspace(x.min(), x.max(), 50)
    y_smooth = spline(x, y, x_smooth)
    rev_y_smooth = y_smooth[::-1]

    index = -1
    mindist = 1000000
    for i in np.arange(1,len(y_smooth)):
        dist = abs(rev_y_smooth[i] - y_smooth[i])
        if(dist < mindist):        
            mindist = dist
            index = i

    ynew = np.zeros(shape=(len(y_smooth)))
    for i in np.arange(0,index+1):
        ynew[i] = rev_y_smooth[i] - 2*abs(rev_y_smooth[i] - y_smooth[index])

    for i in np.arange(1,index+2):
        ynew[i+index] = rev_y_smooth[i+index] + 2*abs(rev_y_smooth[i+index] - y_smooth[index])   

    p2.line(x_smooth,y_smooth, color="gray",line_alpha=0.5, line_width=6, legend="Contig %GC distribution")
    p2.line(x_smooth,ynew, color="red",line_alpha=0.5,line_width=6, legend="Contig %GC distribution assymetry")

    p2.circle(x, y,
                line_color="black", fill_color="gray", fill_alpha=1, size=2, legend="Contig %GC values")

    p2.xaxis.axis_label="Fraction contigs < %GC"
    p2.yaxis.axis_label="%GC"
    p2.legend.orientation = "top_left"
    p2.xaxis.bounds = (0,1)
    
    
    ###
    xscale3 = max(contig_len_values) - min(contig_len_values)
    radius_scale3 = xscale3/global_scale
    p3 = figure(title="Contig %GC by contig length",background_fill="#FFF5E0",plot_width=400, plot_height=400,title_text_font_size='14pt')
    p3.scatter(contig_len_values, contig_gc_values, marker="circle",radius=log10_feature_counts*radius_scale3,
                line_color="#6666ee", fill_color="#ee6666", fill_alpha=0.3, size=7, legend="Number of genes")
    p3.xaxis.axis_label="Contig length (bp)"
    p3.yaxis.axis_label="%GC"
    
    global_scale3 = xscale3/radius_scale3
    
    
    ###
    xscale4 = max(log10_contig_len_values) - min(log10_contig_len_values)    
    radius_scale4 = xscale4/global_scale
    p4 = figure(title="Contig %GC by log10 contig length",background_fill="#FFF5E0",plot_width=400, plot_height=400,title_text_font_size='14pt') 
    p4.scatter(log10_contig_len_values, contig_gc_values, marker="circle", radius=log10_feature_counts*radius_scale4,
                line_color="#6666ee", fill_color="#ee6666", fill_alpha=0.3, size=7, legend="Number of genes")
    p4.xaxis.axis_label="Log10 Contig length (bp)"
    p4.yaxis.axis_label="%GC"

    global_scale4 = xscale4/radius_scale4

    
    
    #global_scale910 = 10000.0
    ### ###
    xscale9 = max(contig_len_values) - min(contig_len_values)
    radius_scale9 = xscale9/global_scale
    p9 = figure(title="Feature count by contig length",background_fill="#FFF5E0",plot_width=400, plot_height=400,title_text_font_size='14pt')
    p9.scatter(contig_len_values, log10_feature_counts, marker="circle",radius=np.asarray(log10_contig_gc_values)*radius_scale9,
                line_color="#6666ee", fill_color="#ee6666", fill_alpha=0.3, size=7, legend="%GC")
    p9.xaxis.axis_label="Contig length (bp)"
    p9.yaxis.axis_label="Log10 Number of genes"
    p9.legend.orientation = "bottom_right"
    
    global_scale9 = xscale9/radius_scale9
    
    
    ###
    xscale10 = max(log10_contig_len_values) - min(log10_contig_len_values)    
    radius_scale10 = xscale10/global_scale
    p10 = figure(title="Feature count by log10 contig length",background_fill="#FFF5E0",plot_width=400, plot_height=400,title_text_font_size='14pt') 
    p10.scatter(log10_contig_len_values, log10_feature_counts, marker="circle", radius=np.asarray(log10_contig_gc_values)*radius_scale10,
                line_color="#6666ee", fill_color="#ee6666", fill_alpha=0.3, size=7, legend="%GC")
    p10.xaxis.axis_label="Log10 Contig length (bp)"
    p10.yaxis.axis_label="Log10 Number of genes"
    p10.legend.orientation = "bottom_right"
    
    global_scale10 = xscale10/radius_scale10
    
    print xscale1, xscale3, xscale4, xscale9, xscale10
    print global_scale1,global_scale3, global_scale4, global_scale9, global_scale10
    
       
    ###
    import scipy.special, scipy.stats
    
    mu5 = np.mean(contig_gc_values)
    mu5print = "{:3.1f}".format(mu5)
    sigma5 = np.std(contig_gc_values)
    sigma5print = "{:3.1f}".format(sigma5)
    skew5 = scipy.stats.skew(contig_gc_values)
    skew5print = "{:3.1f}".format(skew5)
    #print(skew5print)  
    
    p5 = figure(title="Contig %GC (μ={mu5print}, σ={sigma5print}, sk={skew5print})".format(**vars()),tools="save",
       background_fill="#FFF5E0",plot_width=400, plot_height=400,title_text_font_size='14pt')

    #measured = np.random.normal(mu, sigma, 1000)
    hist5, edges5 = np.histogram(contig_gc_values, density=True, bins=50)

    x = np.linspace(min(contig_gc_values), max(contig_gc_values), 100)
    pdf = 1/(sigma5 * np.sqrt(2*np.pi)) * np.exp(-(x-mu5)**2 / (2*sigma5**2))
    cdf = (1+scipy.special.erf((x-mu5)/np.sqrt(2*sigma5**2)))/2

    p5.quad(top=hist5, bottom=0, left=edges5[:-1], right=edges5[1:],
         fill_color="#59D659", line_color="#59D659")
    p5.line(x, pdf, line_color="#D17519", line_width=4, alpha=0.7, legend="Normal PDF estimate")
    p5.line(x, cdf, line_color="#4D504D", line_width=2, alpha=0.7, legend="CDF")

    p5.legend.orientation = "top_left"
    p5.xaxis.axis_label = 'Contig %GC'
    p5.yaxis.axis_label = 'Pr(Contig %GC)'

    
    ###
    contig_lengths_log10 = np.log10(contig_len_values)
    mu6 = np.mean(contig_lengths_log10)
    mu6print = "{:2.0f}".format(mu6)
    sigma6 = np.std(contig_lengths_log10)
    sigma6print = "{:2.0f}".format(sigma6)
    skew6 = scipy.stats.skew(contig_lengths_log10)
    skew6print = "{:3.1f}".format(skew6)
    
    p6 = figure(title="Log10 Contig lengths (μ={mu6print}, σ={sigma6print}, sk={skew6print})".format(**vars()),tools="save",
       background_fill="#FFF5E0",plot_width=400, plot_height=400,title_text_font_size='14pt')

    #measured = np.random.normal(mu, sigma, 1000)
    hist6, edges6 = np.histogram(contig_lengths_log10, density=True, bins=50)
    #print hist6
    #print contig_lengths_log10
    
    x = np.linspace(min(contig_lengths_log10), max(contig_lengths_log10), 100)
    pdf = 1/(sigma6 * np.sqrt(2*np.pi)) * np.exp(-(x-mu6)**2 / (2*sigma6**2))
    cdf = (1+scipy.special.erf((x-mu6)/np.sqrt(2*sigma6**2)))/2

    p6.quad(top=hist6, bottom=0, left=edges6[:-1], right=edges6[1:],
         fill_color="#59D659", line_color="#59D659")
    p6.line(x, pdf, line_color="#D17519", line_width=4, alpha=0.7, legend="Normal PDF estimate")
    p6.line(x, cdf, line_color="#4D504D", line_width=2, alpha=0.7, legend="CDF")

    p6.legend.orientation = "top_left"
    p6.xaxis.axis_label = 'Log10 Contig lengths (bp)'
    p6.yaxis.axis_label = 'Pr(Length)'
    
    
    ###
    
    mu7 = np.mean(log10_feature_counts)
    mu7print = "{:2.0f}".format(mu7)
    sigma7 = np.std(log10_feature_counts)
    sigma7print = "{:2.0f}".format(sigma7)
    skew7 = scipy.stats.skew(log10_feature_counts)
    skew7print = "{:3.1f}".format(skew7)
    
    p7 = figure(title="Log10 number of genes (μ={mu6print}, σ={sigma6print}, sk={skew6print})".format(**vars()),tools="save",
       background_fill="#FFF5E0",plot_width=400, plot_height=400,title_text_font_size='14pt')

    #measured = np.random.normal(mu, sigma, 1000)
    hist7, edges7 = np.histogram(log10_feature_counts, density=True, bins=50)
    #print hist6
    #print contig_lengths_log10
    
    x = np.linspace(min(log10_feature_counts), max(log10_feature_counts), 100)
    pdf = 1/(sigma7 * np.sqrt(2*np.pi)) * np.exp(-(x-mu7)**2 / (2*sigma7**2))
    cdf = (1+scipy.special.erf((x-mu7)/np.sqrt(2*sigma7**2)))/2

    p7.quad(top=hist7, bottom=0, left=edges7[:-1], right=edges7[1:],
         fill_color="#59D659", line_color="#59D659")
    p7.line(x, pdf, line_color="#D17519", line_width=4, alpha=0.7, legend="Normal PDF estimate")
    p7.line(x, cdf, line_color="#4D504D", line_width=2, alpha=0.7, legend="CDF")

    p7.legend.orientation = "top_left"
    p7.xaxis.axis_label = 'Log10 number of genes'
    p7.yaxis.axis_label = 'Pr(Length)'
    
    ###  
    import operator

    length_dict = dict()
    count = 0
    for c in contig_ids:
        length_dict[c] = contig_len_values[count]
        count += 1

    ###code to sort along with contig ids
    #sorted_length = sorted(length_dict.items(), key=operator.itemgetter(1))

    sorted_len_ar = np.asarray(sorted(length_dict.values()))[::-1]
    sum = np.sum(sorted_len_ar)
    cumsum = np.cumsum(sorted_len_ar)
    cumsum_norm = 100.0 * cumsum.astype('f') / (sum + 0.0)

    xs = np.zeros(2 * len(cumsum_norm)-2)
    ys = np.zeros(2 * len(cumsum_norm)-2)
    xs[0] = 0
    xs[1] = cumsum_norm[0]
    ys[0] = sorted_len_ar[0]
    ys[1] = sorted_len_ar[0]

    for i in  range(1,len(cumsum_norm)-1,1):
        xs[2*i] = cumsum_norm[i-1]
        xs[2*i+1] = cumsum_norm[i]
        ys[2*i] = sorted_len_ar[i]
        ys[2*i+1] = sorted_len_ar[i]

    maxy =  max(ys)
    p8 = figure(title="Nx plot for contig length",background_fill="#FFF5E0",plot_width=400, plot_height=400,title_text_font_size='14pt')
    p8.line(xs, ys, line_color="#6666ee", line_width=2, alpha=0.7,legend="Nx")
    p8.line([50, 50], [0,maxy], line_color="red", line_width=2, alpha=0.7,legend="N50")
    ###the below line prints NaN, puzzling
    #p8.text(50, maxy, "N50 = ")
    p8.xaxis.axis_label="Nx"
    p8.yaxis.axis_label="Contig length"

    
    masterp = gridplot([[p1, p2], [p3, p4], [p9, p10], [p5, p7],[p6, p8]])

    show(masterp)

In [113]:
def run_genome_quality(genome="kb|g.3157"):

    from doekbase.data_api.annotation.genome_annotation import GenomeAnnotationAPI
    services = {"workspace_service_url": "https://ci.kbase.us/services/ws/",
                "shock_service_url": "https://ci.kbase.us/services/shock_api/"}
    token = None
    s = ""
    annotations = {genome: GenomeAnnotationAPI(services, token, s.join(("PrototypeReferenceGenomes/",genome)))
                   #"kb|g.166819": GenomeAnnotationAPI(services, token, "PrototypeReferenceGenomes/kb|g.166819"),
                   #"kb|g.3899": GenomeAnnotationAPI(services, token, "PrototypeReferenceGenomes/kb|g.3899")
                  }
    start_time = datetime.datetime.utcnow()
    for n in annotations:
        print n
        data = dict()
        overview = get_genome_summary(annotations[n])

        contig_ids = sorted(overview["assembly"]["contig_length"].keys(), 
                            cmp=lambda x,y: cmp(x.rsplit(".",1)[1],y.rsplit(".",1)[1]))

        contig_len_values = [overview["assembly"]["contig_length"][c] for c in contig_ids]
        contig_gc_values = [100.0 * overview["assembly"]["contig_gc_content"][c] for c in contig_ids]


        count =0
        start=1
        stop=100000000
        #for c in contig_ids:
        #    print c

        regions = []
        for c in contig_ids:
            contig_add = {"contig_id": c, "start": start, "stop": stop, "strand": "?"}
            regions.append(contig_add)


            
        feat_data = annotations[n].get_feature_ids(region_list=regions)

        feature_counts = []    
        for c in contig_ids:
            num = len(feat_data['region'][c])
            if(num == 0):
                num = 0.1
            feature_counts.append(num)

        plot_genome_quality(contig_gc_values, contig_len_values,feature_counts)

        print "WARNING breaking after first genome"
        break;
    end_time = datetime.datetime.utcnow()
    print "Total time : {0}".format(end_time - start_time)

In [114]:
run_genome_quality("kb|g.3157")


kb|g.3157
genome_annotation.get_taxon
genome_annotation.get_assembly
assembly.get_number_contigs
assembly.get_contig_lengths
assembly.get_contig_gc_content
genome_annotation.get_feature_types
genome_annotation.get_feature_type_descriptions
genome_annotation.get_feature_type_counts
462 4069938 3.5718794455 4069938 3.5718794455
130.0 130.0 130.0 130.0 130.0
WARNING breaking after first genome
Total time : 0:00:38.862666

In [ ]: