In [1]:
import biokbase.data_api
import biokbase.data_api.genome_annotation
import biokbase.data_api.display
import bokeh.charts
import bokeh.plotting
import bokeh.io
bokeh.io.output_notebook(hide_banner=True)
import ssl
import numpy as np
import jinja2
import IPython.display
import pprint
import datetime
In [11]:
def get_genome_summary(genome_annotation=None):
if genome_annotation == None:
raise TypeError("No GenomeAnnotation object given.")
elif genome_annotation.get_typestring().split('-')[0] not in biokbase.data_api.genome_annotation.TYPES:
raise TypeError("{0} is not a recognized GenomeAnnotation type.".format(type(genome_annotation)))
taxon = genome_annotation.get_taxon()
print "genome_annotation.get_taxon"
assembly = genome_annotation.get_assembly()
print "genome_annotation.get_assembly"
overview = dict()
#get assembly info
assembly_details = dict()
assembly_details["number_of_contigs"] = assembly.get_number_contigs()
print "assembly.get_number_contigs"
assembly_details["contig_length"] = assembly.get_contig_lengths()
print "assembly.get_contig_lengths"
assembly_details["contig_gc_content"] = assembly.get_contig_gc_content()
print "assembly.get_contig_gc_content"
overview["assembly"] = assembly_details
#get annotation info
annotation_details = dict()
annotation_details["feature_types"] = genome_annotation.get_feature_types()
print "genome_annotation.get_feature_types"
annotation_details["feature_type_descriptions"] = genome_annotation.get_feature_type_descriptions(annotation_details["feature_types"])
print "genome_annotation.get_feature_type_descriptions"
annotation_details["feature_type_counts"] = genome_annotation.get_feature_type_counts(annotation_details["feature_types"])
print "genome_annotation.get_feature_type_counts"
overview["annotation"] = annotation_details
return overview
In [112]:
def plot_genome_quality(contig_gc_values=None,contig_len_values=None,feature_counts=None):
from bokeh.plotting import figure, show, gridplot
import numpy as np
log10_feature_counts = np.log10(feature_counts)
log10_contig_len_values = np.log10(contig_len_values)
log10_contig_gc_values = np.log10(contig_gc_values)
global_scale = 130.0
###
xscale1 = len(contig_gc_values)
radius_scale1 = xscale1/global_scale
p1 = figure(title="Contig %GC by contig index",background_fill="#FFF5E0",plot_width=400, plot_height=400,title_text_font_size='14pt')
p1.scatter(np.arange(1,len(contig_gc_values)), contig_gc_values, marker="circle",radius=log10_feature_counts*radius_scale1,
line_color="#6666ee", fill_color="#ee6666", fill_alpha=0.3, size=7, legend="Number of genes")
p1.xaxis.axis_label="Contig index"
p1.yaxis.axis_label="%GC"
global_scale1 = xscale1/radius_scale1
#115.5 (old)
###
from scipy.interpolate import spline
p2 = figure(title="Cumulative %GC distribution",background_fill="#FFF5E0",plot_width=400, plot_height=400,title_text_font_size='14pt')
x = np.cumsum(np.sort(contig_gc_values))/np.sum(contig_gc_values)
y = np.sort(contig_gc_values)
x_smooth = np.linspace(x.min(), x.max(), 50)
y_smooth = spline(x, y, x_smooth)
rev_y_smooth = y_smooth[::-1]
index = -1
mindist = 1000000
for i in np.arange(1,len(y_smooth)):
dist = abs(rev_y_smooth[i] - y_smooth[i])
if(dist < mindist):
mindist = dist
index = i
ynew = np.zeros(shape=(len(y_smooth)))
for i in np.arange(0,index+1):
ynew[i] = rev_y_smooth[i] - 2*abs(rev_y_smooth[i] - y_smooth[index])
for i in np.arange(1,index+2):
ynew[i+index] = rev_y_smooth[i+index] + 2*abs(rev_y_smooth[i+index] - y_smooth[index])
p2.line(x_smooth,y_smooth, color="gray",line_alpha=0.5, line_width=6, legend="Contig %GC distribution")
p2.line(x_smooth,ynew, color="red",line_alpha=0.5,line_width=6, legend="Contig %GC distribution assymetry")
p2.circle(x, y,
line_color="black", fill_color="gray", fill_alpha=1, size=2, legend="Contig %GC values")
p2.xaxis.axis_label="Fraction contigs < %GC"
p2.yaxis.axis_label="%GC"
p2.legend.orientation = "top_left"
p2.xaxis.bounds = (0,1)
###
xscale3 = max(contig_len_values) - min(contig_len_values)
radius_scale3 = xscale3/global_scale
p3 = figure(title="Contig %GC by contig length",background_fill="#FFF5E0",plot_width=400, plot_height=400,title_text_font_size='14pt')
p3.scatter(contig_len_values, contig_gc_values, marker="circle",radius=log10_feature_counts*radius_scale3,
line_color="#6666ee", fill_color="#ee6666", fill_alpha=0.3, size=7, legend="Number of genes")
p3.xaxis.axis_label="Contig length (bp)"
p3.yaxis.axis_label="%GC"
global_scale3 = xscale3/radius_scale3
###
xscale4 = max(log10_contig_len_values) - min(log10_contig_len_values)
radius_scale4 = xscale4/global_scale
p4 = figure(title="Contig %GC by log10 contig length",background_fill="#FFF5E0",plot_width=400, plot_height=400,title_text_font_size='14pt')
p4.scatter(log10_contig_len_values, contig_gc_values, marker="circle", radius=log10_feature_counts*radius_scale4,
line_color="#6666ee", fill_color="#ee6666", fill_alpha=0.3, size=7, legend="Number of genes")
p4.xaxis.axis_label="Log10 Contig length (bp)"
p4.yaxis.axis_label="%GC"
global_scale4 = xscale4/radius_scale4
#global_scale910 = 10000.0
### ###
xscale9 = max(contig_len_values) - min(contig_len_values)
radius_scale9 = xscale9/global_scale
p9 = figure(title="Feature count by contig length",background_fill="#FFF5E0",plot_width=400, plot_height=400,title_text_font_size='14pt')
p9.scatter(contig_len_values, log10_feature_counts, marker="circle",radius=np.asarray(log10_contig_gc_values)*radius_scale9,
line_color="#6666ee", fill_color="#ee6666", fill_alpha=0.3, size=7, legend="%GC")
p9.xaxis.axis_label="Contig length (bp)"
p9.yaxis.axis_label="Log10 Number of genes"
p9.legend.orientation = "bottom_right"
global_scale9 = xscale9/radius_scale9
###
xscale10 = max(log10_contig_len_values) - min(log10_contig_len_values)
radius_scale10 = xscale10/global_scale
p10 = figure(title="Feature count by log10 contig length",background_fill="#FFF5E0",plot_width=400, plot_height=400,title_text_font_size='14pt')
p10.scatter(log10_contig_len_values, log10_feature_counts, marker="circle", radius=np.asarray(log10_contig_gc_values)*radius_scale10,
line_color="#6666ee", fill_color="#ee6666", fill_alpha=0.3, size=7, legend="%GC")
p10.xaxis.axis_label="Log10 Contig length (bp)"
p10.yaxis.axis_label="Log10 Number of genes"
p10.legend.orientation = "bottom_right"
global_scale10 = xscale10/radius_scale10
print xscale1, xscale3, xscale4, xscale9, xscale10
print global_scale1,global_scale3, global_scale4, global_scale9, global_scale10
###
import scipy.special, scipy.stats
mu5 = np.mean(contig_gc_values)
mu5print = "{:3.1f}".format(mu5)
sigma5 = np.std(contig_gc_values)
sigma5print = "{:3.1f}".format(sigma5)
skew5 = scipy.stats.skew(contig_gc_values)
skew5print = "{:3.1f}".format(skew5)
#print(skew5print)
p5 = figure(title="Contig %GC (μ={mu5print}, σ={sigma5print}, sk={skew5print})".format(**vars()),tools="save",
background_fill="#FFF5E0",plot_width=400, plot_height=400,title_text_font_size='14pt')
#measured = np.random.normal(mu, sigma, 1000)
hist5, edges5 = np.histogram(contig_gc_values, density=True, bins=50)
x = np.linspace(min(contig_gc_values), max(contig_gc_values), 100)
pdf = 1/(sigma5 * np.sqrt(2*np.pi)) * np.exp(-(x-mu5)**2 / (2*sigma5**2))
cdf = (1+scipy.special.erf((x-mu5)/np.sqrt(2*sigma5**2)))/2
p5.quad(top=hist5, bottom=0, left=edges5[:-1], right=edges5[1:],
fill_color="#59D659", line_color="#59D659")
p5.line(x, pdf, line_color="#D17519", line_width=4, alpha=0.7, legend="Normal PDF estimate")
p5.line(x, cdf, line_color="#4D504D", line_width=2, alpha=0.7, legend="CDF")
p5.legend.orientation = "top_left"
p5.xaxis.axis_label = 'Contig %GC'
p5.yaxis.axis_label = 'Pr(Contig %GC)'
###
contig_lengths_log10 = np.log10(contig_len_values)
mu6 = np.mean(contig_lengths_log10)
mu6print = "{:2.0f}".format(mu6)
sigma6 = np.std(contig_lengths_log10)
sigma6print = "{:2.0f}".format(sigma6)
skew6 = scipy.stats.skew(contig_lengths_log10)
skew6print = "{:3.1f}".format(skew6)
p6 = figure(title="Log10 Contig lengths (μ={mu6print}, σ={sigma6print}, sk={skew6print})".format(**vars()),tools="save",
background_fill="#FFF5E0",plot_width=400, plot_height=400,title_text_font_size='14pt')
#measured = np.random.normal(mu, sigma, 1000)
hist6, edges6 = np.histogram(contig_lengths_log10, density=True, bins=50)
#print hist6
#print contig_lengths_log10
x = np.linspace(min(contig_lengths_log10), max(contig_lengths_log10), 100)
pdf = 1/(sigma6 * np.sqrt(2*np.pi)) * np.exp(-(x-mu6)**2 / (2*sigma6**2))
cdf = (1+scipy.special.erf((x-mu6)/np.sqrt(2*sigma6**2)))/2
p6.quad(top=hist6, bottom=0, left=edges6[:-1], right=edges6[1:],
fill_color="#59D659", line_color="#59D659")
p6.line(x, pdf, line_color="#D17519", line_width=4, alpha=0.7, legend="Normal PDF estimate")
p6.line(x, cdf, line_color="#4D504D", line_width=2, alpha=0.7, legend="CDF")
p6.legend.orientation = "top_left"
p6.xaxis.axis_label = 'Log10 Contig lengths (bp)'
p6.yaxis.axis_label = 'Pr(Length)'
###
mu7 = np.mean(log10_feature_counts)
mu7print = "{:2.0f}".format(mu7)
sigma7 = np.std(log10_feature_counts)
sigma7print = "{:2.0f}".format(sigma7)
skew7 = scipy.stats.skew(log10_feature_counts)
skew7print = "{:3.1f}".format(skew7)
p7 = figure(title="Log10 number of genes (μ={mu6print}, σ={sigma6print}, sk={skew6print})".format(**vars()),tools="save",
background_fill="#FFF5E0",plot_width=400, plot_height=400,title_text_font_size='14pt')
#measured = np.random.normal(mu, sigma, 1000)
hist7, edges7 = np.histogram(log10_feature_counts, density=True, bins=50)
#print hist6
#print contig_lengths_log10
x = np.linspace(min(log10_feature_counts), max(log10_feature_counts), 100)
pdf = 1/(sigma7 * np.sqrt(2*np.pi)) * np.exp(-(x-mu7)**2 / (2*sigma7**2))
cdf = (1+scipy.special.erf((x-mu7)/np.sqrt(2*sigma7**2)))/2
p7.quad(top=hist7, bottom=0, left=edges7[:-1], right=edges7[1:],
fill_color="#59D659", line_color="#59D659")
p7.line(x, pdf, line_color="#D17519", line_width=4, alpha=0.7, legend="Normal PDF estimate")
p7.line(x, cdf, line_color="#4D504D", line_width=2, alpha=0.7, legend="CDF")
p7.legend.orientation = "top_left"
p7.xaxis.axis_label = 'Log10 number of genes'
p7.yaxis.axis_label = 'Pr(Length)'
###
import operator
length_dict = dict()
count = 0
for c in contig_ids:
length_dict[c] = contig_len_values[count]
count += 1
###code to sort along with contig ids
#sorted_length = sorted(length_dict.items(), key=operator.itemgetter(1))
sorted_len_ar = np.asarray(sorted(length_dict.values()))[::-1]
sum = np.sum(sorted_len_ar)
cumsum = np.cumsum(sorted_len_ar)
cumsum_norm = 100.0 * cumsum.astype('f') / (sum + 0.0)
xs = np.zeros(2 * len(cumsum_norm)-2)
ys = np.zeros(2 * len(cumsum_norm)-2)
xs[0] = 0
xs[1] = cumsum_norm[0]
ys[0] = sorted_len_ar[0]
ys[1] = sorted_len_ar[0]
for i in range(1,len(cumsum_norm)-1,1):
xs[2*i] = cumsum_norm[i-1]
xs[2*i+1] = cumsum_norm[i]
ys[2*i] = sorted_len_ar[i]
ys[2*i+1] = sorted_len_ar[i]
maxy = max(ys)
p8 = figure(title="Nx plot for contig length",background_fill="#FFF5E0",plot_width=400, plot_height=400,title_text_font_size='14pt')
p8.line(xs, ys, line_color="#6666ee", line_width=2, alpha=0.7,legend="Nx")
p8.line([50, 50], [0,maxy], line_color="red", line_width=2, alpha=0.7,legend="N50")
###the below line prints NaN, puzzling
#p8.text(50, maxy, "N50 = ")
p8.xaxis.axis_label="Nx"
p8.yaxis.axis_label="Contig length"
masterp = gridplot([[p1, p2], [p3, p4], [p9, p10], [p5, p7],[p6, p8]])
show(masterp)
In [113]:
def run_genome_quality(genome="kb|g.3157"):
from doekbase.data_api.annotation.genome_annotation import GenomeAnnotationAPI
services = {"workspace_service_url": "https://ci.kbase.us/services/ws/",
"shock_service_url": "https://ci.kbase.us/services/shock_api/"}
token = None
s = ""
annotations = {genome: GenomeAnnotationAPI(services, token, s.join(("PrototypeReferenceGenomes/",genome)))
#"kb|g.166819": GenomeAnnotationAPI(services, token, "PrototypeReferenceGenomes/kb|g.166819"),
#"kb|g.3899": GenomeAnnotationAPI(services, token, "PrototypeReferenceGenomes/kb|g.3899")
}
start_time = datetime.datetime.utcnow()
for n in annotations:
print n
data = dict()
overview = get_genome_summary(annotations[n])
contig_ids = sorted(overview["assembly"]["contig_length"].keys(),
cmp=lambda x,y: cmp(x.rsplit(".",1)[1],y.rsplit(".",1)[1]))
contig_len_values = [overview["assembly"]["contig_length"][c] for c in contig_ids]
contig_gc_values = [100.0 * overview["assembly"]["contig_gc_content"][c] for c in contig_ids]
count =0
start=1
stop=100000000
#for c in contig_ids:
# print c
regions = []
for c in contig_ids:
contig_add = {"contig_id": c, "start": start, "stop": stop, "strand": "?"}
regions.append(contig_add)
feat_data = annotations[n].get_feature_ids(region_list=regions)
feature_counts = []
for c in contig_ids:
num = len(feat_data['region'][c])
if(num == 0):
num = 0.1
feature_counts.append(num)
plot_genome_quality(contig_gc_values, contig_len_values,feature_counts)
print "WARNING breaking after first genome"
break;
end_time = datetime.datetime.utcnow()
print "Total time : {0}".format(end_time - start_time)
In [114]:
run_genome_quality("kb|g.3157")
In [ ]: