In [1]:
import doekbase.data_api
from doekbase.data_api.annotation.genome_annotation.api import GenomeAnnotationAPI
#import doekbase.data_api.display
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
print __version__ # requires version >= 1.9.0
from bokeh.plotting import figure, show, gridplot
from bokeh.io import hplot, vplot
#bokeh.io.output_notebook(hide_banner=True)
from scipy.interpolate import spline
import jinja2
import IPython.display
import ssl
import numpy as np
import scipy.special, scipy.stats
import pprint
import datetime
import operator
import os
#if redis_host is not None and redis_port is not None:
# logger.info("Activating REDIS at host:{} port:{}".format(redis_host, redis_port))
# cache.ObjectCache.cache_class = cache.RedisCache
# cache.ObjectCache.cache_params = {'redis_host': redis_host, 'redis_port': redis_port}
token = os.environ["KB_AUTH_TOKEN"]
#from doekbase.data_api.annotation.genome_annotation import GenomeAnnotationAPI
from doekbase.data_api.annotation.genome_annotation.api import GenomeAnnotationClientAPI
from doekbase.data_api.sequence.assembly.api import AssemblyClientAPI
services = {"workspace_service_url": "https://next.kbase.us/services/ws/",
"shock_service_url": "https://next.kbase.us/services/shock_api/",
"genome_annotation_url": "https://next.kbase.us/services/data/genome_annotation",
"assembly_url": "https://next.kbase.us/services/data/assembly"}
In [2]:
init_notebook_mode()
In [3]:
iplot([{"x": [1, 2, 3], "y": [3, 1, 6]}])
In [4]:
def get_genome_summary(genome_annotation=None):
if genome_annotation == None:
raise TypeError("No GenomeAnnotation object given.")
#elif genome_annotation.get_typestring().split('-')[0] not in biokbase.data_api.genome_annotation.TYPES:
# raise TypeError("{0} is not a recognized GenomeAnnotation type.".format(type(genome_annotation)))
start_time = datetime.datetime.utcnow()
taxon = genome_annotation.get_taxon()
print "genome_annotation.get_taxon "+taxon
end_time = datetime.datetime.utcnow()
print "time : {0}".format(end_time - start_time)
assemblyref = genome_annotation.get_assembly()
assemblyapi = AssemblyClientAPI(services["assembly_url"], token, assemblyref)
print "genome_annotation.get_assembly"
end_time = datetime.datetime.utcnow()
print "time : {0}".format(end_time - start_time)
overview = dict()
#get assembly info
assembly_details = dict()
start_time = datetime.datetime.utcnow()
assembly_details["number_of_contigs"] = assemblyapi.get_number_contigs()
print "assembly.get_number_contigs"
end_time = datetime.datetime.utcnow()
print "time : {0}".format(end_time - start_time)
start_time = datetime.datetime.utcnow()
assembly_details["contig_length"] = assemblyapi.get_contig_lengths()
print "assembly.get_contig_lengths"
end_time = datetime.datetime.utcnow()
print "time : {0}".format(end_time - start_time)
start_time = datetime.datetime.utcnow()
assembly_details["contig_gc_content"] = assemblyapi.get_contig_gc_content()
print "assembly.get_contig_gc_content"
end_time = datetime.datetime.utcnow()
print "time : {0}".format(end_time - start_time)
overview["assembly"] = assembly_details
#get annotation info
annotation_details = dict()
start_time = datetime.datetime.utcnow()
annotation_details["feature_types"] = genome_annotation.get_feature_types()
print annotation_details["feature_types"]
end_time = datetime.datetime.utcnow()
print "time : {0}".format(end_time - start_time)
#remove 'exon' type which causes error for get_feature_type_descriptions()
del annotation_details["feature_types"][-3]
print annotation_details["feature_types"]
print "genome_annotation.get_feature_types"
#start_time = datetime.datetime.utcnow()
#annotation_details["feature_type_descriptions"] = genome_annotation.get_feature_type_descriptions(annotation_details["feature_types"])
#print "genome_annotation.get_feature_type_descriptions"
#end_time = datetime.datetime.utcnow()
#print "time : {0}".format(end_time - start_time)
start_time = datetime.datetime.utcnow()
annotation_details["feature_type_counts"] = genome_annotation.get_feature_type_counts(annotation_details["feature_types"])
print "genome_annotation.get_feature_type_counts"
end_time = datetime.datetime.utcnow()
print "time : {0}".format(end_time - start_time)
overview["annotation"] = annotation_details
return overview
In [5]:
#init clients
print "start"
annotationRefSeq = GenomeAnnotationClientAPI(services["genome_annotation_url"],
token, "288/17")
print "got RefSeq"
annotationENSEMBL = GenomeAnnotationClientAPI(services["genome_annotation_url"],
token, "252/26")
print "got ENSEMBL"
print annotationRefSeq.get_feature_types()
print annotationENSEMBL.get_feature_types()
In [6]:
#load for first genome
start_fetch = datetime.datetime.utcnow()
overviewENSEMBL = get_genome_summary(annotationENSEMBL)
end_fetch = datetime.datetime.utcnow()
print "Total time to gather data : {0}".format(end_fetch - start_fetch)
start_prep = datetime.datetime.utcnow()
contig_idsENSEMBL = [overviewENSEMBL["assembly"]["contig_length"].keys()]
#contig_idsENSEMBL = [sorted(overviewENSEMBL["assembly"]["contig_length"].keys(),
# cmp=lambda x,y: cmp(x.rsplit(".",1)[1],y.rsplit(".",1)[1]))]
contig_idsENSEMBL = overviewENSEMBL["assembly"]["contig_length"].keys()
contig_len_valuesENSEMBL = [overviewENSEMBL["assembly"]["contig_length"][c] for c in contig_idsENSEMBL]
contig_gc_valuesENSEMBL = [100.0 * overviewENSEMBL["assembly"]["contig_gc_content"][c] for c in contig_idsENSEMBL]
count = 0
start = 0
length = 1E10
regionsENSEMBL = [{"contig_id": c, "start": start, "length": length, "strand": "?"} for c in contig_idsENSEMBL]
feat_dataENSEMBL = annotationENSEMBL.get_feature_ids(group_by="region")
feature_countsENSEMBL = []
for c in contig_idsENSEMBL:
count = 0
for strand in feat_dataENSEMBL["by_region"][c]:
for range in feat_dataENSEMBL["by_region"][c][strand]:
count += len(feat_dataENSEMBL["by_region"][c][strand])
if count == 0:
feature_countsENSEMBL.append(0.1)
else:
feature_countsENSEMBL.append(count)
end_prep = datetime.datetime.utcnow()
print "Total time to prep data : {0}".format(end_prep - start_prep)
In [7]:
#load for second genome
start_fetch = datetime.datetime.utcnow()
overviewRefSeq = get_genome_summary(annotationRefSeq)
end_fetch = datetime.datetime.utcnow()
print "Total time to gather data : {0}".format(end_fetch - start_fetch)
start_prep = datetime.datetime.utcnow()
contig_idsRefSeq = [overviewRefSeq["assembly"]["contig_length"].keys()]
#contig_idsENSEMBL = [sorted(overviewENSEMBL["assembly"]["contig_length"].keys(),
# cmp=lambda x,y: cmp(x.rsplit(".",1)[1],y.rsplit(".",1)[1]))]
contig_idsRefSeq = overviewRefSeq["assembly"]["contig_length"].keys()
contig_len_valuesRefSeq = [overviewRefSeq["assembly"]["contig_length"][c] for c in contig_idsRefSeq]
contig_gc_valuesRefSeq = [100.0 * overviewRefSeq["assembly"]["contig_gc_content"][c] for c in contig_idsRefSeq]
count = 0
start = 0
length = 1E10
regionsRefSeq = [{"contig_id": c, "start": start, "length": length, "strand": "?"} for c in contig_idsRefSeq]
feat_dataRefSeq = annotationRefSeq.get_feature_ids(group_by="region")
feature_countsRefSeq = []
for c in contig_idsRefSeq:
count = 0
for strand in feat_dataRefSeq["by_region"][c]:
for range in feat_dataRefSeq["by_region"][c][strand]:
count += len(feat_dataRefSeq["by_region"][c][strand])
if count == 0:
feature_countsRefSeq.append(0.1)
else:
feature_countsRefSeq.append(count)
end_prep = datetime.datetime.utcnow()
print "Total time to prep data : {0}".format(end_prep - start_prep)
In [13]:
overviewRefSeq["assembly"]["contig_length"]
Out[13]:
In [14]:
overviewENSEMBL["assembly"]["contig_length"]
Out[14]:
In [8]:
assemblyENSEMBL = annotationENSEMBL.get_assembly()
assemblyapiENSEMBL = AssemblyClientAPI(services["assembly_url"], token, assemblyENSEMBL)
assemblyRefSeq = annotationRefSeq.get_assembly()
assemblyapiRefSeq = AssemblyClientAPI(services["assembly_url"], token, assemblyRefSeq)
#contigsENSEMBL = assemblyapi.get_data_subset(["contigs"])["contigs"]
#for contig in contigsENSEMBL:
# md5 = contigsENSEMBL[contig]["md5"]
# print md5
In [9]:
lengthsRefSeq = dict((v, k) for k, v in overviewRefSeq["assembly"]["contig_length"].iteritems())
for idENSEMBL in overviewENSEMBL["assembly"]["contig_length"].keys():
print idENSEMBL +"\t"+ str(overviewENSEMBL["assembly"]["contig_length"][idENSEMBL])
idRefSeq = lengthsRefSeq.get(overviewENSEMBL["assembly"]["contig_length"][idENSEMBL])
print idRefSeq +"\t"+ str(overviewENSEMBL["assembly"]["contig_length"][idENSEMBL])
count =0
start=1
stop=100000000
regionsRefSeq = []
contig_addRefSeq = {"contig_id": idRefSeq, "start": start, "length": stop-start, "strand": "?"}
#print contig_addRefSeq
regionsRefSeq.append(contig_addRefSeq)
feat_dataRefSeq = annotationRefSeq.get_feature_ids(filters={"region_list":regionsRefSeq})
#print feat_data
num_gene_RefSeq = len(feat_dataRefSeq['by_type']['gene'])
regionsENSEMBL = []
contig_addENSEMBL = {"contig_id": idENSEMBL, "start": start, "length": stop-start, "strand": "?"}
#print contig_addENSEMBL
regionsENSEMBL.append(contig_addENSEMBL)
feat_dataENSEMBL = annotationENSEMBL.get_feature_ids(filters={"region_list":regionsENSEMBL})
#print feat_data
num_gene_ENSEMBL = len(feat_dataENSEMBL['by_type']['gene'])\
print "** "+idRefSeq+"\t"+str(num_gene_RefSeq) +"\t" + idENSEMBL+"\t"+str(num_gene_ENSEMBL)
In [10]:
3318+3775+2556+59+2795+53+4316
Out[10]:
In [ ]: