notebook.community

Edit and run



In [1]:

    
import doekbase.data_api
from doekbase.data_api.annotation.genome_annotation.api import GenomeAnnotationAPI
#import doekbase.data_api.display

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot

print __version__ # requires version >= 1.9.0

from bokeh.plotting import figure, show, gridplot
from bokeh.io import hplot, vplot
#bokeh.io.output_notebook(hide_banner=True)

from scipy.interpolate import spline

import jinja2
import IPython.display

import ssl

import numpy as np
import scipy.special, scipy.stats

import pprint
import datetime
import operator
import os

#if redis_host is not None and redis_port is not None:
#        logger.info("Activating REDIS at host:{} port:{}".format(redis_host, redis_port))
#        cache.ObjectCache.cache_class = cache.RedisCache
#        cache.ObjectCache.cache_params = {'redis_host': redis_host, 'redis_port': redis_port}

token = os.environ["KB_AUTH_TOKEN"]

#from doekbase.data_api.annotation.genome_annotation import GenomeAnnotationAPI
from doekbase.data_api.annotation.genome_annotation.api import GenomeAnnotationClientAPI
from doekbase.data_api.sequence.assembly.api import AssemblyClientAPI

services = {"workspace_service_url": "https://next.kbase.us/services/ws/",
                "shock_service_url": "https://next.kbase.us/services/shock_api/",
            "genome_annotation_url": "https://next.kbase.us/services/data/genome_annotation",
            "assembly_url": "https://next.kbase.us/services/data/assembly"}



In [2]:

    
init_notebook_mode()



In [3]:

    
iplot([{"x": [1, 2, 3], "y": [3, 1, 6]}])



In [4]:

    
def get_genome_summary(genome_annotation=None):
    
    if genome_annotation == None:
        raise TypeError("No GenomeAnnotation object given.")
    #elif genome_annotation.get_typestring().split('-')[0] not in biokbase.data_api.genome_annotation.TYPES:
    #    raise TypeError("{0} is not a recognized GenomeAnnotation type.".format(type(genome_annotation)))
        
    
    start_time = datetime.datetime.utcnow()
    taxon = genome_annotation.get_taxon()
    print "genome_annotation.get_taxon "+taxon
    end_time = datetime.datetime.utcnow()
    print "time : {0}".format(end_time - start_time)
    
    assemblyref = genome_annotation.get_assembly()
    assemblyapi = AssemblyClientAPI(services["assembly_url"], token, assemblyref)
    print "genome_annotation.get_assembly"
    end_time = datetime.datetime.utcnow()
    print "time : {0}".format(end_time - start_time) 
    
    overview = dict()
    
    #get assembly info
    assembly_details = dict()
    start_time = datetime.datetime.utcnow()
    assembly_details["number_of_contigs"] = assemblyapi.get_number_contigs()
    print "assembly.get_number_contigs"
    end_time = datetime.datetime.utcnow()
    print "time : {0}".format(end_time - start_time)
    
    start_time = datetime.datetime.utcnow()
    assembly_details["contig_length"] = assemblyapi.get_contig_lengths()
    print "assembly.get_contig_lengths"
    end_time = datetime.datetime.utcnow()
    print "time : {0}".format(end_time - start_time)
    
    start_time = datetime.datetime.utcnow()
    assembly_details["contig_gc_content"] = assemblyapi.get_contig_gc_content()
    print "assembly.get_contig_gc_content"
    end_time = datetime.datetime.utcnow()
    print "time : {0}".format(end_time - start_time)
    
    overview["assembly"] = assembly_details
    
    #get annotation info
    annotation_details = dict()
    
    start_time = datetime.datetime.utcnow()
    annotation_details["feature_types"] = genome_annotation.get_feature_types()
    print annotation_details["feature_types"]
    end_time = datetime.datetime.utcnow()
    print "time : {0}".format(end_time - start_time)
    
    #remove 'exon' type which causes error for get_feature_type_descriptions()
    del annotation_details["feature_types"][-3]
    
    print annotation_details["feature_types"]
    
    print "genome_annotation.get_feature_types"
    
    #start_time = datetime.datetime.utcnow()
    #annotation_details["feature_type_descriptions"] = genome_annotation.get_feature_type_descriptions(annotation_details["feature_types"])
    #print "genome_annotation.get_feature_type_descriptions"
    #end_time = datetime.datetime.utcnow()
    #print "time : {0}".format(end_time - start_time)
    
    start_time = datetime.datetime.utcnow()
    annotation_details["feature_type_counts"] = genome_annotation.get_feature_type_counts(annotation_details["feature_types"])
    print "genome_annotation.get_feature_type_counts"
    end_time = datetime.datetime.utcnow()
    print "time : {0}".format(end_time - start_time)
    
    overview["annotation"] = annotation_details
    
    return overview



In [5]:

    
#init clients
print "start"

annotationRefSeq = GenomeAnnotationClientAPI(services["genome_annotation_url"], 
                                       token, "288/17")

print "got RefSeq"


annotationENSEMBL = GenomeAnnotationClientAPI(services["genome_annotation_url"], 
                                       token, "252/26")
print "got ENSEMBL"


print annotationRefSeq.get_feature_types()
print annotationENSEMBL.get_feature_types()









    



start
got RefSeq
got ENSEMBL
['repeat_unit', 'ncRNA', 'repeat_region', 'tRNA', 'intron', 'gap', 'regulatory', 'rRNA', 'exon', 'mRNA', 'CDS', 'misc_RNA', 'gene']
['misc_RNA', 'gene', 'exon', 'mRNA', 'CDS']



In [6]:

    
#load for first genome
start_fetch = datetime.datetime.utcnow()    
overviewENSEMBL = get_genome_summary(annotationENSEMBL)
end_fetch = datetime.datetime.utcnow()

print "Total time to gather data : {0}".format(end_fetch - start_fetch)

start_prep = datetime.datetime.utcnow()

contig_idsENSEMBL = [overviewENSEMBL["assembly"]["contig_length"].keys()]
                            
#contig_idsENSEMBL = [sorted(overviewENSEMBL["assembly"]["contig_length"].keys(), 
#                    cmp=lambda x,y: cmp(x.rsplit(".",1)[1],y.rsplit(".",1)[1]))]

contig_idsENSEMBL = overviewENSEMBL["assembly"]["contig_length"].keys()
contig_len_valuesENSEMBL = [overviewENSEMBL["assembly"]["contig_length"][c] for c in contig_idsENSEMBL]
contig_gc_valuesENSEMBL = [100.0 * overviewENSEMBL["assembly"]["contig_gc_content"][c] for c in contig_idsENSEMBL]

count = 0
start = 0
length = 1E10

regionsENSEMBL = [{"contig_id": c, "start": start, "length": length, "strand": "?"} for c in contig_idsENSEMBL]

feat_dataENSEMBL = annotationENSEMBL.get_feature_ids(group_by="region")

feature_countsENSEMBL = []
for c in contig_idsENSEMBL:
    count = 0
    for strand in feat_dataENSEMBL["by_region"][c]:
        for range in feat_dataENSEMBL["by_region"][c][strand]:
            count += len(feat_dataENSEMBL["by_region"][c][strand])

    if count == 0:
        feature_countsENSEMBL.append(0.1)
    else:
        feature_countsENSEMBL.append(count)

end_prep = datetime.datetime.utcnow()
print "Total time to prep data : {0}".format(end_prep - start_prep)









    



genome_annotation.get_taxon 187/1001255/1
time : 0:00:04.216689
genome_annotation.get_assembly
time : 0:00:05.324047
assembly.get_number_contigs
time : 0:00:01.013010
assembly.get_contig_lengths
time : 0:00:00.441404
assembly.get_contig_gc_content
time : 0:00:00.329660
['misc_RNA', 'gene', 'exon', 'mRNA', 'CDS']
time : 0:00:00.488962
['misc_RNA', 'gene', 'mRNA', 'CDS']
genome_annotation.get_feature_types
genome_annotation.get_feature_type_counts
time : 0:00:03.323271
Total time to gather data : 0:00:10.920809
Total time to prep data before plotting : 0:00:48.141166



In [7]:

    
#load for second genome
start_fetch = datetime.datetime.utcnow()    
overviewRefSeq = get_genome_summary(annotationRefSeq)
end_fetch = datetime.datetime.utcnow()

print "Total time to gather data : {0}".format(end_fetch - start_fetch)

start_prep = datetime.datetime.utcnow()

contig_idsRefSeq = [overviewRefSeq["assembly"]["contig_length"].keys()]
                            
#contig_idsENSEMBL = [sorted(overviewENSEMBL["assembly"]["contig_length"].keys(), 
#                    cmp=lambda x,y: cmp(x.rsplit(".",1)[1],y.rsplit(".",1)[1]))]

contig_idsRefSeq = overviewRefSeq["assembly"]["contig_length"].keys()
contig_len_valuesRefSeq = [overviewRefSeq["assembly"]["contig_length"][c] for c in contig_idsRefSeq]
contig_gc_valuesRefSeq = [100.0 * overviewRefSeq["assembly"]["contig_gc_content"][c] for c in contig_idsRefSeq]

count = 0
start = 0
length = 1E10

regionsRefSeq = [{"contig_id": c, "start": start, "length": length, "strand": "?"} for c in contig_idsRefSeq]

feat_dataRefSeq = annotationRefSeq.get_feature_ids(group_by="region")

feature_countsRefSeq = []
for c in contig_idsRefSeq:
    count = 0
    for strand in feat_dataRefSeq["by_region"][c]:
        for range in feat_dataRefSeq["by_region"][c][strand]:
            count += len(feat_dataRefSeq["by_region"][c][strand])

    if count == 0:
        feature_countsRefSeq.append(0.1)
    else:
        feature_countsRefSeq.append(count)

end_prep = datetime.datetime.utcnow()
print "Total time to prep data : {0}".format(end_prep - start_prep)









    



genome_annotation.get_taxon 187/1001255/1
time : 0:00:02.233959
genome_annotation.get_assembly
time : 0:00:02.848240
assembly.get_number_contigs
time : 0:00:00.646516
assembly.get_contig_lengths
time : 0:00:00.580673
assembly.get_contig_gc_content
time : 0:00:00.477618
['repeat_unit', 'gene', 'rRNA', 'tRNA', 'intron', 'gap', 'repeat_region', 'exon', 'mRNA', 'CDS', 'regulatory', 'misc_RNA', 'ncRNA']
time : 0:00:00.753039
['repeat_unit', 'gene', 'rRNA', 'tRNA', 'intron', 'gap', 'repeat_region', 'exon', 'mRNA', 'CDS', 'misc_RNA', 'ncRNA']
genome_annotation.get_feature_types
genome_annotation.get_feature_type_counts
time : 0:00:02.251841
Total time to gather data : 0:00:07.558315
Total time to prep data before plotting : 0:00:30.173184



In [13]:

    
overviewRefSeq["assembly"]["contig_length"]









    Out[13]:





{'NC_000932': 154478,
 'NC_001284': 366924,
 'NC_003070': 30427671,
 'NC_003071': 19698289,
 'NC_003074': 23459830,
 'NC_003075': 18585056,
 'NC_003076': 26975502}



In [14]:

    
overviewENSEMBL["assembly"]["contig_length"]









    Out[14]:





{'chromosome:TAIR10:1:1:30427671:1': 30427671,
 'chromosome:TAIR10:2:1:19698289:1': 19698289,
 'chromosome:TAIR10:3:1:23459830:1': 23459830,
 'chromosome:TAIR10:4:1:18585056:1': 18585056,
 'chromosome:TAIR10:5:1:26975502:1': 26975502,
 'chromosome:TAIR10:Mt:1:366924:1': 366924,
 'chromosome:TAIR10:Pt:1:154478:1': 154478}



In [8]:

    
assemblyENSEMBL = annotationENSEMBL.get_assembly()
assemblyapiENSEMBL = AssemblyClientAPI(services["assembly_url"], token, assemblyENSEMBL)

assemblyRefSeq = annotationRefSeq.get_assembly()
assemblyapiRefSeq = AssemblyClientAPI(services["assembly_url"], token, assemblyRefSeq)



#contigsENSEMBL = assemblyapi.get_data_subset(["contigs"])["contigs"]
#for contig in contigsENSEMBL:
#    md5 = contigsENSEMBL[contig]["md5"]
#    print md5



In [9]:

    
lengthsRefSeq = dict((v, k) for k, v in overviewRefSeq["assembly"]["contig_length"].iteritems())

for idENSEMBL in overviewENSEMBL["assembly"]["contig_length"].keys():
    print idENSEMBL +"\t"+ str(overviewENSEMBL["assembly"]["contig_length"][idENSEMBL])
    idRefSeq = lengthsRefSeq.get(overviewENSEMBL["assembly"]["contig_length"][idENSEMBL])
    print idRefSeq +"\t"+ str(overviewENSEMBL["assembly"]["contig_length"][idENSEMBL])
    
    count =0
    start=1
    stop=100000000
 
    regionsRefSeq = []
    contig_addRefSeq = {"contig_id": idRefSeq, "start": start, "length": stop-start, "strand": "?"}
    #print contig_addRefSeq
    regionsRefSeq.append(contig_addRefSeq)
    feat_dataRefSeq = annotationRefSeq.get_feature_ids(filters={"region_list":regionsRefSeq})
    #print feat_data
    num_gene_RefSeq = len(feat_dataRefSeq['by_type']['gene'])
    
    regionsENSEMBL = []
    contig_addENSEMBL = {"contig_id": idENSEMBL, "start": start, "length": stop-start, "strand": "?"}
    #print contig_addENSEMBL
    regionsENSEMBL.append(contig_addENSEMBL)
    feat_dataENSEMBL = annotationENSEMBL.get_feature_ids(filters={"region_list":regionsENSEMBL})
    #print feat_data
    num_gene_ENSEMBL = len(feat_dataENSEMBL['by_type']['gene'])\
    
    print "** "+idRefSeq+"\t"+str(num_gene_RefSeq) +"\t" + idENSEMBL+"\t"+str(num_gene_ENSEMBL)









    



chromosome:TAIR10:3:1:23459830:1	23459830
NC_003074	23459830
** NC_003074	3318	chromosome:TAIR10:3:1:23459830:1	3318
chromosome:TAIR10:5:1:26975502:1	26975502
NC_003076	26975502
** NC_003076	3775	chromosome:TAIR10:5:1:26975502:1	3775
chromosome:TAIR10:4:1:18585056:1	18585056
NC_003075	18585056
** NC_003075	2556	chromosome:TAIR10:4:1:18585056:1	2556
chromosome:TAIR10:Mt:1:366924:1	366924
NC_001284	366924
** NC_001284	59	chromosome:TAIR10:Mt:1:366924:1	59
chromosome:TAIR10:2:1:19698289:1	19698289
NC_003071	19698289
** NC_003071	2795	chromosome:TAIR10:2:1:19698289:1	2795
chromosome:TAIR10:Pt:1:154478:1	154478
NC_000932	154478
** NC_000932	53	chromosome:TAIR10:Pt:1:154478:1	55
chromosome:TAIR10:1:1:30427671:1	30427671
NC_003070	30427671
** NC_003070	4316	chromosome:TAIR10:1:1:30427671:1	4316



In [10]:

    
3318+3775+2556+59+2795+53+4316









    Out[10]:





16872



In [ ]: