In [1]:
# Basic setup for displaying bokeh plots in jupyter.
from bokeh.plotting import figure 
from bokeh.io import output_notebook, show
output_notebook()


Loading BokehJS ...

In [2]:
# NumPy imports.
import numpy as np
import cPickle as pickle
import os
cwd = os.getcwd()
print(cwd)


/Users/kevin/dev/nml/paper_platform/scripts

In [3]:
# Load parsed data (NumPy array).
array = pickle.load(open('2018-03-27-10-49-30_array.p', 'rb'))
print(array.shape)
x_scaled = [float(i)//1000.0 for i in array[0]]


(2, 22052)

In [4]:
# Plot.
p = figure(width=500,
           height=500,
           x_axis_label="Total # of Targets Retrieved per Analysis",
           y_axis_label="Runtime (seconds)",
           title="C. Overall Runtimes for Fisher's Analysis",
           hidpi=True,
           )
p.axis.major_label_text_font_size="12pt"
p.axis.axis_label_text_font='12pt'
p.circle(array[0],array[1], size=1, color="darkslateblue", alpha=0.5)
show(p)



In [5]:
# Load raw data (list)
raws = pickle.load(open('2018-03-27-10-49-30_raws.p', 'rb'))
print(type(raws))
print(raws[0])


<type 'list'>
(7, 97, 106, u'4.0\n', u'H29', u'H28', 'https://www.github.com/superphy#AntimicrobialResistanceGene')

In [6]:
# Check average number of targets.
import scipy.stats as st
l = [i[2] for i in raws]
print(np.median(l))
ci = st.t.interval(0.95, len(l)-1, loc=np.mean(l), scale=st.sem(l))
print(ci)


185.5
(403.24203191260204, 411.6263700464039)

In [7]:
# Plot a histogram of the targets.
x = np.array(l)
hist, edges = np.histogram(x, density=True, bins=50)

p1 = figure(title="Histogram of Targets",tools="save",)

p1.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
        fill_color="#036564", line_color="#033649")

show(p1)



In [8]:
# Find the CI for the lower end of the distribution.
l = [i[2] for i in raws if i[2] <= 200]
ci_lower = st.t.interval(0.95, len(l)-1, loc=np.mean(l), scale=st.sem(l))
print(ci_lower)


(106.82540724832967, 107.60894168166669)

In [9]:
# Check size of dataset we would have.
l_lower = [i[2] for i in raws if ci_lower[0] <= i[2] <= ci_lower[1]]
print(len(l_lower))


180

In [10]:
# Create an np.array to see how runtime varies depending on group size, while holding target size between the lower CI.
lx = []
ly = []
for i in raws:
    # Only sample from within this CI.
    if ci_lower[0] <= i[2] <= ci_lower[1]:
        lx.append(i[0] + i[1])
        ly.append(i[3])
a_lower = np.array([lx,ly])

In [19]:
# Plot.
p_holdtargets = figure(width=500,
           height=500,
           x_axis_label="Total # of Genomes per Analysis",
           y_axis_label="Runtime (seconds)",
           title="A. Runtimes when 107 Targets Retrieved for Differing # of Genomes",
           hidpi=True)
p_holdtargets.axis.major_label_text_font_size="12pt"
p_holdtargets.axis.axis_label_text_font='12pt'
p_holdtargets.circle(a_lower[0],a_lower[1], size=4, color="darkslateblue", alpha=0.5)
show(p_holdtargets)



In [12]:
# Check average number of genomes in query groups.
l = [i[0] + i[1] for i in raws]
print(np.median(l))
ci_queries = st.t.interval(0.95, len(l)-1, loc=np.mean(l), scale=st.sem(l))
print(ci_queries)


83.0
(217.6447531479244, 226.43469542816848)

In [13]:
# Plot a histogram of the number of genomes in query groups.
x = np.array(l)
hist, edges = np.histogram(x, density=True, bins=50)

p1 = figure(title="Histogram of Queries",tools="save",)

p1.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
        fill_color="#036564", line_color="#033649")

show(p1)



In [14]:
# Find CI for a Group Size <500.
l = [i[0] + i[1] for i in raws if (i[0]+i[1])<500]
print(np.median(l))
ci_queries_lower = st.t.interval(0.95, len(l)-1, loc=np.mean(l), scale=st.sem(l))
print(ci_queries_lower)


65.0
(113.47491570341955, 116.60101164372804)

In [15]:
# Check size of dataset we would have.
l_queries_lower = [i[2] for i in raws if ci_queries_lower[0] <= i[2] <= ci_queries_lower[1]]
print(len(l_queries_lower))


506

In [16]:
# Create an np.array to see how runtime varies depending on target size, while holding target size of genomes between the CI.
lx = []
ly = []
for i in raws:
    # Only sample from within this CI.
    if ci_queries_lower[0] <= i[2] <= ci_queries_lower[1]:
        lx.append(i[0] + i[1])
        ly.append(i[3])
b_lower = np.array([lx,ly])

In [20]:
# Plot.
p_holdgenomes = figure(width=500,
           height=500,
           x_axis_label="# Targets Retrieved per Genome",
           y_axis_label="Runtime (seconds)",
           title="B. Runtimes When Differing Target Sizes Retrieved For 115 Genomes",hidpi=True)
p_holdgenomes.axis.major_label_text_font_size="12pt"
p_holdgenomes.axis.axis_label_text_font='12pt'
p_holdgenomes.circle(b_lower[0],b_lower[1], size=4, color="darkslateblue", alpha=0.5)
show(p_holdgenomes)



In [18]:
from bokeh.layouts import gridplot
show(gridplot(p_holdtargets,p_holdgenomes,p, ncols=3, plot_width=400, plot_height=400))



In [ ]: