In [1]:
# Basic setup for displaying bokeh plots in jupyter.
# `figure` creates each plot and `show` renders it; both are used by the
# plotting cells further down.
from bokeh.plotting import figure
from bokeh.io import output_notebook, show
# Direct bokeh output to the notebook so that later show() calls display inline.
output_notebook()
In [2]:
# General-purpose imports: NumPy, pickle and os.
import numpy as np
try:
    # Python 2: C-accelerated pickle implementation.
    import cPickle as pickle
except ImportError:
    # Python 3 has no cPickle module; the standard pickle module replaces it.
    import pickle
import os

# Print the working directory: the pickle files loaded below are opened by
# relative path, so they are resolved against this directory.
cwd = os.getcwd()
print(cwd)
In [3]:
# Load parsed data (NumPy array).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The context manager closes the file handle as soon as loading finishes
# (the original left the handle open).
with open('2018-03-27-10-49-30_array.p', 'rb') as array_file:
    array = pickle.load(array_file)
print(array.shape)
# array[0] divided by 1000. True division is used: the values are explicitly
# cast to float, and floor division (//) would throw the fractional part away.
# NOTE(review): x_scaled is not referenced by any other cell visible here.
x_scaled = [float(i) / 1000.0 for i in array[0]]
In [4]:
# Plot array[1] against array[0] (labelled below as runtime vs. total targets).
p = figure(
    width=500,
    height=500,
    x_axis_label="Total # of Targets Retrieved per Analysis",
    y_axis_label="Runtime (seconds)",
    title="C. Overall Runtimes for Fisher's Analysis",
    hidpi=True,
)
p.axis.major_label_text_font_size = "12pt"
# "12pt" is a size, so it must go on the *_font_size property. The original
# assigned it to axis_label_text_font, which names the typeface instead.
p.axis.axis_label_text_font_size = "12pt"
# Very small, semi-transparent markers so overlapping points remain visible.
p.circle(array[0], array[1], size=1, color="darkslateblue", alpha=0.5)
show(p)
In [5]:
# Load raw data (list).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The context manager closes the file handle as soon as loading finishes
# (the original left the handle open).
with open('2018-03-27-10-49-30_raws.p', 'rb') as raws_file:
    raws = pickle.load(raws_file)
# Show the container type and the first record to confirm the record layout.
print(type(raws))
print(raws[0])
In [6]:
# Summarise the number of targets per record: print the median, then a 95%
# Student-t confidence interval for the mean.
import scipy.stats as st
# Element 2 of every raw record is treated as its target count.
l = [record[2] for record in raws]
print(np.median(l))
ci = st.t.interval(
    0.95,
    len(l)-1,          # degrees of freedom
    loc=np.mean(l),    # centre the interval on the sample mean
    scale=st.sem(l),   # standard error of the mean
)
print(ci)
In [7]:
# Plot a histogram of the targets.
# The values are rebuilt from `raws` instead of read from the shared name `l`:
# other cells rebind `l` to different data, so relying on it would draw the
# wrong histogram whenever cells are run out of order.
x = np.array([i[2] for i in raws])
# density=True normalises the bar heights; 50 equal-width bins.
hist, edges = np.histogram(x, density=True, bins=50)
p1 = figure(title="Histogram of Targets", tools="save")
# One rectangle per bin: edges[:-1] / edges[1:] are the left / right borders.
p1.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
        fill_color="#036564", line_color="#033649")
show(p1)
In [8]:
# Confidence interval for the lower end of the target distribution: keep only
# records whose target count (element 2) is at most 200, then compute the 95%
# Student-t interval for the mean of that subset.
l = []
for record in raws:
    if record[2] <= 200:
        l.append(record[2])
ci_lower = st.t.interval(
    0.95,
    len(l)-1,
    loc=np.mean(l),
    scale=st.sem(l),
)
print(ci_lower)
In [9]:
# Count the records whose target count falls inside ci_lower (bounds
# inclusive), i.e. how many points the fixed-target subset would contain.
l_lower = []
for record in raws:
    n_targets = record[2]
    if ci_lower[0] <= n_targets <= ci_lower[1]:
        l_lower.append(n_targets)
print(len(l_lower))
In [10]:
# Build a two-row array for studying runtime against group size while the
# target count is held inside ci_lower.
#   row 0 (lx): element 0 + element 1 of each record (the group size)
#   row 1 (ly): element 3 of each record (plotted later as runtime)
# Only records whose target count (element 2) lies within the CI are kept.
in_target_ci = [i for i in raws if ci_lower[0] <= i[2] <= ci_lower[1]]
lx = [i[0] + i[1] for i in in_target_ci]
ly = [i[3] for i in in_target_ci]
a_lower = np.array([lx, ly])
In [19]:
# Plot runtime against group size for the fixed-target subset (a_lower).
p_holdtargets = figure(
    width=500,
    height=500,
    x_axis_label="Total # of Genomes per Analysis",
    y_axis_label="Runtime (seconds)",
    title="A. Runtimes when 107 Targets Retrieved for Differing # of Genomes",
    hidpi=True,
)
p_holdtargets.axis.major_label_text_font_size = "12pt"
# "12pt" is a size, so it must go on the *_font_size property. The original
# assigned it to axis_label_text_font, which names the typeface instead.
p_holdtargets.axis.axis_label_text_font_size = "12pt"
# a_lower[0] holds the x-values and a_lower[1] the y-values.
p_holdtargets.circle(a_lower[0], a_lower[1], size=4, color="darkslateblue", alpha=0.5)
show(p_holdtargets)
In [12]:
# Summarise the group size per record (element 0 + element 1, described in
# this notebook as the number of genomes in the query groups): print the
# median, then a 95% Student-t confidence interval for the mean.
l = [record[0] + record[1] for record in raws]
print(np.median(l))
ci_queries = st.t.interval(
    0.95,
    len(l)-1,          # degrees of freedom
    loc=np.mean(l),    # centre the interval on the sample mean
    scale=st.sem(l),   # standard error of the mean
)
print(ci_queries)
In [13]:
# Plot a histogram of the number of genomes in query groups.
# The values are rebuilt from `raws` instead of read from the shared name `l`:
# other cells rebind `l` to different data, so relying on it would draw the
# wrong histogram whenever cells are run out of order.
x = np.array([i[0] + i[1] for i in raws])
# density=True normalises the bar heights; 50 equal-width bins.
hist, edges = np.histogram(x, density=True, bins=50)
p1 = figure(title="Histogram of Queries", tools="save")
# One rectangle per bin: edges[:-1] / edges[1:] are the left / right borders.
p1.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
        fill_color="#036564", line_color="#033649")
show(p1)
In [14]:
# Confidence interval for group sizes below 500: keep only records whose
# group size (element 0 + element 1) is under 500, print the median of that
# subset, then compute the 95% Student-t interval for its mean.
l = []
for record in raws:
    group_size = record[0] + record[1]
    if group_size < 500:
        l.append(group_size)
print(np.median(l))
ci_queries_lower = st.t.interval(
    0.95,
    len(l)-1,
    loc=np.mean(l),
    scale=st.sem(l),
)
print(ci_queries_lower)
In [15]:
# Check size of dataset we would have when the group size is held inside
# ci_queries_lower. That CI was computed on group size (i[0] + i[1]), so group
# size is the value that must be tested; the original compared the target
# count i[2] against it. The collected values are the target counts of the
# records that pass the filter.
l_queries_lower = [
    i[2]
    for i in raws
    if ci_queries_lower[0] <= i[0] + i[1] <= ci_queries_lower[1]
]
print(len(l_queries_lower))
In [16]:
# Create an np.array to see how runtime varies depending on target size, while
# holding the number of genomes (group size) inside ci_queries_lower.
#   row 0 (lx): element 2 of each record (the target count)
#   row 1 (ly): element 3 of each record (plotted later as runtime)
# Fix: the original was copied from the fixed-target cell and still filtered on
# the target count and stored group size as x, so it never held genomes fixed.
lx = []
ly = []
for i in raws:
    # Only sample records whose group size lies within this CI.
    if ci_queries_lower[0] <= i[0] + i[1] <= ci_queries_lower[1]:
        lx.append(i[2])
        ly.append(i[3])
b_lower = np.array([lx, ly])
In [20]:
# Plot runtime against the fixed-genome subset (b_lower).
p_holdgenomes = figure(
    width=500,
    height=500,
    x_axis_label="# Targets Retrieved per Genome",
    y_axis_label="Runtime (seconds)",
    title="B. Runtimes When Differing Target Sizes Retrieved For 115 Genomes",
    hidpi=True,
)
p_holdgenomes.axis.major_label_text_font_size = "12pt"
# "12pt" is a size, so it must go on the *_font_size property. The original
# assigned it to axis_label_text_font, which names the typeface instead.
p_holdgenomes.axis.axis_label_text_font_size = "12pt"
# b_lower[0] holds the x-values and b_lower[1] the y-values.
p_holdgenomes.circle(b_lower[0], b_lower[1], size=4, color="darkslateblue", alpha=0.5)
show(p_holdgenomes)
In [18]:
# Show the three figures side by side in a single row.
from bokeh.layouts import gridplot
# gridplot takes its plots as one list (flat when ncols is given). Passing them
# as separate positional arguments would bind the 2nd and 3rd plots to other
# parameters of the documented signature.
show(gridplot([p_holdtargets, p_holdgenomes, p], ncols=3, plot_width=400, plot_height=400))
In [ ]: