In [ ]:
using DataFrames
using DataFramesMeta
# Directory that is scanned for per-seed sub-directories.
basedir = "2017-03-30_finalsims-data/"

# Every entry of `basedir` except the one named "stats" is used as a seed name.
seeds = filter(readdir(basedir)) do entry
    entry != "stats"
end
"""
    atof(x)

Parse the string `x` as a `Float64`.
"""
function atof(x)
    return parse(Float64, x)
end

"""
    atoi(x)

Parse the string `x` with `atof` and convert the result to an `Int`, so that
exponent notation such as `"1e3"` is accepted. The `Int` conversion fails for
values that are not whole numbers.
"""
function atoi(x)
    return Int(atof(x))
end
In [ ]:
"""
    extracttime(path, tool)

Parse the benchmark parameters encoded in the file name of `path` and read the
timing value stored inside the file.

The file name (directory and final extension removed) must have the form
`<sz>sz-<cov>x-<var>-<metric>` when `tool == "kwip"`, and
`<sz>sz-<metric>-<cov>x-<var>` for any other tool.

Returns the tuple `(metric, size, cov, var, time)`. The first four entries are
the substrings captured from the file name (they are not converted to numbers
here); `time` is the entry in row 2, column 1 of the delimited file at `path`.

Throws an `ArgumentError` when the file name does not match the pattern
expected for `tool`.
"""
function extracttime(path, tool)
    fname = splitext(basename(path))[1]
    # The two tools order the fields of their benchmark file names differently.
    rexp = if tool == "kwip"
        r"^(?<sz>[e.\d]+)sz-(?<cov>[e.\d]+)x-(?<var>[e.\d]+)-(?<metric>\S+)$"
    else
        r"^(?<sz>[e.\d]+)sz-(?<metric>\S+)-(?<cov>[e.\d]+)x-(?<var>[e.\d]+)$"
    end
    m = match(rexp, fname)
    # `match` returns `nothing` on failure; report the offending file instead
    # of letting the capture lookups below fail with an opaque error.
    if m === nothing
        throw(ArgumentError("unexpected $tool benchmark file name: $path"))
    end
    # The measurement is taken from the second row, first column of the file.
    elapsed = readdlm(path)[2, 1]
    return (m[:metric], m[:sz], m[:cov], m[:var], elapsed)
end
In [ ]:
"""
    readsamp(path)

Load the table stored at `path` and summarise its first row.

Returns `(name, reads, kmers)`, where `name` is the part of the first row's
`ht_name` value before its first `.`, and `reads` / `kmers` are the first
row's `num_reads` and `num_kmers` values.
"""
function readsamp(path)
    table = readtable(path)
    # Keep only the text in front of the first dot of `ht_name`.
    name = first(split(table[1, :ht_name], "."))
    return (name, table[1, :num_reads], table[1, :num_kmers])
end
In [ ]:
"""
    calctiming()

Collect every benchmark measurement found under `bench/mash/` and `bench/kwip/`
of each seed directory and summarise them.

Each measurement file is parsed with `extracttime`. The result has one row per
`(tool, metric, size, cov, var)` combination, with the mean (`time_mean`) and
standard deviation (`time_sd`) of the `time` values in that group.
"""
function calctiming()
    rows = []
    for seed in seeds
        for tool in ["mash", "kwip"]
            benchdir = "$basedir/$seed/bench/$tool/"
            for measurement in readdir(benchdir)
                metric, sz, cv, vr, elapsed =
                    extracttime(string(benchdir, measurement), tool)
                row = DataFrame(tool=tool, metric=metric, seed=seed,
                                size=atof(sz), cov=atof(cv), var=atof(vr),
                                time=elapsed)
                push!(rows, row)
            end
        end
    end
    # Stack the single-row frames into one table before grouping.
    alltimes = vcat(rows...)
    summarise(df) = DataFrame(time_mean=mean(df[:time]),
                              time_sd=std(df[:time]))
    return by(summarise, alltimes, [:tool, :metric, :size, :cov, :var])
end
In [ ]:
"""
    calccov()

Collect the per-sample read and k-mer counts found under `sketches/` of each
seed directory and summarise them.

Every sub-directory name of `sketches/` supplies the `size`, `cov` and `var`
values; every entry in it whose name ends in `tsv` is read with `readsamp`.
The result has one row per `(cov, var, size)` combination, with the mean and
standard deviation of `nreads` (`nread_mean`, `nread_sd`) and of `nkmers`
(`nkmer_mean`, `nkmer_sd`).
"""
function calccov()
    frames = []
    for seed in seeds
        sketchroot = "$basedir/$seed/sketches/"
        for szcvvr in readdir(sketchroot)
            # The directory name carries the three simulation parameters.
            sz, cv, vr =
                match(r"([e.\d]+)sz-([e.\d]+)x-([e.\d]+)", szcvvr).captures
            rundir = string(sketchroot, szcvvr, "/")
            for sample in readdir(rundir)
                # Only the per-sample tables are of interest here.
                endswith(sample, "tsv") || continue
                name, reads, kmers = readsamp(string(rundir, sample))
                row = DataFrame(size=atof(sz), cov=atof(cv), var=atof(vr),
                                name=name, nreads=reads, nkmers=kmers,
                                seed=seed)
                push!(frames, row)
            end
        end
    end
    # Stack the single-row frames into one table before grouping.
    combined = vcat(frames...)
    summarise(df) = DataFrame(nread_mean=mean(df[:nreads]),
                              nread_sd=std(df[:nreads]),
                              nkmer_mean=mean(df[:nkmers]),
                              nkmer_sd=std(df[:nkmers]))
    return by(summarise, combined, [:cov, :var, :size])
end
In [ ]:
# Attach the coverage summary to every timing row; timing rows without a
# matching (size, cov, var) combination are kept (left join).
data = let timings = calctiming(), coverage = calccov()
    join(timings, coverage, on=[:size, :cov, :var], kind=:left)
end
In [ ]:
# Save the joined timing/coverage summary held in `data` to disk.
writetable("2017-03-30_simulation-performance.tsv", data)