In [ ]:
using DataFrames
using DataFramesMeta
# Root directory under which the benchmark and count files are looked up.
basedir = "data/"

"""
    atof(x)

Parse `x` as a `Float64`.
"""
function atof(x)
    return parse(Float64, x)
end

"""
    atoi(x)

Parse `x` as a floating-point number via `atof`, then convert it to `Int`.
The conversion throws if the parsed value is not a whole number.
"""
function atoi(x)
    value = atof(x)
    return Int(value)
end
In [ ]:
"""
    extracttime(path, tool)

Read one benchmark measurement file and describe it.

The file name (directory and extension stripped) identifies the measurement.
For `tool == "kwip"` it must look like `<set>-ip` or `<set>-wip`; for any other
tool the whole name is the set and the metric is `"mash"`. The project is
`"rice"` when the set name starts with `"3krice"`, otherwise `"chlamy"`.

Returns the tuple `(proj, set, metric, time)`, where `time` is the entry in
row 2, column 1 of the file as parsed by `readdlm`.

Throws an `ArgumentError` when a kwip file name does not match the expected
pattern.
"""
function extracttime(path, tool)
    fname = splitext(basename(path))[1]
    if tool == "kwip"
        m = match(r"^(?<set>\w+)-(?<metric>w?ip)$", fname)
        # Without this guard a non-matching name fails later with an opaque
        # error about indexing `nothing`, which does not name the file.
        if m === nothing
            throw(ArgumentError(
                "cannot parse kwip benchmark file name '$fname' (from $path); " *
                "expected <set>-ip or <set>-wip"))
        end
        # Regex captures are SubStrings; convert so both branches yield Strings.
        set = String(m[:set])
        metric = String(m[:metric])
    else
        set = String(fname)
        metric = "mash"
    end
    proj = startswith(set, "3krice") ? "rice" : "chlamy"
    elapsed = readdlm(path)[2, 1]
    return (proj, set, metric, elapsed)
end
"""
    calctiming()

Summarise benchmark run times per tool, metric and project.

Every entry of `basedir/benchmarks/mash` and `basedir/benchmarks/kwip` is
passed to `extracttime`. The measurements are then grouped by
`(tool, metric, proj)` and reduced to the columns `time_mean` and `time_sd`.

Throws an error when no measurement files are found.
"""
function calctiming()
    benchdir = joinpath(basedir, "benchmarks")
    frames = DataFrame[]
    for tool in ["mash", "kwip"]
        # joinpath avoids the doubled separator that "$basedir/..." produces
        # when basedir already ends in "/".
        tooldir = joinpath(benchdir, tool)
        for measurement in readdir(tooldir)
            path = joinpath(tooldir, measurement)
            proj, set, metric, time = extracttime(path, tool)
            push!(frames, DataFrame(proj=proj, tool=tool, metric=metric, set=set, time=time))
        end
    end
    # Fail with a clear message instead of an obscure error from vcat()/by.
    isempty(frames) && error("no benchmark measurements found under $benchdir")
    timing = vcat(frames...)
    return by(timing, [:tool, :metric, :proj]) do df
        DataFrame(time_mean=mean(df[:time]),
                  time_sd=std(df[:time]))
    end
end
In [ ]:
"""
    readsamp(path)

Load the table at `path` with `readtable` and return `(name, reads, kmers)`
taken from its first row. `name` is the `ht_name` entry up to the first `"."`,
`reads` is the `num_reads` entry and `kmers` is the `num_kmers` entry.
"""
function readsamp(path)
    table = readtable(path)
    # Only the first row of the table is consulted.
    htname = table[1, :ht_name]
    name = first(split(htname, "."))
    return (name, table[1, :num_reads], table[1, :num_kmers])
end
"""
    calccov()

Summarise read and k-mer counts per project.

Each sub-directory of `basedir/counts` is treated as one project. Every entry
in it whose name ends in `tsv` is read with `readsamp`. The samples are then
grouped by `proj` and reduced to the columns `nread_mean`, `nread_sd`,
`nkmer_mean` and `nkmer_sd`.

Throws an error when no sample tables are found.
"""
function calccov()
    countsdir = joinpath(basedir, "counts")
    frames = DataFrame[]
    for proj in readdir(countsdir)
        # joinpath avoids the doubled separator that "$basedir/..." produces
        # when basedir already ends in "/".
        projdir = joinpath(countsdir, proj)
        # Stray files next to the project directories would make readdir throw.
        isdir(projdir) || continue
        for sample in filter(x -> endswith(x, "tsv"), readdir(projdir))
            name, reads, kmers = readsamp(joinpath(projdir, sample))
            push!(frames, DataFrame(proj=proj, name=name, nreads=reads, nkmers=kmers))
        end
    end
    # Fail with a clear message instead of an obscure error from vcat()/by.
    isempty(frames) && error("no sample count tables found under $countsdir")
    coverage = vcat(frames...)
    return by(coverage, :proj) do df
        DataFrame(
            nread_mean = mean(df[:nreads]),
            nread_sd = std(df[:nreads]),
            nkmer_mean = mean(df[:nkmers]),
            nkmer_sd = std(df[:nkmers]),
        )
    end
end
In [ ]:
# Per-project read / k-mer count summary.
cvr = calccov()
In [ ]:
# Per tool / metric / project timing summary.
timing = calctiming()
In [ ]:
# Left join on :proj, reusing the two summaries computed above instead of
# re-reading every input file; requires the two preceding cells to have run.
data = join(timing, cvr, on=[:proj], kind=:left)
In [ ]:
# Write the combined table to disk.
writetable("computational-performance.tsv", data)