In [ ]:
using DataFrames
using DataFramesMeta

# Directory whose entries are read below; the functions in this file look for
# per-seed sub-directories inside it.
basedir = "2017-03-30_finalsims-data/"

# Every entry of `basedir` except the one named "stats" is treated as a seed.
seeds = filter(readdir(basedir)) do entry
    entry != "stats"
end

# Parse a numeric string such as "1e6" or "0.01" into a Float64.
function atof(x)
    return parse(Float64, x)
end

# Parse a numeric string into an Int by going through Float64, so that
# exponent notation such as "1e3" is accepted.
function atoi(x)
    return Int(atof(x))
end

In [ ]:
"""
    extracttime(path, tool)

Decode the run parameters from the file name of `path` and read one value from the file.

The file name (directory and final extension removed) must have the form
`<size>sz-<cov>x-<var>-<metric>` when `tool == "kwip"`, and
`<size>sz-<metric>-<cov>x-<var>` for any other tool.

Returns the tuple `(metric, size, cov, var, time)`. The first four entries are the
matched substrings of the file name (not parsed into numbers); `time` is the cell in
row 2, column 1 of the delimited file at `path`.

Throws an `ArgumentError` when the file name does not match the layout expected for
`tool`.
"""
function extracttime(path, tool)
    fname = splitext(basename(path))[1]
    # kwip file names carry the metric last; all other tools carry it right after the size.
    if tool == "kwip"
        rexp = r"^(?<sz>[e.\d]+)sz-(?<cov>[e.\d]+)x-(?<var>[e.\d]+)-(?<metric>\S+)$"
    else
        rexp = r"^(?<sz>[e.\d]+)sz-(?<metric>\S+)-(?<cov>[e.\d]+)x-(?<var>[e.\d]+)$"
    end
    m = match(rexp, fname)
    # `match` yields `nothing` on failure; report which file and tool were involved
    # instead of failing later on the capture lookup.
    if m === nothing
        throw(ArgumentError(string("cannot parse benchmark file name \"", fname,
                                   "\" for tool \"", tool, "\" (path: ", path, ")")))
    end
    # NOTE(review): row 1 is presumably a header line and row 2 the measurement; confirm
    # against the benchmark file format.
    elapsed = readdlm(path)[2, 1]
    return (m[:metric], m[:sz], m[:cov], m[:var], elapsed)
end

In [ ]:
"""
    readsamp(path)

Load the table stored at `path` and summarise its first row.

Returns `(name, reads, kmers)`: `name` is the part of the `ht_name` column before the
first ".", and `reads` / `kmers` are the `num_reads` / `num_kmers` columns. Only row 1
of the table is consulted.
"""
function readsamp(path)
    tbl = readtable(path)
    row = 1
    # Drop everything from the first "." onwards in the table name.
    name = first(split(tbl[row, :ht_name], "."))
    return (name, tbl[row, :num_reads], tbl[row, :num_kmers])
end

In [ ]:
"""
    calctiming()

Collect every benchmark measurement and summarise the timings.

For each entry of the global `seeds` and each of the tools "mash" and "kwip", every
file in `basedir/seed/bench/tool` is passed to `extracttime`. The measurements are then
grouped by `(tool, metric, size, cov, var)`, so the statistics pool all seeds.

Returns a DataFrame with the grouping columns plus `time_mean` and `time_sd`.

Raises an error when no measurement file is found at all.
"""
function calctiming()
    measurements = DataFrame[]
    for seed in seeds, tool in ["mash", "kwip"]
        # Build the directory once; `joinpath` also avoids the doubled separator that
        # plain interpolation produces when `basedir` already ends in "/".
        benchdir = joinpath(basedir, seed, "bench", tool)
        for measurement in readdir(benchdir)
            path = joinpath(benchdir, measurement)
            m, s, c, v, t = extracttime(path, tool)
            push!(measurements,
                  DataFrame(tool=tool, metric=m, seed=seed, size=atof(s), cov=atof(c),
                            var=atof(v), time=t))
        end
    end
    # Without this guard an empty collection only fails later, inside the grouping call.
    isempty(measurements) && error("no benchmark measurements found under $basedir")
    timing = vcat(measurements...)
    return by(timing, [:tool, :metric, :size, :cov, :var]) do df
        DataFrame(time_mean=mean(df[:time]),
                  time_sd=std(df[:time]))
    end
end

In [ ]:
"""
    calccov()

Collect per-sample read and k-mer counts and summarise them.

For each entry of the global `seeds`, every entry of `basedir/seed/sketches` is expected
to be named `<size>sz-<cov>x-<var>`. Each file in it whose name ends in "tsv" is read
with `readsamp`. The samples are grouped by `(cov, var, size)`, so the statistics pool
all seeds and samples.

Returns a DataFrame with the grouping columns plus `nread_mean`, `nread_sd`,
`nkmer_mean` and `nkmer_sd`.

Throws an `ArgumentError` when an entry of a sketches directory does not follow the
naming scheme, and raises an error when no sample table is found at all.
"""
function calccov()
    samples = DataFrame[]
    for seed in seeds
        sketchroot = joinpath(basedir, seed, "sketches")
        for szcvvr in readdir(sketchroot)
            m = match(r"([e.\d]+)sz-([e.\d]+)x-([e.\d]+)", szcvvr)
            # `match` yields `nothing` on failure; name the offending entry instead of
            # failing on the `captures` lookup.
            if m === nothing
                throw(ArgumentError(string("cannot parse sketch directory name \"",
                                           szcvvr, "\" in ", sketchroot)))
            end
            sz, cv, vr = m.captures
            # The parameters are the same for every sample in the directory: parse once.
            size_val, cov_val, var_val = atof(sz), atof(cv), atof(vr)
            sketchdir = joinpath(sketchroot, szcvvr)
            for sample in filter(x -> endswith(x, "tsv"), readdir(sketchdir))
                name, reads, kmers = readsamp(joinpath(sketchdir, sample))
                push!(samples, DataFrame(size=size_val, cov=cov_val, var=var_val,
                        name=name, nreads=reads, nkmers=kmers, seed=seed))
            end
        end
    end
    # Without this guard an empty collection only fails later, inside the grouping call.
    isempty(samples) && error("no sample tables found under $basedir")
    coverage = vcat(samples...)
    return by(coverage, [:cov, :var, :size]) do df
        DataFrame(
            nread_mean = mean(df[:nreads]),
            nread_sd = std(df[:nreads]),
            nkmer_mean = mean(df[:nkmers]),
            nkmer_sd = std(df[:nkmers]),
        )
    end
end

In [ ]:
# Left join: every timing row is kept, and the coverage statistics sharing the same
# (size, cov, var) are attached to it.
data = join(
    calctiming(),
    calccov(),
    kind=:left,
    on=[:size, :cov, :var],
)

In [ ]:
# Write the joined timing/coverage table held in `data` to disk.
# NOTE(review): the ".tsv" extension suggests tab-separated output; confirm that
# `writetable` chooses the delimiter from the file extension in the DataFrames version used.
writetable("2017-03-30_simulation-performance.tsv", data)