In [ ]:
using DataFrames
using DataFramesMeta

# Root directory under which the `benchmarks/` and `counts/` inputs are looked up.
basedir = "data/"

"""
    atof(x)

Parse `x` as a `Float64`.
"""
function atof(x)
    return parse(Float64, x)
end

"""
    atoi(x)

Parse `x` as a `Float64` (via `atof`) and convert the result to `Int`.

Going through `Float64` accepts inputs such as `"3e2"`; a value with a fractional
part makes the `Int` conversion throw an `InexactError`.
"""
function atoi(x)
    value = atof(x)
    return Int(value)
end

In [ ]:
"""
    extracttime(path, tool)

Extract one timing measurement from the benchmark file at `path`.

The file name (directory and extension stripped) identifies the measurement:

- for `tool == "kwip"` it must have the form `<set>-ip` or `<set>-wip`; the part
  before the hyphen is the set and the suffix is the metric;
- for any other `tool` the whole name is the set and the metric is `"mash"`.

The project is `"rice"` when the set name starts with `3krice`, otherwise `"chlamy"`.
The time is the value in row 2, column 1 of the delimited file.

Returns the tuple `(proj, set, metric, time)`.

Throws an `ArgumentError` if a kwip file name does not match the expected form or
if the file has fewer than two rows.
"""
function extracttime(path, tool)
    fname = splitext(basename(path))[1]
    if tool == "kwip"
        m = match(r"^(?<set>\w+)-(?<metric>w?ip)$", fname)
        # `match` yields `nothing` on failure; report the offending file name
        # instead of failing later on `m[:set]`.
        m === nothing && throw(ArgumentError(
            "cannot parse kwip benchmark file name \"$fname\" (expected <set>-ip or <set>-wip)"))
        set = m[:set]
        metric = m[:metric]
    else
        set = fname
        metric = "mash"
    end
    proj = startswith(set, "3krice") ? "rice" : "chlamy"
    table = readdlm(path)
    size(table, 1) >= 2 || throw(ArgumentError(
        "benchmark file \"$path\" has fewer than two rows; no time value at row 2"))
    # Named `elapsed` rather than `time` to avoid shadowing `Base.time`.
    elapsed = table[2, 1]
    return (proj, set, metric, elapsed)
end
        
"""
    calctiming()

Summarise the benchmark timings stored under `basedir/benchmarks/<tool>/` for the
tools `"mash"` and `"kwip"`.

Every entry of each tool directory is parsed with `extracttime`. The measurements
are then grouped by `:tool`, `:metric` and `:proj`, and each group is reduced to
the mean (`time_mean`) and standard deviation (`time_sd`) of its `time` values.

Returns the grouped `DataFrame`. Raises an error if no measurement file is found.
"""
function calctiming()
    benchdir = joinpath(basedir, "benchmarks")
    rows = DataFrame[]
    for tool in ["mash", "kwip"]
        tooldir = joinpath(benchdir, tool)
        for measurement in readdir(tooldir)
            path = joinpath(tooldir, measurement)
            proj, set, metric, elapsed = extracttime(path, tool)
            push!(rows, DataFrame(proj=proj, tool=tool, metric=metric, set=set,
                                  time=elapsed))
        end
    end
    # Without this guard an empty input set reaches `by` as `Any[]` and fails with
    # an unrelated MethodError.
    isempty(rows) && error("no benchmark measurements found under $benchdir")
    timing = vcat(rows...)
    return by(timing, [:tool, :metric, :proj]) do df
        DataFrame(time_mean=mean(df[:time]),
                  time_sd=std(df[:time]))
    end
end

In [ ]:
"""
    readsamp(path)

Read the table at `path` and return `(name, reads, kmers)` taken from its first row.

`name` is the text of the `ht_name` column up to (not including) the first `"."`;
`reads` and `kmers` are the `num_reads` and `num_kmers` columns.
"""
function readsamp(path)
    table = readtable(path)
    # Only the first row of the table is consulted.
    htname = table[1, :ht_name]
    samplename = first(split(htname, "."))
    return (samplename, table[1, :num_reads], table[1, :num_kmers])
end

"""
    calccov()

Summarise per-sample read and k-mer counts for every project directory under
`basedir/counts/`.

Each file whose name ends in `tsv` inside a project directory is read with
`readsamp`. The samples are grouped by `:proj`, and each group is reduced to the
mean and standard deviation of its read counts (`nread_mean`, `nread_sd`) and
k-mer counts (`nkmer_mean`, `nkmer_sd`).

Entries of `counts/` that are not directories are skipped. Returns the grouped
`DataFrame`. Raises an error if no sample table is found.
"""
function calccov()
    countsdir = joinpath(basedir, "counts")
    rows = DataFrame[]
    for proj in readdir(countsdir)
        projdir = joinpath(countsdir, proj)
        # A stray regular file in `counts/` would make `readdir(projdir)` throw.
        isdir(projdir) || continue
        for sample in filter(x -> endswith(x, "tsv"), readdir(projdir))
            name, reads, kmers = readsamp(joinpath(projdir, sample))
            push!(rows, DataFrame(proj=proj, name=name, nreads=reads, nkmers=kmers))
        end
    end
    # Without this guard an empty input set reaches `by` as `Any[]` and fails with
    # an unrelated MethodError.
    isempty(rows) && error("no sample count tables found under $countsdir")
    coverage = vcat(rows...)
    return by(coverage, :proj) do df
        DataFrame(
            nread_mean = mean(df[:nreads]),
            nread_sd = std(df[:nreads]),
            nkmer_mean = mean(df[:nkmers]),
            nkmer_sd = std(df[:nkmers]),
        )
    end
end

In [ ]:
# Per-project read / k-mer count summary.
cvr = calccov()

In [ ]:
# Per-tool, per-metric, per-project timing summary.
timing = calctiming()

In [ ]:
# Left-join the timing summary with the count summary on project. The tables
# computed in the two cells above are reused rather than calling calctiming() and
# calccov() again, which would re-read every input file.
data = join(timing, cvr, on=[:proj], kind=:left)

In [ ]:
# Write the combined table to disk.
writetable("computational-performance.tsv", data)