In [1]:
from glob import glob
import itertools as it
import os.path
import statistics

import numpy as np
import pandas as pd

from otdet.util import pick
from otdet.feature_extraction import CountVectorizerWrapper
from run_experiment import shorten

In [2]:
# Directories globbed for '*.txt' files; `num_norm` files are picked from a
# norm_dir and `num_oot` files from an oot_dir in every trial.
norm_dirs = [
    './datasets/bbs-arch/152930__Pantheon-Shell-on-Archlinux/',
    './datasets/physicsforums/17301__what-is-nothing',
    './datasets/musicboards/1139__Is-it-just-me-or-does-all-the-music-on-the-radio-today-suck',
]
oot_dirs = [
    './datasets/bbs-arch/57549__The-dwm-thread',
    './datasets/musicboards/10142__Lyrics-vs-Music',
    './datasets/physicsforums/40008__altering-the-speed-of-light',
]

In [3]:
def read_contents(files, encoding=None):
    """Lazily yield the full text of each file in ``files``, in order.

    Parameters
    ----------
    files : iterable of str or path-like
        Paths of the files to read.
    encoding : str, optional
        Text encoding passed to ``open``. The default ``None`` keeps the
        platform's default encoding, which is what this function always
        used before the parameter existed.

    Yields
    ------
    str
        The complete contents of one file per iteration.

    Raises
    ------
    OSError
        If a file cannot be opened or read. Raised when the generator is
        advanced to that file, not when ``read_contents`` is called.
    UnicodeDecodeError
        If a file's bytes are not valid in the selected encoding.
    """
    for path in files:
        # Read and close the file *before* yielding, so no handle is left
        # open while the generator is suspended or if it is never exhausted.
        with open(path, encoding=encoding) as handle:
            text = handle.read()
        yield text

In [4]:
# Values tried for `k` when picking files from a norm_dir in each trial.
num_norms = [10, 80]
# Values tried for `k` when picking files from an oot_dir in each trial.
num_oots = [1, 4, 8]

In [5]:
# Number of repeated draws used to estimate the mean/stdev for each combination.
N_TRIALS = 30

# Every (num_norm, num_oot, norm_dir, oot_dir) combination, in product order.
combos = list(it.product(num_norms, num_oots, norm_dirs, oot_dirs))

# Flat buffer holding one (mean, stdev) pair per combination: slot 2*i is the
# mean and slot 2*i+1 the stdev of combination i. The size is derived from the
# parameter lists so that editing them cannot leave the buffer mis-sized.
data = np.empty(2 * len(combos))

# NOTE(review): no random seed is set in the visible cells; unless pick() seeds
# itself, the randomized draws (and hence the numbers) differ between runs.
for i, (num_norm, num_oot, norm_dir, oot_dir) in enumerate(combos):
    # The directory listings do not depend on the trial, so glob once per
    # combination instead of once per trial.
    norm_paths = glob(os.path.join(norm_dir, '*.txt'))
    oot_paths = glob(os.path.join(oot_dir, '*.txt'))

    counts = []
    for _ in range(N_TRIALS):
        # Hand pick() a fresh copy each time in case it reorders its argument
        # in place (its implementation is not visible from this notebook).
        norm_files = pick(list(norm_paths), k=num_norm, randomized=False)
        oot_files = pick(list(oot_paths), k=num_oot, randomized=True)
        all_docs = list(read_contents(norm_files)) + list(read_contents(oot_files))
        vec = CountVectorizerWrapper(input='content', stop_words='english')
        vec.fit(all_docs)
        # Vocabulary size of the vectorizer fitted on this trial's documents.
        counts.append(len(vec.vocabulary_))

    data[2*i] = statistics.mean(counts)
    data[2*i+1] = statistics.stdev(counts)

In [6]:
# Apply shorten() to every directory path; the results are used as the
# 'norm_dir' / 'oot_dir' column labels of the result table.
short_norm_dirs = list(map(shorten, norm_dirs))
short_oot_dirs = list(map(shorten, oot_dirs))

In [7]:
# Row labels: every (num_norm, num_oot) pair.
index = pd.MultiIndex.from_product(
    iterables=[num_norms, num_oots],
    names=['num_norm', 'num_oot'],
)
# Column labels: every (norm_dir, oot_dir) pair, each split into a 'mean'
# and a 'stdev' column.
columns = pd.MultiIndex.from_product(
    iterables=[short_norm_dirs, short_oot_dirs, ['mean', 'stdev']],
    names=['norm_dir', 'oot_dir', 'value'],
)

In [8]:
# Lay the flat (mean, stdev) buffer out as a table. The shape is taken from the
# row/column labels rather than hard-coded, so it follows the parameter lists.
result = pd.DataFrame(
    data.reshape((len(index), len(columns))),
    index=index,
    columns=columns,
)

In [9]:
# Show the table of per-combination mean and stdev values.
result


Out[9]:
norm_dir bbs152930 phy17301 mus1139
oot_dir bbs57549 mus10142 phy40008 bbs57549 mus10142 phy40008 bbs57549 mus10142 phy40008
value mean stdev mean stdev mean stdev mean stdev mean stdev mean stdev mean stdev mean stdev mean stdev
num_norm num_oot
10 1 163.833333 25.923074 148.033333 5.968211 163.766667 23.088560 144.433333 21.021636 130.966667 11.663333 150.200000 33.415256 166.366667 27.245352 146.033333 12.949460 165.133333 41.612940
4 214.266667 32.880068 178.433333 23.671143 252.366667 71.877522 212.500000 37.771317 157.300000 25.290246 222.633333 60.800115 214.766667 28.600317 168.466667 21.126439 242.966667 61.187943
8 280.233333 36.727030 201.933333 31.139213 361.733333 76.556463 289.933333 44.208700 188.633333 26.185655 324.433333 74.503730 313.400000 50.127837 203.833333 26.166685 342.766667 79.531249
80 1 787.566667 17.415180 780.400000 9.379361 794.866667 31.688933 1194.900000 17.465927 1183.633333 6.599286 1191.200000 16.214936 937.866667 13.581766 926.466667 8.935529 942.900000 26.754761
4 828.166667 31.316057 798.400000 17.592318 859.733333 49.930941 1245.500000 24.627921 1204.266667 18.792760 1231.300000 44.391013 989.533333 32.254092 940.100000 13.360647 994.700000 44.519775
8 867.766667 24.443719 823.733333 20.503882 921.266667 55.613155 1295.733333 45.904649 1225.100000 20.100352 1288.366667 46.830938 1038.666667 34.509702 954.366667 13.335158 1062.266667 47.047982