In [1]:
from glob import glob
import itertools as it
import os.path
import statistics
import numpy as np
import pandas as pd
from otdet.util import pick
from otdet.feature_extraction import CountVectorizerWrapper
from run_experiment import shorten
In [2]:
# Threads supplying "normal" (on-topic) posts — one directory per forum
# thread, drawn from three different discussion boards.
norm_dirs = ['./datasets/bbs-arch/152930__Pantheon-Shell-on-Archlinux/',
'./datasets/physicsforums/17301__what-is-nothing',
'./datasets/musicboards/1139__Is-it-just-me-or-does-all-the-music-on-the-radio-today-suck']
# Threads supplying out-of-topic (OOT) posts to inject into the normal threads.
oot_dirs = ['./datasets/bbs-arch/57549__The-dwm-thread',
'./datasets/musicboards/10142__Lyrics-vs-Music',
'./datasets/physicsforums/40008__altering-the-speed-of-light']
In [3]:
def read_contents(files, encoding=None):
    """Lazily yield the full text of each file in *files*.

    Parameters
    ----------
    files : iterable of str
        Paths of the text files to read.
    encoding : str, optional
        Text encoding passed to ``open``. Defaults to ``None`` (the
        platform default), which preserves the original behavior.

    Yields
    ------
    str
        The complete contents of each file, in input order.
    """
    for path in files:
        # Context manager guarantees the handle is closed even if the
        # consumer abandons the generator mid-iteration.
        with open(path, encoding=encoding) as fobj:
            yield fobj.read()
In [4]:
# Parameter grid swept by the experiment cell below.
num_norms = [10, 80]  # how many normal posts to sample per trial
num_oots = [1, 4, 8]  # how many out-of-topic posts to inject per trial
In [5]:
# Measure vocabulary size for every (num_norm, num_oot, norm_dir, oot_dir)
# combination. Each combination stores a (mean, stdev) pair, flattened
# row-major so the reshape in the DataFrame cell lines up with the
# MultiIndexes built below.
#
# NOTE(review): size was previously hard-coded as 2*3*3*3*2; deriving it
# from the grids keeps this cell in sync if the parameter lists change.
param_grid = list(it.product(num_norms, num_oots, norm_dirs, oot_dirs))
data = np.empty(2 * len(param_grid))
N_REPS = 30  # repetitions per combination to estimate the spread
for i, (num_norm, num_oot, norm_dir, oot_dir) in enumerate(param_grid):
    counts = []
    for _ in range(N_REPS):
        # Normal posts are picked deterministically; OOT posts at random,
        # so the repetitions vary only in the injected OOT sample.
        norm_files = pick(glob(os.path.join(norm_dir, '*.txt')),
                          k=num_norm, randomized=False)
        oot_files = pick(glob(os.path.join(oot_dir, '*.txt')),
                         k=num_oot, randomized=True)
        all_docs = list(read_contents(norm_files)) + list(read_contents(oot_files))
        vec = CountVectorizerWrapper(input='content', stop_words='english')
        vec.fit(all_docs)
        # Vocabulary size = number of distinct non-stopword tokens.
        counts.append(len(vec.vocabulary_))
    data[2*i] = statistics.mean(counts)
    data[2*i+1] = statistics.stdev(counts)
In [6]:
# Abbreviated thread names, used as readable DataFrame column labels.
short_norm_dirs = list(map(shorten, norm_dirs))
short_oot_dirs = list(map(shorten, oot_dirs))
In [7]:
# Row index: one row per (num_norm, num_oot) pair.
index = pd.MultiIndex.from_product(
    [num_norms, num_oots],
    names=['num_norm', 'num_oot'],
)
# Column index: (norm thread, OOT thread, statistic) triples, matching the
# flattening order used when filling `data`.
columns = pd.MultiIndex.from_product(
    [short_norm_dirs, short_oot_dirs, ['mean', 'stdev']],
    names=['norm_dir', 'oot_dir', 'value'],
)
In [8]:
result = pd.DataFrame(data.reshape((2*3,3*3*2)), index=index, columns=columns)
In [9]:
result
Out[9]: