In [1]:
from glob import glob
import itertools as it
import os.path
import statistics
import numpy as np
import pandas as pd
from otdet.util import pick
from otdet.feature_extraction import CountVectorizerWrapper
from run_experiment import shorten
In [2]:
# Threads supplying "normal" (on-topic) posts — one directory per forum
# thread, drawn from three different discussion boards.
norm_dirs = ['./datasets/bbs-arch/152930__Pantheon-Shell-on-Archlinux/',
'./datasets/physicsforums/17301__what-is-nothing',
'./datasets/musicboards/1139__Is-it-just-me-or-does-all-the-music-on-the-radio-today-suck']
# Threads supplying out-of-topic (OOT) posts to inject into the normal threads.
oot_dirs = ['./datasets/bbs-arch/57549__The-dwm-thread',
'./datasets/musicboards/10142__Lyrics-vs-Music',
'./datasets/physicsforums/40008__altering-the-speed-of-light']
In [3]:
def read_contents(files, encoding=None):
    """Lazily yield the full text of each file in *files*.

    Parameters
    ----------
    files : iterable of str
        Paths of the text files to read.
    encoding : str, optional
        Text encoding passed to ``open``. Defaults to ``None`` (the
        platform default), which preserves the original behavior.

    Yields
    ------
    str
        The complete contents of each file, in input order.
    """
    for path in files:
        # Context manager guarantees the handle is closed even if the
        # consumer abandons the generator mid-iteration.
        with open(path, encoding=encoding) as fobj:
            yield fobj.read()
In [4]:
# Parameter grid swept by the experiment cell below.
num_norms = [10, 80]  # how many normal posts to sample per trial
num_oots = [1, 4, 8]  # how many out-of-topic posts to inject per trial
In [5]:
# Measure vocabulary size for every (num_norm, num_oot, norm_dir, oot_dir)
# combination. Each combination stores a (mean, stdev) pair, flattened
# row-major so the reshape in the DataFrame cell lines up with the
# MultiIndexes built below.
#
# NOTE(review): size was previously hard-coded as 2*3*3*3*2; deriving it
# from the grids keeps this cell in sync if the parameter lists change.
param_grid = list(it.product(num_norms, num_oots, norm_dirs, oot_dirs))
data = np.empty(2 * len(param_grid))
N_REPS = 30  # repetitions per combination to estimate the spread
for i, (num_norm, num_oot, norm_dir, oot_dir) in enumerate(param_grid):
    counts = []
    for _ in range(N_REPS):
        # Normal posts are picked deterministically; OOT posts at random,
        # so the repetitions vary only in the injected OOT sample.
        norm_files = pick(glob(os.path.join(norm_dir, '*.txt')),
                          k=num_norm, randomized=False)
        oot_files = pick(glob(os.path.join(oot_dir, '*.txt')),
                         k=num_oot, randomized=True)
        all_docs = list(read_contents(norm_files)) + list(read_contents(oot_files))
        vec = CountVectorizerWrapper(input='content', stop_words='english')
        vec.fit(all_docs)
        # Vocabulary size = number of distinct non-stopword tokens.
        counts.append(len(vec.vocabulary_))
    data[2*i] = statistics.mean(counts)
    data[2*i+1] = statistics.stdev(counts)
In [6]:
# Abbreviated thread names, used as readable DataFrame column labels.
short_norm_dirs = list(map(shorten, norm_dirs))
short_oot_dirs = list(map(shorten, oot_dirs))
In [7]:
# Row index: one row per (num_norm, num_oot) pair.
index = pd.MultiIndex.from_product(
    [num_norms, num_oots],
    names=['num_norm', 'num_oot'],
)
# Column index: (norm thread, OOT thread, statistic) triples, matching the
# flattening order used when filling `data`.
columns = pd.MultiIndex.from_product(
    [short_norm_dirs, short_oot_dirs, ['mean', 'stdev']],
    names=['norm_dir', 'oot_dir', 'value'],
)
In [8]:
result = pd.DataFrame(data.reshape((2*3,3*3*2)), index=index, columns=columns)
In [9]:
result
Out[9]: