In this session, we're going to analyze the results of the small experiment to determine which method and distance metric perform best, so we can use only those in the larger experiment. A bit of explanation: in this small experiment, we didn't vary the number of normal posts, the number of OOT posts, the number of posts in the top list, or the features. The reason is that we think a good method and distance metric should be able to detect OOT posts regardless of the domain, so the forum variation is more important here. That's why we varied the threads instead and left the other settings fixed.
Import our toolkits and display matplotlib plots inline.
In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
Read our experiment result.
In [2]:
df = pd.read_hdf('../reports/small-exp.h5', 'df')
df
Out[2]:
Let's see how well each method performed. We need to compute the average baseline and the performance distribution of each method over all features, distance metrics, and threads.
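To illustrate what `groupby(level=...)` does here, below is a minimal sketch on made-up data (the method and thread names are placeholders, not the real experiment's index): averaging a MultiIndex-ed frame over one index level collapses all the other levels.

```python
import pandas as pd

# Hypothetical (method, thread) index -- not the real experiment data
index = pd.MultiIndex.from_product(
    [['txt_comp_dist', 'clust_dist'], ['thread1', 'thread2']],
    names=['method', 'thread'])
scores = pd.DataFrame({'perf': [0.8, 0.6, 0.5, 0.7]}, index=index)

# Keep one row per method: the mean over all threads
avg = scores.groupby(level='method').mean()
print(avg)
```

Each method's row in `avg` is the mean of its two thread rows, which is exactly the averaging applied to `df` below.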
In [3]:
df2 = df.groupby(level='method').mean()
In [4]:
df2
Out[4]:
We don't actually need the baseline, so let's remove it.
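As a small aside, dropping one label from a single level of a column MultiIndex works like this sketch (the `feature`/`result` column names mirror the experiment frame, but the data is made up):

```python
import pandas as pd

# Hypothetical columns with a 'result' level holding 'base' and 'perf'
cols = pd.MultiIndex.from_product(
    [['f1'], ['base', 'perf']], names=['feature', 'result'])
toy = pd.DataFrame([[0.2, 0.9]], columns=cols)

# Remove every 'base' column, whatever feature it belongs to
trimmed = toy.drop('base', axis=1, level='result')
print(trimmed.columns.get_level_values('result').tolist())  # ['perf']
```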
In [5]:
df3 = df2.drop('base', axis=1, level='result')
In [6]:
df3
Out[6]:
In [7]:
df3.T.plot(kind='bar', subplots=True, ylim=(0.,1.))
Out[7]:
From the plot, we can see that txt_comp_dist
outperformed the other two methods, since its probability distribution is more negatively skewed. To make this clearer, let's compute the expected value of each method's distribution.
In [8]:
df3
Out[8]:
In [9]:
df4 = df3 * np.arange(4)
In [10]:
df4
Out[10]:
In [11]:
df4.sum(axis=1, level='result')
Out[11]:
We see that txt_comp_dist
is indeed superior to the others.
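The expected-value trick used above (multiplying by `np.arange(4)` and summing) can be sanity-checked on a toy distribution over four performance ranks; the probabilities here are invented for illustration:

```python
import numpy as np
import pandas as pd

# A hypothetical distribution over ranks 0..3 for one method
probs = pd.DataFrame([[0.1, 0.2, 0.3, 0.4]],
                     index=['txt_comp_dist'],
                     columns=range(4))

# E[X] = sum_k k * P(X = k)
expected = (probs * np.arange(4)).sum(axis=1)
print(expected)  # 0*0.1 + 1*0.2 + 2*0.3 + 3*0.4 = 2.0
```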
Now, let's see which distance metric is best. Again, we have to compute the performance distribution of each distance metric over all methods, features, and threads. Let's do it. And don't forget to remove the baseline. (I'm getting better at this, yay!)
In [12]:
df5 = df.groupby(level='metric').mean().drop('base', axis=1, level='result')
In [13]:
df5
Out[13]:
In [14]:
df5.T.plot(kind='bar', subplots=True, ylim=(0.,1.))
Out[14]:
The differences are very subtle. Let's compute the expected values instead.
In [15]:
df6 = df5 * np.arange(4)
In [16]:
df6
Out[16]:
In [17]:
df6.sum(axis=1, level='result')
Out[17]:
Although they don't differ much, we can still conclude that euclidean
is better.
OK. Now we can safely use only the txt_comp_dist
method with the euclidean
distance metric in our larger experiment. Yay!