Okay, the experiment using readability measures as features has finally finished, so let's analyze the results in this session. First, load our toolkits and tell matplotlib to plot inline.
In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
Load the experiment results and take a look.
In [2]:
df = pd.read_hdf('../reports/large-exp-readability-feats.h5', 'df')
df
Out[2]:
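The rendered frame is omitted in this export, so here is a quick, optional sanity check of its structure if you are following along; it only assumes the index levels used later in the analysis (method, feature, metric, result, num_norm, num_oot, num_top, and some per-thread level that gets averaged out below) are present.
print(df.index.names)  # which index levels are available
print(df.shape)        # number of rows and columns
df.head()              # peek at the first few rows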
Group them all by method, feature, and distance metric (averaging over all threads).
In [3]:
df2 = df.groupby(level=['method', 'feature', 'metric']).mean()
df2
Out[3]:
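If grouping by index levels is new to you: passing level=... to groupby and then taking mean() collapses all the other levels, which is what averages over the threads here. A minimal sketch with made-up numbers:
idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)],
                                names=['method', 'thread'])
toy = pd.DataFrame({'score': [0.2, 0.4, 0.6, 0.8]}, index=idx)
toy.groupby(level='method').mean()  # score: a -> 0.3, b -> 0.7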
To make it easier to scroll through, look at its transpose.
In [4]:
df2.T
Out[4]:
Put the baseline and performance distributions side by side.
In [5]:
df3 = df2.T.unstack(level='result')
df3
Out[5]:
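The unstack(level='result') call moves the 'result' index level into the columns, which is what puts 'base' and 'perf' side by side. A toy illustration with made-up values:
idx = pd.MultiIndex.from_tuples([(0, 'base'), (0, 'perf'), (1, 'base'), (1, 'perf')],
                                names=['x', 'result'])
toy = pd.Series([0.1, 0.2, 0.3, 0.4], index=idx)
toy.unstack(level='result')  # columns: base, perf; index: x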
Let's group this result by experiment setting (num_norm, num_oot, num_top) and plot the corresponding baseline and performance distributions.
In [6]:
grouped = df3.groupby(level=['num_norm', 'num_oot', 'num_top'])
In [7]:
for name, group in grouped:
    group.plot(kind='bar', legend=False, use_index=False,
               title='num_norm={}, num_oot={}, num_top={}'.format(*name))
We see from the plots that the performance doesn't differ that much from the baseline. In some cases, it is even worse than the baseline. Let's take a look at their expected values.
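For a discrete distribution stored as a pmf over its support, the expected value is the sum of each support value times its probability mass, E[X] = sum of x * p(x). A minimal sketch on a toy Series whose index is the support (made-up probabilities); the loop in the next cell does the same thing, once per (base, perf) column for every setting:
pmf = pd.Series([0.2, 0.5, 0.3], index=[0, 1, 2])  # P(X=0), P(X=1), P(X=2)
expval = (pmf * np.array(pmf.index)).sum()         # 0*0.2 + 1*0.5 + 2*0.3 = 1.1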
In [8]:
ngroup = len(grouped)
data = np.empty((ngroup, 2))
index = []
for i, (name, _) in enumerate(grouped):
    tmp = df3.loc[name]
    prod = tmp.T * np.array(tmp.index)  # multiply pmf and support
    prod = prod.unstack(level='result')
    expval = prod.sum(axis=1, level='result').values.ravel()  # one E[X] per result (base, perf)
    data[i, :] = expval
    index.append(name)
In [9]:
data
Out[9]:
In [10]:
index = pd.MultiIndex.from_tuples(index, names=['num_norm', 'num_oot', 'num_top'])
columns = pd.MultiIndex.from_tuples([('E[X]', 'base'), ('E[X]', 'perf')])
In [11]:
result = pd.DataFrame(data, index=index, columns=columns)
In [12]:
result
Out[12]:
Now we can see the comparison between the two more clearly.
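To make that comparison explicit rather than eyeballing two columns, one option (a sketch that adds a hypothetical 'diff' column, positive meaning the method beats the baseline) is:
result[('E[X]', 'diff')] = result[('E[X]', 'perf')] - result[('E[X]', 'base')]
result.sort_values(('E[X]', 'diff'), ascending=False)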