In [1]:
from everything import *
from dasem.semantic import Semantic
from dasem.data import wordsim353 as wordsim353_data
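The `everything` module star-imports the scientific Python stack into the namespace. If it is unavailable, the explicit imports below should be equivalent; this is a sketch covering only the names actually used in this notebook.

In [ ]:
import nltk
import statsmodels.formula.api as smf
from matplotlib.pyplot import show, title, ylim, yscale
from numpy import array, corrcoef, isnan, mean
from pandas import DataFrame, read_csv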
In [2]:
# Read the evaluation datasets: the four-words outlier task and the
# Danish translation of WordSim-353
four_words = read_csv('../dasem/data/four_words.csv', encoding='utf-8')
wordsim353 = wordsim353_data()
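A quick look at the loaded tables. The column layout assumed by the functions below is inferred from how the data is used later: `four_words` has columns word1 through word4, with word4 the outlier, and `wordsim353` has the Danish word pair in da1 and da2 plus the human judgment in 'Human (mean)'.

In [ ]:
# Inspect the first rows of each dataset.
print(four_words.head())
print(wordsim353.head())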
In [3]:
def compute_accuracy(semantic, four_words):
    """Return outlier detection accuracy on the four-words dataset.

    The first word returned by sort_by_outlierness is the predicted
    outlier; it is compared against the known outlier in 'word4'.
    """
    outliers = []
    for idx, words in four_words.iterrows():
        sorted_words = semantic.sort_by_outlierness(words.values[:4])
        outliers.append(sorted_words[0])
    accuracy = mean(four_words.word4 == outliers)
    return accuracy
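As a hypothetical illustration of what `sort_by_outlierness` is expected to do (the Danish words here are made up for the example, not taken from the dataset, and it is assumed that `Semantic` can be constructed with default parameters):

In [ ]:
# Three animals and one vehicle: the outlier should be sorted first.
semantic = Semantic()
semantic.sort_by_outlierness(['hund', 'kat', 'hest', 'bil'])
# Expected first element: 'bil' (car).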
In [4]:
def compute_correlation(semantic, wordsim):
    """Return the Pearson correlation between human judgments and
    model relatedness over the WordSim-353 word pairs."""
    human = []
    relatednesses = []
    for idx, row in wordsim.iterrows():
        # relatedness() returns a pairwise matrix; take the off-diagonal.
        R = semantic.relatedness([row.da1, row.da2])
        relatednesses.append(R[0, 1])
        human.append(row['Human (mean)'])
    human = array(human)
    relatednesses = array(relatednesses)
    # Drop pairs where the model could not score the words (NaN).
    indices = (~isnan(relatednesses)).nonzero()[0]
    C = corrcoef(human[indices], relatednesses[indices])
    return C[0, 1]
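A minimal sketch of the NaN masking used above: `corrcoef` does not ignore NaNs, so unscored pairs must be dropped before correlating. The numbers are made up.

In [ ]:
human = array([1.0, 5.0, 9.0])
model = array([0.1, float('nan'), 0.9])
ok = (~isnan(model)).nonzero()[0]
corrcoef(human[ok], model[ok])[0, 1]  # correlation over the two valid pairs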
In [5]:
max_n_pagess = [3000, 30000, None]
norms = ['l1', 'l2', None]
stop_wordss = [None, set(nltk.corpus.stopwords.words('danish'))]
use_idfs = [True, False]
sublinear_tfs = [True, False]

columns = ['accuracy', 'correlation', 'stop_words', 'use_idf', 'norm',
           'sublinear_tf', 'max_n_pages']
n_total = (len(max_n_pagess) * len(norms) * len(stop_wordss) *
           len(use_idfs) * len(sublinear_tfs))
results = DataFrame(dtype=float, index=range(n_total), columns=columns)

# Evaluate every combination of Semantic parameters on both tasks.
n = 0
for stop_words_index, stop_words in enumerate(stop_wordss):
    for norm in norms:
        for use_idf in use_idfs:
            for sublinear_tf in sublinear_tfs:
                for max_n_pages in max_n_pagess:
                    results.loc[n, 'max_n_pages'] = max_n_pages
                    results.loc[n, 'stop_words'] = stop_words_index
                    results.loc[n, 'norm'] = str(norm)
                    results.loc[n, 'use_idf'] = use_idf
                    results.loc[n, 'sublinear_tf'] = sublinear_tf
                    semantic = Semantic(stop_words=stop_words, norm=norm,
                                        use_idf=use_idf,
                                        sublinear_tf=sublinear_tf,
                                        max_n_pages=max_n_pages)
                    results.loc[n, 'accuracy'] = compute_accuracy(
                        semantic, four_words)
                    results.loc[n, 'correlation'] = compute_correlation(
                        semantic, wordsim353)
                    n += 1
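To see which parameter settings did best, the results can be ranked by accuracy (assuming a pandas version with `sort_values`):

In [ ]:
results.sort_values('accuracy', ascending=False).head()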
In [6]:
# Relatedness scores from the last Semantic model of the grid search.
relatednesses = []
for idx, row in wordsim353.iterrows():
    R = semantic.relatedness([row.da1, row.da2])
    relatednesses.append(R[0, 1])
wordsim353['relatedness'] = relatednesses
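Some pairs may have received no score (NaN), as handled in compute_correlation above; a quick count:

In [ ]:
wordsim353['relatedness'].isnull().sum()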
In [7]:
wordsim353
Out[7]:
In [12]:
wordsim353.plot(x='Human (mean)', y='relatedness', kind='scatter')
yscale('log')
ylim(0.0001, 1)
title('WordSim-353: human ratings vs. model relatedness')
show()
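Since the scores are compared on a log scale, a rank correlation can complement the Pearson correlation computed above; this sketch assumes scipy is available.

In [ ]:
from scipy.stats import spearmanr
ok = wordsim353['relatedness'].notnull()
spearmanr(wordsim353.loc[ok, 'Human (mean)'],
          wordsim353.loc[ok, 'relatedness'])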
In [9]:
results
Out[9]:
In [10]:
# Ordinary linear model (Gaussian GLM) relating accuracy to the search
# parameters. 'norm' is a string column and enters as a categorical
# variable; rows with max_n_pages = None (stored as NaN) are dropped by
# the formula interface's default missing-data handling.
formula = 'accuracy ~ stop_words + use_idf + norm + sublinear_tf + max_n_pages'
model = smf.glm(formula, data=results).fit()
model.summary()
Out[10]:
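The same kind of model can be fit for the correlation measure, for comparison:

In [ ]:
formula = ('correlation ~ stop_words + use_idf + norm '
           '+ sublinear_tf + max_n_pages')
smf.glm(formula, data=results).fit().summary()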