In [1]:
%cd ~/NetBeansProjects/ExpLosion/
from notebooks.common_imports import *
import numpy as np
from itertools import tee
from gui.user_code import pairwise_randomised_significance, get_data_for_signif_test, pairwise_significance_exp_ids
from gui.output_utils import get_cv_fold_count, get_cv_scores_many_experiment
from gui.constants import CLASSIFIER
from discoutils.collections_utils import walk_overlapping_pairs
# Monkey-patch seaborn: rebind the `bootstrap` attribute on both
# sns.timeseries.algo and sns.categorical to my_bootstrap.
# NOTE(review): my_bootstrap is not defined in this notebook; presumably it
# comes from the star import of notebooks.common_imports -- confirm.
sns.timeseries.algo.bootstrap = sns.categorical.bootstrap = my_bootstrap
What is the effect of adding uniform random noise to word vectors? Can our experiments distinguish between a set of vectors and a corrupted version of the same vectors? To find out, we add uniform random noise drawn from the interval $[-n, n]$ to all non-zero entries in a set of word2vec embeddings, and plot classification accuracy as a function of $n$ (the x axis).
In [5]:
def plot(d):
    """Plot accuracy against noise level for the experiments matching `d`.

    Parameters
    ----------
    d : dict
        Keyword filters forwarded to ``Experiment.objects.filter``. It must
        contain the key ``'labelled'``, whose value is also passed as the
        ``corpus`` argument of ``random_vect_baseline``.

    Returns
    -------
    Whatever ``pairwise_significance_exp_ids`` returns when given
    ``walk_overlapping_pairs`` of the selected experiment ids and the field
    list ``['expansions__noise']``.
    NOTE(review): callers treat the result as a DataFrame -- confirm.

    Side effects
    ------------
    Prints the selected ids and one summary line per experiment, and draws
    onto the current matplotlib figure (time-series plot plus a horizontal
    baseline).
    """
    experiments = Experiment.objects.filter(**d).order_by('expansions__noise')
    # Keep only the experiments whose expansions have no `entries_of` set.
    selected = [x for x in experiments if x.expansions.entries_of is None]
    e = [x.id for x in selected]
    print('experiments are', e)
    for exp in selected:
        # Reuse the already-loaded experiment and fetch the results row once,
        # instead of issuing three separate lookups per experiment.
        res = Results.objects.get(id=exp.id, classifier=CLASSIFIER)
        print('id %d noise %2.2f, acc %2.2f, macrof1 %2.2f' % (exp.id,
                                                               exp.expansions.noise,
                                                               res.accuracy_mean,
                                                               res.macrof1_mean))
    df = dataframe_from_exp_ids(e, {'Noise': 'expansions__noise'})
    # Format the noise level as a one-decimal string for the time axis.
    df['Noise'] = ['%1.1f' % float(x) for x in df.Noise]
    sns.tsplot(df, time='Noise', value='Accuracy', unit='folds', marker='o')
    # random baseline for comparison
    plt.axhline(random_vect_baseline(corpus=d['labelled']), c='k')
    significance_df = pairwise_significance_exp_ids(walk_overlapping_pairs(e), ['expansions__noise'])
    return significance_df
In [6]:
# Experiment filters for the first plot. The 'labelled' key used to appear
# twice in this literal with the same value; the redundant entry is removed
# (Python keeps only the last occurrence of a repeated key anyway).
d = {'expansions__vectors__unlabelled_percentage': 100,
     'labelled': 'amazon_grouped-tagged',
     'expansions__vectors__dimensionality': 100,
     'expansions__decode_handler': 'SignifiedOnlyFeatureHandler',
     'expansions__vectors__composer': 'Add',
     'expansions__vectors__rep': 0,
     'expansions__allow_overlap': False,
     'expansions__k': 3,
     'expansions__vectors__unlabelled': 'wiki',
     'expansions__vectors__algorithm': 'word2vec',
     'document_features_tr': 'J+N+AN+NN',
     'document_features_ev': 'AN+NN'}
# Draw the plot and keep the value returned by plot(), coerced to numeric.
df = plot(d).convert_objects(convert_numeric=True)
# NOTE(review): the file name says 'gigaword' while the filter above selects
# vectors with unlabelled == 'wiki' -- confirm the intended name.
plt.savefig('plot-noise-gigaword-add-amazon.pdf', format='pdf', dpi=300, bbox_inches='tight', pad_inches=0.1)
In [7]:
# Point the shared filter dict at a different labelled corpus, redraw the
# noise plot and write the figure to disk.
d.update(labelled='reuters21578/r8-tagged-grouped')
df2 = plot(d).convert_objects(convert_numeric=True)
plt.savefig(
    'plot-noise-gigaword-add-r2.pdf',
    format='pdf',
    dpi=300,
    bbox_inches='tight',
    pad_inches=0.1,
)
In [8]:
# Same filters as the previous cell, with expansions__k set to 30.
d.update(expansions__k=30)
df2 = plot(d).convert_objects(convert_numeric=True)
plt.savefig(
    'plot-noise-gigaword-add-r2-k30.pdf',
    format='pdf',
    dpi=300,
    bbox_inches='tight',
    pad_inches=0.1,
)
In [9]:
# Same filters as the previous cell, with expansions__k set to 60.
d.update(expansions__k=60)
df2 = plot(d).convert_objects(convert_numeric=True)
plt.savefig(
    'plot-noise-gigaword-add-r2-k60.pdf',
    format='pdf',
    dpi=300,
    bbox_inches='tight',
    pad_inches=0.1,
)
In [ ]: