In [1]:
import logging
import random
from random import sample

import numpy as np
import seaborn as sns

from discoutils.thesaurus_loader import Vectors
# Configure the root logger to emit INFO and above. Each record is rendered as
# tab-separated fields: timestamp, module.function (line N), then level : message.
logging.basicConfig(
    format="%(asctime)s\t%(module)s.%(funcName)s (line %(lineno)d)\t%(levelname)s : %(message)s",
    level=logging.INFO,
)

In [2]:
# Location of the vectors file handed to Vectors.from_tsv.
# NOTE(review): absolute path on a specific filesystem -- this cell only runs
# where that path exists; consider making it configurable.
VECTORS_PATH = '/lustre/scratch/inf/mmb28/FeatureExtractionToolkit/word2vec_vectors/composed/AN_NN_word2vec_20percent-rep0_Add.events.filtered.strings'
N_SAMPLED_KEYS = 10  # how many entries to draw for inspection
SAMPLE_SEED = 0      # fixed so the same entries are drawn on every fresh run

v = Vectors.from_tsv(VECTORS_PATH)

# Sort before sampling so the draw does not depend on the iteration order of
# v.keys(); use a private, seeded generator so the draw is reproducible; cap the
# sample size so a file with fewer entries does not raise ValueError.
all_keys = sorted(v.keys())
keys = random.Random(SAMPLE_SEED).sample(all_keys, min(N_SAMPLED_KEYS, len(all_keys)))

# Object wrapped by `v`; the noise-sweep cell passes it to Vectors(data, noise=...).
# NOTE(review): `_obj` is a private attribute -- confirm there is no public accessor.
data = v._obj


2015-02-04 17:09:13,026	<ipython-input-2-dcbdb31d36e2>.<module> (line 1)	INFO : Loading thesaurus lustre/scratch/inf/mmb28/FeatureExtractionToolkit/word2vec_vectors/composed/AN_NN_word2vec_20percent-rep0_Add.events.filtered.strings from disk

In [3]:
# For each noise level 0, 0.25, ..., 1.5: rebuild Vectors from `data` with that
# noise value, initialise the similarity search with n_neighbors=2, and print the
# nearest neighbours of the first three sampled keys.
# NOTE: `v` is rebound on every iteration, so after this cell `v` is the object
# built with the last noise level; later cells that read `v.matrix` see that one.
for n in np.arange(0, 1.51, .25):
    v = Vectors(data, noise=n)
    v.init_sims(n_neighbors=2)
    for k in keys[:3]:
        print(v.get_nearest_neighbours(k))
    # Two decimals are needed: with '%.1f' the levels 0.25, 0.75 and 1.25 were
    # printed as 0.2, 0.8 and 1.2, mislabelling the separator lines.
    print('___________________________________%.2f' % n)


[('non/J_shaft/N', 2.0234959724183512), ('non/J_bolt/N', 2.0264466266919294)]
[('stitch/N', 1.9824516971244923), ('strawberry/N_failure/N', 2.069234463334594)]
[('cut/N_bearer/N', 2.5574729391352395), ('remark/N', 2.599693097450376)]
___________________________________0.0
[('non/J_bolt/N', 2.526625840354646), ('non/J_tire/N', 2.7522454632601585)]
[('failure/N_mode/N', 2.7413648813540843), ('world-wide/J_sensation/N', 2.7600594881532294)]
[('decent/J_remark/N', 2.9258340539499774), ('scratch/N_cut/N', 3.0256096887545492)]
___________________________________0.2
[('non/J_mat/N', 4.0189424173403827), ('non/J_paint/N', 4.0768063165845119)]
[('phillips/N_monster/N', 3.9476807854780867), ('ball/N_failure/N', 4.0684886868842804)]
[('vet/N_remark/N', 4.3483622015651413), ('accusation/N', 4.3487899655195674)]
___________________________________0.5
[('non/J_pro/N', 5.5190101361917954), ('buck/N_amazon/N', 5.5438420089220752)]
[('frantic/J_circle/N', 5.5901076263327312), ('bright/J_use/N', 5.6097603238374019)]
[('birthplace/N', 5.6553719178659483), ('frequent/J_comment/N', 5.8131149948974237)]
___________________________________0.8
[('human/J_pig/N', 7.3214298066126897), ('individual/N', 7.4067798014779722)]
[('defective/J_control/N', 6.6352072818512342), ('fine/J_collar/N', 6.6550241074701093)]
[('great/J_shop/N', 7.1592514257065121), ('loud/J_clock/N', 7.1772119808243815)]
___________________________________1.0
[('cold/J_film/N', 8.0566367571958999), ('bus/N_right/N', 8.0760321054713451)]
[('home/N_miracle/N', 8.2171341621309999), ('right/N_wrist/N', 8.3323682129782668)]
[('higher/J_suspect/N', 7.8333699327480311), ('anticipated/J_time/N', 7.8517168500474623)]
___________________________________1.2
[('plate/N_justice/N', 9.6032022446643062), ('standard/N_blade/N', 9.8461272191716578)]
[('mainly/RB', 9.6896596460818714), ('professional/J_package/N', 9.6958366050242724)]
[('santa/N_contest/N', 9.6256191190564522), ('acceptable/J_clearance/N', 9.6601665567552129)]
___________________________________1.5

In [11]:
from matplotlib import pylab as plt
%matplotlib inline
plt.hist(v.matrix.data, bins=100)
plt.gca().set_yticklabels([])
plt.xlim(-2, 2)
plt.savefig('plot-w2v-distribution-of-weights.pdf', format='pdf', dpi=300)



In [9]:
# Mean of the values held in `v.matrix.data`.
# NOTE(review): `v` is rebound by the noise-sweep loop, so this number depends on
# which cell last assigned `v` -- confirm which vectors are intended here.
v.matrix.data.mean()


Out[9]:
-0.018923800067449655

In [4]:
# Sanity check: show the noise levels produced by the range used in the sweep
# (the stop value 1.51 is just past 1.5 so that 1.5 itself is included).
np.arange(start=0, stop=1.51, step=0.25)


Out[4]:
array([ 0.  ,  0.25,  0.5 ,  0.75,  1.  ,  1.25,  1.5 ])

In [6]: