In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from gensim.models.word2vec import Word2Vec

In [3]:
from collections import OrderedDict

# One Word2Vec model per 20-year period, 1720 through 1940 inclusive,
# keyed by period start year and kept in chronological order.
models = OrderedDict()
for year in range(1720, 1960, 20):
    models[year] = Word2Vec.load('models/bpo/{}.bin'.format(year))

In [4]:
def cosine_series(anchor, query):
    """Cosine similarity of `anchor` and `query` in each decade model.

    Returns an OrderedDict mapping year -> similarity, in chronological
    order; 0 is recorded for periods whose model lacks `query`.
    """
    return OrderedDict(
        (year, model.similarity(anchor, query) if query in model else 0)
        for year, model in models.items()
    )

In [5]:
import numpy as np
import statsmodels.api as sm

def lin_reg(series):
    """Ordinary least squares fit of value ~ year for a {year: value} mapping.

    Returns the fitted statsmodels results object; a constant column is
    added, so params[0] is the intercept and params[1] the slope.
    """
    years = np.array(list(series.keys()))
    values = np.array(list(series.values()))

    design = sm.add_constant(years)

    return sm.OLS(values, design).fit()

In [6]:
def plot_cosine_series(anchor, query, w=5, h=4):
    """Plot the similarity-over-time series for (anchor, query) with its OLS trend line.

    Parameters
    ----------
    anchor, query : str
        Tokens compared per decade model via cosine_series().
    w, h : numeric
        Figure width and height in inches.
    """
    series = cosine_series(anchor, query)

    fit = lin_reg(series)

    x1 = list(series.keys())[0]
    x2 = list(series.keys())[-1]

    # The fit is linear, so the first and last in-sample predictions
    # are enough to draw the whole trend line.
    y1 = fit.predict()[0]
    y2 = fit.predict()[-1]

    print(anchor, query)

    plt.figure(figsize=(w, h))
    plt.ylim(0, 1)
    plt.title(query)
    plt.xlabel('Year')
    plt.ylabel('Similarity')
    plt.plot(list(series.keys()), list(series.values()))
    plt.plot([x1, x2], [y1, y2], color='gray', linewidth=0.5)
    plt.show()
    # Release the figure: this function is called in loops of 20+ plots
    # below, and unclosed figures accumulate in pyplot's global state.
    plt.close()

In [7]:
# Spot-check a few anchor/query pairs before the systematic sweep below.
for anchor, query in [
    ('science', 'poetry'),
    ('literature', 'poetry'),
    ('literature', 'writing'),
    ('literature', 'polite'),
]:
    plot_cosine_series(anchor, query)


science poetry
literature poetry
literature writing
literature polite

In [8]:
import enchant

dictionary = enchant.Dict('en_US')

def union_neighbor_vocab(anchor, topn=200):
    """Union over all decade models of `anchor`'s top-`topn` neighbors.

    Only tokens accepted by the enchant dictionary are kept.
    """
    vocab = set()

    for model in models.values():
        for token, _score in model.most_similar(anchor, topn=topn):
            if dictionary.check(token):
                vocab.add(token)

    return vocab

In [9]:
union_vocab = union_neighbor_vocab('literature')

In [10]:
# For each candidate token, regress its similarity-to-'literature'
# series on year and record (token, slope, p-value of the slope).
data = []
for token in union_vocab:

    fit = lin_reg(cosine_series('literature', token))

    data.append((token, fit.params[1], fit.pvalues[1]))

In [11]:
import pandas as pd

df1 = pd.DataFrame(data, columns=('token', 'slope', 'p'))

Increasing: tokens whose cosine similarity to 'literature' rises fastest over time


In [12]:
# Raise the row display limit so the full ranking renders in the notebook.
pd.set_option('display.max_rows', 1000)

# Tokens whose similarity to 'literature' grows fastest over time.
df1.sort_values('slope', ascending=False).head(50)


Out[12]:
token slope p
447 journalism 0.004263 0.000052
158 psychology 0.003640 0.000081
378 renaissance 0.003440 0.001242
510 anthropology 0.003316 0.000328
69 sociology 0.003243 0.000100
1 biology 0.003210 0.000165
475 folklore 0.003119 0.000631
398 technique 0.002996 0.001171
572 decadence 0.002940 0.000517
546 masterpieces 0.002929 0.000369
479 scholarship 0.002921 0.014835
468 curriculum 0.002920 0.000035
302 symbolism 0.002866 0.000814
254 linguistic 0.002856 0.001133
649 realism 0.002855 0.000197
137 biographies 0.002851 0.000032
47 pictorial 0.002845 0.000401
432 mysticism 0.002830 0.002411
336 handbook 0.002815 0.000197
44 periodicals 0.002803 0.001853
276 terminology 0.002781 0.001529
160 slang 0.002776 0.000004
152 bibliography 0.002743 0.001752
553 artistic 0.002734 0.001260
574 enlightenment 0.002697 0.001983
598 photography 0.002690 0.000055
129 aesthetics 0.002689 0.000123
131 composers 0.002677 0.003792
209 propaganda 0.002655 0.000016
6 literatures 0.002626 0.027821
164 novelists 0.002614 0.011911
229 archaeological 0.002601 0.000389
513 idealism 0.002601 0.000492
150 scholarly 0.002591 0.000261
350 economics 0.002590 0.004145
94 romanticism 0.002574 0.001771
50 textbook 0.002557 0.000290
464 pseudo 0.002545 0.000525
314 athletics 0.002529 0.004221
61 dramatists 0.002517 0.005857
262 spiritualism 0.002513 0.010403
298 technicalities 0.002512 0.000980
4 mystics 0.002501 0.001282
237 medieval 0.002501 0.005138
308 philosophies 0.002495 0.003190
544 synthesis 0.002494 0.000055
512 reproduction 0.002491 0.000439
74 museums 0.002485 0.001646
492 appreciation 0.002484 0.000369
596 distinctively 0.002471 0.002278

In [13]:
# Small multiples for the 20 fastest-rising neighbors.
for token in df1.sort_values('slope', ascending=False).head(20)['token']:
    plot_cosine_series('literature', token, 3, 2)


literature journalism
literature psychology
literature renaissance
literature anthropology
literature sociology
literature biology
literature folklore
literature technique
literature decadence
literature masterpieces
literature scholarship
literature curriculum
literature symbolism
literature linguistic
literature realism
literature biographies
literature pictorial
literature mysticism
literature handbook
literature periodicals

Decreasing: tokens whose cosine similarity to 'literature' falls fastest over time


In [14]:
df1.sort_values('slope', ascending=True).head(50)


Out[14]:
token slope p
421 polite -0.003020 0.000025
424 politer -0.002901 0.000111
377 proficients -0.002741 0.000138
364 mechanic -0.002513 0.000014
420 encourager -0.002496 0.000003
632 pharmacy -0.002490 0.000025
272 stare -0.002378 0.006064
300 literati -0.002293 0.000955
322 agronomy -0.002120 0.001476
459 ornamental -0.001997 0.000027
603 liberal -0.001955 0.000026
247 mercantile -0.001937 0.000003
289 dunces -0.001863 0.004224
631 proficient -0.001825 0.000372
41 mechanics -0.001781 0.000055
200 frugality -0.001747 0.000114
288 excelling -0.001729 0.001095
216 academical -0.001689 0.000038
481 skilled -0.001623 0.000073
217 improvement -0.001612 0.000013
89 learning -0.001594 0.000694
171 gallantry -0.001556 0.000660
294 rabbinical -0.001540 0.023604
126 antiquaries -0.001526 0.007720
304 tragic -0.001513 0.000655
644 republics -0.001506 0.002310
144 improvements -0.001479 0.000877
426 medical -0.001477 0.009062
306 oratory -0.001475 0.034696
466 adepts -0.001474 0.008265
534 oriental -0.001470 0.000025
559 miniature -0.001469 0.012887
550 emporium -0.001468 0.004943
547 graphical -0.001416 0.056515
511 republic -0.001367 0.000224
471 ornaments -0.001342 0.000534
341 gardening -0.001336 0.001199
583 agriculture -0.001318 0.000351
244 refinement -0.001310 0.010546
535 phi -0.001294 0.006914
165 experimental -0.001275 0.001543
507 decorations -0.001253 0.002598
7 geographical -0.001235 0.000040
515 entertaining -0.001233 0.000814
83 allegories -0.001227 0.020008
170 surgery -0.001209 0.014367
292 societies -0.001163 0.004890
467 refined -0.001159 0.017626
186 academies -0.001149 0.003349
333 antiquarians -0.001147 0.034666

In [15]:
# Small multiples for the 20 fastest-falling neighbors.
for token in df1.sort_values('slope', ascending=True).head(20)['token']:
    plot_cosine_series('literature', token, 3, 2)


literature polite
literature politer
literature proficients
literature mechanic
literature encourager
literature pharmacy
literature stare
literature literati
literature agronomy
literature ornamental
literature liberal
literature mercantile
literature dunces
literature proficient
literature mechanics
literature frugality
literature excelling
literature academical
literature skilled
literature improvement

In [16]:
def intersect_neighbor_vocab(anchor, topn=1000):
    """Dictionary-valid words in `anchor`'s top-`topn` neighbors of EVERY decade model.

    Returns the intersection of the per-model neighbor sets, i.e. words
    that stay close to `anchor` across all periods.
    """
    vocabs = []

    for model in models.values():
        similar = model.most_similar(anchor, topn=topn)
        vocabs.append({s[0] for s in similar if dictionary.check(s[0])})

    # Guard: set.intersection(*[]) raises TypeError, so return an empty
    # set explicitly when no models are loaded.
    if not vocabs:
        return set()

    return set.intersection(*vocabs)

In [17]:
intersect_vocab = intersect_neighbor_vocab('literature')

In [20]:
# Same regression sweep as before, but over the intersected vocabulary
# and keeping only statistically significant trends (p < 0.05).
data = []
for token in intersect_vocab:

    fit = lin_reg(cosine_series('literature', token))

    slope, p = fit.params[1], fit.pvalues[1]
    if p < 0.05:
        data.append((token, slope, p))

In [21]:
import pandas as pd

df2 = pd.DataFrame(data, columns=('token', 'slope', 'p'))

Intersected neighbors: tokens that remain among the top 1000 neighbors of 'literature' in every period, with statistically significant trends


In [22]:
df2.sort_values('slope', ascending=False)


Out[22]:
token slope p
2 art 0.001591 0.001347
5 culture 0.001178 0.005864
0 language 0.000973 0.000014
3 poetry 0.000636 0.001080
9 geography -0.000304 0.036853
1 sciences -0.000540 0.033214
8 languages -0.000601 0.022820
4 proficiency -0.000603 0.010853
6 mathematics -0.000604 0.028463
7 rhetoric -0.000639 0.036787

In [23]:
# Small multiples for every significant persistent neighbor.
for token in df2.sort_values('slope', ascending=False)['token']:
    plot_cosine_series('literature', token, 3, 2)


literature art
literature culture
literature language
literature poetry
literature geography
literature sciences
literature languages
literature proficiency
literature mathematics
literature rhetoric

In [ ]: