In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
from gensim.models.word2vec import Word2Vec
In [3]:
from collections import OrderedDict
# Load one diachronic word2vec model per 20-year slice (1720-1940 inclusive).
# OrderedDict keeps the slices in chronological order for all later iteration.
models = OrderedDict()
for year in range(1720, 1960, 20):
    models[year] = Word2Vec.load('models/bpo/{}.bin'.format(year))
In [4]:
def cosine_series(anchor, query, model_map=None):
    """Track the cosine similarity of two words across the yearly models.

    Args:
        anchor: Word assumed present in every model's vocabulary (a missing
            anchor would raise a KeyError from the model).
        query: Word whose similarity to ``anchor`` is traced over time.
        model_map: Optional mapping of year -> word2vec model; defaults to
            the notebook-level ``models`` dict. Each model must support
            ``query in model`` and ``model.similarity(anchor, query)``.

    Returns:
        OrderedDict mapping year -> similarity. When ``query`` is
        out-of-vocabulary for a slice the value is 0 — a sentinel, not a
        true similarity, so it can bias downstream regressions.
    """
    if model_map is None:
        model_map = models  # fall back to the module-level model dict
    series = OrderedDict()
    for year, model in model_map.items():
        series[year] = (
            model.similarity(anchor, query)
            if query in model else 0
        )
    return series
In [5]:
import numpy as np
import statsmodels.api as sm
def lin_reg(series):
    """Fit an OLS trend line (similarity ~ year) to a cosine series.

    series: mapping of year -> similarity value.
    Returns the fitted statsmodels results object; ``params[0]`` is the
    intercept and ``params[1]`` the per-year slope.
    """
    years = np.array(list(series.keys()))
    values = np.array(list(series.values()))
    design = sm.add_constant(years)  # prepend the intercept column
    return sm.OLS(values, design).fit()
In [6]:
def plot_cosine_series(anchor, query, w=5, h=4):
    """Plot the similarity-over-time curve for (anchor, query) plus its OLS
    trend line, in a figure of ``w`` x ``h`` inches."""
    series = cosine_series(anchor, query)
    fit = lin_reg(series)
    years = list(series.keys())
    fitted = fit.predict()
    # Trend-line endpoints: fitted values at the first and last year.
    trend_x = [years[0], years[-1]]
    trend_y = [fitted[0], fitted[-1]]
    print(anchor, query)
    plt.figure(figsize=(w, h))
    plt.ylim(0, 1)
    plt.title(query)
    plt.xlabel('Year')
    plt.ylabel('Similarity')
    plt.plot(years, list(series.values()))
    plt.plot(trend_x, trend_y, color='gray', linewidth=0.5)
    plt.show()
In [7]:
# Trace how 'poetry', 'writing', and 'polite' drift relative to the anchors.
for anchor, query in [
    ('science', 'poetry'),
    ('literature', 'poetry'),
    ('literature', 'writing'),
    ('literature', 'polite'),
]:
    plot_cosine_series(anchor, query)
In [8]:
import enchant
# US-English dictionary used below to filter non-words (e.g. OCR noise)
# out of the nearest-neighbour lists.
dictionary = enchant.Dict('en_US')
def union_neighbor_vocab(anchor, topn=200):
    """Union, across all yearly models, of the ``topn`` nearest neighbours
    of ``anchor`` that pass the English dictionary check."""
    vocab = set()
    for model in models.values():
        neighbors = model.most_similar(anchor, topn=topn)
        vocab |= {word for word, _ in neighbors if dictionary.check(word)}
    return vocab
In [9]:
union_vocab = union_neighbor_vocab('literature')
In [10]:
# Fit a linear trend to each candidate's similarity series; record the
# per-year slope and its p-value for ranking below.
data = []
for token in union_vocab:
    fit = lin_reg(cosine_series('literature', token))
    data.append((token, fit.params[1], fit.pvalues[1]))
In [11]:
import pandas as pd
# One row per candidate token: trend-line slope and its p-value.
df1 = pd.DataFrame(data, columns=('token', 'slope', 'p'))
In [12]:
# Show up to 1000 rows so the full ranking renders inline.
pd.set_option('display.max_rows', 1000)
# Tokens whose similarity to 'literature' rises fastest over time.
df1.sort_values('slope', ascending=False).head(50)
Out[12]:
In [13]:
# Plot the 20 fastest-rising neighbours as small multiples.
top_rising = df1.sort_values('slope', ascending=False).head(20)
for token in top_rising['token']:
    plot_cosine_series('literature', token, 3, 2)
In [14]:
df1.sort_values('slope', ascending=True).head(50)
Out[14]:
In [15]:
# Plot the 20 fastest-falling neighbours as small multiples.
top_falling = df1.sort_values('slope', ascending=True).head(20)
for token in top_falling['token']:
    plot_cosine_series('literature', token, 3, 2)
In [16]:
def intersect_neighbor_vocab(anchor, topn=1000):
    """Words that appear among the ``topn`` dictionary-checked nearest
    neighbours of ``anchor`` in EVERY time slice."""
    per_year = []
    for model in models.values():
        neighbors = model.most_similar(anchor, topn=topn)
        per_year.append({word for word, _ in neighbors if dictionary.check(word)})
    return set.intersection(*per_year)
In [17]:
intersect_vocab = intersect_neighbor_vocab('literature')
In [20]:
# Re-fit trends on the stable vocabulary, keeping only slopes that are
# statistically significant at the 5% level.
data = []
for token in intersect_vocab:
    fit = lin_reg(cosine_series('literature', token))
    if fit.pvalues[1] < 0.05:
        data.append((token, fit.params[1], fit.pvalues[1]))
In [21]:
# NOTE(review): pandas was already imported earlier; this re-import is
# redundant but harmless on a linear run. (Execution counts also jump from
# In[17] to In[20] here — re-run top-to-bottom to confirm no hidden state.)
import pandas as pd
# One row per significant token: trend-line slope and its p-value.
df2 = pd.DataFrame(data, columns=('token', 'slope', 'p'))
In [22]:
df2.sort_values('slope', ascending=False)
Out[22]:
In [23]:
# Plot every significant trend, steepest rise first.
for token in df2.sort_values('slope', ascending=False)['token']:
    plot_cosine_series('literature', token, 3, 2)
In [ ]: