In [66]:
import matplotlib.pyplot as plt
%matplotlib inline
In [67]:
from gensim.models.word2vec import Word2Vec
In [68]:
from collections import OrderedDict
# Location of the per-period Word2Vec models; `{}` is replaced by the period's
# starting year.
# NOTE(review): absolute, machine-specific path — confirm or point it at a
# configurable data directory before sharing the notebook.
MODEL_PATH_TEMPLATE = '/home/odysseus/Téléchargements/hist-vec-master/crit/LemCorpCrit_models/{}.bin'

# One model per 20-year step (1840, 1860, 1880, 1900, 1920), kept in
# chronological order so every time series built from it is ordered too.
models = OrderedDict(
    (year, Word2Vec.load(MODEL_PATH_TEMPLATE.format(year)))
    for year in range(1840, 1940, 20)
)
In [69]:
def cosine_series(anchor, query, model_by_year=None):
    """Return the similarity between two words in each period's model.

    Parameters
    ----------
    anchor : str
        Reference word; it is passed to ``similarity`` unchecked, so a model
        that does not know it will raise whatever that call raises.
    query : str
        Word compared against ``anchor``.
    model_by_year : mapping, optional
        Maps a period label to its model. Defaults to the notebook-level
        ``models`` mapping, which preserves the original two-argument call.

    Returns
    -------
    OrderedDict
        Period label -> similarity, in the iteration order of the mapping.
        Periods whose model does not contain ``query`` are recorded as ``0``.
    """
    if model_by_year is None:
        model_by_year = models

    series = OrderedDict()
    for year, model in model_by_year.items():
        # Word vectors live on `model.wv`; the `model.similarity(...)` and
        # `word in model` shortcuts were removed from Word2Vec in gensim 4.
        # Falling back to the object itself keeps bare keyed-vector objects
        # (and very old models without `.wv`) working.
        vectors = getattr(model, 'wv', model)
        if query in vectors:
            series[year] = vectors.similarity(anchor, query)
        else:
            series[year] = 0
    return series
In [70]:
import numpy as np
import statsmodels.api as sm
def lin_reg(series):
    """Fit an ordinary-least-squares regression over a keyed series.

    The keys of ``series`` are used as the regressor and its values as the
    response. A constant column is added to the regressor via
    ``sm.add_constant`` so the fit includes an intercept term.

    Returns the object produced by ``sm.OLS(...).fit()``. Other cells in this
    notebook read index 1 of its ``params`` / ``pvalues`` as the slope term.
    """
    regressor, response = (
        np.array(list(part)) for part in (series.keys(), series.values())
    )
    design = sm.add_constant(regressor)
    return sm.OLS(response, design).fit()
In [71]:
def plot_cosine_series(anchor, query, w=8, h=4):
    """Plot the anchor/query similarity per period with its linear trend.

    Parameters
    ----------
    anchor, query : str
        Words forwarded to ``cosine_series``.
    w, h : number
        Figure width and height, passed to ``plt.figure(figsize=...)``.

    Prints ``query``, then shows a figure titled with ``query`` containing the
    similarity curve and a thin gray segment joining the first and last fitted
    values of the regression returned by ``lin_reg``.
    """
    series = cosine_series(anchor, query)
    years = list(series.keys())
    fitted = lin_reg(series).predict()

    # The trend is drawn as a straight segment between the fitted values at
    # the first and last period.
    trend_x = [years[0], years[-1]]
    trend_y = [fitted[0], fitted[-1]]

    print(query)
    plt.figure(figsize=(w, h))
    plt.title(query)
    plt.xlabel('Year')
    plt.ylabel('Similarity')
    # Fixed y-range so figures for different words are directly comparable.
    plt.ylim(0, 1)
    plt.plot(years, list(series.values()))
    plt.plot(trend_x, trend_y, color='gray', linewidth=0.5)
    plt.show()
In [72]:
# Words of interest; the first entry is also the anchor used in the loop below.
testList = (
    'littérature', 'poésie', 'science', 'savoir', 'histoire', 'philosophie',
    'lettre', 'critique', 'roman', 'théâtre', 'drame', 'esprit', 'langue',
    'diplomatie', 'politique', 'morale', 'société', 'pouvoir', 'théologie',
    'droit', 'loi', 'méthode', 'génie', 'romantisme', 'réalisme',
    'symbolisme', 'naturalisme',
)

# Skip the first entry so the anchor is not plotted against itself.
for val in testList[1:]:
    plot_cosine_series('littérature', val)
The next two cells gather, for a given term (for example "littérature"), the 200 most similar terms in each of the trained models and pool them across models.
In [73]:
def union_neighbor_vocab(anchor, topn=200, model_by_year=None):
    """Collect every word that is a top-``topn`` neighbour of ``anchor`` in any model.

    Parameters
    ----------
    anchor : str
        Word whose nearest neighbours are looked up in each model.
    topn : int
        Number of neighbours requested from each model.
    model_by_year : mapping, optional
        Maps a period label to its model. Defaults to the notebook-level
        ``models`` mapping, which preserves the original call signature.

    Returns
    -------
    set
        Union of the neighbour words over all models (empty if there are no
        models).
    """
    if model_by_year is None:
        model_by_year = models

    vocab = set()
    for model in model_by_year.values():
        # `most_similar` lives on the keyed vectors (`model.wv`); the shortcut
        # on the Word2Vec object itself was removed in gensim 4. Fall back to
        # the object for bare keyed-vector instances.
        vectors = getattr(model, 'wv', model)
        neighbours = vectors.most_similar(anchor, topn=topn)
        # Each result is indexed at 0 to get the word, as in the original code.
        vocab.update(neighbour[0] for neighbour in neighbours)
    return vocab
At this point, we do the same thing as above: for each token among the 200 nearest terms to the main entry, we compute how its proximity to that entry changes over time, and how significant that change is. Significance is assessed with the p-value of the regression slope: below a threshold of 0.05, the trend is unlikely to be due to chance, so we treat it as significant.
In [74]:
# Words of interest; each one is used in turn as the anchor below.
testList = (
    'littérature', 'poésie', 'science', 'savoir', 'histoire', 'philosophie',
    'lettre', 'critique', 'roman', 'théâtre', 'drame', 'esprit', 'langue',
    'diplomatie', 'politique', 'morale', 'société', 'pouvoir', 'théologie',
    'droit', 'loi', 'méthode', 'génie', 'romantisme', 'réalisme',
    'symbolisme', 'naturalisme',
)

# For every anchor word, keep the pooled neighbours whose similarity trend has
# a slope p-value below 0.05, stored as (token, slope, p-value) tuples.
entries = {}
for word in testList:
    data = []
    for token in union_neighbor_vocab(word):
        fit = lin_reg(cosine_series(word, token))
        slope, p_value = fit.params[1], fit.pvalues[1]
        if p_value < 0.05:
            data.append((token, slope, p_value))
    entries[word] = data
In [75]:
import pandas as pd
from IPython.display import Markdown, display
# Allow long tables to be printed in full.
pd.set_option('display.max_rows', 1000)

# For each anchor word: heading, table of the 10 steepest rising trends, then
# one plot per token in that table.
for word in testList:
    display(Markdown("### <i><b>" + word + "</i></b>"))
    df1 = pd.DataFrame(entries[word], columns=('token', 'slope', 'p'))
    top_rising = df1.sort_values('slope', ascending=False).head(10)
    print(top_rising)
    print('\n\n')
    for token in top_rising['token']:
        plot_cosine_series(word, token, 8, 4)
In [76]:
# For each anchor word: heading, table of the 10 lowest slopes (steepest
# declines first), then one plot per token in that table.
for word in testList:
    display(Markdown("### <i><b>" + word + "</i></b>"))
    df2 = pd.DataFrame(entries[word], columns=('token', 'slope', 'p'))
    top_falling = df2.sort_values('slope', ascending=True).head(10)
    print(top_falling)
    print('\n\n')
    for token in top_falling['token']:
        plot_cosine_series(word, token, 8, 4)
In [77]:
def intersect_neighbor_vocab(anchor, topn=2000, model_by_year=None):
    """Return the words that are top-``topn`` neighbours of ``anchor`` in every model.

    Parameters
    ----------
    anchor : str
        Word whose nearest neighbours are looked up in each model.
    topn : int
        Number of neighbours requested from each model.
    model_by_year : mapping, optional
        Maps a period label to its model. Defaults to the notebook-level
        ``models`` mapping, which preserves the original call signature.

    Returns
    -------
    set
        Intersection of the per-model neighbour sets; an empty set when there
        are no models at all.
    """
    if model_by_year is None:
        model_by_year = models

    vocabs = []
    for model in model_by_year.values():
        # `most_similar` lives on the keyed vectors (`model.wv`); the shortcut
        # on the Word2Vec object itself was removed in gensim 4. Fall back to
        # the object for bare keyed-vector instances.
        vectors = getattr(model, 'wv', model)
        neighbours = vectors.most_similar(anchor, topn=topn)
        vocabs.append({neighbour[0] for neighbour in neighbours})

    # `set.intersection()` with no arguments raises TypeError, so an empty
    # model mapping is handled explicitly.
    if not vocabs:
        return set()
    return set.intersection(*vocabs)
In [78]:
# Same selection as before, but restricted to tokens that are neighbours of
# the anchor in every model: keep those whose slope p-value is below 0.05,
# stored as (token, slope, p-value) tuples. This rebinds `entries`.
entries = {}
for word in testList:
    data = []
    for token in intersect_neighbor_vocab(word):
        fit = lin_reg(cosine_series(word, token))
        slope, p_value = fit.params[1], fit.pvalues[1]
        if p_value < 0.05:
            data.append((token, slope, p_value))
    entries[word] = data
In [79]:
import pandas as pd
In [80]:
from IPython.display import Markdown, display
# For each anchor word: heading, table of the 10 steepest rising trends in the
# current `entries`, then one plot per token in that table.
for word in testList:
    display(Markdown("### <i><b>" + word + "</i></b>"))
    df3 = pd.DataFrame(entries[word], columns=('token', 'slope', 'p'))
    top_rising = df3.sort_values('slope', ascending=False).head(10)
    print(top_rising)
    print('\n\n')
    for token in top_rising['token']:
        plot_cosine_series(word, token, 8, 4)