In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
from gensim.models.word2vec import Word2Vec
In [3]:
from collections import OrderedDict
# Load one diachronic word2vec model per 20-year slice (1720-1940 inclusive).
# OrderedDict keeps the slices in chronological order for all later iteration.
models = OrderedDict()
for year in range(1720, 1960, 20):
    models[year] = Word2Vec.load('models/bpo/{}.bin'.format(year))
In [4]:
def cosine_series(anchor, query, model_map=None):
    """Track the cosine similarity of two words across the yearly models.

    Args:
        anchor: Word assumed present in every model's vocabulary (a missing
            anchor would raise a KeyError from the model).
        query: Word whose similarity to ``anchor`` is traced over time.
        model_map: Optional mapping of year -> word2vec model; defaults to
            the notebook-level ``models`` dict. Each model must support
            ``query in model`` and ``model.similarity(anchor, query)``.

    Returns:
        OrderedDict mapping year -> similarity. When ``query`` is
        out-of-vocabulary for a slice the value is 0 — a sentinel, not a
        true similarity, so it can bias downstream regressions.
    """
    if model_map is None:
        model_map = models  # fall back to the module-level model dict
    series = OrderedDict()
    for year, model in model_map.items():
        series[year] = (
            model.similarity(anchor, query)
            if query in model else 0
        )
    return series
In [5]:
import numpy as np
import statsmodels.api as sm
def lin_reg(series):
    """Fit an OLS trend line (similarity ~ year) to a cosine series.

    series: mapping of year -> similarity value.
    Returns the fitted statsmodels results object; ``params[0]`` is the
    intercept and ``params[1]`` the per-year slope.
    """
    years = np.array(list(series.keys()))
    values = np.array(list(series.values()))
    design = sm.add_constant(years)  # prepend the intercept column
    return sm.OLS(values, design).fit()
In [6]:
def plot_cosine_series(anchor, query, w=5, h=4):
    """Plot the similarity-over-time curve for (anchor, query) plus its OLS
    trend line, in a figure of ``w`` x ``h`` inches."""
    series = cosine_series(anchor, query)
    fit = lin_reg(series)
    years = list(series.keys())
    fitted = fit.predict()
    # Trend-line endpoints: fitted values at the first and last year.
    trend_x = [years[0], years[-1]]
    trend_y = [fitted[0], fitted[-1]]
    print(anchor, query)
    plt.figure(figsize=(w, h))
    plt.ylim(0, 1)
    plt.title(query)
    plt.xlabel('Year')
    plt.ylabel('Similarity')
    plt.plot(years, list(series.values()))
    plt.plot(trend_x, trend_y, color='gray', linewidth=0.5)
    plt.show()
In [7]:
# Trace how 'poetry', 'writing', and 'polite' drift relative to the anchors.
for anchor, query in [
    ('science', 'poetry'),
    ('literature', 'poetry'),
    ('literature', 'writing'),
    ('literature', 'polite'),
]:
    plot_cosine_series(anchor, query)
In [8]:
import enchant
# US-English dictionary used below to filter non-words (e.g. OCR noise)
# out of the nearest-neighbour lists.
dictionary = enchant.Dict('en_US')
def union_neighbor_vocab(anchor, topn=200):
    """Union, across all yearly models, of the ``topn`` nearest neighbours
    of ``anchor`` that pass the English dictionary check."""
    vocab = set()
    for model in models.values():
        neighbors = model.most_similar(anchor, topn=topn)
        vocab |= {word for word, _ in neighbors if dictionary.check(word)}
    return vocab
In [9]:
union_vocab = union_neighbor_vocab('literature')
In [10]:
# Fit a linear trend to each candidate's similarity series; record the
# per-year slope and its p-value for ranking below.
data = []
for token in union_vocab:
    fit = lin_reg(cosine_series('literature', token))
    data.append((token, fit.params[1], fit.pvalues[1]))
In [11]:
import pandas as pd
# One row per candidate token: trend-line slope and its p-value.
df1 = pd.DataFrame(data, columns=('token', 'slope', 'p'))
In [12]:
# Show up to 1000 rows so the full ranking renders inline.
pd.set_option('display.max_rows', 1000)
# Tokens whose similarity to 'literature' rises fastest over time.
df1.sort_values('slope', ascending=False).head(50)
Out[12]:
In [13]:
# Plot the 20 fastest-rising neighbours as small multiples.
top_rising = df1.sort_values('slope', ascending=False).head(20)
for token in top_rising['token']:
    plot_cosine_series('literature', token, 3, 2)
In [14]:
df1.sort_values('slope', ascending=True).head(50)
Out[14]:
In [15]:
# Plot the 20 fastest-falling neighbours as small multiples.
top_falling = df1.sort_values('slope', ascending=True).head(20)
for token in top_falling['token']:
    plot_cosine_series('literature', token, 3, 2)
In [16]:
def intersect_neighbor_vocab(anchor, topn=1000):
    """Words that appear among the ``topn`` dictionary-checked nearest
    neighbours of ``anchor`` in EVERY time slice."""
    per_year = []
    for model in models.values():
        neighbors = model.most_similar(anchor, topn=topn)
        per_year.append({word for word, _ in neighbors if dictionary.check(word)})
    return set.intersection(*per_year)
In [17]:
intersect_vocab = intersect_neighbor_vocab('literature')
In [20]:
# Re-fit trends on the stable vocabulary, keeping only slopes that are
# statistically significant at the 5% level.
data = []
for token in intersect_vocab:
    fit = lin_reg(cosine_series('literature', token))
    if fit.pvalues[1] < 0.05:
        data.append((token, fit.params[1], fit.pvalues[1]))
In [21]:
# NOTE(review): pandas was already imported earlier; this re-import is
# redundant but harmless on a linear run. (Execution counts also jump from
# In[17] to In[20] here — re-run top-to-bottom to confirm no hidden state.)
import pandas as pd
# One row per significant token: trend-line slope and its p-value.
df2 = pd.DataFrame(data, columns=('token', 'slope', 'p'))
In [22]:
df2.sort_values('slope', ascending=False)
Out[22]:
In [23]:
# Plot every significant trend, steepest rise first.
for token in df2.sort_values('slope', ascending=False)['token']:
    plot_cosine_series('literature', token, 3, 2)
In [ ]: