In [1]:
%pylab inline
In [1]:
import nltk
from tethne.readers import zotero
import matplotlib.pyplot as plt
from helpers import normalize_token, filter_token
import gensim
from itertools import groupby, islice
In [2]:
# Paths to the Embryo Project corpus: the plain-text files and the Zotero
# export directory holding the bibliographic metadata for them.
text_root = '../data/EmbryoProjectTexts/files'
zotero_export_path = '../data/EmbryoProjectTexts'
# File IDs are matched by the 'https.+' pattern -- presumably the files are
# named after their source URLs; verify against the data directory.
corpus = nltk.corpus.PlaintextCorpusReader(text_root, 'https.+')
# Index metadata records by link so they can be looked up by corpus file ID.
metadata = zotero.read(zotero_export_path, index_by='link', follow_links=False)
In [3]:
# Build two parallel lists: one publication date per sentence (repeated for
# every sentence of the same document) and the normalized, filtered token
# list for that sentence. Each sentence becomes one training "document".
years = []
documents = []
for fileid in corpus.fileids():
    # Hoist the metadata lookup out of the sentence loop: the date is
    # identical for every sentence in the same file.
    date = metadata[fileid].date
    for sentence in corpus.sents(fileids=[fileid]):
        years.append(date)
        documents.append([normalize_token(token)
                          for token in sentence
                          if filter_token(token)])
In [4]:
# Number of sentence-level documents available to train the model.
len(documents)
Out[4]:
In [5]:
# Train word embeddings: 200-dimensional vectors, context window of 5 tokens.
# `size` is the pre-gensim-4.0 keyword (consistent with the `syn0` /
# `index2word` attributes used below).
# NOTE(review): no seed or workers=1 is given, so vectors differ between
# runs -- confirm whether reproducibility matters for this analysis.
model = gensim.models.Word2Vec(documents, size=200, window=5)
Similarity is measured using cosine similarity in the low-dimensional embedding/vector space.
In [31]:
# Vocabulary words whose vectors are closest (cosine similarity) to 'cell'.
model.most_similar(positive=['cell'])
Out[31]:
In [39]:
# Pairwise cosine similarities -- presumably chosen to show similarity
# falling off with relatedness (ovum > embryo > brain); see output to confirm.
model.similarity('egg', 'ovum'), model.similarity('egg', 'embryo'), model.similarity('egg', 'brain')
Out[39]:
In [61]:
# The raw embedding matrix: one row per vocabulary word, one column per
# embedding dimension (200, from the training call above).
model.syn0.shape # Words x Dimensions.
Out[61]:
In [63]:
# Vocabulary size -- should match the row count of model.syn0 above.
len(model.index2word)
Out[63]:
In [7]:
from sklearn import decomposition
In [52]:
# Hand-picked embryology-related vocabulary to project into 2D and plot below.
focal_words = ['egg', 'ontogeny', 'ovum', 'fetus', 'ivf', 'uterus', 'brain',
'neuron', 'cell', 'transplantation', 'culture', 'stem', 'nucleus']
In [53]:
# Row index of each focal word in the embedding matrix. Building the lookup
# dict once is O(V + k) rather than O(V * k) for repeated list.index() scans
# over the whole vocabulary. (A word missing from the vocabulary now raises
# KeyError instead of ValueError; either way the cell fails loudly.)
word_index = {word: i for i, word in enumerate(model.index2word)}
focal_indices = [word_index[w] for w in focal_words]
In [54]:
# Project the 200-dimensional embeddings down to 2 components for plotting.
pca = decomposition.PCA(n_components=2)
In [55]:
# t has shape (len(focal_indices), 2): 2D coordinates for each focal word,
# fitted on the focal-word vectors only (not the whole vocabulary).
t = pca.fit_transform(model.syn0[focal_indices, :])
In [56]:
# Scatter the focal words in the 2D PCA space and label each point.
plt.scatter(t[:, 0], t[:, 1])
# zip() keeps coordinates and labels aligned, and -- unlike the original
# xrange() -- works on both Python 2 and Python 3.
for (x, y), word in zip(t, focal_words):
    plt.text(x, y, word)
plt.title('PCA projection of focal word embeddings')
plt.show()
In [ ]: