This notebook is part of the material for the talk "Beyond PageRank" at the Studienstiftung Winterakademie 2017.
To run the notebook, install the following dependencies. The toolchain presented here includes NLTK, scikit-learn, and the scientific Python stack (numpy, scipy, pylab, pandas, seaborn, ...).
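A minimal setup sketch: install the packages with pip (e.g. pip install nltk scikit-learn numpy scipy matplotlib pandas seaborn), then fetch the NLTK data the notebook relies on. The resource names below are the standard NLTK downloader ids for the corpora and models used in the code that follows.
import nltk
nltk.download("gutenberg")   # example texts loaded below
nltk.download("punkt")       # tokenizer models for word_tokenize
nltk.download("stopwords")   # English stopword list
nltk.download("wordnet")     # data for WordNetLemmatizer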
Some imports
In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from pylab import *
%matplotlib inline
import seaborn as sns
sns.set_style("white")
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import numpy as np
from nltk.stem.porter import *
import pandas as pd
In [2]:
vect = TfidfVectorizer(min_df=1)
tfidf = vect.fit_transform(["I'd like an apple",
                            "An apple a day keeps the doctor away",
                            "Never compare an apple to an orange",
                            "I prefer scikit-learn to Orange"])
sns.heatmap((tfidf * tfidf.T).A)
show()
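TfidfVectorizer L2-normalizes each row by default, so tfidf * tfidf.T holds the pairwise cosine similarities between the four sentences; sentences sharing words such as "apple" or "orange" score higher. A small inspection sketch using the vectorizer's vocabulary_ attribute:
# map column indices back to tokens (vocabulary_ maps token -> column index)
print(sorted(vect.vocabulary_.items(), key=lambda kv: kv[1]))
# cosine similarity between the first two sentences
print((tfidf * tfidf.T).A[0, 1])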
Load example data from the NLTK gutenberg dataset. All credits to the Gutenberg project for providing the content.
In [3]:
def get_corpus():
    corpus = {}
    for key in nltk.corpus.gutenberg.fileids():
        text = nltk.corpus.gutenberg.raw(key)
        key_ = key.replace(".txt", "")
        corpus[key_] = text
        print("Loaded {} containing {} characters".format(key_, len(text)))
    return corpus

corpus = get_corpus()
In [4]:
stemmer = nltk.WordNetLemmatizer()  # despite the variable name, this is a lemmatizer (the Porter stemmer imported above is not used)
tokens = {k : word_tokenize(corpus[k]) for k in corpus.keys()}
stemmed_stopwords = [stemmer.lemmatize(t.lower()) for t in stopwords.words('english')]
stemmed_tokens = {k : [stemmer.lemmatize(t.lower()) for t in tokens[k]] for k in corpus.keys()}
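A quick sanity check of the lemmatization step (a sketch; WordNetLemmatizer treats tokens as nouns by default, so verb forms are left unchanged unless pos="v" is passed):
for w in ["apples", "women", "was"]:
    print(w, "->", stemmer.lemmatize(w.lower()))
print("was ->", stemmer.lemmatize("was", pos="v"))  # "be" when tagged as a verb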
In [50]:
index = set()
for k in corpus.keys():
    for token in stemmed_tokens[k]:
        index.add(token)
# materialize the set as a list (newer pandas versions reject a set as an index)
counts = pd.DataFrame(index=list(index), columns=corpus.keys(), data=0)
In [51]:
for key in corpus.keys():
    print(key)
    for t in stemmed_tokens[key]:
        if t in stemmed_stopwords:
            continue
        counts[key][t] += 1
counts
Out[51]:
In [52]:
# store as sparse dataframe
scounts = counts.to_sparse(fill_value=0)
print("Density", scounts.density)
In [53]:
scounts.to_pickle("corpus_wordcounts.npy")
In [140]:
scounts = pd.read_pickle("corpus_wordcounts.npy")
cols = list(scounts.keys())
# note: the sorted view is not assigned to anything, so this line has no effect here
scounts.sort_values(by="shakespeare-hamlet", ascending=False)
# per-book relative frequency of every word, multiplied across all books:
# only words that occur in every text get a nonzero score
keywords = (scounts/scounts.sum(axis=0)).prod(axis=1).sort_values(ascending=False)
# skip the 100 highest-scoring (most ubiquitous) words and show the next 200
print(", ".join(list(keywords.index[100:300])))
Create a bag-of-words (BOW) and display the words that occur most often (the commented-out draft below is kept as-is; a working sketch follows it).
In [55]:
#max_keys = []
#keys = []
#values = []
#for k,v in counts.items():
# keys.append(k)
# values.append(int(v))
#bag = pd.DataFrame({"words" : keys, "count" : values}).set_index("words")
#bag = bag.sort_values(by="count",ascending=False)[0:40]
#bag
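A working version of the commented-out draft, built directly from the counts DataFrame (a sketch: summing over the book columns collapses the per-book counts into one corpus-wide bag of words, and stopwords are already zero because they were skipped during counting):
bag = counts.sum(axis=1).sort_values(ascending=False)
bag.head(40)  # the 40 most frequent lemmas across all books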
In [5]:
import scipy.sparse
# rows = books, columns = lemmatized word counts
X = scipy.sparse.csr_matrix(scounts.values.T)
X.shape
Out[5]:
In [8]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
tfidf.fit(X)
scores = tfidf.transform(X)
In [90]:
id_bad, = np.where(scounts.index == "bad")[0]
id_nice, = np.where(scounts.index == "nice")[0]
print(id_bad, id_nice)
In [78]:
print(scores.shape)
for k in range(1):
    print(scores[k,:])
In [144]:
import seaborn as sns
sns.set_style("white")
def plot_embedding(word1, word2):
    id_bad, = np.where(scounts.index == word1)[0]
    id_nice, = np.where(scounts.index == word2)[0]
    y = np.zeros((18, 2))  # 18 books, one 2-d point per book
    for k in range(18):
        vec = scores[k,:].toarray()[0]
        y[k,0] = vec[id_bad]   # tf-idf weight of word1 in book k
        y[k,1] = vec[id_nice]  # tf-idf weight of word2 in book k
    # normalize each book's 2-d vector to unit length, so the points lie on the unit circle
    y = y / (y**2).sum(axis=-1, keepdims=True)**0.5
    figure(figsize=(5,5))
    scatter(y[:,0], y[:,1], c=range(18), cmap="Set1")
    for i, txt in enumerate(corpus.keys()):
        if "shake" in txt or True:
            annotate(txt, (y[i,0],y[i,1]))
    # reference diagonal: equal weight on both words
    plot(np.linspace(0,1/2**0.5,10), np.linspace(0,1/2**0.5,10))
    xlim([0, 1])
    ylim([0, 1])
    xlabel(word1)
    ylabel(word2)

def query(query):
    pass
plot_embedding("love", "lie")
show()
# macbeth -> power, oracle, fate, king
# hamlet -> power, king, revenge, love
"""
among, fear, brought, ground, turn, water, wish, arm, dead, sound, lost, none, lay, saying, bed,
run, meet, doubt, fall, need, truth, ready, free, close, making, sit, wood, please, secret, fast,
red, past, dare, age, met, lie, laugh, noise, tongue, rise, breath, besides, wise, fly, angry,
write, shake, [, ], forgot, gently, lettered, care., caring, self-same, 7:57, tumble, flexion,
43:25, interweaving, 119:36, 43:3, no, stomach's, japonica, money-making, elihoenai, erst, vibration,
feebleness, 115:14, adze, 2,800, national, ekronites, military, begun, thalia, bezer, reckons, hoof,
adiew, magic, 64:12, dreamed, giddy, cavern, evan's, airless, ideality, gold-bound-brow, 3:15, every,
unpack, pub-frequenting, syringa, imploring, himselves, merry-mad, ranck, verdigris, richest,
ever-returning, snap-shotted, seventy-seven, meekly, mug, etta, afar, moonshine, mac, passport,
23:21, 27:59, confines, gritted, zaccur, ulloa, onset, youthfull, compliment, petrified, dotes,
`do, unfixed, 23:48, arnholds, excavating, loyalty, exploded, civitas, clank, anim, 17:9, 1:71,
demonstrate, work-basket, contrasting, winded, hampstead, brandy, _times_, twinge, 'just, woollen-draper,
wedding-cake, wrestling, jeopardy, aiath, ijeabarim, construe, striker, evincing, significance,
'advance, 3:60, solicited, contracted, sourse, infringing, nursing, highly, spungie, undiscoverable,
intimately, what-you-call-him, 81:7, kells, gripped, seyward, benign, putteth, 19:33, aid, cherith,
stimulate, chemarims, 6:53, 68:2, 71:11, thrusteth, absurdity, succeeding, sand-hills, 46:25, hesitate,
charles's, tattersall, meddleth, breez, rid, across, carmelite, sea-crashing, ogre, dragon, 56:5, moony,
unfrequent
"""
Out[144]:
In [58]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
pca = PCA(2)
embedding = pca.fit_transform(X.toarray())
#tsne = TSNE()
#embedding = tsne.fit_transform(x_)
scatter(embedding[:,0], embedding[:,1])
for i, txt in enumerate(corpus.keys()):
    if "shake" in txt or True:
        annotate(txt, (embedding[i,0],embedding[i,1]))
show()
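The commented-out t-SNE variant above can be made runnable along these lines (a sketch; perplexity has to stay below the number of books, and the layout depends on random_state):
tsne = TSNE(n_components=2, perplexity=5, random_state=0)
embedding_tsne = tsne.fit_transform(X.toarray())
scatter(embedding_tsne[:,0], embedding_tsne[:,1])
for i, txt in enumerate(corpus.keys()):
    annotate(txt, (embedding_tsne[i,0], embedding_tsne[i,1]))
show()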
In [69]:
from nltk.collocations import *
trigram_measures = nltk.collocations.TrigramAssocMeasures()
for key in corpus.keys():
    if "shakespeare" in key:
        finder = TrigramCollocationFinder.from_words(
            nltk.corpus.gutenberg.words(key+".txt"))
        print(key)
        print(finder.nbest(trigram_measures.pmi, 30))
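PMI tends to rank very rare combinations highest; a common refinement (sketched here with the finder's apply_freq_filter method) is to require a minimum number of occurrences before ranking:
finder = TrigramCollocationFinder.from_words(
    nltk.corpus.gutenberg.words("shakespeare-hamlet.txt"))
finder.apply_freq_filter(3)  # keep only trigrams that occur at least 3 times
print(finder.nbest(trigram_measures.pmi, 30))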