In [6]:
from __future__ import division, print_function, unicode_literals
%matplotlib inline
import IPython.display
import numpy as np
import matplotlib.pyplot as plt
import data_io
import editdist
import arrow
import twython
import textblob
import requests
import utilities
# Keyboard shortcuts: http://ipython.org/ipython-doc/stable/interactive/notebook.html#keyboard-shortcuts
Note: I wrote this little section before I decided to do away with EMD. It is not really relevant anymore. Get rid of it? Stick it in another notebook?
Compute histograms of the occurrence of each word as the basis for a Bag-of-Words style model for cross-comparing multiple sets of words. Use the EMD metric as the basis for quantifying the similarity of two histograms. The idea here is to compute a cost metric between all pairs of words used in this analysis. So let's say my complete list of words is this:
In [7]:
# The complete vocabulary used throughout this analysis.
words = [
    'apple',
    'pear',
    'peach',
    'banana',
    'raspberry',
]
First, a quick sanity check that editdist.distance behaves as expected on a few example string pairs:
In [8]:
# Sanity-check editdist.distance on a few example string pairs,
# ranging from identical strings to completely dissimilar ones.
example_pairs = [
    ('abcd', 'abcd'),
    ('abcd', 'abcf'),
    ('abcd', 'ah!'),
    ('abcd', 'eh!'),
    ('abcd', 'trampoline'),
]
d1, d2, d3, d4, d5 = (editdist.distance(a, b) for a, b in example_pairs)
print(d1, d2, d3, d4, d5)
In [9]:
# Build the full N x N matrix of pairwise edit distances over `words`.
# Note: edit distance is symmetric, so the matrix is symmetric with a
# zero diagonal; we compute all N*N entries for simplicity.
N = len(words)
cost = np.zeros((N, N))
for row, word_a in enumerate(words):
    for col, word_b in enumerate(words):
        cost[row, col] = editdist.distance(word_a, word_b)
print(cost)
In [ ]: