First, let's create a function to calculate a MinHash:
In [1]:
from datasketch import MinHash  # great library for probabilistic data structures

def mh_digest(data):
    mh = MinHash(num_perm=512)

    for d in data:
        mh.update(d.encode('utf8'))  # hash each keyword, encoded as UTF-8

    return mh
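As a quick sanity check, here is a minimal sketch (using hypothetical toy word sets, not the corpus) showing that the MinHash estimate tracks the exact Jaccard similarity:
In [ ]:
# toy example: compare the MinHash estimate with the exact Jaccard similarity
set_a = {"machine", "learning", "neural", "network"}
set_b = {"machine", "learning", "deep", "network"}

est = mh_digest(set_a).jaccard(mh_digest(set_b))
exact = len(set_a & set_b) / len(set_a | set_b)

print("estimated: %0.4f  exact: %0.4f" % (est, exact))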
Then we'll iterate through each parsed document, adding the keywords to the MinHash:
In [2]:
import pynlp
files = ["a4.json", "a3.json", "a2.json", "a1.json"]
stopwords = pynlp.load_stopwords("stop.txt")
files_set = {}
files_mh = {}
for json_file in files:
    keywords = set([])  # initialize the keyword set for each JSON file

    for lex in pynlp.lex_iter(json_file):
        if (lex.pos != ".") and (lex.root not in stopwords):  # skip punctuation and stopwords
            keywords.add(lex.root)  # add the canonical root form of the word to the set

    files_set[json_file] = keywords
    files_mh[json_file] = mh_digest(keywords)  # compute the MinHash digest for this document
    print(json_file, keywords)
Let's compare the HTML documents, using pairwise MinHash comparisons to approximate their Jaccard similarity:
In [ ]:
# use MinHash (instead of exact Jaccard) to compare all 4 documents pairwise
import itertools

sim = []

for i1, i2 in itertools.combinations(range(len(files)), 2):
    j = files_mh[files[i1]].jaccard(files_mh[files[i2]])
    sim.append((j, files[i1], files[i2],))

# comparing two MinHash digests is very cheap: trading a few percent
# of accuracy can buy a large speedup
for jaccard, file1, file2 in sorted(sim, key=lambda x: x[0], reverse=True):
    print("%0.4f\t%s\t%s" % (jaccard, file1, file2))
Note the top-ranked ("most similar") pair: a2.json and a3.json (parsed from html/article2.html and html/article3.html) are both about machine learning. Take a look at their overlapping keywords:
In [ ]:
files_set["a3.json"] & files_set["a2.json"]
# show the intersection words for the documents that have been approximated with MinHas
# what other documents are out there that are similar to this set of documents?