In [1]:
# Note - these lines were added to make the notebook work on a shared JupyterHub instance,
# modifying the system path so that modules installed locally by the shell commands below can be found.
# Adjust the path for your instance, or remove the --user parameter to install the modules normally.
# import sys
# import os
# sys.path.append(os.path.abspath("/...path to your local module install dir..."))
In [2]:
# these are the values we want to test
text1 = 'General Electric Company'
text2 = 'General Electric Co Inc'
In [3]:
import re, math
from collections import Counter
String comparison using cosine similarity https://en.wikipedia.org/wiki/Cosine_similarity
Code sample copypasta from Stack Overflow: http://stackoverflow.com/questions/15173225/how-to-calculate-cosine-similarity-given-2-sentence-strings-python
In [4]:
WORD = re.compile(r'\w+')

def get_cosine(vec1, vec2):
    # cosine similarity between two word-count vectors
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def text_to_vector(text):
    # split text into words and count occurrences
    words = WORD.findall(text)
    return Counter(words)

vector1 = text_to_vector(text1)
vector2 = text_to_vector(text2)

cosine = get_cosine(vector1, vector2)
print('Cosine:', cosine)
Cosine similarity works fine with whole words and word transposition, but it will start to trip up on 'Co' vs 'Company' and when too much extraneous text is introduced, as illustrated below.
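A quick illustration of that behavior, reusing the functions above (the comparison strings here are made up purely for demonstration):
# transposed words still score 1.0 because the word counts are identical
print(get_cosine(text_to_vector('General Electric'), text_to_vector('Electric General')))
# 'Co' and 'Company' count as completely different words, so the score drops
print(get_cosine(text_to_vector('General Electric Co'), text_to_vector('General Electric Company')))
# extra unrelated words dilute the overlap even further
print(get_cosine(text_to_vector('General Electric Company'),
                 text_to_vector('General Electric Company of the United States Holdings Group')))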
String comparison using difflib - https://docs.python.org/3/library/difflib.html
In [5]:
import difflib
from difflib import SequenceMatcher
m = SequenceMatcher(None, text1, text2)
print (m.ratio())
difflib may have some issues when it comes to partial string matches (see the sketch below): http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
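A minimal sketch of the partial-match issue, using a substring of the first test value:
# a substring compared against the full string does not score 1.0,
# even though one is entirely contained in the other
print(SequenceMatcher(None, 'General Electric', text1).ratio())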
Background on FuzzyWuzzy - http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
The package install should only need to be done once, unless the cluster was reset. This installs locally (using the --user parameter), so the sys.path variables in the first cell need to be set.
In [6]:
## %%sh
## pip install fuzzywuzzy --user
FuzzyWuzzy also wants python-Levenshtein to improve speed, but that install failed on gcc, so the import below will complain with a warning.
In [7]:
## %%sh
## pip install python-Levenshtein --user
In [8]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
In [9]:
print (fuzz.ratio(text1, text2))
In [10]:
print (fuzz.partial_ratio(text1, text2))
In [11]:
print (fuzz.token_sort_ratio(text1, text2))
In [12]:
print (fuzz.token_set_ratio(text1, text2))
FuzzyWuzzy also has an interesting "process" function:
choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
process.extract("new york jets", choices, limit=2)
[('New York Jets', 100), ('New York Giants', 78)]
process.extractOne("cowboys", choices)
("Dallas Cowboys", 90)
In [13]:
# Jaccard Similarity: J(A,B) = |Intersection(A,B)| / |Union(A,B)|
def compute_jaccard_similarity_score(x, y):
    # note: iterating over plain strings gives sets of characters, not words
    intersection_cardinality = len(set(x).intersection(set(y)))
    union_cardinality = len(set(x).union(set(y)))
    return intersection_cardinality / float(union_cardinality)

score = compute_jaccard_similarity_score(text1, text2)
print("Jaccard Similarity Score: ", score)
taken from https://codegists.com/code/python%20jaccard/
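Since the function above is comparing sets of characters, a word-level variant gives a different picture (a minimal sketch using the same test strings):
def jaccard_on_words(a, b):
    # split on whitespace and compare sets of word tokens instead of characters
    set_a, set_b = set(a.split()), set(b.split())
    return len(set_a & set_b) / float(len(set_a | set_b))

print("Word-level Jaccard Similarity Score: ", jaccard_on_words(text1, text2))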
Testing the Jellyfish library, with the following algorithms:
String comparison: Levenshtein distance, Damerau-Levenshtein distance, Jaro distance, Jaro-Winkler, Match Rating Approach comparison, Hamming distance
Phonetic encoding: Soundex, Metaphone, NYSIIS, Match Rating codex
In [14]:
## %%sh
## pip install jellyfish --user
In [15]:
import jellyfish
In [16]:
jellyfish.levenshtein_distance(text1,text2)
Out[16]:
In [17]:
jellyfish.damerau_levenshtein_distance(text1,text2)
Out[17]:
In [18]:
jellyfish.jaro_distance(text1,text2)
Out[18]:
In [19]:
jellyfish.jaro_winkler(text1,text2)
Out[19]:
In [20]:
jellyfish.match_rating_comparison(text1,text2)
Out[20]:
In [21]:
jellyfish.hamming_distance(text1,text2)
Out[21]:
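The Levenshtein, Damerau-Levenshtein and Hamming functions return edit counts rather than 0-1 scores; a common way to put Levenshtein on the same scale as the other metrics (a sketch, not a Jellyfish API) is to normalize by the longer string's length:
# normalized Levenshtein similarity: 1 - (edit distance / length of the longer string)
dist = jellyfish.levenshtein_distance(text1, text2)
print(1 - dist / max(len(text1), len(text2)))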
In [22]:
jellyfish.soundex(text1)
Out[22]:
In [23]:
jellyfish.soundex(text2)
Out[23]:
In [24]:
# encode each word separately with Soundex and join the codes
soundexenc = ''
sentence = text1.split()
for word in sentence:
    soundexenc = soundexenc + ' ' + jellyfish.soundex(word)
print(soundexenc)
In [25]:
jellyfish.metaphone(text1)
Out[25]:
In [26]:
jellyfish.metaphone(text2)
Out[26]:
In [27]:
jellyfish.metaphone(text1) == jellyfish.metaphone(text2)
Out[27]:
In [28]:
jellyfish.nysiis(text1)
Out[28]:
In [29]:
jellyfish.nysiis(text2)
Out[29]:
Note that soundex and nysiis both appear to encode just the first word of the string; a quick check follows below.
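A quick way to check that observation is to print the encoding of the full string next to the encoding of the first word alone:
# compare encoding the full string vs. encoding only the first word
print(jellyfish.soundex(text1), '|', jellyfish.soundex(text1.split()[0]))
print(jellyfish.nysiis(text1), '|', jellyfish.nysiis(text1.split()[0]))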
In [30]:
jellyfish.nysiis(text1) == jellyfish.nysiis(text2)
Out[30]:
In [31]:
# encode each word separately with NYSIIS and join the codes
nysiisenc = ''
sentence = text2.split()
for word in sentence:
    nysiisenc = nysiisenc + ' ' + jellyfish.nysiis(word)
print(nysiisenc)
In [32]:
jellyfish.match_rating_codex(text1)
Out[32]:
In [33]:
jellyfish.match_rating_codex(text2)
Out[33]:
In [ ]: