Calculate edit distances between each question pair (Levenshtein, Jaro, Jaro-Winkler, ...).
This utility package imports `numpy`, `pandas`, `matplotlib` and a helper `kg` module into the root namespace.
In [1]:
from pygoose import *
Fuzzy matching libraries
In [2]:
from fuzzywuzzy import fuzz
from jellyfish import jaro_distance, jaro_winkler
Automatically discover the paths to various data folders and compose the project structure.
In [3]:
# Discover the project layout; `project` exposes data-folder paths such as
# `preprocessed_data_dir` and the `save_features` helper used further down.
project = kg.Project.discover()
Identifier for storing these features on disk and referring to them later.
In [4]:
# Identifier handed to `project.save_features(...)` when the features are stored.
feature_list_id = 'fuzzy'
Preprocessed and tokenized questions.
In [5]:
# Load the preprocessed, tokenized questions for the train and test sets.
# Each item is later unpacked as a (q1_tokens, q2_tokens) pair by `get_fuzzy_distances`.
# NOTE(review): the paths are built by plain string concatenation, which assumes
# `preprocessed_data_dir` ends with a path separator -- confirm.
# NOTE(review): these are .pickle files (presumably unpickled by `kg.io.load`);
# only load artifacts produced by this project.
tokens_train = kg.io.load(project.preprocessed_data_dir + 'tokens_spellcheck_train.pickle')
tokens_test = kg.io.load(project.preprocessed_data_dir + 'tokens_spellcheck_test.pickle')
In [6]:
# Train items first, then test items, so that a single mapping pass covers both;
# the results are split back apart by `len(tokens_train)` afterwards.
tokens = tokens_train + tokens_test
In [7]:
def get_fuzzy_distances(pair):
    """Compute fuzzy string-similarity features for one question pair.

    Args:
        pair: A 2-item sequence ``(q1_tokens, q2_tokens)``, where each item is
            an iterable of string tokens belonging to one question.

    Returns:
        A 1-D float numpy array with 7 values, in this order:
        ``fuzz.ratio``, ``fuzz.partial_ratio``, ``fuzz.token_sort_ratio``,
        ``fuzz.token_set_ratio``, ``fuzz.partial_token_sort_ratio``
        (each divided by 100), followed by ``jaro_distance`` and
        ``jaro_winkler``.
    """
    q1_tokens, q2_tokens = pair
    q1_text = ' '.join(q1_tokens)
    q2_text = ' '.join(q2_tokens)

    # Score the joined question text rather than the raw token lists: the
    # fuzz scorers are string comparators, and handing them lists means the
    # comparison is not made on the actual question text (and is inconsistent
    # with the Jaro features below, which already use the joined text).
    fuzzy_distances = np.array([
        fuzz.ratio(q1_text, q2_text),
        fuzz.partial_ratio(q1_text, q2_text),
        fuzz.token_sort_ratio(q1_text, q2_text),
        fuzz.token_set_ratio(q1_text, q2_text),
        fuzz.partial_token_sort_ratio(q1_text, q2_text),
    ], dtype='float')

    # Normalize to [0 - 1] range.
    fuzzy_distances /= 100

    jelly_distances = np.array([
        jaro_distance(q1_text, q2_text),
        jaro_winkler(q1_text, q2_text),
    ])

    return np.concatenate([fuzzy_distances, jelly_distances])
In [8]:
# Run `get_fuzzy_distances` over every question pair in `tokens`.
# NOTE(review): the train/test slicing that follows assumes the results come
# back in the same order as `tokens` -- confirm for `map_batch_parallel`.
features = kg.jobs.map_batch_parallel(
tokens,
item_mapper=get_fuzzy_distances,
batch_size=1000,
)
In [9]:
# Split the combined results back apart: the first len(tokens_train) entries
# belong to the train set, the remainder to the test set.
X_train = np.array(features[:len(tokens_train)])
X_test = np.array(features[len(tokens_train):])
In [10]:
# Sanity check: expect one row per question pair and 7 columns (one per feature).
print('X_train:', X_train.shape)
print('X_test: ', X_test.shape)
In [11]:
# Column names, listed in the same order as the values returned by
# `get_fuzzy_distances` (five fuzz scores, then jaro, then jaro_winkler).
feature_names = [
'fuzz_ratio',
'fuzz_partial_ratio',
'fuzz_token_sort_ratio',
'fuzz_token_set_ratio',
'fuzz_partial_token_sort_ratio',
'jaro',
'jaro_winkler',
]
In [12]:
# Store the train/test feature matrices together with their column names
# under the `feature_list_id` identifier.
project.save_features(X_train, X_test, feature_names, feature_list_id)