Calculate Jaccard similarities between sets of character $n$-grams for different values of $n$.
This utility package imports `numpy`, `pandas`, `matplotlib` and a helper `kg` module into the root namespace.
In [1]:
from pygoose import *
Automatically discover the paths to various data folders and compose the project structure.
In [2]:
# `project` is used below to locate the preprocessed data directory and to
# save the computed features.
project = kg.Project.discover()
Identifier for storing these features on disk and referring to them later.
In [3]:
# Passed to `project.save_features` at the end as the name of this feature list.
feature_list_id = 'jaccard_ngrams'
Range of $n$ to try for the $n$-grams.
In [4]:
# n = 2, 3, 4, 5 (the upper bound of range() is exclusive).
# The largest n also acts as the minimum question length in
# `get_jaccard_similarities`.
NGRAM_RANGE = range(2, 6)
Preprocessed and tokenized questions.
In [5]:
# Train and test are loaded separately so that the train/test boundary
# (len(tokens_train)) is known when the feature matrix is split later.
# NOTE(review): the files are `.pickle`; presumably `kg.io.load` unpickles
# them, so only trusted files should be loaded here -- confirm.
tokens_train = kg.io.load(project.preprocessed_data_dir + 'tokens_spellcheck_train.pickle')
tokens_test = kg.io.load(project.preprocessed_data_dir + 'tokens_spellcheck_test.pickle')
In [6]:
# Train entries come first, test entries after; the feature matrix is split
# back at index len(tokens_train) further down.
tokens = tokens_train + tokens_test
In [7]:
def get_char_ngrams(doc, n):
    """Return every contiguous slice of length `n` from `doc`, left to right.

    Duplicates are kept. The result is empty when `doc` is shorter than `n`.
    """
    window_count = len(doc) - n + 1
    ngrams = []
    for start in range(window_count):
        ngrams.append(doc[start:start + n])
    return ngrams
In [8]:
def get_jaccard_set_similarities(a, b):
    """Compute three overlap ratios between the sets `a` and `b`.

    Returns:
        A tuple `(jaccard_index, jaccard_index_norm_a, jaccard_index_norm_b)`:
        - `jaccard_index`: |a & b| / |a | b|
        - `jaccard_index_norm_a`: |a & b| / |a|
        - `jaccard_index_norm_b`: |a & b| / |b|

    Empty inputs do not raise `ZeroDivisionError`: two empty sets are treated
    as identical (all ratios 1.0), and a ratio normalised by an empty set
    is 0.0.
    """
    if not a and not b:
        # Nothing to compare on either side: treat the sets as identical.
        return 1.0, 1.0, 1.0

    len_intersection = len(a.intersection(b))
    # The union is non-empty here because at least one set has elements.
    jaccard_index = len_intersection / len(a.union(b))
    jaccard_index_norm_a = len_intersection / len(a) if a else 0.0
    jaccard_index_norm_b = len_intersection / len(b) if b else 0.0
    return jaccard_index, jaccard_index_norm_a, jaccard_index_norm_b
In [9]:
def get_jaccard_similarities(q1, q2, n):
    """Return the Jaccard ratios between the character `n`-gram sets of two strings.

    The length guard compares against the largest n in `NGRAM_RANGE`, not
    against `n` itself, so a pair is treated the same way for every n:
    - both strings shorter than that bound -> (1, 1, 1)
    - exactly one string shorter           -> (0, 0, 0)
    - otherwise the three values from `get_jaccard_set_similarities`.
    """
    min_length = max(NGRAM_RANGE)
    q1_too_short = len(q1) < min_length
    q2_too_short = len(q2) < min_length

    if q1_too_short and q2_too_short:
        return 1, 1, 1
    if q1_too_short or q2_too_short:
        return 0, 0, 0

    ngram_sets = [set(get_char_ngrams(question, n)) for question in (q1, q2)]
    return get_jaccard_set_similarities(*ngram_sets)
In [10]:
def get_question_pair_features(pair):
    """Build the flat feature list for one pair of tokenized questions.

    Each question's tokens are joined with single spaces, then the three
    values from `get_jaccard_similarities` are emitted for every n in
    `NGRAM_RANGE`, in range order.
    """
    q1, q2 = (' '.join(pair[index]) for index in (0, 1))
    return [
        value
        for n in NGRAM_RANGE
        for value in get_jaccard_similarities(q1, q2, n)
    ]
In [11]:
# Compute the feature list for every question pair (train followed by test).
# NOTE(review): presumably runs in parallel over batches of 1000 items and
# returns results in input order -- the positional train/test split below
# relies on that ordering; confirm against `kg.jobs.map_batch_parallel`.
features = kg.jobs.map_batch_parallel(
    tokens,
    item_mapper=get_question_pair_features,
    batch_size=1000,
)
In [12]:
# Column names for the feature matrix; filled in by the loops below and
# later passed to `pd.DataFrame` and `project.save_features`.
feature_names = []
In [13]:
# Three names per n, in the same order as the values emitted by
# `get_question_pair_features`.
for n in NGRAM_RANGE:
    feature_names.extend([
        f'jaccard_ix_{n}gram',
        f'jaccard_ix_norm_q1_{n}gram',
        f'jaccard_ix_norm_q2_{n}gram',
    ])
In [14]:
# Tabulate the raw feature rows under their column names so that columns
# can be combined by name below.
df = pd.DataFrame(features, columns=feature_names)
In [15]:
# Add the absolute difference between the plain Jaccard indices of each n
# and n + 1 as an extra feature column.
for n in NGRAM_RANGE[:-1]:
    m = n + 1
    diff_feature_name = f'jaccard_ix_diff_{n}_{m}'
    df[diff_feature_name] = (
        df[f'jaccard_ix_{n}gram'] - df[f'jaccard_ix_{m}gram']
    ).abs()
    feature_names.append(diff_feature_name)
In [16]:
# Split the rows back into train and test at the original boundary:
# the first len(tokens_train) rows are train, the remainder is test.
X_train, X_test = (
    np.array(rows, dtype='float64')
    for rows in (df.values[:len(tokens_train)], df.values[len(tokens_train):])
)
In [17]:
# Sanity check of the matrix shapes; the trailing space in 'X_test: '
# aligns the two printed lines.
print('X_train:', X_train.shape)
print('X_test: ', X_test.shape)
In [18]:
# Hand both matrices and their column names to the project helper under
# the `feature_list_id` identifier defined above.
project.save_features(X_train, X_test, feature_names, feature_list_id)