Derive bag-of-POS-tag and bag-of-NER-tag vectors from each question and calculate their vector distances.
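As a minimal, self-contained sketch of the idea (hand-labeled tags and hypothetical questions; the real pipeline below uses spaCy): each question becomes a vector of tag counts over a fixed whitelist, and question pairs are compared with standard vector distances.
In [ ]:
import numpy as np
from collections import Counter
from scipy.spatial.distance import euclidean

pos_whitelist = ['NOUN', 'VERB', 'PROPN']
# Hand-labeled POS tags for two hypothetical questions.
q1_tags = ['PROPN', 'VERB', 'NOUN', 'NOUN']
q2_tags = ['PROPN', 'VERB', 'NOUN']

v1 = np.array([Counter(q1_tags)[tag] for tag in pos_whitelist])  # [2, 1, 1]
v2 = np.array([Counter(q2_tags)[tag] for tag in pos_whitelist])  # [1, 1, 1]
print(euclidean(v1, v2))  # 1.0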
The pygoose utility package imports numpy, pandas, matplotlib, and the helper kg module into the root namespace, so np, pd, and kg are available below without further imports.
In [1]:
from pygoose import *
In [2]:
import os
import warnings
In [3]:
from collections import Counter
In [4]:
from scipy.spatial.distance import cosine, euclidean
In [5]:
import spacy
Automatically discover the paths to various data folders and compose the project structure.
In [6]:
project = kg.Project.discover()
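A quick way to inspect what was discovered (these attributes are used throughout this notebook):
In [ ]:
print(project.data_dir)
print(project.preprocessed_data_dir)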
Identifier for storing these features on disk and referring to them later.
In [7]:
feature_list_id = 'nlp_tags'
Original question datasets.
In [8]:
df_train = pd.read_csv(project.data_dir + 'train.csv').fillna('')
df_test = pd.read_csv(project.data_dir + 'test.csv').fillna('')
Preprocessed and tokenized questions.
We should not use lowercased tokens here: casing is an important signal for named entity recognition.
In [9]:
tokens_train = kg.io.load(project.preprocessed_data_dir + 'tokens_spellcheck_train.pickle')
tokens_test = kg.io.load(project.preprocessed_data_dir + 'tokens_spellcheck_test.pickle')
In [10]:
df_all_texts = pd.DataFrame(
[[' '.join(pair[0]), ' '.join(pair[1])] for pair in tokens_train + tokens_test],
columns=['question1', 'question2'],
)
Dependency parsing takes a lot of time and we don't derive any features from it, so let's disable it in the pipeline.
If model loading fails, download the model first: python -m spacy download en
In [11]:
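# parser=False is the spaCy 1.x API; in spaCy 2.x+ the equivalent is
# spacy.load('en', disable=['parser']).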
nlp = spacy.load('en', parser=False)
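As noted above, casing matters for NER. A quick sanity check (the exact entities depend on the model version, so treat the comments as indicative):
In [ ]:
doc_cased = nlp('Why did Google buy YouTube in 2006 ?')
doc_lower = nlp('why did google buy youtube in 2006 ?')
print([(ent.text, ent.label_) for ent in doc_cased.ents])
print([(ent.text, ent.label_) for ent in doc_lower.ents])
# The cased variant typically yields ORG and DATE entities;
# the lowercased one loses most of them.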
In [12]:
pos_tags_whitelist = ['ADJ', 'ADV', 'NOUN', 'PROPN', 'NUM', 'VERB']
ner_tags_whitelist = ['GPE', 'LOC', 'ORG', 'NORP', 'PERSON', 'PRODUCT', 'DATE', 'TIME', 'QUANTITY', 'CARDINAL']
In [13]:
num_raw_features = len(pos_tags_whitelist) + len(ner_tags_whitelist)
In [14]:
X1 = np.zeros((len(df_all_texts), num_raw_features))
X2 = np.zeros((len(df_all_texts), num_raw_features))
In [15]:
X1.shape, X2.shape
In [16]:
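# n_threads follows the spaCy 1.x API; later spaCy releases ignore it
# in favor of the n_process keyword.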
pipe_q1 = nlp.pipe(df_all_texts['question1'].values, n_threads=os.cpu_count())
pipe_q2 = nlp.pipe(df_all_texts['question2'].values, n_threads=os.cpu_count())
In [17]:
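# For each first question: count whitelisted POS tags and NER labels,
# producing a 16-dimensional tag-count row (6 POS counts + 10 NER counts).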
for i, doc in progressbar(enumerate(pipe_q1), total=len(df_all_texts)):
pos_counter = Counter(token.pos_ for token in doc)
ner_counter = Counter(ent.label_ for ent in doc.ents)
X1[i, :] = np.array(
[pos_counter[pos_tag] for pos_tag in pos_tags_whitelist] +
[ner_counter[ner_tag] for ner_tag in ner_tags_whitelist]
)
In [18]:
for i, doc in progressbar(enumerate(pipe_q2), total=len(df_all_texts)):
pos_counter = Counter(token.pos_ for token in doc)
ner_counter = Counter(ent.label_ for ent in doc.ents)
X2[i, :] = np.array(
[pos_counter[pos_tag] for pos_tag in pos_tags_whitelist] +
[ner_counter[ner_tag] for ner_tag in ner_tags_whitelist]
)
In [19]:
df_pos_q1 = pd.DataFrame(
X1[:, 0:len(pos_tags_whitelist)],
columns=['pos_q1_' + pos_tag.lower() for pos_tag in pos_tags_whitelist]
)
In [20]:
df_pos_q2 = pd.DataFrame(
X2[:, 0:len(pos_tags_whitelist)],
columns=['pos_q2_' + pos_tag.lower() for pos_tag in pos_tags_whitelist]
)
In [21]:
df_ner_q1 = pd.DataFrame(
X1[:, -len(ner_tags_whitelist):],
columns=['ner_q1_' + ner_tag.lower() for ner_tag in ner_tags_whitelist]
)
In [22]:
df_ner_q2 = pd.DataFrame(
X2[:, -len(ner_tags_whitelist):],
columns=['ner_q2_' + ner_tag.lower() for ner_tag in ner_tags_whitelist]
)
In [24]:
def get_vector_distances(i):
return [
# POS distances.
cosine(X1[i, 0:len(pos_tags_whitelist)], X2[i, 0:len(pos_tags_whitelist)]),
euclidean(X1[i, 0:len(pos_tags_whitelist)], X2[i, 0:len(pos_tags_whitelist)]),
# NER distances.
euclidean(X1[i, -len(ner_tags_whitelist):], X2[i, -len(ner_tags_whitelist):]),
np.abs(np.sum(X1[i, -len(ner_tags_whitelist):]) - np.sum(X2[i, -len(ner_tags_whitelist):])),
]
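Note that a question containing none of the whitelisted tags yields a zero vector, for which the cosine distance is undefined: depending on the SciPy version this returns nan and emits a RuntimeWarning, which is why warnings are silenced in the next cell. A minimal demonstration:
In [ ]:
cosine(np.zeros(len(pos_tags_whitelist)), np.ones(len(pos_tags_whitelist)))  # nan (with a RuntimeWarning)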
In [38]:
warnings.filterwarnings('ignore')
X_distances = kg.jobs.map_batch_parallel(
list(range(len(df_all_texts))),
item_mapper=get_vector_distances,
batch_size=1000,
)
In [26]:
X_distances = np.array(X_distances)
In [27]:
df_distances = pd.DataFrame(
X_distances,
columns=[
'pos_tag_cosine',
'pos_tag_euclidean',
'ner_tag_euclidean',
'ner_tag_count_diff',
]
)
In [28]:
df_master = pd.concat(
[df_pos_q1, df_ner_q1, df_pos_q2, df_ner_q2, df_distances],
axis=1,
ignore_index=True,
)
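Since ignore_index=True discards the column labels during concatenation, reattach them in the same order: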
In [29]:
df_master.columns = list(df_pos_q1.columns) + \
list(df_ner_q1.columns) + \
list(df_pos_q2.columns) + \
list(df_ner_q2.columns) + \
list(df_distances.columns)
In [30]:
df_master.describe().T
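The rows of df_master follow the order of df_all_texts (training pairs first, then test pairs), so the feature matrices can be split by row count: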
In [32]:
X_train = df_master[:len(tokens_train)].values
X_test = df_master[len(tokens_train):].values
In [33]:
print('X train:', X_train.shape)
print('X test: ', X_test.shape)
In [34]:
feature_names = list(df_master.columns)
In [35]:
project.save_features(X_train, X_test, feature_names, feature_list_id)