Break question titles into tokens and perform token-level normalization: expand contractions, spell out digits, correct spelling, and optionally remove stopwords.
The pygoose utility package imports numpy, pandas, matplotlib, and a helper kg module into the root namespace.
In [1]:
from pygoose import *
In [2]:
import nltk
Automatically discover the paths to various data folders and compose the project structure.
In [3]:
project = kg.Project.discover()
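The discovered project object exposes the directory attributes used in the cells below: project.data_dir, project.aux_dir, project.preprocessed_data_dir, and project.features_dir.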
Original question datasets.
In [4]:
df_train = pd.read_csv(project.data_dir + 'train.csv').fillna('none')
df_test = pd.read_csv(project.data_dir + 'test.csv').fillna('none')
In [5]:
df_all = pd.concat([df_train, df_test])
Stopwords customized for Quora dataset.
In [6]:
stopwords = set(kg.io.load_lines(project.aux_dir + 'stopwords.vocab'))
Pre-composed spelling correction dictionary.
In [7]:
spelling_corrections = kg.io.load_json(project.aux_dir + 'spelling_corrections.json')
In [8]:
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
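As a quick illustration of what this tokenizer does (the sample string is made up): \w+ keeps only alphanumeric runs, so punctuation is dropped and apostrophes split contractions, which is why negations are expanded on the raw text before the final tokenization pass.

tokenizer.tokenize("Won't AI take over?")  # ['Won', 't', 'AI', 'take', 'over']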
In [9]:
def translate(text, translation):
    for token, replacement in translation.items():
        # Pad the replacement with spaces so it stays a separate token.
        text = text.replace(token, ' ' + replacement + ' ')
    # Collapse the double spaces introduced by the padded replacements.
    text = text.replace('  ', ' ')
    return text
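A quick sanity check on a made-up string: each replacement is padded with spaces so it cannot merge into a neighboring token, and the double spaces this introduces are collapsed afterwards.

translate('2fast', {'2': 'two'})  # ' two fast' (the stray leading space is harmless: the tokenizer drops it)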
In [10]:
def spell_digits(text):
    translation = {
        '0': 'zero',
        '1': 'one',
        '2': 'two',
        '3': 'three',
        '4': 'four',
        '5': 'five',
        '6': 'six',
        '7': 'seven',
        '8': 'eight',
        '9': 'nine',
    }
    return translate(text, translation)
In [11]:
def expand_negations(text):
    translation = {
        "can't": 'can not',
        "won't": 'will not',
        "shan't": 'shall not',
    }
    text = translate(text, translation)
    # Handle the remaining regular contractions ("doesn't", "isn't", ...).
    return text.replace("n't", " not")
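For example, on a made-up input:

expand_negations("I can't see why this won't work, isn't it?")
# -> 'I can not see why this will not work, is not it?'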
In [12]:
def correct_spelling(text):
    return ' '.join(
        spelling_corrections.get(token, token)
        for token in tokenizer.tokenize(text)
    )
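Assuming, purely for illustration, that the dictionary contained an entry like {'quikly': 'quickly'} (the actual contents come from spelling_corrections.json), the function would behave as follows:

correct_spelling('how quikly can I learn')  # 'how quickly can I learn'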
In [13]:
def get_question_tokens(question, lowercase=True, spellcheck=True, remove_stopwords=True):
    if lowercase:
        question = question.lower()
    if spellcheck:
        question = correct_spelling(question)
    question = spell_digits(question)
    question = expand_negations(question)
    tokens = tokenizer.tokenize(question)
    if remove_stopwords:
        tokens = [token for token in tokens if token not in stopwords]
    # Append an end-of-sentence marker.
    tokens.append('.')
    return tokens
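Putting it together on a made-up question (the exact output depends on the stopword list and spelling dictionary, so this is only indicative):

get_question_tokens("What's 2 plus 2?")
# e.g. ['two', 'plus', 'two', '.'] if 'what' and 's' are in the stopword list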
In [14]:
def get_question_pair_tokens_spellcheck(pair):
    return [
        get_question_tokens(pair[0], lowercase=False, spellcheck=True, remove_stopwords=False),
        get_question_tokens(pair[1], lowercase=False, spellcheck=True, remove_stopwords=False),
    ]
In [15]:
def get_question_pair_tokens_lowercase_spellcheck(pair):
    return [
        get_question_tokens(pair[0], lowercase=True, spellcheck=True, remove_stopwords=False),
        get_question_tokens(pair[1], lowercase=True, spellcheck=True, remove_stopwords=False),
    ]
In [16]:
def get_question_pair_tokens_lowercase_spellcheck_remove_stopwords(pair):
    return [
        get_question_tokens(pair[0], lowercase=True, spellcheck=True, remove_stopwords=True),
        get_question_tokens(pair[1], lowercase=True, spellcheck=True, remove_stopwords=True),
    ]
Tokenize the questions, correct spelling, but keep the upper/lower case.
In [17]:
tokens_spellcheck = kg.jobs.map_batch_parallel(
    df_all[['question1', 'question2']].values,
    item_mapper=get_question_pair_tokens_spellcheck,
    batch_size=1000,
)
Tokenize the questions, convert them to lowercase, and correct spelling, but keep the stopwords (useful for neural models).
In [18]:
tokens_lowercase_spellcheck = kg.jobs.map_batch_parallel(
    df_all[['question1', 'question2']].values,
    item_mapper=get_question_pair_tokens_lowercase_spellcheck,
    batch_size=1000,
)
Just as before, but also with stopwords removed.
In [19]:
tokens_lowercase_spellcheck_no_stopwords = kg.jobs.map_batch_parallel(
    df_all[['question1', 'question2']].values,
    item_mapper=get_question_pair_tokens_lowercase_spellcheck_remove_stopwords,
    batch_size=1000,
)
In [20]:
vocab = set()
for pair in progressbar(tokens_lowercase_spellcheck):
    for question in pair:
        for token in question:
            vocab.add(token)
In [21]:
vocab_no_stopwords = vocab - stopwords
Tokenized questions.
In [22]:
kg.io.save(
    tokens_spellcheck[:len(df_train)],
    project.preprocessed_data_dir + 'tokens_spellcheck_train.pickle'
)
kg.io.save(
    tokens_spellcheck[len(df_train):],
    project.preprocessed_data_dir + 'tokens_spellcheck_test.pickle'
)
In [23]:
kg.io.save(
    tokens_lowercase_spellcheck[:len(df_train)],
    project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_train.pickle'
)
kg.io.save(
    tokens_lowercase_spellcheck[len(df_train):],
    project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_test.pickle'
)
In [24]:
kg.io.save(
    tokens_lowercase_spellcheck_no_stopwords[:len(df_train)],
    project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords_train.pickle'
)
kg.io.save(
    tokens_lowercase_spellcheck_no_stopwords[len(df_train):],
    project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords_test.pickle'
)
Question vocabulary.
In [25]:
kg.io.save_lines(
    sorted(vocab),
    project.preprocessed_data_dir + 'tokens_lowercase_spellcheck.vocab'
)
In [26]:
kg.io.save_lines(
    sorted(vocab_no_stopwords),
    project.preprocessed_data_dir + 'tokens_lowercase_spellcheck_no_stopwords.vocab'
)
Ground truth.
In [27]:
kg.io.save(df_train['is_duplicate'].values, project.features_dir + 'y_train.pickle')