Based on the training and test sets, extract a list of unique question texts.
This utility package imports numpy, pandas, matplotlib and a helper kg module into the root namespace.
In [1]:
from pygoose import *
In [2]:
import nltk
Automatically discover the paths to various data folders and compose the project structure.
In [3]:
# Auto-discover the project layout (data/preprocessed directories, etc.)
# via the pygoose helper; paths are read from `project` below.
project = kg.Project.discover()
Original question datasets.
In [4]:
# Load the raw question-pair CSVs; replace missing values with empty strings
# so downstream string operations never hit NaN.
# NOTE(review): assumes project.data_dir ends with a path separator — confirm.
df_train = pd.read_csv(project.data_dir + 'train.csv').fillna('')
df_test = pd.read_csv(project.data_dir + 'test.csv').fillna('')
In [5]:
# Tokenizer that extracts contiguous runs of word characters (\w+),
# implicitly discarding punctuation and whitespace.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
In [6]:
# Stack train and test so unique questions can be extracted across both.
# Columns present in only one frame (if any) will be NaN-filled by concat.
df = pd.concat([df_train, df_test])
In [7]:
# Collect every question string from both question columns, strip surrounding
# whitespace and stray quote characters, then deduplicate.
# Stripping happens BEFORE deduplication: stripping after np.unique (as the
# original code did) can reintroduce duplicates, e.g. '"How?"' and 'How?'
# both collapse to 'How?'. sorted() keeps the deterministic lexicographic
# ordering that np.unique used to provide.
unique_question_texts = sorted({
    question.strip(' \'"')
    for question in df[['question1', 'question2']].values.ravel()
})
In [8]:
def tokenize_question_text(q):
    """Lowercase a question string and split it into word tokens.

    Uses the module-level `tokenizer` (RegexpTokenizer over \\w+), so
    punctuation and whitespace are dropped.
    """
    lowered = q.lower()
    return tokenizer.tokenize(lowered)
In [9]:
# Tokenize all unique questions in parallel batches via the pygoose job
# helper; batch_size controls how many items each worker handles per batch.
unique_question_tokens = kg.jobs.map_batch_parallel(
unique_question_texts,
item_mapper=tokenize_question_text,
batch_size=1000,
)
In [10]:
# Persist the cleaned unique question texts, one question per line.
kg.io.save_lines(unique_question_texts, project.preprocessed_data_dir + 'unique_questions_raw.txt')
In [11]:
# Persist the token lists as a pickle; entries are index-aligned with the
# raw text file written above (both derive from unique_question_texts).
kg.io.save(unique_question_tokens, project.preprocessed_data_dir + 'unique_questions_tokenized.pickle')