In [1]:
from __future__ import print_function
from collections import Counter
from tqdm import tqdm
try:
    import cPickle as pickle
except ImportError:
    import pickle

In [2]:
# Load the processed data we created in the previous notebook.
with open("./data/processed/01.processed_train.pkl", "rb") as data_file:
    raw_train_lines = pickle.load(data_file)

In [3]:
# Print the first example
raw_train_lines[0]


Out[3]:
(['what',
  'is',
  'the',
  'step',
  'by',
  'step',
  'guide',
  'to',
  'invest',
  'in',
  'share',
  'market',
  'in',
  'india',
  '?'],
 ['what',
  'is',
  'the',
  'step',
  'by',
  'step',
  'guide',
  'to',
  'invest',
  'in',
  'share',
  'market',
  '?'],
 0)

Right now, each data point consists of two tokenized questions (each a list of string tokens) and an integer label. Models operate on numbers rather than raw strings, so we need to convert these tokens to integers.

To do this, we assign each unique token an integer ID and replace every occurrence of the token with its ID, giving the model a consistent numeric encoding of the input. This is called "indexing" the data.


In [4]:
# Reserve special indices: 0 for padding, 1 for out-of-vocabulary words.
padding_token = "@@PADDING@@"
oov_token = "@@UNKNOWN@@"
word_indices = {padding_token: 0, oov_token: 1}
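
The loop below only assigns indices to words it has actually seen in the training data, so at prediction time any unseen word should fall back to the reserved OOV index. A minimal sketch of that lookup (index_word is a hypothetical helper, not used elsewhere in this notebook):

# Hypothetical helper: look up a word's index, falling back to the
# reserved OOV index for words not seen during training.
def index_word(word):
    return word_indices.get(word, word_indices[oov_token])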

In [5]:
for train_instance in tqdm(raw_train_lines):
    # unpack the tuple into 3 variables
    question_1, question_2, label = train_instance

    # iterate over the tokens in both questions, and add each token
    # to the word indices if it isn't in there already
    for word in question_1 + question_2:
        if word not in word_indices:
            # by taking the current length of the dictionary
            # to be the index, we guarantee that each unique word
            # gets a unique index
            index = len(word_indices)
            word_indices[word] = index


100%|██████████| 404290/404290 [00:02<00:00, 140863.81it/s]

In [6]:
# The number of unique tokens in our corpus
len(word_indices)


Out[6]:
104472
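
Since Counter was imported at the top, we can also peek at the token frequencies behind this vocabulary. This isn't required for indexing, but it's a quick sanity check on the corpus (a sketch, assuming raw_train_lines is still in memory):

# Count token frequencies across both questions of every instance.
word_counts = Counter()
for question_1, question_2, label in raw_train_lines:
    word_counts.update(question_1)
    word_counts.update(question_2)

# The ten most common tokens in the corpus.
word_counts.most_common(10)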

Now we will convert the raw_train_lines from lists of string tokens to lists of integer indices.


In [7]:
indexed_train_lines = []
for train_instance in tqdm(raw_train_lines):
    # unpack the tuple into 3 variables
    question_1, question_2, label = train_instance
    
    # for each token in question_1 and question_2, replace it with its index
    indexed_question_1 = [word_indices[word] for word in question_1]
    indexed_question_2 = [word_indices[word] for word in question_2]

    indexed_train_lines.append((indexed_question_1, indexed_question_2, label))


100%|██████████| 404290/404290 [00:06<00:00, 64112.93it/s]

In [8]:
# Print the first indexed example, which is the indexed version of 
# the raw example we printed above.
indexed_train_lines[0]


Out[8]:
([2, 3, 4, 5, 6, 5, 7, 8, 9, 10, 11, 12, 10, 13, 14],
 [2, 3, 4, 5, 6, 5, 7, 8, 9, 10, 11, 12, 14],
 0)

If you compare the first indexed example with the first raw example above, you will see that each unique word has been assigned its own index, and that repeated words (even across the two questions) share the same index.
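
To illustrate the round trip, we can invert the mapping and decode an indexed question back into tokens (a sketch; the inverse dictionary isn't used elsewhere in this notebook):

# Build the inverse mapping from index back to word.
indices_to_words = {index: word for word, index in word_indices.items()}

# Decoding the first indexed question_1 recovers the original tokens.
[indices_to_words[index] for index in indexed_train_lines[0][0]]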

Now, we'll repackage the lists into a slightly more digestible format for the model: one list of lists for all of the question_1's (note that each "question" is now a list of integers), one list of lists for all of the question_2's, and one list of labels.

These lists correspond index-wise, so labels[i] is the label of the data point with indexed_question_1s[i] and indexed_question_2s[i].


In [9]:
indexed_question_1s = []
indexed_question_2s = []
labels = []

for indexed_train_line in tqdm(indexed_train_lines):
    # Unpack the tuple into 3 variables
    indexed_question_1, indexed_question_2, label = indexed_train_line
    
    # Now add each of the individual elements of one train instance to their
    # separate lists.
    indexed_question_1s.append(indexed_question_1)
    indexed_question_2s.append(indexed_question_2)
    labels.append(label)


100%|██████████| 404290/404290 [00:00<00:00, 941351.69it/s]
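
As an aside, the same repackaging could be written in one line with zip, though the explicit loop above is easier to follow (a sketch, equivalent to the loop):

# zip(*...) transposes the list of (q1, q2, label) tuples into three
# parallel tuples; map(list, ...) turns each back into a list.
indexed_question_1s, indexed_question_2s, labels = map(list, zip(*indexed_train_lines))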

In [10]:
# Print the first element from each of the lists, it should be the same as the
# first element of the combined dataset above.
print("First indexed_question_1s: {}".format(indexed_question_1s[0]))
print("First indexed_question_2s: {}".format(indexed_question_2s[0]))
print("First label: {}".format(labels[0]))


First indexed_question_1s: [2, 3, 4, 5, 6, 5, 7, 8, 9, 10, 11, 12, 10, 13, 14]
First indexed_question_2s: [2, 3, 4, 5, 6, 5, 7, 8, 9, 10, 11, 12, 14]
First label: 0
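
As one more check (not part of the original run), we could assert that the three lists line up in length:

# All three lists should have one entry per training instance.
assert len(indexed_question_1s) == len(indexed_question_2s) == len(labels) == len(raw_train_lines)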

Looks like everything matches up! We'll pickle these indexed instances for use when actually training the model.


In [11]:
# Pickle the data lists.
pickle.dump(indexed_question_1s, open("./data/processed/02.indexed_question_1s_train.pkl", "wb"))
pickle.dump(indexed_question_2s, open("./data/processed/02.indexed_question_2s_train.pkl", "wb"))
pickle.dump(labels, open("./data/processed/02.labels_train.pkl", "wb"))

# Also pickle the word indices
pickle.dump(word_indices, open("./data/processed/02.word_indices.pkl", "wb"))
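
For reference, a later notebook could load these artifacts back like so (a minimal sketch; the actual training notebook may organize this differently):

# Reload the pickled artifacts produced above.
with open("./data/processed/02.indexed_question_1s_train.pkl", "rb") as q1_file:
    indexed_question_1s = pickle.load(q1_file)
with open("./data/processed/02.word_indices.pkl", "rb") as indices_file:
    word_indices = pickle.load(indices_file)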