In [1]:
from __future__ import print_function
import csv
try:
    # cPickle is the faster C implementation on Python 2
    import cPickle as pickle
except ImportError:
    import pickle

from tqdm import tqdm
# The spaCy English model must be downloaded first by running
# "python -m spacy download en" in your shell.
import spacy
en_nlp = spacy.load('en')
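
As a quick check that the tokenizer behaves as expected, here is a minimal
sketch (the sample sentence below is ours, not from the dataset):

sample = "How do I invest in the share market?"
print([token.text.lower() for token in en_nlp.tokenizer(sample)])
# expected output: ['how', 'do', 'i', 'invest', 'in', 'the', 'share', 'market', '?']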

In [2]:
# We assume the Quora data is at ./data/raw/train.csv;
# if it is somewhere else, change this variable to point
# at the correct path.
quora_data_path = "./data/raw/train.csv"

In [3]:
cleaned_lines = []

with open(quora_data_path, "r") as quora_data:
    data_reader = csv.reader(quora_data)
    # skip the header
    header = next(data_reader, None)
    # iterate over the lines in the CSV reader (around 400,000)
    for line in tqdm(data_reader):
        question_1 = line[3]
        question_2 = line[4]
        is_duplicate = line[5]
        
        # Tokenize the questions (break them up into words) with spaCy,
        # lowercasing every token as we go.
        question_1_tokens = [token.text.lower() for token in en_nlp.tokenizer(question_1)]
        question_2_tokens = [token.text.lower() for token in en_nlp.tokenizer(question_2)]

        # Append a tuple of (question 1 tokens, question 2 tokens, integer label)
        # to the list of cleaned lines.
        cleaned_lines.append((question_1_tokens, question_2_tokens, int(is_duplicate)))
    print("Header: {}".format(header))


404290it [00:51, 7820.49it/s] 
Header: ['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']
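
The loop above tokenizes one question at a time. If that becomes a bottleneck,
spaCy can also tokenize texts in batches; a rough sketch, assuming a spaCy
version whose Tokenizer exposes pipe (the helper name tokenize_batch is ours):

def tokenize_batch(texts, batch_size=1000):
    # Yield one lowercased token list per input text.
    for doc in en_nlp.tokenizer.pipe(texts, batch_size=batch_size):
        yield [token.text.lower() for token in doc]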


In [4]:
# Inspect the first few cleaned examples
cleaned_lines[:5]


Out[4]:
[(['what',
   'is',
   'the',
   'step',
   'by',
   'step',
   'guide',
   'to',
   'invest',
   'in',
   'share',
   'market',
   'in',
   'india',
   '?'],
  ['what',
   'is',
   'the',
   'step',
   'by',
   'step',
   'guide',
   'to',
   'invest',
   'in',
   'share',
   'market',
   '?'],
  0),
 (['what',
   'is',
   'the',
   'story',
   'of',
   'kohinoor',
   '(',
   'koh',
   '-',
   'i',
   '-',
   'noor',
   ')',
   'diamond',
   '?'],
  ['what',
   'would',
   'happen',
   'if',
   'the',
   'indian',
   'government',
   'stole',
   'the',
   'kohinoor',
   '(',
   'koh',
   '-',
   'i',
   '-',
   'noor',
   ')',
   'diamond',
   'back',
   '?'],
  0),
 (['how',
   'can',
   'i',
   'increase',
   'the',
   'speed',
   'of',
   'my',
   'internet',
   'connection',
   'while',
   'using',
   'a',
   'vpn',
   '?'],
  ['how',
   'can',
   'internet',
   'speed',
   'be',
   'increased',
   'by',
   'hacking',
   'through',
   'dns',
   '?'],
  0),
 (['why',
   'am',
   'i',
   'mentally',
   'very',
   'lonely',
   '?',
   'how',
   'can',
   'i',
   'solve',
   'it',
   '?'],
  ['find',
   'the',
   'remainder',
   'when',
   '[',
   'math]23^{24}[/math',
   ']',
   'is',
   'divided',
   'by',
   '24,23',
   '?'],
  0),
 (['which',
   'one',
   'dissolve',
   'in',
   'water',
   'quikly',
   'sugar',
   ',',
   'salt',
   ',',
   'methane',
   'and',
   'carbon',
   'di',
   'oxide',
   '?'],
  ['which', 'fish', 'would', 'survive', 'in', 'salt', 'water', '?'],
  0)]
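
Before pickling, it is also worth sanity-checking the label balance. A minimal
sketch with collections.Counter (not part of the original notebook):

from collections import Counter
# Count how many question pairs are labeled duplicate (1) vs. not (0).
print(Counter(label for _, _, label in cleaned_lines))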

In [5]:
# Now pickle the cleaned, tokenized output.
# (This assumes the ./data/processed/ directory already exists.)
with open("./data/processed/01.processed_train.pkl", "wb") as pickle_file:
    pickle.dump(cleaned_lines, pickle_file)
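
If pickle size or speed matters, an explicit binary protocol helps; an optional
variant of the dump above (same path, same data):

with open("./data/processed/01.processed_train.pkl", "wb") as pickle_file:
    # HIGHEST_PROTOCOL yields a smaller, faster binary pickle than the default.
    pickle.dump(cleaned_lines, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)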

In [6]:
# As a sanity check, read the pickle back and compare it to cleaned_lines
with open("./data/processed/01.processed_train.pkl", "rb") as pickle_file:
    pickled_cleaned_lines = pickle.load(pickle_file)
assert pickled_cleaned_lines == cleaned_lines
print("Successfully read the pickle.")


Successfully read the pickle.