In [4]:
%matplotlib inline
import pandas as pd
import gensim
import bcolz
import keras
import sys
import re
import os
import csv
import codecs
import spacy
import numpy as np


from gensim.models import Word2Vec, KeyedVectors, word2vec
from matplotlib import pyplot as plt
from datetime import datetime
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from bcolz_array_iterator import BcolzArrayIterator
from string import punctuation
from keras import backend as K
from keras import optimizers
from keras.layers import InputSpec, Layer, Input, Dense, Lambda, Activation
from keras.layers import Dropout, Embedding, TimeDistributed, Reshape
from keras.layers import Bidirectional, GRU, LSTM, Conv1D, Conv2D
from keras.layers import merge, Merge
from keras.layers.merge import concatenate, dot
from keras.layers.noise import GaussianNoise
from keras.layers.advanced_activations import ELU
from keras.layers.normalization import BatchNormalization
from keras.layers.pooling import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.models import Sequential, Model, model_from_json
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint, BaseLogger, ProgbarLogger

np.random.seed(7)
np.set_printoptions(precision=3)

# !cd en_core_web_md-1.2.1 && python setup.py install
# !python -m spacy link en_core_web_md en_core_web_md

# !pip install gensim nltk keras pandas bcolz h5py spacy

In [5]:
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
bcolz_chunklen = 64
stem = 0
init_random = 1
stop_words = 0
w2v_provider = 'glove'
bcolz_prefix = '%s_%s_%s_%s_%s' % (MAX_SEQUENCE_LENGTH,
                                   bcolz_chunklen,
                                   stem,
                                   init_random,
                                   stop_words)
bcolz_prefix


Out[5]:
'30_64_0_1_0'

In [3]:
df = pd.read_csv('train.csv.zip')
df = df[df.question1.notnull() & df.question2.notnull()]
train_df = df.iloc[:360000]
val_df = df.iloc[360000:]
print(df.shape)
# keep df in memory: it is reused below to fill custom_w2v
test_df = pd.read_csv('test.csv.zip')
print(test_df.shape)


(404288, 6)
(2345796, 3)

In [6]:
%%time
if w2v_provider == 'glove':
    w2v = {}
    w2v_path = 'glove.840B.300d.txt'
    with open(w2v_path) as f_:
        for line in f_:
            # Split from the right: a handful of GloVe tokens contain
            # spaces, so the last EMBEDDING_DIM fields are the vector
            # and everything before them is the word.
            values = line.rstrip().split(' ')
            word = ' '.join(values[:-EMBEDDING_DIM])
            coefs = np.asarray(values[-EMBEDDING_DIM:], dtype='float32')
            w2v[word] = coefs
else:
    w2v_path = 'GoogleNews-vectors-negative300.bin.gz'
    w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)


CPU times: user 2min 48s, sys: 3.5 s, total: 2min 52s
Wall time: 2min 57s
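
Before relying on these vectors, it is worth a quick check of how much of
the raw question vocabulary they actually cover. A minimal sketch, not part
of the original run (the sample size is arbitrary):

tokens = [w for q in train_df.question1.values[:1000] for w in str(q).split()]
print('%.1f%% of raw tokens have a pretrained vector'
      % (100.0 * sum(w in w2v for w in tokens) / len(tokens)))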

In [10]:
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    if pd.isnull(text):
        text = ''

    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]

    text = " ".join(text)

    # Clean the text
#     text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"\[math\].*\[\/math\]", 'math', text)
    text = re.sub(r"\[code\].*\[\/code\]", 'code', text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\(", "", text)
    text = re.sub(r"\)", "", text)
    text = re.sub(r":", "", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return text.split()
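
For reference, a quick illustration of what the cleaning does (a sketch with
a made-up sentence; note that '?' is not stripped by any of the rules above):

text_to_wordlist("What's the step by step guide to invest in share market?")
# -> ['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to',
#     'invest', 'in', 'share', 'market?']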

In [11]:
def word_to_vec(word, w2v, custom_w2v, init_random=False):
    # Pretrained vectors first, then the cache of out-of-vocabulary
    # vectors; unseen words get a new vector that is cached so that
    # repeated lookups stay consistent.
    if word in w2v:
        return w2v[word]
    elif word in custom_w2v:
        return custom_w2v[word]
    else:
        if init_random:
            custom_w2v[word] = np.random.normal(scale=0.5, size=EMBEDDING_DIM)
        else:
            custom_w2v[word] = np.zeros(EMBEDDING_DIM)
        return custom_w2v[word]
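
The cache makes out-of-vocabulary vectors stable across lookups, which
matters because every pass over the questions below must see the same vector
for the same unknown word. A minimal check with a made-up token and a local
cache (illustrative only):

oov_cache = {}
v1 = word_to_vec('xqzzyw', w2v, oov_cache, init_random=True)  # hypothetical OOV token
v2 = word_to_vec('xqzzyw', w2v, oov_cache, init_random=True)
assert (v1 == v2).all()  # the second lookup hits the cache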

In [12]:
custom_w2v = {}

def save_bcolz_array(data_iter, rootdir):
    # Create the on-disk carray from the first element (wrapped in a
    # list so it becomes the first row), then append the rest.
    carr = bcolz.carray([next(data_iter)], chunklen=bcolz_chunklen,
                        rootdir=rootdir, mode='w')
    for chunk in data_iter:
        carr.append(chunk)
    carr.flush()


def emb_iter(data):
    # Yield one (MAX_SEQUENCE_LENGTH, EMBEDDING_DIM) matrix per question;
    # rows beyond the question length stay zero (padding).
    for qid, question in enumerate(data):
        emb = np.zeros([MAX_SEQUENCE_LENGTH, EMBEDDING_DIM])
        for idx, word in enumerate(text_to_wordlist(question,
                                                    remove_stopwords=stop_words,
                                                    stem_words=stem)):
            if idx < MAX_SEQUENCE_LENGTH:
                emb[idx, :] = word_to_vec(word, w2v, custom_w2v,
                                          init_random=init_random)
        yield emb
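
Before the long serialization runs below, a one-row smoke test of the
pipeline (illustrative):

emb = next(emb_iter(train_df.question1.values[:1]))
print(emb.shape)                # (30, 300) == (MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
print((emb != 0).any(1).sum())  # number of non-padding rows, i.e. words kept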

In [12]:
%%time

# One full pass over every question (train, validation and test) fills
# custom_w2v, so all splits share the same out-of-vocabulary vectors.
for column in (df.question1, df.question2,
               test_df.question1, test_df.question2):
    for emb in emb_iter(column.values):
        pass


CPU times: user 7min 26s, sys: 313 ms, total: 7min 26s
Wall time: 7min 26s

In [13]:
%%time
!date

# Train
!mkdir -p bclz/train
save_bcolz_array(emb_iter(train_df.question1.values), 'bclz/train/q1_%s' % bcolz_prefix)
save_bcolz_array(emb_iter(train_df.question2.values), 'bclz/train/q2_%s' % bcolz_prefix)
save_bcolz_array(iter(train_df.is_duplicate.values.astype(float)), 'bclz/train/y_%s' % bcolz_prefix)

# Validation
!mkdir -p bclz/val
save_bcolz_array(emb_iter(val_df.question1.values), 'bclz/val/q1_%s' % bcolz_prefix)
save_bcolz_array(emb_iter(val_df.question2.values), 'bclz/val/q2_%s' % bcolz_prefix)
save_bcolz_array(iter(val_df.is_duplicate.values.astype(float)), 'bclz/val/y_%s' % bcolz_prefix)

# Test 
!mkdir -p bclz/test
save_bcolz_array(emb_iter(test_df.question1.values), 'bclz/test/q1_%s' % bcolz_prefix)
save_bcolz_array(emb_iter(test_df.question2.values), 'bclz/test/q2_%s' % bcolz_prefix)


Tue May 30 04:39:33 MSK 2017
CPU times: user 13min 58s, sys: 37.4 s, total: 14min 35s
Wall time: 24min 21s
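
With the arrays on disk, training data can be streamed in chunk-aligned
batches instead of being loaded wholesale. A minimal sketch, assuming the
BcolzArrayIterator imported above follows the usual fastai-style interface
(batch_size must be a multiple of the carray chunklen):

X1_train = bcolz.open('bclz/train/q1_%s' % bcolz_prefix)
y_train = bcolz.open('bclz/train/y_%s' % bcolz_prefix)
print(X1_train.shape, y_train.shape)  # e.g. (360000, 30, 300) (360000,)

batches = BcolzArrayIterator(X1_train, y_train,
                             batch_size=bcolz_chunklen, shuffle=True)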

In [14]:
nlp = spacy.load('en_core_web_md')

def text_to_pos(text):
    return [w.pos_ for w in nlp(text)]

def seq_to_pos(sequence):
    # Re-join the cleaned word list and map each token to its coarse
    # part-of-speech tag.
    text = ' '.join(sequence)
    return ' '.join([w.pos_ for w in nlp(text)])

def pos_iter(data, tokenizer):
    for qid, question in enumerate(data):
        q_pos = seq_to_pos(text_to_wordlist(question))
        seq = tokenizer.texts_to_sequences([q_pos])
        seq = pad_sequences(seq, MAX_SEQUENCE_LENGTH, padding='post')
        # Note: the +1 shifts every index, including the padding zeros,
        # so the stored sequences never contain 0.
        yield np.array(seq[0]) + 1
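
To make the representation concrete, here is the tag mapping on a toy
question (illustrative; exact tags depend on the spaCy model version):

seq_to_pos(text_to_wordlist('How do I learn python?'))
# -> something like 'ADV VERB PRON VERB NOUN PUNCT'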

In [16]:
# POS tags form a small closed set, so fitting the tokenizer on a
# 10k-question sample is enough to see every tag.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([seq_to_pos(text_to_wordlist(t))
                        for t in train_df.question1.values[:10000]])
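
Because the "vocabulary" here is just the tagset, the fitted index is tiny;
inspecting it is a cheap sanity check (Tokenizer lower-cases by default, so
the keys come out as 'adj', 'adv', and so on):

print(len(tokenizer.word_index))     # on the order of 15-20 tags
print(sorted(tokenizer.word_index))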

In [29]:
%%time
!date

# Train
save_bcolz_array(pos_iter(train_df.question1.values, tokenizer), 'bclz/train/pos1_%s' % bcolz_prefix)
save_bcolz_array(pos_iter(train_df.question2.values, tokenizer), 'bclz/train/pos2_%s' % bcolz_prefix)

# Validation
save_bcolz_array(pos_iter(val_df.question1.values, tokenizer), 'bclz/val/pos1_%s' % bcolz_prefix)
save_bcolz_array(pos_iter(val_df.question2.values, tokenizer), 'bclz/val/pos2_%s' % bcolz_prefix)

# Test
save_bcolz_array(pos_iter(test_df.question1.values, tokenizer), 'bclz/test/pos1_%s' % bcolz_prefix)
save_bcolz_array(pos_iter(test_df.question2.values, tokenizer), 'bclz/test/pos2_%s' % bcolz_prefix)


CPU times: user 1h 30min 38s, sys: 20.5 s, total: 1h 30min 59s
Wall time: 1h 43min 3s

In [30]:
# Train
X1_train_pos = bcolz.open('bclz/train/pos1_%s' % bcolz_prefix)
X2_train_pos = bcolz.open('bclz/train/pos2_%s' % bcolz_prefix)

# Validation
X1_val_pos = bcolz.open('bclz/val/pos1_%s' % bcolz_prefix)
X2_val_pos = bcolz.open('bclz/val/pos2_%s' % bcolz_prefix)

# Test
X1_test_pos = bcolz.open('bclz/test/pos1_%s' % bcolz_prefix)
X2_test_pos = bcolz.open('bclz/test/pos2_%s' % bcolz_prefix)
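
Finally, a quick shape check that the reopened arrays line up with the
source dataframes (a sketch; the expected row counts follow from the split
above):

assert X1_train_pos.shape[0] == len(train_df)  # 360000 rows
assert X1_val_pos.shape[0] == len(val_df)
assert X1_test_pos.shape[0] == len(test_df)
print(X1_train_pos.shape)                      # (360000, 30)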