In [4]:
%matplotlib inline
import pandas as pd
import gensim
import bcolz
import keras
import sys
import re
import os
import csv
import codecs
import spacy
import numpy as np
from gensim.models import Word2Vec, KeyedVectors, word2vec
from matplotlib import pyplot as plt
from datetime import datetime
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from bcolz_array_iterator import BcolzArrayIterator
from string import punctuation
from keras.layers import InputSpec, Layer, Input, Dense, merge
from keras.layers import Lambda, Activation, Dropout, Embedding, TimeDistributed
from keras.layers import Bidirectional, GRU, LSTM
from keras.layers.noise import GaussianNoise
from keras.layers.advanced_activations import ELU
from keras.models import Sequential, Model, model_from_json
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.layers.normalization import BatchNormalization
from keras.layers.pooling import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import Merge
from keras.preprocessing.text import Tokenizer
from keras import optimizers
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Reshape, Conv1D, Conv2D
from keras.layers.merge import concatenate, dot
from keras.callbacks import EarlyStopping, ModelCheckpoint, BaseLogger, ProgbarLogger
np.random.seed(7)
np.set_printoptions(precision=3)
# !cd en_core_web_md-1.2.1 && python setup.py install
# !python -m spacy link en_core_web_md en_core_web_md
# !pip install gensim nltk keras pandas bcolz h5py spacy
In [5]:
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
bcolz_chunklen = 64
stem = 0
init_random = 1
stop_words = 0
w2v_provider = 'glove'
bcolz_prefix = '%s_%s_%s_%s_%s' % (MAX_SEQUENCE_LENGTH,
                                   bcolz_chunklen,
                                   stem,
                                   init_random,
                                   stop_words)
bcolz_prefix
Out[5]:
'30_64_0_1_0'
In [3]:
df = pd.read_csv('train.csv.zip')
df = df[df.question1.notnull() & df.question2.notnull()]
train_df = df.iloc[:360000]
val_df = df.iloc[360000:]
print(df.shape)
df = None
test_df = pd.read_csv('test.csv.zip')
print(test_df.shape)
In [6]:
%%time
if w2v_provider == 'glove':
    w2v = {}
    w2v_path = 'glove.840B.300d.txt'
    with open(w2v_path) as f_:
        for line in f_:
            try:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                w2v[word] = coefs
            except Exception as e:
                # Skip malformed lines (a few GloVe tokens contain spaces).
                continue
else:
    w2v_path = 'GoogleNews-vectors-negative300.bin.gz'
    w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)
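A quick sanity check, added here (not part of the original notebook); 'the' is assumed to be present in both the GloVe and the GoogleNews vocabularies, so the lookup should return a 300-dimensional vector regardless of w2v_provider:
In [ ]:
print('the' in w2v)        # expected: True
print(w2v['the'].shape)    # expected: (300,)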
In [10]:
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    if not isinstance(text, str):
        # Missing questions come through as NaN floats.
        text = ''
    # Convert words to lower case and split them
    text = text.lower().split()
    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if w not in stops]
    text = " ".join(text)
    # Clean the text
    # text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"\[math\].*\[\/math\]", 'math', text)
    text = re.sub(r"\[code\].*\[\/code\]", 'code', text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\(", "", text)
    text = re.sub(r"\)", "", text)
    text = re.sub(r":", "", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    # Return a list of words
    return text.split()
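To make the cleaning step concrete, a small example added here (the sample question is made up, not taken from the dataset) showing what text_to_wordlist returns with and without stemming:
In [ ]:
sample = "What's the best way to learn word embeddings?"
print(text_to_wordlist(sample))
print(text_to_wordlist(sample, stem_words=True))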
In [11]:
def word_to_vec(word, w2v, custom_w2v, init_random=False):
    if word in w2v:
        return w2v[word]
    elif word in custom_w2v:
        return custom_w2v[word]
    else:
        # Out-of-vocabulary word: cache a random (or zero) vector so the same
        # token always maps to the same embedding on later lookups.
        if init_random:
            custom_w2v[word] = np.random.normal(scale=.5, size=[1, 300])
        else:
            custom_w2v[word] = np.zeros([1, 300])
        return custom_w2v[word]
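A brief illustration of the OOV path (hypothetical tokens, added for clarity): a word found in the pretrained vectors is returned as-is, while an unseen token gets a random vector that is cached in the custom dictionary so repeated lookups stay consistent:
In [ ]:
tmp_custom = {}
known = word_to_vec('the', w2v, tmp_custom, init_random=True)
unseen = word_to_vec('xqzzyword', w2v, tmp_custom, init_random=True)
print(known.shape, unseen.shape)    # (300,) and (1, 300)
print('xqzzyword' in tmp_custom)    # True: the random vector is reused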
In [12]:
custom_w2v = {}

def save_bcolz_array(data_iter, rootdir):
    # Seed the on-disk array with the first item, then append the rest.
    carr = bcolz.carray([next(data_iter)], chunklen=bcolz_chunklen,
                        rootdir=rootdir, mode='w')
    for chunk in data_iter:
        carr.append(chunk)
    carr.flush()

def emb_iter(data):
    # Yield one (MAX_SEQUENCE_LENGTH, EMBEDDING_DIM) matrix per question.
    for qid, question in enumerate(data):
        emb = np.zeros([MAX_SEQUENCE_LENGTH, EMBEDDING_DIM])
        for idx, word in enumerate(text_to_wordlist(question,
                                                    remove_stopwords=stop_words,
                                                    stem_words=stem)):
            if idx < MAX_SEQUENCE_LENGTH:
                emb[idx, :] = word_to_vec(word, w2v, custom_w2v, init_random=init_random)
        yield emb
In [12]:
%%time
# Fill custom_w2v with vectors for every out-of-vocabulary word. df was
# released above, so iterate the train/val splits (which together cover all
# of df) plus the test set.
for emb in emb_iter(train_df.question1.values):
    continue
for emb in emb_iter(train_df.question2.values):
    continue
for emb in emb_iter(val_df.question1.values):
    continue
for emb in emb_iter(val_df.question2.values):
    continue
for emb in emb_iter(test_df.question1.values):
    continue
for emb in emb_iter(test_df.question2.values):
    continue
In [13]:
%%time
!date
# Train
!mkdir -p bclz/train
save_bcolz_array(emb_iter(train_df.question1.values), 'bclz/train/q1_%s' % bcolz_prefix)
save_bcolz_array(emb_iter(train_df.question2.values), 'bclz/train/q2_%s' % bcolz_prefix)
save_bcolz_array(iter(train_df.is_duplicate.values.astype(float)), 'bclz/train/y_%s' % bcolz_chunklen)
# Validation
!mkdir -p bclz/val
save_bcolz_array(emb_iter(val_df.question1.values), 'bclz/val/q1_%s' % bcolz_prefix)
save_bcolz_array(emb_iter(val_df.question2.values), 'bclz/val/q2_%s' % bcolz_prefix)
save_bcolz_array(iter(val_df.is_duplicate.values.astype(float)), 'bclz/val/y_%s' % bcolz_chunklen)
# Test
!mkdir -p bclz/test
save_bcolz_array(emb_iter(test_df.question1.values), 'bclz/test/q1_%s' % bcolz_prefix)
save_bcolz_array(emb_iter(test_df.question2.values), 'bclz/test/q2_%s' % bcolz_prefix)
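The saved arrays can be read back lazily and sliced straight from disk; a minimal sketch (paths and batch size taken from the config above, added here as an illustration):
In [ ]:
X1_train = bcolz.open('bclz/train/q1_%s' % bcolz_prefix)
y_train = bcolz.open('bclz/train/y_%s' % bcolz_chunklen)
batch_x = X1_train[:bcolz_chunklen]    # one chunk-aligned batch: (64, 30, 300)
batch_y = y_train[:bcolz_chunklen]     # matching labels: (64,)
print(batch_x.shape, batch_y.shape)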
In [14]:
nlp = spacy.load('en_core_web_md')

def text_to_pos(text):
    return [w.pos_ for w in nlp(text)]

def seq_to_pos(sequence):
    text = ' '.join(sequence)
    return ' '.join([w.pos_ for w in nlp(text)])

def pos_iter(data, tokenizer):
    # Yield one padded sequence of POS-tag indices per question.
    for qid, question in enumerate(data):
        q_pos = seq_to_pos(text_to_wordlist(question))
        seq = tokenizer.texts_to_sequences([q_pos])
        seq = pad_sequences(seq, MAX_SEQUENCE_LENGTH, padding='post')
        # Shift all indices by 1 (padding positions become 1).
        yield np.array(seq[0]) + 1
In [16]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([seq_to_pos(text_to_wordlist(t))
                        for t in train_df.question1.values[:10000]])
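Because the tokenizer is fit on POS strings rather than raw words, its vocabulary should be tiny (spaCy's coarse tagset is on the order of 15-20 tags); a quick check, added here:
In [ ]:
print(len(tokenizer.word_index))
print(sorted(tokenizer.word_index))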
In [29]:
%%time
!date
# Train
save_bcolz_array(pos_iter(train_df.question1.values, tokenizer), 'bclz/train/pos1_%s' % bcolz_prefix)
save_bcolz_array(pos_iter(train_df.question2.values, tokenizer), 'bclz/train/pos2_%s' % bcolz_prefix)
# Validation
save_bcolz_array(pos_iter(val_df.question1.values, tokenizer), 'bclz/val/pos1_%s' % bcolz_prefix)
save_bcolz_array(pos_iter(val_df.question2.values, tokenizer), 'bclz/val/pos2_%s' % bcolz_prefix)
# Test
save_bcolz_array(pos_iter(test_df.question1.values, tokenizer), 'bclz/test/pos1_%s' % bcolz_prefix)
save_bcolz_array(pos_iter(test_df.question2.values, tokenizer), 'bclz/test/pos2_%s' % bcolz_prefix)
In [30]:
# Train
X1_train_pos = bcolz.open('bclz/train/pos1_%s' % bcolz_prefix)
X2_train_pos = bcolz.open('bclz/train/pos2_%s' % bcolz_prefix)
# Validation
X1_val_pos = bcolz.open('bclz/val/pos1_%s' % bcolz_prefix)
X2_val_pos = bcolz.open('bclz/val/pos2_%s' % bcolz_prefix)
# Test
X1_test_pos = bcolz.open('bclz/test/pos1_%s' % bcolz_prefix)
X2_test_pos = bcolz.open('bclz/test/pos2_%s' % bcolz_prefix)
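A final shape check (added, not in the original) to confirm the POS arrays line up with the train/validation/test splits:
In [ ]:
print(X1_train_pos.shape, X2_train_pos.shape)
print(X1_val_pos.shape, X2_val_pos.shape)
print(X1_test_pos.shape, X2_test_pos.shape)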