In [ ]:
# fit word2vec on full/test questions
# fit tokenizer on full/test questions

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import nltk
import sklearn as sk
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Sequential
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
from scipy.sparse import csr_matrix  # used by load_sparse_csr below
from nltk.corpus import stopwords
import gensim, logging
import json

import os.path

MAX_NUM_WORDS = 125


Using Theano backend.
WARNING (theano.sandbox.cuda): The cuda backend is deprecated and will be removed in the next release (v0.10).  Please switch to the gpuarray backend. You can get more information about how to switch at this URL:
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 760 (CNMeM is disabled, cuDNN not available)

In [3]:
def submit(y_pred, test, filename):
    """Write a Kaggle submission file from predicted probabilities."""
    sub = pd.DataFrame()
    sub['test_id'] = test['test_id']
    sub['is_duplicate'] = y_pred
    sub.to_csv(filename, index=False)

def save_sparse_csr(filename,array):
    np.savez(filename,data = array.data ,indices=array.indices,
             indptr =array.indptr, shape=array.shape )

def load_sparse_csr(filename):
    loader = np.load(filename)
    return csr_matrix((  loader['data'], loader['indices'], loader['indptr']),
                         shape = loader['shape'])

def correct_dataset(dataset):
    # identical question strings are duplicates by definition
    dataset.loc[(dataset['question1'] == dataset['question2']), 'is_duplicate'] = 1
    return dataset

def process_dataset(dataset, apply_correction=False):
    dataset['question1'].fillna(' ', inplace=True)
    dataset['question2'].fillna(' ', inplace=True)
    
    # delete punctuation
    dataset['question1'] = dataset['question1'].str.replace(r'[^\w\s]', '')
    dataset['question2'] = dataset['question2'].str.replace(r'[^\w\s]', '')

    # lowercase questions
    dataset['question1'] = dataset['question1'].str.lower()
    dataset['question2'] = dataset['question2'].str.lower()

    # concatenate both questions into a single string
    dataset['union'] = dataset['question1'].str.cat(dataset['question2'], sep=' ')

    if apply_correction:
        return correct_dataset(dataset)
    else:
        return dataset

def split_and_rem_stop_words(line):
    # note: the stop-word list is rebuilt on every call; hoist it out if this becomes a bottleneck
    cached_stop_words = set(stopwords.words("english"))
    return [word for word in line.split() if word not in cached_stop_words]

def create_word_to_vec(sentences, embedding_path, verbose=0, save=1, **params_for_w2v):
    if verbose:
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    model = gensim.models.Word2Vec(sentences, **params_for_w2v)
    
    if save:
        model.save(embedding_path)
    
    return model
    

def create_embeddings(sentences, embeddings_path='embeddings/embedding.npz',
                      verbose=0, **params):
    """
    Generate embeddings from a batch of text
    :param embeddings_path: where to save the embeddings
    :param vocab_path: where to save the word-index map
    """

    if verbose:
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    model = gensim.models.Word2Vec(sentences, **params)
    weights = model.wv.syn0
    np.save(open(embeddings_path, 'wb'), weights)


def load_vocab(vocab_path):
    """
    Load word -> index and index -> word mappings
    :param vocab_path: where the word-index map is saved
    :return: word2idx, idx2word
    """

    with open(vocab_path, 'r') as f:
        data = json.loads(f.read())
    word2idx = data
    idx2word = dict([(v, k) for k, v in data.items()])
    return word2idx, idx2word


def get_word2vec_embedding_layer(embeddings_path):
    """
    Generate an embedding layer from saved word2vec embeddings
    :param embeddings_path: where the embeddings are saved (as a numpy file)
    :return: the generated embedding layer
    """

    weights = np.load(open(embeddings_path, 'rb'))
    layer = Embedding(input_dim=weights.shape[0], output_dim=weights.shape[1], weights=[weights],
                     trainable=False)
    return layer

In [4]:
# Load train

if os.path.isfile('dataframes/train.h5'):
    train = pd.read_pickle('dataframes/train.h5')
else:
    train = pd.read_csv('../datasets/train.csv')
    train = process_dataset(train)
    train['union_splitted'] = train['union'].apply(lambda sentence: split_and_rem_stop_words(sentence))
    train.to_pickle('dataframes/train.h5')
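
In [ ]:
# Hedged sketch, not run as part of the pipeline above: how the embedding helpers
# from In [3] fit together. All paths and the vocab JSON dump are assumptions;
# the notebook itself never writes a word -> index file for load_vocab().
w2v = create_word_to_vec(train['union_splitted'], 'embeddings/w2v.model',
                         save=1, size=100, min_count=5)

# save the weight matrix the same way create_embeddings() does
np.save(open('embeddings/embedding.npz', 'wb'), w2v.wv.syn0)

# save the word -> index map so load_vocab() can rebuild it later
word2idx = dict((word, vocab.index) for word, vocab in w2v.wv.vocab.items())
with open('embeddings/vocab.json', 'w') as f:
    f.write(json.dumps(word2idx))

# rebuild both pieces from disk
word2idx, idx2word = load_vocab('embeddings/vocab.json')
embedding_layer = get_word2vec_embedding_layer('embeddings/embedding.npz')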

In [5]:
# Load test

if all([os.path.isfile('dataframes/test_0.h5'), os.path.isfile('dataframes/test_1.h5'),
        os.path.isfile('dataframes/test_2.h5'), os.path.isfile('dataframes/test_3.h5')]):
    
    # note: the pre-split pickle chunks below are currently bypassed; the raw CSV
    # is re-read and processed from scratch instead
    test = pd.read_csv('../datasets/test.csv')
    test = process_dataset(test)
    
#     test_0 = pd.read_pickle('dataframes/test_0.h5')
#     test_1 = pd.read_pickle('dataframes/test_1.h5')
#     test_2 = pd.read_pickle('dataframes/test_2.h5')
#     test_3 = pd.read_pickle('dataframes/test_3.h5')

#     test_0.columns = ['union_splitted']
#     test_1.columns = ['union_splitted']
#     test_2.columns = ['union_splitted']
#     test_3.columns = ['union_splitted']

#     test_full_splitted = test_0.append(
#                          test_1.append(
#                          test_2.append(
#                          test_3)))

#     test['union_splitted'] = test_full_splitted['union_splitted'].values
else:
    print 'Not enough files for test'

In [ ]:
#Tokenize test

tokenizer = Tokenizer(nb_words=MAX_NUM_WORDS, split=' ')
tokenizer.fit_on_texts(train['union'])
sequences = tokenizer.texts_to_sequences(test['union'])

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X_test = pad_sequences(sequences, maxlen=MAX_NUM_WORDS)

print('Shape of data tensor:', X_test.shape)


Found 108180 unique tokens.
('Shape of data tensor:', (2345796, 125))

In [9]:
#Load model

model = load_model('keras_models/my_model_6_epochs.h5')


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-9-4caf434906fa> in <module>()
      5 #predict
      6 
----> 7 y_preds = model.predict(X_test, batch_size=128)

/home/loopdigga/Documents/ml/ml_env/local/lib/python2.7/site-packages/keras/models.pyc in predict(self, x, batch_size, verbose)
    722         if self.model is None:
    723             self.build()
--> 724         return self.model.predict(x, batch_size=batch_size, verbose=verbose)
    725 
    726     def predict_on_batch(self, x):

/home/loopdigga/Documents/ml/ml_env/local/lib/python2.7/site-packages/keras/engine/training.pyc in predict(self, x, batch_size, verbose)
   1270         f = self.predict_function
   1271         return self._predict_loop(f, ins,
-> 1272                                   batch_size=batch_size, verbose=verbose)
   1273 
   1274     def train_on_batch(self, x, y,

/home/loopdigga/Documents/ml/ml_env/local/lib/python2.7/site-packages/keras/engine/training.pyc in _predict_loop(self, f, ins, batch_size, verbose)
    943                 ins_batch = slice_X(ins, batch_ids)
    944 
--> 945             batch_outs = f(ins_batch)
    946             if not isinstance(batch_outs, list):
    947                 batch_outs = [batch_outs]

/home/loopdigga/Documents/ml/ml_env/local/lib/python2.7/site-packages/keras/backend/theano_backend.pyc in __call__(self, inputs)
    957     def __call__(self, inputs):
    958         assert isinstance(inputs, (list, tuple))
--> 959         return self.function(*inputs)
    960 
    961 

/home/loopdigga/Documents/ml/ml_env/local/lib/python2.7/site-packages/theano/compile/function_module.pyc in __call__(self, *args, **kwargs)
    882         try:
    883             outputs =\
--> 884                 self.fn() if output_subset is None else\
    885                 self.fn(output_subset=output_subset)
    886         except Exception:

KeyboardInterrupt: 

In [ ]:
#predict

y_preds = model.predict(X_test, batch_size=128, verbose=1)
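
In [ ]:
# Hedged sketch: write the Kaggle submission from the predictions above with the
# submit() helper; the output filename and folder are assumptions.
submit(y_preds.ravel(), test, 'submissions/cnn_word2vec.csv')  # ravel() flattens the (n, 1) predictions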

In [10]:
max_num_words = train['union_splitted'].map(len).max()
len_x = len(train['union_splitted'])

In [11]:
#Tokenize train

tokenizer = Tokenizer(nb_words=MAX_NUM_WORDS, split=' ')
tokenizer.fit_on_texts(train['union'])
sequences = tokenizer.texts_to_sequences(train['union'])

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X_train = pad_sequences(sequences, maxlen=MAX_NUM_WORDS)
y_train = train.is_duplicate.tolist()

print('Shape of data tensor:', X_train.shape)


Found 108180 unique tokens.
('Shape of data tensor:', (404290, 125))

In [12]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.10)

In [16]:
if not os.path.isfile('../embeddings/embedding.npz'):
    create_embeddings(sentences=train['union_splitted'], embeddings_path='../embeddings/embedding.npz', verbose=1)

weights = np.load(open('../embeddings/embedding.npz', 'rb'))


2017-04-12 15:22:55,259 : INFO : collecting all words and their counts
2017-04-12 15:22:55,262 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-12 15:22:55,292 : INFO : PROGRESS: at sentence #10000, processed 110842 words, keeping 15889 word types
2017-04-12 15:22:55,322 : INFO : PROGRESS: at sentence #20000, processed 221497 words, keeping 23104 word types
2017-04-12 15:22:55,349 : INFO : PROGRESS: at sentence #30000, processed 332393 words, keeping 28724 word types
2017-04-12 15:22:55,376 : INFO : PROGRESS: at sentence #40000, processed 442629 words, keeping 33290 word types
2017-04-12 15:22:55,401 : INFO : PROGRESS: at sentence #50000, processed 553973 words, keeping 37495 word types
2017-04-12 15:22:55,427 : INFO : PROGRESS: at sentence #60000, processed 664660 words, keeping 41167 word types
2017-04-12 15:22:55,455 : INFO : PROGRESS: at sentence #70000, processed 775454 words, keeping 44525 word types
2017-04-12 15:22:55,482 : INFO : PROGRESS: at sentence #80000, processed 886679 words, keeping 47538 word types
2017-04-12 15:22:55,510 : INFO : PROGRESS: at sentence #90000, processed 997109 words, keeping 50378 word types
2017-04-12 15:22:55,537 : INFO : PROGRESS: at sentence #100000, processed 1107801 words, keeping 53304 word types
2017-04-12 15:22:55,563 : INFO : PROGRESS: at sentence #110000, processed 1217243 words, keeping 55932 word types
2017-04-12 15:22:55,590 : INFO : PROGRESS: at sentence #120000, processed 1327824 words, keeping 58361 word types
2017-04-12 15:22:55,614 : INFO : PROGRESS: at sentence #130000, processed 1439135 words, keeping 60859 word types
2017-04-12 15:22:55,639 : INFO : PROGRESS: at sentence #140000, processed 1550433 words, keeping 63217 word types
2017-04-12 15:22:55,664 : INFO : PROGRESS: at sentence #150000, processed 1661890 words, keeping 65626 word types
2017-04-12 15:22:55,691 : INFO : PROGRESS: at sentence #160000, processed 1773442 words, keeping 67889 word types
2017-04-12 15:22:55,719 : INFO : PROGRESS: at sentence #170000, processed 1884607 words, keeping 69904 word types
2017-04-12 15:22:55,747 : INFO : PROGRESS: at sentence #180000, processed 1996244 words, keeping 71998 word types
2017-04-12 15:22:55,772 : INFO : PROGRESS: at sentence #190000, processed 2107414 words, keeping 73910 word types
2017-04-12 15:22:55,803 : INFO : PROGRESS: at sentence #200000, processed 2217690 words, keeping 75809 word types
2017-04-12 15:22:55,831 : INFO : PROGRESS: at sentence #210000, processed 2328075 words, keeping 77682 word types
2017-04-12 15:22:55,856 : INFO : PROGRESS: at sentence #220000, processed 2438747 words, keeping 79489 word types
2017-04-12 15:22:55,884 : INFO : PROGRESS: at sentence #230000, processed 2549833 words, keeping 81295 word types
2017-04-12 15:22:55,907 : INFO : PROGRESS: at sentence #240000, processed 2660238 words, keeping 82994 word types
2017-04-12 15:22:55,932 : INFO : PROGRESS: at sentence #250000, processed 2771487 words, keeping 84820 word types
2017-04-12 15:22:55,961 : INFO : PROGRESS: at sentence #260000, processed 2883236 words, keeping 86574 word types
2017-04-12 15:22:55,990 : INFO : PROGRESS: at sentence #270000, processed 2995182 words, keeping 88277 word types
2017-04-12 15:22:56,016 : INFO : PROGRESS: at sentence #280000, processed 3106000 words, keeping 89921 word types
2017-04-12 15:22:56,045 : INFO : PROGRESS: at sentence #290000, processed 3216907 words, keeping 91541 word types
2017-04-12 15:22:56,069 : INFO : PROGRESS: at sentence #300000, processed 3327044 words, keeping 93172 word types
2017-04-12 15:22:56,098 : INFO : PROGRESS: at sentence #310000, processed 3437812 words, keeping 94709 word types
2017-04-12 15:22:56,128 : INFO : PROGRESS: at sentence #320000, processed 3548152 words, keeping 96171 word types
2017-04-12 15:22:56,155 : INFO : PROGRESS: at sentence #330000, processed 3659521 words, keeping 97707 word types
2017-04-12 15:22:56,184 : INFO : PROGRESS: at sentence #340000, processed 3769913 words, keeping 99101 word types
2017-04-12 15:22:56,213 : INFO : PROGRESS: at sentence #350000, processed 3880766 words, keeping 100536 word types
2017-04-12 15:22:56,243 : INFO : PROGRESS: at sentence #360000, processed 3991804 words, keeping 101954 word types
2017-04-12 15:22:56,274 : INFO : PROGRESS: at sentence #370000, processed 4102433 words, keeping 103335 word types
2017-04-12 15:22:56,306 : INFO : PROGRESS: at sentence #380000, processed 4213923 words, keeping 104808 word types
2017-04-12 15:22:56,337 : INFO : PROGRESS: at sentence #390000, processed 4326079 words, keeping 106244 word types
2017-04-12 15:22:56,366 : INFO : PROGRESS: at sentence #400000, processed 4438291 words, keeping 107566 word types
2017-04-12 15:22:56,380 : INFO : collected 108152 word types from a corpus of 4486323 raw words and 404290 sentences
2017-04-12 15:22:56,380 : INFO : Loading a fresh vocabulary
2017-04-12 15:22:56,495 : INFO : min_count=5 retains 31660 unique words (29% of original 108152, drops 76492)
2017-04-12 15:22:56,496 : INFO : min_count=5 leaves 4366911 word corpus (97% of original 4486323, drops 119412)
2017-04-12 15:22:56,565 : INFO : deleting the raw counts dictionary of 108152 items
2017-04-12 15:22:56,570 : INFO : sample=0.001 downsamples 23 most-common words
2017-04-12 15:22:56,571 : INFO : downsampling leaves estimated 4207849 word corpus (96.4% of prior 4366911)
2017-04-12 15:22:56,571 : INFO : estimated required memory for 31660 words and 100 dimensions: 41158000 bytes
2017-04-12 15:22:56,674 : INFO : resetting layer weights
2017-04-12 15:22:56,936 : INFO : training model with 3 workers on 31660 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2017-04-12 15:22:57,947 : INFO : PROGRESS: at 6.30% examples, 1313877 words/s, in_qsize 6, out_qsize 1
2017-04-12 15:22:58,950 : INFO : PROGRESS: at 13.11% examples, 1371247 words/s, in_qsize 5, out_qsize 0
2017-04-12 15:22:59,954 : INFO : PROGRESS: at 19.87% examples, 1386943 words/s, in_qsize 5, out_qsize 0
2017-04-12 15:23:00,955 : INFO : PROGRESS: at 26.48% examples, 1386573 words/s, in_qsize 5, out_qsize 0
2017-04-12 15:23:01,965 : INFO : PROGRESS: at 31.55% examples, 1320490 words/s, in_qsize 5, out_qsize 0
2017-04-12 15:23:02,970 : INFO : PROGRESS: at 37.79% examples, 1318146 words/s, in_qsize 5, out_qsize 0
2017-04-12 15:23:03,972 : INFO : PROGRESS: at 43.98% examples, 1315573 words/s, in_qsize 6, out_qsize 0
2017-04-12 15:23:04,974 : INFO : PROGRESS: at 51.69% examples, 1353389 words/s, in_qsize 5, out_qsize 0
2017-04-12 15:23:05,974 : INFO : PROGRESS: at 59.35% examples, 1381949 words/s, in_qsize 5, out_qsize 0
2017-04-12 15:23:06,976 : INFO : PROGRESS: at 67.06% examples, 1405561 words/s, in_qsize 5, out_qsize 0
2017-04-12 15:23:07,980 : INFO : PROGRESS: at 74.86% examples, 1426234 words/s, in_qsize 5, out_qsize 0
2017-04-12 15:23:08,985 : INFO : PROGRESS: at 82.65% examples, 1443462 words/s, in_qsize 5, out_qsize 0
2017-04-12 15:23:09,986 : INFO : PROGRESS: at 90.41% examples, 1457797 words/s, in_qsize 5, out_qsize 0
2017-04-12 15:23:10,987 : INFO : PROGRESS: at 98.12% examples, 1469311 words/s, in_qsize 5, out_qsize 0
2017-04-12 15:23:11,221 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-04-12 15:23:11,226 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-04-12 15:23:11,228 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-04-12 15:23:11,229 : INFO : training on 22431615 raw words (21039307 effective words) took 14.3s, 1472407 effective words/s

In [13]:
embedding_layer = Embedding(input_dim=weights.shape[0], output_dim=100, weights=[weights],
                            input_length=MAX_NUM_WORDS, trainable=False)  # input_length must match the pad_sequences maxlen
# # embedding_layer = get_word2vec_embedding_layer('embeddings/embedding.npz')

model = Sequential()
model.add(embedding_layer)

model.add(Conv1D(16, 2, activation='relu'))
# model.add(MaxPooling1D(5))

model.add(Conv1D(32, 3, activation='relu'))
model.add(MaxPooling1D(2))

model.add(Conv1D(64, 4, activation='relu'))
model.add(MaxPooling1D(5))

model.add(Flatten())

model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(X_train, y_train, batch_size=128, nb_epoch=1, 
          validation_data=(X_val, y_val))


---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-13-3c22c3a7e831> in <module>()
----> 1 weights = np.load(open('../embeddings/embedding.npz', 'rb'))
      2 
      3 embedding_layer = Embedding(input_dim=weights.shape[0], output_dim=100, weights=[weights], 
      4                             input_length=max_num_words, trainable=False)
      5 # # embedding_layer = get_word2vec_embedding_layer('embeddings/embedding.npz')

IOError: [Errno 2] No such file or directory: '../embeddings/embedding.npz'
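
In [ ]:
# Hedged sketch, an assumption rather than what the notebook does above: the Keras
# tokenizer's word indices and the rows of the raw word2vec weight matrix are
# numbered independently, so a common fix is to build an embedding matrix indexed
# by the tokenizer's word_index and pass that to the Embedding layer.
# The w2v.model path is assumed; see the word2vec helpers in In [3].
w2v = gensim.models.Word2Vec.load('embeddings/w2v.model')

embedding_dim = w2v.wv.syn0.shape[1]
num_words = len(word_index) + 1  # +1 because Keras tokenizer indices start at 1
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if word in w2v.wv.vocab:
        embedding_matrix[i] = w2v.wv[word]

embedding_layer = Embedding(input_dim=num_words, output_dim=embedding_dim,
                            weights=[embedding_matrix],
                            input_length=MAX_NUM_WORDS, trainable=False)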

In [ ]: