import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

/home/surya/DL/lib/python3.5/site-packages/sklearn/ DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
Using TensorFlow backend.

import spacy

# Load the spaCy pipeline registered under the 'en' shortcut; it is used
# further down for tokenisation and stop-word detection.
nlp = spacy.load('en')

# Training data, test data and the sample submission file.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sample = pd.read_csv('data/sample_submission.csv')

# Length, in whitespace-separated tokens, of the longest training text.
max_len = max(len(doc.split()) for doc in train.text.values)


def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.

    :param actual: Array containing the actual target classes, either as a
        1-D sequence of integer class indices or as an already one-hot
        encoded (n_samples, n_classes) matrix
    :param predicted: Matrix with class predictions, one probability per class
    :param eps: Probabilities are clipped to [eps, 1 - eps] before taking the
        log so that a predicted probability of exactly 0 or 1 stays finite
    :return: Negative log-likelihood of the true classes, averaged over rows
    """
    # Accept plain lists / pandas objects as well as numpy arrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    # Convert 'actual' to a binary array if it's not already:
    if actual.ndim == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        # Set column `actual[i]` of row i to 1 for every row in one step.
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    # Only the log-probability of the true class survives the multiplication.
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

# Map the class labels to integer ids with scikit-learn's LabelEncoder.
lbl_encoder = preprocessing.LabelEncoder()
# NOTE(review): this call is cut off in the export - its argument (the label
# column to encode) is missing and must be restored before the file can run.
y = lbl_encoder.fit_transform(

# NOTE(review): this call is cut off as well - everything after `y,` is missing.
# The shapes echoed further down, (9789,) and (9790,), suggest a roughly 50/50
# train/validation split - confirm the original arguments.
x_train,x_valid,y_train,y_valid = train_test_split(train.text.values,y,

def _content_tokens(raw_docs):
    """Tokenise every document with spaCy and keep the text of each token
    that is neither a stop word nor one of '.', ',' or ':'."""
    kept_per_doc = []
    for raw in raw_docs:
        kept = []
        for tok in nlp(raw):
            if tok.is_stop or tok.text in ['.', ',', ':']:
                continue
            kept.append(tok.text)
        kept_per_doc.append(kept)
    return kept_per_doc


# Filtered token lists, one list per document.
x_train_fltr = _content_tokens(x_train)
x_valid_fltr = _content_tokens(x_valid)

# Re-assemble each filtered document into a single space-separated string.
x_train_fltr_join = list(map(' '.join, x_train_fltr))
x_valid_fltr_join = list(map(' '.join, x_valid_fltr))

# Array versions of the filtered documents.
xtrain = np.array(x_train_fltr_join)
xvalid = np.array(x_valid_fltr_join)

((9789,), (9790,), (9789,), (9790,))

from spacy.vectors import Vectors
import spacy

# Reloads the spaCy pipeline (it is already loaded once earlier in the file).
nlp = spacy.load('en')

# Token texts of the raw (unfiltered) documents, one list per document.
# The lists are built from scratch here: the visible part of the file never
# initialises `xtrain_token` / `xvalid_token` before appending to them, and
# appending would also duplicate every entry if this section were run twice.
xtrain_token = [[token.text for token in nlp(raw)] for raw in x_train]
xvalid_token = [[token.text for token in nlp(raw)] for raw in x_valid]

loc = "data/"
import bcolz
import pickle
import sys   
import pdb
import os
from tensorflow.contrib.learn.python import preprocessing

# NOTE: `preprocessing` here is the tf.contrib module imported just above; it
# shadows the scikit-learn `preprocessing` module imported at the top of the
# file from this point on.
# Documents are mapped to fixed-length id sequences (max_document_length=500).
p = preprocessing.text.VocabularyProcessor(max_document_length=500)
# Learn the word -> id vocabulary from the training documents only.
xtrain_ids = np.array(list(p.fit_transform(x_train_fltr_join)))
# Validation documents are only mapped through the vocabulary fitted above;
# calling fit_transform again would re-run the fitting step on validation data.
xvalid_ids = np.array(list(p.transform(x_valid_fltr_join)))

((9789, 500), (9790, 500))

# In [43]:
# Largest id the model is allowed to see is vocab_size - 1.
vocab_size = 21000

# Clamp every id at or above vocab_size to the last valid id (vocab_size - 1);
# ids already in range are left unchanged.
xtrain_ids_v = np.minimum(xtrain_ids, vocab_size - 1)
xvalid_ids_v = np.minimum(xvalid_ids, vocab_size - 1)

# Sanity check, placed after the clamp so the array exists when it runs:
# number of training ids still out of range (expected to be 0).
xtrain_ids_v[xtrain_ids_v >= vocab_size].size

/home/surya/DL/lib/python3.5/site-packages/sklearn/utils/ DataConversionWarning: Data with input dtype int64 was converted to float64 by StandardScaler.
# A simple bidirectional LSTM with glove embeddings and two dense layers
# NOTE(review): only part of the layer stack survives in this export. No
# embedding/input layer is added before the recurrent layer, and the last
# layer added is a 1024-unit relu Dense even though the model is compiled with
# categorical_crossentropy. The training log that follows starts at a loss of
# about 1.09 (roughly ln 3), which suggests a 3-way softmax output existed in
# the original notebook - confirm before reusing this definition.
model2 = Sequential()
# Bidirectional wrapper around a 64-unit LSTM; dropout and recurrent_dropout
# are both set to 0.3.
model2.add(Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3)))

# First fully connected layer: 1024 units, relu activation.
model2.add(Dense(1024, activation='relu'))

# Second fully connected layer: 1024 units, relu activation.
model2.add(Dense(1024, activation='relu'))

# Categorical cross-entropy loss, Adam optimiser, accuracy as reported metric.
model2.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

# Fit the model with early stopping callback
# Monitors val_loss with patience=3 and min_delta=0.
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
# NOTE(review): in this export the EarlyStopping(...) assignment and the
# training call were fused into one invalid statement, and the head of the
# training call was lost. `model2.fit(xt, ...` is reconstructed from the
# surviving keyword arguments (y=yt, validation_data=(xv, yv)) - confirm the
# model and the training-input variable against the original notebook.
model2.fit(xt, y=yt, batch_size=512, epochs=10,
           verbose=1, validation_data=(xv, yv), callbacks=[earlystop])

Train on 9789 samples, validate on 9790 samples
Epoch 1/10
9789/9789 [==============================] - 164s 17ms/step - loss: 1.0879 - acc: 0.3988 - val_loss: 1.0806 - val_acc: 0.4040
Epoch 2/10
9789/9789 [==============================] - 161s 16ms/step - loss: 1.0662 - acc: 0.4270 - val_loss: 1.0658 - val_acc: 0.4581
Epoch 3/10
9789/9789 [==============================] - 162s 17ms/step - loss: 0.9704 - acc: 0.5429 - val_loss: 0.9752 - val_acc: 0.5461
Epoch 4/10
9789/9789 [==============================] - 175s 18ms/step - loss: 0.8510 - acc: 0.6094 - val_loss: 0.9600 - val_acc: 0.5727
Epoch 5/10
9789/9789 [==============================] - 225s 23ms/step - loss: 0.7768 - acc: 0.6471 - val_loss: 0.9077 - val_acc: 0.6064
Epoch 6/10
9789/9789 [==============================] - 179s 18ms/step - loss: 0.6984 - acc: 0.7098 - val_loss: 0.8735 - val_acc: 0.6413
Epoch 7/10
9789/9789 [==============================] - 160s 16ms/step - loss: 0.6250 - acc: 0.7596 - val_loss: 0.8218 - val_acc: 0.6726
Epoch 8/10
9789/9789 [==============================] - 160s 16ms/step - loss: 0.5088 - acc: 0.8189 - val_loss: 0.9391 - val_acc: 0.6681
Epoch 9/10
9789/9789 [==============================] - 160s 16ms/step - loss: 0.5222 - acc: 0.8143 - val_loss: 0.8012 - val_acc: 0.6789
Epoch 10/10
9789/9789 [==============================] - 160s 16ms/step - loss: 0.4640 - acc: 0.8293 - val_loss: 0.7907 - val_acc: 0.6904
<keras.callbacks.History at 0x7fbcb2575ac8>

          # NOTE(review): orphaned tail of a second training call - the start
          # of the statement is missing from this export. The log that follows
          # shows 3 epochs; restore the full call before running.
          verbose=1, validation_data=(xv, yv), callbacks=[earlystop])

Train on 9789 samples, validate on 9790 samples
Epoch 1/3
9789/9789 [==============================] - 160s 16ms/step - loss: 0.4421 - acc: 0.8329 - val_loss: 0.7880 - val_acc: 0.6962
Epoch 2/3
9789/9789 [==============================] - 156s 16ms/step - loss: 0.4174 - acc: 0.8403 - val_loss: 0.7993 - val_acc: 0.7044
Epoch 3/3
9789/9789 [==============================] - 160s 16ms/step - loss: 0.3863 - acc: 0.8519 - val_loss: 0.8664 - val_acc: 0.6799
<keras.callbacks.History at 0x7fbcb221bd30>