AV_Enigma_NLP_functional_api


ML


In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC

from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')

In [3]:
def preprocessing_text(s):
    import re
    s = re.sub(r"[^A-Za-z0-9^,\*+-=]", " ",s)
    s = re.sub(r"(\d+)(k)", r"\g<1>000", s) #expand 'k' to '000' eg. 50k to 50000
    s = re.sub(r"\;"," ",s)
    s = re.sub(r"\:"," ",s)
    s = re.sub(r"\,"," ",s)
    s = re.sub(r"\."," ",s)
    s = re.sub(r"\<"," ",s)
    s = re.sub(r"\^"," ",s)
    s = re.sub(r"(\d+)(/)", "\g<1> divide ", s) #change number/number to number divide number (eg. 2/3 to 2 divide 3)
    s = re.sub(r"\/"," ",s) #replace the rest of / with white space
    s = re.sub(r"\+", " plus ", s)
    s = re.sub(r"\-", " minus ", s)
    s = re.sub(r"\*", " multiply ", s)
    s = re.sub(r"\=", "equal", s)
    s = re.sub(r"What's", "What is ", s)
    s = re.sub(r"what's", "what is ", s)
    s = re.sub(r"Who's", "Who is ", s)
    s = re.sub(r"who's", "who is ", s)
    s = re.sub(r"\'s", " ", s)
    s = re.sub(r"\'ve", " have ", s)
    s = re.sub(r"can't", "cannot ", s)
    s = re.sub(r"n't", " not ", s)
    s = re.sub(r"\'re", " are ", s)
    s = re.sub(r"\'d", " would ", s)
    s = re.sub(r"\'ll", " will ", s)
    s = re.sub(r"'m", " am ", s)
    s = re.sub(r"or not", " ", s)
    s = re.sub(r"What should I do to", "How can I", s)
    s = re.sub(r"How do I", "How can I", s)
    s = re.sub(r"How can you make", "What can make", s)
    s = re.sub(r"How do we", "How do I", s)
    s = re.sub(r"How do you", "How do I", s)
    s = re.sub(r"Is it possible", "Can we", s)
    s = re.sub(r"Why is", "Why", s)
    s = re.sub(r"Which are", "What are", s)
    s = re.sub(r"What are the reasons", "Why", s)
    s = re.sub(r"What are some tips", "tips", s)
    s = re.sub(r"What is the best way", "best way", s)
    s = re.sub(r"e-mail", "email", s)
    s = re.sub(r"e - mail", "email", s)
    s = re.sub(r"US", "America", s)
    s = re.sub(r"USA", "America", s)
    s = re.sub(r"us", "America", s)
    s = re.sub(r"usa", "America", s)
    s = re.sub(r"Chinese", "China", s)
    s = re.sub(r"india", "India", s)
    s = re.sub(r"\s{2,}", " ", s) #remove extra white space
    s = s.strip()
    return s

def remove_stopwords(string):
    word_list = [word.lower() for word in string.split()]
    from nltk.corpus import stopwords
    stopwords_list = list(stopwords.words("english"))
    for word in word_list:
        if word in stopwords_list:
            word_list.remove(word)
    return ' '.join(word_list)

def get_char_length_ratio(row):
    return len(row['tweet'])/max(1,len(row['tweet_without_stopwords']))

def get_synonyms(word):
    from nltk.corpus import wordnet as wn
    synonyms = []
    if wn.synsets(word):
        for syn in wn.synsets(word):
            for l in syn.lemmas():
                synonyms.append(l.name())
    return list(set(synonyms))

def get_row_syn_set(row):
    import nltk
    syn_set = [nltk.word_tokenize(row)]
    for token in nltk.word_tokenize(row):
        if get_synonyms(token):
            syn_set.append(get_synonyms(token))
    return set([y for x in syn_set for y in x])

def get_Levenshtein(string1,string2):
    import editdistance
    return editdistance.eval(string1,string2)

def num_pos(sent):
    num_pos = 0
    word_list = [word.lower() for word in nltk.word_tokenize(sent)]
    for index, word in enumerate(word_list):
        if word in positive_words:
            if word_list[index-1] not in ['not','no']:
                num_pos += 1
    return num_pos

def num_neg(sent):
    num_neg = 0
    word_list = [word.lower() for word in nltk.word_tokenize(sent)]
    for index, word in enumerate(word_list):
        if word in negative_words:
            if word_list[index-1] not in ['not','no']:
                num_neg += 1
    return num_neg

p_url = 'http://ptrckprry.com/course/ssd/data/positive-words.txt'
n_url = 'http://ptrckprry.com/course/ssd/data/negative-words.txt'

import requests,nltk
positive_words = requests.get(p_url).content.decode('latin-1')
positive_words = nltk.word_tokenize(positive_words)
positive_words.remove('not')
negative_words = requests.get(n_url).content.decode('latin-1')
negative_words = nltk.word_tokenize(negative_words)
positive_words = positive_words[413:]
negative_words = negative_words[418:]

In [4]:
train['tweet'] = train['tweet'].map(lambda x: preprocessing_text(x))
test['tweet'] = test['tweet'].map(lambda x: preprocessing_text(x))

train['tweet'] = train['tweet'].astype(str)
train['tweet_without_stopwords'] = train['tweet'].apply(lambda x: remove_stopwords(x))
test['tweet'] = test['tweet'].astype(str)
test['tweet_without_stopwords'] = test['tweet'].apply(lambda x: remove_stopwords(x))

train['char_length_ratio'] = train.apply(lambda row: get_char_length_ratio(row), axis=1)
test['char_length_ratio'] = test.apply(lambda row: get_char_length_ratio(row), axis=1)

train['tweet_tokens_syn_set'] = train['tweet_without_stopwords'].map(lambda row: get_row_syn_set(row))
train['num_syn_words'] = train.apply(lambda x: len(x['tweet_tokens_syn_set'].intersection(set(nltk.word_tokenize(x['tweet'])))), axis=1)
test['tweet_tokens_syn_set'] = test['tweet_without_stopwords'].map(lambda row: get_row_syn_set(row))
test['num_syn_words'] = test.apply(lambda x:len(x['tweet_tokens_syn_set'].intersection(set(nltk.word_tokenize(x['tweet'])))), axis=1)

train['Lev_dist'] = train.apply(lambda row: get_Levenshtein(row['tweet'],row['tweet_without_stopwords']),axis = 1)
test['Lev_dist'] = test.apply(lambda row: get_Levenshtein(row['tweet'],row['tweet_without_stopwords']),axis = 1)

train['tweet_num_pos'] = train['tweet_without_stopwords'].apply(lambda x: num_pos(x))
train['tweet_num_neg'] = train['tweet_without_stopwords'].apply(lambda x: num_neg(x))

test['tweet_num_pos'] = test['tweet_without_stopwords'].apply(lambda x: num_pos(x))
test['tweet_num_neg'] = test['tweet_without_stopwords'].apply(lambda x: num_neg(x))

train['tweet_diff_num'] = (train['tweet_num_pos'] - train['tweet_num_neg']).abs()
test['tweet_diff_num'] = (test['tweet_num_pos'] - test['tweet_num_neg']).abs()

train.drop('tweet_tokens_syn_set', axis=1, inplace=True)
test.drop('tweet_tokens_syn_set', axis=1, inplace=True)

In [5]:
from sklearn.metrics import f1_score

In [6]:
y = train.label.values
xtrain, xvalid, ytrain, yvalid = train_test_split(train.tweet_without_stopwords.values, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

In [7]:
# Always start with these features. They work (almost) everytime!
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 4), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)
xtest_tfv = tfv.transform(test.tweet_without_stopwords.values)

In [8]:
# Apply SVD, I chose 120 components. 120-200 components are good enough for SVM model.
svd = decomposition.TruncatedSVD(n_components=200)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)
xtest_svd = svd.transform(xtest_tfv)

# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)
xtest_svd_scl = scl.transform(xtest_svd)

In [9]:
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 4), stop_words = 'english')

# Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv =  ctv.transform(xtrain) 
xvalid_ctv = ctv.transform(xvalid)
xtest_ctv = ctv.transform(test.tweet_without_stopwords.values)

In [18]:
# Fitting a simple SVM
clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict(xvalid_svd_scl)

print ("f1 score: %0.3f " % f1_score(yvalid, predictions))
predictions_test = clf.predict_proba(xtest_svd_scl)
predictions_test = np.where(predictions_test[:,1]>=0.405, 1,0)
sample['label'] = predictions_test
sample.to_csv('svc_preds.csv', index=False)


f1 score: 0.800 

In [14]:
# Fitting a simple xgboost on tf-idf
clf = xgb.XGBClassifier(max_depth=8, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, n_jobs=-1, learning_rate=0.1)
clf.fit(xtrain_tfv.tocsc(), ytrain)
predictions = clf.predict(xvalid_tfv.tocsc())

print ("f1 score: %0.3f " % f1_score(yvalid, predictions))

predictions_test = clf.predict_proba(xtest_tfv.tocsc())
predictions_test = np.where(predictions_test[:,1]>=0.41, 1,0)
sample['label'] = predictions_test
sample.to_csv('xgb_preds.csv', index=False)


f1 score: 0.777 

In [23]:
mll_scorer = metrics.make_scorer(f1_score, greater_is_better=True, needs_proba=False)

In [24]:
# this is the main ensembling class. how to use it is in the next cell!
####################################################################### This Sciprt below to Kaggle.com
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
import pandas as pd
import os
import sys
import logging

logging.basicConfig(
    level=logging.DEBUG,
    format="[%(asctime)s] %(levelname)s %(message)s",
    datefmt="%H:%M:%S", stream=sys.stdout)
logger = logging.getLogger(__name__)


class Ensembler(object):
    def __init__(self, model_dict, num_folds=3, task_type='classification', optimize=roc_auc_score,
                 lower_is_better=False, save_path=None):
        """
        Ensembler init function
        :param model_dict: model dictionary, see README for its format
        :param num_folds: the number of folds for ensembling
        :param task_type: classification or regression
        :param optimize: the function to optimize for, e.g. AUC, logloss, etc. Must have two arguments y_test and y_pred
        :param lower_is_better: is lower value of optimization function better or higher
        :param save_path: path to which model pickles will be dumped to along with generated predictions, or None
        """

        self.model_dict = model_dict
        self.levels = len(self.model_dict)
        self.num_folds = num_folds
        self.task_type = task_type
        self.optimize = optimize
        self.lower_is_better = lower_is_better
        self.save_path = save_path

        self.training_data = None
        self.test_data = None
        self.y = None
        self.lbl_enc = None
        self.y_enc = None
        self.train_prediction_dict = None
        self.test_prediction_dict = None
        self.num_classes = None

    def fit(self, training_data, y, lentrain):
        """
        :param training_data: training data in tabular format
        :param y: binary, multi-class or regression
        :return: chain of models to be used in prediction
        """

        self.training_data = training_data
        self.y = y

        if self.task_type == 'classification':
            self.num_classes = len(np.unique(self.y))
            logger.info("Found %d classes", self.num_classes)
            self.lbl_enc = LabelEncoder()
            self.y_enc = self.lbl_enc.fit_transform(self.y)
            kf = StratifiedKFold(n_splits=self.num_folds)
            train_prediction_shape = (lentrain, self.num_classes)
        else:
            self.num_classes = -1
            self.y_enc = self.y
            kf = KFold(n_splits=self.num_folds)
            train_prediction_shape = (lentrain, 1)

        self.train_prediction_dict = {}
        for level in range(self.levels):
            self.train_prediction_dict[level] = np.zeros((train_prediction_shape[0],
                                                          train_prediction_shape[1] * len(self.model_dict[level])))

        for level in range(self.levels):

            if level == 0:
                temp_train = self.training_data
            else:
                temp_train = self.train_prediction_dict[level - 1]

            for model_num, model in enumerate(self.model_dict[level]):
                validation_scores = []
                foldnum = 1
                for train_index, valid_index in kf.split(self.train_prediction_dict[0], self.y_enc):
                    logger.info("Training Level %d Fold # %d. Model # %d", level, foldnum, model_num)

                    if level != 0:
                        l_training_data = temp_train[train_index]
                        l_validation_data = temp_train[valid_index]
                        model.fit(l_training_data, self.y_enc[train_index])
                    else:
                        l0_training_data = temp_train[0][model_num]
                        if type(l0_training_data) == list:
                            l_training_data = [x[train_index] for x in l0_training_data]
                            l_validation_data = [x[valid_index] for x in l0_training_data]
                        else:
                            l_training_data = l0_training_data[train_index]
                            l_validation_data = l0_training_data[valid_index]
                        model.fit(l_training_data, self.y_enc[train_index])

                    logger.info("Predicting Level %d. Fold # %d. Model # %d", level, foldnum, model_num)

                    if self.task_type == 'classification':
                        temp_train_predictions = model.predict_proba(l_validation_data)
                        self.train_prediction_dict[level][valid_index,
                        (model_num * self.num_classes):(model_num * self.num_classes) +
                                                       self.num_classes] = temp_train_predictions

                    else:
                        temp_train_predictions = model.predict(l_validation_data)
                        self.train_prediction_dict[level][valid_index, model_num] = temp_train_predictions
                    validation_score = self.optimize(self.y_enc[valid_index], temp_train_predictions)
                    validation_scores.append(validation_score)
                    logger.info("Level %d. Fold # %d. Model # %d. Validation Score = %f", level, foldnum, model_num,
                                validation_score)
                    foldnum += 1
                avg_score = np.mean(validation_scores)
                std_score = np.std(validation_scores)
                logger.info("Level %d. Model # %d. Mean Score = %f. Std Dev = %f", level, model_num,
                            avg_score, std_score)

            logger.info("Saving predictions for level # %d", level)
            train_predictions_df = pd.DataFrame(self.train_prediction_dict[level])
            train_predictions_df.to_csv(os.path.join(self.save_path, "train_predictions_level_" + str(level) + ".csv"),
                                        index=False, header=None)

        return self.train_prediction_dict

    def predict(self, test_data, lentest):
        self.test_data = test_data
        if self.task_type == 'classification':
            test_prediction_shape = (lentest, self.num_classes)
        else:
            test_prediction_shape = (lentest, 1)

        self.test_prediction_dict = {}
        for level in range(self.levels):
            self.test_prediction_dict[level] = np.zeros((test_prediction_shape[0],
                                                         test_prediction_shape[1] * len(self.model_dict[level])))
        self.test_data = test_data
        for level in range(self.levels):
            if level == 0:
                temp_train = self.training_data
                temp_test = self.test_data
            else:
                temp_train = self.train_prediction_dict[level - 1]
                temp_test = self.test_prediction_dict[level - 1]

            for model_num, model in enumerate(self.model_dict[level]):

                logger.info("Training Fulldata Level %d. Model # %d", level, model_num)
                if level == 0:
                    model.fit(temp_train[0][model_num], self.y_enc)
                else:
                    model.fit(temp_train, self.y_enc)

                logger.info("Predicting Test Level %d. Model # %d", level, model_num)

                if self.task_type == 'classification':
                    if level == 0:
                        temp_test_predictions = model.predict_proba(temp_test[0][model_num])
                    else:
                        temp_test_predictions = model.predict_proba(temp_test)
                    self.test_prediction_dict[level][:, (model_num * self.num_classes): (model_num * self.num_classes) +
                                                                                        self.num_classes] = temp_test_predictions

                else:
                    if level == 0:
                        temp_test_predictions = model.predict(temp_test[0][model_num])
                    else:
                        temp_test_predictions = model.predict(temp_test)
                    self.test_prediction_dict[level][:, model_num] = temp_test_predictions

            test_predictions_df = pd.DataFrame(self.test_prediction_dict[level])
            test_predictions_df.to_csv(os.path.join(self.save_path, "test_predictions_level_" + str(level) + ".csv"),
                                       index=False, header=None)

        return self.test_prediction_dict

In [25]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [26]:
import string
eng_stopwords = set(stopwords.words("english"))

In [27]:
## Number of words in the text ##
train["num_words"] = train["tweet"].apply(lambda x: len(str(x).split()))
test["num_words"] = test["tweet"].apply(lambda x: len(str(x).split()))

## Number of unique words in the text ##
train["num_unique_words"] = train["tweet"].apply(lambda x: len(set(str(x).split())))
test["num_unique_words"] = test["tweet"].apply(lambda x: len(set(str(x).split())))

## Number of characters in the text ##
train["num_chars"] = train["tweet"].apply(lambda x: len(str(x)))
test["num_chars"] = test["tweet"].apply(lambda x: len(str(x)))

## Number of stopwords in the text ##
train["num_stopwords"] = train["tweet"].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))
test["num_stopwords"] = test["tweet"].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))

## Number of punctuations in the text ##
train["num_punctuations"] =train["tweet"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )
test["num_punctuations"] =test["tweet"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )

## Number of upper case words in the text ##
train["num_words_upper"] = train["tweet"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
test["num_words_upper"] = test["tweet"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))

## Number of title case words in the text ##
train["num_words_title"] = train["tweet"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
test["num_words_title"] = test["tweet"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))

## Average length of the words in the text ##
train["mean_word_len"] = train["tweet"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test["mean_word_len"] = test["tweet"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

In [28]:
cols = ['char_length_ratio',
       'num_syn_words', 'Lev_dist', 'tweet_num_pos', 'tweet_num_neg',
       'tweet_diff_num',"num_words", "num_unique_words", "num_chars", "num_stopwords", "num_punctuations", "num_words_upper", "num_words_title", "mean_word_len"]

train_X = train[cols]
test_X = test[cols]

In [99]:
# specify the data to be used for every level of ensembling:
train_data_dict = {0: [xtrain_tfv, xtrain_ctv, xtrain_tfv, xtrain_ctv, train_X.values,], 1: [xtrain_tfv]}
test_data_dict = {0: [xtest_tfv, xtest_ctv, xtest_tfv, xtest_ctv, test_X.values], 1: [xtest_tfv]}

model_dict = {0: [SVC(C=1.5,probability=True),LogisticRegression(C=5),xgb.XGBClassifier(silent=True, n_estimators=120, max_depth=7)\
                  ,MultinomialNB(alpha=0.1)],

              1: [SVC(C=1,probability=True),SVC(C=5,probability=True),\
                  SVC(C=2,probability=True), xgb.XGBClassifier(silent=True, n_estimators=120, max_depth=7)]}

ens = Ensembler(model_dict=model_dict, num_folds=3, task_type='classification',
                optimize=multiclass_logloss, lower_is_better=True, save_path='./temp//')

ens.fit(train_data_dict, ytrain, lentrain=xtrain_tfv.shape[0])


[17:01:11] INFO Found 2 classes
[17:01:11] INFO Training Level 0 Fold # 1. Model # 0
[17:01:22] INFO Predicting Level 0. Fold # 1. Model # 0
[17:01:23] INFO Level 0. Fold # 1. Model # 0. Validation Score = 0.335918
[17:01:23] INFO Training Level 0 Fold # 2. Model # 0
[17:01:33] INFO Predicting Level 0. Fold # 2. Model # 0
[17:01:34] INFO Level 0. Fold # 2. Model # 0. Validation Score = 0.285538
[17:01:34] INFO Training Level 0 Fold # 3. Model # 0
[17:01:44] INFO Predicting Level 0. Fold # 3. Model # 0
[17:01:45] INFO Level 0. Fold # 3. Model # 0. Validation Score = 0.301440
[17:01:45] INFO Level 0. Model # 0. Mean Score = 0.307632. Std Dev = 0.021029
[17:01:45] INFO Training Level 0 Fold # 1. Model # 1
[17:01:46] INFO Predicting Level 0. Fold # 1. Model # 1
[17:01:46] INFO Level 0. Fold # 1. Model # 1. Validation Score = 0.309723
[17:01:46] INFO Training Level 0 Fold # 2. Model # 1
[17:01:46] INFO Predicting Level 0. Fold # 2. Model # 1
[17:01:46] INFO Level 0. Fold # 2. Model # 1. Validation Score = 0.269141
[17:01:46] INFO Training Level 0 Fold # 3. Model # 1
[17:01:47] INFO Predicting Level 0. Fold # 3. Model # 1
[17:01:47] INFO Level 0. Fold # 3. Model # 1. Validation Score = 0.293519
[17:01:47] INFO Level 0. Model # 1. Mean Score = 0.290794. Std Dev = 0.016679
[17:01:47] INFO Training Level 0 Fold # 1. Model # 2
[17:01:51] INFO Predicting Level 0. Fold # 1. Model # 2
[17:01:51] INFO Level 0. Fold # 1. Model # 2. Validation Score = 0.283444
[17:01:51] INFO Training Level 0 Fold # 2. Model # 2
[17:01:55] INFO Predicting Level 0. Fold # 2. Model # 2
[17:01:55] INFO Level 0. Fold # 2. Model # 2. Validation Score = 0.262973
[17:01:55] INFO Training Level 0 Fold # 3. Model # 2
[17:02:00] INFO Predicting Level 0. Fold # 3. Model # 2
[17:02:00] INFO Level 0. Fold # 3. Model # 2. Validation Score = 0.270856
[17:02:00] INFO Level 0. Model # 2. Mean Score = 0.272424. Std Dev = 0.008431
[17:02:00] INFO Training Level 0 Fold # 1. Model # 3
[17:02:00] INFO Predicting Level 0. Fold # 1. Model # 3
[17:02:00] INFO Level 0. Fold # 1. Model # 3. Validation Score = 8.691493
[17:02:00] INFO Training Level 0 Fold # 2. Model # 3
[17:02:00] INFO Predicting Level 0. Fold # 2. Model # 3
[17:02:00] INFO Level 0. Fold # 2. Model # 3. Validation Score = 9.387439
[17:02:00] INFO Training Level 0 Fold # 3. Model # 3
[17:02:00] INFO Predicting Level 0. Fold # 3. Model # 3
[17:02:00] INFO Level 0. Fold # 3. Model # 3. Validation Score = 8.697290
[17:02:00] INFO Level 0. Model # 3. Mean Score = 8.925407. Std Dev = 0.326714
[17:02:00] INFO Saving predictions for level # 0
[17:02:00] INFO Training Level 1 Fold # 1. Model # 0
[17:02:01] INFO Predicting Level 1. Fold # 1. Model # 0
[17:02:01] INFO Level 1. Fold # 1. Model # 0. Validation Score = 0.292691
[17:02:01] INFO Training Level 1 Fold # 2. Model # 0
[17:02:02] INFO Predicting Level 1. Fold # 2. Model # 0
[17:02:03] INFO Level 1. Fold # 2. Model # 0. Validation Score = 0.270917
[17:02:03] INFO Training Level 1 Fold # 3. Model # 0
[17:02:04] INFO Predicting Level 1. Fold # 3. Model # 0
[17:02:04] INFO Level 1. Fold # 3. Model # 0. Validation Score = 0.281586
[17:02:04] INFO Level 1. Model # 0. Mean Score = 0.281731. Std Dev = 0.008890
[17:02:04] INFO Training Level 1 Fold # 1. Model # 1
[17:02:05] INFO Predicting Level 1. Fold # 1. Model # 1
[17:02:05] INFO Level 1. Fold # 1. Model # 1. Validation Score = 0.296477
[17:02:05] INFO Training Level 1 Fold # 2. Model # 1
[17:02:06] INFO Predicting Level 1. Fold # 2. Model # 1
[17:02:06] INFO Level 1. Fold # 2. Model # 1. Validation Score = 0.277753
[17:02:06] INFO Training Level 1 Fold # 3. Model # 1
[17:02:07] INFO Predicting Level 1. Fold # 3. Model # 1
[17:02:07] INFO Level 1. Fold # 3. Model # 1. Validation Score = 0.286212
[17:02:07] INFO Level 1. Model # 1. Mean Score = 0.286814. Std Dev = 0.007656
[17:02:07] INFO Training Level 1 Fold # 1. Model # 2
[17:02:08] INFO Predicting Level 1. Fold # 1. Model # 2
[17:02:08] INFO Level 1. Fold # 1. Model # 2. Validation Score = 0.294685
[17:02:08] INFO Training Level 1 Fold # 2. Model # 2
[17:02:09] INFO Predicting Level 1. Fold # 2. Model # 2
[17:02:09] INFO Level 1. Fold # 2. Model # 2. Validation Score = 0.274593
[17:02:09] INFO Training Level 1 Fold # 3. Model # 2
[17:02:10] INFO Predicting Level 1. Fold # 3. Model # 2
[17:02:10] INFO Level 1. Fold # 3. Model # 2. Validation Score = 0.284617
[17:02:10] INFO Level 1. Model # 2. Mean Score = 0.284632. Std Dev = 0.008203
[17:02:10] INFO Training Level 1 Fold # 1. Model # 3
[17:02:11] INFO Predicting Level 1. Fold # 1. Model # 3
[17:02:11] INFO Level 1. Fold # 1. Model # 3. Validation Score = 0.284142
[17:02:11] INFO Training Level 1 Fold # 2. Model # 3
[17:02:12] INFO Predicting Level 1. Fold # 2. Model # 3
[17:02:12] INFO Level 1. Fold # 2. Model # 3. Validation Score = 0.254376
[17:02:12] INFO Training Level 1 Fold # 3. Model # 3
[17:02:13] INFO Predicting Level 1. Fold # 3. Model # 3
[17:02:13] INFO Level 1. Fold # 3. Model # 3. Validation Score = 0.266175
[17:02:13] INFO Level 1. Model # 3. Mean Score = 0.268231. Std Dev = 0.012239
[17:02:13] INFO Saving predictions for level # 1
Out[99]:
{0: array([[  9.42286218e-01,   5.77137818e-02,   9.99805711e-01, ...,
           8.85082595e-03,   1.00000000e+00,   2.52200033e-92],
        [  9.70445041e-01,   2.95549592e-02,   9.98932747e-01, ...,
           8.55579413e-03,   7.18654629e-16,   1.00000000e+00],
        [  6.34743234e-02,   9.36525677e-01,   3.47517494e-01, ...,
           3.90822142e-01,   1.51577760e-25,   1.00000000e+00],
        ..., 
        [  9.16477625e-01,   8.35223753e-02,   9.99999647e-01, ...,
           2.98558129e-03,   1.00000000e+00,   6.80273006e-86],
        [  8.66455855e-01,   1.33544145e-01,   9.90745144e-01, ...,
           5.04983328e-02,   1.04948717e-07,   9.99999895e-01],
        [  7.55104872e-03,   9.92448951e-01,   2.64850446e-02, ...,
           7.63928294e-01,   1.44059087e-17,   1.00000000e+00]]),
 1: array([[  9.52074421e-01,   4.79255788e-02,   9.58195149e-01, ...,
           4.55324932e-02,   9.99392152e-01,   6.07825583e-04],
        [  9.70454063e-01,   2.95459367e-02,   9.54415407e-01, ...,
           3.65875520e-02,   9.97906983e-01,   2.09303363e-03],
        [  2.51692893e-01,   7.48307107e-01,   2.38694091e-01, ...,
           7.51031222e-01,   1.72431529e-01,   8.27568471e-01],
        ..., 
        [  9.53999115e-01,   4.60008851e-02,   9.59065266e-01, ...,
           4.47468298e-02,   9.99341369e-01,   6.58636214e-04],
        [  9.65884051e-01,   3.41159487e-02,   9.60644513e-01, ...,
           3.49709151e-02,   9.34725642e-01,   6.52743727e-02],
        [  1.81446211e-01,   8.18553789e-01,   1.89758722e-01, ...,
           8.10529066e-01,   4.97666001e-02,   9.50233400e-01]])}

In [100]:
preds = ens.predict(test_data_dict, lentest=xtest_tfv.shape[0])
predictions_test = preds[1][:,1]
predictions_test = np.where(predictions_test>=0.41, 1,0)
sample['label'] = predictions_test
sample.to_csv('ensemble.csv', index=False)


[17:02:13] INFO Training Fulldata Level 0. Model # 0
[17:02:39] INFO Predicting Test Level 0. Model # 0
[17:02:40] INFO Training Fulldata Level 0. Model # 1
[17:02:41] INFO Predicting Test Level 0. Model # 1
[17:02:41] INFO Training Fulldata Level 0. Model # 2
[17:02:47] INFO Predicting Test Level 0. Model # 2
[17:02:47] INFO Training Fulldata Level 0. Model # 3
[17:02:47] INFO Predicting Test Level 0. Model # 3
[17:02:47] INFO Training Fulldata Level 1. Model # 0
[17:02:50] INFO Predicting Test Level 1. Model # 0
[17:02:50] INFO Training Fulldata Level 1. Model # 1
[17:02:53] INFO Predicting Test Level 1. Model # 1
[17:02:53] INFO Training Fulldata Level 1. Model # 2
[17:02:56] INFO Predicting Test Level 1. Model # 2
[17:02:56] INFO Training Fulldata Level 1. Model # 3
[17:02:57] INFO Predicting Test Level 1. Model # 3

DL


In [171]:
# Dependecy imports
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical
from keras.layers import SpatialDropout1D, Dropout

from keras.layers import Dense, Activation, Reshape, Merge, Embedding, Input, Concatenate
from keras.models import Model as KerasModel

from sklearn.preprocessing import scale

In [128]:
max_fatures = 2500 # Top 2000 words
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
# The training phase is by means of the fit_on_texts method and you
# can see the word index using the word_index property:
tokenizer.fit_on_texts(train['tweet_without_stopwords'].values)

In [129]:
# texts_to_sequences method turns input into numerical arrays
train_data = tokenizer.texts_to_sequences(train['tweet_without_stopwords'].values)
test_data = tokenizer.texts_to_sequences(test['tweet_without_stopwords'].values)

In [130]:
print("\nExamples:")
print(train['tweet_without_stopwords'][100], '-->', train_data[100])
print(train['tweet_without_stopwords'][200], '-->', train_data[200])
print(train['tweet_without_stopwords'][300], '-->', train_data[300])


Examples:
dreamy effect created straight the iphone impressive spring flower blossom http instagram com p tpogyugzze --> [1432, 2143, 13, 1, 519, 530, 2, 7, 3, 6]
woohoo finally get nx1000 finally camera nx1000 samsung excited nice black peace tha http instagr p sa8b nzddg --> [76, 43, 76, 109, 8, 185, 188, 160, 514, 1918, 2, 16, 6]
fibs rhymes sibs nibs jibs bibs ribs dibs start today http bit ly rhymeapp rhyme iphone --> [561, 476, 38, 2, 54, 33, 562, 563, 1]

In [133]:
# All Phrase numerical values reshape to match size for all
train_data_pad = pad_sequences(train_data,maxlen=68)
test_data_pad = pad_sequences(test_data)
print("\nExample")
print(train_data[100], '-->', train_data_pad[100])


Example
[1432, 2143, 13, 1, 519, 530, 2, 7, 3, 6] --> [   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0 1432 2143
   13    1  519  530    2    7    3    6]

In [134]:
print('\nInput train data shape:', train_data_pad.shape)
print('Input test data shape:', test_data_pad.shape)


Input train data shape: (7920, 68)
Input test data shape: (1953, 68)

In [135]:
# One Hot encoding
train_labels = pd.get_dummies(train['label']).values
print('Sample labels:')
print(train_labels[0:2])


Sample labels:
[[1 0]
 [1 0]]

In [140]:
embed_dim = 200
lstm_out = 150 # Output Neurons
batch_size = 128
drop_out = 0.2

In [169]:
from keras.layers import Input

In [ ]:
def fit(self, X_train, y_train):
    self.model.fit(self.preprocessing(X_train), y_train, epochs=self.epochs, batch_size=512)

def guess(self, features):
        features = self.preprocessing(features)
        result = self.model.predict(features).flatten()
        return result

In [179]:
train_data_pad.shape


Out[179]:
(7920, 68)

In [182]:
train_X.shape


Out[182]:
(7920, 14)

In [220]:
from sklearn.preprocessing import StandardScaler

In [221]:
scaler = StandardScaler()
scaler.fit(train_X.values)


Out[221]:
StandardScaler(copy=True, with_mean=True, with_std=True)

In [222]:
scaled_train_X = scaler.transform(train_X)
scaled_test_X = scaler.transform(test_X)

In [223]:
def split_features(X):
    
    X_list = []
    x_0 = train_data_pad[..., :]
    X_list.append(x_0)
    
    x_1 = X[..., :]
    X_list.append(x_1)

    return X_list

In [230]:
inp_1 = Input(shape=(68,))
out_1 = Embedding(max_fatures, embed_dim, input_length=train_data_pad.shape[1])(inp_1)
s_drop_1 = SpatialDropout1D(.2)(out_1)
lstm_1 = LSTM(lstm_out,return_sequences=True, dropout = 0.2)(s_drop_1)
lstm_2 = LSTM(50, dropout=0.2)(lstm_1)
drop_2 = Dropout(drop_out)(lstm_2)
out_dense_1 = Dense(64,activation='relu')(drop_2)

inp_2 = Input(shape=(14,))
dense_2 = Dense(64,activation='relu')(inp_2)
out_dense_2 = Reshape(target_shape=(64,))(dense_2)
dense_3 = Dense(32,activation='relu')(out_dense_2)
out_dense_3 = Reshape(target_shape=(32,))(dense_3)

input_model = [inp_1, inp_2]
output_model = [out_dense_1, out_dense_3]

output = Concatenate()(output_model)
output = Dense(16, activation='relu')(output)
output = Dropout(drop_out)(output)
output = Dense(2, activation='softmax')(output)

model = KerasModel(inputs=input_model, outputs=output)

model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
====================================================================================================
input_13 (InputLayer)            (None, 68)            0                                            
____________________________________________________________________________________________________
embedding_31 (Embedding)         (None, 68, 200)       500000      input_13[0][0]                   
____________________________________________________________________________________________________
spatial_dropout1d_29 (SpatialDro (None, 68, 200)       0           embedding_31[0][0]               
____________________________________________________________________________________________________
input_14 (InputLayer)            (None, 14)            0                                            
____________________________________________________________________________________________________
lstm_35 (LSTM)                   (None, 68, 150)       210600      spatial_dropout1d_29[0][0]       
____________________________________________________________________________________________________
dense_64 (Dense)                 (None, 64)            960         input_14[0][0]                   
____________________________________________________________________________________________________
lstm_36 (LSTM)                   (None, 50)            40200       lstm_35[0][0]                    
____________________________________________________________________________________________________
reshape_9 (Reshape)              (None, 64)            0           dense_64[0][0]                   
____________________________________________________________________________________________________
dropout_41 (Dropout)             (None, 50)            0           lstm_36[0][0]                    
____________________________________________________________________________________________________
dense_65 (Dense)                 (None, 32)            2080        reshape_9[0][0]                  
____________________________________________________________________________________________________
dense_63 (Dense)                 (None, 64)            3264        dropout_41[0][0]                 
____________________________________________________________________________________________________
reshape_10 (Reshape)             (None, 32)            0           dense_65[0][0]                   
____________________________________________________________________________________________________
concatenate_4 (Concatenate)      (None, 96)            0           dense_63[0][0]                   
                                                                   reshape_10[0][0]                 
____________________________________________________________________________________________________
dense_66 (Dense)                 (None, 16)            1552        concatenate_4[0][0]              
____________________________________________________________________________________________________
dropout_42 (Dropout)             (None, 16)            0           dense_66[0][0]                   
____________________________________________________________________________________________________
dense_67 (Dense)                 (None, 2)             34          dropout_42[0][0]                 
====================================================================================================
Total params: 758,690
Trainable params: 758,690
Non-trainable params: 0
____________________________________________________________________________________________________

In [233]:
model.fit(split_features(train_X.values), train_labels, epochs=10, batch_size=batch_size, verbose=2)


Epoch 1/10
21s - loss: 0.1701 - acc: 0.9407
Epoch 2/10
21s - loss: 0.1496 - acc: 0.9511
Epoch 3/10
21s - loss: 0.1369 - acc: 0.9578
Epoch 4/10
21s - loss: 0.1280 - acc: 0.9645
Epoch 5/10
21s - loss: 0.1254 - acc: 0.9620
Epoch 6/10
21s - loss: 0.1046 - acc: 0.9718
Epoch 7/10
23s - loss: 0.1044 - acc: 0.9734
Epoch 8/10
22s - loss: 0.0958 - acc: 0.9745
Epoch 9/10
22s - loss: 0.0864 - acc: 0.9792
Epoch 10/10
22s - loss: 0.0792 - acc: 0.9828
Out[233]:
<keras.callbacks.History at 0x25ccdb01f28>

In [234]:
def split_features(X):
    
    X_list = []
    x_0 = test_data_pad[..., :]
    X_list.append(x_0)
    
    x_1 = X[..., :]
    X_list.append(x_1)

    return X_list

In [235]:
result = model.predict(split_features(test_X.values))

In [236]:
result = result[:,1]

In [237]:
sample['label'] = result
sample.to_csv('_a.csv', index=False)

In [238]:
predictions_test = np.where(result>=0.41, 1,0)
sample['label'] = predictions_test
sample.to_csv('__a.csv', index=False)

In [239]:
from keras.utils import plot_model
plot_model(model, to_file='model.png')

In [ ]:
# model = Sequential()
# model.add(Embedding(max_fatures, embed_dim, input_length=train_data_pad.shape[1]))
# model.add(SpatialDropout1D(drop_out))

# # LSTMs
# model.add(LSTM(lstm_out,return_sequences=True, dropout = 0.2))
# model.add(LSTM(50, dropout=0.2))
# model.add(Dropout(drop_out))

# model.add(Dense(2, activation='softmax'))
# model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# print(model.summary())

In [166]:
model.fit(train_data_pad, train_labels, epochs=10, batch_size=batch_size, verbose=2, validation_split=0.2)


Train on 6336 samples, validate on 1584 samples
Epoch 1/6
22s - loss: 0.3987 - acc: 0.8174 - val_loss: 0.2621 - val_acc: 0.8826
Epoch 2/6
21s - loss: 0.2250 - acc: 0.9058 - val_loss: 0.2379 - val_acc: 0.8939
Epoch 3/6
19s - loss: 0.1781 - acc: 0.9306 - val_loss: 0.2584 - val_acc: 0.8826
Epoch 4/6
20s - loss: 0.1465 - acc: 0.9426 - val_loss: 0.2943 - val_acc: 0.8782
Epoch 5/6
21s - loss: 0.1368 - acc: 0.9485 - val_loss: 0.3084 - val_acc: 0.8756
Epoch 6/6
21s - loss: 0.1204 - acc: 0.9568 - val_loss: 0.3744 - val_acc: 0.8712
Out[166]:
<keras.callbacks.History at 0x25cdd5655f8>

In [167]:
preds_1 = model.predict_proba(test_data_pad)[:,1]


1953/1953 [==============================] - ETA: 78 - ETA: 41 - ETA: 28 - ETA: 22 - ETA: 18 - ETA: 15 - ETA: 14 - ETA: 12 - ETA: 11 - ETA: 10 - ETA: 9 - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - 6s     

model_2


In [154]:
# A simple LSTM with two dense layers
from keras.callbacks import EarlyStopping

model_2 = Sequential()
model_2.add(Embedding(max_fatures, embed_dim, input_length=train_data_pad.shape[1]))
model_2.add(SpatialDropout1D(0.3))
model_2.add(LSTM(300, dropout=0.3, recurrent_dropout=0.3))

model_2.add(Dense(512, activation='relu'))
model_2.add(Dropout(0.3))

model_2.add(Dense(256, activation='relu'))
model_2.add(Dropout(0.3))

model_2.add(Dense(2, activation='softmax'))
model_2.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

# Fit the model_2 with early stopping callback

earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=2, mode='auto')
model_2.fit(train_data_pad, train_labels, epochs=10, batch_size=batch_size, verbose=2, validation_split=0.2,callbacks=[earlystop])


Train on 6336 samples, validate on 1584 samples
Epoch 1/10
17s - loss: 0.6530 - acc: 0.7331 - val_loss: 0.6125 - val_acc: 0.7519
Epoch 2/10
15s - loss: 0.5993 - acc: 0.7423 - val_loss: 0.5776 - val_acc: 0.7519
Epoch 3/10
15s - loss: 0.5788 - acc: 0.7423 - val_loss: 0.5654 - val_acc: 0.7519
Epoch 4/10
15s - loss: 0.5721 - acc: 0.7423 - val_loss: 0.5607 - val_acc: 0.7519
Epoch 5/10
15s - loss: 0.5696 - acc: 0.7423 - val_loss: 0.5587 - val_acc: 0.7519
Epoch 6/10
15s - loss: 0.5689 - acc: 0.7423 - val_loss: 0.5577 - val_acc: 0.7519
Epoch 7/10
15s - loss: 0.5685 - acc: 0.7423 - val_loss: 0.5569 - val_acc: 0.7519
Epoch 8/10
15s - loss: 0.5671 - acc: 0.7423 - val_loss: 0.5562 - val_acc: 0.7519
Epoch 9/10
15s - loss: 0.5667 - acc: 0.7423 - val_loss: 0.5556 - val_acc: 0.7519
Epoch 10/10
15s - loss: 0.5667 - acc: 0.7423 - val_loss: 0.5550 - val_acc: 0.7519
Out[154]:
<keras.callbacks.History at 0x25ccb7e1eb8>

In [155]:
model_2.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_17 (Embedding)     (None, 68, 200)           500000    
_________________________________________________________________
spatial_dropout1d_17 (Spatia (None, 68, 200)           0         
_________________________________________________________________
lstm_17 (LSTM)               (None, 300)               601200    
_________________________________________________________________
dense_33 (Dense)             (None, 512)               154112    
_________________________________________________________________
dropout_25 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_34 (Dense)             (None, 256)               131328    
_________________________________________________________________
dropout_26 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_35 (Dense)             (None, 2)                 514       
=================================================================
Total params: 1,387,154
Trainable params: 1,387,154
Non-trainable params: 0
_________________________________________________________________

In [156]:
preds_2 = model_2.predict_proba(test_data_pad)[:,1]


1952/1953 [============================>.] - ETA: 49 - ETA: 26 - ETA: 18 - ETA: 14 - ETA: 12 - ETA: 10 - ETA: 9 - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA: 0s

model_3


In [160]:
# A simple bidirectional LSTM with Embeddings and two dense layers
from keras.layers import Bidirectional
model_3 = Sequential()
model_3.add(Embedding(max_fatures, embed_dim, input_length=train_data_pad.shape[1]))
model_3.add(SpatialDropout1D(0.3))
model_3.add(Bidirectional(LSTM(300, dropout=0.3, recurrent_dropout=0.3)))

model_3.add(Dense(512, activation='relu'))
model_3.add(Dropout(0.3))

model_3.add(Dense(256, activation='relu'))
model_3.add(Dropout(0.3))

model_3.add(Dense(2, activation='softmax'))
model_3.compile(loss='categorical_crossentropy', optimizer='adam')

earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=2, mode='auto')
model_3.fit(train_data_pad, train_labels, epochs=10, batch_size=batch_size, verbose=2, validation_split=0.2,callbacks=[earlystop])


Train on 6336 samples, validate on 1584 samples
Epoch 1/10
36s - loss: 0.3734 - val_loss: 0.2472
Epoch 2/10
33s - loss: 0.2249 - val_loss: 0.2318
Epoch 3/10
33s - loss: 0.1813 - val_loss: 0.2470
Epoch 4/10
33s - loss: 0.1651 - val_loss: 0.2776
Epoch 5/10
33s - loss: 0.1390 - val_loss: 0.3069
Epoch 6/10
32s - loss: 0.1168 - val_loss: 0.3684
Epoch 00005: early stopping
Out[160]:
<keras.callbacks.History at 0x25cd949ef60>

In [161]:
model_3.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_20 (Embedding)     (None, 68, 200)           500000    
_________________________________________________________________
spatial_dropout1d_20 (Spatia (None, 68, 200)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 600)               1202400   
_________________________________________________________________
dense_39 (Dense)             (None, 512)               307712    
_________________________________________________________________
dropout_29 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_40 (Dense)             (None, 256)               131328    
_________________________________________________________________
dropout_30 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_41 (Dense)             (None, 2)                 514       
=================================================================
Total params: 2,141,954
Trainable params: 2,141,954
Non-trainable params: 0
_________________________________________________________________

In [162]:
preds_3 = model_3.predict_proba(test_data_pad)[:,1]


1953/1953 [==============================] - ETA: 69 - ETA: 37 - ETA: 26 - ETA: 21 - ETA: 17 - ETA: 15 - ETA: 14 - ETA: 12 - ETA: 11 - ETA: 11 - ETA: 10 - ETA: 9 - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - ETA:  - 8s     

all_model_preds


In [163]:
predictions_test = np.where(preds_1>=0.41, 1,0)
sample['label'] = predictions_test
sample.to_csv('pred_lstm1.csv', index=False)

predictions_test = np.where(preds_2>=0.41, 1,0)
sample['label'] = predictions_test
sample.to_csv('pred_lstm2.csv', index=False)

predictions_test = np.where(preds_3>=0.41, 1,0)
sample['label'] = predictions_test
sample.to_csv('pred_lstm3.csv', index=False)

In [168]:
predictions_test = np.where(preds_1>=0.41, 1,0)
sample['label'] = predictions_test
sample.to_csv('pred_lstmlstm1.csv', index=False)

In [ ]: