In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')
In [3]:
def preprocessing_text(s):
    import re
    # Keep letters, digits, apostrophes (needed for the contraction rules below) and the
    # punctuation that later rules translate or strip; everything else becomes a space.
    s = re.sub(r"[^A-Za-z0-9^,\*\+\-\=\/\;\:\.\<']", " ", s)
    s = re.sub(r"(\d+)(k)", r"\g<1>000", s)   # expand 'k' to '000', e.g. 50k to 50000
    s = re.sub(r"\;", " ", s)
    s = re.sub(r"\:", " ", s)
    s = re.sub(r"\,", " ", s)
    s = re.sub(r"\.", " ", s)
    s = re.sub(r"\<", " ", s)
    s = re.sub(r"\^", " ", s)
    s = re.sub(r"e-mail", "email", s)             # normalise before '-' is rewritten below
    s = re.sub(r"e - mail", "email", s)
    s = re.sub(r"(\d+)(/)", r"\g<1> divide ", s)  # number/number to number divide number (e.g. 2/3 to 2 divide 3)
    s = re.sub(r"\/", " ", s)                     # replace the rest of / with white space
    s = re.sub(r"\+", " plus ", s)
    s = re.sub(r"\-", " minus ", s)
    s = re.sub(r"\*", " multiply ", s)
    s = re.sub(r"\=", " equal ", s)
    # Expand common contractions.
    s = re.sub(r"What's", "What is ", s)
    s = re.sub(r"what's", "what is ", s)
    s = re.sub(r"Who's", "Who is ", s)
    s = re.sub(r"who's", "who is ", s)
    s = re.sub(r"\'s", " ", s)
    s = re.sub(r"\'ve", " have ", s)
    s = re.sub(r"can't", "cannot ", s)
    s = re.sub(r"n't", " not ", s)
    s = re.sub(r"\'re", " are ", s)
    s = re.sub(r"\'d", " would ", s)
    s = re.sub(r"\'ll", " will ", s)
    s = re.sub(r"\'m", " am ", s)
    s = re.sub(r"'", " ", s)                      # drop any apostrophes that remain
    s = re.sub(r"or not", " ", s)
    # Normalise a few frequent question phrasings.
    s = re.sub(r"What should I do to", "How can I", s)
    s = re.sub(r"How do I", "How can I", s)
    s = re.sub(r"How can you make", "What can make", s)
    s = re.sub(r"How do we", "How do I", s)
    s = re.sub(r"How do you", "How do I", s)
    s = re.sub(r"Is it possible", "Can we", s)
    s = re.sub(r"Why is", "Why", s)
    s = re.sub(r"Which are", "What are", s)
    s = re.sub(r"What are the reasons", "Why", s)
    s = re.sub(r"What are some tips", "tips", s)
    s = re.sub(r"What is the best way", "best way", s)
    # Normalise country mentions; match whole words only and handle "USA" before "US".
    s = re.sub(r"\bUSA\b", "America", s)
    s = re.sub(r"\busa\b", "America", s)
    s = re.sub(r"\bUS\b", "America", s)
    s = re.sub(r"\bus\b", "America", s)
    s = re.sub(r"Chinese", "China", s)
    s = re.sub(r"india", "India", s)
    s = re.sub(r"\s{2,}", " ", s)                 # remove extra white space
    s = s.strip()
    return s
def remove_stopwords(string):
    from nltk.corpus import stopwords
    stopwords_set = set(stopwords.words("english"))
    # Build a new list instead of removing items from the list while iterating over it,
    # which would silently skip words.
    word_list = [word.lower() for word in string.split() if word.lower() not in stopwords_set]
    return ' '.join(word_list)
def get_char_length_ratio(row):
    # Character-length ratio between the raw tweet and the stopword-free tweet.
    return len(row['tweet']) / max(1, len(row['tweet_without_stopwords']))

def get_synonyms(word):
    # WordNet synonyms (lemma names) for a single word.
    from nltk.corpus import wordnet as wn
    synonyms = []
    if wn.synsets(word):
        for syn in wn.synsets(word):
            for l in syn.lemmas():
                synonyms.append(l.name())
    return list(set(synonyms))

def get_row_syn_set(row):
    # Tokens of the tweet plus the WordNet synonyms of every token.
    import nltk
    syn_set = [nltk.word_tokenize(row)]
    for token in nltk.word_tokenize(row):
        if get_synonyms(token):
            syn_set.append(get_synonyms(token))
    return set([y for x in syn_set for y in x])

def get_Levenshtein(string1, string2):
    # Edit distance between two strings.
    import editdistance
    return editdistance.eval(string1, string2)
def num_pos(sent):
    # Count positive-lexicon words that are not directly preceded by a negation.
    count = 0
    word_list = [word.lower() for word in nltk.word_tokenize(sent)]
    for index, word in enumerate(word_list):
        if word in positive_words:
            if index == 0 or word_list[index - 1] not in ['not', 'no']:
                count += 1
    return count

def num_neg(sent):
    # Count negative-lexicon words that are not directly preceded by a negation.
    count = 0
    word_list = [word.lower() for word in nltk.word_tokenize(sent)]
    for index, word in enumerate(word_list):
        if word in negative_words:
            if index == 0 or word_list[index - 1] not in ['not', 'no']:
                count += 1
    return count
# Download positive/negative opinion word lists; the slices at the end skip the tokenized file headers.
p_url = 'http://ptrckprry.com/course/ssd/data/positive-words.txt'
n_url = 'http://ptrckprry.com/course/ssd/data/negative-words.txt'
import requests, nltk
positive_words = requests.get(p_url).content.decode('latin-1')
positive_words = nltk.word_tokenize(positive_words)
positive_words.remove('not')
negative_words = requests.get(n_url).content.decode('latin-1')
negative_words = nltk.word_tokenize(negative_words)
positive_words = positive_words[413:]
negative_words = negative_words[418:]
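In [ ]:
# Rough sketch (not part of the original pipeline): a quick sanity check of the helpers
# above on an invented sentence.
example = "I can't believe how good this is, 5/5!"
cleaned = preprocessing_text(example)
print(cleaned)
print(remove_stopwords(cleaned))
print(num_pos(remove_stopwords(cleaned)), num_neg(remove_stopwords(cleaned)))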
In [4]:
train['tweet'] = train['tweet'].map(lambda x: preprocessing_text(x))
test['tweet'] = test['tweet'].map(lambda x: preprocessing_text(x))
train['tweet'] = train['tweet'].astype(str)
train['tweet_without_stopwords'] = train['tweet'].apply(lambda x: remove_stopwords(x))
test['tweet'] = test['tweet'].astype(str)
test['tweet_without_stopwords'] = test['tweet'].apply(lambda x: remove_stopwords(x))
train['char_length_ratio'] = train.apply(lambda row: get_char_length_ratio(row), axis=1)
test['char_length_ratio'] = test.apply(lambda row: get_char_length_ratio(row), axis=1)
train['tweet_tokens_syn_set'] = train['tweet_without_stopwords'].map(lambda row: get_row_syn_set(row))
train['num_syn_words'] = train.apply(lambda x: len(x['tweet_tokens_syn_set'].intersection(set(nltk.word_tokenize(x['tweet'])))), axis=1)
test['tweet_tokens_syn_set'] = test['tweet_without_stopwords'].map(lambda row: get_row_syn_set(row))
test['num_syn_words'] = test.apply(lambda x:len(x['tweet_tokens_syn_set'].intersection(set(nltk.word_tokenize(x['tweet'])))), axis=1)
train['Lev_dist'] = train.apply(lambda row: get_Levenshtein(row['tweet'],row['tweet_without_stopwords']),axis = 1)
test['Lev_dist'] = test.apply(lambda row: get_Levenshtein(row['tweet'],row['tweet_without_stopwords']),axis = 1)
train['tweet_num_pos'] = train['tweet_without_stopwords'].apply(lambda x: num_pos(x))
train['tweet_num_neg'] = train['tweet_without_stopwords'].apply(lambda x: num_neg(x))
test['tweet_num_pos'] = test['tweet_without_stopwords'].apply(lambda x: num_pos(x))
test['tweet_num_neg'] = test['tweet_without_stopwords'].apply(lambda x: num_neg(x))
train['tweet_diff_num'] = (train['tweet_num_pos'] - train['tweet_num_neg']).abs()
test['tweet_diff_num'] = (test['tweet_num_pos'] - test['tweet_num_neg']).abs()
train.drop('tweet_tokens_syn_set', axis=1, inplace=True)
test.drop('tweet_tokens_syn_set', axis=1, inplace=True)
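In [ ]:
# Rough sketch (diagnostic only): summary statistics of the engineered features.
train[['char_length_ratio', 'num_syn_words', 'Lev_dist',
       'tweet_num_pos', 'tweet_num_neg', 'tweet_diff_num']].describe()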
In [5]:
from sklearn.metrics import f1_score
In [6]:
y = train.label.values
xtrain, xvalid, ytrain, yvalid = train_test_split(train.tweet_without_stopwords.values, y,
stratify=y,
random_state=42,
test_size=0.1, shuffle=True)
In [7]:
# Always start with these features. They work (almost) every time!
tfv = TfidfVectorizer(min_df=3, max_features=None,
                      strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 4), use_idf=1, smooth_idf=1, sublinear_tf=1,
                      stop_words='english')
# Fit TF-IDF on the training and validation texts together, so the vectorizer sees all available text.
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)
xtest_tfv = tfv.transform(test.tweet_without_stopwords.values)
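In [ ]:
# Rough sketch (diagnostic only): size of the fitted TF-IDF vocabulary and the shapes of
# the transformed matrices.
print('vocabulary size:', len(tfv.vocabulary_))
print('shapes:', xtrain_tfv.shape, xvalid_tfv.shape, xtest_tfv.shape)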
In [8]:
# Apply SVD with 200 components. Roughly 120-200 components are usually good enough for an SVM model.
svd = decomposition.TruncatedSVD(n_components=200)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)
xtest_svd = svd.transform(xtest_tfv)
# Scale the data obtained from SVD. New variable names keep the unscaled SVD features available for reuse.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)
xtest_svd_scl = scl.transform(xtest_svd)
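In [ ]:
# Rough sketch (diagnostic only): how much variance do the 200 SVD components retain?
print('explained variance retained: %.3f' % svd.explained_variance_ratio_.sum())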
In [9]:
ctv = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 4), stop_words='english')
# Fit the Count Vectorizer on the training and validation texts together, as with TF-IDF above.
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv = ctv.transform(xtrain)
xvalid_ctv = ctv.transform(xvalid)
xtest_ctv = ctv.transform(test.tweet_without_stopwords.values)
In [18]:
# Fitting a simple SVM
clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict(xvalid_svd_scl)
print ("f1 score: %0.3f " % f1_score(yvalid, predictions))
# Turn class-1 probabilities into labels with a hand-picked threshold.
predictions_test = clf.predict_proba(xtest_svd_scl)
predictions_test = np.where(predictions_test[:, 1] >= 0.405, 1, 0)
sample['label'] = predictions_test
sample.to_csv('svc_preds.csv', index=False)
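In [ ]:
# Rough sketch (not part of the original workflow): rather than hand-picking 0.405, the
# probability threshold could be chosen by scanning the validation set for the best F1.
probs_valid = clf.predict_proba(xvalid_svd_scl)[:, 1]
best_t = max(np.arange(0.30, 0.60, 0.005),
             key=lambda t: f1_score(yvalid, (probs_valid >= t).astype(int)))
print('best threshold on validation: %.3f' % best_t)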
In [14]:
# Fitting a simple xgboost on tf-idf
clf = xgb.XGBClassifier(max_depth=8, n_estimators=200, colsample_bytree=0.8,
subsample=0.8, n_jobs=-1, learning_rate=0.1)
clf.fit(xtrain_tfv.tocsc(), ytrain)
predictions = clf.predict(xvalid_tfv.tocsc())
print ("f1 score: %0.3f " % f1_score(yvalid, predictions))
predictions_test = clf.predict_proba(xtest_tfv.tocsc())
predictions_test = np.where(predictions_test[:,1]>=0.41, 1,0)
sample['label'] = predictions_test
sample.to_csv('xgb_preds.csv', index=False)
In [23]:
mll_scorer = metrics.make_scorer(f1_score, greater_is_better=True, needs_proba=False)
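In [ ]:
# Rough sketch (not run in the original notebook): mll_scorer can be plugged into
# GridSearchCV, e.g. to tune the SVC regularisation strength on the scaled SVD features.
# The parameter grid below is an illustrative assumption, not the notebook's setting.
param_grid = {'C': [0.5, 1.0, 2.0]}
grid = GridSearchCV(SVC(probability=True), param_grid, scoring=mll_scorer, cv=3)
# grid.fit(xtrain_svd_scl, ytrain); print(grid.best_params_, grid.best_score_)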
In [24]:
# This is the main ensembling class; how to use it is shown in a later cell.
#######################################################################
# The ensembling script below is adapted from Kaggle.com.
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
import pandas as pd
import os
import sys
import logging

logging.basicConfig(
    level=logging.DEBUG,
    format="[%(asctime)s] %(levelname)s %(message)s",
    datefmt="%H:%M:%S", stream=sys.stdout)
logger = logging.getLogger(__name__)
class Ensembler(object):
    def __init__(self, model_dict, num_folds=3, task_type='classification', optimize=roc_auc_score,
                 lower_is_better=False, save_path=None):
        """
        Ensembler init function
        :param model_dict: model dictionary, see README for its format
        :param num_folds: the number of folds for ensembling
        :param task_type: classification or regression
        :param optimize: the function to optimize for, e.g. AUC, logloss, etc. Must have two arguments y_test and y_pred
        :param lower_is_better: is lower value of optimization function better or higher
        :param save_path: path to which model pickles will be dumped to along with generated predictions, or None
        """
        self.model_dict = model_dict
        self.levels = len(self.model_dict)
        self.num_folds = num_folds
        self.task_type = task_type
        self.optimize = optimize
        self.lower_is_better = lower_is_better
        self.save_path = save_path

        self.training_data = None
        self.test_data = None
        self.y = None
        self.lbl_enc = None
        self.y_enc = None
        self.train_prediction_dict = None
        self.test_prediction_dict = None
        self.num_classes = None

    def fit(self, training_data, y, lentrain):
        """
        :param training_data: training data in tabular format
        :param y: binary, multi-class or regression
        :return: chain of models to be used in prediction
        """
        self.training_data = training_data
        self.y = y

        if self.task_type == 'classification':
            self.num_classes = len(np.unique(self.y))
            logger.info("Found %d classes", self.num_classes)
            self.lbl_enc = LabelEncoder()
            self.y_enc = self.lbl_enc.fit_transform(self.y)
            kf = StratifiedKFold(n_splits=self.num_folds)
            train_prediction_shape = (lentrain, self.num_classes)
        else:
            self.num_classes = -1
            self.y_enc = self.y
            kf = KFold(n_splits=self.num_folds)
            train_prediction_shape = (lentrain, 1)

        self.train_prediction_dict = {}
        for level in range(self.levels):
            self.train_prediction_dict[level] = np.zeros((train_prediction_shape[0],
                                                          train_prediction_shape[1] * len(self.model_dict[level])))

        for level in range(self.levels):
            if level == 0:
                temp_train = self.training_data
            else:
                temp_train = self.train_prediction_dict[level - 1]

            for model_num, model in enumerate(self.model_dict[level]):
                validation_scores = []
                foldnum = 1
                for train_index, valid_index in kf.split(self.train_prediction_dict[0], self.y_enc):
                    logger.info("Training Level %d Fold # %d. Model # %d", level, foldnum, model_num)

                    if level != 0:
                        l_training_data = temp_train[train_index]
                        l_validation_data = temp_train[valid_index]
                        model.fit(l_training_data, self.y_enc[train_index])
                    else:
                        l0_training_data = temp_train[0][model_num]
                        if type(l0_training_data) == list:
                            l_training_data = [x[train_index] for x in l0_training_data]
                            l_validation_data = [x[valid_index] for x in l0_training_data]
                        else:
                            l_training_data = l0_training_data[train_index]
                            l_validation_data = l0_training_data[valid_index]
                        model.fit(l_training_data, self.y_enc[train_index])

                    logger.info("Predicting Level %d. Fold # %d. Model # %d", level, foldnum, model_num)

                    if self.task_type == 'classification':
                        temp_train_predictions = model.predict_proba(l_validation_data)
                        self.train_prediction_dict[level][valid_index,
                            (model_num * self.num_classes):(model_num * self.num_classes) +
                            self.num_classes] = temp_train_predictions
                    else:
                        temp_train_predictions = model.predict(l_validation_data)
                        self.train_prediction_dict[level][valid_index, model_num] = temp_train_predictions

                    validation_score = self.optimize(self.y_enc[valid_index], temp_train_predictions)
                    validation_scores.append(validation_score)
                    logger.info("Level %d. Fold # %d. Model # %d. Validation Score = %f", level, foldnum, model_num,
                                validation_score)
                    foldnum += 1

                avg_score = np.mean(validation_scores)
                std_score = np.std(validation_scores)
                logger.info("Level %d. Model # %d. Mean Score = %f. Std Dev = %f", level, model_num,
                            avg_score, std_score)

            logger.info("Saving predictions for level # %d", level)
            train_predictions_df = pd.DataFrame(self.train_prediction_dict[level])
            train_predictions_df.to_csv(os.path.join(self.save_path, "train_predictions_level_" + str(level) + ".csv"),
                                        index=False, header=None)

        return self.train_prediction_dict

    def predict(self, test_data, lentest):
        self.test_data = test_data
        if self.task_type == 'classification':
            test_prediction_shape = (lentest, self.num_classes)
        else:
            test_prediction_shape = (lentest, 1)

        self.test_prediction_dict = {}
        for level in range(self.levels):
            self.test_prediction_dict[level] = np.zeros((test_prediction_shape[0],
                                                         test_prediction_shape[1] * len(self.model_dict[level])))
        self.test_data = test_data

        for level in range(self.levels):
            if level == 0:
                temp_train = self.training_data
                temp_test = self.test_data
            else:
                temp_train = self.train_prediction_dict[level - 1]
                temp_test = self.test_prediction_dict[level - 1]

            for model_num, model in enumerate(self.model_dict[level]):
                logger.info("Training Fulldata Level %d. Model # %d", level, model_num)
                if level == 0:
                    model.fit(temp_train[0][model_num], self.y_enc)
                else:
                    model.fit(temp_train, self.y_enc)

                logger.info("Predicting Test Level %d. Model # %d", level, model_num)

                if self.task_type == 'classification':
                    if level == 0:
                        temp_test_predictions = model.predict_proba(temp_test[0][model_num])
                    else:
                        temp_test_predictions = model.predict_proba(temp_test)
                    self.test_prediction_dict[level][:, (model_num * self.num_classes):(model_num * self.num_classes) +
                                                     self.num_classes] = temp_test_predictions
                else:
                    if level == 0:
                        temp_test_predictions = model.predict(temp_test[0][model_num])
                    else:
                        temp_test_predictions = model.predict(temp_test)
                    self.test_prediction_dict[level][:, model_num] = temp_test_predictions

            test_predictions_df = pd.DataFrame(self.test_prediction_dict[level])
            test_predictions_df.to_csv(os.path.join(self.save_path, "test_predictions_level_" + str(level) + ".csv"),
                                       index=False, header=None)

        return self.test_prediction_dict
In [25]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    :param actual: Array containing the actual target classes
    :param predicted: Matrix with class predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota
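In [ ]:
# Rough sketch: multiclass_logloss on a tiny made-up example, just to illustrate the
# expected inputs (integer labels and a row-wise probability matrix).
_actual = np.array([0, 1, 1])
_predicted = np.array([[0.9, 0.1],
                       [0.2, 0.8],
                       [0.4, 0.6]])
print(multiclass_logloss(_actual, _predicted))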
In [26]:
import string
eng_stopwords = set(stopwords.words("english"))
In [27]:
## Number of words in the text ##
train["num_words"] = train["tweet"].apply(lambda x: len(str(x).split()))
test["num_words"] = test["tweet"].apply(lambda x: len(str(x).split()))
## Number of unique words in the text ##
train["num_unique_words"] = train["tweet"].apply(lambda x: len(set(str(x).split())))
test["num_unique_words"] = test["tweet"].apply(lambda x: len(set(str(x).split())))
## Number of characters in the text ##
train["num_chars"] = train["tweet"].apply(lambda x: len(str(x)))
test["num_chars"] = test["tweet"].apply(lambda x: len(str(x)))
## Number of stopwords in the text ##
train["num_stopwords"] = train["tweet"].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))
test["num_stopwords"] = test["tweet"].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))
## Number of punctuations in the text ##
train["num_punctuations"] =train["tweet"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )
test["num_punctuations"] =test["tweet"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )
## Number of upper case words in the text ##
train["num_words_upper"] = train["tweet"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
test["num_words_upper"] = test["tweet"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
## Number of title case words in the text ##
train["num_words_title"] = train["tweet"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
test["num_words_title"] = test["tweet"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
## Average length of the words in the text ##
train["mean_word_len"] = train["tweet"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test["mean_word_len"] = test["tweet"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
In [28]:
cols = ['char_length_ratio',
'num_syn_words', 'Lev_dist', 'tweet_num_pos', 'tweet_num_neg',
'tweet_diff_num',"num_words", "num_unique_words", "num_chars", "num_stopwords", "num_punctuations", "num_words_upper", "num_words_title", "mean_word_len"]
train_X = train[cols]
test_X = test[cols]
In [99]:
# Specify the data to be used at every level of ensembling.
# Note: the Ensembler pairs each level-0 data entry with the level-0 model at the same position,
# so only the first four level-0 entries are consumed here; level-1 models are trained on the
# level-0 out-of-fold predictions, so the level-1 data entries are ignored.
train_data_dict = {0: [xtrain_tfv, xtrain_ctv, xtrain_tfv, xtrain_ctv, train_X.values], 1: [xtrain_tfv]}
test_data_dict = {0: [xtest_tfv, xtest_ctv, xtest_tfv, xtest_ctv, test_X.values], 1: [xtest_tfv]}

model_dict = {0: [SVC(C=1.5, probability=True), LogisticRegression(C=5),
                  xgb.XGBClassifier(silent=True, n_estimators=120, max_depth=7),
                  MultinomialNB(alpha=0.1)],
              1: [SVC(C=1, probability=True), SVC(C=5, probability=True),
                  SVC(C=2, probability=True), xgb.XGBClassifier(silent=True, n_estimators=120, max_depth=7)]}

ens = Ensembler(model_dict=model_dict, num_folds=3, task_type='classification',
                optimize=multiclass_logloss, lower_is_better=True, save_path='./temp/')  # ./temp must exist
ens.fit(train_data_dict, ytrain, lentrain=xtrain_tfv.shape[0])
Out[99]:
In [100]:
preds = ens.predict(test_data_dict, lentest=xtest_tfv.shape[0])
predictions_test = preds[1][:, 1]  # class-1 probability from the first level-1 model
predictions_test = np.where(predictions_test>=0.41, 1,0)
sample['label'] = predictions_test
sample.to_csv('ensemble.csv', index=False)
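In [ ]:
# Rough sketch (an alternative, not the original submission): average the class-1
# probability across all four level-1 models instead of taking only the first one.
# Level-1 columns are laid out as [model0_class0, model0_class1, model1_class0, ...].
avg_class1 = preds[1][:, 1::2].mean(axis=1)
# sample['label'] = np.where(avg_class1 >= 0.41, 1, 0)
# sample.to_csv('ensemble_avg.csv', index=False)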
In [171]:
# Dependency imports
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical
from keras.layers import SpatialDropout1D, Dropout
from keras.layers import Activation, Reshape, Input, Concatenate
from keras.models import Model as KerasModel
from sklearn.preprocessing import scale
In [128]:
max_fatures = 2500  # keep the 2,500 most frequent words
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
# fit_on_texts builds the vocabulary; the learned mapping is available via the
# tokenizer's word_index property.
tokenizer.fit_on_texts(train['tweet_without_stopwords'].values)
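In [ ]:
# Rough sketch (diagnostic only): peek at the vocabulary the tokenizer learned.
print('distinct words seen:', len(tokenizer.word_index))
print('sample word_index entries:', list(tokenizer.word_index.items())[:5])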
In [129]:
# texts_to_sequences turns each text into a sequence of word indices
train_data = tokenizer.texts_to_sequences(train['tweet_without_stopwords'].values)
test_data = tokenizer.texts_to_sequences(test['tweet_without_stopwords'].values)
In [130]:
print("\nExamples:")
print(train['tweet_without_stopwords'][100], '-->', train_data[100])
print(train['tweet_without_stopwords'][200], '-->', train_data[200])
print(train['tweet_without_stopwords'][300], '-->', train_data[300])
In [133]:
# Pad/truncate every sequence to the same length (68 tokens) so the phrases can be batched.
train_data_pad = pad_sequences(train_data, maxlen=68)
test_data_pad = pad_sequences(test_data, maxlen=68)  # same length as the training sequences
print("\nExample")
print(train_data[100], '-->', train_data_pad[100])
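In [ ]:
# Rough sketch (diagnostic only): look at the distribution of sequence lengths to
# sanity-check the choice of maxlen=68.
lengths = [len(seq) for seq in train_data]
print('max length:', max(lengths), '| 99th percentile:', np.percentile(lengths, 99))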
In [134]:
print('\nInput train data shape:', train_data_pad.shape)
print('Input test data shape:', test_data_pad.shape)
In [135]:
# One Hot encoding
train_labels = pd.get_dummies(train['label']).values
print('Sample labels:')
print(train_labels[0:2])
In [140]:
embed_dim = 200
lstm_out = 150 # number of units in the first LSTM layer
batch_size = 128
drop_out = 0.2
In [169]:
from keras.layers import Input
In [ ]:
# Unused scaffolding for a model-wrapper class (this cell was never executed); it assumes
# self.model, self.preprocessing and self.epochs are defined on the wrapper.
def fit(self, X_train, y_train):
    self.model.fit(self.preprocessing(X_train), y_train, epochs=self.epochs, batch_size=512)

def guess(self, features):
    features = self.preprocessing(features)
    result = self.model.predict(features).flatten()
    return result
In [179]:
train_data_pad.shape
Out[179]:
In [182]:
train_X.shape
Out[182]:
In [220]:
from sklearn.preprocessing import StandardScaler
In [221]:
scaler = StandardScaler()
scaler.fit(train_X.values)
Out[221]:
In [222]:
scaled_train_X = scaler.transform(train_X)
scaled_test_X = scaler.transform(test_X)  # note: the Keras model below is actually fit on the unscaled features
In [223]:
def split_features(X):
    # Pack the two model inputs: the padded token sequences and the dense hand-crafted features.
    X_list = []
    x_0 = train_data_pad[..., :]
    X_list.append(x_0)
    x_1 = X[..., :]
    X_list.append(x_1)
    return X_list
In [230]:
# Branch 1: token sequences -> Embedding -> stacked LSTMs -> dense
inp_1 = Input(shape=(68,))
out_1 = Embedding(max_fatures, embed_dim, input_length=train_data_pad.shape[1])(inp_1)
s_drop_1 = SpatialDropout1D(0.2)(out_1)
lstm_1 = LSTM(lstm_out, return_sequences=True, dropout=0.2)(s_drop_1)
lstm_2 = LSTM(50, dropout=0.2)(lstm_1)
drop_2 = Dropout(drop_out)(lstm_2)
out_dense_1 = Dense(64, activation='relu')(drop_2)

# Branch 2: the 14 hand-crafted features -> small dense stack
inp_2 = Input(shape=(14,))
dense_2 = Dense(64, activation='relu')(inp_2)
out_dense_2 = Reshape(target_shape=(64,))(dense_2)
dense_3 = Dense(32, activation='relu')(out_dense_2)
out_dense_3 = Reshape(target_shape=(32,))(dense_3)

# Concatenate both branches and classify.
input_model = [inp_1, inp_2]
output_model = [out_dense_1, out_dense_3]
output = Concatenate()(output_model)
output = Dense(16, activation='relu')(output)
output = Dropout(drop_out)(output)
output = Dense(2, activation='softmax')(output)
model = KerasModel(inputs=input_model, outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
In [233]:
model.fit(split_features(train_X.values), train_labels, epochs=10, batch_size=batch_size, verbose=2)
Out[233]:
In [234]:
# Redefine split_features so that prediction uses the padded *test* sequences.
def split_features(X):
    X_list = []
    x_0 = test_data_pad[..., :]
    X_list.append(x_0)
    x_1 = X[..., :]
    X_list.append(x_1)
    return X_list
In [235]:
result = model.predict(split_features(test_X.values))
In [236]:
result = result[:, 1]  # probability of class 1
In [237]:
sample['label'] = result
sample.to_csv('_a.csv', index=False)
In [238]:
predictions_test = np.where(result>=0.41, 1,0)
sample['label'] = predictions_test
sample.to_csv('__a.csv', index=False)
In [239]:
from keras.utils import plot_model
plot_model(model, to_file='model.png')
In [ ]:
# model = Sequential()
# model.add(Embedding(max_fatures, embed_dim, input_length=train_data_pad.shape[1]))
# model.add(SpatialDropout1D(drop_out))
# # LSTMs
# model.add(LSTM(lstm_out,return_sequences=True, dropout = 0.2))
# model.add(LSTM(50, dropout=0.2))
# model.add(Dropout(drop_out))
# model.add(Dense(2, activation='softmax'))
# model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# print(model.summary())
In [166]:
# Note: judging by the cell execution numbers, this cell ran before the two-input model
# above was built, presumably against the single-input Sequential model commented out in
# the previous cell.
model.fit(train_data_pad, train_labels, epochs=10, batch_size=batch_size, verbose=2, validation_split=0.2)
Out[166]:
In [167]:
preds_1 = model.predict_proba(test_data_pad)[:,1]
In [154]:
# A simple LSTM with two dense layers
from keras.callbacks import EarlyStopping
model_2 = Sequential()
model_2.add(Embedding(max_fatures, embed_dim, input_length=train_data_pad.shape[1]))
model_2.add(SpatialDropout1D(0.3))
model_2.add(LSTM(300, dropout=0.3, recurrent_dropout=0.3))
model_2.add(Dense(512, activation='relu'))
model_2.add(Dropout(0.3))
model_2.add(Dense(256, activation='relu'))
model_2.add(Dropout(0.3))
model_2.add(Dense(2, activation='softmax'))
model_2.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
# Fit the model_2 with early stopping callback
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=2, mode='auto')
model_2.fit(train_data_pad, train_labels, epochs=10, batch_size=batch_size, verbose=2, validation_split=0.2,callbacks=[earlystop])
Out[154]:
In [155]:
model_2.summary()
In [156]:
preds_2 = model_2.predict_proba(test_data_pad)[:,1]
In [160]:
# A simple bidirectional LSTM with Embeddings and two dense layers
from keras.layers import Bidirectional
model_3 = Sequential()
model_3.add(Embedding(max_fatures, embed_dim, input_length=train_data_pad.shape[1]))
model_3.add(SpatialDropout1D(0.3))
model_3.add(Bidirectional(LSTM(300, dropout=0.3, recurrent_dropout=0.3)))
model_3.add(Dense(512, activation='relu'))
model_3.add(Dropout(0.3))
model_3.add(Dense(256, activation='relu'))
model_3.add(Dropout(0.3))
model_3.add(Dense(2, activation='softmax'))
model_3.compile(loss='categorical_crossentropy', optimizer='adam')
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=2, mode='auto')
model_3.fit(train_data_pad, train_labels, epochs=10, batch_size=batch_size, verbose=2, validation_split=0.2,callbacks=[earlystop])
Out[160]:
In [161]:
model_3.summary()
In [162]:
preds_3 = model_3.predict_proba(test_data_pad)[:,1]
In [163]:
predictions_test = np.where(preds_1>=0.41, 1,0)
sample['label'] = predictions_test
sample.to_csv('pred_lstm1.csv', index=False)
predictions_test = np.where(preds_2>=0.41, 1,0)
sample['label'] = predictions_test
sample.to_csv('pred_lstm2.csv', index=False)
predictions_test = np.where(preds_3>=0.41, 1,0)
sample['label'] = predictions_test
sample.to_csv('pred_lstm3.csv', index=False)
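In [ ]:
# Rough sketch (an alternative, not an original submission): a simple equal-weight blend
# of the three LSTM probability vectors before thresholding.
blend = (preds_1 + preds_2 + preds_3) / 3.0
# sample['label'] = np.where(blend >= 0.41, 1, 0)
# sample.to_csv('pred_lstm_blend.csv', index=False)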
In [168]:
predictions_test = np.where(preds_1>=0.41, 1,0)
sample['label'] = predictions_test
sample.to_csv('pred_lstmlstm1.csv', index=False)
In [ ]: