In [2]:
import numpy
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# fix random seed for reproducibility
seed = 66
numpy.random.seed(seed)
data = pandas.read_csv("../data/processed/train.csv")
notnull_data = data[data.notnull().all(axis=1)]
train = notnull_data.values
data2 = pandas.read_csv("../data/processed/test.csv")
notnull_data2 = data2[data2.notnull().all(axis=1)]
test = notnull_data2.values
In [3]:
X_train = train[:,3:7558].astype(float)
#X_train = train[:,3:13].astype(float)
Y_train = train[:,7558]
X_test = test[:,3:7558].astype(float)
#X_test = test[:,3:13].astype(float)
Y_test = test[:,7558]
# One hot encoding of the response variable (using dummy variables)
from keras.utils.np_utils import to_categorical
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y_train)
encoded_Y_train = encoder.transform(Y_train)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_train = to_categorical(encoded_Y_train)
# Note: refitting the encoder on the test labels assumes both splits contain the same set of
# classes; otherwise the integer codes would not line up and the train-fitted encoder should
# be reused (encoder.transform(Y_test) without a second fit).
encoder.fit(Y_test)
encoded_Y_test = encoder.transform(Y_test)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_test = to_categorical(encoded_Y_test)
# Sanity check on matrix dimensions, after dropping null/nans
#print X_train.shape #(4472, 7555)
#print Y_test.shape #(1044, )
#print dummy_y_test.shape # (1044, 5)
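# Illustrative aside (toy data, not part of the pipeline): LabelEncoder maps each class label
# to an integer code, and to_categorical turns those codes into one-hot rows.
toy_codes = LabelEncoder().fit_transform(['a', 'c', 'b', 'a'])
print toy_codes                  # [0 2 1 0]
print to_categorical(toy_codes)  # 4 rows x 3 columns, one-hot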
In [4]:
# We standardize on the basis of the training data
scaler = StandardScaler().fit(X_train)
X_train_st = scaler.transform(X_train)
X_test_st = scaler.transform(X_test)
# We do PCA, too
from sklearn import decomposition
n_comp = 100
pca = decomposition.PCA(n_components=n_comp)
X_train_pca = pca.fit_transform(X_train_st)
X_test_pca = pca.transform(X_test_st)
print 'Variance explained:'
print pca.explained_variance_ratio_
print 'Total variance explained by '+str(n_comp)+' components:'
print sum(pca.explained_variance_ratio_)
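# Optional sanity check (illustrative only, using the PCA object fitted above): the cumulative
# explained-variance curve helps judge whether n_comp = 100 keeps enough of the signal
# (the count below is only meaningful if 90% is reached within the retained components).
cumulative_variance = numpy.cumsum(pca.explained_variance_ratio_)
print 'Smallest number of retained components covering 90% of the variance:'
print numpy.searchsorted(cumulative_variance, 0.90) + 1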
In [39]:
# We create the look-back dataset
#print train[:,1:3] # session is [1], timestamp is [2]
# convert an array of values into a dataset matrix, depending on the nr. of steps to look back
def create_dataset(sessiontimes, X, Y, look_back=1):
dataX, dataY = [], []
if((sessiontimes.shape[0] != X.shape[0]) or
(sessiontimes.shape[0] != Y.shape[0])):
# Different number of rows, something's wrong!
print 'Dimensions of the X, Y and timestamps do not match!!'
return None
sessions = numpy.unique(sessiontimes[:,0])
for session in sessions:
print session
stimes = sessiontimes[numpy.where(sessiontimes[:,0] == session)]
sessionX = X[numpy.where(sessiontimes[:,0] == session)]
sessionY = Y[numpy.where(sessiontimes[:,0] == session)]
sessiondataX, sessiondataY = [], []
# For Y, we just eliminate the first look_back rows
sessiondataY = sessionY[look_back:sessionY.shape[0],:]
# For X, we successively roll and append the data, then eliminate the first look_back rows
for i in range(look_back+1):
rolled = numpy.roll(sessionX,i,axis=0)
if(i == 0):
sessiondataX = numpy.array(rolled)
else:
sessiondataX = numpy.hstack((sessiondataX, numpy.array(rolled)))
sessiondataX = sessiondataX[look_back:(sessiondataX.shape[0]),:]
# We join all the sessions data
if len(dataX)==0:
dataX = sessiondataX
dataY = sessiondataY
else:
dataX = numpy.vstack((dataX, sessiondataX))
dataY = numpy.vstack((dataY, sessiondataY))
return dataX, dataY
# Build the look-back dataset: X = features at t, t-1, ..., t-look_back; Y = label at t
lookbk = 9
X_train_lb, Y_train_lb = create_dataset(train[:,1:3], X_train_pca, dummy_y_train, lookbk)
X_test_lb, Y_test_lb = create_dataset(test[:,1:3], X_test_pca, dummy_y_test, lookbk)
print X_train_lb.shape, Y_train_lb.shape, X_test_lb.shape, Y_test_lb.shape
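# Tiny worked example (toy data, illustrative only) of what create_dataset builds: one session
# with 5 rows and 2 features, look_back=2. Each output row is the current row concatenated with
# its 2 predecessors, and the first 2 rows of the session (incomplete history) are dropped.
# Note the function assumes rows within a session are already in chronological order.
toy_times = numpy.array([['s1', 0], ['s1', 1], ['s1', 2], ['s1', 3], ['s1', 4]], dtype=object)
toy_X = numpy.arange(10).reshape(5, 2)
toy_Y = numpy.eye(5)
toy_X_lb, toy_Y_lb = create_dataset(toy_times, toy_X, toy_Y, look_back=2)
print toy_X_lb.shape, toy_Y_lb.shape   # expected (3, 6) and (3, 5)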
In [40]:
from keras.layers import Dropout
from keras.constraints import maxnorm
from keras.optimizers import SGD
# baseline model
def create_baseline():
# create model
model = Sequential()
model.add(Dense(200, input_dim=7555, init='uniform', activation='tanh', W_constraint=maxnorm(4)))
model.add(Dense(20, init='uniform', activation='tanh', W_constraint=maxnorm(4)))
model.add(Dense(5, init='uniform', activation='sigmoid'))
# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
return model
# Apply dropout regularization, since the baseline model is overfitting
def create_dropout():
# create model
model = Sequential()
model.add(Dropout(0.2, input_shape=(7555,)))
model.add(Dense(200, init='uniform', activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(20, init='uniform', activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(5, init='uniform', activation='sigmoid'))
# Compile the model with a larger learning rate and momentum, as recommended by the original dropout paper
sgd = SGD(lr=0.01, momentum=0.9, decay=0.0, nesterov=False)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
return model
# Dropout again, but with a larger learning rate and time-based learning-rate decay
def create_dropout_decay():
# create model
model = Sequential()
model.add(Dropout(0.2, input_shape=(7555,)))
model.add(Dense(200, init='uniform', activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(20, init='uniform', activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(5, init='uniform', activation='sigmoid'))
# Compile the model with a larger learning rate and momentum, as recommended by the original dropout paper
sgd = SGD(lr=0.1, momentum=0.9, decay=0.005, nesterov=False)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
return model
# A deeper dropout network for the PCA + look-back features, with learning-rate decay
def create_deeper_dropout_decay_PCA(n_components, lookback, learningrate, n_epochs):
# create model
model = Sequential()
model.add(Dropout(0.2, input_shape=(n_components*(lookback+1),)))
model.add(Dense(300, init='uniform', activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(300, init='uniform', activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(80, init='uniform', activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(80, init='uniform', activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(20, init='uniform', activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(20, init='uniform', activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(5, init='uniform', activation='sigmoid'))
# Compile the model with a larger learning rate and momentum, as recommended by the original dropout paper
sgd = SGD(lr=learningrate, momentum=0.8, decay=learningrate/n_epochs, nesterov=False)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
return model
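# Illustrative aside: with time-based decay the SGD learning rate shrinks at every update,
# roughly lr / (1 + decay * n_updates) (counted per batch update, to the best of my reading of
# the Keras SGD implementation), so decay = learningrate / n_epochs slows training down gently.
# The helper below is only for inspection and is not used by the model.
def effective_lr(base_lr, decay, n_updates):
    return base_lr / (1.0 + decay * n_updates)
print effective_lr(0.1, 0.1 / 1000, 5000)   # e.g. the rate after 5000 batch updates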
# Build and inspect the deeper dropout model on the PCA + look-back features
# (the earlier baseline/pipeline variants are kept commented out below for reference)
numpy.random.seed(seed)
#estimators = []
#estimators.append(('standardize', StandardScaler()))
#estimators.append(('mlp', KerasClassifier(build_fn=create_baseline, nb_epoch=10, batch_size=10, verbose=1)))
# We define a pipeline of estimators, in which first the scaler is fitted to the data, then the MLP is applied
#pipeline = Pipeline(estimators)
#kfold = StratifiedKFold(y=Y_train, n_folds=3, shuffle=True, random_state=seed)
#model = create_baseline()
learningrate = 0.1
n_epochs = 1000
model = create_deeper_dropout_decay_PCA(n_components=n_comp, lookback=lookbk, learningrate=learningrate, n_epochs=n_epochs)
print model.summary()
In [41]:
from keras.callbacks import ModelCheckpoint
# To save the best model
# serialize model to JSON
model_json = model.to_json()
with open("model--7mlp--lb.json", "w") as json_file:
json_file.write(model_json)
filepath="weights--7mlp--lb.best.hdf5"
# Monitor validation accuracy and save the weights whenever it reaches a new maximum
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]
# Fit the model
history = model.fit(X_train_lb, Y_train_lb, validation_data=(X_test_lb,Y_test_lb),
nb_epoch=n_epochs, batch_size=10, verbose=0, callbacks=callbacks_list)
#results = cross_val_score(pipeline, X_train, dummy_y_train, cv=kfold)
#print("Standardized data Acc (in CV training data): %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
# evaluate the model
#scores = pipeline.evaluate(X_test, dummy_y_test)
#print pipeline.metrics_names[1]
#print scores[1]*100
# For other metrics, see http://machinelearningmastery.com/metrics-evaluate-machine-learning-algorithms-python/
In [42]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
# TODO: Reload the best model and weights from the previous step, before showing the evaluation
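# Hedged sketch for the TODO above (left commented out so the results below still come from the
# model in memory); it assumes the JSON/weights files written by the checkpoint in the previous cell:
# from keras.models import model_from_json
# best_model = model_from_json(open("model--7mlp--lb.json").read())
# best_model.load_weights("weights--7mlp--lb.best.hdf5")
# best_model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
# Y_pred = best_model.predict(X_test_lb)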
# Other performance/accuracy metrics
Y_pred = model.predict(X_test_lb)
print Y_pred.shape
# Accuracy
print('Accuracy:')
print(accuracy_score(numpy.argmax(Y_test_lb, axis=1), numpy.argmax(Y_pred, axis=1)))
# Confusion matrix
cm = confusion_matrix(numpy.argmax(Y_test_lb, axis=1), numpy.argmax(Y_pred, axis=1))
numpy.set_printoptions(precision=2)
print('Confusion matrix:')
print(cm)
# AUC
roc = roc_auc_score(Y_test_lb, Y_pred, average='macro')
print('AUC score:')
print(roc)
# summarize history for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()
So the look-back MLP (trained on the 100-component PCA features) works a bit better, beating the raw random forest, though not by much.
In [45]:
X_train = train[:,3:7558].astype(float)
#X_train = train[:,3:13].astype(float)
Y_train = train[:,7559]
X_test = test[:,3:7558].astype(float)
#X_test = test[:,3:13].astype(float)
Y_test = test[:,7559]
# One hot encoding of the response variable (using dummy variables)
from keras.utils.np_utils import to_categorical
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y_train)
encoded_Y_train = encoder.transform(Y_train)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_train = to_categorical(encoded_Y_train)
# Note: refitting the encoder on the test labels assumes both splits contain the same set of
# classes; otherwise the train-fitted encoder should be reused (encoder.transform(Y_test)).
encoder.fit(Y_test)
encoded_Y_test = encoder.transform(Y_test)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_test = to_categorical(encoded_Y_test)
# Sanity check on matrix dimensions, after dropping null/nans
#print X_train.shape #(4472, 7555)
#print Y_test.shape #(1044, )
#print dummy_y_test.shape # (1044, 5)
In [46]:
# We standardize on the basis of the training data
scaler = StandardScaler().fit(X_train)
X_train_st = scaler.transform(X_train)
X_test_st = scaler.transform(X_test)
# We do PCA, too
from sklearn import decomposition
n_comp = 100
pca = decomposition.PCA(n_components=n_comp)
X_train_pca = pca.fit_transform(X_train_st)
X_test_pca = pca.transform(X_test_st)
print 'Variance explained:'
print pca.explained_variance_ratio_
print 'Total variance explained by '+str(n_comp)+' components:'
print sum(pca.explained_variance_ratio_)
In [47]:
# We create the look-back dataset
#print train[:,1:3] # session is [1], timestamp is [2]
# convert an array of values into a dataset matrix, depending on the nr. of steps to look back
def create_dataset(sessiontimes, X, Y, look_back=1):
dataX, dataY = [], []
if((sessiontimes.shape[0] != X.shape[0]) or
(sessiontimes.shape[0] != Y.shape[0])):
# Different number of rows, something's wrong!
print 'Dimensions of the X, Y and timestamps do not match!!'
return None
sessions = numpy.unique(sessiontimes[:,0])
for session in sessions:
print session
stimes = sessiontimes[numpy.where(sessiontimes[:,0] == session)]
sessionX = X[numpy.where(sessiontimes[:,0] == session)]
sessionY = Y[numpy.where(sessiontimes[:,0] == session)]
sessiondataX, sessiondataY = [], []
# For Y, we just eliminate the first look_back rows
sessiondataY = sessionY[look_back:sessionY.shape[0],:]
# For X, we successively roll and append the data, then eliminate the first look_back rows
for i in range(look_back+1):
rolled = numpy.roll(sessionX,i,axis=0)
if(i == 0):
sessiondataX = numpy.array(rolled)
else:
sessiondataX = numpy.hstack((sessiondataX, numpy.array(rolled)))
sessiondataX = sessiondataX[look_back:(sessiondataX.shape[0]),:]
# We join all the sessions data
if len(dataX)==0:
dataX = sessiondataX
dataY = sessiondataY
else:
dataX = numpy.vstack((dataX, sessiondataX))
dataY = numpy.vstack((dataY, sessiondataY))
return dataX, dataY
# Build the look-back dataset: X = features at t, t-1, ..., t-look_back; Y = label at t
lookbk = 9
X_train_lb, Y_train_lb = create_dataset(train[:,1:3], X_train_pca, dummy_y_train, lookbk)
X_test_lb, Y_test_lb = create_dataset(test[:,1:3], X_test_pca, dummy_y_test, lookbk)
print X_train_lb.shape, Y_train_lb.shape, X_test_lb.shape, Y_test_lb.shape
In [50]:
from keras.layers import Dropout
from keras.constraints import maxnorm
from keras.optimizers import SGD
# A deeper dropout network for the PCA + look-back features, with learning-rate decay
def create_deeper_dropout_decay_PCA(n_components, lookback, learningrate, n_epochs):
# create model
model = Sequential()
model.add(Dropout(0.2, input_shape=(n_components*(lookback+1),)))
model.add(Dense(300, init='uniform', activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(300, init='uniform', activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(80, init='uniform', activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(80, init='uniform', activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(20, init='uniform', activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(20, init='uniform', activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(4, init='uniform', activation='sigmoid'))
# Compile the model with a larger learning rate and momentum, as recommended by the original dropout paper
sgd = SGD(lr=learningrate, momentum=0.8, decay=learningrate/n_epochs, nesterov=False)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
return model
# Build and inspect the deeper dropout model on the PCA + look-back features
# (the earlier baseline/pipeline variants are kept commented out below for reference)
numpy.random.seed(seed)
#estimators = []
#estimators.append(('standardize', StandardScaler()))
#estimators.append(('mlp', KerasClassifier(build_fn=create_baseline, nb_epoch=10, batch_size=10, verbose=1)))
# We define a pipeline of estimators, in which first the scaler is fitted to the data, then the MLP is applied
#pipeline = Pipeline(estimators)
#kfold = StratifiedKFold(y=Y_train, n_folds=3, shuffle=True, random_state=seed)
#model = create_baseline()
learningrate = 0.1
n_epochs = 20
model = create_deeper_dropout_decay_PCA(n_components=n_comp, lookback=lookbk, learningrate=learningrate, n_epochs=n_epochs)
print model.summary()
In [51]:
from keras.callbacks import ModelCheckpoint
# To save the best model
# serialize model to JSON
model_json = model.to_json()
with open("social.model--7mlp--lb.json", "w") as json_file:
json_file.write(model_json)
filepath="social.weights--7mlp--lb.best.hdf5"
# Monitor validation accuracy and save the weights whenever it reaches a new maximum
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]
# Fit the model
history = model.fit(X_train_lb, Y_train_lb, validation_data=(X_test_lb,Y_test_lb),
nb_epoch=n_epochs, batch_size=10, verbose=0, callbacks=callbacks_list)
#results = cross_val_score(pipeline, X_train, dummy_y_train, cv=kfold)
#print("Standardized data Acc (in CV training data): %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
# evaluate the model
#scores = pipeline.evaluate(X_test, dummy_y_test)
#print pipeline.metrics_names[1]
#print scores[1]*100
# For other metrics, see http://machinelearningmastery.com/metrics-evaluate-machine-learning-algorithms-python/
In [52]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
# TODO: Reload the best model and weights from the previous step, before showing the evaluation
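# Hedged sketch for the TODO above, mirroring the earlier reload snippet (left commented out);
# it assumes the social.* files written by the checkpoint in the previous cell:
# from keras.models import model_from_json
# best_model = model_from_json(open("social.model--7mlp--lb.json").read())
# best_model.load_weights("social.weights--7mlp--lb.best.hdf5")
# Y_pred = best_model.predict(X_test_lb)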
# Other performance/accuracy metrics
Y_pred = model.predict(X_test_lb)
print Y_pred.shape
# Accuracy
print('Accuracy:')
print(accuracy_score(numpy.argmax(Y_test_lb, axis=1), numpy.argmax(Y_pred, axis=1)))
# Confusion matrix
cm = confusion_matrix(numpy.argmax(Y_test_lb, axis=1), numpy.argmax(Y_pred, axis=1))
numpy.set_printoptions(precision=2)
print('Confusion matrix:')
print(cm)
# AUC
roc = roc_auc_score(Y_test_lb, Y_pred, average='macro')
print('AUC score:')
print(roc)
# summarize history for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()