In [1]:
import pandas
import matplotlib.pyplot as plt
import numpy
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
numpy.random.seed(7)
data = pandas.read_csv("../data/interim/accelData.csv")
print(data.shape)
print(data[1:5]) # Samples are 60 ms apart
#plt.plot(data[['accelerationX','accelerationY','accelerationZ']][1:100])
#plt.show()
# We clean up the Activity and Social values
cleandata = data.copy() # work on a copy, to leave the original frame untouched
cleandata.loc[cleandata['Activity'].isnull(),'Activity'] = 'Other'
cleandata.loc[cleandata['Activity'] == 'OFF','Activity'] = 'Other'
cleandata.loc[cleandata['Activity'] == 'TEC','Activity'] = 'Other'
cleandata.loc[cleandata['Activity'] == 'TDT','Activity'] = 'Other'
cleandata.loc[cleandata['Social'].isnull(),'Social'] = 'Other'
#print numpy.unique(cleandata['Activity']), numpy.unique(cleandata['Social'])
#print cleandata.shape
cleandata = cleandata[cleandata.notnull().all(axis=1)]
#print 'Not null data'
#print cleandata.shape
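# Hold out two complete sessions for testing (rather than random rows), so the
# model is evaluated on sessions it has never seen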
train = cleandata[~cleandata['session'].isin(['case1-day1-session1-teacher1','case2-day3-session1-teacher2'])]
#print train.shape
test = cleandata[cleandata['session'].isin(['case1-day1-session1-teacher1','case2-day3-session1-teacher2'])]
#print test.shape
# We split our datasets into session+timestamps, X and Y
times_train = train.loc[:,['session','timestamp']]
times_test = test.loc[:,['session','timestamp']]
X_train = train.loc[:,['accelerationX','accelerationY','accelerationZ']].astype(float)
Y_train = train.loc[:,'Activity'] # 'Social' (column 8) would be the alternative target
X_test = test.loc[:,['accelerationX','accelerationY','accelerationZ']].astype(float)
Y_test = test.loc[:,'Activity']
# One hot encoding of the response variable (using dummy variables)
from keras.utils.np_utils import to_categorical
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y_train)
encoded_Y_train = encoder.transform(Y_train)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_train = to_categorical(encoded_Y_train)
# reuse the encoder fitted on the training labels, so class indices match across splits
encoded_Y_test = encoder.transform(Y_test)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_test = to_categorical(encoded_Y_test)
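# Quick illustration of the encoding step, with made-up labels (the real classes
# come from the 'Activity' column): LabelEncoder assigns integers in alphabetical
# order, and to_categorical turns those integers into one-hot rows
demo_encoder = LabelEncoder().fit(['walk', 'sit', 'Other'])
print(demo_encoder.transform(['Other', 'sit', 'walk'])) # [0 1 2]
print(to_categorical(demo_encoder.transform(['Other', 'sit', 'walk']))) # 3x3 identity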
# Sanity check on matrix dimensions, after dropping nulls/NaNs
print(times_train.shape)
print(X_train.shape)
print(Y_test.shape)
print(dummy_y_test.shape)
#print 'X before normalization'
#print X_train[1:5]
# normalize the dataset (fit the scaler on the training set only, to avoid leaking test data)
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
#print 'X after normalization'
#print X_train[1:5,:]
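# Optional sanity check (assumes scikit-learn >= 0.17 for these attributes):
#print(scaler.data_min_) # per-axis minima learned from the training set
#print(scaler.data_max_) # per-axis maxima learned from the training set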
In [2]:
# reshape input to be [samples, time steps, features]
trainX = numpy.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
testX = numpy.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
print(trainX.shape)
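# With time steps = 1, each accelerometer reading becomes its own one-step
# sequence; temporal context is carried by the stateful LSTM defined below,
# rather than by windowing. Quick check that the reshape preserves the values:
print(numpy.array_equal(trainX[0, 0, :], X_train[0, :])) # should print True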
In [11]:
from keras.layers import Dropout
def create_LSTM2Acc(batch_size=1, trainShape1=100):
    # create the LSTM network
    model = Sequential()
    # stateful LSTM!
    #model.add(LSTM(200, batch_input_shape=(batch_size, 1, trainShape1),
    #               return_sequences=True, stateful=True))
    #model.add(Dropout(0.2))
    model.add(LSTM(100, batch_input_shape=(batch_size, 1, trainShape1),
                   return_sequences=True, stateful=True))
    model.add(Dropout(0.2))
    model.add(LSTM(50, return_sequences=False, stateful=True))
    model.add(Dropout(0.2))
    model.add(Dense(50, activation='tanh'))
    model.add(Dropout(0.2))
    #model.add(Dense(20, activation='tanh'))
    #model.add(Dropout(0.2))
    model.add(Dense(5, activation='softmax')) # one output per activity class
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
# Build and inspect the stateful LSTM model
numpy.random.seed(66)
batch_size = 1
nb_epochs = 3
print(trainX.shape[2])
model = create_LSTM2Acc(batch_size=batch_size, trainShape1=trainX.shape[2])
print(model.summary())
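# Note: because the network is stateful and batch_input_shape is fixed, the
# hidden state is carried across consecutive samples; predictions must use the
# same batch_size=1, and the rows must stay in chronological order.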
In [12]:
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
# To save the best model
# serialize model to JSON
model_json = model.to_json()
with open("acc.model--2lstm.json", "w") as json_file:
json_file.write(model_json)
filepath="acc.weights--2lstm.best.hdf5"
# Monitor the validation accuracy, and store the weights whenever a new maximum is reached
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]
def printValStats(model, testX, dummy_y_test):
    # Other performance/accuracy metrics
    Y_pred = model.predict(testX, batch_size=batch_size)
    model.reset_states()
    print('Performance of model on test set ----------------------------')
    # Accuracy
    print('Accuracy:')
    print(accuracy_score(numpy.argmax(dummy_y_test, axis=1), numpy.argmax(Y_pred, axis=1)))
    # Confusion matrix
    cm = confusion_matrix(numpy.argmax(dummy_y_test, axis=1), numpy.argmax(Y_pred, axis=1))
    numpy.set_printoptions(precision=2)
    print('Confusion matrix:')
    print(cm)
    # AUC
    roc = roc_auc_score(dummy_y_test, Y_pred, average='macro')
    print('AUC score:')
    print(roc)
# Fit the model
accs = []
val_accs = []
losss = []
val_losss = []
# Manually run one epoch at a time, resetting the LSTM state between epochs
for i in range(nb_epochs):
    # Single epoch. Remember not to shuffle the data (the samples are ordered in time)!
    print('Epoch', i+1, '/', nb_epochs)
    #print trainX[0:5,:,:]
    #print dummy_y_train[0:5,:]
    history = model.fit(trainX, dummy_y_train, validation_data=(testX, dummy_y_test),
                        nb_epoch=1, batch_size=batch_size, shuffle=False,
                        verbose=1, callbacks=callbacks_list)
    accs.append(history.history['acc'][0])
    val_accs.append(history.history['val_acc'][0])
    losss.append(history.history['loss'][0])
    val_losss.append(history.history['val_loss'][0])
    model.reset_states()
printValStats(model, testX, dummy_y_test)
import operator
# Find the epoch with the highest validation accuracy
index, value = max(enumerate(val_accs), key=operator.itemgetter(1))
print('Best epoch (0-based):', index, '- validation accuracy:', value)
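# Sketch: reload the architecture and the best checkpointed weights (filenames
# as defined above), so later evaluation uses the best-validation-epoch model
from keras.models import model_from_json
with open("acc.model--2lstm.json") as json_file:
    best_model = model_from_json(json_file.read())
best_model.load_weights(filepath)
best_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])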
# summarize history for accuracy
plt.plot(accs)
plt.plot(val_accs)
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(losss)
plt.plot(val_losss)
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()
In [ ]:
# Import the random forest package
from sklearn.ensemble import RandomForestClassifier
# Create the random forest object, with 100 trees
forest = RandomForestClassifier(n_estimators=100)
# Fit the scaled training features to the integer-encoded Activity labels
forest = forest.fit(X_train, encoded_Y_train)
# Evaluate the same trees on the held-out test sessions
print('Accuracy on test data (RAW):', forest.score(X_test, encoded_Y_test))
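# For a rough comparison with the LSTM above, the same confusion matrix can be
# computed for the forest (confusion_matrix was imported in an earlier cell):
print('RF confusion matrix:')
print(confusion_matrix(encoded_Y_test, forest.predict(X_test)))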