In [1]:
import pandas
import matplotlib.pyplot as plt
import numpy
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder


numpy.random.seed(7)

data = pandas.read_csv("../data/interim/accelData.csv")
print data.shape
print data[1:5] # consecutive accelerometer readings are ~60ms apart
#plt.plot(data[['accelerationX','accelerationY','accelerationZ']][1:100])
#plt.show()

# We clean up the Activity and Social values
cleandata = data.copy()
cleandata.loc[cleandata['Activity'].isnull(),'Activity'] = 'Other'
cleandata.loc[cleandata['Activity'] == 'OFF','Activity'] = 'Other'
cleandata.loc[cleandata['Activity'] == 'TEC','Activity'] = 'Other'
cleandata.loc[cleandata['Activity'] == 'TDT','Activity'] = 'Other'
cleandata.loc[cleandata['Social'].isnull(),'Social'] = 'Other'
#print numpy.unique(cleandata['Activity']), numpy.unique(cleandata['Social'])
#print cleandata.shape

cleandata = cleandata[cleandata.notnull().all(axis=1)]
#print 'Not null data'
#print cleandata.shape

train = cleandata[~cleandata['session'].isin(['case1-day1-session1-teacher1','case2-day3-session1-teacher2'])]
#print train.shape
test = cleandata[cleandata['session'].isin(['case1-day1-session1-teacher1','case2-day3-session1-teacher2'])]
#print test.shape

# We split our datasets into session+timestamps, X and Y
times_train = train.loc[:,['session','timestamp']]
times_test = test.loc[:,['session','timestamp']]

X_train = train.loc[:,['accelerationX','accelerationY','accelerationZ']].astype(float)
Y_train = train.loc[:,'Activity'] # target variable; the 'Social' column (index 8) is the alternative target

X_test = test.loc[:,['accelerationX','accelerationY','accelerationZ']].astype(float)
Y_test = test.loc[:,'Activity']

# One hot encoding of the response variable (using dummy variables)
from keras.utils.np_utils import to_categorical

# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y_train)
encoded_Y_train = encoder.transform(Y_train)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_train = to_categorical(encoded_Y_train)
# reuse the encoder fitted on the training labels so the class indices stay consistent
encoded_Y_test = encoder.transform(Y_test)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_test = to_categorical(encoded_Y_test)

# Sanity check on matrix dimensions, after dropping null/NaN rows
print times_train.shape
print X_train.shape
print Y_test.shape
print dummy_y_test.shape

#print 'X before normalization'
#print X_train[1:5]
# scale the features to the [0, 1] range; the scaler is fitted on the training data only
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
#print 'X after normalization'
#print X_train[1:5,:]


Using Theano backend.
(505693, 9)
   Unnamed: 0  accelerationX  accelerationY  accelerationZ  timestamp  \
1         621         -0.306          6.704          7.834       60.0   
2          63         -0.383          6.771          6.799      121.0   
3         642         -0.162          6.416          7.010      183.0   
4         655         -0.249          6.445          6.962      241.0   

                        session  timestamp.orig Activity Social  
1  case1-day1-session1-teacher1   1433229445693      NaN    NaN  
2  case1-day1-session1-teacher1   1433229445754      TDT    CLS  
3  case1-day1-session1-teacher1   1433229445816      TDT    CLS  
4  case1-day1-session1-teacher1   1433229445874      TDT    CLS  
(411852, 2)
(411852, 3)
(93841,)
(93841, 5)
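
A quick sanity check of the label encoding: the one-hot columns follow encoder.classes_ (LabelEncoder sorts the labels alphabetically), so it is worth printing the mapping once to know which activity each column of dummy_y_train refers to.

In [ ]:
# The i-th one-hot column corresponds to encoder.classes_[i]
print list(encoder.classes_)
print dict(zip(range(len(encoder.classes_)), encoder.classes_))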

In [2]:
# reshape input to be [samples, time steps, features]
trainX = numpy.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
testX = numpy.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))
print trainX.shape


(411852, 1, 3)
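
Each sample here is a single reading (time-step dimension of 1), so any memory of earlier readings has to come from the stateful LSTM defined below. If explicit look-back windows were wanted instead, the reshape could be replaced by a sliding-window construction along the lines of this sketch (make_windows and look_back are illustrative names, not used elsewhere in the notebook):

In [ ]:
def make_windows(X, y, look_back=10):
    # Build overlapping windows of `look_back` consecutive readings,
    # each labelled with the class of its last reading
    Xs, ys = [], []
    for i in range(len(X) - look_back + 1):
        Xs.append(X[i:i + look_back, :])
        ys.append(y[i + look_back - 1])
    return numpy.array(Xs), numpy.array(ys)

#windowedX, windowed_y = make_windows(X_train, dummy_y_train, look_back=10)
#print windowedX.shape  # would be (samples, look_back, 3)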

In [11]:
from keras.layers import Dropout
from keras.layers import LSTM
from keras.constraints import maxnorm
from keras.optimizers import SGD

def create_LSTM2Acc(batch_size=1, trainShape1=100):
    # build the stateful LSTM network (fitting happens later, in its own cell)
    model = Sequential()
    # stateful LSTM: the batch size must be fixed via batch_input_shape
    #model.add(LSTM(200, batch_input_shape=(batch_size, 1, trainShape1), 
    #               return_sequences=True, stateful=True))
    #model.add(Dropout(0.2))
    model.add(LSTM(100, batch_input_shape=(batch_size, 1, trainShape1), 
                   return_sequences=True, stateful=True))
    model.add(Dropout(0.2))
    model.add(LSTM(50, 
                   return_sequences=False, stateful=True))
    model.add(Dropout(0.2))
    model.add(Dense(50, activation='tanh'))
    model.add(Dropout(0.2))
    #model.add(Dense(20, activation='tanh'))
    #model.add(Dropout(0.2))
    model.add(Dense(5, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Build the LSTM model (the commented-out lines below are the earlier baseline/pipeline setup)
numpy.random.seed(66)
#estimators = []
#estimators.append(('standardize', StandardScaler()))
#estimators.append(('mlp', KerasClassifier(build_fn=create_baseline, nb_epoch=10, batch_size=10, verbose=1)))
# We define a pipeline of estimators, in which first the scaler is fitted to the data, then the MLP is applied
#pipeline = Pipeline(estimators)
#kfold = StratifiedKFold(y=Y_train, n_folds=3, shuffle=True, random_state=seed)

batch_size = 1
nb_epochs = 3
print trainX.shape[2]

#model = create_baseline()
model = create_LSTM2Acc(batch_size=batch_size, trainShape1=trainX.shape[2])
print model.summary()


3
____________________________________________________________________________________________________
Layer (type)                       Output Shape        Param #     Connected to                     
====================================================================================================
lstm_15 (LSTM)                     (1, 1, 100)         41600       lstm_input_6[0][0]               
____________________________________________________________________________________________________
dropout_24 (Dropout)               (1, 1, 100)         0           lstm_15[0][0]                    
____________________________________________________________________________________________________
lstm_16 (LSTM)                     (1, 50)             30200       dropout_24[0][0]                 
____________________________________________________________________________________________________
dropout_25 (Dropout)               (1, 50)             0           lstm_16[0][0]                    
____________________________________________________________________________________________________
dense_15 (Dense)                   (1, 50)             2550        dropout_25[0][0]                 
____________________________________________________________________________________________________
dropout_26 (Dropout)               (1, 50)             0           dense_15[0][0]                   
____________________________________________________________________________________________________
dense_16 (Dense)                   (1, 5)              255         dropout_26[0][0]                 
====================================================================================================
Total params: 74605
____________________________________________________________________________________________________
None
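
As a sanity check, the parameter counts in the summary match the usual LSTM formula 4*(n_inputs*n_units + n_units*n_units + n_units) plus the two dense layers:

In [ ]:
lstm1 = 4 * (3 * 100 + 100 * 100 + 100)   # 41600
lstm2 = 4 * (100 * 50 + 50 * 50 + 50)     # 30200
dense1 = 50 * 50 + 50                     # 2550
dense2 = 50 * 5 + 5                       # 255
print lstm1 + lstm2 + dense1 + dense2     # 74605, as reported above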

In [12]:
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score

# To save the best model
# serialize model to JSON
model_json = model.to_json()
with open("acc.model--2lstm.json", "w") as json_file:
    json_file.write(model_json)
filepath="acc.weights--2lstm.best.hdf5"
# Monitor the validation accuracy and save the weights to file whenever a new maximum is reached
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]


def printValStats(model, testX, dummy_y_test):
    # Other performance/accuracy metrics
    Y_pred = model.predict(testX, batch_size=batch_size)
    model.reset_states()
    print 'Performance of model on test set ----------------------------'
    # Accuracy
    print('Accuracy:')
    print(accuracy_score(numpy.argmax(dummy_y_test, axis=1), numpy.argmax(Y_pred, axis=1)))
    # Confusion matrix
    cm = confusion_matrix(numpy.argmax(dummy_y_test, axis=1), numpy.argmax(Y_pred, axis=1))
    numpy.set_printoptions(precision=2)
    print('Confusion matrix:')
    print(cm)
    # AUC
    roc = roc_auc_score(dummy_y_test, Y_pred, average='macro')
    print('AUC score:')
    print(roc)


# Fit the model

accs = []
val_accs = []
losss = []
val_losss = []

# Run the epochs manually so the LSTM state can be reset between epochs
for i in range(nb_epochs):
    # Single epoch. Remember to not shuffle the data!
    print('Epoch', i+1, '/', nb_epochs)
    #print trainX[0:5,:,:]
    #print dummy_y_train[0:5,:]
    history = model.fit(trainX, dummy_y_train, validation_data=(testX,dummy_y_test), 
                        nb_epoch=1, batch_size=batch_size, shuffle=False, 
                        verbose=1, callbacks=callbacks_list)
    accs.append(history.history['acc'][0])
    val_accs.append(history.history['val_acc'][0])
    losss.append(history.history['loss'][0])
    val_losss.append(history.history['val_loss'][0])

    model.reset_states()
    printValStats(model, testX, dummy_y_test)

# Epoch (0-based index) with the best validation accuracy
import operator
index, value = max(enumerate(val_accs), key=operator.itemgetter(1))
print index, value

# summarize history for accuracy
plt.plot(accs)
plt.plot(val_accs)
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(losss)
plt.plot(val_losss)
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()


('Epoch', 1, '/', 3)
Train on 411852 samples, validate on 93841 samples
Epoch 1/1
411851/411852 [============================>.] - ETA: 0s - loss: 0.2801 - acc: 0.9534Epoch 00000: val_acc improved from -inf to 0.41287, saving model to acc.weights--2lstm.best.hdf5
411852/411852 [==============================] - 10243s - loss: 0.2801 - acc: 0.9534 - val_loss: 7.2106 - val_acc: 0.4129
Performance of model on test set ----------------------------
Accuracy:
0.412868575569
Confusion matrix:
[[    0     0 16008     0     0]
 [    0     0 24517     0     0]
 [    0     0 38744     0     0]
 [    0     0  7769     0     0]
 [    0     0  6803     0     0]]
AUC score:
0.499916350484
('Epoch', 2, '/', 3)
Train on 411852 samples, validate on 93841 samples
Epoch 1/1
182230/411852 [============>.................] - ETA: 6254s - loss: 0.3251 - acc: 0.9531
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-12-da51cd5256be> in <module>()
     48     history = model.fit(trainX, dummy_y_train, validation_data=(testX,dummy_y_test), 
     49                         nb_epoch=1, batch_size=batch_size, shuffle=False,
---> 50                         verbose=1, callbacks=callbacks_list)
     51     accs.append(history.history['acc'][0])
     52     val_accs.append(history.history['val_acc'][0])

/usr/local/lib/python2.7/dist-packages/keras/models.pyc in fit(self, x, y, batch_size, nb_epoch, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, **kwargs)
    407                               shuffle=shuffle,
    408                               class_weight=class_weight,
--> 409                               sample_weight=sample_weight)
    410 
    411     def evaluate(self, x, y, batch_size=32, verbose=1,

/usr/local/lib/python2.7/dist-packages/keras/engine/training.pyc in fit(self, x, y, batch_size, nb_epoch, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight)
   1050                               verbose=verbose, callbacks=callbacks,
   1051                               val_f=val_f, val_ins=val_ins, shuffle=shuffle,
-> 1052                               callback_metrics=callback_metrics)
   1053 
   1054     def evaluate(self, x, y, batch_size=32, verbose=1, sample_weight=None):

/usr/local/lib/python2.7/dist-packages/keras/engine/training.pyc in _fit_loop(self, f, ins, out_labels, batch_size, nb_epoch, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics)
    788                 batch_logs['size'] = len(batch_ids)
    789                 callbacks.on_batch_begin(batch_index, batch_logs)
--> 790                 outs = f(ins_batch)
    791                 if type(outs) != list:
    792                     outs = [outs]

/usr/local/lib/python2.7/dist-packages/keras/backend/theano_backend.pyc in __call__(self, inputs)
    516     def __call__(self, inputs):
    517         assert type(inputs) in {list, tuple}
--> 518         return self.function(*inputs)
    519 
    520 

/usr/local/lib/python2.7/dist-packages/theano/compile/function_module.pyc in __call__(self, *args, **kwargs)
    857         t0_fn = time.time()
    858         try:
--> 859             outputs = self.fn()
    860         except Exception:
    861             if hasattr(self.fn, 'position_of_error'):

KeyboardInterrupt: 
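
Because the architecture was saved to acc.model--2lstm.json and the checkpoint callback keeps the weights of the best validation epoch in acc.weights--2lstm.best.hdf5, the best model so far can be restored and re-evaluated even after an interrupted run, roughly as in this sketch (model_from_json comes from keras.models):

In [ ]:
from keras.models import model_from_json

# Rebuild the architecture from the saved JSON and load the checkpointed weights
with open("acc.model--2lstm.json") as json_file:
    best_model = model_from_json(json_file.read())
best_model.load_weights("acc.weights--2lstm.best.hdf5")
best_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
printValStats(best_model, testX, dummy_y_test)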

For comparison, train a Random Forest on the accelerometer dataset

Both on the raw data and on the 10s data (accelerometer features only)


In [ ]:
# Import the random forest package
from sklearn.ensemble import RandomForestClassifier 

# Create the random forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators=100)

# Fit the forest on the scaled training accelerometer data and the integer-encoded activity labels
forest = forest.fit(X_train, encoded_Y_train)

# Evaluate the same forest on the held-out test sessions
print 'Accuracy on test data (RAW):', forest.score(X_test, encoded_Y_test)
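
To put the forest on the same footing as the LSTM, the same metrics (accuracy, confusion matrix, macro AUC) could be computed from its class probabilities, reusing the metric functions imported earlier; a sketch (the predict_proba columns follow forest.classes_, i.e. the same integer encoding as encoded_Y_test):

In [ ]:
# Class probabilities on the held-out sessions; columns follow forest.classes_
Y_pred_rf = forest.predict_proba(X_test)
print 'Accuracy (RF):', accuracy_score(encoded_Y_test, numpy.argmax(Y_pred_rf, axis=1))
print 'Confusion matrix (RF):'
print confusion_matrix(encoded_Y_test, numpy.argmax(Y_pred_rf, axis=1))
print 'AUC score (RF):', roc_auc_score(dummy_y_test, Y_pred_rf, average='macro')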