In [12]:
from __future__ import print_function  # must precede all other statements in the cell

import os
from math import ceil, sqrt

import numpy
import pandas
import matplotlib.pyplot as plt

# Note: sklearn.cross_validation and sklearn.grid_search are the pre-0.18
# module names; on scikit-learn >= 0.18 these live in sklearn.model_selection
from sklearn import cross_validation, decomposition
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import LabelEncoder, label_binarize, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (classification_report, confusion_matrix,
                             accuracy_score, roc_auc_score, make_scorer,
                             cohen_kappa_score, f1_score)
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2, f_classif

from keras.models import Sequential, model_from_json
from keras.layers import Dense, Dropout, LSTM
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras.utils.np_utils import to_categorical

from hmmlearn.hmm import GaussianHMM, GMMHMM
import xgboost

In [2]:
data = pandas.read_csv("../data/processed/train.csv")
notnull_data = data[data.notnull().all(axis=1)]  # drop rows with any missing values
train = notnull_data.values
data2 = pandas.read_csv("../data/processed/test.csv")
notnull_data2 = data2[data2.notnull().all(axis=1)]
test = notnull_data2.values
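
A quick shape check helps confirm the fixed column indices used in the next cell (features in columns 3:7558, the Activity and Social targets in columns 7558 and 7559):

In [ ]:
# Both arrays should have 7560 columns for the slicing below to be valid
print(train.shape, test.shape)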

In [20]:
# Separate the features (columns 3:7558) from the targets (Activity, Social)
X_train = train[:, 3:7558].astype(float)
Y_trainA = train[:, 7558]  # Activity
Y_trainS = train[:, 7559]  # Social
X_test = test[:, 3:7558].astype(float)
Y_testA = test[:, 7558]
Y_testS = test[:, 7559]

# Encode class values as integers; the encoders are fitted on the training
# labels only and reused for the test labels (this assumes both splits
# contain the same set of classes, and avoids refitting on the test data,
# which could silently remap the classes)
encoderA = LabelEncoder()
encoderA.fit(Y_trainA)
encoded_Y_trainA = encoderA.transform(Y_trainA)
# convert integers to dummy variables (i.e. one-hot encoded)
dummy_y_trainA = to_categorical(encoded_Y_trainA)
encoded_Y_testA = encoderA.transform(Y_testA)
dummy_y_testA = to_categorical(encoded_Y_testA)

encoderS = LabelEncoder()
encoderS.fit(Y_trainS)
encoded_Y_trainS = encoderS.transform(Y_trainS)
dummy_y_trainS = to_categorical(encoded_Y_trainS)
encoded_Y_testS = encoderS.transform(Y_testS)
dummy_y_testS = to_categorical(encoded_Y_testS)

# We standardize on the basis of the training data
scaler = StandardScaler().fit(X_train)
X_train_st = scaler.transform(X_train)
X_test_st = scaler.transform(X_test)

# Dimensionality reduction: keep enough principal components to explain a
# target fraction of the variance (passing an integer n_components, e.g.
# 100, would fix the component count instead)
perc_variance = 0.8
print('Reducing dataset with PCA -- keeping %.0f%% of the variance' % (perc_variance * 100))
pca = decomposition.PCA(n_components=perc_variance)
X_train_pca = pca.fit_transform(X_train_st)
X_test_pca = pca.transform(X_test_st)

print('Total variance explained by %d components:' % pca.n_components_)
print(sum(pca.explained_variance_ratio_))


Reducing dataset with PCA -- keeping 80% of the variance
Total variance explained by 426 components:
0.800088479971
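
To see how quickly the variance accumulates across components, the fitted pca object can be inspected directly; a minimal sketch using the matplotlib import above:

In [ ]:
# Cumulative explained variance as a function of the number of components
plt.plot(numpy.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()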

Basic SVM and XGBoost

All features


In [25]:
X = X_train_st
Y = Y_trainA
Xtest = X_test_st
Ytest = Y_testA

modelSVM = SVC()
modelXGB = xgboost.XGBClassifier()

print("Training the data...", end="")

modelSVM.fit(X,Y)
modelXGB.fit(X,Y)

print("done")


Training the data...done

In [26]:
# Test it on validation dataset, and get the different performance metrics
models = {'SVM': modelSVM, 'XGBoost': modelXGB}


for name, model in models.items():  # items() works on both Python 2 and 3
    print(name)
    predictions = model.predict(Xtest)
    print("Accuracy: ",accuracy_score(Ytest, predictions))
    print("Kappa: ",cohen_kappa_score(Ytest, predictions))
    print("F1 macro: ",f1_score(Ytest, predictions, average='macro'))
    print("F1 weighted: ",f1_score(Ytest, predictions, average='weighted'))
    encoded_predictions = encoderA.transform(predictions)
    dummy_predictions = to_categorical(encoded_predictions)
    print("AUC: ",roc_auc_score(dummy_y_testA, dummy_predictions))


XGBoost
Accuracy:  0.532567049808
Kappa:  0.3699012688
F1 macro:  0.461764195905
F1 weighted:  0.513931923736
AUC:  0.673024006504
SVM
Accuracy:  0.593869731801
Kappa:  0.447808541294
F1 macro:  0.505375511432
F1 weighted:  0.572504799314
AUC:  0.698861888467
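
The metric block in the previous cell is repeated verbatim for every model family below; a small helper would remove the duplication. This is only a sketch (the name report_metrics is ours), built from the same calls used above:

In [ ]:
def report_metrics(name, model, Xtest, Ytest, encoder, dummy_y_test):
    # Same metrics as above: accuracy, Cohen's kappa, macro/weighted F1,
    # and AUC on the one-hot encoded labels
    predictions = model.predict(Xtest)
    print(name)
    print("Accuracy: ", accuracy_score(Ytest, predictions))
    print("Kappa: ", cohen_kappa_score(Ytest, predictions))
    print("F1 macro: ", f1_score(Ytest, predictions, average='macro'))
    print("F1 weighted: ", f1_score(Ytest, predictions, average='weighted'))
    dummy_predictions = to_categorical(encoder.transform(predictions))
    print("AUC: ", roc_auc_score(dummy_y_test, dummy_predictions))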

PCA features


In [27]:
# Fit with PCA features

X = X_train_pca
Y = Y_trainA
Xtest = X_test_pca
Ytest = Y_testA

modelSVMPCA = SVC()
modelXGBPCA = xgboost.XGBClassifier()

print("Training the data...", end="")

modelSVMPCA.fit(X,Y)
modelXGBPCA.fit(X,Y)

print("done")


Training the data...done

In [28]:
# Test it on validation dataset, and get the different performance metrics
models = {'SVMPCA': modelSVMPCA, 'XGBPCA': modelXGBPCA}

for name, model in models.items():
    print(name)
    predictions = model.predict(Xtest)
    print("Accuracy: ",accuracy_score(Ytest, predictions))
    print("Kappa: ",cohen_kappa_score(Ytest, predictions))
    print("F1 macro: ",f1_score(Ytest, predictions, average='macro'))
    print("F1 weighted: ",f1_score(Ytest, predictions, average='weighted'))
    encoded_predictions = encoderA.transform(predictions)
    dummy_predictions = to_categorical(encoded_predictions)
    print("AUC: ",roc_auc_score(dummy_y_testA, dummy_predictions))


XGBPCA
Accuracy:  0.534482758621
Kappa:  0.371441172025
F1 macro:  0.436350481854
F1 weighted:  0.508069864979
AUC:  0.666185477209
SVMPCA
Accuracy:  0.311302681992
Kappa:  0.0
F1 macro:  0.0949598246896
F1 weighted:  0.147806240537
AUC:  0.5
/usr/local/lib/python2.7/dist-packages/sklearn/metrics/classification.py:1074: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
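
The PCA-fed SVC with default hyperparameters has collapsed to predicting a single class (Kappa 0.0, AUC 0.5), which is also what triggers the ill-defined F-score warning above. A search over C and gamma might recover it; a sketch only (the grid values are illustrative, not tuned, and this cell was not run here):

In [ ]:
# Hypothetical grid search for the PCA-fed SVC; values are illustrative
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1e-4, 1e-3, 1e-2, 1e-1]}
grid = GridSearchCV(SVC(), param_grid, scoring='f1_macro', cv=5)
grid.fit(X_train_pca, Y_trainA)
print(grid.best_params_, grid.best_score_)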

With HMMs

From raw features - then SVM


In [45]:
X = X_train_st
Y = Y_trainA
Xtest = X_test_st
Ytest = Y_testA

# # Create a simple 2-state with the raw data
# print("fitting to HMM and decoding ...", end="")
# # Make an HMM instance and execute fit
# modelHMM = GaussianHMM(n_components=2, covariance_type="diag").fit(X)
# # Predict the optimal sequence of internal hidden state
# hidden_states = modelHMM.predict(X)
# print("done")
# print(hidden_states.shape)
# print("Transition matrix")
# print(modelHMM.transmat_)
# print()
# print("Means and vars of each hidden state")
# for i in range(modelHMM.n_components):
#     print("{0}th hidden state".format(i))
#     print("mean = ", modelHMM.means_[i])
#     print("var = ", numpy.diag(modelHMM.covars_[i]))
#     print()

newX = X
newXtest = Xtest

# Augment the features with the decoded hidden-state sequences of HMMs of
# increasing size (2..15 states), one-hot encoded. Caveat: if a decoding
# never visits the highest-numbered state, to_categorical returns fewer
# columns; passing the class count explicitly (num_classes or nb_classes,
# depending on the Keras version) would guard against a train/test mismatch.
for n_comp in range(2, 16):
    print("fitting to HMM and decoding %d ..." % n_comp, end="")
    modelHMM = GaussianHMM(n_components=n_comp, covariance_type="diag").fit(X)
    hidden_states_train = to_categorical(modelHMM.predict(X))
    hidden_states_test = to_categorical(modelHMM.predict(Xtest))
    print("done")
    newX = numpy.column_stack((newX, hidden_states_train))
    newXtest = numpy.column_stack((newXtest, hidden_states_test))

print('New dataset size', newX.shape, newXtest.shape)


fitting to HMM and decoding 2 ...done
fitting to HMM and decoding 3 ...done
fitting to HMM and decoding 4 ...done
fitting to HMM and decoding 5 ...done
fitting to HMM and decoding 6 ...done
fitting to HMM and decoding 7 ...done
fitting to HMM and decoding 8 ...done
fitting to HMM and decoding 9 ...done
fitting to HMM and decoding 10 ...done
fitting to HMM and decoding 11 ...done
fitting to HMM and decoding 12 ...done
fitting to HMM and decoding 13 ...done
fitting to HMM and decoding 14 ...done
fitting to HMM and decoding 15 ...done
New dataset size (4472, 7674) (1044, 7674)
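
The reported width checks out: 7555 raw features plus sum(range(2, 16)) = 119 one-hot state columns from the 14 HMMs. An explicit assertion makes the bookkeeping visible:

In [ ]:
# 14 HMMs with 2..15 states contribute 2+3+...+15 = 119 one-hot columns
assert newX.shape[1] == X.shape[1] + sum(range(2, 16))
assert newXtest.shape[1] == newX.shape[1]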

In [46]:
modelSVMHMM = SVC()
print("Training the data...", end="")
modelSVMHMM.fit(newX,Y)
print("done")


Training the data...done

In [47]:
print('HMM+SVM on ALL features')
predictions = modelSVMHMM.predict(newXtest)
print("Accuracy: ",accuracy_score(Ytest, predictions))
print("Kappa: ",cohen_kappa_score(Ytest, predictions))
print("F1 macro: ",f1_score(Ytest, predictions, average='macro'))
print("F1 weighted: ",f1_score(Ytest, predictions, average='weighted'))
encoded_predictions = encoderA.transform(predictions)
dummy_predictions = to_categorical(encoded_predictions)
print("AUC: ",roc_auc_score(dummy_y_testA, dummy_predictions))


HMM+SVM on ALL features
Accuracy:  0.594827586207
Kappa:  0.449310097578
F1 macro:  0.503299459974
F1 weighted:  0.572533894184
AUC:  0.698988529981

From PCA features - then XGB


In [48]:
X = X_train_pca
Y = Y_trainA
Xtest = X_test_pca
Ytest = Y_testA

newX = X
newXtest = Xtest

# Same HMM state-sequence augmentation as above, now on the PCA features
for n_comp in range(2, 16):
    print("fitting to HMM and decoding %d ..." % n_comp, end="")
    modelHMM = GaussianHMM(n_components=n_comp, covariance_type="diag").fit(X)
    hidden_states_train = to_categorical(modelHMM.predict(X))
    hidden_states_test = to_categorical(modelHMM.predict(Xtest))
    print("done")
    newX = numpy.column_stack((newX, hidden_states_train))
    newXtest = numpy.column_stack((newXtest, hidden_states_test))

print('New dataset size', newX.shape, newXtest.shape)


fitting to HMM and decoding 2 ...done
fitting to HMM and decoding 3 ...done
fitting to HMM and decoding 4 ...done
fitting to HMM and decoding 5 ...done
fitting to HMM and decoding 6 ...done
fitting to HMM and decoding 7 ...done
fitting to HMM and decoding 8 ...done
fitting to HMM and decoding 9 ...done
fitting to HMM and decoding 10 ...done
fitting to HMM and decoding 11 ...done
fitting to HMM and decoding 12 ...done
fitting to HMM and decoding 13 ...done
fitting to HMM and decoding 14 ...done
fitting to HMM and decoding 15 ...done
New dataset size (4472, 545) (1044, 545)

In [50]:
modelXGBPCAHMM = xgboost.XGBClassifier()
print("Training the data...", end="")
modelXGBPCAHMM.fit(newX,Y)
print("done")


Training the data...done

In [52]:
print('HMM+XGBoost on %d PCA features' % X_train_pca.shape[1])
predictions = modelXGBPCAHMM.predict(newXtest)
print("Accuracy: ",accuracy_score(Ytest, predictions))
print("Kappa: ",cohen_kappa_score(Ytest, predictions))
print("F1 macro: ",f1_score(Ytest, predictions, average='macro'))
print("F1 weighted: ",f1_score(Ytest, predictions, average='weighted'))
encoded_predictions = encoderA.transform(predictions)
dummy_predictions = to_categorical(encoded_predictions)
print("AUC: ",roc_auc_score(dummy_y_testA, dummy_predictions))


HMM+XGBoost on 426 PCA features
Accuracy:  0.519157088123
Kappa:  0.35603225196
F1 macro:  0.428643743077
F1 weighted:  0.490372954964
AUC:  0.666712371431

No advantage, apparently: the HMM state features leave the SVM essentially unchanged (accuracy 0.595 vs. 0.594 on the raw features) and slightly hurt XGBoost on the PCA features (0.519 vs. 0.534).
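
A more robust way to confirm this null result would be a cross-validated comparison on the training set; a sketch using the cross_val_score imported above (5-fold, default scoring; not run here):

In [ ]:
# Hypothetical cross-validated check: PCA features vs. PCA + HMM states
for label, features in [('PCA', X_train_pca), ('PCA + HMM states', newX)]:
    scores = cross_val_score(xgboost.XGBClassifier(), features, Y_trainA, cv=5)
    print(label, scores.mean(), scores.std())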

