In [12]:
from __future__ import print_function
import os
from math import ceil, sqrt
import numpy
import pandas
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder, label_binarize, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import roc_auc_score, make_scorer, cohen_kappa_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn import decomposition
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from keras.models import Sequential, model_from_json
from keras.layers import Dense, Dropout, LSTM
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras.utils.np_utils import to_categorical
from hmmlearn.hmm import GaussianHMM, GMMHMM
import xgboost
In [2]:
# Load the train/test splits and drop any rows with missing values
data = pandas.read_csv("../data/processed/train.csv")
notnull_data = data[data.notnull().all(axis=1)]
train = notnull_data.values
data2 = pandas.read_csv("../data/processed/test.csv")
notnull_data2 = data2[data2.notnull().all(axis=1)]
test = notnull_data2.values
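As a quick sanity check (not part of the original run), one can report how many rows the null filter dropped; a minimal sketch:
In [ ]:
# Sketch: count how many rows were dropped by the null filter above
print("Train rows kept: %d of %d" % (len(notnull_data), len(data)))
print("Test rows kept: %d of %d" % (len(notnull_data2), len(data2)))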
In [20]:
# Separate the target values (Activity and Social) from the features
X_train = train[:,3:7558].astype(float)
Y_trainA = train[:,7558] #Activity
Y_trainS = train[:,7559] #Social
X_test = test[:,3:7558].astype(float)
Y_testA = test[:,7558]
Y_testS = test[:,7559]
# Encode class values as integers; fit on the training labels and reuse the
# same encoder on the test labels so the label-to-integer mapping is consistent
encoderA = LabelEncoder()
encoderA.fit(Y_trainA)
encoded_Y_trainA = encoderA.transform(Y_trainA)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_trainA = to_categorical(encoded_Y_trainA)
encoded_Y_testA = encoderA.transform(Y_testA)
dummy_y_testA = to_categorical(encoded_Y_testA)
encoderS = LabelEncoder()
encoderS.fit(Y_trainS)
encoded_Y_trainS = encoderS.transform(Y_trainS)
dummy_y_trainS = to_categorical(encoded_Y_trainS)
encoded_Y_testS = encoderS.transform(Y_testS)
dummy_y_testS = to_categorical(encoded_Y_testS)
# We standardize on the basis of the training data
scaler = StandardScaler().fit(X_train)
X_train_st = scaler.transform(X_train)
X_test_st = scaler.transform(X_test)
# Number of components to extract from the dataset
n_components = 100
#print('Reducing dataset with PCA --', n_components)
#pca = decomposition.PCA(n_components=n_components)
perc_variance = 0.8
print('Reducing dataset with PCA -- keeping %.0f%% of the variance' % (perc_variance * 100))
pca = decomposition.PCA(n_components=perc_variance)
X_train_pca = pca.fit_transform(X_train_st)
X_test_pca = pca.transform(X_test_st)
#print('Variance explained:')
#print(pca.explained_variance_ratio_)
print('Total variance explained by %d components:' % pca.n_components_)
print(sum(pca.explained_variance_ratio_))
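To visualize what the 0.8 variance threshold implies, a minimal sketch of the cumulative explained variance curve (matplotlib is already imported as plt):
In [ ]:
# Sketch: cumulative explained variance of the retained PCA components
cumvar = numpy.cumsum(pca.explained_variance_ratio_)
plt.plot(range(1, len(cumvar) + 1), cumvar)
plt.axhline(perc_variance, linestyle='--')  # the 0.8 threshold used above
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()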
In [25]:
X = X_train_st
Y = Y_trainA
Xtest = X_test_st
Ytest = Y_testA
modelSVM = SVC()
modelXGB = xgboost.XGBClassifier()
print("Training the data...", end="")
modelSVM.fit(X,Y)
modelXGB.fit(X,Y)
print("done")
In [26]:
# Test it on validation dataset, and get the different performance metrics
models = {'SVM': modelSVM, 'XGBoost': modelXGB}
for name, model in models.items():
    print(name)
    predictions = model.predict(Xtest)
    print("Accuracy: ", accuracy_score(Ytest, predictions))
    print("Kappa: ", cohen_kappa_score(Ytest, predictions))
    print("F1 macro: ", f1_score(Ytest, predictions, average='macro'))
    print("F1 weighted: ", f1_score(Ytest, predictions, average='weighted'))
    # AUC needs one-hot encoded labels and predictions
    encoded_predictions = encoderA.transform(predictions)
    dummy_predictions = to_categorical(encoded_predictions)
    print("AUC: ", roc_auc_score(dummy_y_testA, dummy_predictions))
In [27]:
# Fit with PCA features
X = X_train_pca
Y = Y_trainA
Xtest = X_test_pca
Ytest = Y_testA
modelSVMPCA = SVC()
modelXGBPCA = xgboost.XGBClassifier()
print("Training the data...", end="")
modelSVMPCA.fit(X,Y)
modelXGBPCA.fit(X,Y)
print("done")
In [28]:
# Test the PCA models on the validation dataset, and get the performance metrics
models = {'SVMPCA': modelSVMPCA, 'XGBPCA': modelXGBPCA}
for name, model in models.items():
    print(name)
    predictions = model.predict(Xtest)
    print("Accuracy: ", accuracy_score(Ytest, predictions))
    print("Kappa: ", cohen_kappa_score(Ytest, predictions))
    print("F1 macro: ", f1_score(Ytest, predictions, average='macro'))
    print("F1 weighted: ", f1_score(Ytest, predictions, average='weighted'))
    encoded_predictions = encoderA.transform(predictions)
    dummy_predictions = to_categorical(encoded_predictions)
    print("AUC: ", roc_auc_score(dummy_y_testA, dummy_predictions))
In [45]:
X = X_train_st
Y = Y_trainA
Xtest = X_test_st
Ytest = Y_testA
# # Create a simple 2-state HMM with the raw data
# print("fitting to HMM and decoding ...", end="")
# # Make an HMM instance and execute fit
# modelHMM = GaussianHMM(n_components=2, covariance_type="diag").fit(X)
# # Predict the optimal sequence of internal hidden states
# hidden_states = modelHMM.predict(X)
# print("done")
# print(hidden_states.shape)
# print("Transition matrix")
# print(modelHMM.transmat_)
# print()
# print("Means and vars of each hidden state")
# for i in range(modelHMM.n_components):
#     print("{0}th hidden state".format(i))
#     print("mean = ", modelHMM.means_[i])
#     print("var = ", numpy.diag(modelHMM.covars_[i]))
#     print()
newX = X
newXtest = Xtest
for n_comp in range(2, 16):
    print("fitting to HMM and decoding %d ..." % n_comp, end="")
    # Fit an HMM on the training features and append its decoded
    # (one-hot) hidden states as additional features
    modelHMM = GaussianHMM(n_components=n_comp, covariance_type="diag").fit(X)
    hidden_states_train = to_categorical(modelHMM.predict(X))
    hidden_states_test = to_categorical(modelHMM.predict(Xtest))
    print("done")
    newX = numpy.column_stack((newX, hidden_states_train))
    newXtest = numpy.column_stack((newXtest, hidden_states_test))
print('New dataset size', newX.shape, newXtest.shape)
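The same augmentation loop is repeated for the PCA features below; a hedged helper that wraps it (the function name is ours, and passing num_classes assumes Keras 2's to_categorical signature; it guards against the train/test one-hot widths diverging when a state never occurs in the test predictions):
In [ ]:
# Sketch: reusable HMM-state augmentation mirroring the loop above
def augment_with_hmm_states(X_tr, X_te, n_comp_range=range(2, 16)):
    newX_tr, newX_te = X_tr, X_te
    for n in n_comp_range:
        hmm = GaussianHMM(n_components=n, covariance_type="diag").fit(X_tr)
        # Fix the one-hot width to n so train and test columns always align
        newX_tr = numpy.column_stack((newX_tr, to_categorical(hmm.predict(X_tr), num_classes=n)))
        newX_te = numpy.column_stack((newX_te, to_categorical(hmm.predict(X_te), num_classes=n)))
    return newX_tr, newX_te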
In [46]:
modelSVMHMM = SVC()
print("Training the data...", end="")
modelSVMHMM.fit(newX,Y)
print("done")
In [47]:
print('HMM+SVM on ALL features')
predictions = modelSVMHMM.predict(newXtest)
print("Accuracy: ",accuracy_score(Ytest, predictions))
print("Kappa: ",cohen_kappa_score(Ytest, predictions))
print("F1 macro: ",f1_score(Ytest, predictions, average='macro'))
print("F1 weighted: ",f1_score(Ytest, predictions, average='weighted'))
encoded_predictions = encoderA.transform(predictions)
dummy_predictions = to_categorical(encoded_predictions)
print("AUC: ",roc_auc_score(dummy_y_testA, dummy_predictions))
In [48]:
X = X_train_pca
Y = Y_trainA
Xtest = X_test_pca
Ytest = Y_testA
newX = X
newXtest = Xtest
for n_comp in range(2, 16):
    print("fitting to HMM and decoding %d ..." % n_comp, end="")
    modelHMM = GaussianHMM(n_components=n_comp, covariance_type="diag").fit(X)
    hidden_states_train = to_categorical(modelHMM.predict(X))
    hidden_states_test = to_categorical(modelHMM.predict(Xtest))
    print("done")
    newX = numpy.column_stack((newX, hidden_states_train))
    newXtest = numpy.column_stack((newXtest, hidden_states_test))
print('New dataset size', newX.shape, newXtest.shape)
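Rather than stacking decoded states from every component count, one could select a single HMM size by held-out log-likelihood; a hedged sketch using hmmlearn's score() (a proper choice would use a validation split or BIC rather than the test set):
In [ ]:
# Sketch: pick the HMM size with the best log-likelihood on held-out data
best_n, best_ll = None, -numpy.inf
for n in range(2, 16):
    hmm = GaussianHMM(n_components=n, covariance_type="diag").fit(X)
    ll = hmm.score(Xtest)
    if ll > best_ll:
        best_n, best_ll = n, ll
print("Best n_components by held-out log-likelihood:", best_n)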
In [50]:
modelXGBPCAHMM = xgboost.XGBClassifier()
print("Training the data...", end="")
modelXGBPCAHMM.fit(newX,Y)
print("done")
In [52]:
print('HMM+XGBoost on %d PCA features' % X_train_pca.shape[1])
predictions = modelXGBPCAHMM.predict(newXtest)
print("Accuracy: ",accuracy_score(Ytest, predictions))
print("Kappa: ",cohen_kappa_score(Ytest, predictions))
print("F1 macro: ",f1_score(Ytest, predictions, average='macro'))
print("F1 weighted: ",f1_score(Ytest, predictions, average='weighted'))
encoded_predictions = encoderA.transform(predictions)
dummy_predictions = to_categorical(encoded_predictions)
print("AUC: ",roc_auc_score(dummy_y_testA, dummy_predictions))
No advantage from the HMM state features, apparently!