In [16]:
%matplotlib inline

import numpy as np
import pandas as pd

from sklearn import grid_search
from sklearn import metrics
from sklearn import cross_validation
from sklearn.externals import joblib

import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

import operator
import itertools
import random
import os
import pickle

PATHS


In [17]:
if not os.path.exists("results"):
    os.makedirs("results")

In [18]:
PREPROCESSED_DIRECTORY = "E:\\eaglesense\\data\\topviewkinect\\all"

In [19]:
FEATURE_SET = "weak"

LOAD DATA


In [30]:
s1_data_path = "{root}/{tag}_s1_data.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
with open(s1_data_path, "rb") as f:
    s1_data = pickle.load(f)

s2_data_path = "{root}/{tag}_s2_data.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
with open(s2_data_path, "rb") as f:
    s2_data = pickle.load(f)

cs_data_path = "{root}/{tag}_cs_data.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
with open(cs_data_path, "rb") as f:
    cs_data = pickle.load(f)
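
A quick sanity check on what the pickled splits contain (the keys used below are "X_train" and "y_train"):


In [ ]:
# expected layout: a dict of numpy arrays keyed by split name
print(sorted(s1_data.keys()))
print(s1_data["X_train"].shape, s1_data["y_train"].shape)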

XGBOOST KNOBS


In [21]:
XGBOOST_SEED = 0

In [22]:
xgboost_clf = xgb.XGBClassifier(learning_rate=0.3, n_estimators=100, objective="multi:softmax", seed=XGBOOST_SEED)

In [44]:
max_depth = [5, 6, 7, 8]
gamma = [1, 2, 3]
subsample = [0.5, 1]
colsample_bytree = [0.5, 1]
colsample_bylevel = [0.5, 1]
reg_alpha = [1, 2, 3]
reg_lambda = [1, 2, 3]

xgboost_knobs = {
    "max_depth": max_depth,
    "gamma": gamma,
    "subsample": subsample,
    "colsample_bytree": colsample_bytree,
    "colsample_bylevel": colsample_bylevel,
    "reg_alpha": reg_alpha,
    "reg_lambda": reg_lambda
}

In [45]:
num_combinations = len(list(itertools.product(max_depth, gamma, subsample, colsample_bytree, colsample_bylevel, reg_alpha, reg_lambda)))
num_combinations


Out[45]:
648

In [25]:
RAND_SEED = 42

In [47]:
num_grid_searches = int(num_combinations / 3)

In [48]:
params_search = grid_search.RandomizedSearchCV(estimator=xgboost_clf, param_distributions=xgboost_knobs, cv=5,
                                               n_iter=num_grid_searches, random_state=RAND_SEED, verbose=1)
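
The grid_search and cross_validation modules used above were deprecated in scikit-learn 0.18 and removed in 0.20. A minimal equivalent of the same search via model_selection (a sketch, assuming scikit-learn 0.18+ is installed):


In [ ]:
from sklearn.model_selection import RandomizedSearchCV

# the same randomized search as above, using the non-deprecated module
params_search = RandomizedSearchCV(estimator=xgboost_clf, param_distributions=xgboost_knobs, cv=5,
                                   n_iter=num_grid_searches, random_state=RAND_SEED, verbose=1)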

PARAMETERS


In [15]:
s1_params_path = "{root}/{tag}_s1_params.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
s2_params_path = "{root}/{tag}_s2_params.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)
cs_params_path = "{root}/{tag}_cs_params.pickle".format(root=PREPROCESSED_DIRECTORY, tag=FEATURE_SET)

SAMPLES TEST 1


In [49]:
s1_X_train = s1_data["X_train"]
s1_y_train = s1_data["y_train"]

In [50]:
params_search.fit(s1_X_train, s1_y_train.ravel())


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
---------------------------------------------------------------------------
KeyboardInterrupt: the randomized search over the s1 training data was stopped manually
partway through the 1080 fits (interrupted inside xgboost's XGBoosterUpdateOneIter).

In [14]:
params_search.best_params_


Out[14]:
{'colsample_bylevel': 0.5,
 'colsample_bytree': 0.5,
 'gamma': 1,
 'max_depth': 7,
 'reg_alpha': 2,
 'reg_lambda': 3,
 'subsample': 0.8}

In [31]:
params_search.best_score_


Out[31]:
0.95537193303841128

In [16]:
s1_params = {
    "best_params": params_search.best_params_,
    "best_score": params_search.best_score_,
    "grid_scores": params_search.grid_scores_
}
with open(s1_params_path, "wb") as f:
    pickle.dump(s1_params, f)
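
The saved pickle can later be reloaded to rebuild a classifier with the tuned parameters; a minimal sketch, assuming the pickle layout written above:


In [ ]:
# reload the tuned parameters and fit a final classifier on the s1 training split
with open(s1_params_path, "rb") as f:
    s1_params = pickle.load(f)

s1_clf = xgb.XGBClassifier(learning_rate=0.3, n_estimators=100, objective="multi:softmax",
                           seed=XGBOOST_SEED, **s1_params["best_params"])
s1_clf.fit(s1_X_train, s1_y_train.ravel())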

SAMPLES TEST 2


In [20]:
s2_X_train = s2_data["X_train"]
s2_y_train = s2_data["y_train"]

In [21]:
params_search.fit(s2_X_train, s2_y_train.ravel())


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed: 21.0min
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed: 74.1min
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed: 95.3min finished
Out[21]:
RandomizedSearchCV(cv=5, error_score='raise',
          estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.3, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softmax', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
          fit_params={}, iid=True, n_iter=50, n_jobs=1,
          param_distributions={'max_depth': [3, 4, 5, 6, 7], 'colsample_bytree': [0.5, 0.8, 1], 'reg_lambda': [0, 1, 2, 3], 'colsample_bylevel': [0.5, 0.8, 1], 'gamma': [0, 1, 2, 3], 'subsample': [0.5, 0.8, 1], 'reg_alpha': [0, 1, 2, 3]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          scoring=None, verbose=1)

In [26]:
params_search.best_params_


Out[26]:
{'colsample_bylevel': 1,
 'colsample_bytree': 0.8,
 'gamma': 2,
 'max_depth': 4,
 'reg_alpha': 1,
 'reg_lambda': 3,
 'subsample': 0.8}

In [30]:
params_search.best_score_


Out[30]:
0.95537193303841128

In [32]:
s2_params = {
    "best_params": params_search.best_params_,
    "best_score": params_search.best_score_,
    "grid_scores": params_search.grid_scores_
}
with open(s2_params_path, "wb") as f:
    pickle.dump(s2_params, f)

CROSS-SUBJECT TEST


In [34]:
cs_X_train = cs_data["X_train"]
cs_y_train = cs_data["y_train"]

In [35]:
params_search.fit(cs_X_train, cs_y_train.ravel())


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed: 12.6min
[Parallel(n_jobs=1)]: Done 199 tasks       | elapsed: 44.2min
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed: 57.1min finished
Out[35]:
RandomizedSearchCV(cv=5, error_score='raise',
          estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.3, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softmax', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
          fit_params={}, iid=True, n_iter=50, n_jobs=1,
          param_distributions={'max_depth': [3, 4, 5, 6, 7], 'colsample_bytree': [0.5, 0.8, 1], 'reg_lambda': [0, 1, 2, 3], 'colsample_bylevel': [0.5, 0.8, 1], 'gamma': [0, 1, 2, 3], 'subsample': [0.5, 0.8, 1], 'reg_alpha': [0, 1, 2, 3]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          scoring=None, verbose=1)

In [36]:
params_search.best_params_


Out[36]:
{'colsample_bylevel': 0.8,
 'colsample_bytree': 0.5,
 'gamma': 0,
 'max_depth': 5,
 'reg_alpha': 2,
 'reg_lambda': 1,
 'subsample': 1}

In [37]:
params_search.best_score_


Out[37]:
0.95181550258948755

In [39]:
cs_params = {
    "best_params": params_search.best_params_,
    "best_score": params_search.best_score_,
    "grid_scores": params_search.grid_scores_
}
with open(cs_params_path, "wb") as f:
    pickle.dump(cs_params, f)

K-FOLD (SUBJECT) CV


In [ ]:
subject_kfolds_indices = list()

In [ ]:
subject_kfolds = list(itertools.combinations(unique_subjects, len(unique_subjects) - 1))

In [ ]:
for kth_fold in subject_kfolds:
    print(kth_fold)
    kth_fold_train_indices = list()
    kth_fold_test_indices = list()
    for subject_id in unique_subjects:
        subject_features = features_df[:][features_df["subject"] == subject_id]
        if subject_id in kth_fold:
            kth_fold_train_indices.extend(subject_features.index)
        else:
            kth_fold_test_indices.extend(subject_features.index)
    subject_kfolds_indices.append((kth_fold_train_indices, kth_fold_test_indices))
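
Equivalent leave-one-subject-out folds can also be produced with scikit-learn's group-aware splitter (a sketch, assuming scikit-learn 0.18+ and that features_df["subject"], X, and y are the full subject ids, feature matrix, and labels used below):


In [ ]:
from sklearn.model_selection import LeaveOneGroupOut

# each fold holds out all frames of one subject, like the manual loop above
subject_groups = features_df["subject"].values
subject_kfolds_indices = list(LeaveOneGroupOut().split(X, y, groups=subject_groups))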

In [ ]:
clf = xgb.XGBClassifier(learning_rate=0.3, n_estimators=100, objective="multi:softmax", seed=XGBOOST_SEED)

In [ ]:
num_searches = 30

random_search = grid_search.RandomizedSearchCV(clf, param_distributions=xgboost_knobs, cv=subject_kfolds_indices,
                                               verbose=2, n_iter=num_searches, random_state=RAND_SEED)

In [24]:
random_search.fit(X, y.ravel())


-----------------------------------------------------------------------
NameError                             Traceback (most recent call last)
<ipython-input-24-c1da6aae1183> in <module>()
----> 1 random_search.fit(X, y.ravel())

NameError: name 'random_search' is not defined

In [ ]:
random_search.grid_scores_

In [ ]:
random_search.best_score_

In [ ]:
random_search.best_params_

In [ ]:
all_xgbmatrix = xgb.DMatrix(X, y)

In [ ]:
params_cv_results = xgb.cv(params=XGB_PARAM_CV, dtrain=all_xgbmatrix, num_boost_round=200, nfold=12, folds=subject_kfolds_indices, 
                           verbose_eval=True, early_stopping_rounds=50)

In [ ]:
params_cv_results

In [ ]:
sample_1_accuracy

Confusion matrix


In [ ]:
sample_1_cm = metrics.confusion_matrix(sample_1_y_test, sample_1_y_predicted)
sample_1_cm_normalized = sample_1_cm.astype("float") / sample_1_cm.sum(axis=1)[:, np.newaxis]
sample_1_cm_normalized *= 100

In [ ]:
with sns.axes_style("ticks"):
    fig, ax = plt.subplots(figsize=thesis_figsize)
    sns.heatmap(data=sample_1_cm_normalized, annot=True, fmt=".2f", linewidths=0.5, square=True,
                vmin=0, vmax=100, ax=ax, xticklabels=ACTIVITIES, yticklabels=ACTIVITIES)
    plt.yticks(rotation=0)
    sns.despine()
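
The normalize-and-plot steps repeat for each test below, so a small helper could wrap them (a sketch; plot_confusion_matrix is a hypothetical name not used elsewhere in this notebook):


In [ ]:
def plot_confusion_matrix(y_true, y_predicted, labels, figsize=(10, 7.5)):
    # row-normalize the confusion matrix to percentages and draw it as a heatmap
    cm = metrics.confusion_matrix(y_true, y_predicted)
    cm_normalized = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis] * 100
    with sns.axes_style("ticks"):
        fig, ax = plt.subplots(figsize=figsize)
        sns.heatmap(data=cm_normalized, annot=True, fmt=".2f", linewidths=0.5, square=True,
                    vmin=0, vmax=100, ax=ax, xticklabels=labels, yticklabels=labels)
        plt.yticks(rotation=0)
        sns.despine()
    return cm_normalized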

SAMPLES TEST 2 (2/3 for training)


In [ ]:
SAMPLE_2_TRAIN_SIZE = 2 / 3

In [ ]:
sample_2_train_size_by_subject = np.zeros((len(unique_subjects), NUM_ACTIVITIES))

In [ ]:
for subject_idx, subject_id in enumerate(unique_subjects):
    subject_y = labels_df[labels_df["subject"] == subject_id]["label"].values
    subject_activities_bin = np.bincount(np.squeeze(subject_y))
    sample_2_train_size_by_subject[subject_idx] = np.array([int(size * SAMPLE_2_TRAIN_SIZE) for size in subject_activities_bin])

In [ ]:
sample_2_X_train = np.array([], dtype=np.float64).reshape(0, num_features)
sample_2_y_train = np.array([], dtype=np.int32).reshape(0, 1)

sample_2_X_test = np.array([], dtype=np.float64).reshape(0, num_features)
sample_2_y_test = np.array([], dtype=np.int32).reshape(0, 1)

for subject_idx, subject_id in enumerate(unique_subjects):
    
    print(subject_id)
    
    subject_features = features_df[:][features_df["subject"] == subject_id]
    subject_features.drop(["subject"], axis=1, inplace=True)
    
    subject_labels = labels_df[:][labels_df["subject"] == subject_id]
    subject_labels.drop(["subject"], axis=1, inplace=True)
    
    for activity_idx in range(NUM_ACTIVITIES):
        subject_activity_train_size = int(sample_2_train_size_by_subject[subject_idx, activity_idx])
        subject_activity_labels_df = subject_labels[subject_labels["label"] == activity_idx]
        subject_activity_train_labels_df = subject_activity_labels_df.sample(n=subject_activity_train_size, replace=False, random_state=RAND_SEED)
        
        subject_activity_all_indices = list(subject_activity_labels_df.index.values)
        subject_activity_train_indices = list(subject_activity_train_labels_df.index.values)
        subject_activity_test_indices = [idx for idx in subject_activity_all_indices if idx not in subject_activity_train_indices]
        
        subject_activity_X_train = subject_features.loc[subject_activity_train_indices]
        subject_activity_y_train = subject_labels.loc[subject_activity_train_indices]
        subject_activity_X_test = subject_features.loc[subject_activity_test_indices]
        subject_activity_y_test = subject_labels.loc[subject_activity_test_indices]
        
        sample_2_X_train = np.vstack([sample_2_X_train, subject_activity_X_train.values])
        sample_2_y_train = np.vstack([sample_2_y_train, subject_activity_y_train.values])
        sample_2_X_test = np.vstack([sample_2_X_test, subject_activity_X_test.values])
        sample_2_y_test = np.vstack([sample_2_y_test, subject_activity_y_test.values])

In [ ]:
sample_2_X_train.shape

In [ ]:
sample_2_X_test.shape

In [ ]:
sample_2_train_xgbmatrix = xgb.DMatrix(sample_2_X_train, sample_2_y_train)
sample_2_test_xgbmatrix = xgb.DMatrix(sample_2_X_test, sample_2_y_test)
# sample_2_watchlist = [(sample_2_train_xgbmatrix, "train"), (sample_2_test_xgbmatrix, "eval")]

In [ ]:
# sample_2_results = {}
# xgb.train(params=XGB_PARAM, dtrain=sample_2_train_xgbmatrix, num_boost_round=XGB_NUM_ROUNDS,
#           evals=sample_2_watchlist, evals_result=sample_2_results, early_stopping_rounds=XGB_EARLYSTOPPING_ROUNDS)

In [ ]:
# iterations = list(range(len(sample_2_results["eval"]["merror"]))) * 2
# errors = sample_2_results["eval"]["merror"] + sample_2_results["train"]["merror"]
# types = ["Test"] * int(len(iterations)/2) + ["Train"] * int(len(iterations)/2)
# units = [0] * len(iterations)
# data = {
#     "iteration": iterations,
#     "error": errors,
#     "type": types,
#     "unit": units
# }
# samples_test2_df = pd.DataFrame(data)

In [ ]:
# with sns.axes_style("ticks"):
#     fig, ax = plt.subplots(figsize=(10, 7.5))
#     sns.tsplot(data=samples_test2_df, time="iteration", value="error", condition="type", unit="unit", 
#                color=sns.color_palette("Set1"), ax=ax)
#     ax.set_xlabel("Iteration")
#     ax.set_ylabel("Mean Error")
#     sns.despine()
#     plt.tight_layout()

XGB TREE BOOSTER


In [ ]:
sample_2_booster = xgb.train(params=XGB_PARAM_FINAL, dtrain=sample_2_train_xgbmatrix, num_boost_round=XGB_NUM_ROUNDS_FINAL)

99.08%


In [ ]:
sample_2_y_predicted = sample_2_booster.predict(sample_2_test_xgbmatrix)

In [ ]:
sample_2_accuracy = metrics.accuracy_score(sample_2_y_test, sample_2_y_predicted)

In [ ]:
sample_2_accuracy

In [ ]:
sample_2_cm = metrics.confusion_matrix(sample_2_y_test, sample_2_y_predicted)
sample_2_cm_normalized = sample_2_cm.astype("float") / sample_2_cm.sum(axis=1)[:, np.newaxis]
sample_2_cm_normalized *= 100

In [ ]:
with sns.axes_style("ticks"):
    fig, ax = plt.subplots(figsize=thesis_figsize)
    sns.heatmap(data=sample_2_cm_normalized, annot=True, fmt=".2f", linewidths=0.5, square=True,
                vmin=0, vmax=100, ax=ax, xticklabels=ACTIVITIES, yticklabels=ACTIVITIES)
    sns.despine()
    plt.yticks(rotation=0)

CROSS SUBJECT TEST 1 (50/50 Subjects)


In [ ]:
crosssubject_1_X_train = np.array([], dtype=np.float64).reshape(0, num_features)
crosssubject_1_y_train = np.array([], dtype=np.int32).reshape(0, 1)

crosssubject_1_X_test = np.array([], dtype=np.float64).reshape(0, num_features)
crosssubject_1_y_test = np.array([], dtype=np.int32).reshape(0, 1)

for subject_id in unique_subjects:
    
    subject_features = features_df[:][features_df["subject"] == subject_id]
    subject_features.drop(["subject"], axis=1, inplace=True)
    subject_labels = labels_df[:][labels_df["subject"] == subject_id]
    subject_labels.drop(["subject"], axis=1, inplace=True)

    subject_X = subject_features.values
    subject_y = subject_labels.values
    
    if subject_id % 2 == 1:
        print(subject_id, "\tTrain")
        crosssubject_1_X_train = np.vstack([crosssubject_1_X_train, subject_X])
        crosssubject_1_y_train = np.vstack([crosssubject_1_y_train, subject_y])
    else:
        print(subject_id, "\tTest")
        crosssubject_1_X_test = np.vstack([crosssubject_1_X_test, subject_X])
        crosssubject_1_y_test = np.vstack([crosssubject_1_y_test, subject_y])

In [ ]:
crosssubject_1_X_train.shape

In [ ]:
crosssubject_1_X_test.shape

In [ ]:
crosssubject_1_train_xgbmatrix = xgb.DMatrix(crosssubject_1_X_train, crosssubject_1_y_train)
crosssubject_1_test_xgbmatrix = xgb.DMatrix(crosssubject_1_X_test, crosssubject_1_y_test)
crosssubject_1_watchlist = [(crosssubject_1_train_xgbmatrix, "train"), (crosssubject_1_test_xgbmatrix, "eval")]

In [ ]:
crosssubject_1_results = {}
xgb.train(params=XGB_PARAM_FINAL, dtrain=crosssubject_1_train_xgbmatrix, num_boost_round=XGB_NUM_ROUNDS,
          evals=crosssubject_1_watchlist, evals_result=crosssubject_1_results, early_stopping_rounds=50)

In [ ]:
iterations = list(range(len(crosssubject_1_results["eval"]["merror"]))) * 2
errors = crosssubject_1_results["eval"]["merror"] + crosssubject_1_results["train"]["merror"]
types = ["Test"] * int(len(iterations)/2) + ["Train"] * int(len(iterations)/2)
units = [0] * len(iterations)
data = {
    "iteration": iterations,
    "error": errors,
    "Type": types,
    "unit": units
}
crosssubject_1_df = pd.DataFrame(data)

In [ ]:
with sns.axes_style("ticks"):
    fig, ax = plt.subplots(figsize=(10, 7.5))
    sns.tsplot(data=crosssubject_1_df, time="iteration", value="error", condition="Type", unit="unit", 
               color=sns.color_palette("Set1"), ax=ax)
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Mean Error")
    sns.despine()
    plt.tight_layout()

CROSS-SUBJECT 1 CLASSIFIER


In [ ]:
crosssubject_1_booster = xgb.train(params=XGB_PARAM_FINAL, dtrain=crosssubject_1_train_xgbmatrix, 
                                   num_boost_round=XGB_NUM_ROUNDS_FINAL)

In [ ]:
feature_importance = crosssubject_1_booster.get_fscore()

In [ ]:
xgb.plot_importance(crosssubject_1_booster)

In [ ]:
len(features_vector.columns)

In [ ]:
features_importance_formated = dict()

for feature_idx, feature_name in enumerate(features_vector.columns):
    old_key = "f{}".format(feature_idx)
    if old_key not in feature_importance:
        continue
    new_key = feature_name
    features_importance_formated[new_key] = feature_importance[old_key]

In [ ]:
features_importance_formated = sorted(features_importance_formated.items(), key=operator.itemgetter(1))

In [ ]:
features_importance_formated

In [ ]:
features_importance_formated_last = features_importance_formated[-10:]

In [ ]:
features_importance_formated_last

In [ ]:
features_importance_df = pd.DataFrame(features_importance_formated_last, columns=["feature", "gain"])
features_importance_df["gain"] = features_importance_df["gain"] / features_importance_df["gain"].sum()

In [ ]:
with sns.axes_style("ticks"):
    fig, ax = plt.subplots(figsize=(10, 7.5))
    sns.barplot(x="feature", y="gain", data=features_importance_df, label="Total", color="#3498db", ax=ax)
    plt.xticks(rotation=90)
    plt.xlabel("")
    plt.ylabel("")
    sns.despine()

Individual subject results


In [ ]:
np.set_printoptions(formatter={'float': lambda x: "{:.2f}".format(x)})

In [ ]:
crosssubject_individual_cm_list = list()

for subject_id in unique_subjects:
    print(subject_id)

    subject_features = features_df[:][features_df["subject"] == subject_id]
    subject_features.drop(["subject"], axis=1, inplace=True)
    subject_labels = labels_df[:][labels_df["subject"] == subject_id]
    subject_labels.drop(["subject"], axis=1, inplace=True)

    subject_X = subject_features.values
    subject_y = subject_labels.values

    subject_xgbmatrix = xgb.DMatrix(subject_X, subject_y)
    subject_y_predicted = crosssubject_1_booster.predict(subject_xgbmatrix)

    subject_accuracy = metrics.accuracy_score(subject_y, subject_y_predicted)
    print("accuracy:", subject_accuracy)

    subject_cm = metrics.confusion_matrix(subject_y, subject_y_predicted)
    subject_cm_normalized = subject_cm.astype("float") / subject_cm.sum(axis=1)[:, np.newaxis]
    subject_cm_normalized *= 100
    print("confusion matrix:\n", subject_cm_normalized, "\n")

    if subject_id % 2 == 0:
        crosssubject_individual_cm_list.append((subject_id, subject_cm_normalized))

In [ ]:
crosssubject_individual_results_list = list()

for subject_id, subject_cm in crosssubject_individual_cm_list:
    subject_string = str(subject_id)[2:4]
    for activity_id, activity in enumerate(ACTIVITIES):
        crosssubject_subject_result = {
            "Activity": activity,
            "Accuracy": subject_cm[activity_id, activity_id],
            "Subject": subject_string
        }
        crosssubject_individual_results_list.append(crosssubject_subject_result)

crosssubject_individuals_df = pd.DataFrame(crosssubject_individual_results_list)

In [ ]:
with sns.axes_style("ticks"):
    fig, ax = plt.subplots(figsize=thesis_figsize)
    g = sns.FacetGrid(data=crosssubject_individuals_df, col="Subject", col_wrap=3)
    g = (g.map(sns.barplot, "Activity", "Accuracy", palette=sns.color_palette(), ci=None)
         .set_xlabels("")
         .set_ylabels("")
         .set_xticklabels(rotation=90))
    plt.yticks(np.arange(0, 110, 20.0))
    g.despine()

Average: 93.32%


In [ ]:
crosssubject_1_y_predicted = crosssubject_1_booster.predict(crosssubject_1_test_xgbmatrix)

In [ ]:
crosssubject_1_accuracy = metrics.accuracy_score(crosssubject_1_y_test, crosssubject_1_y_predicted)

In [ ]:
crosssubject_1_accuracy

In [ ]:
crosssubject_1_cm = metrics.confusion_matrix(crosssubject_1_y_test, crosssubject_1_y_predicted)
crosssubject_1_cm_normalized = crosssubject_1_cm.astype("float") / crosssubject_1_cm.sum(axis=1)[:, np.newaxis]
crosssubject_1_cm_normalized *= 100

In [ ]:
with sns.axes_style("ticks"):
    fig, ax = plt.subplots(figsize=thesis_figsize)
    sns.heatmap(data=crosssubject_1_cm_normalized, annot=True, fmt=".2f", linewidths=0.5, square=True,
                vmin=0, vmax=100, ax=ax, xticklabels=ACTIVITIES, yticklabels=ACTIVITIES)
    plt.yticks(rotation=0)
    sns.despine()

NO INFRARED


In [ ]:
noinfrared_features_cols = [c for c in features_df.columns if not c.startswith("extreme_infrared_")]

In [ ]:
noinfrared_features_df = features_df[noinfrared_features_cols]

In [ ]:
noinfrared_features_df.shape

In [ ]:
num_noinfrared_features = noinfrared_features_df.shape[1] - 1

In [ ]:
noinfrared_X_train = np.array([], dtype=np.float64).reshape(0, num_noinfrared_features)
noinfrared_y_train = np.array([], dtype=np.int32).reshape(0, 1)

noinfrared_X_test = np.array([], dtype=np.float64).reshape(0, num_noinfrared_features)
noinfrared_y_test = np.array([], dtype=np.int32).reshape(0, 1)

for subject_id in unique_subjects:
    subject_features = noinfrared_features_df[:][noinfrared_features_df["subject"] == subject_id]
    subject_features.drop(["subject"], axis=1, inplace=True)
    subject_labels = labels_df[:][labels_df["subject"] == subject_id]
    subject_labels.drop(["subject"], axis=1, inplace=True)
    
    subject_X = subject_features.values
    subject_y = subject_labels.values
    
    if subject_id % 2 == 1:
        print(subject_id, "\tTrain")
        noinfrared_X_train = np.vstack([noinfrared_X_train, subject_X])
        noinfrared_y_train = np.vstack([noinfrared_y_train, subject_y])
    else:
        print(subject_id, "\tTest")
        noinfrared_X_test = np.vstack([noinfrared_X_test, subject_X])
        noinfrared_y_test = np.vstack([noinfrared_y_test, subject_y])

In [ ]:
noinfrared_train_xgbmatrix = xgb.DMatrix(noinfrared_X_train, noinfrared_y_train)
noinfrared_test_xgbmatrix = xgb.DMatrix(noinfrared_X_test, noinfrared_y_test)

In [ ]:
noinfrared_booster = xgb.train(params=XGB_PARAM_FINAL, dtrain=noinfrared_train_xgbmatrix, num_boost_round=XGB_NUM_ROUNDS_FINAL)

85.63%


In [ ]:
noinfrared_y_predicted = noinfrared_booster.predict(noinfrared_test_xgbmatrix)

In [ ]:
noinfrared_accuracy = metrics.accuracy_score(noinfrared_y_test, noinfrared_y_predicted)

In [ ]:
noinfrared_accuracy

In [ ]:
noinfrared_cm = metrics.confusion_matrix(noinfrared_y_test, noinfrared_y_predicted)
noinfrared_cm_normalized = noinfrared_cm.astype("float") / noinfrared_cm.sum(axis=1)[:, np.newaxis]
noinfrared_cm_normalized *= 100

In [ ]:
with sns.axes_style("ticks"):
    fig, ax = plt.subplots(figsize=(10, 7.5))
    sns.heatmap(data=noinfrared_cm_normalized, annot=True, fmt=".2f", linewidths=0.5, square=True,
                vmin=0, vmax=100, ax=ax, xticklabels=ACTIVITIES, yticklabels=ACTIVITIES)
    sns.despine()
    plt.yticks(rotation=0)

In [ ]:
crosssubject_infrared_comparisons_list = list()

for activity_id, activity in enumerate(ACTIVITIES):
    crosssubject_result = {
        "Activity": activity,
        "Accuracy": crosssubject_1_cm_normalized[activity_id, activity_id],
        "Condition": "All"
    }
    crosssubject_infrared_comparisons_list.append(crosssubject_result)

for activity_id, activity in enumerate(ACTIVITIES):
    crosssubject_result = {
        "Activity": activity,
        "Accuracy": noinfrared_cm_normalized[activity_id, activity_id],
        "Condition": "Without infrared"
    }
    crosssubject_infrared_comparisons_list.append(crosssubject_result)

crosssubject_infrared_comparisons_df = pd.DataFrame(crosssubject_infrared_comparisons_list)

In [ ]:
sns.palplot(sns.color_palette("coolwarm", n_colors=7))

In [ ]:
sns.palplot(sns.color_palette("hls", 8))

In [ ]:
sns.palplot(sns.color_palette("Set2", 10))

In [ ]:
flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]

In [ ]:
sns.palplot(sns.color_palette(flatui))

In [ ]:
with sns.axes_style("ticks"):
    fig, ax = plt.subplots(figsize=thesis_figsize)
    sns.barplot(x="Activity", y="Accuracy", hue="Condition", palette=[flatui[1], flatui[4]],
                data=crosssubject_infrared_comparisons_df, ax=ax)
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), frameon=True, framealpha=1, fancybox=True, shadow=True, ncol=2)
    sns.despine()
    
    for p in ax.patches:
        height = p.get_height()
        ax.text(p.get_x(), height + 3, "%1.2f" % (height / 100))

In [ ]:
crosssubject_infrared_comparisons_df

In [ ]:
# reading paper
91.858038 - 92.790536

In [ ]:
# phone
77.292500 - 64.903428

In [ ]:
# tablet
99.087616 - 64.430431

DEMO


In [ ]:
XGB_PARAM_DEMO = {
    "eta": 0.3,
    "gamma": 1,
    "lambda": 3,
    "alpha": 1,
    "max_depth": 6,
    "colsample_bytree": 0.8,
    "subsample": 0.5,
    "objective": "multi:softmax",
    "eval_metric": "merror",
    "num_class": len(ACTIVITIES),
    "silent": 0,
}

XGB_NUM_ROUNDS_DEMO = 40

In [ ]:
X.shape

In [ ]:
y.shape

In [ ]:
demo_train_xgbmatrix = xgb.DMatrix(X, y)
demo_test_xgbmatrix = xgb.DMatrix(X, y)
demo_watchlist = [(demo_train_xgbmatrix, "train"), (demo_test_xgbmatrix, "eval")]

In [ ]:
demo_results = {}
demo_booster = xgb.train(XGB_PARAM_DEMO, demo_train_xgbmatrix, XGB_NUM_ROUNDS_DEMO, demo_watchlist, evals_result=demo_results, early_stopping_rounds=20)

In [ ]:
demo_booster.save_model("demo-xgboost.model")

In [ ]:
bst2 = xgb.Booster(model_file="demo-xgboost.model")

In [ ]:
test_dmatrix = xgb.DMatrix(X)
y_predicted = bst2.predict(test_dmatrix)
accuracy = metrics.accuracy_score(y, y_predicted)

In [ ]:
accuracy

CROSS SUBJECT TEST 2 ALL COMBINATIONS


In [ ]:
cs_combinations = list(itertools.combinations(unique_subjects, int(len(unique_subjects)/2)))

In [ ]:
len(cs_combinations)

In [ ]:
cs_combinations_results_csv = "E:\\angel\\data\\topviewkinect\\all\\iss-cs_combinations.csv"

In [ ]:
open(cs_combinations_results_csv, "w").close()
with open(cs_combinations_results_csv, "a") as f:
    data_columns = pd.DataFrame(columns=["combination", "activity", "a1", "a2", "a3", "a4", "a5", "a6"])
    data_columns.to_csv(f, header=True, index=False)

In [ ]:
with open(cs_combinations_results_csv, "a") as f:
    for cs_combination_idx, subjects_combination in enumerate(cs_combinations):
        print(cs_combination_idx)

        cs_combination_X_train = np.array([], dtype=np.float64).reshape(0, num_features)
        cs_combination_y_train = np.array([], dtype=np.int32).reshape(0, 1)
        cs_combination_X_test = np.array([], dtype=np.float64).reshape(0, num_features)
        cs_combination_y_test = np.array([], dtype=np.int32).reshape(0, 1)

        for subject_id in unique_subjects:
            subject_features = features_df[:][features_df["subject"] == subject_id]
            subject_features.drop(["subject"], axis=1, inplace=True)
            subject_labels = labels_df[:][labels_df["subject"] == subject_id]
            subject_labels.drop(["subject"], axis=1, inplace=True)
            subject_X = subject_features.values
            subject_y = subject_labels.values

            if subject_id in subjects_combination:
                cs_combination_X_train = np.vstack([cs_combination_X_train, subject_X])
                cs_combination_y_train = np.vstack([cs_combination_y_train, subject_y])
            else:
                cs_combination_X_test = np.vstack([cs_combination_X_test, subject_X])
                cs_combination_y_test = np.vstack([cs_combination_y_test, subject_y])

        cs_combination_train_xgbmatrix = xgb.DMatrix(cs_combination_X_train, cs_combination_y_train)
        cs_combination_test_xgbmatrix = xgb.DMatrix(cs_combination_X_test, cs_combination_y_test)

        cs_combination_booster = xgb.train(XGB_PARAM_FINAL, dtrain=cs_combination_train_xgbmatrix, 
                                           num_boost_round=XGB_NUM_ROUNDS_FINAL)
        cs_combination_y_predicted = cs_combination_booster.predict(cs_combination_test_xgbmatrix)
        result = metrics.confusion_matrix(cs_combination_y_test, cs_combination_y_predicted)
        
        data = pd.DataFrame(columns=["combination", "activity", "a1", "a2", "a3", "a4", "a5", "a6"])
        for activity_id, activity in enumerate(ACTIVITIES):
            data.loc[activity_id] = [cs_combination_idx, activity, 
                                     result[activity_id,0], 
                                     result[activity_id,1], 
                                     result[activity_id,2], 
                                     result[activity_id,3], 
                                     result[activity_id,4], 
                                     result[activity_id,5]]
            
        data.to_csv(f, header=False, index=False)

In [ ]:
cs_combinations_results_pd = pd.read_csv(cs_combinations_results_csv)

In [ ]:
cs_combinations_results_pd

In [ ]:
noinfrared_cm

In [ ]:
# per-activity slices of the combinations results
combinations_standing = cs_combinations_results_pd[:][cs_combinations_results_pd["activity"] == "Standing"]
combinations_sitting = cs_combinations_results_pd[:][cs_combinations_results_pd["activity"] == "Sitting"]
combinations_pointing = cs_combinations_results_pd[:][cs_combinations_results_pd["activity"] == "Pointing"]
combinations_phone = cs_combinations_results_pd[:][cs_combinations_results_pd["activity"] == "Phone"]
combinations_tablet = cs_combinations_results_pd[:][cs_combinations_results_pd["activity"] == "Tablet"]
combinations_paper = cs_combinations_results_pd[:][cs_combinations_results_pd["activity"] == "Paper"]

standing_cm = list()
sitting_cm = list()
pointing_cm = list()
phone_cm = list()
tablet_cm = list()
paper_cm = list()


for col in ["a1", "a2", "a3", "a4", "a5", "a6"]:
    standing_cm.append(combinations_standing[col].sum())
    sitting_cm.append(combinations_sitting[col].sum())
    pointing_cm.append(combinations_pointing[col].sum())
    phone_cm.append(combinations_phone[col].sum())
    tablet_cm.append(combinations_tablet[col].sum())
    paper_cm.append(combinations_paper[col].sum())

combinations_cm = np.array([
        standing_cm, sitting_cm, pointing_cm, phone_cm, tablet_cm, paper_cm
    ])

In [ ]:
combinations_cm

In [ ]:
all_samples = np.sum(combinations_cm)

In [ ]:
accurate_samples = 0
for activity_id in range(len(ACTIVITIES)):
    accurate_samples += combinations_cm[activity_id, activity_id]

In [ ]:
combinations_accuracy = accurate_samples / all_samples

92.34%


In [ ]:
combinations_accuracy

In [ ]:
combinations_cm_normalized = combinations_cm.astype("float") / combinations_cm.sum(axis=1)[:, np.newaxis]
combinations_cm_normalized *= 100

In [ ]:
combinations_cm_normalized

In [ ]:
with sns.axes_style("ticks"):
    fig, ax = plt.subplots(figsize=(10, 7.5))
    sns.heatmap(data=combinations_cm_normalized, annot=True, fmt=".2f", linewidths=0.5, square=True,
                vmin=0, vmax=100, ax=ax, xticklabels=ACTIVITIES, yticklabels=ACTIVITIES)
    sns.despine()
    plt.yticks(rotation=0)