In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import math
import itertools

In [2]:
DATA_INPUT_DIR = "D:\\p_eaglesense\\eaglesense\\data\\topviewkinect"

In [3]:
DATA_OUTPUT_DIR = DATA_INPUT_DIR + "\\v2-models"
if not os.path.exists(DATA_OUTPUT_DIR):
    os.makedirs(DATA_OUTPUT_DIR)

In [4]:
data_dirs = []
for subdir in sorted(next(os.walk(DATA_INPUT_DIR))[1]):
    if subdir.isdigit():
        data_dirs.append(int(subdir))
data_dirs.sort()

In [5]:
data_dirs


Out[5]:
[1, 2, 3, 4, 5, 6]

In [6]:
ignored_X_cols = ["frame_id", "skeleton_id", "x", "y", "z"]
ignored_y_cols = ["frame_id", "skeleton_id", "orientation", "orientation_accurate"]

In [7]:
all_X_csv = "{d}/{tag}_X.csv".format(d=DATA_OUTPUT_DIR, tag="v2")
all_y_csv = "{d}/{tag}_y.csv".format(d=DATA_OUTPUT_DIR, tag="v2")

In [42]:
all_X_file = open(all_X_csv, "w")
all_y_file = open(all_y_csv, "w")

write_header = True

for dataset_id in data_dirs:
    # read data
    X_csv = "{d}/{dataset}/features.csv".format(d=DATA_INPUT_DIR, dataset=dataset_id)
    X_df = pd.read_csv(X_csv, dtype=np.float64)
    y_csv = "{d}/{dataset}/labels.csv".format(d=DATA_INPUT_DIR, dataset=dataset_id)
    y_df = pd.read_csv(y_csv, dtype=np.float64)

    # get single skeleton activities
    y_df = y_df.loc[y_df["skeleton_id"] == 0]
    y_df = y_df.loc[y_df["activity"].isin(list(range(0, 6)))]

    # get common frame indices
    X_df = X_df.loc[X_df["frame_id"].isin(y_df["frame_id"].values)]
    y_df = y_df.loc[y_df["frame_id"].isin(X_df["frame_id"].values)]
    
    # drop unnecessary columns
    X_df = X_df.drop(labels=ignored_X_cols, axis=1)
    y_df = y_df.drop(labels=ignored_y_cols, axis=1)
    
    # add dataset id
    X_df["dataset_id"] = dataset_id
    y_df["dataset_id"] = dataset_id
    
    # save to csv
    X_df = X_df.astype("float64")
    X_df.to_csv(all_X_file, header=write_header, index=False)
    y_df = y_df.astype("int")
    y_df.to_csv(all_y_file, header=write_header, index=False)
    
    write_header = False
    
    print(dataset_id, "Done!")
    
all_X_file.close()
all_y_file.close()


1 Done!
2 Done!
3 Done!
4 Done!
5 Done!
6 Done!
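As an aside, the same concatenation could be done in memory with pd.concat and written once, avoiding the open file handles. A minimal sketch (an addition, not part of the original run), reusing the paths and column lists defined above:

frames_X, frames_y = [], []
for dataset_id in data_dirs:
    X_df = pd.read_csv("{d}/{dataset}/features.csv".format(d=DATA_INPUT_DIR, dataset=dataset_id), dtype=np.float64)
    y_df = pd.read_csv("{d}/{dataset}/labels.csv".format(d=DATA_INPUT_DIR, dataset=dataset_id), dtype=np.float64)
    # single-skeleton frames with a valid activity label
    y_df = y_df.loc[(y_df["skeleton_id"] == 0) & (y_df["activity"].isin(range(6)))]
    # keep only the frames both tables share
    X_df = X_df.loc[X_df["frame_id"].isin(y_df["frame_id"])]
    y_df = y_df.loc[y_df["frame_id"].isin(X_df["frame_id"])]
    frames_X.append(X_df.drop(labels=ignored_X_cols, axis=1).assign(dataset_id=dataset_id))
    frames_y.append(y_df.drop(labels=ignored_y_cols, axis=1).assign(dataset_id=dataset_id))
pd.concat(frames_X, ignore_index=True).astype("float64").to_csv(all_X_csv, index=False)
pd.concat(frames_y, ignore_index=True).astype("int").to_csv(all_y_csv, index=False)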

In [9]:
ACTIVITIES = ["Standing", "Sitting", "Pointing", "Phone", "Tablet", "Paper"]

In [10]:
num_activities = len(ACTIVITIES)

In [11]:
num_activities


Out[11]:
6

In [12]:
X_all_df = pd.read_csv(all_X_csv, dtype=np.float64)
y_all_df = pd.read_csv(all_y_csv, dtype=np.int64)

In [13]:
X_all_df.shape, y_all_df.shape


Out[13]:
((7762, 73), (7752, 2))
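Note that the row counts differ: X_all_df has 7762 rows but y_all_df has 7752, so features.csv evidently carries rows for more than one skeleton in some frames and the two tables do not align one-to-one by position. A minimal guard (an addition; here it would fire, flagging the mismatch that the positional pairing below quietly assumes away):

assert len(X_all_df) == len(y_all_df), "features/labels row counts differ; positional pairing assumes alignment"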

In [14]:
X_all_df.head()


Out[14]:
layer_area_0 layer_area_1 layer_area_2 layer_contours_0 layer_contours_1 layer_distance_0 layer_distance_1 layer_distance_2 layer_distance_3 layer_distance_4 ... interlayer_pos_16 interlayer_pos_17 extremities0 extreme_infrared_0 extreme_infrared_1 extreme_infrared_2 extreme_infrared_3 extreme_infrared_4 extreme_infrared_5 dataset_id
0 0.433004 0.425952 0.141044 3.0 3.0 29.8329 26.3059 27.4591 26.3059 267.5 ... -33.0 -194.0 4.0 71.5 61.5 86.5 81.5 0.0 197.5 1.0
1 0.345013 0.502695 0.152291 3.0 3.0 31.0161 38.0132 29.0172 38.0132 297.5 ... -32.0 -191.0 4.0 53.5 54.0 74.5 94.0 0.0 103.5 1.0
2 0.418367 0.428571 0.153061 3.0 3.0 29.5466 27.7308 26.6833 27.7308 216.0 ... -32.0 -190.0 2.0 52.5 54.0 0.0 0.0 0.0 54.0 1.0
3 0.452675 0.397119 0.150206 3.0 3.0 26.4008 23.2594 25.0799 23.2594 164.0 ... -31.0 -186.0 2.0 15.5 57.5 0.0 0.0 0.0 57.5 1.0
4 0.486784 0.400881 0.112335 3.0 3.0 25.0599 19.6469 24.0208 19.6469 157.0 ... -31.0 -188.0 5.0 12.0 19.5 68.0 83.5 87.5 131.0 1.0

5 rows × 73 columns


In [15]:
y_all_df.head()


Out[15]:
activity dataset_id
0 0 1
1 0 1
2 0 1
3 0 1
4 0 1

In [16]:
X_vec = X_all_df.drop(["dataset_id"], axis=1)
y_vec = y_all_df.drop(["dataset_id"], axis=1)

In [19]:
y_bins = np.bincount(np.squeeze(y_vec))

In [20]:
y_bins


Out[20]:
array([ 912,  900, 1161, 1497, 1561, 1721], dtype=int64)
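The imbalance is easier to see plotted. A quick sketch using the seaborn import above (an addition; figure not shown):

fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x=ACTIVITIES, y=y_bins, ax=ax)
ax.set_xlabel("Activity")
ax.set_ylabel("Number of samples")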

In [21]:
y_min_count = min(y_bins)

In [22]:
y_min_count


Out[22]:
900

In [23]:
X_balanced_df = pd.DataFrame(columns=X_all_df.columns, dtype=np.float64)
y_balanced_df = pd.DataFrame(columns=y_all_df.columns, dtype=np.int64)

In [25]:
for activity_id in range(num_activities):
    samples_y = y_all_df.loc[y_all_df["activity"] == activity_id]
    samples_y = samples_y.sample(y_min_count, replace=False, random_state=42)
    # pair features by row position (assumes X_all_df and y_all_df rows align one-to-one)
    samples_X = X_all_df.iloc[samples_y.index]
    X_balanced_df = pd.concat([X_balanced_df, samples_X], ignore_index=True)
    y_balanced_df = pd.concat([y_balanced_df, samples_y], ignore_index=True)

In [26]:
X_balanced_df = X_balanced_df.astype(np.float64)

In [27]:
y_balanced_df = y_balanced_df.astype(np.int64)

In [28]:
X_balanced_df.shape, y_balanced_df.shape


Out[28]:
((5400, 73), (5400, 2))
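A quick verification (an addition) that each activity now contributes exactly y_min_count samples:

print(np.bincount(y_balanced_df["activity"]))  # expected: [900 900 900 900 900 900]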

In [30]:
X_final_df = X_balanced_df.drop(labels=["dataset_id"], axis=1)
y_final_df = y_balanced_df.drop(labels=["dataset_id"], axis=1)

In [31]:
X_final_df.head()


Out[31]:
layer_area_0 layer_area_1 layer_area_2 layer_contours_0 layer_contours_1 layer_distance_0 layer_distance_1 layer_distance_2 layer_distance_3 layer_distance_4 ... interlayer_pos_15 interlayer_pos_16 interlayer_pos_17 extremities0 extreme_infrared_0 extreme_infrared_1 extreme_infrared_2 extreme_infrared_3 extreme_infrared_4 extreme_infrared_5
0 0.493197 0.362245 0.144558 3.0 3.0 26.5707 25.9422 26.9258 25.9422 180.5 ... -14.0 -33.0 -174.0 3.0 34.5 43.0 81.0 0.0 0.0 102.0
1 0.155372 0.598347 0.246281 3.0 3.0 13.1529 29.1548 23.0217 29.1548 326.0 ... -10.0 -26.0 -154.0 5.0 10.0 37.0 72.5 57.0 56.5 97.5
2 0.455904 0.393124 0.150972 3.0 3.0 26.4008 28.2312 29.6142 28.2312 224.5 ... -14.0 -33.0 -169.0 5.0 55.5 26.0 83.0 93.5 60.5 192.5
3 0.157895 0.441426 0.400679 3.0 3.0 12.3693 27.6586 25.3180 27.6586 226.5 ... -17.0 -27.0 -137.0 5.0 3.5 62.0 55.0 48.5 17.5 116.5
4 0.170498 0.419540 0.409962 3.0 3.0 13.0384 26.0768 25.0200 26.0768 189.5 ... -17.0 -24.0 -137.0 3.0 11.5 28.5 22.0 0.0 0.0 28.5

5 rows × 72 columns


In [32]:
y_final_df.head()


Out[32]:
activity
0 0
1 0
2 0
3 0
4 0

In [43]:
X_final_csv = "{d}/{tag}_X.csv".format(d=DATA_OUTPUT_DIR, tag="v2")
y_final_csv = "{d}/{tag}_y.csv".format(d=DATA_OUTPUT_DIR, tag="v2")

In [44]:
X_final_df.to_csv(X_final_csv, header=False, index=False)
y_final_df.to_csv(y_final_csv, header=False, index=False)

In [45]:
def sample_test_split(X_df, y_df, train_ratio, seed=42):
    
    train_size = math.floor(len(X_df) * train_ratio)
    
    X_train_df = X_df.sample(train_size, replace=False, random_state=seed)
    y_train_df = y_df.loc[X_train_df.index]
    
    X_test_df = X_df.loc[~X_df.index.isin(X_train_df.index)]
    y_test_df = y_df.loc[X_test_df.index]
    
    X_train, y_train = X_train_df.values, y_train_df.values
    X_test, y_test = X_test_df.values, y_test_df.values
    
    return X_train, y_train, X_test, y_test
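Because this helper draws a plain random sample, the 70/30 split is not guaranteed to keep the six classes exactly balanced in each half. A stratified alternative using scikit-learn (a sketch; train_test_split is not used elsewhere in this notebook):

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_final_df.values, y_final_df.values.ravel(),
    train_size=0.7, stratify=y_final_df.values.ravel(), random_state=42)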

In [77]:
X, y = X_final_df.values, y_final_df.values

In [78]:
X_train, y_train, X_test, y_test = sample_test_split(X_final_df, y_final_df, 7/10)

In [79]:
X.shape, y.shape


Out[79]:
((5400, 72), (5400, 1))

In [47]:
X_train.shape, y_train.shape


Out[47]:
((3779, 72), (3779, 1))

In [48]:
X_test.shape, y_test.shape


Out[48]:
((1621, 72), (1621, 1))

In [49]:
import xgboost as xgb
from sklearn import metrics

In [57]:
XGB_PARAM_FINAL = {}
XGB_PARAM_FINAL["eta"] = 0.3
XGB_PARAM_FINAL["gamma"] = 1
XGB_PARAM_FINAL["lambda"] = 1
XGB_PARAM_FINAL["alpha"] = 0
XGB_PARAM_FINAL["max_depth"] = 6
XGB_PARAM_FINAL["colsample_bytree"] = 0.5
XGB_PARAM_FINAL["colsample_bylevel"] = 0.5
XGB_PARAM_FINAL["subsample"] = 0.5
XGB_PARAM_FINAL["objective"] = "multi:softmax"
XGB_PARAM_FINAL["eval_metric"] = "merror"
XGB_PARAM_FINAL["num_class"] = len(ACTIVITIES)
XGB_PARAM_FINAL["silent"] = 0
XGB_PARAM_FINAL["seed"] = 42
XGB_NUM_ROUNDS = 200
XGB_EARLYSTOPPING_ROUNDS = 30

In [58]:
train_xgbmatrix = xgb.DMatrix(X_train, y_train)
test_xgbmatrix = xgb.DMatrix(X_test, y_test)
watchlist = [(train_xgbmatrix, "train"), (test_xgbmatrix, "eval")]
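The number of boosting rounds could also be chosen by cross-validation rather than a single held-out split. A minimal sketch with xgboost's built-in xgb.cv (an addition, not part of the original run):

cv_results = xgb.cv(params=XGB_PARAM_FINAL, dtrain=train_xgbmatrix,
                    num_boost_round=XGB_NUM_ROUNDS, nfold=5,
                    early_stopping_rounds=XGB_EARLYSTOPPING_ROUNDS, seed=42)
print(len(cv_results))  # rounds kept after early stopping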

In [59]:
eval_results = {}
validation = xgb.train(params=XGB_PARAM_FINAL, dtrain=train_xgbmatrix, evals=watchlist, evals_result=eval_results, 
                       num_boost_round=XGB_NUM_ROUNDS, early_stopping_rounds=XGB_EARLYSTOPPING_ROUNDS, verbose_eval=100)


[0]	train-merror:0.365705	eval-merror:0.42628
Multiple eval metrics have been passed: 'eval-merror' will be used for early stopping.

Will train until eval-merror hasn't improved in 30 rounds.
[100]	train-merror:0.014554	eval-merror:0.164713
Stopping. Best iteration:
[106]	train-merror:0.013496	eval-merror:0.159161


In [65]:
validation.best_iteration+1


Out[65]:
107

In [66]:
booster = xgb.train(params=XGB_PARAM_FINAL, dtrain=train_xgbmatrix, num_boost_round=validation.best_iteration+1)

In [67]:
y_predicted = booster.predict(test_xgbmatrix)

In [68]:
accuracy = metrics.accuracy_score(y_test, y_predicted)

In [69]:
accuracy


Out[69]:
0.8408389882788402
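Overall accuracy hides per-class behavior. A per-class breakdown with scikit-learn's classification_report (an addition, not in the original run):

print(metrics.classification_report(y_test, y_predicted, target_names=ACTIVITIES))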

In [71]:
def get_normalized_confusion_matrix(y_true, y_predicted):
    confusion_matrix = metrics.confusion_matrix(y_true, y_predicted)
    confusion_matrix_normalized = confusion_matrix.astype("float") / confusion_matrix.sum(axis=1)[:, np.newaxis]
    confusion_matrix_normalized *= 100
    return confusion_matrix_normalized
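For example, a row of raw counts [30, 10] (40 true samples of a class, 30 classified correctly) normalizes to [75.00, 25.00], so every row of the heatmap below sums to 100.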

In [72]:
confusion_matrix = get_normalized_confusion_matrix(y_test, y_predicted)

In [73]:
fig, ax = plt.subplots(figsize=(10,7.5))
sns.heatmap(data=confusion_matrix, annot=True, fmt=".2f", linewidths=1, square=True,
            vmin=0, vmax=100, ax=ax, xticklabels=ACTIVITIES, yticklabels=ACTIVITIES, cmap=sns.cubehelix_palette(8))
plt.yticks(rotation=0)
sns.despine(top=False, right=False, left=False, bottom=False)



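The trained booster's feature importances can also be inspected. A short sketch using xgboost's plotting helper (an addition; feature names default to f0, f1, ... because the DMatrix was built from a bare NumPy array):

fig, ax = plt.subplots(figsize=(8, 10))
xgb.plot_importance(booster, ax=ax, max_num_features=20)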
In [81]:
all_dmatrix = xgb.DMatrix(X, y)

In [82]:
final_booster = xgb.train(params=XGB_PARAM_FINAL, dtrain=all_dmatrix, num_boost_round=validation.best_iteration+1)

In [83]:
final_booster.save_model("v2.model")
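As a round-trip sanity check (an addition, not in the original notebook), the saved model can be reloaded and used for prediction, assuming the same 72-column feature layout:

restored_booster = xgb.Booster()
restored_booster.load_model("v2.model")
restored_predictions = restored_booster.predict(xgb.DMatrix(X_test))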