In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import math
import itertools
In [2]:
# Root folder that is scanned for numerically named dataset sub-folders.
# The EAGLESENSE_DATA_DIR environment variable, when set, overrides the
# original developer-machine default so the notebook can run elsewhere
# without editing this cell.
DATA_INPUT_DIR = os.environ.get(
    "EAGLESENSE_DATA_DIR",
    "D:\\p_eaglesense\\eaglesense\\data\\topviewkinect",
)
In [3]:
# Folder that receives the CSV files produced by this notebook.
# os.path.join avoids hard-coding the Windows separator, and exist_ok=True
# creates the folder without a separate check-then-create step.
DATA_OUTPUT_DIR = os.path.join(DATA_INPUT_DIR, "v2-models")
os.makedirs(DATA_OUTPUT_DIR, exist_ok=True)
In [4]:
# Ids of the datasets to merge: every immediate sub-folder of DATA_INPUT_DIR
# whose name consists of digits only, converted to int and sorted ascending.
data_dirs = sorted(
    int(name)
    for name in next(os.walk(DATA_INPUT_DIR))[1]
    if name.isdigit()
)
In [5]:
data_dirs
Out[5]:
In [6]:
# Columns removed from each features.csv before it is merged.
ignored_X_cols = [
    "frame_id",
    "skeleton_id",
    "x",
    "y",
    "z",
]
# Columns removed from each labels.csv before it is merged.
ignored_y_cols = [
    "frame_id",
    "skeleton_id",
    "orientation",
    "orientation_accurate",
]
In [7]:
# Destination files for the merged (all datasets) feature and label tables.
# NOTE(review): the balanced export further down builds the very same two
# paths, so it overwrites these files - confirm that is intended.
all_X_csv, all_y_csv = (
    "{d}/{tag}_X.csv".format(d=DATA_OUTPUT_DIR, tag="v2"),
    "{d}/{tag}_y.csv".format(d=DATA_OUTPUT_DIR, tag="v2"),
)
In [42]:
# Merge the features/labels of every dataset into one pair of CSV files.
#
# Both outputs are opened once in write mode (which truncates any previous
# content) inside a ``with`` block, so the handles are closed even when a
# dataset fails to load. pandas documents that a file object handed to
# ``to_csv`` should be opened with newline=""; otherwise platform newline
# translation can double the carriage returns on Windows.
with open(all_X_csv, "w", newline="") as all_X_file, \
        open(all_y_csv, "w", newline="") as all_y_file:
    write_header = True  # only the first dataset writes the column names
    for dataset_id in data_dirs:
        # read data
        X_csv = "{d}/{dataset}/features.csv".format(d=DATA_INPUT_DIR, dataset=dataset_id)
        X_df = pd.read_csv(X_csv, dtype=np.float64)
        y_csv = "{d}/{dataset}/labels.csv".format(d=DATA_INPUT_DIR, dataset=dataset_id)
        y_df = pd.read_csv(y_csv, dtype=np.float64)

        # get single skeleton activities (skeleton 0, activity ids 0..5)
        y_df = y_df.loc[y_df["skeleton_id"] == 0]
        y_df = y_df.loc[y_df["activity"].isin(list(range(0, 6)))]

        # get common frame indices
        X_df = X_df.loc[X_df["frame_id"].isin(y_df["frame_id"].values)]
        y_df = y_df.loc[y_df["frame_id"].isin(X_df["frame_id"].values)]

        # The two files are written independently and later paired purely by
        # row position, so the surviving rows must line up one-to-one here.
        # Only the labels were filtered by skeleton_id, so this is not a given.
        if not np.array_equal(X_df["frame_id"].values, y_df["frame_id"].values):
            raise ValueError(
                "dataset {}: features and labels do not line up row by row "
                "after filtering ({} feature rows vs {} label rows)".format(
                    dataset_id, len(X_df), len(y_df)))

        # drop unnecessary columns
        X_df = X_df.drop(labels=ignored_X_cols, axis=1)
        y_df = y_df.drop(labels=ignored_y_cols, axis=1)

        # add dataset id
        X_df["dataset_id"] = dataset_id
        y_df["dataset_id"] = dataset_id

        # save to csv
        X_df = X_df.astype("float64")
        X_df.to_csv(all_X_file, header=write_header, index=False)
        y_df = y_df.astype("int")
        y_df.to_csv(all_y_file, header=write_header, index=False)
        write_header = False
        print(dataset_id, "Done!")
In [9]:
# Display names of the activity classes; the list position is used as the
# tick label order of the confusion-matrix plot.
ACTIVITIES = [
    "Standing",
    "Sitting",
    "Pointing",
    "Phone",
    "Tablet",
    "Paper",
]
In [10]:
# Number of activity classes, derived from the ACTIVITIES name list.
num_activities = len(ACTIVITIES)
In [11]:
num_activities
Out[11]:
In [12]:
# Reload the merged feature and label tables from disk.
# ``np.int`` was only an alias of the builtin ``int`` and no longer exists in
# current NumPy releases, so the builtin is passed directly.
X_all_df = pd.read_csv(all_X_csv, dtype=np.float64)
y_all_df = pd.read_csv(all_y_csv, dtype=int)
In [13]:
X_all_df.shape, y_all_df.shape
Out[13]:
In [14]:
X_all_df.head()
Out[14]:
In [15]:
y_all_df.head()
Out[15]:
In [16]:
# Copies of the merged tables without the dataset_id bookkeeping column.
# NOTE(review): X_vec is not referenced again in the visible cells.
X_vec = X_all_df.drop(["dataset_id"], axis=1)
y_vec = y_all_df.drop(["dataset_id"], axis=1)
In [19]:
# Occurrence count per label value.
# assumes y_vec is left with a single integer label column - TODO confirm
y_bins = np.bincount(np.squeeze(y_vec))
In [20]:
y_bins
Out[20]:
In [21]:
# Size of the smallest class; used below as the per-class sample size.
y_min_count = min(y_bins)
In [22]:
y_min_count
Out[22]:
In [23]:
# Empty frames with the merged tables' columns, filled by the balancing step.
# The builtin ``int`` replaces ``np.int``, an alias that current NumPy
# releases no longer provide.
X_balanced_df = pd.DataFrame(columns=X_all_df.columns, dtype=np.float64)
y_balanced_df = pd.DataFrame(columns=y_all_df.columns, dtype=int)
In [25]:
# Down-sample every activity to y_min_count rows so all classes are equally
# represented.
#
# The per-class pieces are collected in lists and concatenated once:
# ``DataFrame.append`` was removed in pandas 2.0 and copied the whole frame on
# every call. Building the result from scratch also makes the cell safe to
# re-run, whereas appending to the existing frames duplicated the rows.
balanced_X_parts = []
balanced_y_parts = []
for activity_id in range(num_activities):
    samples_y = y_all_df.loc[y_all_df["activity"] == activity_id]
    samples_y = samples_y.sample(y_min_count, replace=False, random_state=42)
    # Features and labels are paired by row position; the sampled index labels
    # are used as positions, which relies on the default 0..n-1 index that
    # read_csv produced for both tables.
    samples_X = X_all_df.iloc[samples_y.index]
    balanced_X_parts.append(samples_X)
    balanced_y_parts.append(samples_y)

X_balanced_df = pd.concat(balanced_X_parts, ignore_index=True)
y_balanced_df = pd.concat(balanced_y_parts, ignore_index=True)
In [26]:
# Force a float64 dtype on every feature column of the balanced table.
X_balanced_df = X_balanced_df.astype(np.float64)
In [27]:
# Force an integer dtype on the balanced labels. The builtin ``int`` replaces
# ``np.int``, an alias that current NumPy releases no longer provide.
y_balanced_df = y_balanced_df.astype(int)
In [28]:
X_balanced_df.shape, y_balanced_df.shape
Out[28]:
In [30]:
# Final model inputs: the balanced tables without the dataset_id column.
X_final_df = X_balanced_df.drop(labels=["dataset_id"], axis=1)
y_final_df = y_balanced_df.drop(labels=["dataset_id"], axis=1)
In [31]:
X_final_df.head()
Out[31]:
In [32]:
y_final_df.head()
Out[32]:
In [43]:
# Output paths for the balanced feature and label tables.
# NOTE(review): these expressions are identical to the ones that build
# all_X_csv / all_y_csv earlier, so writing to them replaces the merged
# (header-bearing) files - confirm that this is intended.
X_final_csv = "{d}/{tag}_X.csv".format(d=DATA_OUTPUT_DIR, tag="v2")
y_final_csv = "{d}/{tag}_y.csv".format(d=DATA_OUTPUT_DIR, tag="v2")
In [44]:
# Write the balanced tables without a header row and without the index.
# Passing the path lets pandas open and close the file itself, so no handle
# stays open if the export fails and line endings are handled by pandas
# rather than by a text-mode handle.
X_final_df.to_csv(X_final_csv, header=False, index=False)
y_final_df.to_csv(y_final_csv, header=False, index=False)
In [45]:
def sample_test_split(X_df, y_df, train_ratio, seed=42):
    """Randomly split feature and label frames into train and test arrays.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Feature rows.
    y_df : pandas.DataFrame
        Label rows; they are looked up with the index labels of ``X_df``.
    train_ratio : float
        Fraction of the rows (between 0 and 1) drawn for training. The
        resulting row count is rounded down.
    seed : int, optional
        ``random_state`` used for the draw, so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` as the ``.values`` arrays of
        the selected rows. Test rows keep their original order.

    Raises
    ------
    ValueError
        If ``train_ratio`` is outside ``[0, 1]``.
    """
    # Fail early with a clear message instead of an error from deep inside
    # DataFrame.sample.
    if not 0 <= train_ratio <= 1:
        raise ValueError(
            "train_ratio must be between 0 and 1, got {!r}".format(train_ratio))

    train_size = math.floor(len(X_df) * train_ratio)
    X_train_df = X_df.sample(train_size, replace=False, random_state=seed)
    y_train_df = y_df.loc[X_train_df.index]

    # Everything that was not drawn for training becomes the test split.
    held_out = ~X_df.index.isin(X_train_df.index)
    X_test_df = X_df.loc[held_out]
    y_test_df = y_df.loc[X_test_df.index]

    return X_train_df.values, y_train_df.values, X_test_df.values, y_test_df.values
In [77]:
# Complete balanced data as plain arrays (used for the final fit on all rows).
X, y = X_final_df.values, y_final_df.values
In [78]:
# 70 % of the balanced rows go to training, the rest to the test split.
X_train, y_train, X_test, y_test = sample_test_split(X_final_df, y_final_df, 7/10)
In [79]:
X.shape, y.shape
Out[79]:
In [47]:
X_train.shape, y_train.shape
Out[47]:
In [48]:
X_test.shape, y_test.shape
Out[48]:
In [49]:
import xgboost as xgb
from sklearn import metrics
In [57]:
# Booster configuration handed to xgb.train for every fit in this notebook.
# NOTE(review): newer XGBoost releases replaced "silent" with "verbosity" -
# confirm against the installed version.
XGB_PARAM_FINAL = {
    "eta": 0.3,
    "gamma": 1,
    "lambda": 1,
    "alpha": 0,
    "max_depth": 6,
    "colsample_bytree": 0.5,
    "colsample_bylevel": 0.5,
    "subsample": 0.5,
    "objective": "multi:softmax",
    "eval_metric": "merror",
    "num_class": len(ACTIVITIES),
    "silent": 0,
    "seed": 42,
}
# Upper bound on boosting rounds for the early-stopped validation run.
XGB_NUM_ROUNDS = 200
# Passed as early_stopping_rounds to that run.
XGB_EARLYSTOPPING_ROUNDS = 30
In [58]:
# Wrap both splits, features plus labels, in XGBoost's DMatrix container.
train_xgbmatrix = xgb.DMatrix(X_train, label=y_train)
test_xgbmatrix = xgb.DMatrix(X_test, label=y_test)
# Data sets whose metric is reported while training, with their display names.
watchlist = [
    (train_xgbmatrix, "train"),
    (test_xgbmatrix, "eval"),
]
In [59]:
# Validation run with early stopping; per-round metrics for every watchlist
# entry are collected in eval_results.
# NOTE(review): the "eval" entry is the test split, so the round count chosen
# here has already seen the data that the accuracy below is measured on.
eval_results = {}
validation = xgb.train(
    params=XGB_PARAM_FINAL,
    dtrain=train_xgbmatrix,
    num_boost_round=XGB_NUM_ROUNDS,
    evals=watchlist,
    evals_result=eval_results,
    early_stopping_rounds=XGB_EARLYSTOPPING_ROUNDS,
    verbose_eval=100,
)
In [65]:
validation.best_iteration+1
Out[65]:
In [66]:
# Refit on the training split for exactly the number of rounds selected by
# the early-stopped validation run (best_iteration is zero-based, hence +1).
booster = xgb.train(params=XGB_PARAM_FINAL, dtrain=train_xgbmatrix, num_boost_round=validation.best_iteration+1)
In [67]:
# Predictions of the refitted booster for the test split.
y_predicted = booster.predict(test_xgbmatrix)
In [68]:
# Accuracy score of the predictions against the test labels.
accuracy = metrics.accuracy_score(y_test, y_predicted)
In [69]:
accuracy
Out[69]:
In [71]:
def get_normalized_confusion_matrix(y_true, y_predicted, labels=None):
    """Return the confusion matrix with every row expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_predicted : array-like
        Predicted class labels.
    labels : array-like, optional
        Class labels, forwarded to ``metrics.confusion_matrix``, that fix the
        rows/columns of the matrix and their order. ``None`` (the default)
        keeps the previous behaviour of using the labels found in the data.

    Returns
    -------
    numpy.ndarray
        Float matrix in which each row (true class) is divided by its row
        total and multiplied by 100. A row without any true sample is
        returned as all zeros.
    """
    counts = metrics.confusion_matrix(y_true, y_predicted, labels=labels).astype("float")
    row_totals = counts.sum(axis=1)[:, np.newaxis]
    # Divide only where the row has samples; a plain division would fill rows
    # of absent classes with NaN and emit a runtime warning.
    percentages = np.divide(
        counts, row_totals, out=np.zeros_like(counts), where=row_totals != 0)
    percentages *= 100
    return percentages
In [72]:
# Row-normalized (percent) confusion matrix of the test-split predictions.
confusion_matrix = get_normalized_confusion_matrix(y_test, y_predicted)
In [73]:
# Heat map of the normalized confusion matrix.
fig, ax = plt.subplots(figsize=(10, 7.5))
sns.heatmap(
    data=confusion_matrix, annot=True, fmt=".2f", linewidths=1, square=True,
    vmin=0, vmax=100, ax=ax, xticklabels=ACTIVITIES, yticklabels=ACTIVITIES,
    cmap=sns.cubehelix_palette(8),
)
# Rows of the matrix are the true classes and columns the predictions, and
# each row was scaled to percent; label the axes so the figure stands alone.
ax.set_xlabel("Predicted activity")
ax.set_ylabel("True activity")
ax.set_title("Normalized confusion matrix (% of true class)")
plt.yticks(rotation=0)
# Keep all four borders of the plot visible.
sns.despine(top=False, right=False, left=False, bottom=False)
In [81]:
# DMatrix over the complete balanced data set (train and test rows together).
all_dmatrix = xgb.DMatrix(X, y)
In [82]:
# Final model: same parameters and round count, fitted on all balanced rows.
final_booster = xgb.train(params=XGB_PARAM_FINAL, dtrain=all_dmatrix, num_boost_round=validation.best_iteration+1)
In [83]:
# Persist the final model. The path is relative, so the file is created in
# the current working directory rather than in DATA_OUTPUT_DIR.
final_booster.save_model("v2.model")