In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn import grid_search
from sklearn import metrics
from sklearn import cross_validation
from sklearn.externals import joblib
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import operator
import itertools
import random
In [324]:
# Global plot styling.
# BUG FIX: sns.set(font_scale=...) resets the context back to "notebook",
# so the previous sns.set_context("paper") call was silently undone.
# Passing both settings in one call keeps the "paper" context.
sns.set(context="paper", font_scale=1.5)
thesis_figsize = (12, 9)  # intended default figure size for thesis figures (unused below — TODO confirm)
In [2]:
# Root directory containing the extracted topviewkinect feature/label CSVs.
# NOTE(review): hardcoded absolute Windows path — will not run on other machines.
FEATURES_DIRECTORY = "Q:\\p_eaglesense\\eaglesense\\data\\topviewkinect\\all"
In [3]:
# Dataset tag: "{tag}_features.csv" and "{tag}_labels.csv" are loaded below.
FEATURES_TAG = "test"
In [4]:
# Load the feature matrix CSV for the selected tag.
features_csv = "{root}/{tag}_features.csv".format(root=FEATURES_DIRECTORY, tag=FEATURES_TAG)
features_df = pd.read_csv(features_csv)
In [5]:
# Load the matching per-frame activity labels CSV.
labels_csv = "{root}/{tag}_labels.csv".format(root=FEATURES_DIRECTORY, tag=FEATURES_TAG)
labels_df = pd.read_csv(labels_csv)
In [18]:
# Activity class names; index in this list corresponds to the integer label
# in the CSVs (presumably 0=Standing ... 5=Paper — TODO confirm against labeling tool).
ACTIVITIES = ["Standing", "Sitting", "Pointing", "Phone", "Tablet", "Paper"]
In [19]:
NUM_ACTIVITIES = len(ACTIVITIES)
In [20]:
# Feature matrix without the subject-id bookkeeping column.
features_vector = features_df.drop(["subject"], axis=1)
In [333]:
# Pairwise correlation between all features, visualized as a heatmap.
# Tick labels are suppressed — there are too many features to read them.
features_corr = features_vector.corr()
fig, ax = plt.subplots(figsize=(10, 7.5))
# Fix: the return value of sns.heatmap was bound to an unused variable `x`.
sns.heatmap(features_corr, ax=ax, xticklabels=False, yticklabels=False)
sns.despine()
plt.yticks(rotation=0)
Out[333]:
In [21]:
num_features = features_vector.shape[1]  # feature dimensionality
In [22]:
# Target: the integer activity label column only.
output_vector = labels_df[["activity"]]
In [23]:
X = features_vector.values  # (n_samples, num_features) feature array
In [24]:
y = output_vector.values  # (n_samples, 1) integer activity labels
In [25]:
# Subject ids present in the dataset (drives the cross-subject splits below).
unique_subjects = features_df["subject"].unique()
In [26]:
X.shape
Out[26]:
In [27]:
y.shape
Out[27]:
In [339]:
features_df.shape
Out[339]:
In [340]:
features_df.head()
Out[340]:
Out[340]:
In [341]:
# Cross-subject split #1: odd-numbered subject ids form the training set,
# even-numbered ids the test set.
# Improvement: accumulate per-subject arrays in lists and stack once at the
# end — repeated np.vstack inside the loop copies the growing array each
# iteration (quadratic in total rows).
train_X_parts, train_y_parts = [], []
test_X_parts, test_y_parts = [], []
for subject_id in unique_subjects:
    subject_features = features_df[features_df["subject"] == subject_id]
    subject_features = subject_features.drop(["subject"], axis=1)
    subject_labels = labels_df[labels_df["subject"] == subject_id]
    subject_labels = subject_labels[["activity"]]
    if subject_id % 2 == 1:
        print(subject_id, "\tTrain")
        train_X_parts.append(subject_features.values)
        train_y_parts.append(subject_labels.values)
    else:
        print(subject_id, "\tTest")
        test_X_parts.append(subject_features.values)
        test_y_parts.append(subject_labels.values)
# Single concatenation; fall back to correctly-shaped empty arrays if a side is empty.
crosssubject_1_X_train = np.vstack(train_X_parts) if train_X_parts else np.empty((0, num_features))
crosssubject_1_y_train = np.vstack(train_y_parts) if train_y_parts else np.empty((0, 1), dtype=np.int32)
crosssubject_1_X_test = np.vstack(test_X_parts) if test_X_parts else np.empty((0, num_features))
crosssubject_1_y_test = np.vstack(test_y_parts) if test_y_parts else np.empty((0, 1), dtype=np.int32)
In [342]:
crosssubject_1_X_train.shape
Out[342]:
In [343]:
crosssubject_1_X_test.shape
Out[343]:
In [344]:
# Wrap the split into xgboost's DMatrix format; the watchlist makes xgb.train
# report train/eval error every boosting round and drives early stopping.
crosssubject_1_train_xgbmatrix = xgb.DMatrix(crosssubject_1_X_train, crosssubject_1_y_train)
crosssubject_1_test_xgbmatrix = xgb.DMatrix(crosssubject_1_X_test, crosssubject_1_y_test)
crosssubject_1_watchlist = [(crosssubject_1_train_xgbmatrix, "train"), (crosssubject_1_test_xgbmatrix, "eval")]
In [359]:
# Final XGBoost hyper-parameters for the cross-subject experiment.
# Built as a single literal instead of 13 item assignments; values unchanged.
XGB_PARAM_FINAL = {
    "eta": 0.3,                 # learning rate
    "gamma": 1,                 # minimum loss reduction to split
    "lambda": 1,                # L2 regularization
    "alpha": 0,                 # L1 regularization
    "max_depth": 6,
    "colsample_bytree": 0.5,    # feature subsampling per tree
    "colsample_bylevel": 0.5,   # feature subsampling per level
    "subsample": 0.5,           # row subsampling per tree
    "objective": "multi:softmax",
    "eval_metric": "merror",    # multiclass error rate
    "num_class": NUM_ACTIVITIES,  # consistency: reuse the constant defined above
    "silent": 0,                # legacy verbosity flag (kept for old xgboost compatibility)
}
XGB_NUM_ROUNDS = 200            # maximum boosting rounds
XGB_EARLYSTOPPING_ROUNDS = 30   # stop if eval merror does not improve for this many rounds
In [360]:
# Validation run: train with early stopping against the held-out (even) subjects
# to find the best number of boosting rounds.
crosssubject_1_results = {}
cs_validation = xgb.train(params=XGB_PARAM_FINAL, dtrain=crosssubject_1_train_xgbmatrix, num_boost_round=XGB_NUM_ROUNDS,
evals=crosssubject_1_watchlist, evals_result=crosssubject_1_results,
early_stopping_rounds=XGB_EARLYSTOPPING_ROUNDS)
In [361]:
# Retrain for exactly the best iteration count found above (no early stopping).
# NOTE(review): the eval split already steered early stopping, so the test
# accuracy computed below is likely slightly optimistic.
crosssubject_1_booster = xgb.train(params=XGB_PARAM_FINAL, dtrain=crosssubject_1_train_xgbmatrix,
num_boost_round=cs_validation.best_iteration+1)
In [362]:
# Predict activity classes for the held-out subjects.
crosssubject_1_y_predicted = crosssubject_1_booster.predict(crosssubject_1_test_xgbmatrix)
In [363]:
# Overall accuracy on the held-out (even-numbered) subjects.
crosssubject_1_accuracy = metrics.accuracy_score(crosssubject_1_y_test, crosssubject_1_y_predicted)
crosssubject_1_accuracy
Out[363]:
In [364]:
# Row-normalized confusion matrix in percent (rows = true activity).
crosssubject_1_cm = metrics.confusion_matrix(crosssubject_1_y_test, crosssubject_1_y_predicted)
crosssubject_1_cm_normalized = crosssubject_1_cm.astype("float") / crosssubject_1_cm.sum(axis=1)[:, np.newaxis]
crosssubject_1_cm_normalized *= 100
In [365]:
# Confusion-matrix heatmap for the cross-subject split; cell values are
# percentages of each true class, so every row sums to ~100.
with sns.axes_style("ticks"):
    fig, ax = plt.subplots(figsize=(10, 7.5))
    sns.heatmap(data=crosssubject_1_cm_normalized, annot=True, fmt=".2f", linewidths=0.5, square=True,
                vmin=0, vmax=100, ax=ax, xticklabels=ACTIVITIES, yticklabels=ACTIVITIES)
    plt.yticks(rotation=0)
    sns.despine()
In [216]:
# Per-subject breakdown: accuracy and row-normalized confusion matrix (percent)
# for every subject, scored with the cross-subject booster trained above.
# NOTE(review): this changes numpy print formatting globally for the rest of the session.
np.set_printoptions(formatter={'float': lambda x: "{:.2f}".format(x)})
for subject_id in unique_subjects:
    # Same per-subject extraction as the train/test split cell above.
    subject_features = features_df[features_df["subject"] == subject_id]
    subject_features = subject_features.drop(["subject"], axis=1)
    subject_labels = labels_df[labels_df["subject"] == subject_id]
    subject_labels = subject_labels[["activity"]]
    subject_X = subject_features.values
    subject_y = subject_labels.values
    subject_xgbmatrix = xgb.DMatrix(subject_X, subject_y)
    subject_y_predicted = crosssubject_1_booster.predict(subject_xgbmatrix)
    subject_accuracy = metrics.accuracy_score(subject_y, subject_y_predicted)
    # Normalize rows to percentages of each true class.
    subject_confusion_matrix = metrics.confusion_matrix(subject_y, subject_y_predicted)
    subject_confusion_matrix = subject_confusion_matrix.astype("float") / subject_confusion_matrix.sum(axis=1)[:, np.newaxis]
    subject_confusion_matrix *= 100
    print(subject_id, subject_accuracy)
    print(subject_confusion_matrix)
In [217]:
# Per-feature importance keyed by generated names ("f0", "f1", ...).
# NOTE(review): get_fscore() returns split counts ("weight"), not information
# gain — use get_score(importance_type="gain") for actual gain.
feature_importance = crosssubject_1_booster.get_fscore()
In [218]:
# Map xgboost's generated feature names ("f0", "f1", ...) back to the real
# column names of the feature matrix.
# FIX(review): the values come from get_fscore(), which reports split counts
# ("F score" / "weight"), NOT information gain — the previous "gain" labels
# were misleading, so the column and axis are relabeled truthfully.
features_importance_formatted = {
    feature_name: feature_importance["f{}".format(feature_idx)]
    for feature_idx, feature_name in enumerate(features_vector.columns)
    if "f{}".format(feature_idx) in feature_importance
}
# Sort ascending by importance and keep the 10 most important features.
features_importance_formatted = sorted(features_importance_formatted.items(), key=operator.itemgetter(1))
features_importance_top10 = features_importance_formatted[-10:]
features_importance_df = pd.DataFrame(features_importance_top10, columns=["feature", "fscore"])
# Normalize so the scores sum to 1 (share of total splits among the top 10).
features_importance_df["fscore"] = features_importance_df["fscore"] / features_importance_df["fscore"].sum()
# Plot
fig, ax = plt.subplots(figsize=(10, 7.5))
sns.barplot(x="fscore", y="feature", data=features_importance_df, label="Total", color="#3498db", ax=ax)
sns.despine()
ax.set_xlabel("F score (normalized split count)")
ax.set_ylabel("Feature")
ax.set_title("Cross-Subject Test")
Out[218]:
In [277]:
# Display the full sorted importance list (not just the top 10).
features_importance_formatted
Out[277]:
In [4]:
def _load_dataset(tag):
    """Read the features and labels CSVs for one dataset tag.

    Returns a (features_df, labels_df) tuple of DataFrames.
    """
    features_csv = "{root}/{tag}_features.csv".format(root=FEATURES_DIRECTORY, tag=tag)
    labels_csv = "{root}/{tag}_labels.csv".format(root=FEATURES_DIRECTORY, tag=tag)
    return pd.read_csv(features_csv), pd.read_csv(labels_csv)

# Deduplicated: the same load sequence was copy-pasted three times.
p1_features_df, p1_labels_df = _load_dataset("p1")
all_features_df, all_labels_df = _load_dataset("all")
eval_features_df, eval_labels_df = _load_dataset("eval")
In [7]:
# Standing and tablet only (activity labels 0 and 4 in ACTIVITIES).
STANDING_TABLET_LABELS = [0, 4]

def _standing_tablet_subset(dataset_features_df, dataset_labels_df):
    """Filter one dataset down to standing/tablet samples.

    Drops the subject-id column, keeps only rows whose activity label is in
    STANDING_TABLET_LABELS, and returns
    (features_subset_df, labels_subset_df, X, y) where X/y are numpy arrays.
    """
    features = dataset_features_df.drop(["subject"], axis=1)
    labels = dataset_labels_df[["activity"]]
    mask = labels["activity"].isin(STANDING_TABLET_LABELS)
    subset_features = features.loc[mask]
    subset_labels = labels.loc[mask]
    return subset_features, subset_labels, subset_features.values, subset_labels.values

# Deduplicated: the same filtering sequence was copy-pasted for p1 / all / eval.
p1_standingtablet_features, p1_standingtablet_labels, p1_X, p1_y = _standing_tablet_subset(p1_features_df, p1_labels_df)
all_standingtablet_features, all_standingtablet_labels, all_X, all_y = _standing_tablet_subset(all_features_df, all_labels_df)
eval_standingtablet_features, eval_standingtablet_labels, eval_X, eval_y = _standing_tablet_subset(eval_features_df, eval_labels_df)
In [8]:
# Sanity-check the standing/tablet subset sizes for each dataset.
p1_standingtablet_features.shape
Out[8]:
In [9]:
p1_standingtablet_labels.shape
Out[9]:
In [10]:
all_standingtablet_features.shape
Out[10]:
In [11]:
all_standingtablet_labels.shape
Out[11]:
In [12]:
eval_standingtablet_features.shape
Out[12]:
In [14]:
eval_standingtablet_labels.shape
Out[14]:
Out[14]:
In [28]:
# Hyper-parameters for the standing-vs-tablet experiment.
# NOTE(review): this rebinds XGB_PARAM_FINAL with different regularization
# (lambda=2, alpha=1) and depth (7) than the earlier cross-subject run.
XGB_PARAM_FINAL = {}
XGB_PARAM_FINAL["eta"] = 0.3
XGB_PARAM_FINAL["gamma"] = 1
XGB_PARAM_FINAL["lambda"] = 2
XGB_PARAM_FINAL["alpha"] = 1
XGB_PARAM_FINAL["max_depth"] = 7
XGB_PARAM_FINAL["colsample_bytree"] = 0.5
XGB_PARAM_FINAL["colsample_bylevel"] = 0.5
XGB_PARAM_FINAL["subsample"] = 0.5
XGB_PARAM_FINAL["objective"] = "multi:softmax"
XGB_PARAM_FINAL["eval_metric"] = "merror"
XGB_PARAM_FINAL["num_class"] = len(ACTIVITIES)
XGB_PARAM_FINAL["silent"] = 0
XGB_NUM_ROUNDS = 200
XGB_EARLYSTOPPING_ROUNDS = 30
In [29]:
# Full-dataset DMatrix used to train the final deployable model below.
# FIX: np.int was removed in NumPy 1.24 (deprecated since 1.20); it was a
# plain alias for the builtin int, which behaves identically here.
y = y.astype(int)  # xgboost expects integral class labels
test_xgbmatrix = xgb.DMatrix(X, y)
In [18]:
# Standing/tablet experiment: train on "all", validate on "p1" with early stopping.
standingtablet_train_xgbmatrix = xgb.DMatrix(all_X, all_y)
standingtablet_test_xgbmatrix = xgb.DMatrix(p1_X, p1_y)
standingtablet_watchlist = [(standingtablet_train_xgbmatrix, "train"), (standingtablet_test_xgbmatrix, "eval")]
In [19]:
standingtablet_results = {}
standingtablet_validation = xgb.train(params=XGB_PARAM_FINAL, dtrain=standingtablet_train_xgbmatrix, num_boost_round=XGB_NUM_ROUNDS,
evals=standingtablet_watchlist, evals_result=standingtablet_results,
early_stopping_rounds=XGB_EARLYSTOPPING_ROUNDS)
In [30]:
# Train the final model on the full dataset and persist it to disk.
# NOTE(review): 28 boosting rounds is a magic number — presumably the best
# iteration from an earlier validation run; confirm before reuse.
test_booster = xgb.train(params=XGB_PARAM_FINAL, dtrain=test_xgbmatrix, num_boost_round=28)
In [31]:
test_booster.save_model("test.model")
In [11]:
# Reload the persisted model (lets the notebook resume from this point).
test_booster = xgb.Booster(model_file="test.model")
In [12]:
# A single hand-crafted feature vector (one tracked frame) for a smoke-test
# prediction against the reloaded model.
# NOTE(review): this rebinds X, clobbering the full feature matrix loaded earlier.
X = np.array([[
0.297578,0.411765,0.290657,3,3,16.5529,26.6833,26.0192,26.6833,201,0,0,0,0,26.0192,133.5,0,0,0,0,-7,3,10,8,-4,1,15,-7,-9,-12,-5,7,12,3,4,24,8,-3,-12,-1,0,14,0,0,26,1,0,-3,7,18,5,-7,-25,-25,-12,-64,-17,-26,-107,-25,-12,-64,-17,-26,-107,4,0,10,11.5,11.5,0,11.5
]])
In [13]:
X.shape
Out[13]:
In [14]:
# No label is supplied here — this matrix is prediction-only.
X_dmatrix = xgb.DMatrix(X)
In [20]:
# Since no label was passed to this DMatrix, this presumably yields an empty
# array — verify; likely a leftover sanity check.
X_dmatrix.get_label()
Out[20]:
In [15]:
# FIX: np.int was removed in NumPy 1.24; builtin int is the exact equivalent.
test_booster.predict(X_dmatrix).astype(int)
Out[15]:
In [ ]:
# FIX: np.int was removed in NumPy 1.24; builtin int is the exact equivalent.
y_predicted = test_booster.predict(X_dmatrix).astype(int)
# FIXME(review): `y` at this point is the label vector for the *full* dataset
# (rebound in the `y = y.astype(...)` cell), while `y_predicted` holds a single
# prediction for the one hand-crafted sample above — accuracy_score will raise
# on the length mismatch. Compare against the one expected label instead.
accuracy = metrics.accuracy_score(y, y_predicted)
accuracy