In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, re, sys
# sklearn stuff
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
# keras stuff
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
# scipy stuff
from scipy.interpolate import interp1d
# my stuff
from preprocessing import *
%matplotlib inline
In [3]:
def delete_relevant(folder):
    """Delete everything in folder except the raw accelerometer/gyroscope logs."""
    if folder[-1] != '/':
        folder += '/'
    for f in os.listdir(folder):
        if re.match(r'u[0-9]+_w[0-9]+_(accelerometer|gyroscope)\.log', f) is None:
            print('deleting %s' % f)
            os.remove(folder + f)
In [4]:
base = './Data/'
if 1 == 0:  # one-time cleanup of the raw data folders; flip the guard to re-run
    for d in os.listdir(base):
        if os.path.isdir(base + d):
            delete_relevant(base + d)
In [5]:
u_dict = generate_udict(base)
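`generate_udict` is defined in the local `preprocessing` module, which is not shown here. Based on the log-file naming convention used by `delete_relevant` above, a minimal sketch of what it plausibly does (a hypothetical reimplementation, not the module's actual code):

def generate_udict_sketch(base):
    """Hypothetical stand-in for preprocessing.generate_udict:
    maps each user id to the sorted trial numbers found on disk."""
    u_dict = {}
    for d in os.listdir(base):
        folder = os.path.join(base, d)
        if not os.path.isdir(folder):
            continue
        for f in os.listdir(folder):
            m = re.match(r'u([0-9]+)_w([0-9]+)_accelerometer\.log', f)
            if m:
                uid, trial = int(m.group(1)), int(m.group(2))
                u_dict.setdefault(uid, set()).add(trial)
    return {uid: sorted(trials) for uid, trials in u_dict.items()}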
In [11]:
def filter_datapoints(data, threshold=0.25):
    """
    input:
        data (list of lists containing individual cycle values in each list)
        threshold [optional] (fractional tolerance on list length;
            e.g. 0.25 keeps lists with 0.75 * mean_len < len <= 1.25 * mean_len)
    output: list of lists
    filters out very long and very short lists
    """
    # compute the mean length over plausible cycles only (80-300 samples)
    lengths = [len(x) for x in data if 80 <= len(x) <= 300]
    mean_len = np.mean(lengths)
    return [x for x in data if (1 - threshold) * mean_len < len(x) <= (1 + threshold) * mean_len]
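A quick sanity check of `filter_datapoints` on synthetic cycle lists (illustrative values only):

# three plausible cycles (~100 samples) plus one too short and one too long
cycles = [list(range(100)), list(range(110)), list(range(95)),
          list(range(20)), list(range(400))]
kept = filter_datapoints(cycles)
print([len(c) for c in kept])  # [100, 110, 95]: the 20- and 400-sample lists are dropped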
In [12]:
def extract_feats(data, starts, threshold=0.25, filter_short=True):
    """
    input:
        data (time series)
        starts (start index of each cycle)
        threshold [optional] (length tolerance passed to filter_datapoints)
        filter_short [optional] (filters out short and long cycles)
    output: out (list of lists containing individual cycle values in each list)
    """
    ns = len(starts)
    out = []
    for i in range(ns - 1):
        out.append(data[starts[i]:starts[i + 1]])
    if filter_short:
        return filter_datapoints(out, threshold)
    return out
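For example, slicing a toy periodic signal at evenly spaced starts (illustrative values only):

sig = np.sin(np.linspace(0, 20 * np.pi, 1000))  # ten full periods
starts = list(range(0, 1000, 100))              # one start per period
cycles = extract_feats(sig, starts, filter_short=False)
print(len(cycles), len(cycles[0]))              # 9 cycles of 100 samples each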
In [13]:
def plot_steps(data, starts, main_title='Accelerometer Magnitude', main_ylabel='$m/s^2$'):
    """
    plots the overall time series with cycle starts overlaid,
    then plots the individual walking cycles
    """
    plt.figure(figsize=(15, 4))
    plt.plot(data)
    plt.title(main_title)
    plt.xlabel('samples')
    plt.ylabel(main_ylabel)
    mx = data.max()
    mn = data.min()
    vy = np.linspace(mn, mx, 2)
    for s in starts:
        vx = [s for _ in vy]
        plt.plot(vx, vy, c='r', linewidth=2)
    plt.show()
    plt.figure(figsize=(15, 4))
    plt.title('Individual Walking Cycles')
    plt.xlabel('samples')
    plt.ylabel(main_ylabel)
    for s in extract_feats(data, starts):
        plt.plot(s)
In [14]:
def interpolate_features(feats, num=300):
    """
    input:
        feats (list of lists containing individual cycle values in each list)
        num [optional] (number of points to interpolate to)
    output: feats_interp (array of interpolated cycles, all of length num)
    """
    feats_interp = []
    for point in feats:
        n = len(point)
        f = interp1d(np.arange(n), point, kind='cubic')
        point_interp = np.linspace(0, n - 1, num=num, endpoint=True)
        feats_interp.append(f(point_interp))
    return np.array(feats_interp)
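Cycles of different lengths come out as equal-length rows, e.g.:

cycles = [np.sin(np.linspace(0, 2 * np.pi, n)) for n in (90, 120, 105)]
resampled = interpolate_features(cycles, num=300)
print(resampled.shape)  # (3, 300): one fixed-length row per cycle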
In [15]:
def rot_trans(raw_data):
    """
    input: raw_data (n x d array of sensor readings)
    output: rotation-invariant data via linear PCA
    """
    n, d = raw_data.shape
    pca = PCA(n_components=d)
    return pca.fit_transform(raw_data)
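A sanity check on the rotation-invariance claim: rotating the raw readings by a random orthogonal matrix changes the PCA output by at most a per-axis sign flip (assuming distinct component variances), so the absolute values agree:

rng = np.random.RandomState(0)
X = rng.randn(500, 3) * [3.0, 2.0, 1.0]  # anisotropic fake readings
Q, _ = np.linalg.qr(rng.randn(3, 3))     # random orthogonal matrix
print(np.allclose(np.abs(rot_trans(X)), np.abs(rot_trans(X @ Q)), atol=1e-8))  # True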
In [16]:
def merge_consecutive_starts(starts, num_merges=2):
    """Keep every num_merges-th start, merging consecutive cycles into longer strides."""
    return starts[::num_merges]
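`find_cycles` also comes from the `preprocessing` module. One plausible implementation (a hypothetical sketch, not the module's actual code) picks prominent peaks in the magnitude signal spaced at least a minimum gap apart; the 80-sample gap below matches the shortest cycle accepted by `filter_datapoints`:

def find_cycles_sketch(mag, min_gap=80):
    """Hypothetical stand-in for preprocessing.find_cycles:
    returns indices of prominent peaks spaced at least min_gap apart."""
    starts = []
    thresh = mag.mean() + mag.std()
    for i in range(1, len(mag) - 1):
        if mag[i] > thresh and mag[i] >= mag[i - 1] and mag[i] >= mag[i + 1]:
            if not starts or i - starts[-1] >= min_gap:
                starts.append(i)
    return starts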
In [48]:
def preprocess_data(df, begin_idx=1000, threshold=0.25, num_merges=1):
    """
    input: df (dataframe of accelerometer readings, one column per axis,
        one row per timestamp)
    output: out (dataframe of interpolated, distinct step data values)
    """
    # keep accelerometer columns
    accelerometer_cols = [x for x in df.columns if 'data' in x]
    # rotation invariance transformation
    data = rot_trans(df[accelerometer_cols].values)
    # toss the first few values due to noise/errors
    data = data[begin_idx:, :]
    # calculate accelerometer magnitude
    acc_mag = (data ** 2).sum(axis=1) ** 0.5
    # calculate the start of each cycle
    starts = merge_consecutive_starts(find_cycles(acc_mag), num_merges=num_merges)
    # build an [n x k*c] matrix where
    #   n = number of distinct cycles
    #   k = number of accelerometer axes
    #   c = number of interpolated columns
    out = []
    columns = []
    for idx, row in enumerate(data.T):
        feats = extract_feats(row, starts, threshold=threshold, filter_short=True)
        feats_interp = interpolate_features(feats)
        # note: assumes filtering keeps the same set of cycles on every axis,
        # so the per-axis blocks all have the same number of rows
        n, d = feats_interp.shape
        out.append(feats_interp)
        for i in range(d):
            columns.append('axis_%d_feat_%d' % (idx, i))
    out = np.concatenate(out, axis=1)
    return pd.DataFrame(out, columns=columns)
In [41]:
a = get_data(51, 1)
accelerometer_cols = [x for x in a.columns if 'data' in x]
# gyroscope_cols = [x for x in a.columns if 'gyr' in x]
In [42]:
data = rot_trans(a[accelerometer_cols].values)
acc_mag = (data ** 2).sum(axis=1) ** 0.5
In [43]:
samples2 = acc_mag[1000:5000]
starts = find_cycles(samples2)
starts2 = merge_consecutive_starts(starts, 2)
plot_steps(samples2, starts2)
x, y, z = a[accelerometer_cols].values[1000:5000, :].T
plot_steps(x, starts2, main_title='$a_x$ values')
plot_steps(y, starts2, main_title='$a_y$ values')
plot_steps(z, starts2, main_title='$a_z$ values')
In [44]:
feats = extract_feats(x, starts2, filter_short=True)
feats_interp = interpolate_features(feats)
plt.figure(figsize=(15, 4))
for row in feats_interp:
    plt.plot(row)
In [29]:
feats = extract_feats(y, starts2, filter_short=True)
feats_interp = interpolate_features(feats)
plt.figure(figsize=(15, 4))
for row in feats_interp:
    plt.plot(row)
In [30]:
feats = extract_feats(z, starts2, filter_short=True)
feats_interp = interpolate_features(feats)
plt.figure(figsize=(15, 4))
for row in feats_interp:
    plt.plot(row)
In [52]:
# total number of trials across all users (used for the progress countdown below)
s = 0
for key in u_dict:
    s += len(u_dict[key])
In [53]:
ctr = 0
for uid in u_dict:
    dir_name = './Data/processed/'
    xfname = dir_name + '%d_X.csv' % (uid)
    yfname = dir_name + '%d_Y.npy' % (uid)
    # skip users whose files already exist on disk
    if not (os.path.isfile(xfname) and os.path.isfile(yfname)):
        # get trials for user
        trials = u_dict[uid]
        data = []
        labels = []
        # for each trial
        for trial in trials:
            ctr += 1
            print('Processing user %d, trial %d, %d remaining' % (uid, trial, (s - ctr)))
            try:
                # get data from disk
                df = get_data(uid, trial)
                # preprocess data; num_merges chooses how many strides to combine
                d = preprocess_data(df, num_merges=2)
                # append to combined data
                data.append(d)
                # append to combined labels
                n, _ = d.shape
                labels.append(np.zeros(n) + uid)
            except Exception:
                print('FAILED')
        # save data and labels
        if len(data) != 0:
            data = pd.concat(data, axis=0, ignore_index=True)
            labels = np.concatenate(labels)
            data.to_csv(xfname)
            np.save(yfname, labels)
    else:
        print('Files for user %d exist on disk.' % uid)
In [54]:
base = './Data/processed/'
builder = pd.DataFrame()
labels = []
for f in os.listdir(base):
    if re.match(r'[0-9]+_X\.csv', f):
        uid = int(re.findall(r'([0-9]+)_X\.csv', f)[0])
        print('Loading user %d' % uid)
        data = pd.read_csv(base + f, index_col=0)
        n, _ = data.shape
        labels.extend([uid] * n)
        if builder.shape == (0, 0):
            builder = data
        else:
            builder = pd.concat([builder, data], axis=0)
In [57]:
builder.to_pickle(base + 'full.pickle')
In [58]:
np.save(base+'labels.npy', labels)
In [66]:
1 - (np.array(labels) == 51).mean()  # fraction of samples that do not belong to user 51
Out[66]:
In [ ]: