In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os, re, sys

# sklearn stuff
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# keras stuff
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

# scipy stuff
from scipy.interpolate import interp1d

# my stuff
from preprocessing import *

%matplotlib inline

In [3]:
def delete_relevant(folder):
    """Delete every file in folder that is not a u<id>_w<trial>_{accelerometer,gyroscope}.log file."""
    if folder[-1] != '/':
        folder += '/'
    files = os.listdir(folder)
    for f in files:
        if re.match(r'u[0-9]+_w[0-9]+_(accelerometer|gyroscope)\.log', f) is None:
            print('deleting %s' % f)
            os.remove(folder + f)

Do not run: the cell below permanently deletes files, so it is guarded to not execute by accident.


In [4]:
base = './Data/'
if False:  # flip to True to actually run the cleanup
    for d in os.listdir(base):
        if os.path.isdir(base + d):
            delete_relevant(base + d)

In [5]:
u_dict = generate_udict(base)

Steps to extract cycles

  1. discard the first 500 readings (~10 s at an assumed sampling rate of 50 Hz)
  2. discard readings until the first local minimum
  3. assume a template size of 150 readings
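
find_cycles is imported from preprocessing and is not shown here. Assuming it marks cycle starts at local minima of the magnitude signal (step 2 above), a minimal sketch of the idea might look like the hypothetical find_cycles_sketch below; it is an illustration, not the actual implementation:

In [ ]:
def find_cycles_sketch(data, min_gap=30):
    """Hypothetical sketch: indices of local minima at least min_gap samples apart."""
    starts = []
    last = -min_gap
    for i in range(1, len(data) - 1):
        if data[i] < data[i - 1] and data[i] < data[i + 1] and i - last >= min_gap:
            starts.append(i)
            last = i
    return starts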

In [11]:
def filter_datapoints(data, threshold=0.25):
    """
    input:
        data (list of lists containing individual cycle values in each list)
        threshold [optional] (relative tolerance on cycle length,
        e.g. 0.25 keeps lists with 0.75 * mean_len < len <= 1.25 * mean_len)
    output: list of lists
    
    filters out very long and very short lists
    """
    # immediately drop cycles shorter than 80 or longer than 300 samples
    lengths = [len(x) for x in data if 80 <= len(x) <= 300]
    # note: if no cycle survives, np.mean of an empty list is nan and every
    # cycle is filtered out (the source of the "Mean of empty slice" warning below)
    mean_len = np.mean(lengths)
    return [x for x in data if (1 - threshold) * mean_len < len(x) <= (1 + threshold) * mean_len]
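
A quick sanity check on made-up cycle lengths: the 40- and 400-sample lists fall outside the hard 80-300 bounds and the relative tolerance, so only the three plausible cycles survive.

In [ ]:
# synthetic cycles of length 100, 110, 105, plus a 40-sample fragment
# and a 400-sample run-on that should both be dropped
fake = [list(range(n)) for n in (100, 110, 105, 40, 400)]
print([len(x) for x in filter_datapoints(fake, threshold=0.25)])  # [100, 110, 105]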

In [12]:
def extract_feats(data, starts, threshold=0.25, filter_short=True):
    """
    input:
        data (time series)
        starts (start index of each cycle)
        threshold [optional] (cycle-length tolerance passed to filter_datapoints)
        filter_short [optional] (filters out abnormally short/long cycles)
    output: out (list of lists containing individual cycle values in each list)
    
    note: samples after the final start are discarded, since a cycle needs
    two consecutive starts to be delimited
    """
    ns = len(starts)
    out = []
    for i in range(ns - 1):
        out.append(data[starts[i]:starts[i + 1]])
    if filter_short:
        return filter_datapoints(out, threshold)
    return out
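
A toy example (filtering disabled so the short toy cycles are kept); note that the samples after the final start are discarded:

In [ ]:
toy = np.arange(9)
print(extract_feats(toy, [0, 3, 6], filter_short=False))
# [array([0, 1, 2]), array([3, 4, 5])]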

In [13]:
def plot_steps(data, starts, main_title='Accelerometer Magnitude', main_ylabel='$m/s^2$'):
    """
    plots the overall time series with cycle starts overlaid,
    then plots the individual walking cycles
    """
    
    plt.figure(figsize=(15, 4))
    plt.plot(data)
    plt.title(main_title)
    plt.xlabel('samples')
    plt.ylabel(main_ylabel)
    # mark the start of each detected cycle with a vertical red line
    for s in starts:
        plt.axvline(s, c='r', linewidth=2)
    plt.show()
    
    plt.figure(figsize=(15, 4))
    plt.title('Individual Walking Cycles')
    plt.xlabel('samples')
    plt.ylabel(main_ylabel)
    for s in extract_feats(data, starts):
        plt.plot(s)

In [14]:
def interpolate_features(feats, num=300):
    """
    input:
        feats (list of lists containing individual cycle values in each list)
        num [optional] (number of points to interpolate to)
    output: feats_interp (interpolated list of lists of same length)
    """
    feats_interp = []
    for point in feats:
        n = len(point)
        f = interp1d(np.arange(n), point, kind='cubic')
        point_interp = np.linspace(0, n-1, num=num, endpoint=True)
        feats_interp.append(f(point_interp))
    feats_interp = np.array(feats_interp)
    return feats_interp
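
Resampling makes variable-length cycles comparable row for row; a shape check with two dummy cycles:

In [ ]:
dummy = [np.sin(np.linspace(0, 2 * np.pi, 120)),
         np.sin(np.linspace(0, 2 * np.pi, 150))]
print(interpolate_features(dummy, num=300).shape)  # (2, 300)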

In [15]:
def rot_trans(raw_data):
    """
    input: raw_data (n x d array of raw sensor readings)
    output: data projected onto its principal axes via linear PCA,
    making the representation invariant to the phone's orientation
    """
    n, d = raw_data.shape
    pca = PCA(n_components=d)
    return pca.fit_transform(raw_data)
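
The point of the PCA step is that multiplying the raw axes by an orthogonal matrix (a phone rotation) should not change the transformed signal beyond per-axis sign flips. A sanity check with a made-up rotation about the z-axis:

In [ ]:
rng = np.random.RandomState(0)
raw = rng.randn(500, 3) * [3.0, 2.0, 1.0]    # anisotropic fake readings
theta = 0.7
R = np.array([[np.cos(theta), -np.sin(theta), 0],
              [np.sin(theta),  np.cos(theta), 0],
              [0, 0, 1]])
# principal components match up to sign, so compare absolute values
print(np.allclose(np.abs(rot_trans(raw)), np.abs(rot_trans(raw @ R.T))))  # True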

In [16]:
def merge_consecutive_starts(starts, num_merges=2):
    """Keep every num_merges-th start, merging that many consecutive cycles into one segment."""
    return starts[::num_merges]
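
The slicing keeps every num_merges-th start, so each retained segment spans num_merges original cycles:

In [ ]:
print(merge_consecutive_starts([0, 70, 140, 210, 280], num_merges=2))  # [0, 140, 280]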

In [48]:
def preprocess_data(df, begin_idx=1000, threshold=0.25, num_merges=1):
    """
    input:
        df (dataframe of accelerometer readings, one column per axis,
        one row per timestamp)
        begin_idx [optional] (number of initial readings to discard)
        threshold [optional] (cycle-length tolerance, see filter_datapoints)
        num_merges [optional] (number of consecutive cycles to merge)
    output: out (dataframe of interpolated, distinct step data values)
    """
    # keep accelerometer columns
    accelerometer_cols = [x for x in df.columns if 'data' in x]
    # rotation invariance transformation
    data = rot_trans(df[accelerometer_cols].values)
    # toss the first few values, which are dominated by setup noise/errors
    data = data[begin_idx:, :]
    
    # calculate accelerometer magnitude
    acc_mag = (data ** 2).sum(axis=1) ** 0.5
    
    # calculate start of each cycle
    starts = merge_consecutive_starts(find_cycles(acc_mag), num_merges=num_merges)
    
    # [n x k*c] matrix where
    # n = number of distinct cycles
    # k = number of axes in accelerometer
    # c = number of interpolated columns
    out = []
    columns = []
    for idx, row in enumerate(data.T):
        feats = extract_feats(row, starts, threshold=threshold, filter_short=True)
        feats_interp = interpolate_features(feats)
        n, d = feats_interp.shape
        out.append(feats_interp)
        for i in range(d):
            columns.append('axis_%d_feat_%d' % (idx, i))
    out = np.concatenate(out, axis=1)
    return pd.DataFrame(out, columns=columns)
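
End to end, each output row is one (merged) stride and each block of 300 columns is one PCA axis, so the 3-axis accelerometer yields 900 columns. A usage sketch mirroring the trial walkthrough below:

In [ ]:
df = get_data(51, 1)
out = preprocess_data(df, num_merges=2)
print(out.shape)  # (n_strides, 3 * 300)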

Trials


In [41]:
a = get_data(51, 1)
accelerometer_cols = [x for x in a.columns if 'data' in x]
# gyroscope_cols = [x for x in a.columns if 'gyr' in x]

In [42]:
data = rot_trans(a[accelerometer_cols].values)
acc_mag = (data ** 2).sum(axis=1) ** 0.5

In [43]:
samples2 = acc_mag[1000:5000]
starts = find_cycles(samples2)
starts2 = merge_consecutive_starts(starts, 2)
plot_steps(samples2, starts2)

x, y, z = a[accelerometer_cols].values[1000:5000, :].T
plot_steps(x, starts2, main_title='$a_x$ values')
plot_steps(y, starts2, main_title='$a_y$ values')
plot_steps(z, starts2, main_title='$a_z$ values')



In [44]:
feats = extract_feats(x, starts2, filter_short=True)
feats_interp = interpolate_features(feats)
plt.figure(figsize=(15,4))
for row in feats_interp:
    plt.plot(row)



In [29]:
feats = extract_feats(y, starts2, filter_short=True)
feats_interp = interpolate_features(feats)
plt.figure(figsize=(15,4))
for row in feats_interp:
    plt.plot(row)



In [30]:
feats = extract_feats(z, starts2, filter_short=True)
feats_interp = interpolate_features(feats)
plt.figure(figsize=(15,4))
for row in feats_interp:
    plt.plot(row)


Preprocess and save data


In [52]:
# total number of (user, trial) pairs, used for the progress countdown below
s = sum(len(trials) for trials in u_dict.values())

In [53]:
ctr = 0
for uid in u_dict:
    dir_name = './Data/processed/'
    xfname = dir_name + '%d_X.csv' % (uid)
    yfname = dir_name + '%d_Y.npy' % (uid)
    
    # check if files exist
    if not (os.path.isfile(xfname) and os.path.isfile(yfname)):
        # get trials for user
        trials = u_dict[uid]
        data = []
        labels = []
        # for each trial
        for trial in trials:
            ctr += 1
            print('Processing user %d, trial %d, %d remaining' % (uid, trial, (s-ctr)))
            try:
                # get data from disk
                df = get_data(uid, trial)
                # preprocess data, also choosing the number of strides to combine
                d = preprocess_data(df, num_merges=2)
                # append to combined data
                data.append(d)
                # append to combined labels
                n, _ = d.shape
                labels.append(np.zeros(n) + uid)
            except Exception:
                print('FAILED')

        # save data and labels
        if len(data) != 0:
            data = pd.concat(data, axis=0, ignore_index=True)
            labels = np.concatenate(labels)
            data.to_csv(xfname)
            np.save(yfname, labels)
    else:
        print('Files for user %d exist on disk.' % uid)


Files for user 1 exist on disk.
Files for user 2 exist on disk.
Files for user 3 exist on disk.
Files for user 4 exist on disk.
Processing user 5, trial 1, 135 remaining
/home/ubicomp/anaconda3/lib/python3.5/site-packages/numpy/core/fromnumeric.py:2889: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
FAILED
Files for user 6 exist on disk.
Files for user 7 exist on disk.
Files for user 8 exist on disk.
Files for user 9 exist on disk.
Files for user 10 exist on disk.
Files for user 11 exist on disk.
Files for user 12 exist on disk.
Files for user 13 exist on disk.
Files for user 14 exist on disk.
Files for user 15 exist on disk.
Files for user 16 exist on disk.
Files for user 17 exist on disk.
Files for user 18 exist on disk.
Files for user 19 exist on disk.
Files for user 20 exist on disk.
Files for user 21 exist on disk.
Files for user 22 exist on disk.
Files for user 23 exist on disk.
Files for user 24 exist on disk.
Files for user 25 exist on disk.
Files for user 26 exist on disk.
Files for user 27 exist on disk.
Files for user 28 exist on disk.
Files for user 29 exist on disk.
Files for user 30 exist on disk.
Files for user 31 exist on disk.
Files for user 32 exist on disk.
Files for user 33 exist on disk.
Files for user 34 exist on disk.
Files for user 35 exist on disk.
Files for user 36 exist on disk.
Files for user 37 exist on disk.
Files for user 38 exist on disk.
Files for user 39 exist on disk.
Files for user 40 exist on disk.
Files for user 41 exist on disk.
Files for user 42 exist on disk.
Files for user 43 exist on disk.
Files for user 44 exist on disk.
Files for user 45 exist on disk.
Files for user 46 exist on disk.
Files for user 47 exist on disk.
Files for user 48 exist on disk.
Files for user 49 exist on disk.
Files for user 50 exist on disk.
Processing user 51, trial 1, 134 remaining

Merge files into one


In [54]:
base = './Data/processed/'
frames = []
labels = []
for f in os.listdir(base):
    if re.match(r'[0-9]+_X\.csv', f):
        uid = int(re.findall(r'([0-9]+)_X\.csv', f)[0])
        print('Loading user %d' % uid)
        data = pd.read_csv(base + f, index_col=0)
        n, _ = data.shape
        labels.extend([uid] * n)
        frames.append(data)
builder = pd.concat(frames, axis=0)


Loading user 20
Loading user 4
Loading user 16
Loading user 2
Loading user 45
Loading user 15
Loading user 34
Loading user 3
Loading user 1
Loading user 50
Loading user 42
Loading user 27
Loading user 30
Loading user 46
Loading user 18
Loading user 33
Loading user 51
Loading user 32
Loading user 41
Loading user 9
Loading user 12
Loading user 38
Loading user 19
Loading user 13
Loading user 39
Loading user 40
Loading user 8
Loading user 28
Loading user 44
Loading user 14
Loading user 24
Loading user 47
Loading user 11
Loading user 26
Loading user 21
Loading user 37
Loading user 29
Loading user 25
Loading user 49
Loading user 35
Loading user 10
Loading user 48
Loading user 17
Loading user 43
Loading user 7
Loading user 31
Loading user 6
Loading user 22
Loading user 36
Loading user 23
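
A quick consistency check before saving: the merged frame and the label list should line up row for row.

In [ ]:
assert builder.shape[0] == len(labels)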

In [57]:
builder.to_pickle(base + 'full.pickle')

In [58]:
np.save(base+'labels.npy', labels)

In [66]:
# class imbalance: the fraction of strides NOT belonging to user 51,
# i.e. the accuracy of a trivial "never user 51" classifier
1 - (np.array(labels) == 51).mean()


Out[66]:
0.99459631277813099

In [ ]: