In [28]:
import pandas as pd
from scipy.ndimage import gaussian_filter
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score  # cross_val_score moved here from sklearn.cross_validation in scikit-learn 0.18
from sklearn.metrics import accuracy_score
import numpy as np
%matplotlib inline
In [29]:
df = pd.read_csv('../Data/shaneiphone_exp2_processed.csv', index_col='DateTime')
In [30]:
# Use only userAcceleration and gyroscope data, since these features are expected to generalize well.
xyz = ['X', 'Y', 'Z']
measures = ['userAcceleration', 'gyroscope']
basefeatures = [i + j for i in measures for j in xyz]
features = list(basefeatures)  # working copy, extended with derived features below
In [31]:
# Add Gaussian smoothed features
smoothfeatures = []
for i in features:
    df[i + 'sm'] = gaussian_filter(df[i], 3)     # light smoothing (sigma = 3 samples)
    df[i + '2sm'] = gaussian_filter(df[i], 100)  # heavy smoothing (sigma = 100 samples)
    smoothfeatures.append(i + 'sm')
    smoothfeatures.append(i + '2sm')
features.extend(smoothfeatures)
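The two smoothing scales (sigma of 3 and 100 samples) give each signal a lightly denoised version and a slow-trend version. A minimal sketch of what gaussian_filter does on a toy signal, assuming nothing beyond numpy and scipy:

import numpy as np
from scipy.ndimage import gaussian_filter

# Toy signal: a sine wave plus noise, smoothed at the two scales used above.
t = np.linspace(0, 10, 1000)
noisy = np.sin(t) + 0.3 * np.random.randn(t.size)
light = gaussian_filter(noisy, 3)    # removes sample-to-sample jitter
heavy = gaussian_filter(noisy, 100)  # keeps only the slow trend
print(noisy.std(), light.std(), heavy.std())  # spread shrinks as sigma grows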
In [32]:
# Generate Jerk signal
jerkfeatures = []
for i in features:
    diffsignal = np.diff(df[i])
    df[i + 'jerk'] = np.append(0, diffsignal)  # pad with a leading zero to keep the original length
    jerkfeatures.append(i + 'jerk')
features.extend(jerkfeatures)
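The 'jerk' columns are first differences of the (smoothed) signals, padded with a leading zero so each stays the same length as the DataFrame; dividing by the sampling interval would give a true derivative, but a constant factor does not matter to a tree-based model. A tiny sketch of the padding:

import numpy as np

x = np.array([0.0, 1.0, 3.0, 6.0, 10.0])
jerk = np.append(0, np.diff(x))  # same length as x
print(jerk)                      # [0. 1. 2. 3. 4.]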
In [33]:
# assign class labels
class0 = (df.index > '2015-08-25 14:35:00') & \
         (df.index < '2015-08-25 14:42:00')
class1 = (df.index > '2015-08-25 14:43:00') & \
         (df.index < '2015-08-25 14:48:00')
df['class'] = -1
df.loc[class0, 'class'] = 0  # .loc avoids chained-indexing assignment warnings
df.loc[class1, 'class'] = 1
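Rows outside the two labelled windows keep the placeholder value -1; judging by the plotting cell further down, class 0 corresponds to the car segment and class 1 to the bus segment. A quick check of the label distribution:

print(df['class'].value_counts())  # expect three groups: -1 (unlabelled), 0 (car), 1 (bus)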
In [37]:
# separate into quarters for train and validation
q1 = df[(df.index <= '2015-08-25 14:38:30') &
(df.index > '2015-08-25 14:33:00')]
q2 = df[(df.index > '2015-08-25 14:38:30') &
(df.index <= '2015-08-25 14:42:00')]
q3 = df[(df.index > '2015-08-25 14:43:00') &
(df.index <= '2015-08-25 14:45:30')]
q4 = df[(df.index > '2015-08-25 14:45:30') &
(df.index <= '2015-08-25 14:48:00')]
traindf = pd.concat([q1, q3])
validationdf = pd.concat([q2, q4])
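Splitting into contiguous time blocks (the first part of each activity for training, the rest for validation) avoids the leakage a random row-level split would cause, since neighbouring samples in a time series are strongly correlated. Note that q1 starts at 14:33 while the class-0 window only opens at 14:35, so the training set also contains some rows still labelled -1. A short sanity check:

print(len(traindf), len(validationdf))
print(traindf['class'].value_counts())
print(validationdf['class'].value_counts())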
In [10]:
# check for NaNs in the dataframes
print(traindf.isnull().sum().sum())
print(validationdf.isnull().sum().sum())
In [11]:
# drop NaNs
traindf = traindf.dropna()
validationdf = validationdf.dropna()
In [12]:
# Make the training and validation sets
X_train = traindf[features].values
y_train = traindf['class'].values
X_test = validationdf[features].values
y_test = validationdf['class'].values
In [13]:
# train a random forest
clf = RandomForestClassifier(n_estimators=200)
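Random forests are stochastic, so the scores below vary slightly from run to run; fixing random_state is an optional tweak that makes them reproducible (the value 0 is arbitrary):

clf = RandomForestClassifier(n_estimators=200, random_state=0)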
In [14]:
# get the 5-fold cross-validation score
scores = cross_val_score(clf, X_train, y_train, cv=5)
print(scores, scores.mean(), scores.std())
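With an integer cv, cross_val_score builds stratified folds from the rows in the order given. Because these rows are time-ordered, a forward-chaining splitter such as TimeSeriesSplit gives a more pessimistic, and arguably more honest, estimate; a sketch, with the number of splits chosen arbitrarily:

from sklearn.model_selection import TimeSeriesSplit, cross_val_score

tscv = TimeSeriesSplit(n_splits=5)
ts_scores = cross_val_score(clf, X_train, y_train, cv=tscv)
print(ts_scores, ts_scores.mean())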
In [15]:
# apply model to test set
clf.fit(X_train, y_train)
predict_y = clf.predict(X_test)
In [16]:
# obtain accuracy score
testscore = accuracy_score(y_test, predict_y)
print("Accuracy score on test set: %6.3f" % testscore)
In [17]:
# Inspect feature importances
for i, ifeature in enumerate(features):
    print(ifeature + ': %6.4f' % clf.feature_importances_[i])
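With a few dozen derived features the unsorted printout is hard to scan; sorting by importance makes the dominant signals obvious (a small convenience using the arrays already defined):

# Rank features by importance, largest first.
ranked = sorted(zip(features, clf.feature_importances_), key=lambda t: t[1], reverse=True)
for name, imp in ranked[:10]:
    print('%s: %6.4f' % (name, imp))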
In [19]:
# compare the smoothed gyroscope X signal (gyroscopeXsm) for the car and bus segments
#q1['gyroscopeXsm'].plot(color='blue', figsize=(12,6), kind='hist', bins=40, alpha=0.4) # car
#q3['gyroscopeXsm'].plot(color='green', kind='hist', bins=40, alpha=0.4) # bus
q1['gyroscopeXsm'].plot(color='blue', figsize=(12,6)) # car
q3['gyroscopeXsm'].plot(color='green') # bus
Out[19]:
In [35]:
# Generate Fourier transform features: pack the real and imaginary parts of the
# rFFT of each signal back into a single real-valued column of length len(df).
fftfeatures = []
for i in features:
    spectrum = np.fft.rfft(df[i])
    reals = np.real(spectrum)
    imags = np.imag(spectrum)
    # Start with the DC term; its imaginary part (and, for an even-length signal,
    # the Nyquist term's) is zero and is dropped so the packed length matches len(df).
    complexs = [reals[0]]
    for j in range(1, len(reals)):
        complexs.append(reals[j])
        if len(complexs) < len(df):
            complexs.append(imags[j])
    df['f' + i] = complexs
    fftfeatures.append('f' + i)
features.extend(fftfeatures)
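np.fft.rfft of a length-N real signal returns N//2 + 1 complex coefficients; the packing above interleaves their real and imaginary parts (dropping the imaginary parts that are identically zero) so that each new column again has exactly N entries. A quick check of the length arithmetic on toy arrays:

import numpy as np

for N in (8, 9):
    x = np.random.randn(N)
    print(N, len(np.fft.rfft(x)), N // 2 + 1)  # rfft length is N//2 + 1 for even and odd N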
In [38]:
# separate into quarters for train and validation
q1 = df[(df.index <= '2015-08-25 14:38:30') &
(df.index > '2015-08-25 14:33:00')]
q2 = df[(df.index > '2015-08-25 14:38:30') &
(df.index <= '2015-08-25 14:42:00')]
q3 = df[(df.index > '2015-08-25 14:43:00') &
(df.index <= '2015-08-25 14:45:30')]
q4 = df[(df.index > '2015-08-25 14:45:30') &
(df.index <= '2015-08-25 14:48:00')]
traindf = pd.concat([q1, q3])
validationdf = pd.concat([q2, q4])
In [39]:
# Make the training and validation sets
X_train = traindf[fftfeatures].values
y_train = traindf['class'].values
X_test = validationdf[fftfeatures].values
y_test = validationdf['class'].values
In [40]:
# train a random forest
clf = RandomForestClassifier(n_estimators=200)
In [41]:
# get the 5-fold cross-validation score
scores = cross_val_score(clf, X_train, y_train, cv=5)
print(scores, scores.mean(), scores.std())
In [42]:
# apply model to test set
clf.fit(X_train, y_train)
predict_y = clf.predict(X_test)
In [43]:
# obtain accuracy score
testscore = accuracy_score(y_test, predict_y)
print("Accuracy score on test set: %6.3f" % testscore)
In [44]:
# Inspect feature importances
for i, ifeature in enumerate(fftfeatures):
    print(ifeature + ': %6.4f' % clf.feature_importances_[i])
In [ ]: