In [52]:
import pandas as pd
from scipy.ndimage import gaussian_filter
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score
import numpy as np
%matplotlib inline
In [20]:
# Load the processed CSV used as the car recording; its 'DateTime' column becomes the index
car_csv = '../Data/shaneiphone_exp2_processed.csv'
dfcar = pd.read_csv(car_csv, index_col='DateTime')
In [21]:
# Load the processed CSV used as the bus recording; its 'DateTime' column becomes the index
bus_csv = '../Data/shanebus20150827_processed.csv'
dfbus = pd.read_csv(bus_csv, index_col='DateTime')
In [22]:
# Stack the car rows on top of the bus rows in one dataframe
recordings = [dfcar, dfbus]
df = pd.concat(recordings)
In [38]:
# Restrict the model to userAcceleration and gyroscope channels, since these
# features are expected to generalize well.
xyz = ['X', 'Y', 'Z']
measures = ['userAcceleration', 'gyroscope']
# Column names are '<measure><axis>', measure-major: userAccelerationX, ..., gyroscopeZ
basefeatures = []
for measure in measures:
    for axis in xyz:
        basefeatures.append(measure + axis)
# `features` is a separate list that later cells grow in place
features = list(basefeatures)
In [39]:
# Add two Gaussian-smoothed copies of every current feature column:
#   '<name>sm'  uses sigma = 3
#   '<name>2sm' uses sigma = 100
smoothfeatures = []
for name in features:
    for suffix, sigma in (('sm', 3), ('2sm', 100)):
        df[name + suffix] = gaussian_filter(df[name], sigma)
        smoothfeatures.append(name + suffix)
# Extend only after the loop so the list is not mutated while being iterated
features.extend(smoothfeatures)
In [40]:
# Generate a "jerk" column (first difference) for every current feature.
# A leading 0 keeps each new column the same length as the dataframe.
jerkfeatures = [name + 'jerk' for name in features]
for name, jerkname in zip(features, jerkfeatures):
    df[jerkname] = np.concatenate(([0], np.diff(df[name])))
features.extend(jerkfeatures)
In [41]:
# Assign class labels: 0 = car, 1 = bus, -1 = row outside every labelled window.
# Each mask selects rows whose index falls in the half-open window (start, end].
# NOTE(review): the index is compared against timestamp strings; this presumably
# relies on the DateTime index sorting like 'YYYY-MM-DD HH:MM:SS' text — confirm.
car0 = (df.index > '2015-08-25 14:35:00') & (df.index <= '2015-08-25 14:42:00')
car1 = (df.index > '2015-08-25 14:43:00') & (df.index <= '2015-08-25 14:48:00')
bus0 = (df.index > '2015-08-27 10:10:00') & (df.index <= '2015-08-27 10:15:00')
bus1 = (df.index > '2015-08-27 10:15:00') & (df.index <= '2015-08-27 10:20:00')
nc = len(df)
df['class'] = np.zeros(nc) - 1
# Write through .loc with a scalar: chained indexing (df['class'][mask] = ...)
# can operate on a temporary copy and leave df unchanged.
df.loc[car0 | car1, 'class'] = 0.0
df.loc[bus0 | bus1, 'class'] = 1.0
In [42]:
# Slice the labelled windows into four quarters:
#   q1 = car0, q2 = car1, q3 = bus0, q4 = bus1
q1, q2, q3, q4 = [df[mask] for mask in (car0, car1, bus0, bus1)]
# Train on the second car/bus windows, validate on the first ones
traindf = pd.concat([q2, q4])
validationdf = pd.concat([q1, q3])
In [43]:
# Print the total number of NaN cells in each split (training first, then validation)
for split in (traindf, validationdf):
    print(split.isnull().sum().sum())
In [44]:
# Discard every row that contains at least one NaN, in both splits
traindf, validationdf = traindf.dropna(), validationdf.dropna()
In [45]:
# Build the feature matrices (columns listed in `features`) and the label
# vectors ('class' column) for the training and validation splits
X_train, y_train = traindf[features].values, traindf['class'].values
X_test, y_test = validationdf[features].values, validationdf['class'].values
In [46]:
# Configure a 200-tree random forest (it is fitted in a later cell).
# A fixed random_state makes the cross-validation scores, test accuracy and
# feature importances reproducible across kernel restarts.
clf = RandomForestClassifier(n_estimators=200, random_state=0)
In [47]:
# Score the classifier with 5-fold cross-validation on the training split,
# then print the per-fold scores followed by their mean and standard deviation
n_folds = 5
scores = cross_val_score(clf, X_train, y_train, cv=n_folds)
print(scores, scores.mean(), scores.std())
In [48]:
# Fit on the whole training split, then predict labels for the validation rows
# (fit() returns the estimator itself, so the two calls can be chained)
predict_y = clf.fit(X_train, y_train).predict(X_test)
In [49]:
# Compare the predictions with the true validation labels and report accuracy
testscore = accuracy_score(y_true=y_test, y_pred=predict_y)
print("Accuracy score on test set: %6.3f" % testscore)
In [50]:
# Print each feature name next to the importance the fitted forest assigned to it
# (importances are in the same order as the columns of X_train, i.e. `features`)
for ifeature, importance in zip(features, clf.feature_importances_):
    print(ifeature + ': %6.4f' % importance)
In [60]:
# Overlay histograms of the heavily smoothed Z gyroscope signal for the
# car validation window (blue) and the bus validation window (green)
hist_style = dict(kind='hist', bins=40, alpha=0.4)
q1['gyroscopeZ2sm'].plot(color='blue', figsize=(12,6), **hist_style) # car
q3['gyroscopeZ2sm'].plot(color='green', **hist_style) # bus
Out[60]:
In [61]:
# Generate Fourier-transform features: one 'f<name>' column per current feature.
def _packed_rfft(signal):
    """Return the real FFT of `signal` packed into len(signal) real numbers.

    Layout, with y = np.fft.rfft(signal) and n = len(signal):
      n even: [Re y[0], Re y[1], Im y[1], ..., Re y[n/2 - 1], Im y[n/2 - 1], Re y[n/2]]
      n odd:  [Re y[0], Re y[1], Im y[1], ..., Re y[(n-1)/2], Im y[(n-1)/2]]
    Im y[0] (and Im y[n/2] for even n) is always zero for real input, so it is
    omitted; that is what makes the packed length equal n for either parity.
    """
    values = np.asarray(signal, dtype=float)
    n_samples = len(values)
    packed = np.empty(n_samples)
    if n_samples == 0:
        # np.fft.rfft rejects empty input; an empty signal packs to an empty array
        return packed
    spectrum = np.fft.rfft(values)  # computed once; real and imag parts are views
    packed[0] = spectrum[0].real
    if n_samples % 2 == 0:
        # Interior bins are interleaved; the last (Nyquist) bin contributes only its real part
        packed[1:-1:2] = spectrum[1:-1].real
        packed[2:-1:2] = spectrum[1:-1].imag
        packed[-1] = spectrum[-1].real
    else:
        # No Nyquist bin: every bin after the first contributes both parts
        packed[1::2] = spectrum[1:].real
        packed[2::2] = spectrum[1:].imag
    return packed


# NOTE(review): the packed coefficients are ordered by frequency bin, yet they
# are stored as a column next to time-indexed rows — confirm that a per-row
# frequency coefficient is really the intended feature.
# NOTE(review): an FFT propagates NaN, so one NaN in a column makes its whole
# 'f<name>' column NaN — confirm the inputs are NaN-free.
fftfeatures = []
for i in features:
    df['f' + i] = _packed_rfft(df[i])
    fftfeatures.append('f' + i)
# Extend only after the loop so the list is not mutated while being iterated
features.extend(fftfeatures)
In [62]:
# Make the training and validation sets from the FFT features.
# traindf / validationdf were sliced out of df before the 'f*' columns were
# added to df, so they do not contain those columns. Re-slice df with the same
# window masks (train = car1 + bus1, validation = car0 + bus0) and drop NaN
# rows, mirroring how the earlier splits were built.
traindf_fft = pd.concat([df[car1], df[bus1]]).dropna()
validationdf_fft = pd.concat([df[car0], df[bus0]]).dropna()
X_train = traindf_fft[fftfeatures].values
y_train = traindf_fft['class'].values
X_test = validationdf_fft[fftfeatures].values
y_test = validationdf_fft['class'].values
In [63]:
# Configure a fresh 200-tree random forest for the FFT features (fitted in a later cell).
# A fixed random_state makes the cross-validation scores, test accuracy and
# feature importances reproducible across kernel restarts.
clf = RandomForestClassifier(n_estimators=200, random_state=0)
In [64]:
# 5-fold cross-validation of the FFT-feature classifier on the training split;
# prints the per-fold scores, then their mean and standard deviation
fold_count = 5
scores = cross_val_score(clf, X_train, y_train, cv=fold_count)
print(scores, scores.mean(), scores.std())
In [65]:
# Fit the FFT-feature forest on the training split and predict the validation labels
# (fit() returns the estimator itself, so the two calls can be chained)
predict_y = clf.fit(X_train, y_train).predict(X_test)
In [66]:
# Accuracy of the FFT-feature predictions against the true validation labels
testscore = accuracy_score(y_true=y_test, y_pred=predict_y)
print("Accuracy score on test set: %6.3f" % testscore)
In [68]:
# Print each FFT feature name next to the importance the fitted forest assigned to it
# (importances are in the same order as the columns of X_train, i.e. `fftfeatures`)
for ifeature, importance in zip(fftfeatures, clf.feature_importances_):
    print(ifeature + ': %6.4f' % importance)
In [ ]: