This example shows how to use tsfresh to extract useful features from multiple time series and use them to improve classification performance.
In [1]:
%matplotlib inline
import matplotlib.pylab as plt
from tsfresh.examples.har_dataset import download_har_dataset, load_har_dataset, load_har_classes
import seaborn as sns
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import FeatureExtractionSettings
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
The dataset consists of time series for 7352 accelerometer readings. Each reading represents an accelerometer recording of 2.56 sec at 50 Hz (for a total of 128 samples per reading). Furthermore, each reading corresponds to one of six activities (walking, walking upstairs, walking downstairs, sitting, standing and laying).
For more information, or to fetch the dataset, go to https://archive.ics.uci.edu/ml/datasets/Human+Activity+Recognition+Using+Smartphones
In [2]:
# Fetch the Human Activity Recognition (HAR) dataset from the UCI repository
# so that the loader calls in the following cells can read it.
download_har_dataset()
In [3]:
# Load the accelerometer readings into a DataFrame.
df = load_har_dataset()
# Only the last expression of a cell is rendered, so a bare `df.head()` in the
# middle of the cell would be silently discarded. Print the shape and let the
# preview be the cell's displayed output.
print(df.shape)
df.head()
Out[3]:
In [16]:
# Plot the first reading (row label 0).
# `.loc` replaces `.ix`, which is deprecated and was removed in pandas 1.0;
# with an integer index `.ix` was label-based, so `.loc` selects the same row.
fig, ax = plt.subplots()
ax.plot(df.loc[0, :])
ax.set_title('accelerometer reading')
ax.set_xlabel('sample index')
ax.set_ylabel('accelerometer value')
plt.show()
In [5]:
# Start from tsfresh's default feature-extraction settings and register
# `impute` as the imputing function, so that Inf/NaN values among the
# extracted features are filled in.
extraction_settings = FeatureExtractionSettings()
extraction_settings.IMPUTE = impute # Fill in Infs and NaNs
In [6]:
# tsfresh reads time series data column-wise rather than row-wise, so work
# on a transposed copy of the data (the original `df` is left untouched).
df_t = df.copy().T
df_t.shape
Out[6]:
In [7]:
# Rearrange the sensor readings into a "long" layout: one row per sample,
# with column 0 holding the sensor value and column 1 holding the id of the
# reading that the sample belongs to.
# Only the first 500 readings are used, to save time.
n_readings = 500
readings = df_t[list(range(n_readings))].values  # one column per reading
samples_per_reading = readings.shape[0]

# Build the whole frame in a single step instead of re-stacking it with
# np.vstack on every loop iteration, which copies all previously stacked
# rows each time (quadratic work). The result is the same as before: a
# float64 frame with integer column labels 0 (value) and 1 (id).
master_df = pd.DataFrame(
    np.column_stack([
        readings.T.ravel(),  # all samples of reading 0, then reading 1, ...
        np.repeat(np.arange(n_readings), samples_per_reading),
    ])
)
print(master_df.shape)
master_df.head()
Out[7]:
In [8]:
# Extract features from the long-format frame, grouping samples by the
# reading id stored in column 1 and using the settings configured earlier.
# `%time` reports how long the extraction takes.
%time X = extract_features(master_df, column_id=1, feature_extraction_settings=extraction_settings);
In [9]:
# Inspect the size of the extracted feature matrix:
# 206 features are extracted for each reading
X.shape
Out[9]:
In [10]:
# Load the activity class labels and keep the first 500 of them, matching
# the 500 readings that were used for feature extraction.
y = load_har_classes()[:500]
In [11]:
# Hold out 20% of the readings for testing. The seed is fixed so that the
# split, and therefore the reported scores, are reproducible on re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
In [12]:
# Train a decision tree on the extracted tsfresh features and report
# per-class precision/recall on the held-out readings.
# The tree is seeded because scikit-learn permutes the candidate features at
# each split, so an unseeded tree can differ from run to run.
cl = DecisionTreeClassifier(random_state=42)
cl.fit(X_train, y_train)
print(classification_report(y_test, cl.predict(X_test)))
In [13]:
# Baseline feature matrix: the raw samples of the first 500 readings.
# `.loc` replaces `.ix`, which is deprecated and was removed in pandas 1.0;
# like `.ix` on an integer index, the label slice `:499` includes row 499.
X_1 = df.loc[:499, :]
X_1.shape
Out[13]:
In [14]:
# Hold out 20% of the raw readings for testing. The seed is fixed so that
# the split, and therefore the reported scores, are reproducible on re-run.
X_train, X_test, y_train, y_test = train_test_split(X_1, y, test_size=.2, random_state=42)
In [15]:
# Train a decision tree directly on the raw samples as a baseline and report
# per-class precision/recall on the held-out readings.
# The tree is seeded because scikit-learn permutes the candidate features at
# each split, so an unseeded tree can differ from run to run.
cl = DecisionTreeClassifier(random_state=42)
cl.fit(X_train, y_train)
print(classification_report(y_test, cl.predict(X_test)))
In [ ]: