AutoML service


In [1]:
%matplotlib inline
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pprint
import requests
import seaborn as sns
from sklearn.metrics import roc_auc_score
import tsfresh

from tsfresh.examples.har_dataset import download_har_dataset, load_har_dataset, load_har_classes
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import logging



Start service


In [2]:
# python automl_service:app
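
The service must already be running in a separate shell before the cells below will work. For orientation, here is a minimal sketch of the kind of Flask app behind it (a hypothetical skeleton; only the endpoint names and port are taken from the requests made later in this notebook):


In [ ]:
# Hypothetical skeleton of the Flask service, for orientation only; the
# real logic lives in automl_service.
from flask import Flask, request, jsonify

app = Flask(__name__)
models = {}  # modelId -> fitted pipeline and training metadata

@app.route('/train_pipeline', methods=['POST'])
def train_pipeline():
    # 'raw_data', 'labels', and 'params' arrive as multipart file uploads.
    uploads = {name: request.files[name] for name in ('raw_data', 'labels', 'params')}
    # ... extract tsfresh features, run a TPOT search, store the winner ...
    return jsonify('{}')

@app.route('/models', methods=['GET'])
def list_models():
    # Returns metadata (including evaluated_models) for each trained pipeline.
    return jsonify('{}')

@app.route('/serve_prediction', methods=['POST'])
def serve_prediction():
    # Featurizes the uploaded 'raw_data' and scores it with a stored pipeline.
    return jsonify('{}')

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)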

Prep


In [3]:
c = sns.color_palette()

In [4]:
def get_model_results(evaluated_individuals):
    """Flatten the evaluated-models dict into [model_type, run_id, auc] rows."""
    tpot_results = []
    for i, (k, (steps, auc)) in enumerate(evaluated_individuals.items()):
        model_type = k.split('(')[0]  # pipeline string starts with the estimator name
        tpot_results.append([model_type, i, auc])
    return tpot_results
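
The structure this function expects, as a toy illustration (keys are TPOT pipeline strings, values are (steps, auc) pairs; the values here are invented):

In [ ]:
# Toy input with invented values; real keys are full TPOT pipeline strings.
example = {'GaussianNB(input_matrix)': ('GaussianNB', 0.91),
           'DecisionTreeClassifier(input_matrix)': ('DecisionTreeClassifier', 0.87)}
get_model_results(example)
# -> [['GaussianNB', 0, 0.91], ['DecisionTreeClassifier', 1, 0.87]]
#    (dict iteration order, and hence the run index, may vary)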

In [5]:
params = {'legend.fontsize': 'x-large',
          'figure.figsize': (15, 5),
          'axes.labelsize': 'x-large',
          'axes.titlesize': 'x-large',
          'xtick.labelsize': 'x-large',
          'ytick.labelsize': 'x-large'}
plt.rcParams.update(params)

In [6]:
label_train = pd.read_json('data/label_train.json')
df = pd.read_json('data/data_train.json')
df.index = df.index.astype(int)
x_train = df.groupby('example_id').sum()

Show raw time series data


In [7]:
for i, (_, _df) in enumerate(df[df.example_id.isin([0, 1])].groupby('example_id')):
    plt.figure()
    _df.index = _df.index % 128  # align each reading to samples 0-127
    _df.sort_index().measurement.plot(linewidth=2, color=c[0])
    plt.ylabel('Amplitude')
    plt.xlabel('Sample')
    if i == 0:
        plt.title('Class 0: Raw time series data examples', size=24)



In [8]:
for i, (_, _df) in enumerate(df[df.example_id.isin([1246, 1248])].groupby('example_id')):
    plt.figure()
    _df.index = _df.index % 128  # align each reading to samples 0-127
    _df.sort_index().measurement.plot(linewidth=2, color=c[2])
    plt.ylabel('Amplitude')
    plt.xlabel('Sample')
    if i == 0:
        plt.title('Class 1: Raw time series data examples', size=24)


Use the model serving API

Train Model


In [13]:
train_url = 'http://0.0.0.0:8080/train_pipeline'
train_files = {'raw_data': open('data/data_train.json', 'rb'),
               'labels'  : open('data/label_train.json', 'rb'),
               'params'  : open('parameters/train_parameters_model2.yml', 'rb')}

In [14]:
r_train = requests.post(train_url, files=train_files)
train_result = json.loads(r_train.json())  # dict describing the trained pipeline

In [15]:
r = requests.get('http://0.0.0.0:8080/models')
pipelines = json.loads(r.json())

automl_experiments = get_model_results(pipelines['2']['evaluated_models'])
del train_result['evaluated_models']  # too long to print out
pprint.pprint(train_result)


{u'featureEngParams': {u'column_id': u'example_id',
                       u'default_fc_parameters': u"['median', 'minimum', 'standard_deviation', 'sum_values', 'variance', 'maximum', 'length', 'mean']",
                       u'impute_function': u'impute'},
 u'mean_cv_accuracy': 0.8712402361520161,
 u'mean_cv_roc_auc': 0.949478607623132,
 u'modelId': 2,
 u'modelType': u"Pipeline(memory=None,\n     steps=[('stackingestimator', StackingEstimator(estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,\n              learning_rate=0.1, loss='deviance', max_depth=1,\n              max_features=0.1, max_leaf_nodes=None,\n              min_impurity_decrease=0.0, min_impurity_split=Non...bsample=0.75, verbose=0,\n              warm_start=False))), ('gaussiannb', GaussianNB(priors=None))])",
 u'trainShape': [1647, 8],
 u'trainTime': 12.85513710975647}

In [17]:
automl_experiments = pd.DataFrame(automl_experiments, columns=['model', 'id', 'auc']).sort_values('model')
sns.set_style(style='darkgrid')
f, ax = plt.subplots(figsize=(10, 10))
box = automl_experiments.boxplot(column='auc', by='model', rot=0, vert=False,
                                 ax=ax, patch_artist=True, return_type='dict',
                                 widths=0.8)
ax.grid(axis='y')
ax.set_title('AUC range by model type', size=20)
plt.suptitle('')

for b in box['auc']['boxes']:
    color = sns.color_palette()[4]
    b.set(color=color, linewidth=2)
    b.set(facecolor=color, linewidth=2)

for median in box['auc']['medians']:
    median.set(color='grey', linewidth=3)

plt.xlim(0.8)


Out[17]:
(0.8, 0.88051082251082247)

Serve model prediction


In [18]:
serve_url = 'http://0.0.0.0:8080/serve_prediction'
test_files = {'raw_data': open('data/data_test.json', 'rb'),
              'params' : open('parameters/test_parameters_model2.yml', 'rb')}

In [19]:
r_test  = requests.post(serve_url, files=test_files)
result = pd.read_json(r_test.json()).set_index('id')

In [20]:
result.head()


Out[20]:
        score
id
8    0.000779
15   1.000000
61   0.000000
559  1.000000
560  0.000000

In [21]:
label_test = pd.read_json('data/label_test.json')
result.index = result.index.astype(int)
result = result.loc[label_test.example_id]
auc = roc_auc_score(label_test.label, result.score)

print("AUC: {:1.2f}".format(auc))


AUC: 0.92

Appendix

Build an AutoML Pipeline

Load Data

The dataset consists of 7352 accelerometer readings. Each reading covers 2.56 seconds sampled at 50 Hz, for a total of 128 samples per reading, and corresponds to one of six activities (walking, walking upstairs, walking downstairs, sitting, standing, and laying). We use only two of the labels to create a binary classification problem for demonstration purposes.

The dataset is available here: https://archive.ics.uci.edu/ml/datasets/Human+Activity+Recognition+Using+Smartphones


In [ ]:
# fetch dataset from uci
download_har_dataset()

In [ ]:
# load data
df = load_har_dataset()
y = load_har_classes()

# binary classification
class1, class2 = 2, 3
two_classes = (y==class1) | (y==class2)
df = df[two_classes]
y = y[two_classes]
# relabel the two classes as 0 and 1
y[y==class1] = 0
y[y==class2] = 1
df = df.reset_index(drop=True)
y = y.reset_index(drop=True)
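
A quick sanity check against the description above (the expected shape assumes the standard UCI layout, and that y loads as a pandas Series as in the cell above):

In [ ]:
# After the binary subset, each row is still one 128-sample reading
# (2.56 s at 50 Hz); only classes 2 and 3, relabeled 0/1, remain.
print(df.shape)          # (n_readings_in_two_classes, 128)
print(y.value_counts())  # counts for labels 0 and 1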

In [ ]:
df.loc[0].plot()
plt.xlabel('Samples')
plt.ylabel('Value');

Prep data for feature building

We need to get the data into the long format required by tsfresh:

If there are 100 examples, each with 50 samples, we need to go from a (100, 50) dataframe to a (100*50, 2) dataframe as follows:

measurement|example_id
----------------------
0.5235     |0
0.4284     |0
0.9042     |0
...
0.9042     |99

See the tsfresh docs for more details


In [ ]:
# values: stack the wide (n_examples, 128) frame into one long column
v = df.stack().values

# ids: repeat each example's id once per sample (128 samples per reading)
ids = []
for i in range(len(y)):
    ids.extend(128 * [i])
ids = np.array(ids)

master_df = pd.DataFrame(v, columns=['measurement'])
master_df['example_id'] = ids
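
Equivalently, the id column can be built without the Python loop; a vectorized sketch under the same 128-samples-per-example assumption:

In [ ]:
# Vectorized equivalent of the loop above: repeat each example id 128 times.
ids = np.repeat(np.arange(len(y)), 128)
master_df = pd.DataFrame({'measurement': df.stack().values,
                          'example_id': ids})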

Build Train/Test Set


In [ ]:
# build label dataframe
label_df = pd.DataFrame(y.reset_index(drop=True))\
    .reset_index()
label_df.columns = ['example_id', 'label']

# split into training and test 
train_id, test_id = train_test_split(label_df.example_id, random_state=43, test_size=0.2)
train_id = pd.DataFrame(train_id)
test_id  = pd.DataFrame(test_id)

data_train = master_df.merge(train_id, on='example_id')
data_test = master_df.merge(test_id, on='example_id')
print(float(data_train.shape[0]) / (data_train.shape[0] + data_test.shape[0]))  # fraction of rows in train

label_train = label_df.merge(train_id, on='example_id')
label_test = label_df.merge(test_id, on='example_id')

Build a model


In [ ]:
%%time

extraction_settings = MinimalFCParameters()

X_train = extract_features(data_train, column_id='example_id',
                           impute_function=impute, default_fc_parameters=extraction_settings)
X_test = extract_features(data_test, column_id='example_id',
                          impute_function=impute, default_fc_parameters=extraction_settings)
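
With MinimalFCParameters the extracted columns are simple per-series aggregates named <input column>__<feature>. The eight features reported by the service earlier give a sense of the set (exact contents depend on the tsfresh version):

In [ ]:
# Peek at the extracted feature names.
list(X_train.columns)
# e.g. ['measurement__sum_values', 'measurement__median', 'measurement__mean',
#       'measurement__length', 'measurement__standard_deviation',
#       'measurement__variance', 'measurement__maximum', 'measurement__minimum']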

In [ ]:
from tpot import TPOTClassifier
from sklearn.metrics import roc_auc_score
tpot = TPOTClassifier(generations=5, population_size=20, max_time_mins=0.2)
tpot.fit(X_train, label_train.label)
roc_auc_score(label_test.label, tpot.predict_proba(X_test)[:,1])
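
TPOT can also export the winning pipeline as a standalone sklearn script, which makes it easy to inspect or reuse what the search found (the filename here is arbitrary):

In [ ]:
# Write the best pipeline out as plain sklearn code; the fitted pipeline
# object itself is also available as tpot.fitted_pipeline_.
tpot.export('tpot_best_pipeline.py')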

Compare to Random Forest (baseline)


In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

cl = RandomForestClassifier(n_estimators=100, n_jobs=-1)
cl.fit(X_train, label_train.label)
print(roc_auc_score(label_test.label, cl.predict_proba(X_test)[:, 1]))  # held-out AUC

# 5-fold cross-validation on the training set
scoring = ['roc_auc', 'accuracy']
cv = cross_validate(cl, X_train, label_train.label, cv=5, scoring=scoring)
mean_accuracy = cv['test_accuracy'].mean()
mean_roc_auc = cv['test_roc_auc'].mean()
mean_accuracy, mean_roc_auc

In [ ]:
def plot_importances(cl, column_names, n_features=10, ax=None, error_bars=True):
    """Plot the top n_features importances of a fitted forest, with
    inter-tree standard deviations as error bars."""
    df_imp = pd.DataFrame({'features': column_names,
                           'importances': cl.feature_importances_})
    errors = np.std([tree.feature_importances_ for tree in cl.estimators_], axis=0)
    df_imp_sub = df_imp.set_index('features').sort_values('importances').tail(n_features)
    if error_bars:
        df_errors = pd.DataFrame({'features': column_names,
                                  'importances': errors})
        df_err_sub = df_errors.set_index('features').loc[df_imp_sub.index]
    else:
        df_err_sub = None
    ax = df_imp_sub.plot(kind='barh', width=.7, legend=False, ax=ax, xerr=df_err_sub, ecolor='g')
    for i, t in enumerate(df_imp_sub.index.tolist()):
        label = ax.text(0.001, i - .06, t)  # write the feature name inside its bar
        label.set_bbox(dict(facecolor='white', alpha=0.4, edgecolor='grey'))
    ax.yaxis.set_major_formatter(plt.NullFormatter())
    ax.set_title('Feature Importances')
    ax.set_xlim(0.0)
    ax.set_xlabel('importance')
    return df_imp_sub

plot_importances(cl, X_train.columns);

Export Data

Save training/testing data so we can build and test the AutoML Flask service


In [ ]:
# import os
# output_dir = 'data'
# data_train.to_json(os.path.join(output_dir, 'data_train.json'))
# data_test.to_json(os.path.join(output_dir, 'data_test.json'))
# label_train.to_json(os.path.join(output_dir, 'label_train.json'))
# label_test.to_json(os.path.join(output_dir, 'label_test.json'))