We used one of the datasets in the Numenta Anomaly Benchmark (NAB) for this demo, i.e. the NYC taxi passengers dataset, which contains 10320 records, each indicating the total number of taxi passengers in NYC at a corresponding time spot.
In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
# plot the predicted values and actual values (for the test data)
def plot_result(test_df, pred_df, dt_col="timestamp", value_col="value", past_seq_len=1):
    """Plot predicted vs. actual values for the test data on one figure.

    The first `past_seq_len` rows of `test_df` were consumed as model
    input and therefore have no prediction, so they are skipped.
    """
    predicted = pred_df[value_col].values
    actual = test_df[value_col].values[past_seq_len:]
    fig, ax = plt.subplots(figsize=(12, 5))
    ax.plot(pred_df[dt_col], predicted, color='red', label='predicted values')
    ax.plot(test_df[dt_col][past_seq_len:], actual, color='blue', label='actual values')
    ax.set_title('the predicted values and actual values (for the test data)')
    plt.xlabel(dt_col)
    plt.xticks(rotation=45)
    plt.ylabel('number of taxi passengers')
    plt.legend(loc='upper left')
    plt.show()
In [3]:
# plot results of multi step forecasting (at most five steps, for better view)
def plot_less_five_step_result(test_df, pred_df, dt_col="timestamp", value_col="value", past_seq_len=1):
    """Plot multi-step forecast results against the actual values.

    Only the first five forecast steps (at most) are drawn for readability.
    Step i is shifted forward by i sampling intervals so that each curve
    lines up with the time it actually predicts.
    """
    fig, ax = plt.subplots(figsize=(12, 5))
    actual = test_df[value_col].values[past_seq_len:]
    ax.plot(test_df[dt_col][past_seq_len:], actual, color='blue', label='actual values')
    # one "<value_col>_<i>" column per forecast step; cap at five steps
    n_steps = min(pred_df.shape[1] - 1, 5)
    step_cols = ["{}_{}".format(value_col, i) for i in range(n_steps)]
    # sampling interval, taken from the first two prediction timestamps
    time_delta = pred_df[dt_col][1] - pred_df[dt_col][0]
    colors = ["g", "r", "c", "m", "y"]
    for step, col in enumerate(step_cols):
        ax.plot(pred_df[dt_col].values + time_delta * step,
                pred_df[col].values,
                color=colors[step],
                label='predicted values' + str(step))
    ax.set_title('the predicted values and actual values (for the test data)')
    plt.xlabel(dt_col)
    plt.xticks(rotation=45)
    plt.ylabel('number of taxi passengers')
    plt.legend(loc='upper left')
    plt.show()
In [4]:
# plot results of multi step forecasting: only the first and the last step
def plot_first_last_step_result(test_df, pred_df, dt_col="timestamp", value_col="value", past_seq_len=1):
    """Plot only the first and last forecast step against the actual values.

    The last step is shifted forward by (n_steps - 1) sampling intervals
    so that it lines up with the time it actually predicts.
    """
    fig, ax = plt.subplots(figsize=(12, 5))
    actual = test_df[value_col].values[past_seq_len:]
    ax.plot(test_df[dt_col][past_seq_len:], actual, color='blue', label='actual values')
    # number of forecast steps = all columns except the datetime column
    n_steps = pred_df.shape[1] - 1
    # sampling interval, taken from the first two prediction timestamps
    time_delta = pred_df[dt_col][1] - pred_df[dt_col][0]
    base_dt = pred_df[dt_col].values
    first_col = "{}_{}".format(value_col, 0)
    last_col = "{}_{}".format(value_col, n_steps - 1)
    ax.plot(base_dt, pred_df[first_col].values,
            color="g", label='first predicted values')
    ax.plot(base_dt + time_delta * (n_steps - 1), pred_df[last_col].values,
            color="r", label='last predicted values')
    ax.set_title('the predicted values and actual values (for the test data)')
    plt.xlabel(dt_col)
    plt.xticks(rotation=45)
    plt.ylabel('number of taxi passengers')
    plt.legend(loc='upper left')
    plt.show()
In [5]:
import os
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
%pylab inline
import matplotlib.dates as md
from matplotlib import pyplot as plt
Now we download the dataset and load it into a pandas dataframe.
Run the script $ANALYTICS_ZOO_HOME/bin/data/NAB/nyc_taxi/get_nyc_taxi.sh to download the raw data
In [6]:
# load nyc taxi data
try:
    zoo_home = os.getenv("ANALYTICS_ZOO_HOME")
    if zoo_home is None:
        # os.getenv returns None when unset; fail with a clear message instead
        # of an opaque TypeError from None + str concatenation
        raise RuntimeError("environment variable ANALYTICS_ZOO_HOME is not set")
    dataset_path = zoo_home + "/bin/data/NAB/nyc_taxi/nyc_taxi.csv"
    raw_df = pd.read_csv(dataset_path)
except Exception as e:
    # surface the real failure instead of silently swallowing it
    print("Failed to load nyc_taxi.csv:", e)
    print("you can run $ANALYTICS_ZOO_HOME/bin/data/NAB/nyc_taxi/get_nyc_taxi.sh to download nyc_taxi.csv")
Below are some example records of the data
In [7]:
raw_df.head(5)
Out[7]:
Convert string timestamp to TimeStamp
In [8]:
# parse the string timestamps once, then assemble the working frame
timestamps = pd.to_datetime(raw_df["timestamp"])
df = pd.DataFrame({"timestamp": timestamps, "value": raw_df["value"]})
df.head()
Out[8]:
You can use train_val_test_split to split the whole dataset into train/val/test sets. There will be two columns in the output dataframe: "timestamp" and "value", where the data type of "timestamp" column is datetime64.
In [9]:
from zoo.automl.common.util import train_val_test_split
train_df, val_df, test_df = train_val_test_split(df, val_ratio=0.1, test_ratio=0.1)
In [10]:
train_df.describe()
Out[10]:
In [11]:
train_df.head()
Out[11]:
In [12]:
# shape of the dataframe
print("The shape of train_df is", train_df.shape)
print("The shape of val_df is", val_df.shape)
print("The shape of test_df is", test_df.shape)
In [13]:
# visualisation of the taxi passenger counts over time in train_df
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()  # let matplotlib handle pandas datetime64 values
fig, ax = plt.subplots(figsize=(12, 5))
# a label is required here, otherwise plt.legend() below has no handles to show
ax.plot(train_df['timestamp'], train_df['value'],
        color='blue', linewidth=0.6, label='number of taxi passengers')
ax.set_title('NYC taxi passengers throughout time')
plt.xlabel('timestamp')
plt.xticks(rotation=45)
plt.ylabel('The Number of NYC taxi passengers')
plt.legend(loc='upper left')
plt.show()
You can use Analytics Zoo AutoML to predict time series data by simply defining a TimeSequencePredictor.
We use feature tools to generate features from the given datetime. The generated features are ['HOUR', 'DAY', 'MONTH', 'IS_AWAKE', 'IS_BUSY_HOURS']. Our feature space comprises these generated features as well as the original inputs such as ['datetime', 'value', 'extra_features'].
Currently, we use an RNN to learn from 50 previous values and predict just the 1 next value. You can specify the sequence length to predict while creating the TimeSequencePredictor with the arg: future_seq_len.
In [14]:
# build time sequence predictor
from zoo.automl.regression.time_sequence_predictor import *
# you need to specify the name of the datetime column and the target column;
# the default names are "timestamp" and "value" respectively.
tsp = TimeSequencePredictor(dt_col="timestamp",
                            target_col="value",
                            extra_features_col=None)
In [15]:
from zoo import init_spark_on_local
from zoo.ray import RayContext
sc = init_spark_on_local(cores=4)
ray_ctx = RayContext(sc=sc, object_store_memory="1g")
ray_ctx.init()
Out[15]:
In [16]:
%%time
# fit train_df and validate with val_df; return the best trial as a pipeline.
# The default recipe is SmokeRecipe, which runs one epoch and one iteration with only 1 random sample.
# You can change the recipe via the `recipe` argument of `fit`. Available recipes are
# SmokeRecipe, RandomRecipe, LSTMGridRandomRecipe, GridRandomRecipe and BayesRecipe.
look_back_single = 5
pipeline = tsp.fit(train_df,
                   validation_df=val_df,
                   metric="mse",
                   recipe=LSTMGridRandomRecipe(
                       num_rand_samples=1,
                       epochs=2,
                       look_back=look_back_single,
                       batch_size=[64]))
print("Training completed.")
In [17]:
# predict test_df with the best trial
pred_df = pipeline.predict(test_df)
In [18]:
pred_df.head(5)
Out[18]:
In [19]:
# prediction value start from look_back_single
test_df[look_back_single:look_back_single+5]
Out[19]:
In [21]:
# plot the predicted values and actual values
plot_result(test_df, pred_df,past_seq_len=look_back_single)
In [22]:
# evaluate test_df
mse, smape = pipeline.evaluate(test_df, metrics=["mse", "smape"])
print("Evaluate: the mean square error is", mse)
print("Evaluate: the smape value is", smape)
We provide save and restore interface to save the pipeline with the best trial for easily rebuilding.
In [23]:
# save the pipeline with best trial
pipeline.save("/tmp/saved_pipeline/my.ppl")
Out[23]:
In [24]:
from zoo.automl.pipeline.time_sequence import load_ts_pipeline
new_pipeline = load_ts_pipeline("/tmp/saved_pipeline/my.ppl")
In [25]:
# you can do predict and evaluate again
# we use test_df as input in order to compare results before and after restoration
new_pred = new_pipeline.predict(test_df)
In [26]:
new_pred.head(5)
Out[26]:
In [27]:
# evaluate test_df
mse, smape = new_pipeline.evaluate(test_df, metrics=["mse", "smape"])
print("Evaluate: the mean square error is", mse)
print("Evaluate: the smape value is", smape)
We support continuing training with incremental data, using the best configuration searched and the trained model.
In [28]:
# review the initialization information if needed
new_pipeline.describe()
In [29]:
# Use val_df as incremental data
new_pipeline.fit(val_df,epoch_num=5)
In [30]:
# predict results of test_df
new_pred_df = new_pipeline.predict(test_df)
plot_result(test_df, new_pred_df,past_seq_len = look_back_single)
In [31]:
# evaluate test_df
mse, smape = new_pipeline.evaluate(test_df, metrics=["mse", "smape"])
print("Evaluate: the mean square error is", mse)
print("Evaluate: the smape value is", smape)
You can do multi step forecasting by simply changing the future_seq_len option while creating a new TimeSequencePredictor object.
In [79]:
# build time sequence predictor
from zoo.automl.regression.time_sequence_predictor import *
# change future_seq_len to the number of steps you want to forecast.
tsp = TimeSequencePredictor(future_seq_len=5,
                            dt_col="timestamp",
                            target_col="value",
                            extra_features_col=None)
In [80]:
%%time
# you can specify the look back sequence length with a single number or a range of (min_len, max_len) in recipe.
look_back_multi = 10
pipeline = tsp.fit(train_df,
validation_df=val_df,
metric="mse",
recipe=LSTMGridRandomRecipe(
num_rand_samples=3,
epochs=2,
look_back=10,
training_iteration=look_back_multi,
batch_size=[64]))
print("Training completed.")
In [89]:
# test
# predict test_df with the best trial
pred_df = pipeline.predict(test_df)
In [90]:
pred_df.head(5)
Out[90]:
In [91]:
# plot multi step predicted values and actual values
# plot at most five step predict values for better view
plot_less_five_step_result(test_df, pred_df,past_seq_len=look_back_multi)
In [92]:
# plot only the first and the last step predict values and actual values
plot_first_last_step_result(test_df, pred_df, past_seq_len=look_back_multi)
In [93]:
# evaluate test_df
mse, smape = pipeline.evaluate(test_df, metrics=["mse", "smape"])
print("Evaluate: the mean square error is", mse)
print("Evaluate: the smape value is", smape)
In [94]:
ray_ctx.stop()
In [ ]: