Time-series Prediction with Keras


In [ ]:
# Import modules
import math
import pandas as pd
import numpy as np
import datetime as datetime

# Plotly Imports
import plotly.plotly as py
import plotly.graph_objs as go
import plotly
plotly.offline.init_notebook_mode()

# Keras Imports
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense

In [ ]:
def create_dataset(dataset, look_back=1):
    """Turn a sequence of values into (window, next-value) training pairs.

    Args:
        dataset: Sliceable, indexable sequence of values (e.g. a 1-D array).
        look_back: Number of consecutive values placed in each input window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays where ``X[k]`` is
        ``dataset[k:k + look_back]`` and ``y[k]`` is ``dataset[k + look_back]``.
    """
    # The sample count is len(dataset) - look_back - 1, so the last possible
    # window/target pair is not emitted. A non-positive count yields two
    # empty arrays.
    sample_count = len(dataset) - look_back - 1
    windows = [dataset[start:start + look_back] for start in range(sample_count)]
    targets = [dataset[start + look_back] for start in range(sample_count)]
    return np.array(windows), np.array(targets)

def split_dataset(dataset, train_fraction=0.67):
    """Split a series into a leading training part and a trailing test part.

    The order of the values is preserved (no shuffling): the first
    ``int(len(dataset) * train_fraction)`` values form the training set and
    the remainder forms the test set.

    Args:
        dataset: NumPy array of values; it is cast to float32 before splitting.
        train_fraction: Share of the samples, between 0 and 1 inclusive,
            assigned to the training set. Defaults to 0.67.

    Returns:
        Tuple ``(train, test)`` of float32 arrays.

    Raises:
        ValueError: If ``train_fraction`` lies outside ``[0, 1]``.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(
            'train_fraction must be between 0 and 1, got %r' % (train_fraction,))

    dataset = dataset.astype('float32')
    train_size = int(len(dataset) * train_fraction)
    train, test = dataset[:train_size], dataset[train_size:]
    # Report the resulting sizes so the split can be sanity-checked in the notebook
    print(len(train), len(test))
    return train, test


def train_model(dataset, save_model=False, look_back=10, epochs=200,
                batch_size=2, model_path='timeseries_rolling_cpu.h5'):
    """Fit a small multilayer perceptron that predicts the next value of a series.

    The series is split with ``split_dataset`` and turned into
    (window, next-value) samples with ``create_dataset``. A network with one
    8-unit ReLU hidden layer and a single linear output is then trained with
    mean-squared-error loss and the Adam optimizer. Train and test MSE/RMSE
    are printed after fitting.

    Args:
        dataset: NumPy array holding the time series.
        save_model: When true, the fitted model is written to ``model_path``.
        look_back: Number of consecutive past values fed to the network. The
            network's input dimension equals this value, so predictions must
            be made with windows of the same length.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used while fitting.
        model_path: Destination file used when ``save_model`` is true.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    train, test = split_dataset(dataset)
    # Build supervised samples: X = `look_back` consecutive values, Y = the value after them
    trainX, trainY = create_dataset(train, look_back)
    testX, testY = create_dataset(test, look_back)

    # create and fit Multilayer Perceptron model
    model = Sequential()
    model.add(Dense(8, input_dim=look_back, activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    # NOTE(review): `nb_epoch` is the Keras 1 keyword; Keras 2 names it `epochs`.
    # Confirm which Keras version this notebook targets before changing it.
    model.fit(trainX, trainY, nb_epoch=epochs, batch_size=batch_size, verbose=2)

    # Estimate model performance
    trainScore = model.evaluate(trainX, trainY, verbose=0)
    print('Train Score: %.2f MSE (%.2f RMSE)' % (trainScore, math.sqrt(trainScore)))
    testScore = model.evaluate(testX, testY, verbose=0)
    print('Test Score: %.2f MSE (%.2f RMSE)' % (testScore, math.sqrt(testScore)))

    # Save model
    if save_model:
        model.save(model_path)

    return model

def create_data_with_rolling_avg(df, window=20, column='MEAN_MAX_AIR_TEMP'):
    """Return a column smoothed with a trailing rolling mean, as a NumPy array.

    Args:
        df: DataFrame (or data accepted by ``pd.DataFrame``) containing ``column``.
        window: Number of consecutive rows averaged together. Defaults to 20.
        column: Name of the column to smooth. Defaults to ``'MEAN_MAX_AIR_TEMP'``.

    Returns:
        1-D NumPy array of rolling means rounded to two decimals. Rows for
        which the rolling mean is NaN (including the first ``window - 1``
        rows, which lack a full window) are dropped.
    """
    series = pd.DataFrame(df, columns=[column])[column]
    rolling_mean = series.rolling(window=window, center=False).mean()
    # Round first, then drop the NaN rows (incomplete windows at the start)
    return np.round(rolling_mean, 2).dropna().values

def predict_from_model(model, train, test, look_back=10):
    """Predict on the train and test parts and align the results for plotting.

    Both returned arrays have length ``len(train) + len(test)`` and are filled
    with NaN except where a prediction exists. Each prediction is stored at
    the position of the value it is meant to predict. The length is derived
    from the arguments, so the function does not depend on any notebook-level
    variable.

    Args:
        model: Fitted model exposing ``predict(X)`` that returns a 2-D array
            with one prediction per row in column 0.
        train: Training part of the series (as returned by ``split_dataset``).
        test: Test part of the series, directly following ``train``.
        look_back: Window length used to build the model inputs; must match
            the input size the model expects.

    Returns:
        Tuple ``(trainPredictPlot, testPredictPlot)`` of 1-D NumPy arrays.
    """
    # Only the input windows are needed here; the targets are discarded
    trainX, _ = create_dataset(train, look_back)
    testX, _ = create_dataset(test, look_back)
    trainPredict = model.predict(trainX)
    testPredict = model.predict(testX)

    total_length = len(train) + len(test)

    # The first prediction corresponds to index `look_back` of the train part
    trainPredictPlot = np.full(total_length, np.nan)
    trainPredictPlot[look_back:look_back + len(trainPredict)] = trainPredict[:, 0]

    # Test predictions start `look_back` values into the test part
    testPredictPlot = np.full(total_length, np.nan)
    test_start = len(train) + look_back
    testPredictPlot[test_start:test_start + len(testPredict)] = testPredict[:, 0]
    return trainPredictPlot, testPredictPlot

def _actual_vs_predicted_traces(observed, predicted, predicted_name):
    """Build the pair of filled Scatter traces: observed series and predictions."""
    return [
        go.Scatter(
            y=observed,
            name='Max Temp',
            opacity=0.7,
            fill='tozeroy'
        ),
        go.Scatter(
            y=predicted,
            name=predicted_name,
            opacity=0.7,
            fill='tozeroy'
        )
    ]


def plot_all(model, train, test, dataset=None):
    """Render two offline plotly charts comparing predictions with the series.

    The first chart overlays the training predictions on the observed series,
    the second overlays the test predictions.

    Args:
        model: Fitted model passed on to ``predict_from_model``.
        train: Training part of the series.
        test: Test part of the series, directly following ``train``.
        dataset: Optional full observed series to draw. When omitted, it is
            rebuilt by concatenating ``train`` and ``test``, so the function
            does not rely on a notebook-level ``dataset`` variable.
    """
    trainPredictPlot, testPredictPlot = predict_from_model(model, train, test)
    if dataset is None:
        dataset = np.concatenate((train, test))

    charts = (
        (trainPredictPlot, 'Training Prediction Max Temp'),
        (testPredictPlot, 'Test Prediction Max Temp'),
    )
    for predicted, predicted_name in charts:
        plotly.offline.iplot(
            _actual_vs_predicted_traces(dataset, predicted, predicted_name))

In [ ]:
df = pd.read_csv('canadian_5_day_avg_daily.dat')

# NOTE(review): timestamps are treated as UTC and converted to America/Santiago,
# although the file name suggests Canadian data. Confirm the intended time zone.

# Calculate local Start Time
df.index = pd.DatetimeIndex(pd.to_datetime(df["START_DATE"]), tz="UTC")
df["local_starttime"]=df.index.tz_convert("America/Santiago")

# Calculate local End Time (the END_DATE-based index stays in place afterwards)
df.index = pd.DatetimeIndex(pd.to_datetime(df["END_DATE"]), tz="UTC")
df["local_endtime"]=df.index.tz_convert("America/Santiago")

# Set Start and End Date: the local calendar day at midnight, without time zone.
# Computed column-wise so the result does not depend on index labels being
# unique and no per-row Python loop is needed.
df['start_date'] = df["local_starttime"].dt.tz_localize(None).dt.normalize()
df['end_date'] = df["local_endtime"].dt.tz_localize(None).dt.normalize()

# Replace invalid values (-999) with 0
# NOTE(review): 0 is also a plausible real temperature; NaN may be a safer
# marker for missing readings. Confirm before changing.
temperature_columns = ['MEAN_AVG_AIR_TEMP', 'MEAN_MAX_AIR_TEMP', 'MEAN_MIN_AIR_TEMP']
df[temperature_columns] = df[temperature_columns].replace(-999, 0)
df.head()

Load the Saved Keras Model


In [ ]:
# Load the pre-trained network from disk; this also verifies that the saved
# model file can be read. `load_model` comes from the imports cell at the top.
saved_model = load_model('timeseries_rolling_cpu.h5')

Predict Time-Series with Model


In [ ]:
# Use the unsmoothed MEAN_MAX_AIR_TEMP column as the series, then split it
# into train and test parts.
pdf = pd.DataFrame(df, columns=['MEAN_MAX_AIR_TEMP'])
dataset = pdf['MEAN_MAX_AIR_TEMP'].values # raw values; a later cell builds the rolling-average variant
train, test = split_dataset(dataset)

In [ ]:
# Chart the loaded model's train and test predictions for the series prepared in the previous cell
plot_all(saved_model, train, test)

Predict Time-Series with Rolling Avg and Keras Model


In [ ]:
# Rebuild the series with create_data_with_rolling_avg (rounded rolling mean
# of MEAN_MAX_AIR_TEMP) and split it again. This overwrites the notebook-level
# `pdf`, `dataset`, `train` and `test` from the raw-series cell.
pdf = pd.DataFrame(df, columns=['MEAN_MAX_AIR_TEMP'])
dataset = create_data_with_rolling_avg(pdf)
train, test = split_dataset(dataset)

In [ ]:
# Chart the loaded model's train and test predictions for the rolling-average series
plot_all(saved_model, train, test)

Now you try training the Keras Model


In [ ]:
# Training may take a while if you don't have an NVIDIA CUDA-enabled GPU.
# `dataset` is whatever the most recently run preparation cell produced.
# Keep a reference to the fitted model so it can be reused in later cells.
trained_model = train_model(dataset)

In [ ]: