In [14]:
from math import sqrt
from numpy import concatenate
from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM


Using TensorFlow backend.

Data Preprocessing


In [10]:
# Data preparation

from pandas import read_csv
from datetime import datetime
# load data
def parse(x):
    return datetime.strptime(x, '%Y %m %d %H')
dataset = read_csv('weatherdata.csv', parse_dates=[['year', 'month', 'day', 'hour']], index_col=0, date_parser=parse)
dataset.drop('No', axis=1, inplace=True)
# manually specify column names
dataset.columns = ['pollution', 'dew', 'temp', 'press', 'wnd_dir', 'wnd_spd', 'snow', 'rain']
dataset.index.name = 'date'
# mark all NA values with 0
dataset['pollution'].fillna(0, inplace=True)
# drop the first 24 hours
dataset = dataset[24:]
# summarize first 5 rows
print(dataset.head(5))
# save to file
dataset.to_csv('pollution.csv')


                     pollution  dew  temp   press wnd_dir  wnd_spd  snow  rain
date                                                                          
2010-01-02 00:00:00      129.0  -16  -4.0  1020.0      SE     1.79     0     0
2010-01-02 01:00:00      148.0  -15  -4.0  1020.0      SE     2.68     0     0
2010-01-02 02:00:00      159.0  -11  -5.0  1021.0      SE     3.57     0     0
2010-01-02 03:00:00      181.0   -7  -5.0  1022.0      SE     5.36     1     0
2010-01-02 04:00:00      138.0   -7  -5.0  1022.0      SE     6.25     2     0

In [11]:
from pandas import read_csv
from matplotlib import pyplot
# load dataset
dataset = read_csv('pollution.csv', header=0, index_col=0)
values = dataset.values
# specify columns to plot
groups = [0, 1, 2, 3, 5, 6, 7]
i = 1
# plot each column
pyplot.figure()
for group in groups:
    pyplot.subplot(len(groups), 1, i)
    pyplot.plot(values[:, group])
    pyplot.title(dataset.columns[group], y=0.5, loc='right')
    i += 1
pyplot.show()


Data Preparation for LSTM

  1. Frame the problem as a supervised learning problem
  2. Normalize the input variables (neural networks struggle with inputs on very different scales)

In [13]:
import pandas as pd

# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if isinstance(data, list) else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg
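
To make the framing concrete, here is a quick illustrative check (not part of the original run) on a toy two-variable series:

In [ ]:
# illustrative check: frame a toy two-variable series with one lag step
import numpy as np
toy = np.arange(10, dtype='float32').reshape(5, 2)  # 5 time steps, 2 variables
print(series_to_supervised(toy, n_in=1, n_out=1))
# columns come out as var1(t-1), var2(t-1), var1(t), var2(t); the first row is dropped (NaN lag)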

For future implementations, use more than one hour of input time steps (a longer lag window), as sketched below.
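
A minimal sketch of that idea (hypothetical, not run in this notebook): frame the data with three lag hours and reshape accordingly, assuming the scaled array prepared in the next cell.

In [ ]:
# hypothetical sketch: use 3 lag hours of input instead of 1
n_hours, n_features = 3, 8
reframed_3h = series_to_supervised(scaled, n_hours, 1)
values_3h = reframed_3h.values
# inputs are all lag columns; the target is pollution (var1) at time t
X = values_3h[:, :n_hours * n_features]
y = values_3h[:, -n_features]
# reshape to [samples, timesteps, features] for the LSTM
X = X.reshape((X.shape[0], n_hours, n_features))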


In [23]:
# load dataset
dataset = read_csv('pollution.csv', header=0, index_col=0)
values = dataset.values

# integer encode direction
encoder = LabelEncoder()
values[:,4] = encoder.fit_transform(values[:,4])

# ensure all data is float
values = values.astype('float32')

# normalize features
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)

# frame as supervised learning (use 1 previous step and predict 1 future step)
reframed = series_to_supervised(scaled, 1, 1)

# we only predict pollution (var1) at time t, so drop the other variables' t-step columns
reframed.drop(reframed.columns[[9,10,11,12,13,14,15]], axis=1, inplace=True)
print(reframed.head())


   var1(t-1)  var2(t-1)  var3(t-1)  var4(t-1)  var5(t-1)  var6(t-1)  \
1   0.129779   0.352941   0.245902   0.527273   0.666667   0.002290   
2   0.148893   0.367647   0.245902   0.527273   0.666667   0.003811   
3   0.159960   0.426471   0.229508   0.545454   0.666667   0.005332   
4   0.182093   0.485294   0.229508   0.563637   0.666667   0.008391   
5   0.138833   0.485294   0.229508   0.563637   0.666667   0.009912   

   var7(t-1)  var8(t-1)   var1(t)  
1   0.000000        0.0  0.148893  
2   0.000000        0.0  0.159960  
3   0.000000        0.0  0.182093  
4   0.037037        0.0  0.138833  
5   0.074074        0.0  0.109658  

Define and fit model

First, we must split the prepared dataset into train and test sets. To speed up the training of the model for this demonstration, we will only fit the model on the first year of data, then evaluate it on the remaining 4 years of data.


In [24]:
# split into train and test sets
values = reframed.values
n_train_hours = 365 * 24
train = values[:n_train_hours, :]
test = values[n_train_hours:, :]

# split into input and outputs
train_X, train_y = train[:, :-1], train[:, -1]
test_X, test_y = test[:, :-1], test[:, -1]

# reshape input to be 3D [samples, timesteps, features]
# timesteps is 1 here; a longer lag window would fill this dimension in a future implementation
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)


(8760, 1, 8) (8760,) (35039, 1, 8) (35039,)

We will define the LSTM with 50 neurons in the first hidden layer and 1 neuron in the output layer for predicting pollution. The input shape will be 1 time step with 8 features.

The model will be fit for 50 training epochs with a batch size of 72 (three days of hourly data). Remember that Keras resets the LSTM's internal state at the end of each batch, so an internal state that persists across a number of days may be helpful (try testing this).
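
One way to test this is a stateful LSTM whose internal state carries across batches. The sketch below is hypothetical and was not run as part of this notebook; it assumes the train_X and train_y arrays from the split above and trims the training samples to a multiple of the batch size, which Keras requires for stateful training.

In [ ]:
# hypothetical sketch: stateful LSTM whose state persists across batches
batch = 72
n_train = (train_X.shape[0] // batch) * batch  # stateful fit needs a sample count divisible by the batch size
sX, sy = train_X[:n_train], train_y[:n_train]

stateful_model = Sequential()
stateful_model.add(LSTM(50, batch_input_shape=(batch, sX.shape[1], sX.shape[2]), stateful=True))
stateful_model.add(Dense(1))
stateful_model.compile(loss='mae', optimizer='adam')

# fit one epoch at a time, resetting the internal state manually between epochs
for epoch in range(10):
    stateful_model.fit(sX, sy, epochs=1, batch_size=batch, verbose=0, shuffle=False)
    stateful_model.reset_states()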


In [20]:
# design the LSTM network
n_hidden_layer = 50
n_output_layer = 1

model = Sequential()

model.add(LSTM(n_hidden_layer, input_shape=(train_X.shape[1], train_X.shape[2])))

model.add(Dense(n_output_layer))

model.compile(loss='mae', optimizer='adam')

# fit network
history = model.fit(train_X, train_y, epochs=50, batch_size=72, validation_data=(test_X, test_y), verbose=2, shuffle=False)

# plot history
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()


Train on 8760 samples, validate on 35039 samples
Epoch 1/50
2s - loss: 0.0588 - val_loss: 0.0649
Epoch 2/50
1s - loss: 0.0396 - val_loss: 0.0701
Epoch 3/50
1s - loss: 0.0248 - val_loss: 0.0559
Epoch 4/50
1s - loss: 0.0177 - val_loss: 0.0412
Epoch 5/50
1s - loss: 0.0153 - val_loss: 0.0249
Epoch 6/50
1s - loss: 0.0149 - val_loss: 0.0191
Epoch 7/50
1s - loss: 0.0148 - val_loss: 0.0169
Epoch 8/50
1s - loss: 0.0147 - val_loss: 0.0154
Epoch 9/50
1s - loss: 0.0147 - val_loss: 0.0150
Epoch 10/50
1s - loss: 0.0147 - val_loss: 0.0149
Epoch 11/50
1s - loss: 0.0146 - val_loss: 0.0145
Epoch 12/50
1s - loss: 0.0146 - val_loss: 0.0141
Epoch 13/50
1s - loss: 0.0145 - val_loss: 0.0141
Epoch 14/50
1s - loss: 0.0146 - val_loss: 0.0139
Epoch 15/50
1s - loss: 0.0145 - val_loss: 0.0141
Epoch 16/50
1s - loss: 0.0146 - val_loss: 0.0139
Epoch 17/50
1s - loss: 0.0145 - val_loss: 0.0137
Epoch 18/50
1s - loss: 0.0145 - val_loss: 0.0138
Epoch 19/50
1s - loss: 0.0145 - val_loss: 0.0137
Epoch 20/50
1s - loss: 0.0145 - val_loss: 0.0137
Epoch 21/50
1s - loss: 0.0144 - val_loss: 0.0136
Epoch 22/50
1s - loss: 0.0145 - val_loss: 0.0137
Epoch 23/50
1s - loss: 0.0144 - val_loss: 0.0137
Epoch 24/50
1s - loss: 0.0145 - val_loss: 0.0136
Epoch 25/50
1s - loss: 0.0144 - val_loss: 0.0136
Epoch 26/50
1s - loss: 0.0145 - val_loss: 0.0138
Epoch 27/50
1s - loss: 0.0144 - val_loss: 0.0137
Epoch 28/50
1s - loss: 0.0145 - val_loss: 0.0138
Epoch 29/50
1s - loss: 0.0145 - val_loss: 0.0136
Epoch 30/50
1s - loss: 0.0145 - val_loss: 0.0137
Epoch 31/50
1s - loss: 0.0145 - val_loss: 0.0136
Epoch 32/50
1s - loss: 0.0144 - val_loss: 0.0136
Epoch 33/50
2s - loss: 0.0144 - val_loss: 0.0136
Epoch 34/50
1s - loss: 0.0144 - val_loss: 0.0136
Epoch 35/50
1s - loss: 0.0144 - val_loss: 0.0135
Epoch 36/50
1s - loss: 0.0144 - val_loss: 0.0136
Epoch 37/50
1s - loss: 0.0144 - val_loss: 0.0136
Epoch 38/50
2s - loss: 0.0144 - val_loss: 0.0136
Epoch 39/50
1s - loss: 0.0144 - val_loss: 0.0136
Epoch 40/50
1s - loss: 0.0144 - val_loss: 0.0136
Epoch 41/50
1s - loss: 0.0145 - val_loss: 0.0135
Epoch 42/50
1s - loss: 0.0144 - val_loss: 0.0135
Epoch 43/50
1s - loss: 0.0144 - val_loss: 0.0136
Epoch 44/50
2s - loss: 0.0144 - val_loss: 0.0136
Epoch 45/50
1s - loss: 0.0144 - val_loss: 0.0136
Epoch 46/50
1s - loss: 0.0144 - val_loss: 0.0136
Epoch 47/50
1s - loss: 0.0144 - val_loss: 0.0136
Epoch 48/50
1s - loss: 0.0144 - val_loss: 0.0135
Epoch 49/50
1s - loss: 0.0144 - val_loss: 0.0137
Epoch 50/50
1s - loss: 0.0144 - val_loss: 0.0138

Evaluate Model

We concatenate the forecast with the remaining test-set features and invert the scaling, which puts the predictions back into the original pollution units. We do the same with the actual test-set pollution values, so the error is calculated on the original scale.


In [25]:
# make a prediction
yhat = model.predict(test_X)
test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))

# invert scaling for forecast: the scaler expects all 8 columns, so pad the 1-column prediction with the remaining 7 test features
inv_yhat = concatenate((yhat, test_X[:, 1:]), axis=1)
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:,0]

# invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = concatenate((test_y, test_X[:, 1:]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:,0]

# calculate RMSE
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)


Test RMSE: 26.647

In [29]:
pyplot.plot(inv_yhat[:100], label='pred')
pyplot.plot(inv_y[:100], label='actual')
pyplot.legend()
pyplot.show()


