In [31]:
from math import sqrt
from numpy import concatenate
from matplotlib import pyplot
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense, LSTM
from datetime import datetime
# download the dataset here: https://archive.ics.uci.edu/ml/datasets/Beijing+PM2.5+Data
In [2]:
raw_data = pd.read_csv("beijing_pm25.csv")
raw_data.head()
Out[2]:
In [3]:
print(raw_data.isnull().sum())
print(raw_data.shape)
print(float(raw_data['pm2.5'].isnull().sum()) / raw_data.shape[0])  # fraction of missing pm2.5 values
In [4]:
print(min(raw_data['pm2.5'].dropna()), max(raw_data['pm2.5'].dropna()), raw_data['pm2.5'].dropna().median())
In [5]:
# Data Preprocessing
def str2time(time_str):
    return datetime.strptime(time_str, '%Y %m %d %H')
## combine year, month, day, hour into a single datetime index
raw_data = pd.read_csv("beijing_pm25.csv", parse_dates=[['year', 'month', 'day', 'hour']], index_col=0, date_parser=str2time)
raw_data.drop('No', axis=1, inplace=True)  # drop the row-number column 'No' that comes with the CSV
## rename columns
raw_data.columns = ['pollution', 'dew', 'temp', 'press', 'wnd_dir', 'wnd_spd', 'snow', 'rain']
raw_data.index.name = 'date'
## impute missing pollution values with the column median
pollution_median = raw_data['pollution'].dropna().median()
raw_data['pollution'].fillna(pollution_median, inplace=True)
raw_data.head()
Out[5]:
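As a quick sanity check (not part of the original notebook output), the datetime parsing and the median imputation can be verified directly:

# sanity check: the index should be a DatetimeIndex and no pollution values should remain missing
print(type(raw_data.index))                  # expected: pandas DatetimeIndex
print(raw_data['pollution'].isnull().sum())  # expected: 0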
In [24]:
# label encode wnd_dir (wind direction) to integers
encoder = LabelEncoder()
raw_data['wnd_dir'] = encoder.fit_transform(raw_data['wnd_dir'])
print(raw_data['wnd_dir'].unique())
raw_data.head()
Out[24]:
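LabelEncoder maps the four wind directions to arbitrary integers, which implicitly imposes an ordering that the categories do not really have. A one-hot encoding is a common alternative; the sketch below (not used in the rest of this notebook) shows how it would look if applied in place of the cell above:

# hypothetical alternative: one-hot encode wind direction instead of label encoding it
# (here applied to whatever is currently in 'wnd_dir'; ideally run before the LabelEncoder step)
wind_dummies = pd.get_dummies(raw_data['wnd_dir'], prefix='wnd_dir')
raw_data_onehot = pd.concat([raw_data.drop('wnd_dir', axis=1), wind_dummies], axis=1)
print(raw_data_onehot.columns)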
In [29]:
pyplot.figure()
col_nums = raw_data.shape[1]
for i in range(col_nums):
    pyplot.subplot(col_nums, 1, i+1)
    pyplot.plot(raw_data.values[:, i])
    pyplot.title(raw_data.columns[i], y=1, loc='right')  # place each title at the top-right of its subplot
pyplot.show()
In [35]:
# make sure all the data is float type
values = raw_data.values
values = values.astype('float32')
values
Out[35]:
In [37]:
# normalize the data into [0,1] range
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)
scaled
Out[37]:
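The scaler is fit on all 8 columns at once, which matters later because scaler.inverse_transform expects arrays with the same 8-column layout. A quick round-trip check (not in the original notebook) confirms the scaling is invertible:

# inverting the scaling should recover the original values (up to float32 rounding)
restored = scaler.inverse_transform(scaled)
print(abs(restored - values).max())  # expected to be very close to 0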
In [41]:
# Convert the series into a supervised learning format
## Here, var_i(t) is one time step ahead of var_i(t-1)
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]  # number of columns in the original data
    df = pd.DataFrame(data)
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg
reframed = series_to_supervised(scaled, 1, 1)
reframed
Out[41]:
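To make the column layout of series_to_supervised concrete, here is a tiny illustrative example (not part of the original notebook) with two variables and three time steps:

import numpy as np
toy = np.array([[1, 10],
                [2, 20],
                [3, 30]])
print(series_to_supervised(toy, 1, 1))
# columns: var1(t-1), var2(t-1), var1(t), var2(t)
# row for t=1:  1, 10, 2, 20
# row for t=2:  2, 20, 3, 30
# (the first row is dropped because shifting leaves NaNs at t=0)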
In [42]:
# We only want to predict pollution at time t from the time t-1 columns,
## so drop the other time-t columns
reframed.drop(reframed.columns[[9,10,11,12,13,14,15]], axis=1, inplace=True)
reframed.head()
Out[42]:
In [47]:
# split into train and test sets
values = reframed.values
n_train_hours = 365*24*3  # the data is recorded hourly; use the first 3 years as the training set
train = values[:n_train_hours, :]
test = values[n_train_hours:, :]
# split into input and outputs
train_X, train_y = train[:, :-1], train[:, -1]
test_X, test_y = test[:, :-1], test[:, -1]
# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)
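Each sample is now a length-1 sequence of the 8 lagged features, with the scaled pollution at time t as the target. Inspecting one sample (a quick check, not in the original notebook) makes the layout clear:

# one training sample: shape (1, 8) -> 1 timestep, 8 features (the t-1 values of all columns)
print(train_X[0])
print(train_y[0])  # the scaled pollution value at time t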
In [48]:
# LSTM
model = Sequential()
model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(1))
model.compile(loss='mae', optimizer='adam') # using Mean Absolute Error here
# fit network
history = model.fit(train_X, train_y, epochs=50, batch_size=72,
                    validation_data=(test_X, test_y), verbose=2, shuffle=False)
# plot history
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()
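The loss curves above are the only stopping criterion used here. A common refinement is to stop training once the validation loss stops improving; a minimal sketch using the Keras EarlyStopping callback (not used for the results below, and note that calling fit again continues training the already-fitted model):

from keras.callbacks import EarlyStopping

# hypothetical variant: stop once val_loss has not improved for 5 consecutive epochs
early_stop = EarlyStopping(monitor='val_loss', patience=5)
history_es = model.fit(train_X, train_y, epochs=50, batch_size=72,
                       validation_data=(test_X, test_y), verbose=2, shuffle=False,
                       callbacks=[early_stop])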
In [49]:
# make a prediction
yhat = model.predict(test_X)
test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))
# invert scaling for forecast
inv_yhat = concatenate((yhat, test_X[:, 1:]), axis=1)
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:,0]
# invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = concatenate((test_y, test_X[:, 1:]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:,0]
# calculate RMSE
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)
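Beyond the single RMSE number, a visual comparison of forecasts and actuals is often informative. A minimal sketch (not in the original notebook) plotting the first week, 168 hours, of the test period:

# plot actual vs predicted pollution for the first week of the test set
n_hours = 168
pyplot.plot(inv_y[:n_hours], label='actual')
pyplot.plot(inv_yhat[:n_hours], label='predicted')
pyplot.legend()
pyplot.show()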