In [1]:
# Stdlib imports.
import time
from datetime import datetime as dt, timedelta as td

# Third-party imports.
import numpy as np
import pandas as pd
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.utils import shuffle
from sqlalchemy import and_, func, or_, extract

# Project-local ORM module: the data-generation cells below use m.s (DB
# session) and m.Trip. This import was commented out, which makes those
# cells fail with a NameError on a fresh kernel, so it is restored here.
import models as m

# Epoch budgets to sweep in the training loop; each trained model is
# appended to `models` (the list — distinct from the `models` module,
# which is bound as `m`).
epochs = [5, 30, 50, 100, 200, 500]
models = []
In [3]:
# Build the training series: trips touching the target stations, counted in
# 15-minute buckets over 2015-2016 (weekdays, service hours only).
start = dt(2015, 1, 1)
end = dt(2017, 1, 1)
minutes_delta = 15
target_stations = (272, 271, 267, 266)
with open('trips_train_cluster.csv', 'w') as out:
    while start < end:
        # Weekends are excluded entirely (isoweekday: Mon=1 .. Sun=7).
        if start.isoweekday() not in range(1, 6):
            start += td(days=1)
            continue
        # Skip the overnight gap (after 00:30 and before 05:00).
        if (start.hour == 0 and start.minute > 30) or start.hour < 5:
            start += td(minutes=minutes_delta)
            continue
        bucket_end = start + td(minutes=minutes_delta)
        # Count trips departing within this bucket that start or end at one
        # of the target stations.
        count = m.s.query(m.Trip).filter(and_(
            or_(m.Trip.departure_station.in_(target_stations),
                m.Trip.arrival_station.in_(target_stations)),
            m.Trip.departure_time >= start.isoformat(),
            m.Trip.departure_time <= bucket_end.isoformat(),
        )).count()
        out.write("%s\n" % count)
        start += td(minutes=minutes_delta)
In [4]:
# Build the held-out test series for 2017 with the same bucketing rules as
# the training series (weekdays, service hours, 15-minute buckets).
start = dt(2017, 1, 1)
end = dt(2018, 1, 1)
minutes_delta = 15
target_stations = (272, 271, 267, 266)
with open('trips_test_cluster.csv', 'w') as out:
    while start < end:
        # Weekends are excluded entirely (isoweekday: Mon=1 .. Sun=7).
        if start.isoweekday() not in range(1, 6):
            start += td(days=1)
            continue
        # Skip the overnight gap (after 00:30 and before 05:00).
        if (start.hour == 0 and start.minute > 30) or start.hour < 5:
            start += td(minutes=minutes_delta)
            continue
        bucket_end = start + td(minutes=minutes_delta)
        # Count trips departing within this bucket that start or end at one
        # of the target stations.
        count = m.s.query(m.Trip).filter(and_(
            or_(m.Trip.departure_station.in_(target_stations),
                m.Trip.arrival_station.in_(target_stations)),
            m.Trip.departure_time >= start.isoformat(),
            m.Trip.departure_time <= bucket_end.isoformat(),
        )).count()
        out.write("%s\n" % count)
        start += td(minutes=minutes_delta)
In [5]:
def moving_test_window_preds(model, n_future_preds, first_window=None, value_scaler=None):
    '''Roll a trained one-step model forward to make multi-step forecasts.

    n_future_preds: number of future predictions to make; this coincides
        with the number of windows we move forward on the test data.
    first_window: optional seed window of shape (window_size, 1) (nested
        lists or array). Defaults to the first test window, read from the
        global `test_X` — made a parameter so the function no longer depends
        only on hidden notebook state.
    value_scaler: scaler used to invert the (scaled) predictions back to
        original units; defaults to the global `scaler`.

    Returns an (n_future_preds, 1) array in original units.
    '''
    if first_window is None:
        first_window = test_X[0, :].tolist()
    if value_scaler is None:
        value_scaler = scaler
    preds_moving = []  # stores the prediction made on each test window
    moving_test_window = np.array([first_window])  # shape (1, window_size, 1)
    for _ in range(n_future_preds):
        # The model outputs a scaled value, so no rescaling is needed here.
        preds_one_step = model.predict(moving_test_window)
        preds_moving.append(preds_one_step[0, 0])
        # Reshape to 3D so it can be appended to the moving window, then
        # slide the window: drop the oldest step, append the new prediction.
        next_step = preds_one_step.reshape(1, 1, 1)
        moving_test_window = np.concatenate(
            (moving_test_window[:, 1:, :], next_step), axis=1)
    return value_scaler.inverse_transform(np.array(preds_moving).reshape(-1, 1))
In [ ]:
# Sweep over epoch budgets: train one LSTM per budget, checkpoint it, and
# record its rolling-forecast error on the test period.
for epoch in epochs:
    # --- Load data (fresh copies each iteration, since windowing mutates them) ---
    train = pd.read_csv('trips_train_cluster.csv', header=None)
    test = pd.read_csv('trips_test_cluster.csv', header=None)
    scaler = MinMaxScaler(feature_range=(-1, 1))
    window_size = 78  # 78 steps in one day
    # Normalize features: fit the scaler on the TRAINING data only.
    scaled = scaler.fit_transform(train.values)
    train = pd.DataFrame(scaled)
    series_s = train.copy()
    for i in range(window_size):
        train = pd.concat([train, series_s.shift(-(i + 1))], axis=1)
    train.dropna(axis=0, inplace=True)
    # Same windowing for the test data.
    test = test.iloc[:24624, :]  # The rest are all 0s
    # BUG FIX: use transform(), not fit_transform(). Refitting the scaler on
    # the test set leaked test statistics and put the test data (and the
    # later inverse_transform) on a different scale than the model's
    # training scale.
    scaled = scaler.transform(test.values)
    test = pd.DataFrame(scaled)
    series_s = test.copy()
    for i in range(window_size):
        test = pd.concat([test, series_s.shift(-(i + 1))], axis=1)
    test.dropna(axis=0, inplace=True)
    # Shuffle only the training windows; test order stays chronological.
    train = shuffle(train)
    # Last column of each windowed row is the one-step-ahead target.
    train_X = train.iloc[:, :-1].values
    train_y = train.iloc[:, -1].values
    test_X = test.iloc[:, :-1].values
    test_y = test.iloc[:, -1].values
    # LSTM expects (samples, timesteps, features).
    train_X = train_X.reshape(train_X.shape[0], train_X.shape[1], 1)
    test_X = test_X.reshape(test_X.shape[0], test_X.shape[1], 1)
    # --- Define the LSTM model ---
    model = Sequential()
    # Positional units instead of the deprecated `output_dim` keyword.
    model.add(LSTM(window_size, input_shape=(window_size, 1), return_sequences=True))
    model.add(Dropout(0.5))
    model.add(LSTM(256))
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation("linear"))
    model.compile(loss="mse", optimizer="adam")
    model.summary()
    # --- Train ---
    start = time.time()
    model.fit(train_X, train_y, batch_size=100, epochs=epoch, validation_split=0.1)
    print("> Compilation Time : ", time.time() - start)
    models.append(model)
    # Checkpoint the trained model.
    model.save("%s_epochs_model_cluster.h5" % epoch)
    # Score a 500-step rolling forecast; the [74:150] slice is ~76 steps,
    # roughly one service day — TODO confirm the intended window.
    preds_moving = moving_test_window_preds(model, 500)
    actuals = scaler.inverse_transform(test_y.reshape(-1, 1))
    mse = mean_squared_error(actuals[74:150], preds_moving[74:150])
    mae = mean_absolute_error(actuals[74:150], preds_moving[74:150])
    # Persist the forecast with the scores embedded in the filename.
    with open('f_cluster_%s_%s_%s.txt' % (epoch, mse, mae), 'w') as f:
        for i in preds_moving:
            f.write("%s\n" % i)
In [2]:
# Reload a previously saved checkpoint instead of retraining.
# NOTE(review): execution counts are out of order around here; this cell
# assumes '200_epochs_model.h5' already exists on disk from a prior run.
from keras.models import load_model
model = load_model('200_epochs_model.h5')
In [7]:
from matplotlib import pyplot

# Overlay the ground truth and the rolling forecast over the first 600
# test steps for the 200-epoch model.
fig = pyplot.figure(figsize=(20, 6))
ax = fig.add_subplot(111)
ax.plot(actuals[:600])
ax.plot(preds_moving[:600])
ax.set_title("200 epochs")
pyplot.show()
In [3]:
from keras.models import load_model
from matplotlib import pyplot
In [32]:
# Evaluate a saved per-cluster checkpoint on the held-out 2017 period.
# NOTE(review): `epochs` here rebinds the epoch-budget list from the first
# cell to a scalar; re-running the training cell after this one would break.
cluster = 8
epochs = 200
model = load_model('%s_epochs_model_cluster%s.h5' % (epochs, cluster))

# Load the per-cluster series.
train = pd.read_csv('cluster_data/trips_train_cluster%s.csv' % cluster, header=None)
test = pd.read_csv('cluster_data/trips_test_cluster%s.csv' % cluster, header=None)

scaler = MinMaxScaler(feature_range=(-1, 1))
window_size = 78  # 78 steps in one day


def _window_series(frame, size):
    """Append `size` forward-shifted copies of `frame` as extra columns
    (sliding windows) and drop the incomplete rows at the tail."""
    source = frame.copy()
    for step in range(size):
        frame = pd.concat([frame, source.shift(-(step + 1))], axis=1)
    return frame.dropna(axis=0)


# Normalize features; the scaler is fit on the training data only.
train = _window_series(pd.DataFrame(scaler.fit_transform(train.values)), window_size)

# Same windowing for the test data.
test = test.iloc[:24624, :]  # The rest are all 0s
# BUG FIX: transform(), not fit_transform() — keep the test data on the
# training scale and avoid leaking test statistics into the scaler.
test = _window_series(pd.DataFrame(scaler.transform(test.values)), window_size)

train = shuffle(train)
# Last column of each windowed row is the one-step-ahead target.
train_X = train.iloc[:, :-1].values
train_y = train.iloc[:, -1].values
test_X = test.iloc[:, :-1].values
test_y = test.iloc[:, -1].values
# LSTM expects (samples, timesteps, features).
train_X = train_X.reshape(train_X.shape[0], train_X.shape[1], 1)
test_X = test_X.reshape(test_X.shape[0], test_X.shape[1], 1)

# Rolling 500-step forecast vs. ground truth.
preds_moving = moving_test_window_preds(model, 500)
actuals = scaler.inverse_transform(test_y.reshape(-1, 1))

pyplot.figure(figsize=(20, 6))
pyplot.plot(actuals[:300])
pyplot.plot(preds_moving[:300])
pyplot.title("%s epochs" % epochs)
pyplot.show()
In [11]:
def predict_next(info, n, predictor=None, value_scaler=None):
    """Iteratively predict `n` future values from a 78-step seed window.

    info: array-like of shape (>=78, 1); the first 78 rows seed the model's
        input window. Assumes n <= 78 so `info[i:78]` stays non-empty —
        TODO confirm callers respect this.
    n: number of one-step-ahead predictions to roll out.
    predictor: object with a Keras-style .predict; defaults to the global
        `model` (parameterized so the function no longer depends only on
        hidden notebook state).
    value_scaler: object with .fit_transform used to rescale the remaining
        real values each step; defaults to the global `scaler`.

    Returns a list of `n` numpy arrays of shape (1,), in the model's scaled
    space (callers inverse_transform them).
    """
    if predictor is None:
        predictor = model
    if value_scaler is None:
        value_scaler = scaler
    res = []
    for i in range(n):
        # Slide the window: rescale the last (78 - i) real values and append
        # the i predictions made so far (already in scaled space). numpy
        # coerces the list `res` of (1,) arrays into an (i, 1) block here.
        if res:
            base = np.concatenate((value_scaler.fit_transform(info[i:78]), res))
        else:
            base = value_scaler.fit_transform(info[:78])
        pred = predictor.predict(np.array([base]))  # batch of one window
        res.append(pred[0])
    return res
In [53]:
# Seed the roll-out with a 78-step slice of ground truth and compare the
# 20 predicted steps against what actually followed the seed window.
info = actuals[2050:2150]
future_scaled = predict_next(info, 20)
pyplot.figure(figsize=(20, 6))
pyplot.plot(info[78:])
pyplot.plot(scaler.inverse_transform(future_scaled))
pyplot.show()
In [33]:
# Load the same cluster's checkpoints trained with increasing epoch budgets.
m1, m2, m3, m4 = (
    load_model('%s_epochs_model_cluster%s.h5' % (n_epochs, cluster))
    for n_epochs in (30, 50, 100, 200)
)
In [50]:
# Overlay the forecasts from each epoch budget on the ground truth.
fig = pyplot.figure(figsize=(20, 6))
ax = fig.add_subplot(111)
ax.plot(actuals[:300])
ax.plot(a, label="30 epochs", color="r")
ax.plot(b, label="50 epochs", color=(1, 0.3, 0.6))
ax.plot(c, label="100 epochs", color="g")
ax.plot(d, label="200 epochs", color="y")
ax.set_title("Cluster 8 con diferentes epochs")
ax.legend()
pyplot.show()
In [45]:
# 500-step rolling forecast per checkpoint, truncated to the first 300
# points for side-by-side comparison.
a, b, c, d = [
    moving_test_window_preds(checkpoint, 500)[:300]
    for checkpoint in (m1, m2, m3, m4)
]
In [51]:
# Error of each epoch budget over the first 300 test steps (lower is better).
truth = actuals[:300]
print("30 epochs.\tMSE: %s\tMAE: %s" % (mean_squared_error(truth, a), mean_absolute_error(truth, a)))
print("50 epochs.\tMSE: %s\tMAE: %s" % (mean_squared_error(truth, b), mean_absolute_error(truth, b)))
print("100 epochs.\tMSE: %s\tMAE: %s" % (mean_squared_error(truth, c), mean_absolute_error(truth, c)))
print("200 epochs.\tMSE: %s\tMAE: %s" % (mean_squared_error(truth, d), mean_absolute_error(truth, d)))
In [ ]: