In [ ]:
%%capture
%pip install --upgrade pip
%pip install --upgrade seaborn
%pip install --upgrade numpy
%pip install --upgrade pandas
%pip install --upgrade "tensorflow<2"
%pip install --upgrade scikit-learn
%pip install --upgrade google-cloud-bigquery
In [ ]:
from math import sqrt
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.plotting import register_matplotlib_converters
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import LSTM, Dense

register_matplotlib_converters()
In [ ]:
from google.cloud import bigquery
sql = """
SELECT COUNT(*) AS count, TIMESTAMP_TRUNC(date, DAY) AS day
FROM `bigquery-public-data.chicago_crime.crime`
GROUP BY day
ORDER BY day
"""
client = bigquery.Client()
df = client.query(sql).result().to_dataframe()
df = df.set_index('day')[['count']]
df.head()
In [ ]:
plt.figure(figsize=(20, 6))
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    sns.lineplot(data=df).set_title('Daily Crime Reports')
plt.show()
In [ ]:
# Split dataset into sequences of previous values and current values
# For example, given a dataset: [1, 2, 3, 4, 5] and a window size of 2:
# data_X = [[1, 2], [2, 3], [3, 4]]
# data_y = [3, 4, 5]
def create_dataset(dataset, window_size=1):
    df = pd.DataFrame(dataset)
    # Each row of data_X holds the previous `window_size` observations...
    columns = [df.shift(i) for i in reversed(range(1, window_size + 1))]
    data_X = pd.concat(columns, axis=1).dropna().values
    # ...and the matching row of data_y holds the observation that follows them
    data_y = df.shift(-window_size).dropna().values
    return data_X, data_y
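In [ ]:
# A quick check of create_dataset against the worked example in the comment
# above (an illustrative sketch; the toy values are not part of the pipeline).
demo_X, demo_y = create_dataset([1, 2, 3, 4, 5], window_size=2)
print(demo_X)  # [[1. 2.], [2. 3.], [3. 4.]]
print(demo_y)  # [[3.], [4.], [5.]]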
In [ ]:
# The fraction of the data to use for training
TRAINING_SPLIT = 0.8
# The number of past observations used to predict the next observation
WINDOW_SIZE = 7

def preprocess_data(df, window_size):
    # Normalize inputs to improve the learning process
    scaler = RobustScaler()
    # Time series: keep the split chronological, holding out the latest data as the test set
    train = df.values[:int(TRAINING_SPLIT * len(df)), :]
    # Fit the scaler on the training split only, to avoid leaking test statistics
    train = scaler.fit_transform(train)
    test = df.values[int(TRAINING_SPLIT * len(df)):, :]
    test = scaler.transform(test)
    # Create the training and test sets of windows
    train_X, train_y = create_dataset(train, window_size)
    test_X, test_y = create_dataset(test, window_size)
    # Reshape inputs to the (samples, timesteps, features) layout the LSTM expects
    train_X = np.reshape(train_X, (train_X.shape[0], 1, train_X.shape[1]))
    test_X = np.reshape(test_X, (test_X.shape[0], 1, test_X.shape[1]))
    return train_X, train_y, test_X, test_y, scaler

train_X, train_y, test_X, test_y, scaler = preprocess_data(df, WINDOW_SIZE)
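In [ ]:
# Sanity-check the shapes (a sketch; exact row counts depend on how many days
# the query above returns).
print(train_X.shape, train_y.shape)  # (samples, 1, WINDOW_SIZE), (samples, 1)
print(test_X.shape, test_y.shape)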
In [ ]:
def input_fn(features, labels, shuffle, num_epochs, batch_size):
    """Generates an input function to be used for model training.

    Args:
      features: numpy array of features used for training or inference
      labels: numpy array of labels for each example
      shuffle: boolean for whether to shuffle the data or not (set True for
        training, False for evaluation)
      num_epochs: number of epochs to provide the data for
      batch_size: batch size for training

    Returns:
      A tf.data.Dataset that can provide data to the Keras model for training
      or evaluation
    """
    if labels is None:
        inputs = features
    else:
        inputs = (features, labels)
    dataset = tf.data.Dataset.from_tensor_slices(inputs)
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(features))
    # We call repeat after shuffling, rather than before, to prevent separate
    # epochs from blending together.
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    return dataset
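In [ ]:
# A quick look at what input_fn produces (an illustrative sketch using dummy
# arrays; printing a tf.data.Dataset shows its element shapes and dtypes
# without running any training).
demo_ds = input_fn(
    features=np.zeros((10, 1, WINDOW_SIZE), dtype=np.float32),
    labels=np.zeros((10, 1), dtype=np.float32),
    shuffle=False,
    num_epochs=1,
    batch_size=5)
print(demo_ds)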
In [ ]:
def create_keras_model(input_dim, learning_rate, window_size):
    """Creates a Keras LSTM model for regression.

    Args:
      input_dim: number of timesteps per example (1 after the reshape above)
      learning_rate: learning rate for training
      window_size: number of past observations in each input window

    Returns:
      The compiled Keras model (still needs to be trained)
    """
    model = keras.Sequential([
        LSTM(4, dropout=0.2, input_shape=(input_dim, window_size)),
        Dense(1)
    ])
    model.compile(loss='mean_squared_error',
                  optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate))
    return model
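In [ ]:
# Optional: inspect the architecture before training (a sketch; the 0.01
# learning rate here is only a placeholder, and the parameter counts are
# reported by Keras itself).
create_keras_model(input_dim=1, learning_rate=0.01, window_size=WINDOW_SIZE).summary()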
In [ ]:
def train_and_evaluate(batch_size, learning_rate, num_epochs, window_size):
    # Dimensions
    num_train_examples, input_dim, _ = train_X.shape
    num_eval_examples = test_X.shape[0]

    # Create the Keras model
    keras_model = create_keras_model(
        input_dim=input_dim, learning_rate=learning_rate, window_size=window_size)

    # Wrap the numpy arrays in tf.data Datasets; shuffling stays off to
    # preserve the temporal ordering of the windows
    training_dataset = input_fn(
        features=train_X,
        labels=train_y,
        shuffle=False,
        num_epochs=num_epochs,
        batch_size=batch_size)
    validation_dataset = input_fn(
        features=test_X,
        labels=test_y,
        shuffle=False,
        num_epochs=num_epochs,
        batch_size=num_eval_examples)

    # Train the model
    keras_model.fit(
        training_dataset,
        steps_per_epoch=num_train_examples // batch_size,
        epochs=num_epochs,
        validation_data=validation_dataset,
        validation_steps=1,
        verbose=1,
        shuffle=False)
    return keras_model
In [ ]:
BATCH_SIZE = 256
LEARNING_RATE = 0.01
NUM_EPOCHS = 25
model = train_and_evaluate(BATCH_SIZE, LEARNING_RATE, NUM_EPOCHS, WINDOW_SIZE)
In [ ]:
def predict(model, X, y, scaler):
    # Undo the RobustScaler transform so the error is in units of daily reports
    y_true = scaler.inverse_transform(y)
    y_pred = scaler.inverse_transform(model.predict(X))
    rmse = sqrt(mean_squared_error(y_true, y_pred))
    return y_pred, rmse

train_predict, _ = predict(model, train_X, train_y, scaler)
test_predict, rmse = predict(model, test_X, test_y, scaler)
# Scaled-MSE loss on the training set, for comparison
model.evaluate(train_X, train_y)
print(rmse)
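In [ ]:
# A baseline for context (an illustrative sketch, not part of the original
# pipeline): the RMSE of naively predicting "same as the previous day" on the
# test targets. The LSTM's RMSE above should come in below this to be useful.
y_true = scaler.inverse_transform(test_y)
naive_rmse = sqrt(mean_squared_error(y_true[1:], y_true[:-1]))
print(naive_rmse)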
In [ ]:
# Create a new dataframe with the same index and columns to hold the prediction array
df_test_predict = pd.DataFrame().reindex_like(df)
# Assign the test predictions to the end of the dataframe; the offset skips the
# training predictions plus the window consumed at the start of each split
df_test_predict.iloc[len(train_predict) + (WINDOW_SIZE * 2):, 0] = np.squeeze(test_predict)
# Join the test predictions onto the existing dataframe, renaming the column to
# avoid a collision (converting the index to str here would break the join)
df_combined = df.join(df_test_predict.rename(columns={'count': 'predicted'}))
# Plot the predicted vs actual counts
plt.figure(figsize=(20, 6))
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    sns.lineplot(data=df_combined).set_title('Daily Crime Reports: Actual vs. Predicted')
plt.show()