Chicago Crime Prediction

The model forecasts how many crimes are expected to be reported on the next day, based on the counts reported on each of the previous n days.

Imports


In [ ]:
%%capture

# %%capture silences the verbose pip output; %pip (rather than !pip) installs
# into the kernel's own environment.
%pip install --upgrade pip
%pip install --upgrade seaborn
%pip install --upgrade numpy
%pip install --upgrade pandas
# Pin TensorFlow 1.x: the model code below uses tf.train.AdamOptimizer,
# which was removed in TensorFlow 2.
%pip install --upgrade "tensorflow<2"
%pip install --upgrade scikit-learn
%pip install --upgrade google-cloud-bigquery

In [ ]:
from math import sqrt

import numpy as np

import pandas as pd
from pandas.plotting import register_matplotlib_converters

import matplotlib.pyplot as plt
register_matplotlib_converters()

import seaborn as sns

from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import LSTM, Dense, Dropout

import warnings

Load data


In [ ]:
from google.cloud import bigquery

# Daily counts of reported crimes from the public Chicago crime dataset,
# truncated to day granularity and ordered chronologically.
sql = """
SELECT count(*) as count, TIMESTAMP_TRUNC(date, DAY) as day
FROM `bigquery-public-data.chicago_crime.crime`
GROUP BY day
ORDER BY day
"""

# NOTE(review): requires Google Cloud credentials with BigQuery access to be
# configured in the environment (e.g. application default credentials).
client = bigquery.Client()
df = client.query(sql).result().to_dataframe()

# Index by day and keep only the count column: a univariate daily time series.
df.index = df.day
df = df[['count']]
df.head()

Visualize data


In [ ]:
plt.figure(figsize=(20, 6))
with warnings.catch_warnings():
    # Suppress library warnings emitted while rendering the line plot.
    warnings.simplefilter("ignore")
    sns.lineplot(data=df).set_title('Daily Crime Reports')
plt.show()

Preprocess data


In [ ]:
# Split dataset into sequences of previous values and current values
# For example, given a dataset: [1, 2, 3, 4, 5] and a window size of 2:
# data_X = [[1, 2], [2, 3], [3, 4]]
# data_y = [3, 4, 5]
def create_dataset(dataset, window_size = 1):
    """Split a univariate series into lagged input windows and next-step targets.

    For example, given a dataset [1, 2, 3, 4, 5] and a window size of 2:
      data_X = [[1, 2], [2, 3], [3, 4]]
      data_y = [[3], [4], [5]]

    Args:
      dataset: array-like of shape (n,) or (n, 1) holding the series values.
      window_size: number of consecutive past observations per input row.

    Returns:
      Tuple (data_X, data_y) of numpy arrays, each with n - window_size rows;
      data_X has window_size columns, data_y a single column.
    """
    df = pd.DataFrame(dataset)
    # Lag columns ordered oldest-to-newest: shift(window_size), ..., shift(1).
    columns = [df.shift(i) for i in reversed(range(1, window_size + 1))]
    # dropna() discards the first window_size rows, which lack full history.
    data_X = pd.concat(columns, axis=1).dropna().values
    # The target for each window is the value immediately following it.
    data_y = df.shift(-window_size).dropna().values
    return data_X, data_y

In [ ]:
# The fraction of observations used for training (the remainder is the test set)
TRAINING_SPLIT = 0.8
# The number of past observations used to predict the next observation
WINDOW_SIZE = 7

def preprocess_data(df, window_size, training_split=TRAINING_SPLIT):
    """Scale the series, split it chronologically, and build windowed arrays.

    Args:
      df: single-column DataFrame of daily counts, indexed by day.
      window_size: number of past observations per model input.
      training_split: fraction of the earliest rows used for training.

    Returns:
      Tuple (train_X, train_y, test_X, test_y, scaler): the X arrays are shaped
      (samples, 1, window_size) for the LSTM, and scaler is the fitted
      RobustScaler needed to map predictions back to raw counts.
    """
    # RobustScaler (median/IQR based) limits the influence of outlier days.
    scaler = RobustScaler()

    # Time series: keep chronological order and hold out the latest data.
    # Fit the scaler on the training split only, so no test statistics leak in.
    split_at = int(training_split * len(df))
    train = scaler.fit_transform(df.values[:split_at, :])
    test = scaler.transform(df.values[split_at:, :])

    # Build (window, next-value) pairs for each split.
    train_X, train_y = create_dataset(train, window_size)
    test_X, test_y = create_dataset(test, window_size)

    # Reshape to (samples, timesteps=1, features=window_size) for the LSTM.
    train_X = np.reshape(train_X, (train_X.shape[0], 1, train_X.shape[1]))
    test_X = np.reshape(test_X, (test_X.shape[0], 1, test_X.shape[1]))

    return train_X, train_y, test_X, test_y, scaler

train_X, train_y, test_X, test_y, scaler = preprocess_data(df, WINDOW_SIZE)

Train model


In [ ]:
def input_fn(features, labels, shuffle, num_epochs, batch_size):
    """Builds a tf.data.Dataset input pipeline for training or evaluation.

    Args:
      features: numpy array of features used for training or inference
      labels: numpy array of labels for each example, or None for inference
      shuffle: boolean for whether to shuffle the data or not (set True for
        training, False for evaluation)
      num_epochs: number of passes over the data to provide
      batch_size: number of examples per batch

    Returns:
      A tf.data.Dataset that can provide batches to the Keras model for
        training or evaluation
    """
    inputs = features if labels is None else (features, labels)
    dataset = tf.data.Dataset.from_tensor_slices(inputs)

    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(features))

    # Repeat after shuffling, not before, so that examples from different
    # epochs do not blend together inside the shuffle buffer.
    return dataset.repeat(num_epochs).batch(batch_size)

In [ ]:
def create_keras_model(input_dim, learning_rate, window_size):
    """Creates and compiles a Keras LSTM model for regression.

    Args:
      input_dim: Number of timesteps in each input sample
      learning_rate: Learning rate for the Adam optimizer
      window_size: Number of features (past observations) per timestep

    Returns:
      The compiled Keras model (still needs to be trained)
    """
    model = keras.Sequential([
        # Small LSTM with dropout to limit overfitting on this small dataset.
        LSTM(4, dropout=0.2, input_shape=(input_dim, window_size)),
        # Single linear output: the predicted (scaled) count for the next day.
        Dense(1)
    ])

    # TF 1.x optimizer, matching the "tensorflow<2" pin in the install cell.
    model.compile(loss='mean_squared_error', optimizer=tf.train.AdamOptimizer(
        learning_rate=learning_rate))

    return model

In [ ]:
def train_and_evaluate(batch_size, learning_rate, num_epochs, window_size):
    """Builds the LSTM model, trains it on the global train_X/train_y arrays,
    and validates each epoch on test_X/test_y.

    Returns:
      The trained Keras model.
    """
    # Shapes of the (globally prepared) training and test arrays.
    n_train, input_dim, _ = train_X.shape
    n_eval = test_X.shape[0]

    keras_model = create_keras_model(
        input_dim=input_dim, learning_rate=learning_rate, window_size=window_size)

    # Training pipeline: shuffling is disabled to preserve time-series order.
    training_dataset = input_fn(
        features=train_X,
        labels=train_y,
        shuffle=False,
        num_epochs=num_epochs,
        batch_size=batch_size)

    # Validation pipeline: the full test set delivered as one batch per epoch.
    validation_dataset = input_fn(
        features=test_X,
        labels=test_y,
        shuffle=False,
        num_epochs=num_epochs,
        batch_size=n_eval)

    keras_model.fit(
        training_dataset,
        steps_per_epoch=int(n_train / batch_size),
        epochs=num_epochs,
        validation_data=validation_dataset,
        validation_steps=1,
        verbose=1,
        shuffle=False,
    )

    return keras_model

In [ ]:
# Number of training examples per gradient update.
BATCH_SIZE = 256
# Step size for the Adam optimizer.
LEARNING_RATE = 0.01
# Number of passes over the training data.
NUM_EPOCHS = 25

# Train the LSTM on the preprocessed arrays; validation loss prints per epoch.
model = train_and_evaluate(BATCH_SIZE, LEARNING_RATE, NUM_EPOCHS, WINDOW_SIZE)

Evaluate model


In [ ]:
def predict(model, X, y, scaler):
    """Runs the model on X and reports RMSE in original (unscaled) units.

    Returns:
      Tuple (y_pred, rmse): predictions mapped back to raw counts, and the
      root-mean-squared error against the equally unscaled targets.
    """
    # Undo the RobustScaler so the error is measured in crime counts.
    actual = scaler.inverse_transform(y)
    predicted = scaler.inverse_transform(model.predict(X))
    return predicted, sqrt(mean_squared_error(actual, predicted))

train_predict, _ = predict(model, train_X, train_y, scaler)
test_predict, rmse = predict(model, test_X, test_y, scaler)

# Scaled-space training loss for reference, then the test RMSE in raw counts.
model.evaluate(train_X, train_y)
print(rmse)

Plot predictions


In [ ]:
# Create a new dataframe with the same index and columns as df to hold predictions
df_test_predict = pd.DataFrame().reindex_like(df)
# Assign test predictions to the end of the dataframe. The offset skips the
# training rows plus the two windows dropped by create_dataset (one per split).
# Use .loc instead of chained indexing, which triggers SettingWithCopyWarning
# and is not guaranteed to write through to the dataframe.
start = len(train_predict) + (WINDOW_SIZE * 2)
df_test_predict.loc[df_test_predict.index[start:], 'count'] = np.squeeze(test_predict)
# Join the predictions onto the actuals, renaming the column to avoid collision.
# Rename columns only: converting the index to str (rename(index=str, ...))
# would break the join's alignment against df's datetime index.
df_combined = df.join(df_test_predict.rename(columns={'count': 'predicted'}))

# Plot the predicted vs actual counts
plt.figure(figsize=(20, 6))
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    sns.lineplot(data=df_combined).set_title('Daily Crime Reports')
plt.show()