In [2]:
import numpy as np
import pickle
import os
import sys

import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns


/usr/local/lib/python2.7/dist-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

In [3]:
# %load '../predictor/util.py'
'''
Utility Functions.
TODO(nautilik): These functions have currently only been tested with the Boston
data set!

Authors:
    Alex Wang (alexwang@college.harvard.edu)
    Luis Perez (luis.perez.live@gmail.com)
Copyright 2015, Harvard University
'''

# The mapping of data matrix columns to indexes.
columns = {'t': 0, 'x': 1, 'y': 2, 'count': 3}


def split(X, tr_size):
    '''
    Splits input matrix X. If tr_size = 0, splits by final year.
    Note that the ratio for the final year is currently hard-coded!
    '''
    n_col = np.shape(X)[1]
    if tr_size != 0:
        Y = np.copy(X)
        np.random.shuffle(Y)
        break_pt = tr_size * np.shape(X)[0]
        train, test = Y[:break_pt, :], Y[break_pt:, :]
    else:
        break_pt = (3500. / 4400.) * np.shape(X)[0]
        train, test = X[:break_pt, :], X[break_pt:, :]

    tr_t, te_t = train[:, n_col - 1], test[:, n_col - 1]
    tr, te = train[:, range(n_col - 1)], test[:, range(n_col - 1)]
    return tr, tr_t, te, te_t


def normalize_features(X_train):
    '''
    Implementation notes: set NaN to mean.
    Generally normalizes X_train across all columns.
    '''
    mean_X_train = np.nanmean(X_train, 0)
    for i in xrange(np.shape(X_train)[1]):
        col = X_train[:, i]
        col[np.isnan(col)] = mean_X_train[i]
    std_X_train = np.std(X_train, 0)
    std_X_train[std_X_train == 0] = 1
    X_train_normalized = (X_train - mean_X_train) / std_X_train
    return X_train_normalized


def bucket(X, cols, num_buckets):
    '''
    Note: bucket edits in place
    '''
    Y = np.copy(X)
    for col in cols:
        buckets = np.linspace(np.min(X[:, col]), np.max(
            X[:, col]), num=num_buckets + 1)
        for i in xrange(num_buckets):
            X_col = Y[:, col]
            X_col[(buckets[i] <= X_col) & (X_col <= buckets[i + 1])] = i
            Y[:, col] = X_col
    return Y


def rmse(predict, true):
    '''
    Root mean square error between the predictions and the true value.
    '''
    return np.sqrt(1.0 / np.shape(predict)[0] * np.sum(np.square(predict - true)))


def createBuckets(good_data, n_buckets=15, logSpace=True):
    '''
    Count data for each cell. If logSpace is true, returns log values.
    '''

    data_b = bucket(good_data, [1, 2], n_buckets)

    n_time = int(data_b[np.argmax(data_b[:, 0])][0])

    # buckets = np.zeros((n_time, n_buckets, n_buckets))
    buckets2 = np.zeros((n_buckets * n_buckets * n_time, 4))

    # divide the data up by year and month
    for i in xrange(n_time):
        for j in xrange(n_buckets):
            for k in xrange(n_buckets):
                count = data_b[(data_b[:, 0] == i + 1) &
                               (data_b[:, 1] == j) &
                               (data_b[:, 2] == k)]
                # buckets[i][j][k] = np.size(count,0)
                buckets2[i * (n_buckets * n_buckets) +
                         j * (n_buckets) + k][0] = i
                buckets2[i * (n_buckets * n_buckets) +
                         j * (n_buckets) + k][1] = j
                buckets2[i * (n_buckets * n_buckets) +
                         j * (n_buckets) + k][2] = k
                buckets2[i * (n_buckets * n_buckets) + j *
                         (n_buckets) + k][3] = np.size(count, 0)
    print np.shape(buckets2)

    if logSpace:
        buckets2[:, 3] = np.log(np.add(sys.float_info.epsilon, buckets2[:, 3]))

    return buckets2

In [4]:
# %load '../predictor/plot.py'
'''
Plotting utilities.

Authors:
    Alex Wang (alexwang@college.harvard.edu)
    Luis Perez (luis.perez.live@gmail.com)
Copyright 2015, Harvard University
'''

def createHeatMap(X):
    '''
    Given a data set, creates a heatmap of it based on x,y coordinates.
    Ignore the temporal feature. You should subset the data before passing
    it into this function if you'd like a heatmap for a specific time period.
    '''
    n = X[:, columns['x']].astype(int).max()
    m = X[:, columns['y']].astype(int).max()
    heatmap = np.zeros((n, m))
    for i in xrange(n):
        for j in xrange(m):
            total = X[(X[:, columns['x']] == i) &
                      (X[:, columns['y']] == j), columns['count']].sum()
            if total > 0:
                heatmap[i, j] = total
    heatmap = heatmap / float(heatmap.sum())
    return heatmap


def plotDistribution(predict, true, city, n, process='GP'):
    '''
    Make some plots for n = 15 for GP process
    '''
    minValue = min(len(predict), 100)
    yPred = predict[-minValue:]
    yTrue = true[-minValue:]

    # Plot Crime for Final Time Period
    plt.clf()
    plt.plot(yPred, label="Predictions")
    plt.plot(yTrue, label="Actual Data")
    plt.title('Crimes using {} in Last Time Period'.format(process))
    plt.xlabel('Final {}} Regions'.format(minValue))
    plt.ylabel('Crime Count')
    plt.legend()
    savefile = os.path.abspath('figures/{}_results/{}_crime_n={}_periods=last.png'.format(
        city, process, n))
    plt.savefig(savefile)
    plt.close()

    print "Crimes for final period saved to {}".format(savefile)

    # Plot crime distribution for final period
    yPred = yPred / float(np.sum(yPred))
    yTrue = yTrue / float(np.sum(yTrue))
    plt.clf()
    plt.plot(yPred, label="Predictions")
    plt.plot(yTrue, label="Actual Data")
    plt.title('Predictive Distribution using {} in Last Time Period'.format(process))
    plt.xlabel('Final {}} Regions'.format(minValue))
    plt.ylabel('Probability')
    plt.legend()
    savefile = os.path.abspath('figures/{}_results/{}_dist_n={}_periods=last.png'.format(
        city, process, n))
    plt.savefig(savefile)
    plt.close()

    print "Distribution saved to {}".format(savefile)

    yPred = predict[:minValue]
    yTrue = true[:minValue]

    # Plot Crime for First Time Period
    plt.clf()
    plt.plot(yPred, label="Predictions")
    plt.plot(yTrue, label="Actual Data")
    plt.title('Crimes using {} in First Time Period'.format(process))
    plt.xlabel('Final {}} Regions'.format(minValue))
    plt.ylabel('Crime Count')
    plt.legend()
    savefile = os.path.abspath('figures/{}_results/{}_crime_n={}_periods=first.png'.format(
        city, process, n))
    plt.savefig(savefile)
    plt.close()

    print "Crimes for first period saved to {}".format(savefile)

    yPred = yPred / float(np.sum(yPred))
    yTrue = yTrue / float(np.sum(yTrue))
    plt.clf()
    plt.plot(yPred, label="Predictions")
    plt.plot(yTrue, label="Actual Data")
    plt.title(
        'Predictive Distribution using {} in First Time Period'.format(process))
    plt.xlabel('Final 100 Regions')
    plt.ylabel('Probability')
    plt.legend()
    savefile = os.path.abspath('figures/{}_results/{}_dist_n={}_period=first.png'.format(
        city, process, n))
    plt.savefig(savefile)
    plt.close()

    print "Distribution saved to {}".format(savefile)


def plotHeatMaps(X_test, predict, city, n, process='GP'):
    '''
    Plots the heatmap based on the X_test data matrix and the predictions.
    Note that X_test must have a final column with the true values of the input
    vectors.
    '''
    # Attach the predictions to the data
    trueValues = np.copy(X_test)
    predictedValues = np.copy(X_test)
    predictedValues[:, columns['count']] = predict.reshape((predict.shape[0]))

    # Now we want to plot the heatmaps for the predictions/actual data
    # by time period
    months = np.unique(X_test[:, columns['t']])
    for month in months:
        # Create the heatmaps
        selected = (X_test[:, columns['t']] == month)
        if selected.sum() > 0:
            plt.clf()
            m1 = createHeatMap(trueValues[selected, :])
            m2 = createHeatMap(predictedValues[selected, :])

            # Make a plot only if both have data available :)
            if m1.sum() > 0 and m2.sum > 0:
                sns.heatmap(m1)
                plt.title('True Density Distribution in Month {}'.format(month))
                savefile = os.path.abspath('figures/{}_results/{}_heatmap_true_n={}_t={}.png'.format(
                    city, process, n, month))
                plt.savefig(savefile)
                plt.close()
                print "True heatmap saved to {}".format(savefile)

                sns.heatmap(m2)
                plt.title(
                    'Predicted Density Distribution in Month {}'.format(month))
                savefile = os.path.abspath('figures/{}_results/{}_heatmap_pred_n={}_t={}.png'.format(
                    city, process, n, month))
                plt.savefig(savefile)
                plt.close()
                print "Predictions heatmap saved to {}".format(savefile)

In [11]:
def KL(p,q):
    logs = np.copy(p + sys.float_info.epsilon)
    logs = p*np.log(logs/q)
    return np.sum(logs)

In [5]:
# Load in the base data for boston
'''
Read in data
'''

import pandas as pd
import re
import warnings

bos_file = '../data/boston.csv'
target_type = str  # The desired output type

with warnings.catch_warnings(record=True) as ws:
    warnings.simplefilter("always")

    bos_data = pd.read_csv(bos_file, sep=",", header=0)
    print("Warnings raised:", ws)
    # We have an error on specific columns, try and load them as string
    for w in ws:
        s = str(w.message)
        print("Warning message:", s)
        match = re.search(r"Columns \(([0-9,]+)\) have mixed types\.", s)
        if match:
            columns = match.group(1).split(',') # Get columns as a list
            columns = [int(c) for c in columns]
            print("Applying %s dtype to columns:" % target_type, columns)
            bos_data.iloc[:,columns] = bos_data.iloc[:,columns].astype(target_type)

'''
Featurize data
'''
# temporal features
# day of week
day = np.array(bos_data.DAY_WEEK)
day[ day == "Sunday"] = 0
day[ day == "Monday"] = 1
day[ day == "Tuesday"] = 2
day[ day == "Wednesday"] = 3
day[ day == "Thursday"] = 4
day[ day == "Friday"] = 5
day[ day == "Saturday"] = 6

# Split mm/dd/yyyy xx:yy:zz AM/PM into components
date_time = np.array([x.split() for x in bos_data.FROMDATE])
date = date_time[:,0]
time = date_time[:,1]
tod = date_time[:,2]

# month, day, year
date = np.array([x.split('/') for x in date])
month = [int(x) for x in date[:,0]]
dom = [int(x) for x in date[:,1]]
year = [int(x) for x in date[:,2]]
# months since Jan 2012
time_feat = np.subtract(year, 2012)*12 + month

# time of day
time_c = [x.split(':') for x in time]
time = [int(x[1]) if (y == 'AM' and int(x[0]) == 12) else 60*int(x[0])+int(x[1]) 
        if (y =='AM' and int(x[0]) != 12) or (int(x[0]) == 12 and y == 'PM') else 12*60+60*int(x[0])+int(x[1]) 
        for x,y in zip(time_c, tod)]

# grab the features we want
data_unnorm = np.transpose(np.vstack((time_feat, bos_data.X, bos_data.Y))).astype(float)
# remove NaNs
good_data = data_unnorm[~(np.isnan(data_unnorm[:,1]))]


('Warnings raised:', [<warnings.WarningMessage object at 0x7f791b239e10>])
('Warning message:', 'Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.')
("Applying <type 'str'> dtype to columns:", [10])

In [20]:
good_data = good_data[(good_data[:,0] != good_data[:,0].max())]

In [35]:
(train_t > 0).sum()


Out[35]:
1636

In [21]:
log = False
%time data = createBuckets(good_data, n_buckets=10, logSpace=log)
%time train, train_t, test, test_t = split(good_data, 0)


(4300, 4)
CPU times: user 10.9 s, sys: 12 ms, total: 10.9 s
Wall time: 10.9 s
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 1.75 ms

In [25]:
import GPy as gp

In [ ]:
# Collect likelihoods for different n values
testN = range(2,7) + range(8,11,2)
likelihoods = []
optimize = True
likelihoods = {}
KLs = {}

In [34]:
for n in testN:
    # Bucketize the data as specified! By default, does Boston data.
    data = createBuckets(good_data, n, logSpace=False)

    # Split for latest year.
    %time train, train_t, test, test_t = split(data, 0)

    # Train the model and optimize it too
    k_lin = gp.kern.Linear(input_dim = 3)
    k_per = gp.kern.StdPeriodic(input_dim = 3, ARD1= True, ARD2=True)
    
    # Linear times periodic
    k = k_lin * k_per
    
    # Optmize the values
    train_t = train_t.reshape((train_t.shape[0], 1))
    %time m = gp.models.GPRegression(train, train_t, k)
    
    # Fix the variance
    if optimize:
        m.Gaussian_noise.variance.constrain_fixed(train_t.std())

        # Optimize
        m.optimize(messages=True, max_iters=150)

    # Get likelihood
    likelihoods[n] = m.log_likelihood()
    
    
    predictions_optimal = m.predict(test)
    preds2 = predictions_optimal[0]
    preds_p = preds2.clip(min=sys.float_info.epsilon)
    n_test_t = test_t / np.sum(test_t)
    n_preds = preds_p / np.sum(preds_p)
    
    KLs[n] = KL(n_test_t, n_preds)
    print "KL {}".format(KLs[n])


(172, 4)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 66 µs
CPU times: user 52 ms, sys: 0 ns, total: 52 ms
Wall time: 52.7 ms
WARNING: reconstraining parameters GP_regression.Gaussian_noise.variance
Running L-BFGS-B (Scipy implementation) Code:
  runtime   i     f              |g|        
    00s15  007   1.723476e+04   6.333227e+11 
    00s29  015   1.018111e+04   2.487903e+09 
    00s60  037   3.050891e+03   4.307706e+08 
    01s22  080   1.243379e+03   5.520190e+09 
    01s82  126   1.079361e+03   1.672625e+07 
    02s18  152   1.026266e+03   1.578293e+03 
Runtime:     02s18
Optimization status: Maximum number of f evaluations reached

KL 334.144929202
(387, 4)
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 60.1 µs
CPU times: user 308 ms, sys: 0 ns, total: 308 ms
Wall time: 307 ms
WARNING: reconstraining parameters GP_regression.Gaussian_noise.variance
Running L-BFGS-B (Scipy implementation) Code:
  runtime   i     f              |g|        
    00s61  005   1.322473e+05   2.729948e+07 
    01s53  017   2.571787e+03   9.403852e+05 
    02s46  026   2.540849e+03   2.420951e+05 
    04s73  051   2.481302e+03   4.044285e+05 
    06s02  068   2.444315e+03   7.940125e+03 
    12s43  152   2.443585e+03   6.918906e+02 
Runtime:     12s43
Optimization status: Maximum number of f evaluations reached

KL 459.765828487
(688, 4)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 99.9 µs
CPU times: user 1.07 s, sys: 0 ns, total: 1.07 s
Wall time: 1.07 s
WARNING: reconstraining parameters GP_regression.Gaussian_noise.variance
Running L-BFGS-B (Scipy implementation) Code:
  runtime   i     f              |g|        
    03s48  009   1.304406e+05   6.188847e+10 
    04s71  013   8.529696e+04   1.062896e+11 
    05s65  016   2.227780e+04   6.218604e+09 
    07s57  022   1.534771e+04   5.949242e+09 
    09s71  029   1.069103e+04   1.728876e+07 
    12s51  039   5.734223e+03   7.094386e+10 
    19s63  063   4.791008e+03   2.433776e+06 
    32s28  104   4.648944e+03   3.990199e+08 
    34s34  111   4.552994e+03   3.463919e+07 
    39s61  128   4.181903e+03   1.642305e+07 
    41s13  133   4.179345e+03   8.754646e+06 
    46s70  152   4.078065e+03   7.381075e+07 
Runtime:     46s70
Optimization status: Maximum number of f evaluations reached

KL 104.389940851
(1075, 4)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 91.1 µs
CPU times: user 2.86 s, sys: 0 ns, total: 2.86 s
Wall time: 2.86 s
WARNING: reconstraining parameters GP_regression.Gaussian_noise.variance
Running L-BFGS-B (Scipy implementation) Code:
  runtime   i     f              |g|        
    03s86  002   1.794714e+05   1.187873e+05 
    13s55  012   7.023699e+04   2.743041e+12 
    18s32  017   8.399863e+03   3.226173e+24 
    28s39  027   8.495260e+03   7.821173e+05 
    45s98  045   8.366149e+03   5.266016e+24 
Runtime:     45s98
Optimization status: Converged

KL 450.514446422
(1548, 4)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 78.2 µs
CPU times: user 6.65 s, sys: 0 ns, total: 6.65 s
Wall time: 6.65 s
WARNING: reconstraining parameters GP_regression.Gaussian_noise.variance
Running L-BFGS-B (Scipy implementation) Code:
  runtime   i     f              |g|        
    32s04  011   2.671914e+04   8.519395e+10 
    56s56  021   1.437191e+04   5.630286e+06 
 01m01s73  023   1.437157e+04   5.627959e+06 
 01m18s40  031   1.437053e+04   5.621521e+06 
 01m27s05  035   1.436983e+04   5.686845e+06 
 01m29s43  036   1.436983e+04   5.617659e+06 
 01m36s81  039   1.436913e+04   5.614126e+06 
 02m01s83  051   1.436771e+04   5.608005e+06 
 02m07s33  053   1.436735e+04   5.606663e+06 
 02m24s05  061   1.436628e+04   5.603069e+06 
 02m31s33  064   1.436556e+04   5.601020e+06 
 02m44s35  071   1.436484e+04   5.599237e+06 
 03m02s12  078   1.436374e+04   5.597042e+06 
 03m27s66  091   1.436190e+04   5.594589e+06 
 03m35s56  095   1.436116e+04   5.665674e+06 
 03m47s77  101   1.436041e+04   5.593639e+06 
 03m55s67  105   1.435966e+04   5.665739e+06 
 04m08s17  111   1.435890e+04   5.593526e+06 
 04m26s38  119   1.435738e+04   5.594194e+06 
 04m29s24  121   1.435738e+04   5.594194e+06 
 04m47s72  129   1.435584e+04   5.595592e+06 
 05m10s99  141   1.435428e+04   5.597673e+06 
 05m26s10  148   1.435310e+04   5.599658e+06 
 05m33s91  152   1.435270e+04   5.600393e+06 
Runtime:  05m33s91
Optimization status: Maximum number of f evaluations reached

KL 292.48975078
(2752, 4)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 93 µs
CPU times: user 27.1 s, sys: 268 ms, total: 27.4 s
Wall time: 27.5 s
WARNING: reconstraining parameters GP_regression.Gaussian_noise.variance
Running L-BFGS-B (Scipy implementation) Code:
  runtime   i     f              |g|        
    24s60  001   1.887743e+05   4.188282e+03 
 01m31s26  007   1.816467e+05   7.608504e+08 
 02m41s02  013   4.938900e+04   2.289498e+11 
 03m51s74  019   3.441288e+04   1.100598e+12 
 04m47s16  024   1.965860e+04   2.388761e+08 
 06m30s32  033   1.904593e+04   7.729795e+07 
 15m53s15  047   1.387074e+04   4.277675e+07 
 18m08s52  059   1.367532e+04   7.741339e+04 
 23m54s35  090   1.336880e+04   3.903092e+07 
 25m20s92  098   1.315253e+04   3.337416e+07 
 26m04s48  102   1.306470e+04   1.379396e+07 
 26m37s02  105   1.321097e+04   1.475578e+09 
 28m30s89  115   1.298314e+04   3.951632e+06 
 30m05s30  123   1.297400e+04   4.191305e-01 
Runtime:  30m05s30
Optimization status: Converged

KL 5860.27785014
(4300, 4)
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 96.1 µs
CPU times: user 1min 12s, sys: 14.7 s, total: 1min 26s
Wall time: 1min 35s
WARNING: reconstraining parameters GP_regression.Gaussian_noise.variance
Running L-BFGS-B (Scipy implementation) Code:
  runtime   i     f              |g|        
 05m17s65  007   1.622823e+05   4.723162e+13 
 11m44s76  017   6.707304e+04   7.615566e+11 
 13m43s07  020   2.673694e+05   9.452543e-43 
 18m24s06  027   2.085235e+04   9.295752e+09 
 22m35s54  033   1.831159e+04   9.251310e+07 
 33m39s29  050   1.795366e+04   5.426638e+08 
 36m56s76  055   1.785071e+04   7.158137e+06 
 42m54s10  064   1.962258e+04   1.018331e+11 
 48m23s38  072   1.779754e+04   8.586540e+05 
 01h02m41  094   1.727396e+04   3.246583e+05 
 01h12m17  107   1.724344e+04   3.775361e+03 
 01h19m09  118   1.724324e+04   1.697708e+04 
 03h06m33  142   1.724321e+04   3.043073e+02 
 03h13m11  152   1.724320e+04   1.470197e+02 
Runtime:  03h13m11
Optimization status: Maximum number of f evaluations reached

KL 8802.04737447

In [37]:
KLs


Out[37]:
{2: 334.14492920210171,
 3: 459.76582848657102,
 4: 104.38994085124376,
 5: 450.51444642156696,
 6: 292.48975078031987,
 8: 5860.2778501384018,
 10: 8802.0473744740884}

In [46]:
plt.clf()
x = KLs.keys()
kls = KLs.values()
likelihood = likelihoods.values()
plt.plot(x, likelihood)
plt.title("Linear*Periodic Kernel on Boston City Data")
plt.xlabel("Dimension of Bucketization Grid")
plt.ylabel("Log Likelihood")
plt.savefig("kl_likelihood_periodic_linear_periodic.png")

In [47]:
plt.clf()
x = KLs.keys()
kls = KLs.values()
likelihood = likelihoods.values()
plt.plot(x, kls)
plt.title("Linear*Periodic Kernel on Boston City Data")
plt.xlabel("Dimension of Bucketization Grid")
plt.ylabel("Measured KL Divergence")
plt.savefig("kl_periodic_linear_periodic.png")

In [ ]: