In [1]:
import numpy as np
import pickle
import os
import sys
import seaborn as sns
import matplotlib.pyplot as plt
In [2]:
# %load '../predictor/util.py'
'''
Utility Functions.
TODO(nautilik): These functions have currently only been tested with the Boston
data set!
Authors:
Alex Wang (alexwang@college.harvard.edu)
Luis Perez (luis.perez.live@gmail.com)
Copyright 2015, Harvard University
'''
# The mapping of data matrix columns to indexes.
columns = {'t': 0, 'x': 1, 'y': 2, 'count': 3}
def split(X, tr_size):
    '''
    Splits the input matrix X into (train, train_targets, test, test_targets),
    where the targets are the final column of X. If tr_size == 0, the split is
    by the final year; note that the ratio for that split is currently
    hard-coded (3500/4400).
    '''
n_col = np.shape(X)[1]
if tr_size != 0:
Y = np.copy(X)
np.random.shuffle(Y)
        break_pt = int(tr_size * np.shape(X)[0])
train, test = Y[:break_pt, :], Y[break_pt:, :]
else:
        break_pt = int((3500. / 4400.) * np.shape(X)[0])
train, test = X[:break_pt, :], X[break_pt:, :]
tr_t, te_t = train[:, n_col - 1], test[:, n_col - 1]
tr, te = train[:, range(n_col - 1)], test[:, range(n_col - 1)]
return tr, tr_t, te, te_t
def normalize_features(X_train):
    '''
    Normalizes each column of X_train to zero mean and unit variance.
    Implementation note: NaN entries are first replaced with the column mean.
    '''
mean_X_train = np.nanmean(X_train, 0)
for i in xrange(np.shape(X_train)[1]):
col = X_train[:, i]
col[np.isnan(col)] = mean_X_train[i]
std_X_train = np.std(X_train, 0)
std_X_train[std_X_train == 0] = 1
X_train_normalized = (X_train - mean_X_train) / std_X_train
return X_train_normalized
def bucket(X, cols, num_buckets):
    '''
    Buckets the given columns of X into num_buckets equal-width bins,
    replacing the column values with bin indices. Works on a copy, so X
    itself is left unmodified.
    '''
Y = np.copy(X)
for col in cols:
buckets = np.linspace(np.min(X[:, col]), np.max(
X[:, col]), num=num_buckets + 1)
for i in xrange(num_buckets):
X_col = Y[:, col]
X_col[(buckets[i] <= X_col) & (X_col <= buckets[i + 1])] = i
Y[:, col] = X_col
return Y
def rmse(predict, true):
'''
Root mean square error between the predictions and the true value.
'''
return np.sqrt(1.0 / np.shape(predict)[0] * np.sum(np.square(predict - true)))
def createBuckets(good_data, n_buckets=15, logSpace=True):
    '''
    Counts the events in each (time, x, y) cell and returns a matrix with one
    row per cell and columns [t, x, y, count]. If logSpace is True, the counts
    are returned in log space.
    '''
data_b = bucket(good_data, [1, 2], n_buckets)
n_time = int(data_b[np.argmax(data_b[:, 0])][0])
# buckets = np.zeros((n_time, n_buckets, n_buckets))
buckets2 = np.zeros((n_buckets * n_buckets * n_time, 4))
# divide the data up by year and month
for i in xrange(n_time):
for j in xrange(n_buckets):
for k in xrange(n_buckets):
count = data_b[(data_b[:, 0] == i + 1) &
(data_b[:, 1] == j) &
(data_b[:, 2] == k)]
# buckets[i][j][k] = np.size(count,0)
buckets2[i * (n_buckets * n_buckets) +
j * (n_buckets) + k][0] = i
buckets2[i * (n_buckets * n_buckets) +
j * (n_buckets) + k][1] = j
buckets2[i * (n_buckets * n_buckets) +
j * (n_buckets) + k][2] = k
buckets2[i * (n_buckets * n_buckets) + j *
(n_buckets) + k][3] = np.size(count, 0)
print np.shape(buckets2)
if logSpace:
buckets2[:, 3] = np.log(np.add(sys.float_info.epsilon, buckets2[:, 3]))
return buckets2
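# Illustrative check with synthetic data (an assumed example, not part of the
# original pipeline): createBuckets returns one row per (time, x-bucket,
# y-bucket) cell with columns [t, x, y, count], i.e. shape
# (n_time * n_buckets**2, 4), and split then peels off the count column as the
# regression target.
demo = np.column_stack((np.random.randint(1, 4, size=200),   # t in {1, 2, 3}
                        np.random.uniform(0, 10, size=200),  # x coordinate
                        np.random.uniform(0, 10, size=200))) # y coordinate
demo_buckets = createBuckets(demo, n_buckets=5, logSpace=False)
tr, tr_t, te, te_t = split(demo_buckets, 0.8)
print np.shape(demo_buckets), np.shape(tr), np.shape(tr_t)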
In [3]:
# %load '../predictor/plot.py'
'''
Plotting utilities.
Authors:
Alex Wang (alexwang@college.harvard.edu)
Luis Perez (luis.perez.live@gmail.com)
Copyright 2015, Harvard University
'''
def createHeatMap(X):
    '''
    Given a data set, creates a heatmap of it based on x, y coordinates.
    The temporal feature is ignored, so subset the data before passing it
    to this function if you'd like a heatmap for a specific time period.
    '''
    n = X[:, columns['x']].astype(int).max() + 1
    m = X[:, columns['y']].astype(int).max() + 1
    heatmap = np.zeros((n, m))
for i in xrange(n):
for j in xrange(m):
total = X[(X[:, columns['x']] == i) &
(X[:, columns['y']] == j), columns['count']].sum()
if total > 0:
heatmap[i, j] = total
heatmap = heatmap / float(heatmap.sum())
return heatmap
def plotDistribution(predict, true, city, n, process='GP'):
    '''
    Plots predicted vs. actual crime counts, and the corresponding normalized
    distributions, for the first and last (up to 100) regions.
    '''
minValue = min(len(predict), 100)
yPred = predict[-minValue:]
yTrue = true[-minValue:]
# Plot Crime for Final Time Period
plt.clf()
plt.plot(yPred, label="Predictions")
plt.plot(yTrue, label="Actual Data")
plt.title('Crimes using {} in Last Time Period'.format(process))
    plt.xlabel('Final {} Regions'.format(minValue))
plt.ylabel('Crime Count')
plt.legend()
savefile = os.path.abspath('figures/{}_results/{}_crime_n={}_periods=last.png'.format(
city, process, n))
plt.savefig(savefile)
plt.close()
print "Crimes for final period saved to {}".format(savefile)
# Plot crime distribution for final period
yPred = yPred / float(np.sum(yPred))
yTrue = yTrue / float(np.sum(yTrue))
plt.clf()
plt.plot(yPred, label="Predictions")
plt.plot(yTrue, label="Actual Data")
plt.title('Predictive Distribution using {} in Last Time Period'.format(process))
    plt.xlabel('Final {} Regions'.format(minValue))
plt.ylabel('Probability')
plt.legend()
savefile = os.path.abspath('figures/{}_results/{}_dist_n={}_periods=last.png'.format(
city, process, n))
plt.savefig(savefile)
plt.close()
print "Distribution saved to {}".format(savefile)
yPred = predict[:minValue]
yTrue = true[:minValue]
# Plot Crime for First Time Period
plt.clf()
plt.plot(yPred, label="Predictions")
plt.plot(yTrue, label="Actual Data")
plt.title('Crimes using {} in First Time Period'.format(process))
    plt.xlabel('Final {} Regions'.format(minValue))
plt.ylabel('Crime Count')
plt.legend()
savefile = os.path.abspath('figures/{}_results/{}_crime_n={}_periods=first.png'.format(
city, process, n))
plt.savefig(savefile)
plt.close()
print "Crimes for first period saved to {}".format(savefile)
yPred = yPred / float(np.sum(yPred))
yTrue = yTrue / float(np.sum(yTrue))
plt.clf()
plt.plot(yPred, label="Predictions")
plt.plot(yTrue, label="Actual Data")
plt.title(
'Predictive Distribution using {} in First Time Period'.format(process))
    plt.xlabel('Final {} Regions'.format(minValue))
plt.ylabel('Probability')
plt.legend()
    savefile = os.path.abspath('figures/{}_results/{}_dist_n={}_periods=first.png'.format(
        city, process, n))
plt.savefig(savefile)
plt.close()
print "Distribution saved to {}".format(savefile)
def plotHeatMaps(X_test, predict, city, n, process='GP'):
'''
Plots the heatmap based on the X_test data matrix and the predictions.
Note that X_test must have a final column with the true values of the input
vectors.
'''
# Attach the predictions to the data
trueValues = np.copy(X_test)
predictedValues = np.copy(X_test)
predictedValues[:, columns['count']] = predict.reshape((predict.shape[0]))
# Now we want to plot the heatmaps for the predictions/actual data
# by time period
months = np.unique(X_test[:, columns['t']])
for month in months:
# Create the heatmaps
selected = (X_test[:, columns['t']] == month)
if selected.sum() > 0:
plt.clf()
m1 = createHeatMap(trueValues[selected, :])
m2 = createHeatMap(predictedValues[selected, :])
# Make a plot only if both have data available :)
            if m1.sum() > 0 and m2.sum() > 0:
sns.heatmap(m1)
plt.title('True Density Distribution in Month {}'.format(month))
savefile = os.path.abspath('figures/{}_results/{}_heatmap_true_n={}_t={}.png'.format(
city, process, n, month))
plt.savefig(savefile)
plt.close()
print "True heatmap saved to {}".format(savefile)
sns.heatmap(m2)
plt.title(
'Predicted Density Distribution in Month {}'.format(month))
savefile = os.path.abspath('figures/{}_results/{}_heatmap_pred_n={}_t={}.png'.format(
city, process, n, month))
plt.savefig(savefile)
plt.close()
print "Predictions heatmap saved to {}".format(savefile)
In [4]:
def KL(p, q):
    '''
    KL divergence between discrete distributions p and q:
    KL(p || q) = sum_i p_i * log(p_i / q_i).
    Epsilon is added inside the log to keep it finite when p_i == 0.
    '''
    logs = np.copy(p + sys.float_info.epsilon)
    logs = p * np.log(logs / q)
    return np.sum(logs)
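# Illustrative sanity check on toy distributions (assumed example, not project
# data): the divergence of a distribution with itself should be ~0, and it
# grows as the two distributions move apart.
p_demo = np.array([0.5, 0.3, 0.2])
q_demo = np.array([0.2, 0.3, 0.5])
print "KL(p, p) = {}".format(KL(p_demo, p_demo))
print "KL(p, q) = {}".format(KL(p_demo, q_demo))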
In [5]:
# Load in the base data for boston
'''
Read in data
'''
import pandas as pd
import re
import warnings
bos_file = '../data/boston.csv'
target_type = str # The desired output type
with warnings.catch_warnings(record=True) as ws:
warnings.simplefilter("always")
bos_data = pd.read_csv(bos_file, sep=",", header=0)
print("Warnings raised:", ws)
# We have an error on specific columns, try and load them as string
for w in ws:
s = str(w.message)
print("Warning message:", s)
match = re.search(r"Columns \(([0-9,]+)\) have mixed types\.", s)
if match:
            mixed_cols = match.group(1).split(',')  # Offending column indices as a list
            mixed_cols = [int(c) for c in mixed_cols]
            print("Applying %s dtype to columns:" % target_type, mixed_cols)
            bos_data.iloc[:, mixed_cols] = bos_data.iloc[:, mixed_cols].astype(target_type)
'''
Featurize data
'''
# temporal features
# day of week
day = np.array(bos_data.DAY_WEEK)
day[ day == "Sunday"] = 0
day[ day == "Monday"] = 1
day[ day == "Tuesday"] = 2
day[ day == "Wednesday"] = 3
day[ day == "Thursday"] = 4
day[ day == "Friday"] = 5
day[ day == "Saturday"] = 6
# Split mm/dd/yyyy xx:yy:zz AM/PM into components
date_time = np.array([x.split() for x in bos_data.FROMDATE])
date = date_time[:,0]
time = date_time[:,1]
tod = date_time[:,2]
# month, day, year
date = np.array([x.split('/') for x in date])
month = [int(x) for x in date[:,0]]
dom = [int(x) for x in date[:,1]]
year = [int(x) for x in date[:,2]]
# month index since the start of the data (Jan 2012 = 1)
time_feat = np.subtract(year, 2012)*12 + month
# time of day, in minutes since midnight (handles the 12 AM / 12 PM cases)
time_c = [x.split(':') for x in time]
time = [int(x[1]) if (y == 'AM' and int(x[0]) == 12)
        else 60*int(x[0]) + int(x[1]) if (y == 'AM' and int(x[0]) != 12) or (y == 'PM' and int(x[0]) == 12)
        else 12*60 + 60*int(x[0]) + int(x[1])
        for x, y in zip(time_c, tod)]
# grab the features we want
data_unnorm = np.transpose(np.vstack((time_feat, bos_data.X, bos_data.Y))).astype(float)
# remove NaNs
good_data = data_unnorm[~(np.isnan(data_unnorm[:,1]))]
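# For readability, a sketch of the same minutes-since-midnight conversion as a
# named helper (equivalent to the list comprehension above, shown only for
# clarity; the pipeline itself uses the comprehension):
def minutes_since_midnight(clock, meridiem):
    '''clock is an "hh:mm:ss" string, meridiem is "AM" or "PM".'''
    hh, mm = int(clock.split(':')[0]), int(clock.split(':')[1])
    if hh == 12:
        hh = 0            # 12 AM is hour 0; 12 PM becomes hour 12 below
    if meridiem == 'PM':
        hh += 12
    return 60 * hh + mm
# e.g. minutes_since_midnight('12:30:00', 'AM') == 30
#      minutes_since_midnight('01:15:00', 'PM') == 795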
In [6]:
good_data = good_data[(good_data[:,0] != good_data[:,0].max())]
In [21]:
log = False
%time data = createBuckets(good_data, n_buckets=10, logSpace=log)
%time train, train_t, test, test_t = split(data, 0)
In [24]:
len(train_t[train_t > 0])
Out[24]:
In [ ]:
import GPy as gp
In [ ]:
# Collect likelihoods and KL divergences for different n values
testN = range(2, 7) + range(8, 11, 2)
optimize = True
likelihoods = {}
KLs = {}
In [ ]:
for n in testN:
# Bucketize the data as specified! By default, does Boston data.
data = createBuckets(good_data, n, logSpace=False)
# Split for latest year.
%time train, train_t, test, test_t = split(data, 0)
# Train the model and optimize it too
k_lin = gp.kern.Linear(input_dim = 3)
    k_per = gp.kern.StdPeriodic(input_dim=3, ARD1=True, ARD2=True)
# Linear times periodic
k = k_lin * k_per
    # GPy expects the targets as a 2-D column vector
    train_t = train_t.reshape((train_t.shape[0], 1))
%time m = gp.models.GPRegression(train, train_t, k)
# Fix the variance
if optimize:
m.Gaussian_noise.variance.constrain_fixed(train_t.std())
# Optimize
m.optimize(messages=True, max_iters=150)
# Get likelihood
likelihoods[n] = m.log_likelihood()
predictions_optimal = m.predict(test)
preds2 = predictions_optimal[0]
preds_p = preds2.clip(min=sys.float_info.epsilon)
n_test_t = test_t / np.sum(test_t)
n_preds = preds_p / np.sum(preds_p)
KLs[n] = KL(n_test_t, n_preds)
print "KL {}".format(KLs[n])
In [ ]: