In [10]:
'''
Notebook to calculate a baseline using a simple per-region average predictor.
'''
import sys          # used for stdout flushing in the cells below
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("darkgrid")
In [2]:
'''
Utility functions.
'''
# Mapping for data matrix columns.
columns = { 'x' : 0,
'y' : 1,
'region' : 2,
't' : 3,
'count' : 4 }
# Author: Alex Wang -- replaces NaNs with the column mean, then standardizes each column.
def normalize_features(X_train):
mean_X_train = np.nanmean(X_train, 0)
for i in xrange(np.shape(X_train)[1]):
col = X_train[:,i]
col[ np.isnan(col) ] = mean_X_train[i]
std_X_train = np.std(X_train, 0)
std_X_train[ std_X_train == 0 ] = 1
X_train_normalized = (X_train - mean_X_train) / std_X_train
return X_train_normalized
def rmse(predict, true):
# Returns the root mean squared error.
return np.sqrt(1.0/np.shape(predict)[0] * np.sum(np.square(predict - true)))
def randomSplit(X, split_size):
    # Shuffles X in place, then splits it so that the first chunk holds
    # split_size of the rows and the second chunk holds the rest.
    np.random.shuffle(X)
    break_pt = int(split_size * np.shape(X)[0])
    return X[:break_pt,:], X[break_pt:,:]
def splitLastN(X, t):
    # Splits the X data matrix into historical data and the data for the
    # last t time periods.
    times = np.unique(X[:, columns['t']])
    lowBound = np.sort(times)[len(times) - t]
    # Strict inequality, so that exactly the last t time periods end up
    # in the second chunk.
    selected = X[:, columns['t']] < lowBound
    return X[selected,:], X[~selected,:]
def buckets(series, n):
    # Takes a series and returns an integer array mapping each element to
    # one of n equal-width buckets (0 .. n-1).
    mi, ma = series.min(), series.max()
    edges = np.linspace(mi, ma, n + 1)
    res = np.zeros(len(series))
    try:
        array = series.values
    except AttributeError:
        array = series
    if np.isnan(array).any():
        print "Error! NaN values found in series!"
    for i in xrange(n):
        res[(edges[i] <= array) & (array < edges[i+1])] = i
    # The maximum value equals the last edge, so place it in the last
    # bucket explicitly instead of letting it fall through to bucket 0.
    res[array == ma] = n - 1
    return res.astype(int)
def createSimplePartitions(data, n):
    # Adds xRegion, yRegion, and Region columns to data, partitioning it
    # into an n x n grid of regions, and returns the modified frame.
    data['xRegion'] = buckets(data.Latitude, n)
    data['yRegion'] = buckets(data.Longitude, n)
    data['Region'] = n * data.xRegion + data.yRegion
    return data
def extractRegionsFromMatrix(X_data, n):
    # Same as extractDataMatrix, but takes as input a raw matrix of
    # (latitude, longitude, time period) rows instead of a DataFrame.
xRegion = buckets(X_data[:, 0], n)
yRegion = buckets(X_data[:, 1], n)
Region = n * xRegion + yRegion
xRegions = np.unique(xRegion).astype(int)
yRegions = np.unique(yRegion).astype(int)
regions = np.unique(Region).astype(int)
months = np.unique(X_data[:,2])
num_columns = 5
num_rows = len(regions) * len(months)
X_res = np.zeros((num_rows, num_columns))
el = 0
for x in xRegions:
for y in yRegions:
for month in months:
reg = n * x + y
count = len(X_data[ (Region == reg) &
(X_data[:, 2] == month)])
if count > 0:
X_res[el, :] = np.array([x,y,reg, month, count])
el += 1
    # Convert to integers and drop any rows that were never filled in.
X_res = X_res.astype(int)
if el < X_res.shape[0]:
print "Removing empty values from our data!"
print "Rows before: {}".format(X_res.shape[0])
X_res = X_res[~np.all(X_res == 0, axis=1)]
print "Rows after: {}".format(X_res.shape[0])
return X_res
def createDataMatrix(data):
    '''
    Transforms a pandas DataFrame into an N x 3 matrix of
    (latitude, longitude, time period) crime records.
    '''
X = np.zeros((len(data), 3))
X[:, 0] = data.Latitude.values.astype(float)
X[:, 1] = data.Longitude.values.astype(float)
X[:, 2] = data.TimeFeature.values.astype(int)
return X
def extractDataMatrix(data, n):
    # Creates an N x D data matrix from the given data set.
    # data must contain Latitude, Longitude, and TimeFeature columns;
    # the xRegion, yRegion, and Region columns are added here.
    #   0 -> xRegion
    #   1 -> yRegion
    #   2 -> Region
    #   3 -> Month
    #   4 -> Count
    # The data is NOT normalized! Column indices follow the module-level
    # `columns` mapping defined above.
partData = createSimplePartitions(data, n)
regions = partData.Region.unique()
xRegions = partData.xRegion.unique()
yRegions = partData.yRegion.unique()
months = partData.TimeFeature.unique()
num_columns = 5
num_rows = len(regions) * len(months)
X_data = np.zeros((num_rows, num_columns))
el = 0
for x in xRegions:
for y in yRegions:
for month in months:
count = len(data[ (data.xRegion == x) &
(data.yRegion == y) &
(data.TimeFeature == month)])
if count > 0:
X_data[el, :] = np.array([x,y,n*x + y, month, count])
el += 1
if el < X_data.shape[0]:
print "Removing empty values from our data!"
print "Rows before: {}".format(X_data.shape[0])
X_data = X_data[~np.all(X_data == 0, axis=1)]
print "Rows after: {}".format(X_data.shape[0])
return X_data.astype(int)
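In [ ]:
'''
Quick sanity check (added for illustration only; the coordinates below are
made up and are not part of the San Francisco data). It exercises buckets()
and extractRegionsFromMatrix() on a tiny synthetic (latitude, longitude,
time period) matrix and indexes the result through the `columns` mapping.
'''
toy = np.array([[37.70, -122.50, 0],
                [37.70, -122.40, 0],
                [37.80, -122.40, 0],
                [37.80, -122.40, 1],
                [37.80, -122.50, 1]])
# Per-axis bucket assignments for a 2 x 2 grid.
print buckets(toy[:, 0], 2)
print buckets(toy[:, 1], 2)
# Collapse the points into per-region, per-time-period counts.
toy_counts = extractRegionsFromMatrix(toy, 2)
print toy_counts[:, columns['region']]
print toy_counts[:, columns['count']]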
In [3]:
'''
More utility functions.
'''
# Note: if a (region, t) pair does not appear in the data, its count is
# treated as 0; dividing by tMax below relies on this convention.
def averagePredictions(X_train, nRegions, tMax):
    # Returns an array indexed by region with each region's average count
    # per time period over the training set.
    averages = np.zeros(nRegions)
    for region in xrange(nRegions):
        averages[region] = X_train[
            X_train[:, columns['region']] == region,
            columns['count']].sum() / float(tMax)
    return averages
def createHeatMap(X):
'''
Given a data set, creates a heatmap of it based on x,y coordinates.
Ignore the temporal feature. You should subset the data before passing
it into this function if you'd like a heatmap for a specific time period.
'''
    # Region indices run from 0 to the maximum value inclusive, so add 1
    # to avoid silently dropping the last row/column of the grid.
    n = X[:, columns['x']].astype(int).max() + 1
    m = X[:, columns['y']].astype(int).max() + 1
    heatmap = np.zeros((n, m))
    for i in xrange(n):
        for j in xrange(m):
            total = X[(X[:, columns['x']] == i) &
                      (X[:, columns['y']] == j), columns['count']].sum()
            if total > 0:
                heatmap[i,j] = total
    # Normalize to a probability distribution (guard against an empty map).
    if heatmap.sum() > 0:
        heatmap = heatmap / float(heatmap.sum())
return heatmap
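In [ ]:
'''
Small illustration of averagePredictions (added for clarity; the counts are
hypothetical). Region 0 only has entries for two of the four time periods,
so, because a missing (region, t) pair counts as 0, its average is
(3 + 1) / 4 = 1.0, while region 1 averages (2 * 4) / 4 = 2.0.
'''
toy_train = np.array([[0, 0, 0, 1, 3],
                      [0, 0, 0, 3, 1],
                      [0, 1, 1, 1, 2],
                      [0, 1, 1, 2, 2],
                      [0, 1, 1, 3, 2],
                      [0, 1, 1, 4, 2]])
print averagePredictions(toy_train, nRegions=2, tMax=4)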
In [4]:
# Plotting helpers shared with the GP notebooks; the `process` argument
# labels the model in plot titles and output file names.
def plotDistribution(predict, true, city, n, process='GP'):
minValue = min(len(predict), 100)
yPred = predict[-minValue:]
yTrue = true[-minValue:]
yPred = yPred / float(np.sum(yPred))
yTrue = yTrue / float(np.sum(yTrue))
plt.clf()
plt.plot(yPred, label="Predictions")
plt.plot(yTrue, label="Actual Data")
plt.title('Predictive Distribution for {}'.format(process))
plt.xlabel('Compressed Features')
plt.ylabel('Probability')
plt.legend()
plt.savefig('../figures/{}_results/{}_n={}.png'.format(
city, process, n))
plt.close()
def plotHeatMaps(X_test, predict, city, n, process='GP'):
# Attach the predictions to the data
trueValues = np.copy(X_test)
predictedValues = np.copy(X_test)
predictedValues[:, columns['count']] = predict
# Now we want to plot the heatmaps for the predictions/actual data
# by time period
months = np.unique(X_test[:, columns['t']])
for month in months:
# Create the heatmaps
selected = (X_test[:, columns['t']] == month)
if selected.sum() > 0:
plt.clf()
m = createHeatMap(trueValues[selected, :])
if m.sum() > 0:
sns.heatmap(m)
plt.title('True Density Distribution in Month {}'.format(month))
plt.savefig('../figures/{}_results/{}_heatmap_true_n={}_t={}.png'.format(
city, process, n, month))
plt.close()
plt.clf()
m = createHeatMap(predictedValues[selected, :])
if m.sum() > 0:
sns.heatmap(m)
plt.title('Predicted Density Distribution in Month {}'.format(month))
plt.savefig('../figures/{}_results/{}_heatmap_pred_n={}_t={}.png'.format(
city, process, n, month))
plt.close()
In [12]:
# Given a value of n:
#   0. Normalize the data (if normalize=True)
#   1. Partition the data into regions
#   2. Split into train/test sets.
#      splitMethod options: 'random', 'last'
#      splitRatio specifies the fraction of the data to hold out for
#      testing (used by the 'random' split).
#      testPeriods specifies the number of final time periods to hold out
#      (used by the 'last' split).
#   3. Train the per-region averages
#   4. Predict on the hold-out set
#   5. Calculate the RMSE between the predicted and true distributions
def averageModel(n, X_data, normalize = False, splitMethod = 'random',
                 splitRatio = 0.2, testPeriods = 12, plot=None):
    if normalize:
        # Note: normalize_features rescales every column, including the
        # region and count columns, so it is left off for the baseline.
        X_data = normalize_features(X_data)
        print "Normalized data features!"
        sys.stdout.flush()
    # tMax is the latest time period; region ids run from 0 to the maximum,
    # so nRegions is that maximum plus one.
    tMax = X_data[:, columns['t']].max()
    nRegions = X_data[:, columns['region']].max() + 1
    # Now we can split the data!
    if splitMethod == 'random':
        # randomSplit returns the first splitRatio fraction of rows as its
        # first chunk; use that chunk as the test set so that splitRatio is
        # the hold-out fraction.
        X_test, X_train = randomSplit(X_data, splitRatio)
elif splitMethod == 'last':
X_train, X_test = splitLastN(X_data, testPeriods)
else:
raise Exception("splitMethod {} unsupported".format(splitMethod))
print "Training model..."
sys.stdout.flush()
# Now use training data to calculate averages
model = averagePredictions(X_train, nRegions, tMax)
print "Averages Model:"
print model
sys.stdout.flush()
# Generate predictions vector
predict = model[X_test[:, columns['region']]]
true = X_test[:, columns['count']]
# Create distributions, and calculate the RMSE of the distribution
predict_dist = predict / float(np.sum(predict))
true_dist = true / float(np.sum(true))
    # Plot the figures if requested; `plot` doubles as the city name used
    # in the output file paths.
    if plot is not None:
        plotDistribution(predict, true, plot, n, process='Baseline')
        plotHeatMaps(X_test, predict, plot, n, process='Baseline')
print "Calculating RMSE..."
sys.stdout.flush()
return rmse(predict_dist, true_dist)
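In [ ]:
'''
End-to-end smoke test of averageModel on a small synthetic count matrix
(hypothetical values, independent of the San Francisco data). Every
(region, t) pair is present, so the split and the distribution RMSE are
well defined; the printed RMSE has no bearing on the real results below.
'''
toy_rows = []
for reg in xrange(4):
    for t in xrange(1, 11):
        # 2 x 2 grid: x = reg // 2, y = reg % 2; counts grow with region id.
        toy_rows.append([reg // 2, reg % 2, reg, t, reg + 1])
toy_X = np.array(toy_rows, dtype=int)
print averageModel(2, toy_X, splitMethod='last', testPeriods=2)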
In [8]:
# Let's make a plot for some values of N to see if the data works out...
sfdata_file = '../../cs281_data/large_data/sfclean.pk'
with open(sfdata_file, 'rb') as fp:
    data = pickle.load(fp)
# For the SF data we need to remove coordinate outliers.
data = data[(data.Longitude < -120) & (data.Longitude > -130)]
data = data[(data.Latitude > 37) & (data.Latitude < 40)]
In [13]:
import sys
# Create the data matrix for San Francisco
X_data = createDataMatrix(data)
testN = range(2,10) + range(10,30,5)
rmses = []
for n in testN:
print "n = {}".format(n)
# Extracting more efficiently!
%time X_region = extractRegionsFromMatrix(X_data, n)
# print X_data.dtype
print "Partitioned data..."
sys.stdout.flush()
rmse_random = averageModel(n, X_region)
print "Random RMSE: {}".format(rmse_random)
sys.stdout.flush()
rmse_last = averageModel(n, X_region, plot='sf', splitMethod='last', testPeriods=6)
rmses.append((rmse_random, rmse_last))
print "Last RMSE: {}".format(rmse_last)
sys.stdout.flush()
In [ ]:
x = testN[:len(rmses)]
y1,y2 = zip(*rmses)
line1 = plt.plot(x, y1, label="Random Sample Split")
line2 = plt.plot(x, y2, label="Final 6 Time Periods")
plt.title('Baseline Predictions for San Francisco')
plt.xlabel('Dimension of Grid')
plt.ylabel('Root Mean Square Error')
plt.legend()
plt.show()
In [ ]: