In [6]:
import numpy as np
import pandas as pd
import csv
import sklearn.linear_model as lm
import matplotlib.pyplot as plt
import sys
import seaborn as sns
import pickle
In [7]:
'''
Utility Functions
'''
# DATA: months since 2012, X coord, Y coord
# If tr_size == 0, do a deterministic split that holds out the final ~20% of rows (the 3500/4400 split) as the test set; otherwise shuffle and split at the tr_size fraction.
def split(X, tr_size):
    n_col = np.shape(X)[1]
    if tr_size != 0:
        Y = np.copy(X)
        np.random.shuffle(Y)
        break_pt = int(tr_size * np.shape(X)[0])
        train, test = Y[:break_pt,:], Y[break_pt:,:]
    else:
        break_pt = int((3500./4400.) * np.shape(X)[0])
        train, test = X[:break_pt,:], X[break_pt:,:]
    tr_t, te_t = train[:,n_col-1], test[:,n_col-1]
    tr, te = train[:,range(n_col-1)], test[:,range(n_col-1)]
    return tr, tr_t, te, te_t
# implementation notes: set NaN to mean
def normalize_features(X_train):
mean_X_train = np.nanmean(X_train, 0)
for i in xrange(np.shape(X_train)[1]):
col = X_train[:,i]
col[ np.isnan(col) ] = mean_X_train[i]
std_X_train = np.std(X_train, 0)
std_X_train[ std_X_train == 0 ] = 1
X_train_normalized = (X_train - mean_X_train) / std_X_train
return X_train_normalized
# Note: bucket works on a copy of X and returns it; the given columns are replaced by grid-cell indices 0..num_buckets-1.
def bucket(X, cols, num_buckets):
Y = np.copy(X)
for col in cols:
buckets = np.linspace(np.min(X[:,col]), np.max(X[:,col]), num=num_buckets+1)
for i in xrange(num_buckets):
X_col = Y[:,col]
X_col[ (buckets[i] <= X_col) & (X_col <= buckets[i+1])] = i
Y[:,col] = X_col
return Y
def rmse(predict, true):
return np.sqrt(1.0/np.shape(predict)[0] * np.sum(np.square(predict - true)))
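A minimal usage sketch of these helpers on synthetic data (not from the original notebook; shapes and values are invented for illustration):
# Hypothetical demo: 10 rows of (time, x, y, count), loosely mimicking the crime matrix.
demo = np.column_stack([np.arange(1, 11),             # time period 1..10
                        np.linspace(10, 19, 10),      # x coordinate
                        np.linspace(20, 29, 10),      # y coordinate
                        np.arange(10, dtype=float)])  # count (the target, last column)
demo_b = bucket(demo, [1, 2], num_buckets=3)          # x, y replaced by grid-cell ids 0..2
tr, tr_t, te, te_t = split(demo_b, 0.8)               # shuffled 80/20 split on the demo rows
print np.shape(tr), np.shape(te)                      # (8, 3) (2, 3)
print rmse(np.zeros_like(te_t), te_t)                 # RMSE of an all-zero baseline predictor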
In [8]:
def createDataMatrix(data):
'''
Transforms a pandas DataFrame into a (time period, latitude, longitude) matrix
of crime records.
'''
X = np.zeros((len(data), 3))
X[:, 1] = data.Latitude.values.astype(float)
X[:, 2] = data.Longitude.values.astype(float)
X[:, 0] = data.TimeFeature.values.astype(int)
return X
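For illustration, a tiny hand-made DataFrame with the column names used above (TimeFeature, Latitude, Longitude) runs through createDataMatrix like this (hypothetical values):
# Hypothetical two-row input.
toy = pd.DataFrame({'TimeFeature': [1, 2],
                    'Latitude': [37.77, 37.80],
                    'Longitude': [-122.42, -122.39]})
createDataMatrix(toy)
# -> array([[   1.  ,   37.77, -122.42],
#           [   2.  ,   37.8 , -122.39]])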
In [9]:
'''
Read in data
'''
# Let's make a plot for some values of N to see if the data works out...
sfdata_file = '../../cs281_data/large_data/sfclean.pk'
with open(sfdata_file, 'rb') as fp:
data = pickle.load(fp)
# For sfdata, need to remove outliers
data = data[(data.Longitude < -120) & (data.Longitude > -130)]
data = data[(data.Latitude > 37) & (data.Latitude < 40)]
good_data = createDataMatrix(data)
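A quick sanity check (sketch) that the filtering above worked; the cleaned coordinates should fall inside the San Francisco bounding box:
print good_data[:, 1].min(), good_data[:, 1].max()   # latitude, expected in (37, 40)
print good_data[:, 2].min(), good_data[:, 2].max()   # longitude, expected in (-130, -120)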
In [10]:
len(good_data)
Out[10]:
In [11]:
'''
Count the crimes in each (time, x, y) grid cell. If logSpace is True, returns log counts.
'''
def createBuckets(n_buckets = 15, logSpace = True):
data_b = bucket(good_data, [1, 2], n_buckets)
n_time = int(data_b[np.argmax(data_b[:,0])][0])
# buckets = np.zeros((n_time, n_buckets, n_buckets))
buckets2 = np.zeros((n_buckets * n_buckets * n_time, 4))
# divide the data up by year and month
for i in xrange(n_time):
for j in xrange(n_buckets):
for k in xrange(n_buckets):
count = data_b[ (data_b[:,0] == i+1) &
(data_b[:,1] == j) &
(data_b[:,2] == k)]
# buckets[i][j][k] = np.size(count,0)
buckets2[i*(n_buckets * n_buckets)+j*(n_buckets)+k][0] = i
buckets2[i*(n_buckets * n_buckets)+j*(n_buckets)+k][1] = j
buckets2[i*(n_buckets * n_buckets)+j*(n_buckets)+k][2] = k
buckets2[i*(n_buckets * n_buckets)+j*(n_buckets)+k][3] = np.size(count,0)
print np.shape(buckets2)
if logSpace:
buckets2[:,3] = np.log(np.add(sys.float_info.epsilon, buckets2[:,3]))
return buckets2
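A quick shape check (sketch; assumes good_data is already loaded): with an n x n grid the output has one row per (time, x, y) cell and four columns (t, x, y, count):
# Hypothetical smoke test of the bucketing.
demo_counts = createBuckets(n_buckets=3, logSpace=False)
print demo_counts.shape        # (n_time * 3 * 3, 4)
print demo_counts[:3]          # first few (t, x, y, count) rows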
In [12]:
'''
Our GP
other implementations:
- scikit-learn
- GPy
'''
# Compute the kernel matrix between the rows of x and y.
# Squared-exponential kernel: k(x, y) = horz * exp(-0.5 * (x - y)^T diag(l) (x - y)),
# so l holds per-dimension inverse squared length-scales and horz is the signal variance.
def ker_se(x, y, l, horz = 1.0):
n = np.shape(x)[0]
m = np.shape(y)[0]
t = np.reshape(x, (np.shape(x)[0], 1, np.shape(x)[1]))
s = np.reshape(y, (1, np.shape(y)[0], np.shape(y)[1]))
# tile across columns
cols = np.tile(t, (1, m, 1))
# tile across rows
rows = np.tile(s, (n, 1, 1))
# get the differences and vectorize
diff_vec = np.reshape(cols - rows, (n*m, np.shape(t)[2]))
M = np.diag(l)
# use multiply and sum to calculate matrix product
s = np.multiply(-.5, np.sum(np.multiply(diff_vec, np.transpose(np.dot(M, np.transpose(diff_vec)))), axis=1))
se = np.reshape(np.multiply(horz, np.exp(s)), (n, m))
return se
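As a sanity check (a sketch, not part of the original pipeline), the vectorized kernel should agree with a naive double loop, since each entry is horz * exp(-0.5 * (x_i - y_j)^T diag(l) (x_i - y_j)):
# Hypothetical check of ker_se against an explicit pairwise loop.
xs = np.random.randn(4, 3)
ys = np.random.randn(5, 3)
l_demo = np.array([1.0, 0.5, 2.0])
K_fast = ker_se(xs, ys, l_demo, horz=1.5)
K_slow = np.zeros((4, 5))
for i in xrange(4):
    for j in xrange(5):
        d = xs[i] - ys[j]
        K_slow[i, j] = 1.5 * np.exp(-0.5 * np.dot(d, np.dot(np.diag(l_demo), d)))
print np.allclose(K_fast, K_slow)   # expect True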
In [13]:
'''
Gaussian process regression: fit on the training cells and optionally
return predictions, RMSE, and the log marginal likelihood.
'''
def GaussianProcess(train, train_t, test, test_t, l,
                    horz, sig_eps, predict=True, rmse=True, ker='se'):
    # Try to be memory efficient by deleting data after use!
    if ker == 'se':
        ker_fun = ker_se
    else:
        raise Exception("Kernel {} not supported!".format(ker))
    # K(X, X) + sig_eps * I, factored with a Cholesky decomposition
    ker1 = ker_fun(train, train, l, horz)
    L = np.linalg.cholesky(ker1 + np.multiply(sig_eps, np.identity(np.shape(ker1)[0])))
    alpha = np.linalg.solve(L.T, np.linalg.solve(L, train_t))
    # Only do this if we request the predictions or rmse
    ret = []
    if predict or rmse:
        ker2 = ker_fun(train, test, l, horz)
        preds = np.dot(np.transpose(ker2), alpha)
        del ker2
        ret.append(preds)
    # Only if we request the rmse (computed on normalized densities)
    if rmse:
        npreds = preds / float(preds.sum())
        ntest_t = test_t / float(test_t.sum())
        rmse_val = np.sqrt(np.sum(np.square(npreds - ntest_t))/np.shape(preds)[0])
        print rmse_val
        ret.append(rmse_val)
    # Calculate the log marginal likelihood
    likelihood = -.5 * np.dot(np.transpose(train_t), alpha) - np.sum(np.log(np.diagonal(L))) - np.shape(ker1)[0]/2.0 * np.log(2*np.pi)
    ret.append(likelihood)
    del alpha
    del L
    del ker1
    return tuple(ret)
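For reference, the computation above is the standard Cholesky-based GP regression recipe (Rasmussen and Williams, Algorithm 2.1), with sig_eps playing the role of the noise variance $\sigma_n^2$:
$$L = \operatorname{chol}\big(K(X,X) + \sigma_n^2 I\big), \qquad \alpha = L^{-\top}\!\big(L^{-1}\mathbf{y}\big), \qquad \bar{\mathbf{f}}_* = K(X_*, X)\,\alpha$$
$$\log p(\mathbf{y}\mid X) = -\tfrac{1}{2}\,\mathbf{y}^{\top}\alpha - \sum_i \log L_{ii} - \tfrac{n}{2}\log 2\pi$$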
In [14]:
columns = { 't' : 0, 'x' : 1, 'y' : 2, 'count' : 3}
def createHeatMap(X):
    '''
    Given a data set, creates a heatmap of it based on x,y coordinates.
    Ignores the temporal feature. You should subset the data before passing
    it into this function if you'd like a heatmap for a specific time period.
    '''
    # Grid-cell indices run from 0 to max, so the grid needs max + 1 cells per axis.
    n = X[:, columns['x']].astype(int).max() + 1
    m = X[:, columns['y']].astype(int).max() + 1
    heatmap = np.zeros((n, m))
    for i in xrange(n):
        for j in xrange(m):
            total = X[(X[:, columns['x']] == i) &
                      (X[:, columns['y']] == j), columns['count']].sum()
            if total > 0:
                heatmap[i, j] = total
    heatmap = heatmap / float(heatmap.sum())
    return heatmap
In [15]:
# Plot the GP's predictive distribution against the held-out data
def plotDistribution(predict, true, city, n, process='GP'):
minValue = min(len(predict), 100)
yPred = predict[-minValue:]
yTrue = true[-minValue:]
yPred = yPred / float(np.sum(yPred))
yTrue = yTrue / float(np.sum(yTrue))
plt.clf()
plt.plot(yPred, label="Predictions")
plt.plot(yTrue, label="Actual Data")
plt.title('Predictive Distribution for {}'.format(process))
plt.xlabel('Compressed Features')
plt.ylabel('Probability')
plt.legend()
plt.savefig('../figures/{}_results/{}_n={}_periods={}.png'.format(
city, process, n,12))
plt.close()
In [16]:
def plotHeatMaps(X_test, predict, city, n, process='GP'):
# Attach the predictions to the data
trueValues = np.copy(X_test)
predictedValues = np.copy(X_test)
predictedValues[:, columns['count']] = predict
# Now we want to plot the heatmaps for the predictions/actual data
# by time period
months = np.unique(X_test[:, columns['t']])
for month in months:
# Create the heatmaps
selected = (X_test[:, columns['t']] == month)
if selected.sum() > 0:
plt.clf()
m = createHeatMap(trueValues[selected, :])
if m.sum() > 0:
sns.heatmap(m)
plt.title('True Density Distribution in Month {}'.format(month))
plt.savefig('../figures/{}_results/{}_heatmap_true_n={}_t={}.png'.format(
city, process, n, month))
plt.close()
plt.clf()
m = createHeatMap(predictedValues[selected, :])
if m.sum() > 0:
sns.heatmap(m)
plt.title('Predicted Density Distribution in Month {}'.format(month))
plt.savefig('../figures/{}_results/{}_heatmap_pred_n={}_t={}.png'.format(
city, process, n, month))
plt.close()
In [17]:
log = False
%time data = createBuckets(n_buckets=5,logSpace=log) # default is 15, logSpace=True
# %time train, train_t, test, test_t = split(data, 0)
#l = np.ones(3)
#horz = 1.0
#sig_eps = 1.0
#%time predictions, rmse, likelihood = GaussianProcess(train, train_t, test, test_t, l, horz, sig_eps)
In [12]:
# Only do the below if logSpace was used when bucketing!
if log:
test_t = np.exp(test_t)
predictions = np.exp(predictions)
In [17]:
plotDistribution(predictions, test_t, 'sf', 5, process='GPSE')
In [14]:
X_test = np.zeros((test.shape[0], test.shape[1] + 1)).astype(int)
X_test[:,:-1] = test
X_test[:,-1] = test_t
In [16]:
plotHeatMaps(X_test, predictions, 'sf', 5, process='GPSE')
In [ ]:
'''
Easier method for calling our GP model! Kernel defaults to SE.
'''
def optimizeGaussianProcess(n, l1, l2, l3, horz, sig_eps,
                            log=False):
    # Bucketize the currently loaded data into an n x n grid.
    data = createBuckets(n, logSpace=log)
    # Split off the latest time periods as the test set.
    train, train_t, test, test_t = split(data, 0)
    # Request only the log marginal likelihood (no predictions / rmse),
    # so GaussianProcess returns a 1-tuple.
    l = np.array([l1, l2, l3])
    (likelihood,) = GaussianProcess(train, train_t, test, test_t,
                                    l, horz, sig_eps,
                                    predict=False, rmse=False)
    return likelihood
In [ ]:
# Collect likelihoods for different n values
testN = range(2,10) + range(10,20,5)
likelihoods = []
for n in testN:
likelihood = optimizeGaussianProcess(n, 1.0, 1.0, 1.0, 1.0, 1.0,
log=False)
likelihoods.append(likelihood)
In [ ]:
x = testN
y = likelihoods
line1 = plt.plot(x, y, label="Log Likelihood")
plt.title('GP Predictions for Boston')
plt.xlabel('Dimension of Grid')
plt.ylabel('Log Likelihood')
plt.legend()
plt.show()
In [18]:
'''
Smart GP
'''
import GPy as gp
In [ ]:
%time data = createBuckets(n_buckets=10, logSpace=log) # default is 15, logSpace=True
%time train, train_t, test, test_t = split(data, 0)
In [20]:
train_t = train_t.reshape((train_t.shape[0],1))
In [21]:
kern = gp.kern.RBF(input_dim=3, variance=1., lengthscale=1.)
In [22]:
%time m = gp.models.GPRegression(train, train_t, kern)
In [ ]:
from IPython.display import display
display(m)
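A possible next step (sketch; m.optimize and m.predict are standard GPy calls, but the arguments here are illustrative):
# Fit the RBF hyperparameters by maximizing the marginal likelihood,
# then predict the held-out cells from the split above.
m.optimize(messages=True)
display(m)
mean_pred, var_pred = m.predict(test)
print np.shape(mean_pred), np.shape(var_pred)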