In [ ]:
#% matplotlib inline
import os, sys, time, pickle
import math, random, itertools
import pandas as pd
import numpy as np
import heapq as hq
from scipy.optimize import minimize
from scipy.special import logsumexp  # moved from scipy.misc in SciPy >= 1.0
#import matplotlib.pyplot as plt
#import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from joblib import Parallel, delayed
import cython
In [ ]:
sys.path.append('src/')
In [ ]:
from shared import TrajData, evaluate, do_evaluation
from ssvm import calc_node_features, calc_edge_features
In [ ]:
random.seed(1234567890)
np.random.seed(1234567890)
In [ ]:
dat_ix = 0
data_dir = 'data/data-new'
In [ ]:
dat_obj = TrajData(dat_ix, data_dir=data_dir)
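A quick peek at the loaded data; `trajid_set_all` and `TRAJID_GROUP_DICT` are the attributes used throughout this notebook:
In [ ]:
print('#trajectories:', len(dat_obj.trajid_set_all))
print('#queries:', len(dat_obj.TRAJID_GROUP_DICT))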
Hyperparameters.
In [ ]:
N_JOBS = 6 # number of parallel jobs
ABS_SCALER = False # feature scaling: True uses MaxAbsScaler, False uses MinMaxScaler (StandardScaler is a further alternative)
C_SET = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000] # regularisation parameter
MC_PORTION = 0.1 # portion of the data sampled in each Monte-Carlo cross-validation iteration
MC_NITER = 5 # number of iterations for Monte-Carlo cross-validation
Cost function and its gradient for the MEMM with a first-order Markov chain.
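For reference, the cell below implements the L2-regularised negative conditional log-likelihood (and its gradient) of the observed transitions:

$$J(w) = \frac{1}{2}\|w\|^2 + \frac{C}{N}\sum_{i=1}^{N}\sum_{j=2}^{L_i}\left[\log\sum_{p=1}^{M}\exp\big(w^\top\phi(y^{(i)}_{j-1},\,p)\big) - w^\top\phi\big(y^{(i)}_{j-1},\,y^{(i)}_j\big)\right],$$

where $\phi(p, p')$ concatenates the node features of $p$, the edge features of $(p, p')$ and the node features of $p'$.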
In [ ]:
%load_ext Cython
In [ ]:
%%cython
import numpy as np
from scipy.special import logsumexp
cimport numpy as np
cpdef MEMM_1MC_obj(w, X_node_all, X_edge, list Y, float C, long M):
"""
w - parameter vector
X_node_all - feature matrix of POIs for all training examples, (N x M) x n_node_features
X_edge - transition feature matrix of POIs, M x M x n_edge_features
Y - labels/trajectories for all training examples
C - regularisation constant
M - total number of POIs
return the cost and the gradient of cost
"""
#print('entering MEMM_obj')
assert(C > 0)
assert(M > 0)
cdef long N, D, i, j, pj, pk, pl
N = int(np.shape(X_node_all)[0]/M)
D = np.shape(X_node_all)[1] * 2 + np.shape(X_edge)[2]
assert(D == np.shape(w)[0])
assert(N == len(Y))
    cdef double cost, costi, denominator
    cost = 0.0
    grad = np.zeros(D, dtype=np.float64)
for i in range(N):
costi = 0.0
        gradi = np.zeros(D, dtype=np.float64)
for j in range(1, np.shape(Y[i])[0]):
pj = Y[i][j-1] # index of feature vector for POI p_{j-1}
pk = Y[i][j]
phi_j = np.hstack([X_node_all[i*M + pj], X_edge[pj, pk], X_node_all[i*M + pk]])
costi -= np.dot(w, phi_j)
gradi = gradi - phi_j
            cost_values = np.zeros(M, dtype=np.float64)
            denominator = 0.0
            numerator = np.zeros(D, dtype=np.float64)
for pl in range(M):
phi_l = np.hstack([X_node_all[i*M + pj], X_edge[pj, pl], X_node_all[i*M + pl]])
term = np.dot(w, phi_l)
cost_values[pl] = term
expterm = np.exp(term)
numerator = numerator + phi_l * expterm
                denominator += expterm
costi += logsumexp(cost_values)
            gradi = gradi + numerator / denominator
cost += costi / N
grad = grad + gradi / N
cost = 0.5 * np.dot(w, w) + C * cost
grad = w + C * grad
#print('exit MEMM_obj')
return (cost, grad)
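A finite-difference gradient check for `MEMM_1MC_obj` is a useful sanity check; below is a minimal sketch on tiny random inputs (all `_chk` names are made up for illustration):
In [ ]:
from scipy.optimize import check_grad

# hypothetical tiny problem: 4 POIs, 2 trajectories, 2 node features, 1 edge feature
rng = np.random.RandomState(0)
M_chk, N_chk, d_node, d_edge = 4, 2, 2, 1
X_node_chk = rng.rand(N_chk * M_chk, d_node)
X_edge_chk = rng.rand(M_chk, M_chk, d_edge)
Y_chk = [np.array([0, 1, 2]), np.array([1, 3, 0])]
w_chk = rng.rand(2 * d_node + d_edge)
err = check_grad(lambda w: MEMM_1MC_obj(w, X_node_chk, X_edge_chk, Y_chk, 1.0, M_chk)[0],
                 lambda w: MEMM_1MC_obj(w, X_node_chk, X_edge_chk, Y_chk, 1.0, M_chk)[1], w_chk)
print('gradient check error: %g' % err)  # should be tiny (finite-difference noise only)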
In [ ]:
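# a small toy instance: M0 POIs, walk length L0, start POI ps0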
M0, L0 = 5, 3
f_u = np.array([2, 1, 1, 3, 1], dtype=np.float64).reshape((M0, 1))
f_p = np.array([1,2,1,1,1, 1,1,1,1,3, 2,1,1,1,1, 1,1,3,1,1, 1,1,1,2,1], dtype=np.float64).reshape((M0, M0, 1))
w0 = np.array([1, 1, 2], dtype=np.float64)
ps0 = 1
In [ ]:
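# a second, slightly larger toy instance (overriding the previous cell)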
M0, L0 = 6, 4
f_u = np.array([2, 1, 1, 2, 1, 1], dtype=np.float64).reshape((M0, 1))
f_p = np.array([1,2,1,1,1,1, 1,1,1,1,3,2, 2,1,1,1,1,2, 1,1,3,1,1,1, 1,1,1,2,1,1, 2,1,1,2,1,1],
               dtype=np.float64).reshape((M0, M0, 1))
w0 = np.array([1, 2, 1], dtype=np.float64)
ps0 = 1
Inference for the MEMM with a first-order Markov chain by brute-force search (for sanity checking).
In [ ]:
def MEMM_inference_bruteForce(ps, L, M, w, X_node, X_edge):
assert(L > 1)
assert(M >= L)
assert(ps >= 0)
assert(ps < M)
Q = []
for x in itertools.permutations([p for p in range(M) if p != ps], L-1):
#print([ps] + list(x))
y = [ps] + list(x)
score = 0
for j in range(1, L):
ss = y[j-1]
tt = y[j]
score += np.dot(w, np.hstack([X_node[ss, :], X_edge[ss, tt], X_node[tt, :]]))
score -= logsumexp([np.dot(w, np.hstack([X_node[ss,:], X_edge[ss,pp], X_node[pp,:]])) for pp in range(M)])
priority = -score
hq.heappush(Q, (priority, y))
k = 20
while k > 0 and len(Q) > 0:
priority, pathwalk = hq.heappop(Q)
print(pathwalk, -priority)
k -= 1
In [ ]:
MEMM_inference_bruteForce(ps0, L0, M0, w0, f_u, f_p)
Inference for the MEMM with a first-order Markov chain using the Viterbi algorithm.
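In the cell below, $A[t, p]$ stores the best score of a walk from the start POI $p_s$ that reaches $p$ after $t+1$ transitions:

$$A[t, p] = \max_{p' \notin \{p,\, p_s\}}\left[A[t-1, p'] + w^\top\phi(p', p) - \log\sum_{q}\exp\big(w^\top\phi(p', q)\big)\right],$$

with $B[t, p]$ recording the maximising $p'$ for backtracking.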
In [ ]:
def MEMM_inference_viterbi(ps, L, M, w, X_node, X_edge, debug=False):
assert(L > 1)
assert(M >= L)
assert(ps >= 0)
assert(ps < M)
assert(w.shape[0] == X_node.shape[1]*2 + X_edge.shape[2])
    A = np.zeros((L-1, M), dtype=np.float64)      # scores matrix
    B = np.ones((L-1, M), dtype=np.int64) * (-1)  # backtracking pointers
for p in range(M): # ps--p
A[0, p] = np.dot(w, np.hstack([X_node[ps, :], X_edge[ps, p], X_node[p, :]])) - \
logsumexp([np.dot(w, np.hstack([X_node[ps,:], X_edge[ps,pp], X_node[pp,:]])) for pp in range(M)]) \
if ps != p else -np.inf
B[0, p] = ps
for t in range(0, L-2): # ps~~p1--p
for p in range(M):
scores = [A[t, p1] + \
np.dot(w, np.hstack([X_node[p1,:], X_edge[p1,p], X_node[p,:]])) - \
logsumexp([np.dot(w, np.hstack([X_node[p1,:], X_edge[p1,pp], X_node[pp,:]])) for pp in range(M)])
if p1 not in {p,ps} else -np.inf for p1 in range(M)]
maxix = np.argmax(scores)
A[t+1, p] = scores[maxix]
B[t+1, p] = maxix
if debug == True: print(A)
y_hat = [np.argmax(A[L-2, :])]
p, t = y_hat[-1], L-2
while t >= 0:
y_hat.append(B[t, p])
p, t = y_hat[-1], t-1
y_hat.reverse()
if debug == True: print(y_hat, np.max(A[L-2]))
return np.asarray(y_hat)
In [ ]:
MEMM_inference_viterbi(ps0, L0, M0, w0, f_u, f_p)#, debug=True)
Inference using the list Viterbi algorithm, which sequentially finds the (k+1)-th best path/walk given the 1st, 2nd, ..., k-th best paths/walks.
Reference papers:
- Sequentially finding the N-Best List in Hidden Markov Models, Dennis Nilsson and Jacob Goldberger, IJCAI 2001.
- A tutorial on hidden Markov models and selected applications in speech recognition, L.R. Rabiner, Proceedings of the IEEE, 1989.
The implementation is adapted from the above references.
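Both implementations below revolve around the quantity

$$F_{t,t+1}(p, p') = \alpha_t(p) + \beta_{t+1}(p') + w^\top\phi(p, p') - \log\sum_{q}\exp\big(w^\top\phi(p, q)\big),$$

the score of the best walk constrained to make the transition $p \to p'$ at step $t$, where $\alpha_t(p)$ is the best score of a prefix ending at $p$ and $\beta_{t+1}(p')$ the best score of a suffix starting at $p'$. The $(k+1)$-th best walk is obtained by partitioning the search space around the $k$ walks found so far and maximising these constrained scores.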
In [ ]:
class HeapItem0: # an item in heapq (min-heap)
def __init__(self, priority, task):
self.priority = priority
self.task = task
self.string = str(priority) + ': ' + str(task)
def __lt__(self, other):
return self.priority < other.priority
def __repr__(self):
return self.string
def __str__(self):
return self.string
In [ ]:
def MEMM_inference_listViterbi0(ps, L, M, w, X_node, X_edge, debug=False):
assert(L > 1)
assert(M >= L)
assert(ps >= 0)
assert(ps < M)
# forward-backward procedure: adapted from the Rabiner paper
    Alpha = np.zeros((L, M), dtype=np.float64)  # alpha_t(p_i)
    Beta = np.zeros((L, M), dtype=np.float64)   # beta_t(p_i)
lognorm0 = logsumexp([np.dot(w, np.hstack([X_node[ps,:], X_edge[ps,pp], X_node[pp,:]])) for pp in range(M)])
for pj in range(M):
Alpha[1,pj] = np.dot(w, np.hstack([X_node[ps, :], X_edge[ps, pj], X_node[pj, :]])) - lognorm0 \
if pj != ps else -np.inf
for t in range(2, L):
for pj in range(M): # pi-->(pj fixed)
Alpha[t, pj] = np.max([Alpha[t-1, pi] + \
np.dot(w, np.hstack([X_node[pi, :], X_edge[pi, pj], X_node[pj, :]])) - \
logsumexp([np.dot(w, np.hstack([X_node[pi,:], X_edge[pi,pp], X_node[pp,:]])) \
for pp in range(M)]) \
if pi not in {pj, ps} else -np.inf for pi in range(M)])
for t in range(L-1, 1, -1):
for pi in range(M): # (fixed pi)-->pj
lognorm = logsumexp([np.dot(w, np.hstack([X_node[pi,:], X_edge[pi,pp], X_node[pp,:]])) for pp in range(M)])
Beta[t-1, pi] = np.max([Beta[t, pj] + \
np.dot(w, np.hstack([X_node[pi, :], X_edge[pi, pj], X_node[pj, :]])) - lognorm \
if pj not in {pi, ps} else -np.inf for pj in range(M)])
Beta[0, ps] = np.max([Beta[1, pj] + np.dot(w, np.hstack([X_node[ps,:], X_edge[ps,pj], X_node[pj,:]])) - lognorm0 \
if pj != ps else -np.inf for pj in range(M)])
    Fp = np.zeros((L-1, M, M), dtype=np.float64)  # f_{t, t+1}(p, p')
for t in range(L-1):
for pi in range(M):
lognormi = logsumexp([np.dot(w, np.hstack([X_node[pi,:], X_edge[pi,pp], X_node[pp,:]])) for pp in range(M)])
for pj in range(M):
Fp[t, pi, pj] = Alpha[t, pi] + Beta[t+1, pj] + \
np.dot(w, np.hstack([X_node[pi, :], X_edge[pi, pj], X_node[pj, :]])) - lognormi \
if pj not in {pi, ps} else -np.inf
# identify the best path/walk: adapted from the IJCAI01 paper
    y_best = np.ones(L, dtype=np.int64) * (-1)
y_best[0] = ps
maxix = np.argmax(Fp[0, ps, :]) # the start POI is specified
y_best[1] = maxix
for t in range(2, L):
y_best[t] = np.argmax(Fp[t-1, y_best[t-1], :])
Q = [] # priority queue (min-heap)
maxIter = 1e5
with np.errstate(invalid='raise'): # deal with overflow
try: maxIter = np.power(M, L-1) - np.prod([M-kx for kx in range(1,L)]) + 1
except: maxIter = 1e5
if debug == True: maxIter = np.min([maxIter, 200]); print('#iterations:', maxIter)
else: maxIter = np.min([maxIter, 1e5])
# heap item for the best path/walk
#priority, partition_index, exclude_set = -np.max(Fu[L-1, :]), None, set() # -1 * score as priority
priority, partition_index, exclude_set = -np.max(Alpha[L-1, :]), None, set() # -1 * score as priority
hq.heappush(Q, HeapItem0(priority, (y_best, partition_index, exclude_set)))
if debug == True: histories = set()
k = 0; y_last = None
while len(Q) > 0 and k < maxIter:
hitem = hq.heappop(Q)
k_priority, (k_best, k_partition_index, k_exclude_set) = hitem.priority, hitem.task
k += 1; y_last = k_best
        if debug == True:
            histories.add(','.join([str(x) for x in k_best]))
            print(k_best, -k_priority)
else:
if len(set(k_best)) == L: return k_best
# identify the (k+1)-th best path/walk given the 1st, 2nd, ..., k-th best: adapted from the IJCAI01 paper
partition_index_start = 1
if k_partition_index is not None:
assert(k_partition_index > 0)
assert(k_partition_index < L)
partition_index_start = k_partition_index
for parix in range(partition_index_start, L):
new_exclude_set = set({k_best[parix]})
if parix == partition_index_start:
new_exclude_set = new_exclude_set | k_exclude_set
            new_best = np.ones(L, dtype=np.int64) * (-1)
for pk in range(parix):
new_best[pk] = k_best[pk]
candidate_points = [p for p in range(M) if p not in new_exclude_set]
if len(candidate_points) == 0: continue
candidate_maxix = np.argmax([Fp[parix-1, k_best[parix-1], p] for p in candidate_points])
new_best[parix] = candidate_points[candidate_maxix]
for pk in range(parix+1, L):
new_best[pk] = np.argmax([Fp[pk-1, new_best[pk-1], p] for p in range(M)])
new_priority = Fp[parix-1, k_best[parix-1], new_best[parix]]
if k_partition_index is not None:
new_priority += (-k_priority) - Fp[parix-1, k_best[parix-1], k_best[parix]]
new_priority *= -1.0 # NOTE: -np.inf - np.inf + np.inf = nan
#print(' '*3, 'push:', new_priority, new_best, parix, new_exclude_set)
hq.heappush(Q, HeapItem0(new_priority, (new_best, parix, new_exclude_set)))
if debug == True: print('#iterations: %d, #distinct_trajectories: %d' % (k, len(histories)))
sys.stderr.write('WARN: reaching max number of iterations, NO optimal solution found, return the last one.\n')
return y_last
In [ ]:
MEMM_inference_listViterbi0(ps0, L0, M0, w0, f_u, f_p, debug=True)
In [ ]:
%%cython
#%%cython -a
import numpy as np
import heapq as hq
import sys
from scipy.special import logsumexp
cimport numpy as np
"""
Inference using the List Viterbi algorithm, which sequentially find the (k+1)-th best path/walk given
the 1st, 2nd, ..., k-th best paths/walks.
Implementation is adapted from references:
- Sequentially finding the N-Best List in Hidden Markov Models, Dennis Nilsson and Jacob Goldberger, IJCAI 2001.
- A tutorial on hidden Markov models and selected applications in speech recognition, L.R. Rabiner,
Proceedings of the IEEE, 1989.
"""
cdef class HeapItem: # an item in heapq (min-heap)
cdef readonly float priority
cdef readonly object task, string
def __init__(self, float priority, task):
self.priority = priority
self.task = task
self.string = str(priority) + ': ' + str(task)
#def __lt__(self, other):
# return self.priority < other.priority
def __richcmp__(self, other, int op):
if op == 2: # ==
return self.priority == other.priority
elif op == 3: # !=
return self.priority != other.priority
elif op == 0: # <
return self.priority < other.priority
elif op == 1: # <=
return self.priority <= other.priority
elif op == 4: # >
return self.priority > other.priority
elif op == 5: # >=
return self.priority >= other.priority
else:
assert False
def __repr__(self):
return self.string
def __str__(self):
return self.string
cpdef MEMM_inference_listViterbi(int ps, int L, int M,
np.ndarray[dtype=np.float64_t, ndim=1] w,
np.ndarray[dtype=np.float64_t, ndim=2] X_node,
np.ndarray[dtype=np.float64_t, ndim=3] X_edge):
""" Inference using the list Viterbi algorithm """
assert(L > 1)
assert(M >= L)
assert(ps >= 0)
assert(ps < M)
cdef int pi, pj, t, k, pk, parix, partition_index, partition_index_start, k_partition_index
cdef long nIter, maxIter = long(1e6)
cdef float loss, priority
# forward-backward procedure: adapted from the Rabiner paper
    Alpha = np.zeros((L, M), dtype=np.float64)  # alpha_t(p_i)
    Beta = np.zeros((L, M), dtype=np.float64)   # beta_t(p_i)
lognorm0 = logsumexp([np.dot(w, np.hstack([X_node[ps, :], X_edge[ps, pp, :], X_node[pp, :]])) for pp in range(M)])
for pj in range(M):
Alpha[1, pj] = np.dot(w, np.hstack([X_node[ps, :], X_edge[ps, pj, :], X_node[pj, :]])) - lognorm0 \
if pj != ps else -np.inf
for t in range(2, L):
for pj in range(M): # pi-->(pj fixed)
Alpha[t, pj] = np.max([Alpha[t-1, pi] + \
np.dot(w, np.hstack([X_node[pi, :], X_edge[pi, pj, :], X_node[pj, :]])) - \
logsumexp([np.dot(w, np.hstack([X_node[pi, :], X_edge[pi, pp, :], X_node[pp, :]])) \
for pp in range(M)]) \
if pi not in [pj, ps] else -np.inf for pi in range(M)])
for t in range(L-1, 1, -1):
for pi in range(M): # (fixed pi)-->pj
lognorm = logsumexp([np.dot(w, np.hstack([X_node[pi,:], X_edge[pi,pp], X_node[pp,:]])) for pp in range(M)])
Beta[t-1, pi] = np.max([Beta[t, pj] + \
np.dot(w, np.hstack([X_node[pi, :], X_edge[pi, pj], X_node[pj, :]])) - lognorm \
if pj not in {pi, ps} else -np.inf for pj in range(M)])
Beta[0, ps] = np.max([Beta[1, pj] + np.dot(w, np.hstack([X_node[ps,:], X_edge[ps,pj], X_node[pj,:]])) - lognorm0 \
if pj != ps else -np.inf for pj in range(M)])
    Fp = np.zeros((L-1, M, M), dtype=np.float64)  # f_{t, t+1}(p, p')
for t in range(L-1):
for pi in range(M):
lognormi = logsumexp([np.dot(w, np.hstack([X_node[pi,:], X_edge[pi,pp], X_node[pp,:]])) for pp in range(M)])
for pj in range(M):
Fp[t, pi, pj] = Alpha[t, pi] + Beta[t+1, pj] + \
np.dot(w, np.hstack([X_node[pi, :], X_edge[pi, pj], X_node[pj, :]])) - lognormi \
if pj not in [pi, ps] else -np.inf
# identify the best path/walk: adapted from the IJCAI01 paper
    y_best = np.ones(L, dtype=np.int64) * (-1)
y_best[0] = ps
y_best[1] = np.argmax(Fp[0, ps, :]) # the start POI is specified
for t in range(2, L):
y_best[t] = np.argmax(Fp[t-1, y_best[t-1], :])
Q = [] # priority queue (min-heap)
with np.errstate(invalid='raise'): # deal with overflow
try: nIter = np.power(M, L-1) - np.prod([M-kx for kx in range(1,L)]) + 1
except: nIter = maxIter
nIter = np.min([nIter, maxIter])
# heap item for the best path/walk
priority = -np.max(Alpha[L-1, :])
partition_index = -1
exclude_set = set() # -1 * score as priority
hq.heappush(Q, HeapItem(priority, (y_best, partition_index, exclude_set)))
k = 0; y_last = None
while len(Q) > 0 and k < nIter:
hitem = hq.heappop(Q)
k_priority, (k_best, k_partition_index, k_exclude_set) = hitem.priority, hitem.task
k += 1; y_last = k_best
if len(set(k_best)) == L:
#print(k_priority)
return k_best
# identify the (k+1)-th best path/walk given the 1st, 2nd, ..., k-th best: adapted from the IJCAI01 paper
partition_index_start = 1
if k_partition_index > 0:
assert(k_partition_index < L)
partition_index_start = k_partition_index
for parix in range(partition_index_start, L):
new_exclude_set = set({k_best[parix]})
if parix == partition_index_start:
new_exclude_set = new_exclude_set | k_exclude_set
# new_best[:parix]
            new_best = np.ones(L, dtype=np.int64) * (-1)  # init with -1, as in the pure-Python version
new_best[:parix] = k_best[:parix]
if len(set(new_best[:parix])) < parix: # if there's sub-tour(s) in new_best[:parix]
break
# new_best[parix]
candidate_points = [p for p in range(M) if p not in new_exclude_set]
if len(candidate_points) == 0: continue
candidate_maxix = np.argmax([Fp[parix-1, k_best[parix-1], p] for p in candidate_points])
new_best[parix] = candidate_points[candidate_maxix]
# new_best[parix+1:]
for pk in range(parix+1, L):
new_best[pk] = np.argmax([Fp[pk-1, new_best[pk-1], p] for p in range(M)])
new_priority = Fp[parix-1, k_best[parix-1], new_best[parix]]
if k_partition_index > 0:
new_priority += (-k_priority) - Fp[parix-1, k_best[parix-1], k_best[parix]]
new_priority *= -1.0 # NOTE: -np.inf - np.inf + np.inf = nan
hq.heappush(Q, HeapItem(new_priority, (new_best, parix, new_exclude_set)))
if k >= nIter:
sys.stderr.write('WARN: reaching max number of iterations, NO optimal solution found, return the last one.\n')
if len(Q) == 0:
sys.stderr.write('WARN: empty queue, return the last one\n')
return y_last
In [ ]:
MEMM_inference_listViterbi(ps0, L0, M0, w0, f_u, f_p)
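As a quick sanity check, the pure-Python and Cython list Viterbi implementations should return the same walk on the toy instance:
In [ ]:
y_py = MEMM_inference_listViterbi0(ps0, L0, M0, w0, f_u, f_p)
y_cy = MEMM_inference_listViterbi(ps0, L0, M0, w0, f_u, f_p)
print(y_py, y_cy)
assert np.all(np.asarray(y_py) == np.asarray(y_cy))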
In [ ]:
class MEMM:
def __init__(self, dat_obj, inference_fun=MEMM_inference_listViterbi, C=1.0, poi_info=None, debug=False):
assert(C > 0)
self.C = C
self.inference_fun = inference_fun
self.dat_obj = dat_obj
self.debug = debug
self.trained = False
        self.poi_info = poi_info
if ABS_SCALER == True:
self.scaler = MaxAbsScaler(copy=False)
else:
self.scaler = MinMaxScaler(feature_range=(-1,1), copy=False)
#self.scaler = StandardScaler(copy=False)
def train(self, trajid_set_train):
if self.poi_info is None:
            self.poi_info = self.dat_obj.calc_poi_info(list(trajid_set_train))
        # build POI_ID <--> POI_INDEX mappings for the POIs used to train the MEMM,
        # i.e. only POIs appearing in trajectories of length >= 2 are included
poi_set = {p for tid in trajid_set_train for p in self.dat_obj.traj_dict[tid] \
if len(self.dat_obj.traj_dict[tid]) >= 2}
self.poi_ix = sorted(poi_set)
self.poi_id_dict, self.poi_id_rdict = dict(), dict()
for idx, poi in enumerate(self.poi_ix):
self.poi_id_dict[poi] = idx
self.poi_id_rdict[idx] = poi
# generate training data
train_traj_list = [self.dat_obj.traj_dict[x] for x in trajid_set_train if len(self.dat_obj.traj_dict[x]) >= 2]
node_features_list = Parallel(n_jobs=N_JOBS)\
(delayed(calc_node_features)\
(tr[0], len(tr), self.poi_ix, self.poi_info, self.dat_obj) for tr in train_traj_list)
edge_features = calc_edge_features(list(trajid_set_train), self.poi_ix, self.poi_info, self.dat_obj)
assert(len(train_traj_list) == len(node_features_list))
self.fdim = node_features_list[0].shape
# feature scaling
# should each example be flattened to one vector before scaling?
# It seems the performance is better if we don't flatten before scaling
X_node_all = np.vstack(node_features_list)
#X_node_all = X_node_all.reshape(len(node_features_list), -1) # flatten every example to a vector
X_node_all = self.scaler.fit_transform(X_node_all)
#X_node_all = X_node_all.reshape(-1, self.fdim[1])
self.X_edge = edge_features
y_train = [np.array([self.poi_id_dict[x] for x in tr]) for tr in train_traj_list]
if self.debug == True: print('C:', self.C)
w = np.random.rand(X_node_all.shape[1] * 2 + self.X_edge.shape[2]) # initial guess
opt_method = 'BFGS' # 'Newton-CG'
options = {'disp': True} if self.debug == True else dict()
#opt = minimize(MEMM_1MC_cost, w, args=(X_node_all, self.X_edge, y_train, self.C, len(self.poi_ix)), \
# method=opt_method, jac=MEMM_1MC_grad, options=options)
opt = minimize(MEMM_1MC_obj, w, args=(X_node_all, self.X_edge, y_train, self.C, len(self.poi_ix)), \
method=opt_method, jac=True, options=options)
if opt.success == True:
self.w = opt.x
self.trained = True
else:
sys.stderr.write('Optimisation failed, C=%f\n' % self.C)
self.trained = False
return self.trained
def predict(self, startPOI, nPOI):
assert(self.trained == True)
if startPOI not in self.poi_ix: return None
X_node_test = calc_node_features(startPOI, nPOI, self.poi_ix, self.poi_info, self.dat_obj)
# feature scaling
# should each example be flattened to one vector before scaling?
assert(X_node_test.shape == self.fdim)
#X_node_test = X_node_test.reshape(1, -1) # flatten test example to a vector
X_node_test = self.scaler.transform(X_node_test)
#X_node_test = X_node_test.reshape(self.fdim)
y_hat = self.inference_fun(self.poi_id_dict[startPOI], nPOI, len(self.poi_ix), self.w, X_node_test, self.X_edge)
return np.array([self.poi_id_rdict[x] for x in y_hat])
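A minimal usage sketch of the MEMM class, mirroring the training/prediction calls in the cross-validation loop below (`key_demo` is an arbitrary query picked for illustration):
In [ ]:
key_demo = sorted(dat_obj.TRAJID_GROUP_DICT.keys())[0]  # query: (start POI, trajectory length)
trajid_demo = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[key_demo]
memm_demo = MEMM(dat_obj, C=1.0, poi_info=dat_obj.calc_poi_info(list(trajid_demo)))
if memm_demo.train(trajid_demo) == True:
    print(memm_demo.predict(*key_demo))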
Nested cross-validation with Monte-Carlo cross-validation as the inner loop: for each held-out query (outer loop), the regularisation constant C is tuned by Monte-Carlo cross-validation on the remaining queries (inner loop); the model is then retrained on all remaining data with the best C and evaluated on the held-out query.
In [ ]:
recdict_memm1m = dict()
cnt = 1
keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())
# outer loop to evaluate the test performance by cross validation
for i in range(len(keys)):
ps, L = keys[i]
best_C = 1
#best_F1 = 0; best_pF1 = 0
best_Tau = 0
keys_cv = keys[:i] + keys[i+1:]
# use all training+validation set to compute POI features,
# make sure features do NOT change for training and validation
trajid_set_i = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
poi_info_i = dat_obj.calc_poi_info(list(trajid_set_i))
poi_set_i = {p for tid in trajid_set_i for p in dat_obj.traj_dict[tid] if len(dat_obj.traj_dict[tid]) >= 2}
if ps not in poi_set_i:
sys.stderr.write('start POI of query %s does not exist in training set.\n' % str(keys[i]))
continue
# tune regularisation constant C
for logit_C in C_SET:
print('\n--------------- try_C: %f ---------------\n' % logit_C); sys.stdout.flush()
F1_memm = []; pF1_memm = []; Tau_memm = []
# inner loop to evaluate the performance of a model with a specified C by Monte-Carlo cross validation
for j in range(MC_NITER):
poi_list = []
            while True: # make sure the start POIs of queries in the test split also appear in the training set
rand_ix = np.arange(len(keys_cv)); np.random.shuffle(rand_ix)
test_ix = rand_ix[:int(MC_PORTION*len(rand_ix))]
assert(len(test_ix) > 0)
trajid_set_train = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
                for jx in test_ix:
                    trajid_set_train = trajid_set_train - dat_obj.TRAJID_GROUP_DICT[keys_cv[jx]]
poi_set = {poi for tid in trajid_set_train for poi in dat_obj.traj_dict[tid]}
good_partition = True
                for jx in test_ix:
                    if keys_cv[jx][0] not in poi_set: good_partition = False; break
if good_partition == True:
poi_list = sorted(poi_set)
break
# train
memm = MEMM(dat_obj, C=logit_C, poi_info=poi_info_i.loc[poi_list].copy(), debug=True)
if memm.train(trajid_set_train) == True:
                for jx in test_ix: # test
                    ps_cv, L_cv = keys_cv[jx]
y_hat = memm.predict(ps_cv, L_cv)
if y_hat is not None:
F1, pF1, tau = evaluate(dat_obj, keys_cv[j], y_hat)
F1_memm.append(F1); pF1_memm.append(pF1); Tau_memm.append(tau)
            else: # training failed
                for jx in test_ix:
                    F1_memm.append(0); pF1_memm.append(0); Tau_memm.append(0)
#mean_F1 = np.mean(F1_memm); mean_pF1 = np.mean(pF1_memm)
mean_Tau = np.mean(Tau_memm)
print('mean_Tau: %.3f' % mean_Tau)
if mean_Tau > best_Tau:
best_Tau = mean_Tau
best_C = logit_C
print('\n--------------- %d/%d, Query: (%d, %d), Best_C: %f ---------------\n' % (cnt, len(keys), ps, L, best_C))
sys.stdout.flush()
# train model using all examples in training+validation set and measure performance on test set
memm = MEMM(dat_obj, C=best_C, poi_info=poi_info_i, debug=True)
if memm.train(trajid_set_i) == True:
y_hat = memm.predict(ps, L)
print(y_hat)
if y_hat is not None:
recdict_memm1m[(ps, L)] = {'PRED': y_hat, 'W': memm.w, 'C': memm.C}
cnt += 1; #print_progress(cnt, len(keys)); sys.stdout.flush()
In [ ]:
fmemm = os.path.join(dat_obj.data_dir, 'memm-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
In [ ]:
pickle.dump(recdict_memm1m, open(fmemm, 'wb'))
In [ ]:
len(recdict_memm1m)
In [ ]:
do_evaluation(dat_obj, recdict_memm1m)