In [1]:
#%matplotlib inline
import os, sys, time, pickle, tempfile
import math, random, itertools
import pandas as pd
import numpy as np
from scipy.linalg import kron
from scipy.optimize import minimize
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from joblib import Parallel, delayed
import cython
import pulp
In [2]:
sys.path.append('src')
In [3]:
from shared import TrajData, evaluate, do_evaluation, DF_COLUMNS, LOG_SMALL, LOG_ZERO
from ssvm import calc_node_features  # needed by the PW_RANK class below (run_pwr)
from inference_lv import do_inference_list_viterbi as inf_LVA
In [4]:
derived_features = ['trajLen', 'sameCatStart', 'distStart', 'diffPopStart', 'diffNVisitStart',
'diffDurationStart', 'sameNeighbourhoodStart']
In [5]:
random.seed(1234567890)
np.random.seed(1234567890)
ranksvm_dir = '$HOME/work/ranksvm' # directory that contains rankSVM binaries: train, predict, svm-scale
In [6]:
dat_ix = 3
data_dir = 'data/data-new'
In [7]:
dat_obj = TrajData(dat_ix, data_dir=data_dir)
Hyperparameters.
In [8]:
N_JOBS = 6 # number of parallel jobs
USE_GUROBI = False # whether to use GUROBI as ILP solver
ABS_SCALER = False # feature scaling: True for MaxAbsScaler, False for MinMaxScaler (StandardScaler is disabled)
C_SET = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000, 3000] # regularisation parameter
ALPHA_SET = [0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99] # trade-off parameter
MC_PORTION = 0.1 # portion of data sampled by Monte-Carlo cross-validation
MC_NITER = 5 # number of iterations for Monte-Carlo cross-validation
Method switches.
In [9]:
run_rand = False
run_rand_topk = False
run_rank = False
run_rank_topk = False
run_logreg = False
run_linreg = False
run_pwr = False
run_tran = False
run_tran_topk = True
run_comb = False
In [10]:
if run_rand == True:
recdict_rand = dict()
cnt = 1
keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())
# outer loop to evaluate the test performance by cross validation
for i in range(len(keys)):
ps, L = keys[i]
# train model using all examples in training set and measure performance on test set
trajid_set_train = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
poi_set = {poi for tid in trajid_set_train for poi in dat_obj.traj_dict[tid]}
if ps not in poi_set: continue
        poi_list = sorted(poi_set - {ps})  # exclude the start POI so it is not recommended twice
np.random.shuffle(poi_list)
recdict_rand[(ps, L)] = {'PRED': [[ps] + poi_list[:L-1]]}
#print_progress(cnt, len(keys)); cnt += 1; sys.stdout.flush()
    #print()
In [11]:
if run_rand == True:
do_evaluation(dat_obj, recdict_rand)
In [12]:
if run_rand == True:
frand = os.path.join(data_dir, 'rand-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
pickle.dump(recdict_rand, open(frand, 'bw'))
In [13]:
if run_rand_topk == True:
recdict_rand = dict()
cnt = 1
keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())
# outer loop to evaluate the test performance by cross validation
for i in range(len(keys)):
ps, L = keys[i]
# train model using all examples in training set and measure performance on test set
trajid_set_train = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
poi_set = {poi for tid in trajid_set_train for poi in dat_obj.traj_dict[tid]}
if ps not in poi_set: continue
poi_list = sorted(poi_set - set([ps]))
results = []
for k in range(10):
np.random.shuffle(poi_list)
results.append([ps] + poi_list[:L-1])
recdict_rand[(ps, L)] = {'PRED': results}
#print_progress(cnt, len(keys)); cnt += 1; sys.stdout.flush()
#print()
In [14]:
if run_rand_topk == True:
do_evaluation(dat_obj, recdict_rand)
frand = os.path.join(data_dir, 'rand10-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
pickle.dump(recdict_rand, open(frand, 'bw'))
Query-based Leave-one-out Evaluation
Group trajectories according to queries; hold out each group in turn for testing and use all other groups for training.
The performance of a prediction $\bf \bar{y}$ given a query $\bf x$ is the maximum (or average) similarity score between $\bf \bar{y}$ and every trajectory $\bf y$ in the test set (which conforms to $\bf x$).
Training data are generated as follows:
- each tuple $(\text{startPOI}, \text{#POI})$ forms a query (in IR terminology);
- the label of each POI w.r.t. a query is the number of times that POI occurs in the trajectories grouped under the query, excluding its occurrences as $\text{startPOI}$; all POIs absent w.r.t. that query get a label 0. For example, if trajectories $[1,3,5]$ and $[1,5,2]$ conform to a query, then POI 5 gets label 2, POIs 3 and 2 get label 1, and every other POI gets label 0;
- the dimension of the training data matrix is #(qid, poi) by #feature.
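For reference, a minimal sketch of the points-F1 similarity between a prediction and one ground-truth trajectory (assuming the standard definition; the actual evaluate helper imported from src/shared.py also reports pairs-F1 and Kendall's tau):

def points_f1(y_pred, y_true):
    """F1 score on the sets of POIs visited by two trajectories."""
    intersect = set(y_pred) & set(y_true)
    if len(intersect) == 0: return 0.0
    precision = len(intersect) / len(y_pred)
    recall = len(intersect) / len(y_true)
    return 2 * precision * recall / (precision + recall)

points_f1([1, 2, 3], [1, 3, 5])  # 2/3: two common POIs out of three on each side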
In [15]:
def gen_train_subdf(poi_id, poi_info, query_id_set, query_id_rdict, dat_obj):
assert(type(dat_obj) == TrajData)
columns = DF_COLUMNS
poi_distmat = dat_obj.POI_DISTMAT
poi_clusters = dat_obj.POI_CLUSTERS
cats = dat_obj.POI_CAT_LIST
clusters = dat_obj.POI_CLUSTER_LIST
df_ = pd.DataFrame(index=np.arange(len(query_id_set)), columns=columns)
pop, nvisit = poi_info.loc[poi_id, 'popularity'], poi_info.loc[poi_id, 'nVisit']
cat, cluster = poi_info.loc[poi_id, 'poiCat'], poi_clusters.loc[poi_id, 'clusterID']
duration = poi_info.loc[poi_id, 'avgDuration']
for j in range(len(query_id_set)):
qid = query_id_set[j]
assert(qid in query_id_rdict) # qid --> (start, end, length)
(p0, trajLen) = query_id_rdict[qid]
idx = df_.index[j]
df_.loc[idx, 'poiID'] = poi_id
df_.loc[idx, 'queryID'] = qid
        df_.at[idx, 'category'] = tuple((cat == np.array(cats)).astype(np.int) * 2 - 1)
        df_.at[idx, 'neighbourhood'] = tuple((cluster == np.array(clusters)).astype(np.int) * 2 - 1)
df_.loc[idx, 'popularity'] = LOG_SMALL if pop < 1 else np.log10(pop)
df_.loc[idx, 'nVisit'] = LOG_SMALL if nvisit < 1 else np.log10(nvisit)
df_.loc[idx, 'avgDuration'] = LOG_SMALL if duration < 1 else np.log10(duration)
df_.loc[idx, 'trajLen'] = trajLen
df_.loc[idx, 'sameCatStart'] = 1 if cat == poi_info.loc[p0, 'poiCat'] else -1
df_.loc[idx, 'distStart'] = poi_distmat.loc[poi_id, p0]
df_.loc[idx, 'diffPopStart'] = pop - poi_info.loc[p0, 'popularity']
df_.loc[idx, 'diffNVisitStart'] = nvisit - poi_info.loc[p0, 'nVisit']
df_.loc[idx, 'diffDurationStart'] = duration - poi_info.loc[p0, 'avgDuration']
df_.loc[idx, 'sameNeighbourhoodStart'] = 1 if cluster == poi_clusters.loc[p0, 'clusterID'] else -1
return df_
In [16]:
def gen_train_df(trajid_list, poi_info, dat_obj, n_jobs=-1):
assert(type(dat_obj) == TrajData)
columns = DF_COLUMNS
query_id_dict = dat_obj.QUERY_ID_DICT
train_trajs = [dat_obj.traj_dict[x] for x in trajid_list if len(dat_obj.traj_dict[x]) >= 2]
qid_set = sorted(set([query_id_dict[(t[0], len(t))] for t in train_trajs]))
poi_set = {poi for tr in train_trajs for poi in tr}
#M = len(poi_set)
query_id_rdict = dict()
for k, v in query_id_dict.items():
query_id_rdict[v] = k # qid --> (start, length)
train_df_list = Parallel(n_jobs=n_jobs)\
(delayed(gen_train_subdf)(poi, poi_info, qid_set, query_id_rdict, dat_obj) for poi in poi_set)
assert(len(train_df_list) > 0)
    df_ = pd.concat(train_df_list, ignore_index=True)  # concat once instead of the removed DataFrame.append
# set label
df_.set_index(['queryID', 'poiID'], inplace=True)
df_['label'] = 0
for t in train_trajs:
qid = query_id_dict[(t[0], len(t))]
for poi in t[1:]: # do NOT count if the POI is startPOI/endPOI
df_.loc[(qid, poi), 'label'] += 1
#for j in range(1, len(t)):
#poi = t[j]; df_.loc[(qid, poi), 'label'] += M - j # add the rank of this POI
df_.reset_index(inplace=True)
return df_
In [17]:
def gen_train_df2(trajid_list, poi_info, dat_obj, n_jobs=-1):
assert(type(dat_obj) == TrajData)
columns = DF_COLUMNS
poi_distmat = dat_obj.POI_DISTMAT
poi_clusters = dat_obj.POI_CLUSTERS
cats = dat_obj.POI_CAT_LIST
clusters = dat_obj.POI_CLUSTER_LIST
train_trajs = [dat_obj.traj_dict[x] for x in trajid_list if len(dat_obj.traj_dict[x]) >= 2]
poi_set = {poi for tr in train_trajs for poi in tr}
M = len(poi_set)
df_ = pd.DataFrame(columns=columns)
idx = 0
for q in sorted(dat_obj.TRAJID_GROUP_DICT.keys()):
for tid in sorted(dat_obj.TRAJID_GROUP_DICT[q]):
if tid not in trajid_list: continue
tr = dat_obj.traj_dict[tid]
trajLen = len(tr)
p0 = tr[0]
assert(len(tr) >= 2)
for j in range(len(tr)):
poi_id = tr[j]
pop, nvisit = poi_info.loc[poi_id, 'popularity'], poi_info.loc[poi_id, 'nVisit']
cat, cluster = poi_info.loc[poi_id, 'poiCat'], poi_clusters.loc[poi_id, 'clusterID']
duration = poi_info.loc[poi_id, 'avgDuration']
df_.loc[idx, 'poiID'] = poi_id
df_.loc[idx, 'queryID'] = dat_obj.QUERY_ID_DICT[q]
                df_.at[idx, 'category'] = tuple((cat == np.array(cats)).astype(np.int) * 2 - 1)
                df_.at[idx, 'neighbourhood'] = tuple((cluster == np.array(clusters)).astype(np.int) * 2 - 1)
df_.loc[idx, 'popularity'] = LOG_SMALL if pop < 1 else np.log10(pop)
df_.loc[idx, 'nVisit'] = LOG_SMALL if nvisit < 1 else np.log10(nvisit)
df_.loc[idx, 'avgDuration'] = LOG_SMALL if duration < 1 else np.log10(duration)
df_.loc[idx, 'trajLen'] = trajLen
df_.loc[idx, 'sameCatStart'] = 1 if cat == poi_info.loc[p0, 'poiCat'] else -1
df_.loc[idx, 'distStart'] = poi_distmat.loc[poi_id, p0]
df_.loc[idx, 'diffPopStart'] = pop - poi_info.loc[p0, 'popularity']
df_.loc[idx, 'diffNVisitStart'] = nvisit - poi_info.loc[p0, 'nVisit']
df_.loc[idx, 'diffDurationStart'] = duration - poi_info.loc[p0, 'avgDuration']
df_.loc[idx, 'sameNeighbourhoodStart'] = 1 if cluster == poi_clusters.loc[p0, 'clusterID'] else -1
df_.loc[idx, 'label'] = M - j
idx += 1
return df_
Test data are generated in the same way as training data, except that the labels of test data (which are unknown) can be arbitrary values, as suggested in the libsvm FAQ.
The accuracy reported by the svm-predict command is therefore meaningless, as it is calculated from these arbitrary labels.
The dimension of the test data matrix is #poi by #feature for one specific query, i.e., tuple $(\text{startPOI}, \text{#POI})$.
In [18]:
def gen_test_df(startPOI, nPOI, poi_info, dat_obj):
assert(type(dat_obj) == TrajData)
columns = DF_COLUMNS
poi_distmat = dat_obj.POI_DISTMAT
poi_clusters = dat_obj.POI_CLUSTERS
cats = dat_obj.POI_CAT_LIST
clusters = dat_obj.POI_CLUSTER_LIST
query_id_dict = dat_obj.QUERY_ID_DICT
key = (p0, trajLen) = (startPOI, nPOI)
assert(key in query_id_dict)
assert(p0 in poi_info.index)
df_ = pd.DataFrame(index=np.arange(poi_info.shape[0]), columns=columns)
poi_list = sorted(poi_info.index)
qid = query_id_dict[key]
df_['queryID'] = qid
df_['label'] = np.random.rand(df_.shape[0]) # label for test data is arbitrary according to libsvm FAQ
for i in range(df_.index.shape[0]):
poi = poi_list[i]
lon, lat = poi_info.loc[poi, 'poiLon'], poi_info.loc[poi, 'poiLat']
pop, nvisit = poi_info.loc[poi, 'popularity'], poi_info.loc[poi, 'nVisit']
cat, cluster = poi_info.loc[poi, 'poiCat'], poi_clusters.loc[poi, 'clusterID']
duration = poi_info.loc[poi, 'avgDuration']
idx = df_.index[i]
df_.loc[idx, 'poiID'] = poi
        df_.at[idx, 'category'] = tuple((cat == np.array(cats)).astype(np.int) * 2 - 1)
        df_.at[idx, 'neighbourhood'] = tuple((cluster == np.array(clusters)).astype(np.int) * 2 - 1)
df_.loc[idx, 'popularity'] = LOG_SMALL if pop < 1 else np.log10(pop)
df_.loc[idx, 'nVisit'] = LOG_SMALL if nvisit < 1 else np.log10(nvisit)
df_.loc[idx, 'avgDuration'] = LOG_SMALL if duration < 1 else np.log10(duration)
df_.loc[idx, 'trajLen'] = trajLen
df_.loc[idx, 'sameCatStart'] = 1 if cat == poi_info.loc[p0, 'poiCat'] else -1
df_.loc[idx, 'distStart'] = poi_distmat.loc[poi, p0]
df_.loc[idx, 'diffPopStart'] = pop - poi_info.loc[p0, 'popularity']
df_.loc[idx, 'diffNVisitStart'] = nvisit - poi_info.loc[p0, 'nVisit']
df_.loc[idx, 'diffDurationStart'] = duration - poi_info.loc[p0, 'avgDuration']
df_.loc[idx, 'sameNeighbourhoodStart'] = 1 if cluster == poi_clusters.loc[p0, 'clusterID'] else -1
return df_
Generate a string for a training/test data frame.
In [19]:
def gen_data_str(df_, df_columns=DF_COLUMNS):
for col in df_columns:
assert(col in df_.columns)
lines = []
for idx in df_.index:
slist = [str(df_.loc[idx, 'label'])]
slist.append(' qid:')
slist.append(str(int(df_.loc[idx, 'queryID'])))
fid = 1
for j in range(3, len(df_columns)):
            values_ = df_.at[idx, df_columns[j]]  # .at replaces the deprecated get_value
values_ = values_ if isinstance(values_, tuple) else [values_]
for v in values_:
slist.append(' ')
slist.append(str(fid)); fid += 1
slist.append(':')
slist.append(str(v))
slist.append('\n')
lines.append(''.join(slist))
return ''.join(lines)
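Each line produced by gen_data_str follows the SVM-light ranking format expected by rankSVM, i.e., the label, the query id, then feature_id:value pairs with feature ids numbered consecutively from 1 (tuple-valued columns such as category expand into one feature per element). For example, a line may look like (hypothetical values):
2 qid:7 1:1 2:-1 3:0.53 4:1.2 ...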
Here the rankSVM implementation can be either liblinear-ranksvm or libsvm-ranksvm; please read README.ranksvm in the zip file for installation instructions.
Use the softmax function to convert ranking scores into a probability distribution.
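Concretely, $\mathrm{softmax}(x)_i = e^{x_i - \max_j x_j} \big/ \sum_k e^{x_k - \max_j x_j}$; subtracting the maximum leaves the result unchanged but avoids numerical overflow.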
In [20]:
def softmax(x):
x1 = x.copy()
x1 -= np.max(x1) # numerically more stable, REF: http://cs231n.github.io/linear-classify/#softmax
expx = np.exp(x1)
return expx / np.sum(expx, axis=0) # column-wise sum
Below is a Python wrapper of the svm-train (or train) and svm-predict (or predict) commands of rankSVM, with ranking probabilities $P(p_i \lvert (p_s, p_e, len))$ computed using the softmax function.
In [21]:
# python wrapper of rankSVM
class RankSVM:
def __init__(self, bin_dir, useLinear=True, debug=False):
dir_ = !echo $bin_dir # deal with environmental variables in path
assert(os.path.exists(dir_[0]))
self.bin_dir = dir_[0]
self.bin_train = 'svm-train'
self.bin_predict = 'svm-predict'
if useLinear:
self.bin_train = 'train'
self.bin_predict = 'predict'
assert(isinstance(debug, bool))
self.debug = debug
# create named tmp files for model and feature scaling parameters
self.fmodel = None
self.fscale = None
with tempfile.NamedTemporaryFile(delete=False) as fd:
self.fmodel = fd.name
with tempfile.NamedTemporaryFile(delete=False) as fd:
self.fscale = fd.name
if self.debug:
print('model file:', self.fmodel)
print('feature scaling parameter file:', self.fscale)
def __del__(self):
# remove tmp files
if self.debug == False:
if self.fmodel is not None and os.path.exists(self.fmodel):
os.unlink(self.fmodel)
if self.fscale is not None and os.path.exists(self.fscale):
os.unlink(self.fscale)
def train(self, train_df, cost=1):
# cost is parameter C in SVM
# write train data to file
ftrain = None
with tempfile.NamedTemporaryFile(mode='w+t', delete=False) as fd:
ftrain = fd.name
datastr = gen_data_str(train_df)
fd.write(datastr)
# feature scaling
ftrain_scaled = None
with tempfile.NamedTemporaryFile(mode='w+t', delete=False) as fd:
ftrain_scaled = fd.name
result = !$self.bin_dir/svm-scale -s $self.fscale $ftrain > $ftrain_scaled
if self.debug:
print('cost:', cost)
print('train data file:', ftrain)
print('feature scaled train data file:', ftrain_scaled)
# train rank svm and generate model file, if the model file exists, rewrite it
result = !$self.bin_dir/$self.bin_train -c $cost $ftrain_scaled $self.fmodel
if self.debug:
print('Training finished.')
for i in range(len(result)): print(result[i])
# load model parameters
w = []
header = 5
with open(self.fmodel, 'r') as f:
for j in range(header): _ = f.readline()
for line in f: w.append(float(line.strip()))
self.w = np.array(w)
# remove train data file
if self.debug == False:
os.unlink(ftrain)
os.unlink(ftrain_scaled)
def predict(self, test_df, probability=False):
# predict ranking scores for the given feature matrix
if self.fmodel is None or not os.path.exists(self.fmodel):
print('Model should be trained before prediction')
return
# write test data to file
ftest = None
with tempfile.NamedTemporaryFile(mode='w+t', delete=False) as fd:
ftest = fd.name
datastr = gen_data_str(test_df)
fd.write(datastr)
# feature scaling
ftest_scaled = None
with tempfile.NamedTemporaryFile(delete=False) as fd:
ftest_scaled = fd.name
result = !$self.bin_dir/svm-scale -r $self.fscale $ftest > $ftest_scaled
# generate prediction file
fpredict = None
with tempfile.NamedTemporaryFile(delete=False) as fd:
fpredict = fd.name
if self.debug:
print('test data file:', ftest)
print('feature scaled test data file:', ftest_scaled)
print('predict result file:', fpredict)
# predict using trained model and write prediction to file
result = !$self.bin_dir/$self.bin_predict $ftest_scaled $self.fmodel $fpredict
if self.debug:
print('Predict result: %-30s %s' % (result[0], result[1]))
# generate prediction DataFrame from prediction file
poi_rank_df = pd.read_csv(fpredict, header=None)
poi_rank_df.rename(columns={0:'rank'}, inplace=True)
poi_rank_df['poiID'] = test_df['poiID'].astype(np.int)
poi_rank_df.set_index('poiID', inplace=True)
if probability == True: poi_rank_df['probability'] = softmax(poi_rank_df['rank'])
# remove test file and prediction file
if self.debug == False:
os.unlink(ftest)
os.unlink(ftest_scaled)
os.unlink(fpredict)
return poi_rank_df
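A minimal usage sketch of the wrapper (assuming the rankSVM binaries under ranksvm_dir have been built, and train_df/test_df come from gen_train_df and gen_test_df above):

ranksvm = RankSVM(ranksvm_dir, useLinear=True, debug=True)
ranksvm.train(train_df, cost=10)
rank_df = ranksvm.predict(test_df, probability=True)  # POIs indexed by poiID, with 'rank' and 'probability' columns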
Nested cross-validation with Monte-Carlo cross-validation as inner loop.
In [22]:
if run_rank == True:
recdict_rank = dict()
recdict_rank_pop = dict()
cnt = 1
keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())
# outer loop to evaluate the test performance by cross validation
for i in range(len(keys)):
ps, L = keys[i]
best_C = 1
#best_F1 = 0; best_pF1 = 0
best_Tau = 0
keys_cv = keys[:i] + keys[i+1:]
# use all training+validation set to compute POI features,
# make sure features do NOT change for training and validation
trajid_set_i = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
poi_info_i = dat_obj.calc_poi_info(list(trajid_set_i))
poi_set_i = {p for tid in trajid_set_i for p in dat_obj.traj_dict[tid] if len(dat_obj.traj_dict[tid]) >= 2}
if ps not in poi_set_i:
sys.stderr.write('start POI of query %s does not exist in training set.\n' % str(keys[i]))
continue
# tune regularisation constant C
for rank_C in C_SET:
print('\n--------------- try_C: %.2f ---------------\n' % rank_C); sys.stdout.flush()
F1_rank = []; pF1_rank = []; Tau_rank = []
# inner loop to evaluate the performance of a model with a specified C by Monte-Carlo cross validation
for j in range(MC_NITER):
poi_list = []
while True: # make sure the start POI in test set are also in training set
rand_ix = np.arange(len(keys_cv)); np.random.shuffle(rand_ix)
test_ix = rand_ix[:int(MC_PORTION*len(rand_ix))]
assert(len(test_ix) > 0)
trajid_set_train = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
for j in test_ix:
trajid_set_train = trajid_set_train - dat_obj.TRAJID_GROUP_DICT[keys_cv[j]]
poi_set = {poi for tid in trajid_set_train for poi in dat_obj.traj_dict[tid]}
good_partition = True
for j in test_ix:
if keys_cv[j][0] not in poi_set: good_partition = False; break
if good_partition == True:
poi_list = sorted(poi_set)
break
# train
train_df = gen_train_df(list(trajid_set_train), poi_info_i.loc[poi_list].copy(), dat_obj, n_jobs=N_JOBS)
ranksvm = RankSVM(ranksvm_dir, useLinear=True)#, debug=True)
ranksvm.train(train_df, cost=rank_C)
#sys.exit(0)
# test
for j in test_ix: # test
ps_cv, L_cv = keys_cv[j]
test_df = gen_test_df(ps_cv, L_cv, poi_info_i.loc[poi_list].copy(), dat_obj)
rank_df = ranksvm.predict(test_df)
rank_df.sort_values(by='rank', ascending=False, inplace=True)
y_hat = [ps_cv] + [p for p in rank_df.index.tolist() if p != ps_cv][:L_cv-1]
F1, pF1, tau = evaluate(dat_obj, keys_cv[j], [y_hat])
F1_rank.append(F1); pF1_rank.append(pF1); Tau_rank.append(tau)
#mean_F1 = np.mean(F1_rank); mean_pF1 = np.mean(pF1_rank)
mean_Tau = np.mean(Tau_rank)
print('mean_Tau: %.3f' % mean_Tau)
if mean_Tau > best_Tau:
best_Tau = mean_Tau
best_C = rank_C
print('\n--------------- %d/%d, Query: (%d, %d), Best_C: %.2f ---------------\n' % (cnt,len(keys),ps,L,best_C))
sys.stdout.flush()
# train model using all examples in training+validation set and measure performance on test set
train_df = gen_train_df(list(trajid_set_i), poi_info_i.copy(), dat_obj, n_jobs=N_JOBS)
ranksvm = RankSVM(ranksvm_dir, useLinear=True)
ranksvm.train(train_df, cost=best_C)
test_df = gen_test_df(ps, L, poi_info_i, dat_obj)
rank_df = ranksvm.predict(test_df)
rank_df.sort_values(by='rank', ascending=False, inplace=True)
y_hat = [ps] + [p for p in rank_df.index.tolist() if p != ps][:L-1]
recdict_rank[(ps, L)] = {'PRED': [y_hat], 'C':best_C, 'W': ranksvm.w}
# POI popularity based ranking
poi_info_i.sort_values(by='popularity', ascending=False, inplace=True)
y_hat_pop = [ps] + [p for p in poi_info_i.index.tolist() if p != ps][:L-1]
recdict_rank_pop[(ps, L)] = {'PRED': [y_hat_pop]}
cnt += 1; #print_progress(cnt, len(keys)); sys.stdout.flush()
In [23]:
if run_rank == True:
print(len(recdict_rank))
print('RankSVM:')
do_evaluation(dat_obj, recdict_rank)
print('\nPopularity:')
do_evaluation(dat_obj, recdict_rank_pop)
In [24]:
if run_rank == True:
frank = os.path.join(data_dir, 'rank-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
fpop = os.path.join(data_dir, 'pop-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
pickle.dump(recdict_rank, open(frank, 'bw'))
pickle.dump(recdict_rank_pop, open(fpop, 'bw'))
In [25]:
%load_ext Cython
In [26]:
%%cython
import numpy as np
import heapq as hq
import sys
cimport numpy as np
cdef class HeapItem: # an item in heapq (min-heap)
cdef readonly float priority
cdef readonly object task, string
def __init__(self, float priority, task):
self.priority = priority
self.task = task
self.string = str(priority) + ': ' + str(task)
def __richcmp__(self, other, int op):
if op == 2: # ==
return self.priority == other.priority
elif op == 3: # !=
return self.priority != other.priority
elif op == 0: # <
return self.priority < other.priority
elif op == 1: # <=
return self.priority <= other.priority
elif op == 4: # >
return self.priority > other.priority
elif op == 5: # >=
return self.priority >= other.priority
else:
assert False
def __repr__(self):
return self.string
def __str__(self):
return self.string
cpdef do_inference_list_viterbi(int ps, int L, int M, np.ndarray[dtype=np.float64_t, ndim=1] poi_scores, int top=10):
assert(L > 1)
assert(M >= L)
assert(ps >= 0)
assert(ps < M)
assert(top > 0)
cdef int pi, pj, t, pk, parix, partition_index, partition_index_start, k_partition_index
cdef long k, maxIter = long(1e6)
cdef float priority, new_priority
Cu = np.zeros(M, dtype=np.float)
Cp = np.zeros((M, M), dtype=np.float)
    # an intermediate POI should NOT be the start POI, and self-loops are NOT allowed
for pi in range(M):
Cu[pi] = poi_scores[pi]
for pj in range(M):
Cp[pi, pj] = -np.inf if (pj == ps or pi == pj) else 0
# forward-backward procedure: adapted from the Rabiner paper
Alpha = np.zeros((L, M), dtype=np.float) # alpha_t(p_i)
Beta = np.zeros((L, M), dtype=np.float) # beta_t(p_i)
for pj in range(M): Alpha[1, pj] = Cp[ps, pj] + Cu[pj]
for t in range(2, L):
for pj in range(M): # ps~~pi--pj
Alpha[t, pj] = np.max([Alpha[t-1, pi] + Cp[pi, pj] + Cu[pj] for pi in range(M)])
for pi in range(M): Beta[L-1, pi] = 0
for t in range(L-1, 1, -1):
for pi in range(M): # ps~~pi--pj
Beta[t-1, pi] = np.max([Cp[pi, pj] + Cu[pj] + Beta[t, pj] for pj in range(M)])
Beta[0, ps] = np.max([Cp[ps, pj] + Cu[pj] + Beta[1, pj] for pj in range(M)])
Fp = np.zeros((L-1, M, M), dtype=np.float) # f_{t, t+1}(p, p')
for t in range(L-1):
for pi in range(M):
for pj in range(M):
Fp[t, pi, pj] = Alpha[t, pi] + Cp[pi, pj] + Cu[pj] + Beta[t+1, pj]
# identify the best path/walk: adapted from the IJCAI01 paper
y_best = np.ones(L, dtype=np.int) * (-1)
y_best[0] = ps
y_best[1] = np.argmax(Fp[0, ps, :]) # the start POI is specified
for t in range(2, L):
y_best[t] = np.argmax(Fp[t-1, y_best[t-1], :])
Q = [] # priority queue (min-heap)
# heap item for the best path/walk
priority = -np.max(Alpha[L-1, :]) # -1 * score as priority
partition_index = -1
exclude_set = set()
hq.heappush(Q, HeapItem(priority, (y_best, partition_index, exclude_set)))
results = []
k = 0; y_last = None
while len(Q) > 0 and k < maxIter:
hitem = hq.heappop(Q)
k_priority = hitem.priority
(k_best, k_partition_index, k_exclude_set) = hitem.task
k += 1; y_last = k_best
if len(set(k_best)) == L:
#print(-k_priority)
results.append(k_best); top -= 1
if top == 0: return results
# identify the (k+1)-th best path/walk given the 1st, 2nd, ..., k-th best: adapted from the IJCAI01 paper
partition_index_start = 1
if k_partition_index > 0:
assert(k_partition_index < L)
partition_index_start = k_partition_index
for parix in range(partition_index_start, L):
# new_best[:parix]
            new_best = np.ones(L, dtype=np.int) * (-1)  # initialise with -1, matching y_best above
new_best[:parix] = k_best[:parix]
if len(set(new_best[:parix])) < parix: break
# new_best[parix]
#new_exclude_set = set({k_best[parix]})
new_exclude_set = set(k_best[:parix+1]) # exclude all visited POIs
if parix == partition_index_start: new_exclude_set = new_exclude_set | k_exclude_set
candidate_points = [p for p in range(M) if p not in new_exclude_set]
if len(candidate_points) == 0: continue
candidate_maxix = np.argmax([Fp[parix-1, k_best[parix-1], p] for p in candidate_points])
new_best[parix] = candidate_points[candidate_maxix]
# new_best[parix+1:]
for pk in range(parix+1, L):
new_best[pk] = np.argmax([Fp[pk-1, new_best[pk-1], p] for p in range(M)])
#new_best[pk] = np.argmax([Fp[pk-1, new_best[pk-1], p] for p in range(M) if p not in new_best[:pk]])
#new_best[pk] = np.argmax([Fp[pk-1, new_best[pk-1], p] for p in range(M) if p not in new_best[:parix+1]])
new_priority = Fp[parix-1, k_best[parix-1], new_best[parix]]
if k_partition_index > 0:
new_priority += (-k_priority) - Fp[parix-1, k_best[parix-1], k_best[parix]]
new_priority *= -1.0 # NOTE: -np.inf - np.inf + np.inf = nan
#print('push: %s, %d' % (str(new_best), parix))
hq.heappush(Q, HeapItem(new_priority, (new_best, parix, new_exclude_set)))
if k >= maxIter:
sys.stderr.write('WARN: reaching max number of iterations, NO optimal solution found, \
return suboptimal solution.\n')
if len(Q) == 0:
sys.stderr.write('WARN: empty queue, return the last one\n')
results.append(y_last); top -= 1
while len(Q) > 0 and top > 0:
hitem = hq.heappop(Q)
results.append(hitem.task[0])
top -= 1
return results
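A minimal sketch of calling the list-Viterbi inference on toy data (hypothetical values: 4 POIs, start POI index 0, trajectory length 3, top-3 results):

toy_scores = np.log10(np.array([0.1, 0.4, 0.2, 0.3]))  # log10 of POI scores, as used below
top3 = do_inference_list_viterbi(0, 3, 4, toy_scores, 3)
# each element of top3 is a length-3 array of POI indices starting with the start POI 0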
In [27]:
if run_rank_topk == True:
recdict_c = pickle.load(open(os.path.join(data_dir, 'rank-' + dat_obj.dat_suffix[dat_ix] + '.pkl'), 'rb'))
recdict_rank = dict()
recdict_rank_pop = dict()
cnt = 1
keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())
# outer loop to evaluate the test performance by cross validation
for i in range(len(keys)):
ps, L = keys[i]
best_C = 1
#best_F1 = 0; best_pF1 = 0
best_Tau = 0
keys_cv = keys[:i] + keys[i+1:]
# use all training+validation set to compute POI features,
# make sure features do NOT change for training and validation
trajid_set_i = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
poi_info_i = dat_obj.calc_poi_info(list(trajid_set_i))
poi_set_i = {p for tid in trajid_set_i for p in dat_obj.traj_dict[tid] if len(dat_obj.traj_dict[tid]) >= 2}
if ps not in poi_set_i:
sys.stderr.write('start POI of query %s does not exist in training set.\n' % str(keys[i]))
continue
best_C = recdict_c[(ps, L)]['C']
print('\n--------------- %d/%d, Query: (%d, %d), Best_C: %.2f ---------------\n' % (cnt,len(keys),ps,L,best_C))
sys.stdout.flush()
# train model using all examples in training+validation set and measure performance on test set
train_df = gen_train_df(list(trajid_set_i), poi_info_i.copy(), dat_obj, n_jobs=N_JOBS)
ranksvm = RankSVM(ranksvm_dir, useLinear=True)
ranksvm.train(train_df, cost=best_C)
test_df = gen_test_df(ps, L, poi_info_i, dat_obj)
rank_df = ranksvm.predict(test_df, probability=True)
poi_list_rank = rank_df.index.tolist()
poiid_dict_rank = {poi: ix for ix, poi in enumerate(poi_list_rank)}
rank_scores = np.log10(rank_df['probability'].values)
top10 = do_inference_list_viterbi(poiid_dict_rank[ps], L, len(poi_list_rank), rank_scores)
recdict_rank[(ps, L)] = {'PRED': [[poi_list_rank[pi] for pi in t] for t in top10], 'C':best_C}
# POI popularity based ranking
poi_info_i.sort_values(by='popularity', ascending=False, inplace=True)
poi_list_pop = poi_info_i.index.tolist()
poiid_dict_pop = {poi: ix for ix, poi in enumerate(poi_list_pop)}
pop_scores = poi_info_i['popularity'].astype(np.float).values
top10 = do_inference_list_viterbi(poiid_dict_pop[ps], L, len(poi_list_pop), pop_scores)
recdict_rank_pop[(ps, L)] = {'PRED': [[poi_list_pop[pi] for pi in t] for t in top10]}
cnt += 1
In [28]:
if run_rank_topk == True:
print(len(recdict_rank))
print('RankSVM:')
topk = [1, 3, 5, 10]
recdicts = [{q: {'PRED': recdict_rank[q]['PRED'][:k]} for q in recdict_rank.keys()} for k in topk]
for j in range(len(topk)): do_evaluation(dat_obj, recdicts[j])
print('\nPopularity:')
recdicts = [{q: {'PRED': recdict_rank_pop[q]['PRED'][:k]} for q in recdict_rank_pop} for k in topk]
for j in range(len(topk)): do_evaluation(dat_obj, recdicts[j])
In [29]:
if run_rank_topk == True:
frank = os.path.join(data_dir, 'rank10-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
fpop = os.path.join(data_dir, 'pop10-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
pickle.dump(recdict_rank, open(frank, 'bw'))
pickle.dump(recdict_rank_pop, open(fpop, 'bw'))
Generate features.
In [30]:
def gen_features(startPOI, nPOI, poi_info, poi_clusters, cats, clusters, dat_obj):
    """
    Generate feature vectors for all POIs given query (startPOI, nPOI)
    """
    assert(isinstance(cats, list))
    assert(isinstance(clusters, list))
    columns = DF_COLUMNS[3:]
    poi_distmat = dat_obj.POI_DISTMAT
    query_id_dict = dat_obj.QUERY_ID_DICT
key = (p0, trajLen) = (startPOI, nPOI)
assert(key in query_id_dict)
assert(p0 in poi_info.index)
df_ = pd.DataFrame(index=np.arange(poi_info.shape[0]), columns=columns)
poi_list = sorted(poi_info.index)
for i in range(df_.index.shape[0]):
poi = poi_list[i]
lon, lat = poi_info.loc[poi, 'poiLon'], poi_info.loc[poi, 'poiLat']
pop, nvisit = poi_info.loc[poi, 'popularity'], poi_info.loc[poi, 'nVisit']
cat, cluster = poi_info.loc[poi, 'poiCat'], poi_clusters.loc[poi, 'clusterID']
duration = poi_info.loc[poi, 'avgDuration']
idx = df_.index[i]
        df_.at[idx, 'category'] = tuple((cat == np.array(cats)).astype(np.int) * 2 - 1)
        df_.at[idx, 'neighbourhood'] = tuple((cluster == np.array(clusters)).astype(np.int) * 2 - 1)
df_.loc[idx, 'popularity'] = LOG_SMALL if pop < 1 else np.log10(pop)
df_.loc[idx, 'nVisit'] = LOG_SMALL if nvisit < 1 else np.log10(nvisit)
df_.loc[idx, 'avgDuration'] = LOG_SMALL if duration < 1 else np.log10(duration)
df_.loc[idx, 'trajLen'] = trajLen
        df_.loc[idx, 'sameCatStart'] = 1 if cat == poi_info.loc[p0, 'poiCat'] else -1
df_.loc[idx, 'distStart'] = poi_distmat.loc[poi, p0]
df_.loc[idx, 'diffPopStart'] = pop - poi_info.loc[p0, 'popularity']
df_.loc[idx, 'diffNVisitStart'] = nvisit - poi_info.loc[p0, 'nVisit']
df_.loc[idx, 'diffDurationStart'] = duration - poi_info.loc[p0, 'avgDuration']
df_.loc[idx, 'sameNeighbourhoodStart'] = 1 if cluster == poi_clusters.loc[p0, 'clusterID'] else -1
# convert to matrix
nrows = df_.shape[0]
ncols = df_.shape[1] + len(cats) + len(clusters) - 2
# features other than category and neighbourhood
X = df_[sorted(set(df_.columns) - {'category', 'neighbourhood'})].values
# boolean features: category (+1, -1)
cat_features = np.vstack([list(df_.loc[x, 'category']) for x in df_.index])
# boolean features: neighbourhood (+1, -1)
neigh_features = np.vstack([list(df_.loc[x, 'neighbourhood']) for x in df_.index])
return np.hstack([X, cat_features.astype(np.float), neigh_features.astype(np.float)])
Generate labels.
In [31]:
def gen_labels(traj_truth, poi_info, binarise=False):
"""
Generate labels for all POIs given a ground truth trajectory
"""
poi_list = sorted(poi_info.index)
ranks = np.zeros(len(poi_list), dtype=np.float)
for j in range(len(poi_list)):
try:
poi = poi_list[j]
ranks[j] = (len(poi_list) - traj_truth.index(poi)) / len(poi_list) # normalise
except ValueError:
pass # default rank is 0
    if binarise == True:
        return (ranks > 0).astype(np.float)*2 - 1  # binary labels (+1, -1)
    return ranks  # keep normalised ranks as floats; casting to int would truncate them all to 0
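For example, assuming poi_info is indexed by POIs [1, 2, 3, 4], the ground truth trajectory [1, 3] yields ranks (1.0, 0, 0.75, 0), which binarise to (+1, -1, +1, -1).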
POI occurrence prediction: logistic regression
In [32]:
class LOGIT_REG:
    def __init__(self, dat_obj, C=1.0, poi_info=None, debug=False):
        self.dat_obj = dat_obj
        self.C = C
        self.debug = debug
self.trained = False
if poi_info is None:
self.poi_info = None
else:
self.poi_info = poi_info
def train(self, trajid_set_train):
# generate training data
if self.poi_info is None:
            self.poi_info = self.dat_obj.calc_poi_info(list(trajid_set_train))
        self.poi_list = sorted(self.poi_info.index)
        train_traj_list = [self.dat_obj.traj_dict[x] for x in trajid_set_train if len(self.dat_obj.traj_dict[x]) >= 2]
        feature_list = Parallel(n_jobs=N_JOBS)\
                       (delayed(gen_features)(tr[0], len(tr), self.poi_info, poi_clusters=self.dat_obj.POI_CLUSTERS, \
                                              cats=self.dat_obj.POI_CAT_LIST, clusters=self.dat_obj.POI_CLUSTER_LIST, \
                                              dat_obj=self.dat_obj)
                        for tr in train_traj_list)
labels_logreg = Parallel(n_jobs=N_JOBS)\
(delayed(gen_labels)(tr, self.poi_info, binarise=True) for tr in train_traj_list)
X = np.vstack(feature_list)
Y_logreg = np.hstack(labels_logreg)
# feature scaling
if ABS_SCALER == True:
self.scaler = MaxAbsScaler(copy=False)
else:
self.scaler = MinMaxScaler(feature_range=(-1,1), copy=False)
#self.scaler = StandardScaler(copy=False)
X = self.scaler.fit_transform(X)
# train
self.logreg = LogisticRegression(C=self.C, n_jobs=N_JOBS)
self.logreg.fit(X, Y_logreg)
self.trained = True
def predict(self, startPOI, nPOI):
assert(self.trained == True)
if startPOI not in self.poi_list: return None
        X_test = gen_features(startPOI, nPOI, self.poi_info, poi_clusters=self.dat_obj.POI_CLUSTERS, \
                              cats=self.dat_obj.POI_CAT_LIST, clusters=self.dat_obj.POI_CLUSTER_LIST, dat_obj=self.dat_obj)
X_test = self.scaler.transform(X_test)
scores_logreg = self.logreg.decision_function(X_test)
# form recommendation
assert(len(scores_logreg) == len(self.poi_list))
p0_ix = self.poi_list.index(startPOI)
scores_logreg[p0_ix] = -1e6 # mask the start POI
topk_logreg = list(np.argsort(-np.asarray(scores_logreg))[:nPOI-1])
y_hat = [startPOI] + list(np.array(self.poi_list)[topk_logreg])
return np.array(y_hat)
Nested cross-validation with Monte-Carlo cross-validation as inner loop.
In [33]:
if run_logreg == True:
recdict_logreg = dict()
cnt = 1
    keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())
# outer loop to evaluate the test performance by cross validation
for i in range(len(keys)):
ps, L = keys[i]
best_C = 1
#best_F1 = 0; best_pF1 = 0
best_Tau = 0
keys_cv = keys[:i] + keys[i+1:]
# use all training+validation set to compute POI features,
# make sure features do NOT change for training and validation
        trajid_set_i = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
        poi_info_i = dat_obj.calc_poi_info(list(trajid_set_i))
# tune regularisation constant C
for logit_C in C_SET:
print('\n--------------- try_C: %f ---------------\n' % logit_C); sys.stdout.flush()
F1_logreg = []; pF1_logreg = []; Tau_logreg = []
# inner loop to evaluate the performance of a model with a specified C by Monte-Carlo cross validation
for j in range(MC_NITER):
poi_list = []
while True: # make sure the start POI in test set are also in training set
rand_ix = np.arange(len(keys_cv)); np.random.shuffle(rand_ix)
test_ix = rand_ix[:int(MC_PORTION*len(rand_ix))]
assert(len(test_ix) > 0)
                    trajid_set_train = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
                    for j in test_ix:
                        trajid_set_train = trajid_set_train - dat_obj.TRAJID_GROUP_DICT[keys_cv[j]]
                    poi_set = {poi for tid in trajid_set_train for poi in dat_obj.traj_dict[tid]}
good_partition = True
for j in test_ix:
if keys_cv[j][0] not in poi_set: good_partition = False; break
if good_partition == True:
poi_list = sorted(poi_set)
break
# train
                logreg = LOGIT_REG(dat_obj, C=logit_C, poi_info=poi_info_i.loc[poi_list].copy(), debug=True)
logreg.train(list(trajid_set_train))
# test
for j in test_ix:
ps_cv, L_cv = keys_cv[j]
y_hat = logreg.predict(ps_cv, L_cv)
if y_hat is not None:
                        F1, pF1, tau = evaluate(dat_obj, keys_cv[j], [y_hat])
F1_logreg.append(F1); pF1_logreg.append(pF1); Tau_logreg.append(tau)
#mean_F1 = np.mean(F1_logreg); mean_pF1 = np.mean(pF1_logreg)
mean_Tau = np.mean(Tau_logreg)
print('mean_Tau: %.3f' % mean_Tau)
if mean_Tau > best_Tau:
best_Tau = mean_Tau
best_C = logit_C
print('\n--------------- %d/%d, Query: (%d, %d), Best_C: %f ---------------\n' % (cnt, len(keys), ps, L, best_C))
sys.stdout.flush()
# train model using all examples in training set and measure performance on test set
        logreg = LOGIT_REG(dat_obj, C=best_C, poi_info=poi_info_i, debug=True)
logreg.train(trajid_set_i)
y_hat = logreg.predict(ps, L)
if y_hat is not None:
            recdict_logreg[(ps, L)] = {'PRED': [y_hat], 'C': logreg.C}
cnt += 1; #print_progress(cnt, len(keys)); sys.stdout.flush()
In [34]:
if run_logreg == True:
F1_logreg = []; pF1_logreg = []; Tau_logreg = []
for key in sorted(recdict_logreg.keys()):
        F1, pF1, tau = evaluate(dat_obj, key, recdict_logreg[key]['PRED'])
F1_logreg.append(F1); pF1_logreg.append(pF1); Tau_logreg.append(tau)
print('LogReg: F1 (%.3f, %.3f), pairsF1 (%.3f, %.3f), Tau (%.3f, %.3f)' % \
(np.mean(F1_logreg), np.std(F1_logreg)/np.sqrt(len(F1_logreg)), \
np.mean(pF1_logreg), np.std(pF1_logreg)/np.sqrt(len(pF1_logreg)), \
np.mean(Tau_logreg), np.std(Tau_logreg)/np.sqrt(len(Tau_logreg))))
In [35]:
if run_logreg == True:
    flogreg = os.path.join(data_dir, 'logreg-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
pickle.dump(recdict_logreg, open(flogreg, 'bw'))
Direct rank/location prediction: linear regression
In [36]:
class LINEAR_REG:
    def __init__(self, dat_obj, poi_info=None, debug=False):
        self.dat_obj = dat_obj
        self.debug = debug
        self.trained = False
if poi_info is None:
self.poi_info = None
else:
self.poi_info = poi_info
def train(self, trajid_set_train):
if self.poi_info is None:
            self.poi_info = self.dat_obj.calc_poi_info(list(trajid_set_train))
        self.poi_list = sorted(self.poi_info.index)
        # generate training data
        train_traj_list = [self.dat_obj.traj_dict[x] for x in trajid_set_train if len(self.dat_obj.traj_dict[x]) >= 2]
        feature_list = Parallel(n_jobs=N_JOBS)\
                       (delayed(gen_features)(tr[0], len(tr), self.poi_info, poi_clusters=self.dat_obj.POI_CLUSTERS, \
                                              cats=self.dat_obj.POI_CAT_LIST, clusters=self.dat_obj.POI_CLUSTER_LIST, \
                                              dat_obj=self.dat_obj)
                        for tr in train_traj_list)
labels_linreg = Parallel(n_jobs=N_JOBS)\
(delayed(gen_labels)(tr, self.poi_info, binarise=False) for tr in train_traj_list)
X = np.vstack(feature_list)
Y_linreg = np.hstack(labels_linreg)
# remove training examples with label '0', i.e., features of POIs that do not exist in trajectory
#valid_ix = np.nonzero(Y_linreg)[0]
#X = X[valid_ix, :]
#Y_linreg = Y_linreg[valid_ix]
# feature scaling
if ABS_SCALER == True:
self.scaler = MaxAbsScaler(copy=False)
else:
self.scaler = MinMaxScaler(feature_range=(-1,1), copy=False)
#self.scaler = StandardScaler(copy=False)
X = self.scaler.fit_transform(X)
# train
        self.linreg = LinearRegression(n_jobs=N_JOBS)  # the deprecated 'normalize' flag is dropped; features are already scaled above
self.linreg.fit(X, Y_linreg)
self.trained = True
def predict(self, startPOI, nPOI):
assert(self.trained == True)
if startPOI not in self.poi_list: return None
        X_test = gen_features(startPOI, nPOI, self.poi_info, poi_clusters=self.dat_obj.POI_CLUSTERS, \
                              cats=self.dat_obj.POI_CAT_LIST, clusters=self.dat_obj.POI_CLUSTER_LIST, dat_obj=self.dat_obj)
X_test = self.scaler.transform(X_test)
scores_linreg = self.linreg.predict(X_test)
# form recommendation
assert(len(scores_linreg) == len(self.poi_list))
p0_ix = self.poi_list.index(startPOI)
scores_linreg[p0_ix] = -1e6 # mask the start POI
topk_linreg = list(np.argsort(-np.asarray(scores_linreg))[:nPOI-1])
y_hat = [startPOI] + list(np.array(self.poi_list)[topk_linreg])
return np.array(y_hat)
Leave-one-out cross-validation, NO hyper-parameters to tune.
In [37]:
if run_linreg == True:
recdict_linreg = dict()
cnt = 1
    keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())
# outer loop to evaluate the test performance by cross validation
for i in range(len(keys)):
ps, L = keys[i]
        trajid_set_i = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
        poi_info_i = dat_obj.calc_poi_info(list(trajid_set_i))
print('\n--------------- %d/%d, Query: (%d, %d) ---------------\n' % (cnt, len(keys), ps, L))
# train model using all examples in training+evaluation set and measure performance on test set
        linreg = LINEAR_REG(dat_obj, poi_info=poi_info_i)
linreg.train(list(trajid_set_i))
y_hat = linreg.predict(ps, L)
if y_hat is not None:
            recdict_linreg[(ps, L)] = {'PRED': [y_hat]}
cnt += 1; #print_progress(cnt, len(keys)); sys.stdout.flush()
In [38]:
if run_linreg == True:
F1_linreg = []; pF1_linreg = []; Tau_linreg = []
for key in sorted(recdict_linreg.keys()):
        F1, pF1, tau = evaluate(dat_obj, key, recdict_linreg[key]['PRED'])
F1_linreg.append(F1); pF1_linreg.append(pF1); Tau_linreg.append(tau)
print('LinReg: F1 (%.3f, %.3f), pairsF1 (%.3f, %.3f), Tau (%.3f, %.3f)' % \
(np.mean(F1_linreg), np.std(F1_linreg)/np.sqrt(len(F1_linreg)), \
np.mean(pF1_linreg), np.std(pF1_linreg)/np.sqrt(len(pF1_linreg)), \
np.mean(Tau_linreg), np.std(Tau_linreg)/np.sqrt(len(Tau_linreg))))
In [39]:
if run_linreg == True:
    flinreg = os.path.join(data_dir, 'linreg-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
pickle.dump(recdict_linreg, open(flinreg, 'bw'))
Cost function and its gradient for pairwise POI ranking.
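In symbols, the cost implemented below (for the logistic loss) is $\frac{1}{2} \mathbf{w}^\top \mathbf{w} + \frac{C}{N_1} \sum_{i} \sum_{j < k} \log\left(1 + \exp(-l_{jk} \mathbf{w}^\top (\phi_{ij} - \phi_{ik}))\right)$, where $l_{jk} = \mathrm{sign}(y_{ij} - y_{ik})$, $\phi_{ij}$ is the feature vector of POI $j$ for query $i$, and $N_1$ is the normalising constant used in the code; the squared hinge variant replaces the log term with $(1 - l_{jk} \mathbf{w}^\top (\phi_{ij} - \phi_{ik}))^2$ whenever the margin term is below 1.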
In [40]:
%load_ext Cython
In [41]:
%%cython
import numpy as np
cimport numpy as np
cpdef pwr_obj(w, X, Y, float C, long M, logit_loss=True):
"""
w - parameter vector
X - feature matrix for all training examples (features of all POIs for the 1st example, then 2nd, ...)
Y - labels/ranks for all training examples (labels of all POIs for the 1st example, then 2nd, ...)
C - regularisation constant
M - total number of POIs
logit_loss - True for logistic loss, False for squared hinge loss
return the cost and the gradient of cost
"""
assert(C > 0)
assert(M > 0)
cdef long N, N1, D, i, pj, pk, ix_j, ix_k
Nq = int(np.shape(X)[0]/M) # number of queries
D = np.shape(X)[1]
assert(D == np.shape(w)[0])
assert(np.shape(X)[0] == np.shape(Y)[0])
N1 = 10 * np.shape(X)[0] # 10 * M * N
cdef double cost, costi, lossi, l_jk
cost = 0.0
grad = np.zeros(D, dtype=np.float)
for i in range(Nq):
costi = 0.0
gradi = np.zeros(D, dtype=np.float)
for pj in range(M-1): # symmetric breaking of pair (pj, pk), #pairs should be n choose 2 not n^2
ix_j = i*M + pj # index of feature vector/label for POI pj
for pk in range(pj+1, M):
#if pj == pk:
# #result += np.log(2) # constant
# continue
ix_k = i*M + pk
l_jk = 0
if Y[ix_j] > Y[ix_k]: l_jk = 1.0
if Y[ix_j] < Y[ix_k]: l_jk = -1.0
phi_jk = X[ix_j] - X[ix_k]
term = l_jk * np.dot(w, phi_jk)
if logit_loss == True:
costi += np.log1p(np.exp(-1.0 * term))
gradi = gradi + phi_jk * (-1.0 * l_jk / (1.0 + np.exp(term)))
else:
if term < 1:
costi += (1.0 - term)**2
gradi = gradi - 2 * l_jk * (1.0 - term) * phi_jk
cost += costi / N1
grad = grad + gradi / N1
cost = 0.5 * np.dot(w, w) + C * cost
grad = w + C * grad
return (cost, grad)
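A quick way to sanity-check the analytic gradient of pwr_obj against finite differences (a minimal sketch with random toy data; all sizes are hypothetical):

from scipy.optimize import check_grad
M_, D_, Nq_ = 4, 3, 2
X_ = np.random.rand(Nq_ * M_, D_)
Y_ = np.random.randint(0, 3, Nq_ * M_).astype(float)
err = check_grad(lambda w: pwr_obj(w, X_, Y_, 1.0, M_)[0],
                 lambda w: pwr_obj(w, X_, Y_, 1.0, M_)[1], np.random.rand(D_))
print(err)  # should be small, e.g. around 1e-6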
Pairwise POI ranking.
Use the same features and labels as RankSVM.
In [42]:
class PW_RANK:
def __init__(self, dat_obj, C=1.0, poi_info=None, logit_loss=True, debug=False):
self.dat_obj = dat_obj
self.C = C
self.logit_loss = logit_loss
self.debug = debug
self.trained = False
if poi_info is None:
self.poi_info = None
else:
self.poi_info = poi_info
def train(self, trajid_set_train):
if self.poi_info is None:
self.poi_info = self.dat_obj.calc_poi_info(list(trajid_set_train))
self.poi_list = sorted(self.poi_info.index)
#train_traj_list = [traj_dict[x] for x in trajid_set_train if len(traj_dict[x]) >= 2]
#feature_list = Parallel(n_jobs=N_JOBS)\
# (delayed(gen_features)(tr[0], len(tr), self.poi_info, poi_clusters=POI_CLUSTERS, \
# cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
# for tr in train_traj_list)
#labels_linreg = Parallel(n_jobs=N_JOBS)\
# (delayed(gen_labels)(tr, self.poi_info, binarise=False) for tr in train_traj_list)
traj_groups = dict()
for tid in trajid_set_train:
tr = self.dat_obj.traj_dict[tid]
if len(tr) < 2: continue
q = (tr[0], len(tr))
            try: traj_groups[q].append(tid)
            except KeyError: traj_groups[q] = [tid]
queries = sorted(traj_groups.keys())
feature_list = Parallel(n_jobs=N_JOBS)\
(delayed(calc_node_features)(q[0], q[1], self.poi_list, self.poi_info, self.dat_obj)\
for q in queries)
labels = []
for q in queries:
label = pd.Series(data=np.zeros(len(self.poi_list), dtype=np.float), index=self.poi_list)
for tid in traj_groups[q]:
tr = self.dat_obj.traj_dict[tid]
for poi in tr[1:]: label.loc[poi] += 1
labels.append(label)
X = np.vstack(feature_list)
Y = np.hstack(labels)
# remove training examples with label '0', i.e., features of POIs that do not exist in trajectory
#valid_ix = np.nonzero(Y_linreg)[0]
#X = X[valid_ix, :]
#Y_linreg = Y_linreg[valid_ix]
# feature scaling
if ABS_SCALER == True:
self.Xscaler = MaxAbsScaler(copy=False)
#self.Yscaler = MaxAbsScaler(copy=False)
else:
self.Xscaler = MinMaxScaler(feature_range=(-1,1), copy=False)
#self.Yscaler = MinMaxScaler(feature_range=(-1,1), copy=False)
#self.scaler = StandardScaler(copy=False)
X = self.Xscaler.fit_transform(X)
#Y = self.Yscaler.fit_transform(Y.reshape(-1, 1))
#Y = Y.reshape(-1) # reshape to one dimension
w = np.random.rand(np.shape(X)[1]) # initial guess
opt_method = 'BFGS' #'Newton-CG'
options = {'disp': True} if self.debug == True else dict()
opt = minimize(pwr_obj, w, args=(X, Y, self.C, len(self.poi_list), self.logit_loss), \
method=opt_method, jac=True, options=options)
if opt.success == True:
self.w = opt.x
self.trained = True
else:
sys.stderr.write('Optimisation failed, C=%f\n' % self.C)
self.trained = False
return self.trained
def predict(self, startPOI, nPOI):
assert(self.trained == True)
if startPOI not in self.poi_list: return None
X_test = calc_node_features(startPOI, nPOI, self.poi_list, self.poi_info, self.dat_obj)
#X_test = gen_features(startPOI, nPOI, self.poi_info, poi_clusters=POI_CLUSTERS, \
# cats=POI_CAT_LIST,clusters=POI_CLUSTER_LIST)
X_test = self.Xscaler.transform(X_test)
scores_pwr = 1.0 / (1.0 + np.exp(-1.0 * np.dot(X_test, self.w)))
# form recommendation
assert(len(scores_pwr) == len(self.poi_list))
p0_ix = self.poi_list.index(startPOI)
scores_pwr[p0_ix] = -1e6 # mask the start POI
topk_pwr = list(np.argsort(-np.asarray(scores_pwr))[:nPOI-1])
y_hat = [startPOI] + list(np.array(self.poi_list)[topk_pwr])
return np.array(y_hat)
Nested cross-validation with Monte-Carlo cross-validation as inner loop.
In [43]:
if run_pwr == True:
LOGIT_LOSS = False
recdict_pwr = dict()
cnt = 1
keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())
# outer loop to evaluate the test performance by cross validation
for i in range(len(keys)):
ps, L = keys[i]
best_C = 1
#best_F1 = 0; best_pF1 = 0
best_Tau = 0
keys_cv = keys[:i] + keys[i+1:]
# use all training+validation set to compute POI features,
# make sure features do NOT change for training and validation
trajid_set_i = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
poi_info_i = dat_obj.calc_poi_info(list(trajid_set_i))
poi_set_i = {p for tid in trajid_set_i for p in dat_obj.traj_dict[tid] if len(dat_obj.traj_dict[tid]) >= 2}
if ps not in poi_set_i:
sys.stderr.write('start POI of query %s does not exist in training set.\n' % str(keys[i]))
continue
# tune regularisation constant C
for logit_C in C_SET:
print('\n--------------- try_C: %f ---------------\n' % logit_C); sys.stdout.flush()
F1_pwr = []; pF1_pwr = []; Tau_pwr = []
# inner loop to evaluate the performance of a model with a specified C by Monte-Carlo cross validation
for j in range(MC_NITER):
poi_list = []
while True: # make sure the start POI in test set are also in training set
rand_ix = np.arange(len(keys_cv)); np.random.shuffle(rand_ix)
test_ix = rand_ix[:int(MC_PORTION*len(rand_ix))]
assert(len(test_ix) > 0)
trajid_set_train = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
for j in test_ix:
trajid_set_train = trajid_set_train - dat_obj.TRAJID_GROUP_DICT[keys_cv[j]]
poi_set = {poi for tid in trajid_set_train for poi in dat_obj.traj_dict[tid]}
good_partition = True
for j in test_ix:
if keys_cv[j][0] not in poi_set: good_partition = False; break
if good_partition == True:
poi_list = sorted(poi_set)
break
# train
pwr = PW_RANK(dat_obj, C=logit_C, poi_info=poi_info_i.loc[poi_list].copy(),
logit_loss=LOGIT_LOSS, debug=True)
if pwr.train(list(trajid_set_train)) == True:
for j in test_ix: # test
ps_cv, L_cv = keys_cv[j]
y_hat = pwr.predict(ps_cv, L_cv)
if y_hat is not None:
F1, pF1, tau = evaluate(dat_obj, keys_cv[j], [y_hat])
F1_pwr.append(F1); pF1_pwr.append(pF1); Tau_pwr.append(tau)
else: # if training is failed
for j in test_ix:
F1_pwr.append(0); pF1_pwr.append(0); Tau_pwr.append(0)
#mean_F1 = np.mean(F1_pwr); mean_pF1 = np.mean(pF1_pwr)
mean_Tau = np.mean(Tau_pwr)
print('mean_Tau: %.3f' % mean_Tau)
if mean_Tau > best_Tau:
best_Tau = mean_Tau
best_C = logit_C
print('\n--------------- %d/%d, Query: (%d, %d), Best_C: %f ---------------\n' % (cnt, len(keys), ps, L, best_C))
sys.stdout.flush()
# train model using all examples in training set and measure performance on test set
pwr = PW_RANK(dat_obj, C=best_C, poi_info=poi_info_i, logit_loss=LOGIT_LOSS, debug=True)
if pwr.train(trajid_set_i) == True:
y_hat = pwr.predict(ps, L)
if y_hat is not None:
recdict_pwr[(ps, L)] = {'PRED': [y_hat], 'W': pwr.w, 'C': pwr.C}
print(y_hat)
cnt += 1; #print_progress(cnt, len(keys)); sys.stdout.flush()
In [44]:
#len(recdict_pwr)
In [45]:
if run_pwr == True:
print('PW_RANK:')
do_evaluation(dat_obj, recdict_pwr)
In [46]:
if run_pwr == True:
fpwr = os.path.join(data_dir, 'pwr-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
pickle.dump(recdict_pwr, open(fpwr, 'bw'))
Approximate the transition probabilities (matrix) between POI feature vectors using the Kronecker product of the individual transition matrices corresponding to each feature, i.e., POI category, POI popularity (discretised), POI average visit duration (discretised) and POI neighbourhood (cluster).
Deal with features that have no corresponding POI, and with features shared by more than one POI (before normalisation).
NOTE: performing the division below either before or after row normalisation leads to the same result, as the division itself does NOT change the normalising constant of each row (i.e., the sum of each row before normalising).
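As a tiny illustration of the Kronecker product used below (hypothetical 2-state transition matrices for two features):

A = np.array([[0.8, 0.2], [0.4, 0.6]])  # e.g. transitions between two POI categories
B = np.array([[0.5, 0.5], [0.1, 0.9]])  # e.g. transitions between two popularity bins
K = kron(A, B)                          # 4x4 matrix over (category, popularity) feature pairs
print(K.sum(axis=1))                    # each row still sums to 1, i.e. rows remain stochastic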
In [47]:
def gen_poi_logtransmat(trajid_list, poi_set, poi_info, dat_obj, debug=False):
#transmat_cat = gen_transmat_cat(trajid_list, traj_dict, poi_info)
#transmat_pop, logbins_pop = gen_transmat_pop(trajid_list, traj_dict, poi_info)
#transmat_visit, logbins_visit = gen_transmat_visit(trajid_list, traj_dict, poi_info)
#transmat_duration, logbins_duration = gen_transmat_duration(trajid_list, traj_dict, poi_info)
#transmat_neighbor, poi_clusters = gen_transmat_neighbor(trajid_list, traj_dict, poi_info)
transmat_cat = dat_obj.gen_transmat_cat(trajid_list, poi_info)
transmat_pop = dat_obj.gen_transmat_pop(trajid_list, poi_info)
transmat_visit = dat_obj.gen_transmat_visit(trajid_list, poi_info)
transmat_duration = dat_obj.gen_transmat_duration(trajid_list, poi_info)
transmat_neighbor = dat_obj.gen_transmat_neighbor(trajid_list, poi_info)
logbins_pop = dat_obj.LOGBINS_POP
logbins_visit = dat_obj.LOGBINS_VISIT
logbins_duration = dat_obj.LOGBINS_DURATION
poi_clusters = dat_obj.POI_CLUSTERS
traj_dict = dat_obj.traj_dict
# Kronecker product
transmat_ix = list(itertools.product(transmat_cat.index, transmat_pop.index, transmat_visit.index, \
transmat_duration.index, transmat_neighbor.index))
transmat_value = transmat_cat.values
for transmat in [transmat_pop, transmat_visit, transmat_duration, transmat_neighbor]:
transmat_value = kron(transmat_value, transmat.values)
transmat_feature = pd.DataFrame(data=transmat_value, index=transmat_ix, columns=transmat_ix)
poi_train = sorted(poi_set)
feature_names = ['poiCat', 'popularity', 'nVisit', 'avgDuration', 'clusterID']
poi_features = pd.DataFrame(data=np.zeros((len(poi_train), len(feature_names))), \
columns=feature_names, index=poi_train)
poi_features.index.name = 'poiID'
poi_features['poiCat'] = poi_info.loc[poi_train, 'poiCat']
poi_features['popularity'] = np.digitize(poi_info.loc[poi_train, 'popularity'], logbins_pop)
poi_features['nVisit'] = np.digitize(poi_info.loc[poi_train, 'nVisit'], logbins_visit)
poi_features['avgDuration'] = np.digitize(poi_info.loc[poi_train, 'avgDuration'], logbins_duration)
poi_features['clusterID'] = poi_clusters.loc[poi_train, 'clusterID']
# shrink the result of Kronecker product and deal with POIs with the same features
poi_logtransmat = pd.DataFrame(data=np.zeros((len(poi_train), len(poi_train)), dtype=np.float), \
columns=poi_train, index=poi_train)
for p1 in poi_logtransmat.index:
rix = tuple(poi_features.loc[p1])
for p2 in poi_logtransmat.columns:
cix = tuple(poi_features.loc[p2])
value_ = transmat_feature.loc[(rix,), (cix,)]
poi_logtransmat.loc[p1, p2] = value_.values[0, 0]
# group POIs with the same features
features_dup = dict()
for poi in poi_features.index:
key = tuple(poi_features.loc[poi])
if key in features_dup:
features_dup[key].append(poi)
else:
features_dup[key] = [poi]
if debug == True:
for key in sorted(features_dup.keys()):
print(key, '->', features_dup[key])
# deal with POIs with the same features
for feature in sorted(features_dup.keys()):
n = len(features_dup[feature])
if n > 1:
group = features_dup[feature]
v1 = poi_logtransmat.loc[group[0], group[0]] # transition value of self-loop of POI group
# divide incoming transition value (i.e. unnormalised transition probability) uniformly among group members
for poi in group:
poi_logtransmat[poi] /= n
# outgoing transition value has already been duplicated (value copied above)
# duplicate & divide transition value of self-loop of POI group uniformly among all outgoing transitions,
# from a POI to all other POIs in the same group (excluding POI self-loop)
v2 = v1 / (n - 1)
for pair in itertools.permutations(group, 2):
poi_logtransmat.loc[pair[0], pair[1]] = v2
# normalise each row
for p1 in poi_logtransmat.index:
poi_logtransmat.loc[p1, p1] = 0
rowsum = poi_logtransmat.loc[p1].sum()
assert(rowsum > 0)
logrowsum = np.log10(rowsum)
for p2 in poi_logtransmat.columns:
if p1 == p2:
poi_logtransmat.loc[p1, p2] = LOG_ZERO # deal with log(0) explicitly
else:
poi_logtransmat.loc[p1, p2] = np.log10(poi_logtransmat.loc[p1, p2]) - logrowsum
return poi_logtransmat
In [48]:
#transmat_ = gen_poi_logtransmat(trajid_set_all, set(poi_info_all.index), traj_dict, poi_info_all, debug=True)
Use dynamic programming to find a possibly non-simple path, i.e., a walk.
In [49]:
def find_viterbi(V, E, ps, L, withNodeWeight=False, alpha=0.5):
assert(isinstance(V, pd.DataFrame))
assert(isinstance(E, pd.DataFrame))
assert(ps in V.index)
assert(2 <= L <= V.index.shape[0])
if withNodeWeight == True:
assert(0 < alpha < 1)
beta = 1 - alpha
else:
alpha = 0
beta = 1
weightkey = 'weight'
if weightkey not in V.columns:
V['weight'] = 1 # dummy weights, will not be used as alpha=0
poi_id_dict, poi_id_rdict = dict(), dict()
for ix, poi in enumerate(V.index):
poi_id_dict[poi] = ix
poi_id_rdict[ix] = poi
M = V.shape[0]
assert(E.shape[0] >= M)
assert(E.shape[1] >= M)
A = np.zeros((L-1, M), dtype=np.float) # scores matrix
B = np.ones((L-1, M), dtype=np.int) * (-1) # backtracking pointers
for p in range(M): # ps--p1
p1 = poi_id_rdict[p]
A[0, p] = alpha * V.loc[p1, 'weight'] + beta * E.loc[ps, p1] if p1 != ps else -np.inf
B[0, p] = poi_id_dict[ps]
for t in range(0, L-2): # ps~~p2'--p3
for p in range(M):
p3 = poi_id_rdict[p]
scores = [A[t, p2] + alpha * V.loc[p3, 'weight'] + beta * E.loc[poi_id_rdict[p2], p3] \
if poi_id_rdict[p2] not in {ps, p3} else -np.inf for p2 in range(M)]
maxix = np.argmax(scores)
A[t+1, p] = scores[maxix]
B[t+1, p] = maxix
y_hat = [np.argmax(A[L-2, :])]
p, t = y_hat[-1], L-2
while t >= 0:
y_hat.append(B[t, p])
p, t = y_hat[-1], t-1
y_hat.reverse()
return np.asarray([poi_id_rdict[p] for p in y_hat])
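A minimal sketch of find_viterbi on a toy graph (hypothetical POI IDs and log10 transition values):

V_toy = pd.DataFrame(index=[10, 20, 30])
E_toy = pd.DataFrame(np.log10([[1e-9, 0.8, 0.2], [0.1, 1e-9, 0.9], [0.6, 0.4, 1e-9]]),
                     index=[10, 20, 30], columns=[10, 20, 30])
print(find_viterbi(V_toy, E_toy, 10, 3))  # expected: [10 20 30]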
Use integer linear programming (ILP) to find a simple path.
In [50]:
def find_ILP(V, E, ps, L, withNodeWeight=False, alpha=0.5):
assert(isinstance(V, pd.DataFrame))
assert(isinstance(E, pd.DataFrame))
assert(ps in V.index)
assert(2 <= L <= V.index.shape[0])
if withNodeWeight == True:
assert(0 < alpha < 1)
beta = 1 - alpha
p0 = str(ps); M = V.index.shape[0]
# REF: pythonhosted.org/PuLP/index.html
pois = [str(p) for p in V.index] # create a string list for each POI
pb = pulp.LpProblem('MostLikelyTraj', pulp.LpMaximize) # create problem
# visit_i_j = 1 means POI i and j are visited in sequence
visit_vars = pulp.LpVariable.dicts('visit', (pois, pois), 0, 1, pulp.LpInteger)
# isend_l = 1 means POI l is the END POI of trajectory
isend_vars = pulp.LpVariable.dicts('isend', pois, 0, 1, pulp.LpInteger)
# a dictionary contains all dummy variables
dummy_vars = pulp.LpVariable.dicts('u', [x for x in pois if x != p0], 2, M, pulp.LpInteger)
# add objective
objlist = []
if withNodeWeight == True:
objlist.append(alpha * V.loc[int(p0), 'weight'])
    for pi in pois:  # from
        for pj in [y for y in pois if y != p0]:  # to
if withNodeWeight == True:
objlist.append(visit_vars[pi][pj] * (alpha * V.loc[int(pj), 'weight'] + beta * E.loc[int(pi), int(pj)]))
else:
objlist.append(visit_vars[pi][pj] * E.loc[int(pi), int(pj)])
pb += pulp.lpSum(objlist), 'Objective'
# add constraints, each constraint should be in ONE line
pb += pulp.lpSum([visit_vars[pi][pi] for pi in pois]) == 0, 'NoSelfLoops'
pb += pulp.lpSum([visit_vars[p0][pj] for pj in pois]) == 1, 'StartAt_p0'
pb += pulp.lpSum([visit_vars[pi][p0] for pi in pois]) == 0, 'NoIncoming_p0'
pb += pulp.lpSum([visit_vars[pi][pj] for pi in pois for pj in pois]) == L-1, 'Length'
pb += pulp.lpSum([isend_vars[pi] for pi in pois]) == 1, 'OneEnd'
pb += isend_vars[p0] == 0, 'StartNotEnd'
for pk in [x for x in pois if x != p0]:
pb += pulp.lpSum([visit_vars[pi][pk] for pi in pois]) == isend_vars[pk] + \
pulp.lpSum([visit_vars[pk][pj] for pj in pois if pj != p0]), 'ConnectedAt_' + pk
pb += pulp.lpSum([visit_vars[pi][pk] for pi in pois]) <= 1, 'Enter_' + pk + '_AtMostOnce'
pb += pulp.lpSum([visit_vars[pk][pj] for pj in pois if pj != p0]) + isend_vars[pk] <= 1, \
'Leave_' + pk + '_AtMostOnce'
for pi in [x for x in pois if x != p0]:
for pj in [y for y in pois if y != p0]:
pb += dummy_vars[pi] - dummy_vars[pj] + 1 <= (M - 1) * (1 - visit_vars[pi][pj]), \
'SubTourElimination_' + pi + '_' + pj
#pb.writeLP("traj_tmp.lp")
# solve problem: solver should be available in PATH
if USE_GUROBI == True:
gurobi_options = [('TimeLimit', '7200'), ('Threads', str(N_JOBS)), ('NodefileStart', '0.2'), ('Cuts', '2')]
pb.solve(pulp.GUROBI_CMD(path='gurobi_cl', options=gurobi_options)) # GUROBI
else:
pb.solve(pulp.COIN_CMD(path='cbc', options=['-threads', str(N_JOBS), '-strategy', '1', '-maxIt', '2000000']))#CBC
visit_mat = pd.DataFrame(data=np.zeros((len(pois), len(pois)), dtype=np.float), index=pois, columns=pois)
isend_vec = pd.Series(data=np.zeros(len(pois), dtype=np.float), index=pois)
for pi in pois:
isend_vec.loc[pi] = isend_vars[pi].varValue
for pj in pois: visit_mat.loc[pi, pj] = visit_vars[pi][pj].varValue
#visit_mat.to_csv('visit.csv')
# build the recommended trajectory
recseq = [p0]
while True:
pi = recseq[-1]
pj = visit_mat.loc[pi].idxmax()
value = visit_mat.loc[pi, pj]
#print(value, int(round(value)))
#print(recseq)
assert(int(round(value)) == 1)
recseq.append(pj)
if len(recseq) == L:
assert(int(round(isend_vec[pj])) == 1)
#print('===:', recseq, ':====')
return np.asarray([int(x) for x in recseq])
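On small instances the ILP and the Viterbi DP can be cross-checked against each other; for the short toy query above both should return the same simple path. This check assumes the cbc binary is on PATH (or that USE_GUROBI is enabled).
In [ ]:
# The ILP and the DP should agree on the toy instance defined above.
print(find_ILP(toy_V, toy_E, 1, 3))  # expect [1 2 3], matching find_viterbi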
Recommend trajectories by leveraging POI-POI transition probabilities, i.e. recommend the trajectory that maximises the transition likelihood.
Leave-one-out cross-validation; there are NO hyper-parameters to tune.
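gen_poi_logtransmat is defined earlier in the notebook; as a reminder of its interface only, the hypothetical stand-in below (an assumption-laden sketch, not the notebook's actual estimator) produces what the inference routines expect: a POI-by-POI DataFrame of log10 transition scores.
In [ ]:
# Hypothetical stand-in for gen_poi_logtransmat, for illustration only:
# add-one smoothed empirical transition counts, row-normalised, then log10.
def toy_poi_logtransmat(trajid_set, poi_set, traj_dict):
    pois = sorted(poi_set)
    cnt = pd.DataFrame(1.0, index=pois, columns=pois)  # add-one smoothing
    for tid in trajid_set:
        seq = traj_dict[tid]
        for a, b in zip(seq[:-1], seq[1:]):
            if a in poi_set and b in poi_set and a != b:
                cnt.loc[a, b] += 1
    prob = cnt.div(cnt.sum(axis=1), axis=0)  # each row sums to one
    return np.log10(prob)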
In [51]:
if run_tran == True:
    recdict_tran_DP = dict()
    recdict_tran_ILP = dict()
    cnt = 1
    keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())
    # outer loop to evaluate the test performance by cross validation
    for i in range(len(keys)):
        ps, L = keys[i]
        # train model using all examples in the training+validation set and measure performance on the test set
        trajid_set_train = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
        poi_info = dat_obj.calc_poi_info(list(trajid_set_train))
        if ps not in poi_info.index: continue
        print('\n--------------- %d/%d, Query: (%d, %d) ---------------\n' % (cnt, len(keys), ps, L))
        poi_logtransmat = gen_poi_logtransmat(trajid_set_train, set(poi_info.index), poi_info, dat_obj)
        edges = poi_logtransmat
        recdict_tran_DP[(ps, L)] = {'PRED': find_viterbi(poi_info.copy(), edges.copy(), ps, L)}
        recdict_tran_ILP[(ps, L)] = {'PRED': find_ILP(poi_info, edges, ps, L)}
        # the recommended sequence must not visit any POI twice
        assert(len(recdict_tran_ILP[(ps, L)]['PRED']) == len(set(recdict_tran_ILP[(ps, L)]['PRED'])))
        cnt += 1  #print_progress(cnt, len(keys)); sys.stdout.flush()
In [52]:
if run_tran == True:
    F1_DP = []; pF1_DP = []; Tau_DP = []
    F1_ILP = []; pF1_ILP = []; Tau_ILP = []
    for key in sorted(recdict_tran_DP.keys()):
        F1, pF1, tau = evaluate(recdict_tran_DP[key]['PRED'], dat_obj.TRAJID_GROUP_DICT[key])
        F1_DP.append(F1); pF1_DP.append(pF1); Tau_DP.append(tau)
        assert(key in recdict_tran_ILP)
        F1, pF1, tau = evaluate(recdict_tran_ILP[key]['PRED'], dat_obj.TRAJID_GROUP_DICT[key])
        F1_ILP.append(F1); pF1_ILP.append(pF1); Tau_ILP.append(tau)
    # each pair below is (mean, standard error of the mean)
    print('Transition Viterbi: F1 (%.3f, %.3f), pairsF1 (%.3f, %.3f), Tau (%.3f, %.3f)' % \
          (np.mean(F1_DP), np.std(F1_DP)/np.sqrt(len(F1_DP)), \
           np.mean(pF1_DP), np.std(pF1_DP)/np.sqrt(len(pF1_DP)), \
           np.mean(Tau_DP), np.std(Tau_DP)/np.sqrt(len(Tau_DP))))
    print('Transition ILP: F1 (%.3f, %.3f), pairsF1 (%.3f, %.3f), Tau (%.3f, %.3f)' % \
          (np.mean(F1_ILP), np.std(F1_ILP)/np.sqrt(len(F1_ILP)), \
           np.mean(pF1_ILP), np.std(pF1_ILP)/np.sqrt(len(pF1_ILP)), \
           np.mean(Tau_ILP), np.std(Tau_ILP)/np.sqrt(len(Tau_ILP))))
In [53]:
if run_tran == True:
    fDP = os.path.join(data_dir, 'tranDP-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
    fILP = os.path.join(data_dir, 'tranILP-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
    pickle.dump(recdict_tran_DP, open(fDP, 'bw'))
    pickle.dump(recdict_tran_ILP, open(fILP, 'bw'))
In [54]:
if run_tran_topk == True:
    recdict_tran = dict()
    cnt = 1
    keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())
    # outer loop to evaluate the test performance by cross validation
    for i in range(len(keys)):
        ps, L = keys[i]
        keys_cv = keys[:i] + keys[i+1:]
        # use all training+validation set to compute POI features,
        # make sure features do NOT change between training and validation
        trajid_set_i = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
        poi_info_i = dat_obj.calc_poi_info(list(trajid_set_i))
        poi_set_i = {p for tid in trajid_set_i for p in dat_obj.traj_dict[tid] if len(dat_obj.traj_dict[tid]) >= 2}
        if ps not in poi_set_i:
            sys.stderr.write('start POI of query %s does not exist in training set.\n' % str(keys[i]))
            continue
        print('\n--------------- %d/%d, Query: (%d, %d) ---------------\n' % (cnt, len(keys), ps, L))
        sys.stdout.flush()
        poi_logtransmat = gen_poi_logtransmat(trajid_set_i, set(poi_info_i.index), poi_info_i, dat_obj)
        poi_list_tran = poi_info_i.index.tolist()
        poiid_dict_tran = {poi: ix for ix, poi in enumerate(poi_list_tran)}
        # do_inference_list_viterbi(int ps, int L, int M,
        #                           np.ndarray[dtype=np.float64_t, ndim=2] unary_params,
        #                           np.ndarray[dtype=np.float64_t, ndim=3] pw_params,
        #                           np.ndarray[dtype=np.float64_t, ndim=2] unary_features,
        #                           np.ndarray[dtype=np.float64_t, ndim=3] pw_features,
        #                           y_true=None, y_true_list=None, int top=10, path4train=False, DIVERSITY=True)
        # zero unary parameters and all-ones pairwise parameters reduce the score
        # of a trajectory to the sum of its log transition probabilities
        unary_params = np.zeros((len(poi_list_tran), 1), dtype=np.float64)
        pw_params = np.ones((len(poi_list_tran), len(poi_list_tran), 1), dtype=np.float64)
        unary_features = np.zeros((len(poi_list_tran), 1), dtype=np.float64)
        pw_features = poi_logtransmat.values.reshape(poi_logtransmat.shape[0], poi_logtransmat.shape[1], 1)
        top10 = inf_LVA(poiid_dict_tran[ps], L, len(poi_list_tran), unary_params, pw_params, \
                        unary_features, pw_features, DIVERSITY=False)
        print(top10)
        recdict_tran[(ps, L)] = {'PRED': [[poi_list_tran[pi] for pi in t[0]] for t in top10]}
        cnt += 1
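Because the unary parameters are all zeros and the pairwise parameters all ones, the structured score of a trajectory collapses to the sum of its log transition probabilities, so inf_LVA enumerates the most probable Markov trajectories. A hedged demonstration on the toy instance from above, assuming inf_LVA follows the signature quoted in the comment and that each returned element carries the trajectory (a sequence of POI positions) as its first component, as in the cell above:
In [ ]:
# List Viterbi on the toy instance: zero unary parameters, all-ones pairwise
# parameters, toy_E as the only pairwise feature. POI ids map to positions 0-3.
toy_M = len(toy_pois)
toy_top = inf_LVA(0, 3, toy_M,
                  np.zeros((toy_M, 1)), np.ones((toy_M, toy_M, 1)),
                  np.zeros((toy_M, 1)), toy_E.values.reshape(toy_M, toy_M, 1),
                  top=3, DIVERSITY=False)
print([[toy_pois[pi] for pi in t[0]] for t in toy_top])  # best should be [1, 2, 3]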
In [ ]:
if run_tran_topk == True:
    print(len(recdict_tran))
    print('Markov:')
    topk = [1, 3, 5, 10]
    # evaluate using only the top-k of the 10 recommendations for each query
    recdicts = [{q: {'PRED': recdict_tran[q]['PRED'][:k]} for q in recdict_tran.keys()} for k in topk]
    for j in range(len(topk)): do_evaluation(dat_obj, recdicts[j])
In [ ]:
if run_tran_topk == True:
    ftran = os.path.join(data_dir, 'markov10-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
    pickle.dump(recdict_tran, open(ftran, 'bw'))
In [ ]:
comb_methods = [find_viterbi, find_ILP]
methods_str = ['DP', 'ILP']
In [ ]:
method_ix = 0  # index into comb_methods: 0 for 'DP' (find_viterbi), 1 for 'ILP' (find_ILP)
Nested cross-validation: leave-one-query-out as the outer loop, Monte-Carlo cross-validation as the inner loop (see the sketch below).
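Before the full implementation, a comment-style sketch of the control flow; names follow the cell below.
In [ ]:
# Control-flow sketch of the nested CV below (not executed):
# for each test query (ps, L):                  # outer leave-one-query-out loop
#     for rank_C in C_SET:
#         for alpha in ALPHA_SET:               # joint hyper-parameter grid
#             repeat MC_NITER times:            # inner Monte-Carlo CV
#                 hold out MC_PORTION of the remaining queries for validation
#                 train RankSVM(C=rank_C); combine node scores and transition
#                 scores with weights alpha / (1 - alpha); record Kendall's tau
#     keep the (C, alpha) with the best mean tau, retrain on everything except
#     the test query, and recommend a trajectory for (ps, L)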
In [ ]:
if run_comb == True:
    recdict_comb = dict()
    cnt = 1
    keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())
    inference_fun = comb_methods[method_ix]
    # outer loop to evaluate the test performance by cross validation
    for i in range(len(keys)):
        ps, L = keys[i]
        best_C = 1
        best_alpha = 0.5
        #best_F1 = 0; best_pF1 = 0
        best_Tau = 0
        keys_cv = keys[:i] + keys[i+1:]
        # use all training+validation set to compute POI features,
        # make sure features do NOT change between training and validation
        trajid_set_i = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
        poi_info_i = dat_obj.calc_poi_info(list(trajid_set_i))
        # tune the regularisation constant C and the trade-off parameter alpha jointly
        for rank_C in C_SET:
            for alpha in ALPHA_SET:
                print('\n--------------- try_C: %.3f, try_alpha: %.3f ---------------\n' % (rank_C, alpha))
                sys.stdout.flush()
                F1_comb = []; pF1_comb = []; Tau_comb = []
                # inner loop: score this (C, alpha) pair by Monte-Carlo cross-validation
                for _ in range(MC_NITER):
                    poi_list = []
                    while True:  # make sure the start POIs of the validation queries also appear in the training set
                        rand_ix = np.arange(len(keys_cv)); np.random.shuffle(rand_ix)
                        test_ix = rand_ix[:int(MC_PORTION*len(rand_ix))]
                        assert(len(test_ix) > 0)
                        trajid_set_train = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
                        for j in test_ix:
                            trajid_set_train = trajid_set_train - dat_obj.TRAJID_GROUP_DICT[keys_cv[j]]
                        poi_set = set()
                        for tid in trajid_set_train: poi_set = poi_set | set(dat_obj.traj_dict[tid])
                        good_partition = True
                        for j in test_ix:
                            if keys_cv[j][0] not in poi_set: good_partition = False; break
                        if good_partition == True:
                            poi_list = sorted(poi_set)
                            break
                    # train
                    train_df = gen_train_df(list(trajid_set_train), dat_obj.traj_dict, poi_info_i.loc[poi_list].copy(),
                                            poi_clusters=POI_CLUSTERS, cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST,
                                            n_jobs=N_JOBS)
                    ranksvm = RankSVM(ranksvm_dir, useLinear=True)
                    ranksvm.train(train_df, cost=rank_C)
                    poi_logtransmat = gen_poi_logtransmat(trajid_set_train, set(poi_list),
                                                          poi_info_i.loc[poi_list].copy(), dat_obj)
                    edges = poi_logtransmat
                    # validate on the held-out queries
                    for j in test_ix:
                        ps_cv, L_cv = keys_cv[j]
                        test_df = gen_test_df(ps_cv, L_cv, poi_info_i.loc[poi_list].copy(), poi_clusters=POI_CLUSTERS, \
                                              cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
                        rank_df = ranksvm.predict(test_df, probability=True)
                        nodes = rank_df.copy()
                        nodes['weight'] = np.log10(nodes['probability'])
                        y_hat = inference_fun(nodes, edges.copy(), ps_cv, L_cv, withNodeWeight=True, alpha=alpha)
                        F1, pF1, tau = evaluate(y_hat, dat_obj.TRAJID_GROUP_DICT[keys_cv[j]])
                        F1_comb.append(F1); pF1_comb.append(pF1); Tau_comb.append(tau)
                #mean_F1 = np.mean(F1_comb); mean_pF1 = np.mean(pF1_comb)
                mean_Tau = np.mean(Tau_comb)
                print('mean_Tau: %.3f' % mean_Tau)
                if mean_Tau > best_Tau:
                    best_Tau = mean_Tau
                    best_C = rank_C
                    best_alpha = alpha
        print('\n--------------- %d/%d, Query: (%d, %d), Best_C: %.3f, Best_alpha: %.3f ---------------\n' % \
              (cnt, len(keys), ps, L, best_C, best_alpha))
        sys.stdout.flush()
        # train model using all examples in the training set and measure performance on the test set
        train_df = gen_train_df(list(trajid_set_i), dat_obj.traj_dict, poi_info_i.copy(), poi_clusters=POI_CLUSTERS, \
                                cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST, n_jobs=N_JOBS)
        ranksvm = RankSVM(ranksvm_dir, useLinear=True)
        ranksvm.train(train_df, cost=best_C)
        test_df = gen_test_df(ps, L, poi_info_i.copy(), poi_clusters=POI_CLUSTERS, \
                              cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
        rank_df = ranksvm.predict(test_df, probability=True)
        nodes = rank_df.copy()
        nodes['weight'] = np.log10(nodes['probability'])
        poi_logtransmat = gen_poi_logtransmat(trajid_set_i, set(poi_info_i.index), poi_info_i, dat_obj)
        edges = poi_logtransmat
        y_hat = inference_fun(nodes, edges, ps, L, withNodeWeight=True, alpha=best_alpha)
        recdict_comb[(ps, L)] = {'PRED': y_hat, 'C': best_C, 'alpha': best_alpha}
        cnt += 1  #print_progress(cnt, len(keys)); sys.stdout.flush()
In [ ]:
if run_comb == True:
    F1_comb = []; pF1_comb = []; Tau_comb = []
    for key in sorted(recdict_comb.keys()):
        F1, pF1, tau = evaluate(recdict_comb[key]['PRED'], dat_obj.TRAJID_GROUP_DICT[key])
        F1_comb.append(F1); pF1_comb.append(pF1); Tau_comb.append(tau)
    # each pair below is (mean, standard error of the mean)
    print('Comb ' + methods_str[method_ix] + ': F1 (%.3f, %.3f), pairsF1 (%.3f, %.3f), Tau (%.3f, %.3f)' % \
          (np.mean(F1_comb), np.std(F1_comb)/np.sqrt(len(F1_comb)), \
           np.mean(pF1_comb), np.std(pF1_comb)/np.sqrt(len(pF1_comb)), \
           np.mean(Tau_comb), np.std(Tau_comb)/np.sqrt(len(Tau_comb))))
In [ ]:
if run_comb == True:
    fcomb = os.path.join(data_dir, 'comb' + methods_str[method_ix] + '-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
    pickle.dump(recdict_comb, open(fcomb, 'bw'))