Trajectory Recommendation - Baselines


In [1]:
#%matplotlib inline

import os, sys, time, pickle, tempfile
import math, random, itertools
import pandas as pd
import numpy as np
from scipy.linalg import kron
from scipy.optimize import minimize

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler

from joblib import Parallel, delayed
import cython
import pulp

In [2]:
sys.path.append('src')

In [3]:
from shared import TrajData, evaluate, do_evaluation, DF_COLUMNS, LOG_SMALL, LOG_ZERO
from ssvm import calc_node_features  # used by PW_RANK in Section 2.4
from inference_lv import do_inference_list_viterbi as inf_LVA

In [4]:
derived_features = ['trajLen', 'sameCatStart', 'distStart', 'diffPopStart', 'diffNVisitStart', 
                    'diffDurationStart', 'sameNeighbourhoodStart']

In [5]:
random.seed(1234567890)
np.random.seed(1234567890)
ranksvm_dir = '$HOME/work/ranksvm'  # directory that contains rankSVM binaries: train, predict, svm-scale

In [6]:
dat_ix = 3
data_dir = 'data/data-new'

In [7]:
dat_obj = TrajData(dat_ix, data_dir=data_dir)

Hyperparameters.


In [8]:
N_JOBS = 6         # number of parallel jobs
USE_GUROBI = False # whether to use GUROBI as ILP solver
ABS_SCALER = False  # feature scaling: True for MaxAbsScaler, False for MinMaxScaler (StandardScaler is another option)
C_SET = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000, 3000]  # regularisation parameter
ALPHA_SET = [0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99]  # trade-off parameter
MC_PORTION = 0.1   # portion of data sampled by Monte-Carlo cross-validation
MC_NITER = 5       # number of iterations for Monte-Carlo cross-validation

Method switches.


In [9]:
run_rand = False
run_rand_topk = False
run_rank = False
run_rank_topk = False
run_logreg = False
run_linreg = False
run_pwr = False
run_tran = False
run_tran_topk = True
run_comb = False

1. Random Guessing


In [10]:
if run_rand == True:
    recdict_rand = dict()
    cnt = 1
    keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())

    # outer loop to evaluate the test performance by cross validation
    for i in range(len(keys)):
        ps, L = keys[i]

        # train model using all examples in training set and measure performance on test set
        trajid_set_train = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
        poi_set = {poi for tid in trajid_set_train for poi in dat_obj.traj_dict[tid]}
        if ps not in poi_set: continue
            
        poi_list = sorted(poi_set - {ps})  # exclude the start POI so it is not recommended twice
        np.random.shuffle(poi_list)
        recdict_rand[(ps, L)] = {'PRED': [[ps] + poi_list[:L-1]]}
       
        #print_progress(cnt, len(keys)); cnt += 1; sys.stdout.flush()
    print()

In [11]:
if run_rand == True:
    do_evaluation(dat_obj, recdict_rand)

In [12]:
if run_rand == True:
    frand = os.path.join(data_dir, 'rand-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
    pickle.dump(recdict_rand, open(frand, 'bw'))

In [13]:
if run_rand_topk == True:
    recdict_rand = dict()
    cnt = 1
    keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())

    # outer loop to evaluate the test performance by cross validation
    for i in range(len(keys)):
        ps, L = keys[i]

        # train model using all examples in training set and measure performance on test set
        trajid_set_train = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
        poi_set = {poi for tid in trajid_set_train for poi in dat_obj.traj_dict[tid]}
        if ps not in poi_set: continue
            
        poi_list = sorted(poi_set - set([ps]))
        results = []
        for k in range(10):
            np.random.shuffle(poi_list)
            results.append([ps] + poi_list[:L-1])
        recdict_rand[(ps, L)] = {'PRED': results}
       
        #print_progress(cnt, len(keys)); cnt += 1; sys.stdout.flush()
    #print()

In [14]:
if run_rand_topk == True:
    do_evaluation(dat_obj, recdict_rand)
    frand = os.path.join(data_dir, 'rand10-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
    pickle.dump(recdict_rand, open(frand, 'bw'))

2. POI Scoring

Query-based Leave-one-out Evaluation

Group trajectories according to queries; hold each group out for testing in turn and use all other groups for training.

The performance of a prediction $\bf \bar{y}$ given a query $\bf x$ is the maximum (or average) similarity score between $\bf \bar{y}$ and every trajectory $\bf y$ in the test set that conforms to $\bf x$.
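For concreteness, a minimal sketch of this scoring scheme (similarity is a hypothetical per-pair similarity function, e.g. F1 over visited POIs):

    def query_score(y_pred, y_group, similarity, use_max=True):
        """Score a prediction against all ground-truth trajectories of one query."""
        scores = [similarity(y_pred, y) for y in y_group]
        return max(scores) if use_max else sum(scores) / len(scores)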

2.1 Pairwise POI ranking using RankSVM

2.1.1 Training DataFrame

Training data are generated as follows:

  1. each input tuple $(\text{startPOI}, \text{#POI})$ forms a query (in IR terminology).
  2. the label of a specific POI is the number of times that POI appears in the set of trajectories grouped by a specific query, excluding appearances as $\text{startPOI}$; all POIs absent from the trajectories of that query get a label of 0 (see the sketch below).

The dimensions of the training data matrix are #(qid, poi) by #feature.
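A minimal sketch of the label counting in step 2 (function name hypothetical):

    from collections import Counter

    def poi_labels(trajs_of_query):
        """Count POI occurrences over all trajectories of one query,
        excluding each trajectory's startPOI; absent POIs keep label 0."""
        labels = Counter()
        for t in trajs_of_query:
            labels.update(t[1:])  # skip the startPOI t[0]
        return labels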


In [15]:
def gen_train_subdf(poi_id, poi_info, query_id_set, query_id_rdict, dat_obj):
    assert(type(dat_obj) == TrajData)
    
    columns = DF_COLUMNS
    poi_distmat = dat_obj.POI_DISTMAT
    poi_clusters = dat_obj.POI_CLUSTERS
    cats = dat_obj.POI_CAT_LIST
    clusters = dat_obj.POI_CLUSTER_LIST
    
    df_ = pd.DataFrame(index=np.arange(len(query_id_set)), columns=columns)
    
    pop, nvisit = poi_info.loc[poi_id, 'popularity'], poi_info.loc[poi_id, 'nVisit']
    cat, cluster = poi_info.loc[poi_id, 'poiCat'], poi_clusters.loc[poi_id, 'clusterID'] 
    duration = poi_info.loc[poi_id, 'avgDuration']
    
    for j in range(len(query_id_set)):
        qid = query_id_set[j]
        assert(qid in query_id_rdict) # qid --> (start, end, length)
        (p0, trajLen) = query_id_rdict[qid]
        idx = df_.index[j]
        df_.loc[idx, 'poiID'] = poi_id
        df_.loc[idx, 'queryID'] = qid
        df_.at[idx, 'category'] = tuple((cat == np.array(cats)).astype(int) * 2 - 1)
        df_.at[idx, 'neighbourhood'] = tuple((cluster == np.array(clusters)).astype(int) * 2 - 1)
        df_.loc[idx, 'popularity'] = LOG_SMALL if pop < 1 else np.log10(pop)
        df_.loc[idx, 'nVisit'] = LOG_SMALL if nvisit < 1 else np.log10(nvisit)
        df_.loc[idx, 'avgDuration'] = LOG_SMALL if duration < 1 else np.log10(duration)
        df_.loc[idx, 'trajLen'] = trajLen
        df_.loc[idx, 'sameCatStart'] = 1 if cat == poi_info.loc[p0, 'poiCat'] else -1
        df_.loc[idx, 'distStart'] = poi_distmat.loc[poi_id, p0]
        df_.loc[idx, 'diffPopStart'] = pop - poi_info.loc[p0, 'popularity']
        df_.loc[idx, 'diffNVisitStart'] = nvisit - poi_info.loc[p0, 'nVisit']
        df_.loc[idx, 'diffDurationStart'] = duration - poi_info.loc[p0, 'avgDuration']
        df_.loc[idx, 'sameNeighbourhoodStart'] = 1 if cluster == poi_clusters.loc[p0, 'clusterID'] else -1
        
    return df_

In [16]:
def gen_train_df(trajid_list, poi_info, dat_obj, n_jobs=-1):    
    assert(type(dat_obj) == TrajData)
    
    columns = DF_COLUMNS
    query_id_dict = dat_obj.QUERY_ID_DICT
    train_trajs = [dat_obj.traj_dict[x] for x in trajid_list if len(dat_obj.traj_dict[x]) >= 2]
    qid_set = sorted(set([query_id_dict[(t[0], len(t))] for t in train_trajs]))
    poi_set = {poi for tr in train_trajs for poi in tr}
    #M = len(poi_set)
    
    query_id_rdict = dict()
    for k, v in query_id_dict.items(): 
        query_id_rdict[v] = k  # qid --> (start, length)
    
    train_df_list = Parallel(n_jobs=n_jobs)\
                    (delayed(gen_train_subdf)(poi, poi_info, qid_set, query_id_rdict, dat_obj) for poi in poi_set)
                        
    assert(len(train_df_list) > 0)
    df_ = pd.concat(train_df_list, ignore_index=True)
        
    # set label
    df_.set_index(['queryID', 'poiID'], inplace=True)
    df_['label'] = 0
    for t in train_trajs:
        qid = query_id_dict[(t[0], len(t))]
        for poi in t[1:]:  # do NOT count if the POI is startPOI/endPOI
            df_.loc[(qid, poi), 'label'] += 1
        #for j in range(1, len(t)):
            #poi = t[j]; df_.loc[(qid, poi), 'label'] += M - j  # add the rank of this POI

    df_.reset_index(inplace=True)
    return df_

In [17]:
def gen_train_df2(trajid_list, poi_info, dat_obj, n_jobs=-1):    
    assert(type(dat_obj) == TrajData)
    
    columns = DF_COLUMNS
    poi_distmat = dat_obj.POI_DISTMAT
    poi_clusters = dat_obj.POI_CLUSTERS
    cats = dat_obj.POI_CAT_LIST
    clusters = dat_obj.POI_CLUSTER_LIST
    
    train_trajs = [dat_obj.traj_dict[x] for x in trajid_list if len(dat_obj.traj_dict[x]) >= 2]
    poi_set = {poi for tr in train_trajs for poi in tr}
    M = len(poi_set)
     
    df_ = pd.DataFrame(columns=columns)
    idx = 0
    for q in sorted(dat_obj.TRAJID_GROUP_DICT.keys()):
        for tid in sorted(dat_obj.TRAJID_GROUP_DICT[q]):
            if tid not in trajid_list: continue
            tr = dat_obj.traj_dict[tid]
            trajLen = len(tr)
            p0 = tr[0]
            assert(len(tr) >= 2)
            for j in range(len(tr)):
                poi_id = tr[j]
                pop, nvisit = poi_info.loc[poi_id, 'popularity'], poi_info.loc[poi_id, 'nVisit']
                cat, cluster = poi_info.loc[poi_id, 'poiCat'], poi_clusters.loc[poi_id, 'clusterID'] 
                duration = poi_info.loc[poi_id, 'avgDuration']
                
                df_.loc[idx, 'poiID'] = poi_id
                df_.loc[idx, 'queryID'] = dat_obj.QUERY_ID_DICT[q]
                df_.at[idx, 'category'] = tuple((cat == np.array(cats)).astype(int) * 2 - 1)
                df_.at[idx, 'neighbourhood'] = tuple((cluster == np.array(clusters)).astype(int) * 2 - 1)
                df_.loc[idx, 'popularity'] = LOG_SMALL if pop < 1 else np.log10(pop)
                df_.loc[idx, 'nVisit'] = LOG_SMALL if nvisit < 1 else np.log10(nvisit)
                df_.loc[idx, 'avgDuration'] = LOG_SMALL if duration < 1 else np.log10(duration)
                df_.loc[idx, 'trajLen'] = trajLen
                df_.loc[idx, 'sameCatStart'] = 1 if cat == poi_info.loc[p0, 'poiCat'] else -1
                df_.loc[idx, 'distStart'] = poi_distmat.loc[poi_id, p0]
                df_.loc[idx, 'diffPopStart'] = pop - poi_info.loc[p0, 'popularity']
                df_.loc[idx, 'diffNVisitStart'] = nvisit - poi_info.loc[p0, 'nVisit']
                df_.loc[idx, 'diffDurationStart'] = duration - poi_info.loc[p0, 'avgDuration']
                df_.loc[idx, 'sameNeighbourhoodStart'] = 1 if cluster == poi_clusters.loc[p0, 'clusterID'] else -1
                df_.loc[idx, 'label'] = M - j
                
                idx += 1
    return df_

2.1.2 Test DataFrame

Test data are generated the same way as training data, except that the labels of test data (unknown) can be arbitrary values, as suggested in the libsvm FAQ. The accuracy reported by the svm-predict command is therefore meaningless, as it is computed against these arbitrary labels.

The dimensions of the test data matrix are #poi by #feature for one specific query, i.e. a tuple $(\text{startPOI}, \text{#POI})$.


In [18]:
def gen_test_df(startPOI, nPOI, poi_info, dat_obj):
    assert(type(dat_obj) == TrajData)
    
    columns = DF_COLUMNS
    poi_distmat = dat_obj.POI_DISTMAT
    poi_clusters = dat_obj.POI_CLUSTERS
    cats = dat_obj.POI_CAT_LIST
    clusters = dat_obj.POI_CLUSTER_LIST
    query_id_dict = dat_obj.QUERY_ID_DICT
    key = (p0, trajLen) = (startPOI, nPOI)
    assert(key in query_id_dict)
    assert(p0 in poi_info.index)
    
    df_ = pd.DataFrame(index=np.arange(poi_info.shape[0]), columns=columns)
    poi_list = sorted(poi_info.index)
    
    qid = query_id_dict[key]
    df_['queryID'] = qid
    df_['label'] = np.random.rand(df_.shape[0]) # label for test data is arbitrary according to libsvm FAQ

    for i in range(df_.index.shape[0]):
        poi = poi_list[i]
        lon, lat = poi_info.loc[poi, 'poiLon'], poi_info.loc[poi, 'poiLat']
        pop, nvisit = poi_info.loc[poi, 'popularity'], poi_info.loc[poi, 'nVisit']
        cat, cluster = poi_info.loc[poi, 'poiCat'], poi_clusters.loc[poi, 'clusterID']
        duration = poi_info.loc[poi, 'avgDuration']
        idx = df_.index[i]
        df_.loc[idx, 'poiID'] = poi
        df_.at[idx, 'category'] = tuple((cat == np.array(cats)).astype(int) * 2 - 1)
        df_.at[idx, 'neighbourhood'] = tuple((cluster == np.array(clusters)).astype(int) * 2 - 1)
        df_.loc[idx, 'popularity'] = LOG_SMALL if pop < 1 else np.log10(pop)
        df_.loc[idx, 'nVisit'] = LOG_SMALL if nvisit < 1 else np.log10(nvisit)
        df_.loc[idx, 'avgDuration'] = LOG_SMALL if duration < 1 else np.log10(duration)
        df_.loc[idx, 'trajLen'] = trajLen
        df_.loc[idx, 'sameCatStart'] = 1 if cat == poi_info.loc[p0, 'poiCat'] else -1
        df_.loc[idx, 'distStart'] = poi_distmat.loc[poi, p0]
        df_.loc[idx, 'diffPopStart'] = pop - poi_info.loc[p0, 'popularity']
        df_.loc[idx, 'diffNVisitStart'] = nvisit - poi_info.loc[p0, 'nVisit']
        df_.loc[idx, 'diffDurationStart'] = duration - poi_info.loc[p0, 'avgDuration']
        df_.loc[idx, 'sameNeighbourhoodStart'] = 1 if cluster == poi_clusters.loc[p0, 'clusterID'] else -1

    return df_

Generate a string for a training/test data frame.


In [19]:
def gen_data_str(df_, df_columns=DF_COLUMNS):
    for col in df_columns:
        assert(col in df_.columns)
        
    lines = []
    for idx in df_.index:
        slist = [str(df_.loc[idx, 'label'])]
        slist.append(' qid:')
        slist.append(str(int(df_.loc[idx, 'queryID'])))
        fid = 1
        for j in range(3, len(df_columns)):
            values_ = df_.at[idx, df_columns[j]]
            values_ = values_ if isinstance(values_, tuple) else [values_]
            for v in values_:
                slist.append(' ')
                slist.append(str(fid)); fid += 1
                slist.append(':')
                slist.append(str(v))
        slist.append('\n')
        lines.append(''.join(slist))
    return ''.join(lines)
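Each line produced by gen_data_str follows the SVMlight/rankSVM input format, <label> qid:<queryID> <featureID>:<value> ...; the tuple-valued category and neighbourhood features are unrolled into consecutive feature IDs. For example (values illustrative):

    2 qid:7 1:1 2:-1 3:-1 4:0.301 5:1.146 6:0.845 7:3 8:-1 9:0.972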

2.1.3 Ranking POIs using rankSVM

Here the rankSVM implementation can be either liblinear-ranksvm or libsvm-ranksvm; see README.ranksvm in the zip file for installation instructions.

Use the softmax function to convert ranking scores to a probability distribution.


In [20]:
def softmax(x):
    x1 = x.copy()
    x1 -= np.max(x1)  # numerically more stable, REF: http://cs231n.github.io/linear-classify/#softmax
    expx = np.exp(x1)
    return expx / np.sum(expx, axis=0) # column-wise sum
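For example (values rounded):

    softmax(np.array([1.0, 2.0, 3.0]))  # -> array([0.090, 0.245, 0.665])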

Below is a Python wrapper of the svm-train (or train) and svm-predict (or predict) commands of rankSVM, with ranking probabilities $P(p_i \lvert (p_s, L))$ computed using the softmax function.


In [21]:
# python wrapper of rankSVM
class RankSVM:
    def __init__(self, bin_dir, useLinear=True, debug=False):
        dir_ = !echo $bin_dir  # deal with environmental variables in path
        assert(os.path.exists(dir_[0]))
        self.bin_dir = dir_[0]
        
        self.bin_train = 'svm-train'
        self.bin_predict = 'svm-predict'
        if useLinear:
            self.bin_train = 'train'
            self.bin_predict = 'predict'
        
        assert(isinstance(debug, bool))
        self.debug = debug
        
        # create named tmp files for model and feature scaling parameters
        self.fmodel = None
        self.fscale = None
        with tempfile.NamedTemporaryFile(delete=False) as fd: 
            self.fmodel = fd.name
        with tempfile.NamedTemporaryFile(delete=False) as fd: 
            self.fscale = fd.name
        
        if self.debug:
            print('model file:', self.fmodel)
            print('feature scaling parameter file:', self.fscale)
    
    
    def __del__(self):
        # remove tmp files
        if self.debug == False:
            if self.fmodel is not None and os.path.exists(self.fmodel):
                os.unlink(self.fmodel)
            if self.fscale is not None and os.path.exists(self.fscale):
                os.unlink(self.fscale)

    
    def train(self, train_df, cost=1):
        # cost is parameter C in SVM
        # write train data to file
        ftrain = None
        with tempfile.NamedTemporaryFile(mode='w+t', delete=False) as fd: 
            ftrain = fd.name
            datastr = gen_data_str(train_df)
            fd.write(datastr)
        
        # feature scaling
        ftrain_scaled = None
        with tempfile.NamedTemporaryFile(mode='w+t', delete=False) as fd: 
            ftrain_scaled = fd.name
        result = !$self.bin_dir/svm-scale -s $self.fscale $ftrain > $ftrain_scaled
        
        if self.debug:
            print('cost:', cost)
            print('train data file:', ftrain)
            print('feature scaled train data file:', ftrain_scaled)
        
        # train rank svm and generate model file, if the model file exists, rewrite it
        result = !$self.bin_dir/$self.bin_train -c $cost $ftrain_scaled $self.fmodel
        if self.debug:
            print('Training finished.')
            for i in range(len(result)): print(result[i])
                
        # load model parameters
        w = []
        header = 5
        with open(self.fmodel, 'r') as f:
            for j in range(header): _ = f.readline()
            for line in f: w.append(float(line.strip()))
        self.w = np.array(w)

        # remove train data file
        if self.debug == False:
            os.unlink(ftrain)
            os.unlink(ftrain_scaled)        

    
    def predict(self, test_df, probability=False):
        # predict ranking scores for the given feature matrix
        if self.fmodel is None or not os.path.exists(self.fmodel):
            print('Model should be trained before prediction')
            return
        
        # write test data to file
        ftest = None
        with tempfile.NamedTemporaryFile(mode='w+t', delete=False) as fd: 
            ftest = fd.name
            datastr = gen_data_str(test_df)
            fd.write(datastr)
                
        # feature scaling
        ftest_scaled = None
        with tempfile.NamedTemporaryFile(delete=False) as fd: 
            ftest_scaled = fd.name
        result = !$self.bin_dir/svm-scale -r $self.fscale $ftest > $ftest_scaled
            
        # generate prediction file
        fpredict = None
        with tempfile.NamedTemporaryFile(delete=False) as fd: 
            fpredict = fd.name
            
        if self.debug:
            print('test data file:', ftest)
            print('feature scaled test data file:', ftest_scaled)
            print('predict result file:', fpredict)
            
        # predict using trained model and write prediction to file
        result = !$self.bin_dir/$self.bin_predict $ftest_scaled $self.fmodel $fpredict
        if self.debug:
            print('Predict result: %-30s  %s' % (result[0], result[1]))
        
        # generate prediction DataFrame from prediction file
        poi_rank_df = pd.read_csv(fpredict, header=None)
        poi_rank_df.rename(columns={0:'rank'}, inplace=True)
        poi_rank_df['poiID'] = test_df['poiID'].astype(np.int)
        poi_rank_df.set_index('poiID', inplace=True)
        if probability == True: poi_rank_df['probability'] = softmax(poi_rank_df['rank'])
        
        # remove test file and prediction file
        if self.debug == False:
            os.unlink(ftest)
            os.unlink(ftest_scaled)
            os.unlink(fpredict)

        return poi_rank_df

Nested cross-validation with Monte-Carlo cross-validation as inner loop.
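Schematically, each inner Monte-Carlo iteration holds out a random portion of the training queries for validation; a minimal sketch of the split logic used in the cell below (function name hypothetical):

    def mc_split(keys_cv, portion=MC_PORTION):
        """One Monte-Carlo iteration: randomly hold out a portion of queries for validation."""
        ix = np.arange(len(keys_cv))
        np.random.shuffle(ix)
        n_test = max(1, int(portion * len(ix)))
        return ix[n_test:], ix[:n_test]  # (training indices, validation indices)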


In [22]:
if run_rank == True:
    recdict_rank = dict()
    recdict_rank_pop = dict()
    cnt = 1
    keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())

    # outer loop to evaluate the test performance by cross validation
    for i in range(len(keys)):
        ps, L = keys[i]

        best_C = 1
        #best_F1 = 0; best_pF1 = 0
        best_Tau = 0
        keys_cv = keys[:i] + keys[i+1:]
        
        # use all training+validation set to compute POI features, 
        # make sure features do NOT change for training and validation
        trajid_set_i = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
        poi_info_i = dat_obj.calc_poi_info(list(trajid_set_i))
        
        poi_set_i = {p for tid in trajid_set_i for p in dat_obj.traj_dict[tid] if len(dat_obj.traj_dict[tid]) >= 2}
        if ps not in poi_set_i: 
            sys.stderr.write('start POI of query %s does not exist in training set.\n' % str(keys[i]))
            continue

        # tune regularisation constant C
        for rank_C in C_SET:
            print('\n--------------- try_C: %.2f ---------------\n' % rank_C); sys.stdout.flush() 
            F1_rank = []; pF1_rank = []; Tau_rank = []        

            # inner loop to evaluate the performance of a model with a specified C by Monte-Carlo cross validation
            for j in range(MC_NITER):
                poi_list = []
                while True: # make sure the start POI in test set are also in training set
                    rand_ix = np.arange(len(keys_cv)); np.random.shuffle(rand_ix)
                    test_ix = rand_ix[:int(MC_PORTION*len(rand_ix))]
                    assert(len(test_ix) > 0)
                    trajid_set_train = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
                    for j in test_ix: 
                        trajid_set_train = trajid_set_train - dat_obj.TRAJID_GROUP_DICT[keys_cv[j]]
                    poi_set = {poi for tid in trajid_set_train for poi in dat_obj.traj_dict[tid]}
                    good_partition = True
                    for j in test_ix: 
                        if keys_cv[j][0] not in poi_set: good_partition = False; break
                    if good_partition == True: 
                        poi_list = sorted(poi_set)
                        break

                # train                
                train_df = gen_train_df(list(trajid_set_train), poi_info_i.loc[poi_list].copy(), dat_obj, n_jobs=N_JOBS)
                ranksvm = RankSVM(ranksvm_dir, useLinear=True)#, debug=True)
                ranksvm.train(train_df, cost=rank_C)
                #sys.exit(0)
                
                # test
                for j in test_ix:  # test
                    ps_cv, L_cv = keys_cv[j]
                    test_df = gen_test_df(ps_cv, L_cv, poi_info_i.loc[poi_list].copy(), dat_obj)
                    rank_df = ranksvm.predict(test_df)
                    rank_df.sort_values(by='rank', ascending=False, inplace=True)
                    y_hat = [ps_cv] + [p for p in rank_df.index.tolist() if p != ps_cv][:L_cv-1]
                    F1, pF1, tau = evaluate(dat_obj, keys_cv[j], [y_hat])
                    F1_rank.append(F1); pF1_rank.append(pF1); Tau_rank.append(tau)
                    
            #mean_F1 = np.mean(F1_rank); mean_pF1 = np.mean(pF1_rank)
            mean_Tau = np.mean(Tau_rank)
            print('mean_Tau: %.3f' % mean_Tau)
            if mean_Tau > best_Tau:
                best_Tau = mean_Tau
                best_C = rank_C
        print('\n--------------- %d/%d, Query: (%d, %d), Best_C: %.2f ---------------\n' % (cnt,len(keys),ps,L,best_C))
        sys.stdout.flush()

        # train model using all examples in training+validation set and measure performance on test set
        train_df = gen_train_df(list(trajid_set_i), poi_info_i.copy(), dat_obj, n_jobs=N_JOBS)
        ranksvm = RankSVM(ranksvm_dir, useLinear=True)
        ranksvm.train(train_df, cost=best_C)
        test_df = gen_test_df(ps, L, poi_info_i, dat_obj)
        rank_df = ranksvm.predict(test_df)
        rank_df.sort_values(by='rank', ascending=False, inplace=True)
        y_hat = [ps] + [p for p in rank_df.index.tolist() if p != ps][:L-1]
        recdict_rank[(ps, L)] = {'PRED': [y_hat], 'C':best_C, 'W': ranksvm.w}
        
        # POI popularity based ranking
        poi_info_i.sort_values(by='popularity', ascending=False, inplace=True)
        y_hat_pop = [ps] + [p for p in poi_info_i.index.tolist() if p != ps][:L-1]        
        recdict_rank_pop[(ps, L)] = {'PRED': [y_hat_pop]}

        cnt += 1; #print_progress(cnt, len(keys)); sys.stdout.flush()

In [23]:
if run_rank == True:
    print(len(recdict_rank))
    print('RankSVM:')
    do_evaluation(dat_obj, recdict_rank)
    print('\nPopularity:')
    do_evaluation(dat_obj, recdict_rank_pop)

In [24]:
if run_rank == True:
    frank = os.path.join(data_dir, 'rank-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
    fpop  = os.path.join(data_dir, 'pop-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
    pickle.dump(recdict_rank, open(frank, 'bw'))
    pickle.dump(recdict_rank_pop, open(fpop, 'bw'))

In [25]:
%load_ext Cython

In [26]:
%%cython
import numpy as np
import heapq as hq
import sys
cimport numpy as np

cdef class HeapItem:  # an item in heapq (min-heap)
    cdef readonly float priority
    cdef readonly object task, string
    
    def __init__(self, float priority, task):
        self.priority = priority
        self.task = task
        self.string = str(priority) + ': ' + str(task)
            
    def __richcmp__(self, other, int op):
        if op == 2: # ==
            return self.priority == other.priority
        elif op == 3: # !=
            return self.priority != other.priority
        elif op == 0: # <
            return self.priority < other.priority
        elif op == 1: # <=
            return self.priority <= other.priority
        elif op == 4: # >
            return self.priority > other.priority
        elif op == 5: # >=
            return self.priority >= other.priority
        else:
            assert False
            
    def __repr__(self):
        return self.string
    
    def __str__(self):
        return self.string


cpdef do_inference_list_viterbi(int ps, int L, int M, np.ndarray[dtype=np.float64_t, ndim=1] poi_scores, int top=10):
    assert(L > 1)
    assert(M >= L)
    assert(ps >= 0)
    assert(ps < M)
    assert(top > 0)
    
    cdef int pi, pj, t, pk, parix, partition_index, partition_index_start, k_partition_index
    cdef long k, maxIter = long(1e6)
    cdef float priority, new_priority
    
    Cu = np.zeros(M, dtype=np.float)
    Cp = np.zeros((M, M), dtype=np.float)
    
    # an intermediate POI should NOT be the start POI, and NO self-loops are allowed
    for pi in range(M):
        Cu[pi] = poi_scores[pi]
        for pj in range(M):
            Cp[pi, pj] = -np.inf if (pj == ps or pi == pj) else 0
            
    # forward-backward procedure: adapted from the Rabiner paper
    Alpha = np.zeros((L, M), dtype=np.float)  # alpha_t(p_i)
    Beta  = np.zeros((L, M), dtype=np.float)  # beta_t(p_i)
    
    for pj in range(M): Alpha[1, pj] = Cp[ps, pj] + Cu[pj]
    for t in range(2, L):
        for pj in range(M): # ps~~pi--pj
            Alpha[t, pj] = np.max([Alpha[t-1, pi] + Cp[pi, pj] + Cu[pj] for pi in range(M)])
    
    for pi in range(M): Beta[L-1, pi] = 0
    for t in range(L-1, 1, -1):
        for pi in range(M): # ps~~pi--pj
            Beta[t-1, pi] = np.max([Cp[pi, pj] + Cu[pj] + Beta[t, pj] for pj in range(M)])
    Beta[0, ps] = np.max([Cp[ps, pj] + Cu[pj] + Beta[1, pj] for pj in range(M)])
    
    Fp = np.zeros((L-1, M, M), dtype=np.float)  # f_{t, t+1}(p, p')
    
    for t in range(L-1):
        for pi in range(M):
            for pj in range(M):
                Fp[t, pi, pj] = Alpha[t, pi] + Cp[pi, pj] + Cu[pj] + Beta[t+1, pj]
                
    # identify the best path/walk: adapted from the IJCAI01 paper
    y_best = np.ones(L, dtype=np.int) * (-1)
    y_best[0] = ps
    y_best[1] = np.argmax(Fp[0, ps, :])  # the start POI is specified
    for t in range(2, L): 
        y_best[t] = np.argmax(Fp[t-1, y_best[t-1], :])
    
    Q = []  # priority queue (min-heap)
        
    # heap item for the best path/walk
    priority = -np.max(Alpha[L-1, :])  # -1 * score as priority
    partition_index = -1
    exclude_set = set()  
    hq.heappush(Q, HeapItem(priority, (y_best, partition_index, exclude_set)))
    
    results = []
    k = 0; y_last = None
    while len(Q) > 0 and k < maxIter:
        hitem = hq.heappop(Q)
        k_priority = hitem.priority
        (k_best, k_partition_index, k_exclude_set) = hitem.task
        k += 1; y_last = k_best
        
        if len(set(k_best)) == L:
            #print(-k_priority)
            results.append(k_best); top -= 1
            if top == 0: return results
    
        # identify the (k+1)-th best path/walk given the 1st, 2nd, ..., k-th best: adapted from the IJCAI01 paper
        partition_index_start = 1
        if k_partition_index > 0:
            assert(k_partition_index < L)
            partition_index_start = k_partition_index

        for parix in range(partition_index_start, L):
            # new_best[:parix]
            new_best = np.ones(L, dtype=np.int) * (-1)  # init all positions to -1, as for y_best above
            new_best[:parix] = k_best[:parix]
            if len(set(new_best[:parix])) < parix: break
            
            # new_best[parix]
            #new_exclude_set = set({k_best[parix]})
            new_exclude_set = set(k_best[:parix+1]) # exclude all visited POIs
            if parix == partition_index_start: new_exclude_set = new_exclude_set | k_exclude_set
            candidate_points = [p for p in range(M) if p not in new_exclude_set]
            if len(candidate_points) == 0: continue
            candidate_maxix = np.argmax([Fp[parix-1, k_best[parix-1], p] for p in candidate_points])
            new_best[parix] = candidate_points[candidate_maxix]
            
            # new_best[parix+1:]
            for pk in range(parix+1, L):
                new_best[pk] = np.argmax([Fp[pk-1, new_best[pk-1], p] for p in range(M)])
                #new_best[pk] = np.argmax([Fp[pk-1, new_best[pk-1], p] for p in range(M) if p not in new_best[:pk]])
                #new_best[pk] = np.argmax([Fp[pk-1, new_best[pk-1], p] for p in range(M) if p not in new_best[:parix+1]])
            
            new_priority = Fp[parix-1, k_best[parix-1], new_best[parix]]
            if k_partition_index > 0:
                new_priority += (-k_priority) - Fp[parix-1, k_best[parix-1], k_best[parix]]
            new_priority *= -1.0   # NOTE: -np.inf - np.inf + np.inf = nan

            #print('push: %s, %d' % (str(new_best), parix))
            
            hq.heappush(Q, HeapItem(new_priority, (new_best, parix, new_exclude_set)))
            
    if k >= maxIter: 
        sys.stderr.write('WARN: reached the max number of iterations, NO optimal solution found, '
                         'returning a suboptimal solution.\n')
    if len(Q) == 0:
        sys.stderr.write('WARN: empty queue, return the last one\n')
    results.append(y_last); top -= 1
    while len(Q) > 0 and top > 0:
        hitem = hq.heappop(Q)
        results.append(hitem.task[0])
        top -= 1
    return results
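A usage sketch of do_inference_list_viterbi (scores illustrative); the cells below call it with rankSVM probabilities and with POI popularities:

    scores = np.log10(np.array([0.4, 0.3, 0.2, 0.1]))  # one log-score per POI index
    top3 = do_inference_list_viterbi(0, 3, 4, scores, top=3)  # top-3 length-3 walks over 4 POIs, starting at POI index 0
    # each element of top3 is an array of POI indices whose first entry is the start POI (index 0)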

In [27]:
if run_rank_topk == True:
    recdict_c = pickle.load(open(os.path.join(data_dir, 'rank-' + dat_obj.dat_suffix[dat_ix] + '.pkl'), 'rb'))
    recdict_rank = dict()
    recdict_rank_pop = dict()
    cnt = 1
    keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())

    # outer loop to evaluate the test performance by cross validation
    for i in range(len(keys)):
        ps, L = keys[i]

        best_C = 1
        #best_F1 = 0; best_pF1 = 0
        best_Tau = 0
        keys_cv = keys[:i] + keys[i+1:]
        
        # use all training+validation set to compute POI features, 
        # make sure features do NOT change for training and validation
        trajid_set_i = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
        poi_info_i = dat_obj.calc_poi_info(list(trajid_set_i))
        
        poi_set_i = {p for tid in trajid_set_i for p in dat_obj.traj_dict[tid] if len(dat_obj.traj_dict[tid]) >= 2}
        if ps not in poi_set_i: 
            sys.stderr.write('start POI of query %s does not exist in training set.\n' % str(keys[i]))
            continue
        
        best_C = recdict_c[(ps, L)]['C']

        print('\n--------------- %d/%d, Query: (%d, %d), Best_C: %.2f ---------------\n' % (cnt,len(keys),ps,L,best_C))
        sys.stdout.flush()

        # train model using all examples in training+validation set and measure performance on test set
        train_df = gen_train_df(list(trajid_set_i), poi_info_i.copy(), dat_obj, n_jobs=N_JOBS)
        ranksvm = RankSVM(ranksvm_dir, useLinear=True)
        ranksvm.train(train_df, cost=best_C)
        test_df = gen_test_df(ps, L, poi_info_i, dat_obj)
        rank_df = ranksvm.predict(test_df, probability=True)
        poi_list_rank = rank_df.index.tolist()
        poiid_dict_rank = {poi: ix for ix, poi in enumerate(poi_list_rank)}
        rank_scores = np.log10(rank_df['probability'].values)
        top10 = do_inference_list_viterbi(poiid_dict_rank[ps], L, len(poi_list_rank), rank_scores)
        recdict_rank[(ps, L)] = {'PRED': [[poi_list_rank[pi] for pi in t] for t in top10], 'C':best_C}
        
        # POI popularity based ranking
        poi_info_i.sort_values(by='popularity', ascending=False, inplace=True)
        poi_list_pop = poi_info_i.index.tolist()
        poiid_dict_pop = {poi: ix for ix, poi in enumerate(poi_list_pop)}
        pop_scores = poi_info_i['popularity'].astype(np.float).values
        top10 = do_inference_list_viterbi(poiid_dict_pop[ps], L, len(poi_list_pop), pop_scores)
        recdict_rank_pop[(ps, L)] = {'PRED': [[poi_list_pop[pi] for pi in t] for t in top10]}
        
        cnt += 1

In [28]:
if run_rank_topk == True:
    print(len(recdict_rank))
    print('RankSVM:')
    topk = [1, 3, 5, 10]
    recdicts = [{q: {'PRED': recdict_rank[q]['PRED'][:k]} for q in recdict_rank.keys()} for k in topk]
    for j in range(len(topk)): do_evaluation(dat_obj, recdicts[j])
    print('\nPopularity:')
    recdicts = [{q: {'PRED': recdict_rank_pop[q]['PRED'][:k]} for q in recdict_rank_pop} for k in topk]
    for j in range(len(topk)): do_evaluation(dat_obj, recdicts[j])

In [29]:
if run_rank_topk == True:
    frank = os.path.join(data_dir, 'rank10-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
    fpop  = os.path.join(data_dir, 'pop10-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
    pickle.dump(recdict_rank, open(frank, 'bw'))
    pickle.dump(recdict_rank_pop, open(fpop, 'bw'))

2.2 POI occurrence prediction

Generate features.


In [30]:
def gen_features(startPOI, nPOI, poi_info, poi_clusters, cats, clusters):
    """
    Generate feature vectors for all POIs given query (startPOI, nPOI)
    """
    assert(isinstance(cats, list))
    assert(isinstance(clusters, list))
    
    columns = DF_COLUMNS[3:]
    poi_distmat = dat_obj.POI_DISTMAT    # dat_obj is the global TrajData instance defined above
    query_id_dict = dat_obj.QUERY_ID_DICT
    key = (p0, trajLen) = (startPOI, nPOI)
    assert(key in query_id_dict)
    assert(p0 in poi_info.index)
    
    df_ = pd.DataFrame(index=np.arange(poi_info.shape[0]), columns=columns)
    poi_list = sorted(poi_info.index)
        
    for i in range(df_.index.shape[0]):
        poi = poi_list[i]
        lon, lat = poi_info.loc[poi, 'poiLon'], poi_info.loc[poi, 'poiLat']
        pop, nvisit = poi_info.loc[poi, 'popularity'], poi_info.loc[poi, 'nVisit']
        cat, cluster = poi_info.loc[poi, 'poiCat'], poi_clusters.loc[poi, 'clusterID']
        duration = poi_info.loc[poi, 'avgDuration']
        idx = df_.index[i]
        df_.at[idx, 'category'] = tuple((cat == np.array(cats)).astype(int) * 2 - 1)
        df_.at[idx, 'neighbourhood'] = tuple((cluster == np.array(clusters)).astype(int) * 2 - 1)
        df_.loc[idx, 'popularity'] = LOG_SMALL if pop < 1 else np.log10(pop)
        df_.loc[idx, 'nVisit'] = LOG_SMALL if nvisit < 1 else np.log10(nvisit)
        df_.loc[idx, 'avgDuration'] = LOG_SMALL if duration < 1 else np.log10(duration)
        df_.loc[idx, 'trajLen'] = trajLen
        df_.loc[idx, 'sameCatStart'] = 1 if cat == poi_info.loc[p0, 'poiCat'] else -1
        df_.loc[idx, 'distStart'] = poi_distmat.loc[poi, p0]
        df_.loc[idx, 'diffPopStart'] = pop - poi_info.loc[p0, 'popularity']
        df_.loc[idx, 'diffNVisitStart'] = nvisit - poi_info.loc[p0, 'nVisit']
        df_.loc[idx, 'diffDurationStart'] = duration - poi_info.loc[p0, 'avgDuration']
        df_.loc[idx, 'sameNeighbourhoodStart'] = 1 if cluster == poi_clusters.loc[p0, 'clusterID'] else -1
    
    # convert to matrix
    nrows = df_.shape[0]
    ncols = df_.shape[1] + len(cats) + len(clusters) - 2
    
    # features other than category and neighbourhood
    X = df_[sorted(set(df_.columns) - {'category', 'neighbourhood'})].values  
    
    # boolean features: category (+1, -1)
    cat_features = np.vstack([list(df_.loc[x, 'category']) for x in df_.index])
    
    # boolean features: neighbourhood (+1, -1)
    neigh_features = np.vstack([list(df_.loc[x, 'neighbourhood']) for x in df_.index])
    
    return np.hstack([X, cat_features.astype(np.float), neigh_features.astype(np.float)])

Generate labels.


In [31]:
def gen_labels(traj_truth, poi_info, binarise=False):
    """
    Generate labels for all POIs given a ground truth trajectory
    """
    poi_list = sorted(poi_info.index)
    ranks = np.zeros(len(poi_list), dtype=np.float)
    for j in range(len(poi_list)):
        try:
            poi = poi_list[j]
            ranks[j] = (len(poi_list) - traj_truth.index(poi)) / len(poi_list) # normalise
        except ValueError:
            pass # default rank is 0
        
    if binarise == True:
        return (ranks > 0).astype(np.float)*2 - 1 # binary labels (+1, -1)
    
    return ranks  # keep normalised ranks as floats; casting to int would truncate them to 0
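For example, with poi_list = [1, 2, 3, 4, 5] and traj_truth = [3, 1, 4]: POI 3 gets rank (5-0)/5 = 1.0, POI 1 gets (5-1)/5 = 0.8, POI 4 gets (5-2)/5 = 0.6, and POIs 2 and 5 keep rank 0; with binarise=True the labels become [+1, -1, +1, +1, -1].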

POI occurrence prediction: logistic regression


In [32]:
class LOGIT_REG:
    def __init__(self, C=1.0, poi_info=None, debug=False):
        self.C = C
        self.debug = debug
        self.trained = False
        
        if poi_info is None:
            self.poi_info = None
        else:
            self.poi_info = poi_info
    
    
    def train(self, trajid_set_train):
        # generate training data
        if self.poi_info is None:
            self.poi_info = dat_obj.calc_poi_info(list(trajid_set_train))
        self.poi_list = sorted(self.poi_info.index)

        train_traj_list = [dat_obj.traj_dict[x] for x in trajid_set_train if len(dat_obj.traj_dict[x]) >= 2]
        feature_list = Parallel(n_jobs=N_JOBS)\
                               (delayed(gen_features)(tr[0], len(tr), self.poi_info, poi_clusters=dat_obj.POI_CLUSTERS, \
                                                      cats=dat_obj.POI_CAT_LIST, clusters=dat_obj.POI_CLUSTER_LIST)
                                for tr in train_traj_list)
        labels_logreg = Parallel(n_jobs=N_JOBS)\
                                (delayed(gen_labels)(tr, self.poi_info, binarise=True) for tr in train_traj_list)
        
        X = np.vstack(feature_list)
        Y_logreg = np.hstack(labels_logreg)

        # feature scaling
        if ABS_SCALER == True:
            self.scaler = MaxAbsScaler(copy=False)
        else:
            self.scaler = MinMaxScaler(feature_range=(-1,1), copy=False)
            #self.scaler = StandardScaler(copy=False)
        X = self.scaler.fit_transform(X)

        # train
        self.logreg = LogisticRegression(C=self.C, n_jobs=N_JOBS)
        self.logreg.fit(X, Y_logreg)
        self.trained = True
        
    
    
    def predict(self, startPOI, nPOI):
        assert(self.trained == True)
        if startPOI not in self.poi_list: return None
        X_test = gen_features(startPOI, nPOI, self.poi_info, poi_clusters=dat_obj.POI_CLUSTERS, \
                              cats=dat_obj.POI_CAT_LIST, clusters=dat_obj.POI_CLUSTER_LIST)
        X_test = self.scaler.transform(X_test)
        scores_logreg = self.logreg.decision_function(X_test)
        
        # form recommendation
        assert(len(scores_logreg) == len(self.poi_list))
        p0_ix = self.poi_list.index(startPOI)
        scores_logreg[p0_ix] = -1e6  # mask the start POI
        topk_logreg = list(np.argsort(-np.asarray(scores_logreg))[:nPOI-1])
        y_hat = [startPOI] + list(np.array(self.poi_list)[topk_logreg])
        return np.array(y_hat)

Nested cross-validation with Monte-Carlo cross-validation as inner loop.


In [33]:
if run_logreg == True:
    recdict_logreg = dict()
    cnt = 1
    keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())

    # outer loop to evaluate the test performance by cross validation
    for i in range(len(keys)):
        ps, L = keys[i]

        best_C = 1
        #best_F1 = 0; best_pF1 = 0
        best_Tau = 0
        keys_cv = keys[:i] + keys[i+1:]
        
        # use all training+validation set to compute POI features, 
        # make sure features do NOT change for training and validation
        trajid_set_i = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
        poi_info_i = dat_obj.calc_poi_info(list(trajid_set_i))

        # tune regularisation constant C
        for logit_C in C_SET:
            print('\n--------------- try_C: %f ---------------\n' % logit_C); sys.stdout.flush() 
            F1_logreg = []; pF1_logreg = []; Tau_logreg = []        

            # inner loop to evaluate the performance of a model with a specified C by Monte-Carlo cross validation
            for j in range(MC_NITER):
                poi_list = []
                while True: # make sure the start POI in test set are also in training set
                    rand_ix = np.arange(len(keys_cv)); np.random.shuffle(rand_ix)
                    test_ix = rand_ix[:int(MC_PORTION*len(rand_ix))]
                    assert(len(test_ix) > 0)
                    trajid_set_train = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
                    for j in test_ix: 
                        trajid_set_train = trajid_set_train - dat_obj.TRAJID_GROUP_DICT[keys_cv[j]]
                    poi_set = {poi for tid in trajid_set_train for poi in dat_obj.traj_dict[tid]}
                    good_partition = True
                    for j in test_ix: 
                        if keys_cv[j][0] not in poi_set: good_partition = False; break
                    if good_partition == True: 
                        poi_list = sorted(poi_set)
                        break
                        
                # train
                logreg = LOGIT_REG(C=logit_C, poi_info=poi_info_i.loc[poi_list].copy(), debug=True)
                logreg.train(list(trajid_set_train))
                
                # test
                for j in test_ix:
                    ps_cv, L_cv = keys_cv[j]
                    y_hat = logreg.predict(ps_cv, L_cv)
                    if y_hat is not None:
                        F1, pF1, tau = evaluate(dat_obj, keys_cv[j], [y_hat])
                        F1_logreg.append(F1); pF1_logreg.append(pF1); Tau_logreg.append(tau)
                        
            #mean_F1 = np.mean(F1_logreg); mean_pF1 = np.mean(pF1_logreg)
            mean_Tau = np.mean(Tau_logreg)
            print('mean_Tau: %.3f' % mean_Tau)
            if mean_Tau > best_Tau:
                best_Tau = mean_Tau
                best_C = logit_C
        print('\n--------------- %d/%d, Query: (%d, %d), Best_C: %f ---------------\n' % (cnt, len(keys), ps, L, best_C))
        sys.stdout.flush()

        # train model using all examples in training set and measure performance on test set
        logreg = LOGIT_REG(C=best_C, poi_info=poi_info_i, debug=True)
        logreg.train(trajid_set_i)
        y_hat = logreg.predict(ps, L)
        if y_hat is not None:
            recdict_logreg[(ps, L)] = {'PRED': y_hat, 'C': logreg.C}

        cnt += 1; #print_progress(cnt, len(keys)); sys.stdout.flush()

In [34]:
if run_logreg == True:
    F1_logreg = []; pF1_logreg = []; Tau_logreg = []
    for key in sorted(recdict_logreg.keys()):
        F1, pF1, tau = evaluate(dat_obj, key, [recdict_logreg[key]['PRED']])
        F1_logreg.append(F1); pF1_logreg.append(pF1); Tau_logreg.append(tau)
    print('LogReg: F1 (%.3f, %.3f), pairsF1 (%.3f, %.3f), Tau (%.3f, %.3f)' % \
          (np.mean(F1_logreg), np.std(F1_logreg)/np.sqrt(len(F1_logreg)), \
           np.mean(pF1_logreg), np.std(pF1_logreg)/np.sqrt(len(pF1_logreg)), \
           np.mean(Tau_logreg), np.std(Tau_logreg)/np.sqrt(len(Tau_logreg))))

In [35]:
if run_logreg == True:
    flogreg = os.path.join(data_dir, 'logreg-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
    pickle.dump(recdict_logreg, open(flogreg, 'bw'))

2.3 Direct rank/location prediction

Direct rank/location prediction: linear regression


In [36]:
class LINEAR_REG:
    def __init__(self, poi_info=None, debug=False):
        self.debug = debug
        self.trained = False
        
        if poi_info is None:
            self.poi_info = None
        else:
            self.poi_info = poi_info
        
    
    def train(self, trajid_set_train):
        if self.poi_info is None:
            self.poi_info = dat_obj.calc_poi_info(list(trajid_set_train))
        self.poi_list = sorted(self.poi_info.index)
        
        # generate training data
        train_traj_list = [dat_obj.traj_dict[x] for x in trajid_set_train if len(dat_obj.traj_dict[x]) >= 2]    
        feature_list = Parallel(n_jobs=N_JOBS)\
                               (delayed(gen_features)(tr[0], len(tr), self.poi_info, poi_clusters=dat_obj.POI_CLUSTERS, \
                                        cats=dat_obj.POI_CAT_LIST, clusters=dat_obj.POI_CLUSTER_LIST)
                                for tr in train_traj_list)
        labels_linreg = Parallel(n_jobs=N_JOBS)\
                                (delayed(gen_labels)(tr, self.poi_info, binarise=False) for tr in train_traj_list)
        X = np.vstack(feature_list)
        Y_linreg = np.hstack(labels_linreg)
        
        # remove training examples with label '0', i.e., features of POIs that do not exist in trajectory
        #valid_ix = np.nonzero(Y_linreg)[0]
        #X = X[valid_ix, :]
        #Y_linreg = Y_linreg[valid_ix]
        
        # feature scaling
        if ABS_SCALER == True:
            self.scaler = MaxAbsScaler(copy=False)
        else:
            self.scaler = MinMaxScaler(feature_range=(-1,1), copy=False)
            #self.scaler = StandardScaler(copy=False)
        X = self.scaler.fit_transform(X)
        
        # train
        self.linreg = LinearRegression(normalize=True, n_jobs=N_JOBS)
        self.linreg.fit(X, Y_linreg)
        self.trained = True        
    
    
    def predict(self, startPOI, nPOI):
        assert(self.trained == True)
        if startPOI not in self.poi_list: return None
        X_test = gen_features(startPOI, nPOI, self.poi_info, poi_clusters=dat_obj.POI_CLUSTERS, \
                              cats=dat_obj.POI_CAT_LIST, clusters=dat_obj.POI_CLUSTER_LIST)
        X_test = self.scaler.transform(X_test)
        scores_linreg = self.linreg.predict(X_test)
        
        # form recommendation
        assert(len(scores_linreg) == len(self.poi_list))
        p0_ix = self.poi_list.index(startPOI)
        scores_linreg[p0_ix] = -1e6  # mask the start POI
        topk_linreg = list(np.argsort(-np.asarray(scores_linreg))[:nPOI-1])
        y_hat = [startPOI] + list(np.array(self.poi_list)[topk_linreg])
        return np.array(y_hat)

Leave-one-out cross-validation, NO hyper-parameters to tune.


In [37]:
if run_linreg == True:
    recdict_linreg = dict()
    cnt = 1
    keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())

    # outer loop to evaluate the test performance by cross validation
    for i in range(len(keys)):
        ps, L = keys[i]
        trajid_set_i = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
        poi_info_i = dat_obj.calc_poi_info(list(trajid_set_i))

        print('\n--------------- %d/%d, Query: (%d, %d) ---------------\n' % (cnt, len(keys), ps, L))

        # train model using all examples in training+evaluation set and measure performance on test set
        linreg = LINEAR_REG(poi_info=poi_info_i)
        linreg.train(list(trajid_set_i))
        y_hat = linreg.predict(ps, L)
        if y_hat is not None:
            recdict_linreg[(ps, L)] = {'PRED': y_hat}

        cnt += 1; #print_progress(cnt, len(keys)); sys.stdout.flush()

In [38]:
if run_linreg == True:
    F1_linreg = []; pF1_linreg = []; Tau_linreg = []
    for key in sorted(recdict_linreg.keys()):
        F1, pF1, tau = evaluate(dat_obj, key, [recdict_linreg[key]['PRED']])
        F1_linreg.append(F1); pF1_linreg.append(pF1); Tau_linreg.append(tau)
    print('LinReg: F1 (%.3f, %.3f), pairsF1 (%.3f, %.3f), Tau (%.3f, %.3f)' % \
          (np.mean(F1_linreg), np.std(F1_linreg)/np.sqrt(len(F1_linreg)), \
           np.mean(pF1_linreg), np.std(pF1_linreg)/np.sqrt(len(pF1_linreg)), \
           np.mean(Tau_linreg), np.std(Tau_linreg)/np.sqrt(len(Tau_linreg))))

In [39]:
if run_linreg == True:
    flinreg = os.path.join(data_dir, 'linreg-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
    pickle.dump(recdict_linreg, open(flinreg, 'bw'))

2.4 Pairwise POI ranking

Cost function and its gradient for pairwise POI ranking.
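Reconstructed from the code below: with $\ell_{jk} \in \{-1, 0, +1\}$ encoding which POI of a pair has the higher label, $\phi_{jk}$ the difference of their feature vectors, $N_q$ the number of queries and $N_1 = 10 M N_q$ a normalisation constant, the regularised objective is

$J(\mathbf{w}) = \frac{1}{2} \mathbf{w}^\top \mathbf{w} + \frac{C}{N_1} \sum_{i=1}^{N_q} \sum_{j < k} \log\left(1 + \exp(-\ell_{jk} \, \mathbf{w}^\top \phi_{jk})\right)$

for the logistic loss; the squared hinge variant replaces the log term with $\max(0,\, 1 - \ell_{jk} \, \mathbf{w}^\top \phi_{jk})^2$.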


In [40]:
%load_ext Cython


The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython

In [41]:
%%cython
import numpy as np
cimport numpy as np

cpdef pwr_obj(w, X, Y, float C, long M, logit_loss=True):
    """
    w - parameter vector
    X - feature matrix for all training examples (features of all POIs for the 1st example, then 2nd, ...)
    Y - labels/ranks for all training examples (labels of all POIs for the 1st example, then 2nd, ...)
    C - regularisation constant
    M - total number of POIs
    logit_loss - True for logistic loss, False for squared hinge loss
    return the cost and the gradient of cost
    """
    assert(C > 0)
    assert(M > 0)
    cdef long N, N1, D, i, pj, pk, ix_j, ix_k
    Nq = int(np.shape(X)[0]/M) # number of queries
    D = np.shape(X)[1]
    assert(D == np.shape(w)[0])
    assert(np.shape(X)[0] == np.shape(Y)[0])
    N1 = 10 * np.shape(X)[0]  # 10 * M * Nq
    
    cdef double cost, costi, lossi, l_jk
    cost = 0.0
    grad = np.zeros(D, dtype=np.float)
    
    for i in range(Nq):
        costi = 0.0
        gradi = np.zeros(D, dtype=np.float)
        for pj in range(M-1): # symmetric breaking of pair (pj, pk), #pairs should be n choose 2 not n^2
            ix_j = i*M + pj  # index of feature vector/label for POI pj
            for pk in range(pj+1, M):
                #if pj == pk:
                #    #result += np.log(2)  # constant
                #    continue
                ix_k = i*M + pk
                l_jk = 0
                if Y[ix_j] > Y[ix_k]: l_jk = 1.0
                if Y[ix_j] < Y[ix_k]: l_jk = -1.0
                phi_jk = X[ix_j] - X[ix_k]
                term = l_jk * np.dot(w, phi_jk)
                if logit_loss == True:
                    costi += np.log1p(np.exp(-1.0 * term))
                    gradi = gradi + phi_jk * (-1.0 * l_jk / (1.0 + np.exp(term)))
                else:
                    if term < 1:
                        costi += (1.0 - term)**2
                        gradi = gradi - 2 * l_jk * (1.0 - term) * phi_jk
                
        cost += costi / N1
        grad = grad + gradi / N1
        
    cost = 0.5 * np.dot(w, w) + C * cost
    grad = w + C * grad
    
    return (cost, grad)

Pairwise POI ranking.

Use the same features and labels as RankSVM


In [42]:
class PW_RANK:
    def __init__(self, dat_obj, C=1.0, poi_info=None, logit_loss=True, debug=False):
        self.dat_obj = dat_obj
        self.C = C
        self.logit_loss = logit_loss
        self.debug = debug
        self.trained = False
        
        if poi_info is None:
            self.poi_info = None
        else:
            self.poi_info = poi_info
            
       
    def train(self, trajid_set_train):
        if self.poi_info is None:
            self.poi_info = self.dat_obj.calc_poi_info(list(trajid_set_train))
        self.poi_list = sorted(self.poi_info.index)
        
        #train_traj_list = [traj_dict[x] for x in trajid_set_train if len(traj_dict[x]) >= 2]            
        #feature_list = Parallel(n_jobs=N_JOBS)\
        #                       (delayed(gen_features)(tr[0], len(tr), self.poi_info, poi_clusters=POI_CLUSTERS, \
        #                                cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
        #                        for tr in train_traj_list)
        #labels_linreg = Parallel(n_jobs=N_JOBS)\
        #                        (delayed(gen_labels)(tr, self.poi_info, binarise=False) for tr in train_traj_list)
        
        traj_groups = dict()
        for tid in trajid_set_train:
            tr = self.dat_obj.traj_dict[tid]
            if len(tr) < 2: continue
            q = (tr[0], len(tr))
            traj_groups.setdefault(q, []).append(tid)
        queries = sorted(traj_groups.keys())
        feature_list = Parallel(n_jobs=N_JOBS)\
                               (delayed(calc_node_features)(q[0], q[1], self.poi_list, self.poi_info, self.dat_obj)\
                                for q in queries)
        labels = []
        for q in queries:
            label = pd.Series(data=np.zeros(len(self.poi_list), dtype=np.float), index=self.poi_list)
            for tid in traj_groups[q]:
                tr = self.dat_obj.traj_dict[tid]
                for poi in tr[1:]: label.loc[poi] += 1
            labels.append(label)   
        
        X = np.vstack(feature_list)
        Y = np.hstack(labels)
        
        # remove training examples with label '0', i.e., features of POIs that do not exist in trajectory
        #valid_ix = np.nonzero(Y_linreg)[0]
        #X = X[valid_ix, :]
        #Y_linreg = Y_linreg[valid_ix]
        
        # feature scaling
        if ABS_SCALER == True:
            self.Xscaler = MaxAbsScaler(copy=False)
            #self.Yscaler = MaxAbsScaler(copy=False)
        else:
            self.Xscaler = MinMaxScaler(feature_range=(-1,1), copy=False)
            #self.Yscaler = MinMaxScaler(feature_range=(-1,1), copy=False)
            #self.scaler = StandardScaler(copy=False)
        X = self.Xscaler.fit_transform(X)
        #Y = self.Yscaler.fit_transform(Y.reshape(-1, 1))
        #Y = Y.reshape(-1)  # reshape to one dimension
        
        w = np.random.rand(np.shape(X)[1])  # initial guess
        opt_method = 'BFGS' #'Newton-CG' 
        options = {'disp': True} if self.debug == True else dict()
        opt = minimize(pwr_obj, w, args=(X, Y, self.C, len(self.poi_list), self.logit_loss), \
                       method=opt_method, jac=True, options=options)
        if opt.success == True:
            self.w = opt.x
            self.trained = True
        else:
            sys.stderr.write('Optimisation failed, C=%f\n' % self.C)
            self.trained = False
        return self.trained
        
    
    def predict(self, startPOI, nPOI):
        assert(self.trained == True)
        if startPOI not in self.poi_list: return None
        X_test = calc_node_features(startPOI, nPOI, self.poi_list, self.poi_info, self.dat_obj)
        #X_test = gen_features(startPOI, nPOI, self.poi_info, poi_clusters=POI_CLUSTERS, \
        #                      cats=POI_CAT_LIST,clusters=POI_CLUSTER_LIST)
        X_test = self.Xscaler.transform(X_test)
        scores_pwr = 1.0 / (1.0 + np.exp(-1.0 * np.dot(X_test, self.w)))
        
        # form recommendation
        assert(len(scores_pwr) == len(self.poi_list))
        p0_ix = self.poi_list.index(startPOI)
        scores_pwr[p0_ix] = -1e6  # mask the start POI
        topk_pwr = list(np.argsort(-np.asarray(scores_pwr))[:nPOI-1])
        y_hat = [startPOI] + list(np.array(self.poi_list)[topk_pwr])
        return np.array(y_hat)

Nested cross-validation with Monte-Carlo cross-validation as inner loop.


In [43]:
if run_pwr == True:
    LOGIT_LOSS = False
    recdict_pwr = dict()
    cnt = 1
    keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())

    # outer loop to evaluate the test performance by cross validation
    for i in range(len(keys)):
        ps, L = keys[i]

        best_C = 1
        #best_F1 = 0; best_pF1 = 0
        best_Tau = 0
        keys_cv = keys[:i] + keys[i+1:]
        
        # use all training+validation set to compute POI features, 
        # make sure features do NOT change for training and validation
        trajid_set_i = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
        poi_info_i = dat_obj.calc_poi_info(list(trajid_set_i))

        poi_set_i = {p for tid in trajid_set_i for p in dat_obj.traj_dict[tid] if len(dat_obj.traj_dict[tid]) >= 2}
        if ps not in poi_set_i: 
            sys.stderr.write('start POI of query %s does not exist in training set.\n' % str(keys[i]))
            continue

        # tune regularisation constant C
        for logit_C in C_SET:
            print('\n--------------- try_C: %f ---------------\n' % logit_C); sys.stdout.flush() 
            F1_pwr = []; pF1_pwr = []; Tau_pwr = []        

            # inner loop to evaluate the performance of a model with a specified C by Monte-Carlo cross validation
            for _ in range(MC_NITER):  # the index j below is reused for test queries
                poi_list = []
                while True: # make sure the start POIs in the test set are also in the training set
                    rand_ix = np.arange(len(keys_cv)); np.random.shuffle(rand_ix)
                    test_ix = rand_ix[:int(MC_PORTION*len(rand_ix))]
                    assert(len(test_ix) > 0)
                    trajid_set_train = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
                    for j in test_ix: 
                        trajid_set_train = trajid_set_train - dat_obj.TRAJID_GROUP_DICT[keys_cv[j]]
                    poi_set = {poi for tid in trajid_set_train for poi in dat_obj.traj_dict[tid]}
                    good_partition = True
                    for j in test_ix: 
                        if keys_cv[j][0] not in poi_set: good_partition = False; break
                    if good_partition == True:
                        poi_list = sorted(poi_set)
                        break

                # train
                pwr = PW_RANK(dat_obj, C=logit_C, poi_info=poi_info_i.loc[poi_list].copy(), 
                              logit_loss=LOGIT_LOSS, debug=True)
                if pwr.train(list(trajid_set_train)) == True:
                    for j in test_ix:  # test
                        ps_cv, L_cv = keys_cv[j]
                        y_hat = pwr.predict(ps_cv, L_cv)
                        if y_hat is not None:
                            F1, pF1, tau = evaluate(dat_obj, keys_cv[j], [y_hat])
                            F1_pwr.append(F1); pF1_pwr.append(pF1); Tau_pwr.append(tau)
                else:  # if training failed
                    for j in test_ix:
                        F1_pwr.append(0); pF1_pwr.append(0); Tau_pwr.append(0)

            #mean_F1 = np.mean(F1_pwr); mean_pF1 = np.mean(pF1_pwr)
            mean_Tau = np.mean(Tau_pwr)
            print('mean_Tau: %.3f' % mean_Tau)
            if mean_Tau > best_Tau:
                best_Tau = mean_Tau
                best_C = logit_C
        print('\n--------------- %d/%d, Query: (%d, %d), Best_C: %f ---------------\n' % (cnt, len(keys), ps, L, best_C))
        sys.stdout.flush()

        # train model using all examples in training set and measure performance on test set
        pwr = PW_RANK(dat_obj, C=best_C, poi_info=poi_info_i, logit_loss=LOGIT_LOSS, debug=True)
        if pwr.train(trajid_set_i) == True:
            y_hat = pwr.predict(ps, L)
            if y_hat is not None:
                recdict_pwr[(ps, L)] = {'PRED': [y_hat], 'W': pwr.w, 'C': pwr.C}
                print(y_hat)

        cnt += 1; #print_progress(cnt, len(keys)); sys.stdout.flush()

In [44]:
#len(recdict_pwr)

In [45]:
if run_pwr == True:
    print('PW_RANK:')
    do_evaluation(dat_obj, recdict_pwr)

In [46]:
if run_pwr == True:
    fpwr = os.path.join(data_dir, 'pwr-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
    pickle.dump(recdict_pwr, open(fpwr, 'bw'))

3. Trajectory Scoring

3.1 Transition Matrix between POIs

Approximate the transition probabilities (a matrix) between POI feature vectors using the Kronecker product of the individual transition matrices for each feature, i.e., POI category, POI popularity (discretised), POI average visit duration (discretised) and POI neighbourhood (cluster).
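
The factorisation assumes the features change independently along a transition, so the joint matrix over feature tuples is the Kronecker product of the per-feature matrices. A minimal sketch with two hypothetical 2-state factors (illustrative values only, not the notebook's data):


In [ ]:
# hypothetical example: couple two feature-level transition matrices via the Kronecker product
T_cat = np.array([[0.7, 0.3],   # category -> category
                  [0.4, 0.6]])
T_pop = np.array([[0.9, 0.1],   # popularity bin -> popularity bin
                  [0.2, 0.8]])
T_joint = kron(T_cat, T_pop)    # shape (4, 4), rows/columns indexed by (category, popularity) tuples
# T_joint[(c1, p1), (c2, p2)] = T_cat[c1, c2] * T_pop[p1, p2]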

Deal with feature vectors that have no corresponding POI, and with feature vectors shared by more than one POI (before normalisation):

  • For feature vectors without a corresponding POI, simply remove the associated rows and columns from the matrix obtained by the Kronecker product.
  • For distinct POIs with exactly the same feature vector,
    • Call the set of POIs sharing a feature vector a POI group.
    • The incoming transition value (i.e., unnormalised transition probability) of a POI group is divided uniformly among the group members, which corresponds to choosing a group member uniformly at random when entering the group.
    • The outgoing transition value is duplicated (i.e., copied unchanged) to every group member, as we are already inside the group when leaving it.
    • The self-loop value of a POI group is treated like the outgoing case, since we are already inside the group: duplicate it for each member, then divide it uniformly among the transitions from that member to the other POIs in the same group, which corresponds to choosing one of the member's in-group transitions (excluding its own self-loop) uniformly at random.
  • Concretely, for a POI group with $n$ POIs (see the numeric sketch after this list),
    1. If the incoming transition value of the POI group is $m_1$, then the corresponding incoming transition value for each group member is $\frac{m_1}{n}$.
    2. If the outgoing transition value of the POI group is $m_2$, then the corresponding outgoing transition value for each group member is also $m_2$.
    3. If the self-loop transition value of the POI group is $m_3$, then the self-loop of each individual POI gets value $0$, and each of the other in-group transitions gets value $\frac{m_3}{n-1}$, since each member has $n-1$ outgoing transitions to other POIs in the same group (i.e., $n-1$ choose $1$).

NOTE: performing the above division before or after row normalisation leads to the same result, as the division itself does NOT change the normalising constant of each row (i.e., the sum of each row before normalising); the numeric sketch below verifies this.
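
A minimal numeric sketch of the three rules above (hypothetical values: a singleton POI A plus a group {B1, B2} of size $n = 2$; not the notebook's data):


In [ ]:
# feature-level (unexpanded) transition values for A and the group B
M_feat = pd.DataFrame([[2., 6.],   # A->A;  A->B: incoming value of the group, m1 = 6
                       [4., 3.]],  # B->A: outgoing value, m2 = 4;  B->B: group self-loop, m3 = 3
                      index=['A', 'B'], columns=['A', 'B'])
n = 2
M = pd.DataFrame(0., index=['A', 'B1', 'B2'], columns=['A', 'B1', 'B2'])
M.loc['A', 'A'] = M_feat.loc['A', 'A']
M.loc['A', ['B1', 'B2']] = M_feat.loc['A', 'B'] / n    # rule 1: incoming value m1/n for each member
M.loc[['B1', 'B2'], 'A'] = M_feat.loc['B', 'A']        # rule 2: outgoing value m2 duplicated
M.loc['B1', 'B2'] = M_feat.loc['B', 'B'] / (n - 1)     # rule 3: group self-loop m3/(n-1) on in-group
M.loc['B2', 'B1'] = M_feat.loc['B', 'B'] / (n - 1)     #         transitions, member self-loops stay 0
print(M)
# row sums are unchanged (A: 2+3+3 = 2+6; B1, B2: 4+0+3 = 4+3), so normalising
# before or after the split yields the same transition probabilities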


In [47]:
def gen_poi_logtransmat(trajid_list, poi_set, poi_info, dat_obj, debug=False):
    #transmat_cat                        = gen_transmat_cat(trajid_list, traj_dict, poi_info)
    #transmat_pop,      logbins_pop      = gen_transmat_pop(trajid_list, traj_dict, poi_info)
    #transmat_visit,    logbins_visit    = gen_transmat_visit(trajid_list, traj_dict, poi_info)
    #transmat_duration, logbins_duration = gen_transmat_duration(trajid_list, traj_dict, poi_info)
    #transmat_neighbor, poi_clusters     = gen_transmat_neighbor(trajid_list, traj_dict, poi_info)

    transmat_cat      = dat_obj.gen_transmat_cat(trajid_list, poi_info)
    transmat_pop      = dat_obj.gen_transmat_pop(trajid_list, poi_info)
    transmat_visit    = dat_obj.gen_transmat_visit(trajid_list, poi_info)
    transmat_duration = dat_obj.gen_transmat_duration(trajid_list, poi_info)
    transmat_neighbor = dat_obj.gen_transmat_neighbor(trajid_list, poi_info)
    logbins_pop      = dat_obj.LOGBINS_POP
    logbins_visit    = dat_obj.LOGBINS_VISIT
    logbins_duration = dat_obj.LOGBINS_DURATION
    poi_clusters     = dat_obj.POI_CLUSTERS
    traj_dict        = dat_obj.traj_dict
    
    # Kronecker product
    transmat_ix = list(itertools.product(transmat_cat.index, transmat_pop.index, transmat_visit.index, \
                                         transmat_duration.index, transmat_neighbor.index))
    transmat_value = transmat_cat.values
    for transmat in [transmat_pop, transmat_visit, transmat_duration, transmat_neighbor]:
        transmat_value = kron(transmat_value, transmat.values)
    transmat_feature = pd.DataFrame(data=transmat_value, index=transmat_ix, columns=transmat_ix)
    
    poi_train = sorted(poi_set)
    feature_names = ['poiCat', 'popularity', 'nVisit', 'avgDuration', 'clusterID']
    poi_features = pd.DataFrame(data=np.zeros((len(poi_train), len(feature_names))), \
                                columns=feature_names, index=poi_train)
    poi_features.index.name = 'poiID'
    poi_features['poiCat'] = poi_info.loc[poi_train, 'poiCat']
    poi_features['popularity'] = np.digitize(poi_info.loc[poi_train, 'popularity'], logbins_pop)
    poi_features['nVisit'] = np.digitize(poi_info.loc[poi_train, 'nVisit'], logbins_visit)
    poi_features['avgDuration'] = np.digitize(poi_info.loc[poi_train, 'avgDuration'], logbins_duration)
    poi_features['clusterID'] = poi_clusters.loc[poi_train, 'clusterID']
    
    # shrink the result of Kronecker product and deal with POIs with the same features
    poi_logtransmat = pd.DataFrame(data=np.zeros((len(poi_train), len(poi_train)), dtype=np.float), \
                                   columns=poi_train, index=poi_train)
    for p1 in poi_logtransmat.index:
        rix = tuple(poi_features.loc[p1])
        for p2 in poi_logtransmat.columns:
            cix = tuple(poi_features.loc[p2])
            value_ = transmat_feature.loc[(rix,), (cix,)]
            poi_logtransmat.loc[p1, p2] = value_.values[0, 0]
    
    # group POIs with the same features
    features_dup = dict()
    for poi in poi_features.index:
        key = tuple(poi_features.loc[poi])
        if key in features_dup:
            features_dup[key].append(poi)
        else:
            features_dup[key] = [poi]
    if debug == True:
        for key in sorted(features_dup.keys()):
            print(key, '->', features_dup[key])
            
    # deal with POIs with the same features
    for feature in sorted(features_dup.keys()):
        n = len(features_dup[feature])
        if n > 1:
            group = features_dup[feature]
            v1 = poi_logtransmat.loc[group[0], group[0]]  # transition value of self-loop of POI group
            
            # divide incoming transition value (i.e. unnormalised transition probability) uniformly among group members
            for poi in group:
                poi_logtransmat[poi] /= n
                
            # outgoing transition value has already been duplicated (value copied above)
            
            # duplicate & divide transition value of self-loop of POI group uniformly among all outgoing transitions,
            # from a POI to all other POIs in the same group (excluding POI self-loop)
            v2 = v1 / (n - 1)
            for pair in itertools.permutations(group, 2):
                poi_logtransmat.loc[pair[0], pair[1]] = v2
                            
    # normalise each row
    for p1 in poi_logtransmat.index:
        poi_logtransmat.loc[p1, p1] = 0
        rowsum = poi_logtransmat.loc[p1].sum()
        assert(rowsum > 0)
        logrowsum = np.log10(rowsum)
        for p2 in poi_logtransmat.columns:
            if p1 == p2:
                poi_logtransmat.loc[p1, p2] = LOG_ZERO  # deal with log(0) explicitly
            else:
                poi_logtransmat.loc[p1, p2] = np.log10(poi_logtransmat.loc[p1, p2]) - logrowsum
    
    return poi_logtransmat

In [48]:
#transmat_ = gen_poi_logtransmat(dat_obj.trajid_set_all, set(poi_info_all.index), poi_info_all, dat_obj, debug=True)

3.2 Inference: Viterbi Decoding vs ILP

Use dynamic programming to find a possibly non-simple path, i.e., a walk.
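
A sketch of the recurrence implemented below, with node weight $w(p)$ and edge score $E(p', p)$ (the log transition probability):

$$A[1, p] = \alpha\, w(p) + \beta\, E(p_s, p), \qquad A[t+1, p] = \max_{p' \notin \{p_s,\, p\}} \left\{ A[t, p'] + \alpha\, w(p) + \beta\, E(p', p) \right\},$$

with backtracking pointers $B[t+1, p]$ recording the maximising $p'$; the best-scoring walk of length $L$ is recovered by following the pointers back from $\arg\max_p A[L-1, p]$. In the transition-only setting $\alpha = 0$ and $\beta = 1$.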


In [49]:
def find_viterbi(V, E, ps, L, withNodeWeight=False, alpha=0.5):
    assert(isinstance(V, pd.DataFrame))
    assert(isinstance(E, pd.DataFrame))
    assert(ps in V.index)
    assert(2 <= L <= V.index.shape[0])
    if withNodeWeight == True:
        assert(0 < alpha < 1)
        beta = 1 - alpha
    else:
        alpha = 0
        beta = 1
        weightkey = 'weight'
        if weightkey not in V.columns:
            V['weight'] = 1  # dummy weights, will not be used as alpha=0
    poi_id_dict, poi_id_rdict = dict(), dict()
    for ix, poi in enumerate(V.index):
        poi_id_dict[poi] = ix
        poi_id_rdict[ix] = poi
    M = V.shape[0]
    assert(E.shape[0] >= M)
    assert(E.shape[1] >= M)
    
    A = np.zeros((L-1, M), dtype=np.float)     # scores matrix
    B = np.ones((L-1, M), dtype=np.int) * (-1) # backtracking pointers
    
    for p in range(M): # ps--p1
        p1 = poi_id_rdict[p]
        A[0, p] = alpha * V.loc[p1, 'weight'] + beta * E.loc[ps, p1] if p1 != ps else -np.inf
        B[0, p] = poi_id_dict[ps]

    for t in range(0, L-2): # ps~~p2'--p3
        for p in range(M):
            p3 = poi_id_rdict[p]
            scores = [A[t, p2] + alpha * V.loc[p3, 'weight'] + beta * E.loc[poi_id_rdict[p2], p3] \
                      if poi_id_rdict[p2] not in {ps, p3} else -np.inf for p2 in range(M)] 
            maxix = np.argmax(scores)
            A[t+1, p] = scores[maxix]
            B[t+1, p] = maxix
            
    y_hat = [np.argmax(A[L-2, :])]
    p, t = y_hat[-1], L-2
    while t >= 0:
        y_hat.append(B[t, p])
        p, t = y_hat[-1], t-1
    y_hat.reverse()

    return np.asarray([poi_id_rdict[p] for p in y_hat])

Use integer linear programming (ILP) to find a simple path.
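
A sketch of the formulation implemented below: with binary variables $x_{ij}$ indicating that POI $j$ is visited immediately after POI $i$, the solver maximises

$$\sum_{i} \sum_{j \neq p_s} x_{ij} \left( \alpha\, w(j) + \beta\, E(i, j) \right)$$

subject to: no self-loops, leaving the start $p_s$ exactly once with no incoming edge, using exactly $L-1$ edges in total, entering and leaving every other POI at most once (with flow conservation and a single end POI), and the Miller-Tucker-Zemlin constraints $u_i - u_j + 1 \le (M-1)(1 - x_{ij})$ over auxiliary variables $u_i \in \{2, \dots, M\}$, which eliminate sub-tours.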


In [50]:
def find_ILP(V, E, ps, L, withNodeWeight=False, alpha=0.5):
    assert(isinstance(V, pd.DataFrame))
    assert(isinstance(E, pd.DataFrame))
    assert(ps in V.index)
    assert(2 <= L <= V.index.shape[0])
    if withNodeWeight == True:
        assert(0 < alpha < 1)
    beta = 1 - alpha
    
    p0 = str(ps); M = V.index.shape[0]
    
    # REF: pythonhosted.org/PuLP/index.html
    pois = [str(p) for p in V.index] # create a string list for each POI
    pb = pulp.LpProblem('MostLikelyTraj', pulp.LpMaximize) # create problem
    # visit_i_j = 1 means POI i and j are visited in sequence
    visit_vars = pulp.LpVariable.dicts('visit', (pois, pois), 0, 1, pulp.LpInteger) 
    # isend_l = 1 means POI l is the END POI of trajectory
    isend_vars = pulp.LpVariable.dicts('isend', pois, 0, 1, pulp.LpInteger) 
    # a dictionary contains all dummy variables
    dummy_vars = pulp.LpVariable.dicts('u', [x for x in pois if x != p0], 2, M, pulp.LpInteger)
    
    # add objective
    objlist = []
    if withNodeWeight == True:
        objlist.append(alpha * V.loc[int(p0), 'weight'])
    for pi in [x for x in pois]:     # from
        for pj in [y for y in pois if y != p0]: # to
            if withNodeWeight == True:
                objlist.append(visit_vars[pi][pj] * (alpha * V.loc[int(pj), 'weight'] + beta * E.loc[int(pi), int(pj)]))
            else:
                objlist.append(visit_vars[pi][pj] * E.loc[int(pi), int(pj)])
    pb += pulp.lpSum(objlist), 'Objective'
    
    # add constraints, each constraint should be in ONE line
    pb += pulp.lpSum([visit_vars[pi][pi] for pi in pois]) == 0, 'NoSelfLoops'
    pb += pulp.lpSum([visit_vars[p0][pj] for pj in pois]) == 1, 'StartAt_p0'
    pb += pulp.lpSum([visit_vars[pi][p0] for pi in pois]) == 0, 'NoIncoming_p0'
    pb += pulp.lpSum([visit_vars[pi][pj] for pi in pois for pj in pois]) == L-1, 'Length'
    pb += pulp.lpSum([isend_vars[pi] for pi in pois]) == 1, 'OneEnd'
    pb += isend_vars[p0] == 0, 'StartNotEnd'
    
    for pk in [x for x in pois if x != p0]:
        pb += pulp.lpSum([visit_vars[pi][pk] for pi in pois]) == isend_vars[pk] + \
              pulp.lpSum([visit_vars[pk][pj] for pj in pois if pj != p0]), 'ConnectedAt_' + pk
        pb += pulp.lpSum([visit_vars[pi][pk] for pi in pois]) <= 1, 'Enter_' + pk + '_AtMostOnce'
        pb += pulp.lpSum([visit_vars[pk][pj] for pj in pois if pj != p0]) + isend_vars[pk] <= 1, \
              'Leave_' + pk + '_AtMostOnce'
    for pi in [x for x in pois if x != p0]:
        for pj in [y for y in pois if y != p0]:
            pb += dummy_vars[pi] - dummy_vars[pj] + 1 <= (M - 1) * (1 - visit_vars[pi][pj]), \
                    'SubTourElimination_' + pi + '_' + pj
    #pb.writeLP("traj_tmp.lp")
    
    # solve problem: solver should be available in PATH
    if USE_GUROBI == True:
        gurobi_options = [('TimeLimit', '7200'), ('Threads', str(N_JOBS)), ('NodefileStart', '0.2'), ('Cuts', '2')]
        pb.solve(pulp.GUROBI_CMD(path='gurobi_cl', options=gurobi_options)) # GUROBI
    else:
        pb.solve(pulp.COIN_CMD(path='cbc', options=['-threads', str(N_JOBS), '-strategy', '1', '-maxIt', '2000000']))#CBC
    visit_mat = pd.DataFrame(data=np.zeros((len(pois), len(pois)), dtype=np.float), index=pois, columns=pois)
    isend_vec = pd.Series(data=np.zeros(len(pois), dtype=np.float), index=pois)
    for pi in pois:
        isend_vec.loc[pi] = isend_vars[pi].varValue
        for pj in pois: visit_mat.loc[pi, pj] = visit_vars[pi][pj].varValue
    #visit_mat.to_csv('visit.csv')

    # build the recommended trajectory
    recseq = [p0]
    while True:
        pi = recseq[-1]
        pj = visit_mat.loc[pi].idxmax()
        value = visit_mat.loc[pi, pj]
        #print(value, int(round(value)))
        #print(recseq)
        assert(int(round(value)) == 1)
        recseq.append(pj)
        if len(recseq) == L: 
            assert(int(round(isend_vec[pj])) == 1)
            #print('===:', recseq, ':====')
            return np.asarray([int(x) for x in recseq])

Recommend trajectories by leveraging POI-POI transition probabilities, i.e., maximise the likelihood of the sequence of transitions.

Leave-one-out cross-validation; there are NO hyper-parameters to tune.
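
In log space, both decoders search for (a sketch, with $P(\cdot \mid \cdot)$ the factorised transition probability from Section 3.1)

$$\hat{y} = \arg\max_{y:\; y_1 = p_s,\; |y| = L} \; \sum_{t=1}^{L-1} \log P(y_{t+1} \mid y_t),$$

where Viterbi decoding searches over walks (repeated POIs allowed) while the ILP restricts the search to simple paths.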


In [51]:
if run_tran == True:
    recdict_tran_DP = dict()
    recdict_tran_ILP = dict()
    cnt = 1
    keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())

    # outer loop to evaluate the test performance by cross validation
    for i in range(len(keys)):
        ps, L = keys[i]
        
        print('\n--------------- %d/%d, Query: (%d, %d) ---------------\n' % (cnt, len(keys), ps, L))

        # train model using all examples in training+validation set and measure performance on test set
        trajid_set_train = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
        poi_info = dat_obj.calc_poi_info(list(trajid_set_train))
           
        if ps not in poi_info.index: continue
        
        poi_logtransmat = gen_poi_logtransmat(trajid_set_train, set(poi_info.index), poi_info, dat_obj)
        edges = poi_logtransmat

        recdict_tran_DP[(ps, L)] = {'PRED': [find_viterbi(poi_info.copy(), edges.copy(), ps, L)]}
        y_hat_ILP = find_ILP(poi_info, edges, ps, L)
        assert(len(y_hat_ILP) == len(set(y_hat_ILP)))  # the ILP recommendation must be a simple path
        recdict_tran_ILP[(ps, L)] = {'PRED': [y_hat_ILP]}
       
        cnt += 1; #print_progress(cnt, len(keys)); sys.stdout.flush()

In [52]:
if run_tran == True:
    F1_DP = []; pF1_DP = []; Tau_DP = []
    F1_ILP = []; pF1_ILP = []; Tau_ILP = []
    for key in sorted(recdict_tran_DP.keys()):
        F1, pF1, tau = evaluate(dat_obj, key, recdict_tran_DP[key]['PRED'])
        F1_DP.append(F1); pF1_DP.append(pF1); Tau_DP.append(tau)
        assert(key in recdict_tran_ILP)
        F1, pF1, tau = evaluate(dat_obj, key, recdict_tran_ILP[key]['PRED'])
        F1_ILP.append(F1); pF1_ILP.append(pF1); Tau_ILP.append(tau)
    print('Transition Viterbi: F1 (%.3f, %.3f), pairsF1 (%.3f, %.3f), Tau (%.3f, %.3f)' % \
          (np.mean(F1_DP), np.std(F1_DP)/np.sqrt(len(F1_DP)), \
           np.mean(pF1_DP), np.std(pF1_DP)/np.sqrt(len(pF1_DP)), \
           np.mean(Tau_DP), np.std(Tau_DP)/np.sqrt(len(Tau_DP))))
    print('Transition ILP: F1 (%.3f, %.3f), pairsF1 (%.3f, %.3f), Tau (%.3f, %.3f)' % \
          (np.mean(F1_ILP), np.std(F1_ILP)/np.sqrt(len(F1_ILP)), \
           np.mean(pF1_ILP), np.std(pF1_ILP)/np.sqrt(len(pF1_ILP)), \
           np.mean(Tau_ILP), np.std(Tau_ILP)/np.sqrt(len(Tau_ILP))))

In [53]:
if run_tran == True:
    fDP = os.path.join(data_dir, 'tranDP-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
    fILP = os.path.join(data_dir, 'tranILP-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
    pickle.dump(recdict_tran_DP, open(fDP, 'bw'))
    pickle.dump(recdict_tran_ILP, open(fILP, 'bw'))
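
The top-$k$ variant below reuses the list-Viterbi decoder `inf_LVA`: the transition-only (Markov) model fits its generic unary/pairwise parameterisation by setting the unary parameters and features to zero, the pairwise weights to one, and the pairwise features to the log transition matrix, so a sequence's score reduces to $\sum_{t=1}^{L-1} \log P(y_{t+1} \mid y_t)$ and the decoder returns the $k$ highest-likelihood walks.
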

In [54]:
if run_tran_topk == True:
    recdict_tran = dict()
    cnt = 1
    keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())

    # outer loop to evaluate the test performance by cross validation
    for i in range(len(keys)):
        ps, L = keys[i]

        keys_cv = keys[:i] + keys[i+1:]
        
        # use all training+validation set to compute POI features, 
        # make sure features do NOT change for training and validation
        trajid_set_i = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
        poi_info_i = dat_obj.calc_poi_info(list(trajid_set_i))
        
        poi_set_i = {p for tid in trajid_set_i for p in dat_obj.traj_dict[tid] if len(dat_obj.traj_dict[tid]) >= 2}
        if ps not in poi_set_i: 
            sys.stderr.write('start POI of query %s does not exist in the training set.\n' % str(keys[i]))
            continue
        
        print('\n--------------- %d/%d, Query: (%d, %d) ---------------\n' % (cnt,len(keys),ps,L))
        sys.stdout.flush()
        
        poi_logtransmat = gen_poi_logtransmat(trajid_set_i, set(poi_info_i.index), poi_info_i, dat_obj)
        poi_list_tran = poi_info_i.index.tolist()
        poiid_dict_tran = {poi: ix for ix, poi in enumerate(poi_list_tran)}
        
        # do_inference_list_viterbi(int ps, int L, int M,
        #                           np.ndarray[dtype=np.float64_t, ndim=2] unary_params,
        #                           np.ndarray[dtype=np.float64_t, ndim=3] pw_params,
        #                           np.ndarray[dtype=np.float64_t, ndim=2] unary_features,
        #                           np.ndarray[dtype=np.float64_t, ndim=3] pw_features,
        #                           y_true=None, y_true_list=None, int top=10, path4train=False, DIVERSITY=True)
        unary_params = np.zeros((len(poi_list_tran), 1), dtype=np.float)
        pw_params = np.ones((len(poi_list_tran), len(poi_list_tran), 1), dtype=np.float)
        unary_features = np.zeros((len(poi_list_tran), 1), dtype=np.float)
        pw_features = poi_logtransmat.values.reshape(poi_logtransmat.shape[0], poi_logtransmat.shape[1], 1)
        
        top10 = inf_LVA(poiid_dict_tran[ps], L, len(poi_list_tran), unary_params, pw_params, \
                        unary_features, pw_features, DIVERSITY=False)
        print(top10)
        recdict_tran[(ps, L)] = {'PRED': [[poi_list_tran[pi] for pi in t[0]] for t in top10]}
        
        cnt += 1


--------------- 1/99, Query: (1, 2) ---------------

[(array([ 0, 14]), 1), (array([0, 5]), 2), (array([ 0, 28]), 3), (array([0, 9]), 4), (array([ 0, 19]), 5), (array([ 0, 20]), 6), (array([ 0, 21]), 7), (array([ 0, 23]), 8), (array([0, 4]), 9), (array([ 0, 26]), 10)]

--------------- 2/99, Query: (1, 3) ---------------

[(array([ 0, 14,  5]), 1), (array([ 0, 19, 28]), 2), (array([ 0,  5, 28]), 3), (array([ 0, 28, 19]), 4), (array([ 0, 14,  4]), 5), (array([ 0, 19, 26]), 6), (array([ 0, 28,  5]), 7), (array([ 0,  5, 19]), 8), (array([ 0, 19,  5]), 9), (array([ 0, 14,  9]), 10)]

In [ ]:
if run_tran_topk == True:
    print(len(recdict_tran))
    print('Markov:')
    topk = [1, 3, 5, 10]
    recdicts = [{q: {'PRED': recdict_tran[q]['PRED'][:k]} for q in recdict_tran.keys()} for k in topk]
    for j in range(len(topk)): do_evaluation(dat_obj, recdicts[j])

In [ ]:
if run_tran_topk == True:
    ftran = os.path.join(data_dir, 'markov10-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
    pickle.dump(recdict_tran, open(ftran, 'bw'))

3.3 Naively combine POI ranking and transition probabilities
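
Node scores (log RankSVM probabilities) and edge scores (log transition probabilities) are blended with a trade-off parameter $\alpha$ (a sketch of the score both decoders maximise):

$$S(y) = \alpha \sum_{t=1}^{L} \log P_{\mathrm{rank}}(y_t) + (1 - \alpha) \sum_{t=1}^{L-1} \log P(y_{t+1} \mid y_t),$$

where $\alpha$ is tuned on the inner Monte-Carlo folds together with the RankSVM regularisation constant $C$.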


In [ ]:
comb_methods = [find_viterbi, find_ILP]
methods_str = ['DP', 'ILP']

In [ ]:
method_ix = 0

Nested cross-validation with Monte-Carlo cross-validation as inner loop.


In [ ]:
if run_comb == True:
    recdict_comb = dict()
    cnt = 1
    keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())
    inference_fun = comb_methods[method_ix]

    # outer loop to evaluate the test performance by cross validation
    for i in range(len(keys)):
        ps, L = keys[i]

        best_C = 1
        best_alpha = 0.5
        #best_F1 = 0; best_pF1 = 0
        best_Tau = 0
        keys_cv = keys[:i] + keys[i+1:]
        
        # use all training+validation set to compute POI features,
        # make sure features do NOT change for training and validation
        trajid_set_i = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
        poi_info_i = dat_obj.calc_poi_info(list(trajid_set_i))

        # tune regularisation constant C and trade-off parameter alpha
        for rank_C in C_SET:
            for alpha in ALPHA_SET:
                print('\n--------------- try_C: %.3f, try_alpha: %.3f ---------------\n' % (rank_C, alpha))
                sys.stdout.flush()
                F1_comb = []; pF1_comb = []; Tau_comb = []        

                # inner loop to evaluate the performance of a model with a specified C by Monte-Carlo cross validation
                for _ in range(MC_NITER):  # the index j below is reused for test queries
                    poi_list = []
                    while True: # make sure the start POIs in the test set are also in the training set
                        rand_ix = np.arange(len(keys_cv)); np.random.shuffle(rand_ix)
                        test_ix = rand_ix[:int(MC_PORTION*len(rand_ix))]
                        assert(len(test_ix) > 0)
                        trajid_set_train = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
                        for j in test_ix: 
                            trajid_set_train = trajid_set_train - dat_obj.TRAJID_GROUP_DICT[keys_cv[j]]
                        poi_set = {poi for tid in trajid_set_train for poi in dat_obj.traj_dict[tid]}
                        good_partition = True
                        for j in test_ix: 
                            if keys_cv[j][0] not in poi_set: good_partition = False; break
                        if good_partition == True: 
                            poi_list = sorted(poi_set)
                            break

                    # train
                    train_df = gen_train_df(list(trajid_set_train), dat_obj.traj_dict, poi_info_i.loc[poi_list].copy(), 
                                            poi_clusters=POI_CLUSTERS, cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST, 
                                            n_jobs=N_JOBS)
                    ranksvm = RankSVM(ranksvm_dir, useLinear=True)
                    ranksvm.train(train_df, cost=rank_C)
                    poi_logtransmat = gen_poi_logtransmat(trajid_set_train, set(poi_list), 
                                                          poi_info_i.loc[poi_list].copy(), dat_obj)
                    edges = poi_logtransmat                

                    # test
                    for j in test_ix:  # test
                        ps_cv, L_cv = keys_cv[j]
                        test_df = gen_test_df(ps_cv, L_cv, poi_info_i.loc[poi_list].copy(), poi_clusters=POI_CLUSTERS, \
                                              cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
                        rank_df = ranksvm.predict(test_df, probability=True)
                        nodes = rank_df.copy()
                        nodes['weight'] = np.log10(nodes['probability'])

                        y_hat = inference_fun(nodes, edges.copy(), ps_cv, L_cv, withNodeWeight=True, alpha=alpha)
                        F1, pF1, tau = evaluate(dat_obj, keys_cv[j], [y_hat])
                        F1_comb.append(F1); pF1_comb.append(pF1); Tau_comb.append(tau)

                #mean_F1 = np.mean(F1_comb); mean_pF1 = np.mean(pF1_comb)
                mean_Tau = np.mean(Tau_comb)
                print('mean_Tau: %.3f' % mean_Tau)
                if mean_Tau > best_Tau:
                    best_Tau = mean_Tau
                    best_C = rank_C
                    best_alpha = alpha
        print('\n--------------- %d/%d, Query: (%d, %d), Best_C: %.3f, Best_alpha: %.3f ---------------\n' % \
              (cnt, len(keys), ps, L, best_C, best_alpha))
        sys.stdout.flush()

        # train model using all examples in training set and measure performance on test set
        train_df = gen_train_df(list(trajid_set_i), dat_obj.traj_dict, poi_info_i.copy(), poi_clusters=POI_CLUSTERS, \
                                cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST, n_jobs=N_JOBS)
        ranksvm = RankSVM(ranksvm_dir, useLinear=True)
        ranksvm.train(train_df, cost=best_C)
        test_df = gen_test_df(ps, L, poi_info_i.copy(), poi_clusters=POI_CLUSTERS, \
                              cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
        rank_df = ranksvm.predict(test_df, probability=True)
        nodes = rank_df.copy()
        nodes['weight'] = np.log10(nodes['probability'])
        poi_logtransmat = gen_poi_logtransmat(trajid_set_i, set(poi_info_i.index), poi_info_i, dat_obj)
        edges = poi_logtransmat 

        y_hat = inference_fun(nodes, edges, ps, L, withNodeWeight=True, alpha=best_alpha)
        recdict_comb[(ps, L)] = {'PRED': [y_hat], 'C': best_C, 'alpha': best_alpha}

        cnt += 1; #print_progress(cnt, len(keys)); sys.stdout.flush()

In [ ]:
if run_comb == True:
    F1_comb = []; pF1_comb = []; Tau_comb = []
    for key in sorted(recdict_comb.keys()):
        F1, pF1, tau = evaluate(dat_obj, key, recdict_comb[key]['PRED'])
        F1_comb.append(F1); pF1_comb.append(pF1); Tau_comb.append(tau)
    print('Comb ' + methods_str[method_ix] + ': F1 (%.3f, %.3f), pairsF1 (%.3f, %.3f), Tau (%.3f, %.3f)' % \
          (np.mean(F1_comb), np.std(F1_comb)/np.sqrt(len(F1_comb)), \
           np.mean(pF1_comb), np.std(pF1_comb)/np.sqrt(len(pF1_comb)), \
           np.mean(Tau_comb), np.std(Tau_comb)/np.sqrt(len(Tau_comb))))

In [ ]:
if run_comb == True:
    fcomb = os.path.join(data_dir, 'comb' + methods_str[method_ix] + '-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
    pickle.dump(recdict_comb, open(fcomb, 'bw'))