Experiment on Generated Data

The goal of this notebook is to design an experiment to check whether the multi-user, multi-label nature of our dataset is a problem for SSVM.
To achieve this goal, we proceed as follows (a code sketch of steps 1-3 appears right after the list):

  1. an SSVM $\mathcal{M}_0$ trained on the Glasgow dataset $\mathcal{D}_0$ is used to generate a single-user, single-label dataset $\mathcal{D}_1$. Concretely, we use $\mathcal{M}_0$ to predict a trajectory for every query $(p, l)$, $p \in \mathcal{P}$, $l \in \{3,4,5,6,7\}$, where $\mathcal{P}$ is the set of POIs in $\mathcal{D}_0$.
  2. train a new SSVM $\mathcal{M}_1$ using features (POI and transition features) computed from $\mathcal{D}_0$ and labels from $\mathcal{D}_1$, and check its performance on the training set (i.e., $\mathcal{D}_1$).
  3. perform leave-one-out cross validation on $\mathcal{D}_1$. The hyperparameter $C$ is chosen by trying several values while holding out one label in $\mathcal{D}_1$ as the test example and using all other labels in $\mathcal{D}_1$ as the training set (POI and transition features are still computed from $\mathcal{D}_0$); this $C$ is then fixed for all leave-one-out folds.
  4. note that POI and transition features are computed from $\mathcal{D}_0$ while labels come from $\mathcal{D}_1$, because the duration-related features (i.e., avgDuration for POIs, and the log transition probability between discretized duration buckets) cannot be computed on $\mathcal{D}_1$: no duration information is generated. So we disable the duration-related features one-by-one and repeat step 3 to check whether they help.
  5. if the duration-related features don't help, we can turn them off, compute POI and transition features from $\mathcal{D}_1$, and use the labels in $\mathcal{D}_1$. We then compare the performance of RankSVM and SSVM on $\mathcal{D}_1$ (using leave-one-out cross validation); if SSVM performs better than RankSVM, it means the multi-user, multi-label nature of our dataset is a problem for SSVM.
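The sketch below summarises steps 1-3 with hypothetical, simplified helpers (`pois`, `predict_with_M0`, `train_ssvm_on` and `evaluate` are stand-ins for illustration only; the notebook's actual routines further down take explicit feature matrices and a regularisation constant $C$):

In [ ]:
# Sketch only: pois, predict_with_M0, train_ssvm_on and evaluate are hypothetical stand-ins.
lengths = [3, 4, 5, 6, 7]

# step 1: one trajectory predicted by M_0 per query (p, l) -> single-user, single-label D_1
D1 = [predict_with_M0(start=p, length=l) for p in sorted(pois) for l in lengths]

# step 2: train a new SSVM M_1 on all of D_1 and check how well it fits its own training set
M1 = train_ssvm_on(D1)
evaluate(M1, D1)

# step 3: leave-one-out cross validation over the labels of D_1 (with C fixed beforehand)
for i in range(len(D1)):
    M_i = train_ssvm_on(D1[:i] + D1[i+1:])
    evaluate(M_i, [D1[i]])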

In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt

import os, pickle, random
import pandas as pd
import numpy as np
import cvxopt
import seaborn as sns

In [ ]:
random.seed(1234554321)
np.random.seed(123456789)
cvxopt.base.setseed(123456789)

Run notebook ssvm_ml.ipynb.


In [ ]:
%run 'ssvm_ml.ipynb'

In [ ]:
#dump_vars = True

Step 1 - Generate new dataset

Load trained parameters and prediction results


In [ ]:
#fname = os.path.join(data_dir, 'ssvm-listViterbi-Glas.pkl')
fname = os.path.join(data_dir, 'ssvm-listViterbi-Osak.3.pkl')

In [ ]:
ssvm_lv = pickle.load(open(fname, 'rb'))  # a dict: query -> {'PRED': trajectory, 'C': ssvm-c, 'W': model_params}

In [ ]:
query = (2, 5)
#query = (5, 4)
W = ssvm_lv[query]['W']

In [ ]:
ssvm_lv[query]

In [ ]:
#W = np.random.randn(W.shape[0])  # Use a random weight vector to generate trajectories

In [ ]:
#W

In [ ]:
#%%script false
trajid_set = set(trajid_set_all) - TRAJ_GROUP_DICT[query]
poi_set = {p for tid in trajid_set for p in traj_dict[tid] if len(traj_dict[tid]) >= 2}
poi_list = sorted(poi_set)
n_states = len(poi_set)
n_edge_features = 5
n_node_features = (len(W) - n_states * n_states * n_edge_features) // n_states
#print(len(W), n_states, n_node_features)
#unary_params = W[:n_states * n_node_features].reshape(n_states, n_node_features)
#pw_params = W[n_states * n_node_features:].reshape((n_states, n_states, n_edge_features)) 
unary_params = W[:-n_edge_features]
pw_params = W[-n_edge_features:].reshape(n_edge_features)
# duplicate params so that inference procedures work the same way no matter params shared or not
unary_params = np.tile(unary_params, (n_states, 1))
pw_params = np.tile(pw_params, (n_states, n_states, 1))

poi_id_dict, poi_id_rdict = dict(), dict()
for idx, poi in enumerate(poi_list):
    poi_id_dict[poi] = idx
    poi_id_rdict[idx] = poi
    
print('Finished.')

In [ ]:
len(poi_list)

Compute feature scaling parameters


In [ ]:
#%%script false
poi_info = calc_poi_info(sorted(trajid_set), traj_all, poi_all)

traj_list = [traj_dict[k] for k in sorted(trajid_set) if len(traj_dict[k]) >= 2]
node_features_list = Parallel(n_jobs=N_JOBS)\
                     (delayed(calc_node_features)\
                      (tr[0], len(tr), poi_list, poi_info.copy(), poi_clusters=POI_CLUSTERS, \
                       cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST) for tr in traj_list)
edge_features = calc_edge_features(list(trajid_set), poi_list, traj_dict, poi_info.copy())
fdim = node_features_list[0].shape
X_node_all = np.vstack(node_features_list)
#scaler = MaxAbsScaler(copy=False)
scaler = MinMaxScaler(feature_range=(-1,1), copy=False)
scaler.fit(X_node_all)

# edge feature scaling
scaler_edge = MinMaxScaler(feature_range=(-1,1), copy=False)
fdim_edge = edge_features.shape
edge_features = scaler_edge.fit_transform(edge_features.reshape(fdim_edge[0]*fdim_edge[1], -1))
edge_features = edge_features.reshape(fdim_edge)
    

print('Finished.')

In [ ]:
print(poi_info.shape)
print(edge_features.shape)

Generate trajectories


In [ ]:
poi, L = query
X_node_test = calc_node_features(poi, L, poi_list, poi_info.copy(), poi_clusters=POI_CLUSTERS, 
                                 cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
X_node_test = scaler.transform(X_node_test)  # feature scaling
unary_features = X_node_test
pw_features = edge_features.copy()
y_pred = do_inference_listViterbi(poi_id_dict[poi], L, len(poi_set), 
                                  unary_params, pw_params, unary_features, pw_features)
print([poi_id_rdict[p] for p in y_pred])

In [ ]:
y_hat = [2, 1, 6, 21, 20]
y_hat = [2, 1, 6, 20, 21]
#y_hat = [5, 7, 8, 6]
#y_hat = [5, 8, 7, 6]

In [ ]:
score = 0
y = [poi_id_dict[x] for x in y_hat]
for j in range(len(y)-1):
    ss = y[j]
    tt = y[j+1]
    score += np.dot(pw_params[ss, tt], pw_features[ss, tt])
    score += np.dot(unary_params[tt], unary_features[tt])
print(score)

In [ ]:
[traj_dict[x] for x in TRAJ_GROUP_DICT[query]]

In [ ]:
#%%script false
lengths = [3, 4, 5, 6]#, 7]
fake_labels = []
for poi in sorted(poi_list):
    for L in lengths:
        X_node_test = calc_node_features(poi, L, poi_list, poi_info.copy(), poi_clusters=POI_CLUSTERS, \
                                         cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
        X_node_test = scaler.transform(X_node_test)  # feature scaling
        unary_features = X_node_test
        pw_features = edge_features.copy()
        y_pred = do_inference_listViterbi(poi_id_dict[poi], L, len(poi_set), 
                                          unary_params, pw_params, unary_features, pw_features)
        fake_labels.append([poi_id_rdict[p] for p in y_pred])

print('Finished.')

In [ ]:
len(fake_labels)

In [ ]:
#fname = 'fake_labels.pkl'
#if dump_vars == True: pickle.dump(fake_labels, open(fname, 'wb'))

In [ ]:
#vars_equal(pickle.load(open(fname, 'rb')), fake_labels)

Step 2 - Train SSVM on generated dataset

Compute scaling parameters and training features/labels


In [ ]:
def calc_train_data(train_labels, poi_list, poi_info, edge_features, poi_id_dict):
    node_features_all = Parallel(n_jobs=N_JOBS)\
                        (delayed(calc_node_features)\
                         (tr[0], len(tr), poi_list, poi_info, poi_clusters=POI_CLUSTERS, \
                          cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST) for tr in train_labels)
    fdim_train = node_features_all[0].shape
    X_node_train = np.vstack(node_features_all)
    scaler_node = MinMaxScaler(feature_range=(-1,1), copy=False)
    X_node_train = scaler_node.fit_transform(X_node_train)
    X_node_train = X_node_train.reshape(-1, fdim_train[0], fdim_train[1])
    
    assert(len(train_labels) == X_node_train.shape[0])
    X_train = [(X_node_train[k, :, :], edge_features.copy(), 
                (poi_id_dict[train_labels[k][0]], len(train_labels[k]))) for k in range(len(train_labels))]
    y_train = [np.array([poi_id_dict[k] for k in tr]) for tr in train_labels]
    assert(len(X_train) == len(y_train))
    
    return X_train, y_train, scaler_node

Training on generated data


In [ ]:
def train_ssvm(X_train, y_train, C):
    sm = MyModel(inference_fun=do_inference_listViterbi)
    osssvm = OneSlackSSVM(model=sm, C=C, n_jobs=N_JOBS, verbose=0)
    try:
        osssvm.fit(X_train, y_train, initialize=True)
        print('SSVM training finished.')
    except:
        sys.stderr.write('SSVM training FAILED.\n')
    return osssvm

Plot the primal and dual objective value curves


In [ ]:
def plot_obj_curve(ssvm):
    plt.plot(ssvm.objective_curve_, label='dual')
    plt.plot(ssvm.primal_objective_curve_, label='primal')
    plt.legend()

Make prediction


In [ ]:
def predict(ssvm, ps, L, poi_list, poi_info, edge_features, scaler_node, poi_id_dict, poi_id_rdict):
    X_node_test = calc_node_features(ps, L, poi_list, poi_info, poi_clusters=POI_CLUSTERS, 
                                     cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
    X_node_test = scaler_node.transform(X_node_test)
    X_test = [(X_node_test, edge_features, (poi_id_dict[ps], L))]
    y_hat = ssvm.predict(X_test)
    return np.array([poi_id_rdict[p] for p in y_hat[0]])

Compute evaluation metrics


In [ ]:
def evaluation(predictions):
    F1_ssvm = []; pF1_ssvm = []; tau_ssvm = []
    for key in sorted(predictions.keys()):
        F1 = calc_F1(predictions[key]['REAL'], predictions[key]['PRED'])
        pF1 = calc_pairsF1(predictions[key]['REAL'], predictions[key]['PRED'])
        tau = calc_kendalltau(predictions[key]['REAL'], predictions[key]['PRED'])
        F1_ssvm.append(F1); pF1_ssvm.append(pF1); tau_ssvm.append(tau)
    F1_mean = np.mean(F1_ssvm); pF1_mean = np.mean(pF1_ssvm); tau_mean = np.mean(tau_ssvm)
    print('F1 (%.3f, %.3f), pairsF1 (%.3f, %.3f), Tau (%.3f, %.3f)' % \
          (F1_mean, np.std(F1_ssvm)/np.sqrt(len(F1_ssvm)), \
           pF1_mean, np.std(pF1_ssvm)/np.sqrt(len(pF1_ssvm)), \
           tau_mean, np.std(tau_ssvm)/np.sqrt(len(tau_ssvm))))
    return F1_mean, pF1_mean, tau_mean

Train on generated dataset


In [ ]:
#C = 0.3

In [ ]:
%%script false
train_labels = fake_labels.copy()
X_train, y_train, scaler_node = calc_train_data(train_labels, poi_list, poi_info.copy(), 
                                                edge_features.copy(), poi_id_dict.copy())
ssvm = train_ssvm(X_train, y_train, C)

In [ ]:
#plot_obj_curve(ssvm)

Evaluate on training set


In [ ]:
%%script false
predictions = dict()
for label in train_labels:
    y_pred = predict(ssvm, label[0], len(label), poi_list, poi_info.copy(), edge_features.copy(),
                     scaler_node, poi_id_dict, poi_id_rdict)
    predictions[(label[0], len(label))] = {'PRED': y_pred, 'REAL': label}

In [ ]:
%%script false
ret = evaluation(predictions)

Step 3 - Leave-one-out evaluation on generated dataset

FOR STEP 4: Turn off duration-related features


In [ ]:
#poi_info['avgDuration'] = 0.0

In [ ]:
# transition features: [poiCat, popularity, nVisit, avgDuration, clusterID]
#edge_features = edge_features[:, :, [0,1,2,4]]

Choose hyper-parameter C

Choose hyper-parameter C using Monte-Carlo cross validation


In [ ]:
%%script false
num_test = int(len(fake_labels) * MC_PORTION)
best_tau = 0; best_C = 0

In [ ]:
#edge_features = np.zeros_like(edge_features)  # Turn off transition features

In [ ]:
%%script false
np.random.seed(0)
for C in C_SET:
    print('\n--------------- try_C: %f ---------------\n' % C); sys.stdout.flush() 
    F1_test = []; pF1_test = []; tau_test = []
    for t in range(MC_NITER):
        while True:
            indices = np.arange(len(fake_labels))
            np.random.shuffle(indices)
            test_ix = indices[:num_test]
            train_ix = indices[num_test:]
            train_labels = [fake_labels[ix] for ix in train_ix]
            test_labels  = [fake_labels[ix] for ix in test_ix]
            poi_set_ = {p for x in train_labels for p in x}
            if len(poi_set_) == len(poi_list): break
        X_train, y_train, scaler_node = calc_train_data(train_labels, poi_list, poi_info.copy(), 
                                                        edge_features.copy(), poi_id_dict.copy())
        ssvm = train_ssvm(X_train, y_train, C)
        predictions = dict()
        for label in test_labels:
            y_pred = predict(ssvm, label[0], len(label), poi_list, poi_info.copy(), edge_features.copy(), 
                             scaler_node, poi_id_dict, poi_id_rdict)
            predictions[(label[0], len(label))] = {'PRED': y_pred, 'REAL': label}
        F1, pF1, tau = evaluation(predictions)
        F1_test.append(F1); pF1_test.append(pF1); tau_test.append(tau)
    mean_tau = np.mean(tau_test)
    print('mean_tau: %.3f' % mean_tau)
    if mean_tau > best_tau:
        best_tau = mean_tau
        best_C = C
print('\nbest_tau: %.3f, best_C: %.3f' % (best_tau, best_C))

Leave-one-out cross validation


In [ ]:
%%script false
predictions = dict()

In [ ]:
%%script false
for i in range(len(fake_labels)):
    sys.stdout.write('%s ' % str(i+1))
    train_labels = fake_labels[:i] + fake_labels[i+1:]
    X_train, y_train, scaler_node = calc_train_data(train_labels, poi_list, poi_info.copy(), 
                                                    edge_features.copy(), poi_id_dict.copy())
    ssvm = train_ssvm(X_train, y_train, best_C)
    test_label = fake_labels[i]
    y_pred = predict(ssvm, test_label[0], len(test_label), poi_list, poi_info.copy(), edge_features.copy(), 
                     scaler_node, poi_id_dict, poi_id_rdict)
    predictions[(test_label[0], len(test_label))] = {'PRED': y_pred, 'REAL': test_label}

In [ ]:
%%script false
ret = evaluation(predictions)

In [ ]:
%%script false
fname = 'ssvm-orig-feature.pkl'
if dump_vars == True: pickle.dump(predictions, open(fname, 'wb'))

In [ ]:
%%script false
vars_equal(pickle.load(open(fname, 'rb')), predictions)

Turn off the duration-related features one-by-one, and perform step 3 to check whether they help.

Concretely, disable the duration-related POI and transition features from step 1 one at a time, then re-run steps 1 to 3; a minimal sketch follows.
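A minimal sketch of the toggles, assuming the poi_info and edge_features computed in step 1 (the same switches appear as commented cells above); poi_info_ab and edge_features_ab are just local names for this sketch, and each variant is then fed through the step-3 leave-one-out cross validation:

In [ ]:
# Sketch only: disable the duration-related features one at a time.
# transition feature order: [poiCat, popularity, nVisit, avgDuration, clusterID]

# (a) disable the POI-level duration feature
poi_info_ab = poi_info.copy()
poi_info_ab['avgDuration'] = 0.0

# (b) disable the duration-based transition feature
edge_features_ab = edge_features.copy()[:, :, [0, 1, 2, 4]]

# re-run the leave-one-out cross validation of step 3 with each variant in turn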

Step 5 - Compute POI and transition features on the generated data: SSVM vs. RankSVM

Compute features

  • turn off duration-related features.
  • recall that POI popularity is the number of distinct users that visited the POI; since there is only one user in $\mathcal{D}_1$, every POI has a popularity of $1$, and transitions between different popularity buckets (there is only one bucket here) are meaningless.
  • recompute the transition probabilities between nVisit buckets.

In [ ]:
#transmat_visit0, logbins_visit0 = gen_transmat_visit(trajid_set, traj_dict, poi_info)

In [ ]:
%%script false
poi_info_new = calc_poi_info(sorted(trajid_set), traj_all, poi_all)
edge_features_new = calc_edge_features(list(trajid_set), poi_list, traj_dict, poi_info_new.copy())

# reset avgDuration, POI popularity and nVisit for the generated (single-user) data
poi_info_new['avgDuration'] = 0.0
poi_info_new['popularity'] = 1  # only a single user
poi_info_new['nVisit'] = 0
for label in fake_labels:
    for p in label: poi_info_new.loc[p, 'nVisit'] += 1
        
# recompute the nVisit-based transition features (popularity is constant since there is only one user)

# compute binning boundaries
poi_visits = poi_info_new.loc[poi_list, 'nVisit']
expo_visit1 = np.log10(max(1, min(poi_visits)))
expo_visit2 = np.log10(max(poi_visits))
nbins_visit = BIN_CLUSTER
logbins_visit = np.logspace(np.floor(expo_visit1), np.ceil(expo_visit2), nbins_visit+1)
logbins_visit[0] = 0  # deal with underflow
if not (logbins_visit[-1] > poi_info_new['nVisit'].max()):
    logbins_visit[-1] = poi_info_new['nVisit'].max() + 1

# compute transition matrix between different nVisit buckets
nbins = len(logbins_visit) - 1
transmat_visit_cnt = pd.DataFrame(data=np.zeros((nbins, nbins), dtype=np.float),
                                  columns=np.arange(1, nbins+1), index=np.arange(1, nbins+1))
for t in fake_labels:
    for pi in range(len(t)-1):
        p1, p2 = t[pi], t[pi+1]
        assert(p1 in poi_info_new.index and p2 in poi_info_new.index)
        visit1 = poi_info_new.loc[p1, 'nVisit']
        visit2 = poi_info_new.loc[p2, 'nVisit']
        vc1, vc2 = np.digitize([visit1, visit2], logbins_visit)
        if vc1 > nbins or vc2 > nbins: print(p1, visit1, p2, visit2)  # sanity check: bucket indices should not exceed nbins
        transmat_visit_cnt.loc[vc1, vc2] += 1
transmat_visit = normalise_transmat(transmat_visit_cnt)

# compute nvisit based transition features
poi_features = pd.DataFrame(data=np.zeros((len(poi_list), 1)), columns=['nVisit'], index=poi_list)
poi_features.index.name = 'poiID'
poi_features['nVisit'] = np.digitize(poi_info_new.loc[poi_list, 'nVisit'], logbins_visit)
for j in range(len(poi_list)): # NOTE: POI order
    pj = poi_list[j]
    visit = poi_features.loc[pj, 'nVisit']    
    for k in range(len(poi_list)): # NOTE: POI order
        pk = poi_list[k]
        #edge_features_new[j, k, 2] = np.log10(transmat_visit.loc[visit, poi_features.loc[pk, 'nVisit']])
        edge_features_new[j, k, 2] = transmat_visit.loc[visit, poi_features.loc[pk, 'nVisit']]

# transition features: [poiCat, popularity, nVisit, avgDuration, clusterID]
edge_features_new = edge_features_new[:, :, [0, 1, 2, 4]]

# edge feature scaling
scaler_edge = MinMaxScaler(feature_range=(-1,1), copy=False)
fdim_edge = edge_features_new.shape
edge_features_new = scaler_edge.fit_transform(edge_features_new.reshape(fdim_edge[0]*fdim_edge[1], -1))
edge_features_new = edge_features_new.reshape(fdim_edge)
    

print('Finished.')

Plot the transition matrix


In [ ]:
#sns.heatmap(np.log10(transmat_visit0), cmap='BuGn')#, vmin=0, vmax=1)

In [ ]:
#sns.heatmap(np.log10(transmat_visit), cmap='BuGn')#, vmin=0, vmax=1)

Plot the histograms of the number of visits


In [ ]:
#print(logbins_visit0)
#poi_info['nVisit'].hist()

In [ ]:
#print(logbins_visit)
#poi_info_new['nVisit'].hist()

Choose hyper-parameter C


In [ ]:
%%script false
edge_features_new = np.zeros_like(edge_features_new) # Turn off transition features

In [ ]:
%%script false
num_test = int(len(fake_labels) * MC_PORTION)
best_tau = 0; best_C = 0

In [ ]:
%%script false
np.random.seed(0)
for C in C_SET:
    print('\n--------------- try_C: %f ---------------\n' % C); sys.stdout.flush() 
    F1_test = []; pF1_test = []; tau_test = []
    for t in range(MC_NITER):
        while True:
            indices = np.arange(len(fake_labels))
            np.random.shuffle(indices)
            test_ix = indices[:num_test]
            train_ix = indices[num_test:]
            train_labels = [fake_labels[ix] for ix in train_ix]
            test_labels  = [fake_labels[ix] for ix in test_ix]
            poi_set_ = {p for x in train_labels for p in x}
            if len(poi_set_) == len(poi_list): break
        X_train, y_train, scaler_node = calc_train_data(train_labels, poi_list, poi_info_new.copy(), 
                                                        edge_features_new.copy(), poi_id_dict.copy())
        ssvm = train_ssvm(X_train, y_train, C)
        predictions = dict()
        for label in test_labels:
            y_pred = predict(ssvm, label[0], len(label), poi_list, poi_info_new.copy(), edge_features_new.copy(), 
                             scaler_node, poi_id_dict, poi_id_rdict)
            predictions[(label[0], len(label))] = {'PRED': y_pred, 'REAL': label}
        F1, pF1, tau = evaluation(predictions)
        F1_test.append(F1); pF1_test.append(pF1); tau_test.append(tau)
    tau_mean = np.mean(tau_test)
    print('mean_tau: %.3f' % tau_mean)
    if tau_mean > best_tau:
        best_tau = tau_mean
        best_C = C
print('\nbest_tau: %.3f, best_C: %.3f' % (best_tau, best_C))

Leave-one-out cross validation


In [ ]:
%%script false
predictions = dict()

In [ ]:
%%script false
for i in range(len(fake_labels)):
    sys.stdout.write('%s ' % str(i+1))
    train_labels = fake_labels[:i] + fake_labels[i+1:]
    X_train, y_train, scaler_node = calc_train_data(train_labels, poi_list, poi_info_new.copy(), 
                                                    edge_features_new.copy(), poi_id_dict.copy())
    ssvm = train_ssvm(X_train, y_train, best_C)
    test_label = fake_labels[i]
    y_pred = predict(ssvm, test_label[0], len(test_label), poi_list, poi_info_new.copy(), edge_features_new.copy(), 
                     scaler_node, poi_id_dict, poi_id_rdict)
    predictions[(test_label[0], len(test_label))] = {'PRED': y_pred, 'REAL': test_label}

In [ ]:
%%script false
ret = evaluation(predictions)

Performance of RankSVM


In [ ]:
%run 'baseline.ipynb'

In [ ]:
def gen_train_df_new(train_labels, poi_list, poi_info, query_id_dict, poi_clusters, cats, clusters, n_jobs=-1):    
    columns = DF_COLUMNS
    poi_distmat = POI_DISTMAT
    train_trajs = train_labels    
    
    qid_set = sorted(set(query_id_dict.values()))    
    query_id_rdict = dict()
    for k, v in query_id_dict.items(): 
        query_id_rdict[v] = k  # qid --> (start, length)
    
    train_df_list = Parallel(n_jobs=n_jobs)\
                            (delayed(gen_train_subdf_new)(poi, qid_set, poi_info, poi_clusters,
                                                          cats,clusters,query_id_rdict) for poi in poi_list)
                        
    assert(len(train_df_list) > 0)
    df_ = train_df_list[0]
    for j in range(1, len(train_df_list)):
        df_ = df_.append(train_df_list[j], ignore_index=True)            
        
    # set label
    df_.set_index(['queryID', 'poiID'], inplace=True)
    df_['label'] = 0
    for t in train_trajs:
        qid = query_id_dict[(t[0], len(t))]
        for poi in t[1:]:  # do NOT count the startPOI
            df_.loc[(qid, poi), 'label'] += 1

    df_.reset_index(inplace=True)
    return df_

In [ ]:
def gen_train_subdf_new(poi_id, query_id_set, poi_info, poi_clusters, cats, clusters, query_id_rdict):
    assert(isinstance(cats, list))
    assert(isinstance(clusters, list))
    
    columns = DF_COLUMNS
    poi_distmat = POI_DISTMAT
    df_ = pd.DataFrame(index=np.arange(len(query_id_set)), columns=columns)
    
    pop, nvisit = poi_info.loc[poi_id, 'popularity'], poi_info.loc[poi_id, 'nVisit']
    cat, cluster = poi_info.loc[poi_id, 'poiCat'], poi_clusters.loc[poi_id, 'clusterID'] 
    duration = poi_info.loc[poi_id, 'avgDuration']
    
    for j in range(len(query_id_set)):
        qid = query_id_set[j]
        assert(qid in query_id_rdict) # qid --> (start, length)
        (p0, trajLen) = query_id_rdict[qid]
        idx = df_.index[j]
        df_.loc[idx, 'poiID'] = poi_id
        df_.loc[idx, 'queryID'] = qid
        df_.set_value(idx, 'category', tuple((cat == np.array(cats)).astype(np.int) * 2 - 1))
        df_.set_value(idx, 'neighbourhood', tuple((cluster == np.array(clusters)).astype(np.int) * 2 - 1))
        df_.loc[idx, 'popularity'] = LOG_SMALL if pop < 1 else np.log10(pop)
        df_.loc[idx, 'nVisit'] = LOG_SMALL if nvisit < 1 else np.log10(nvisit)
        df_.loc[idx, 'avgDuration'] = LOG_SMALL if duration < 1 else np.log10(duration)
        df_.loc[idx, 'trajLen'] = trajLen
        df_.loc[idx, 'sameCatStart'] = 1 if cat == poi_info.loc[p0, 'poiCat'] else -1
        df_.loc[idx, 'distStart'] = poi_distmat.loc[poi_id, p0]
        df_.loc[idx, 'diffPopStart'] = pop - poi_info.loc[p0, 'popularity']
        df_.loc[idx, 'diffNVisitStart'] = nvisit - poi_info.loc[p0, 'nVisit']
        df_.loc[idx, 'diffDurationStart'] = duration - poi_info.loc[p0, 'avgDuration']
        df_.loc[idx, 'sameNeighbourhoodStart'] = 1 if cluster == poi_clusters.loc[p0, 'clusterID'] else -1
        
    return df_

In [ ]:
def gen_test_df_new(startPOI, nPOI, poi_info, query_id_dict, poi_clusters, cats, clusters):
    assert(isinstance(cats, list))
    assert(isinstance(clusters, list))
    
    columns = DF_COLUMNS
    poi_distmat = POI_DISTMAT
    
    key = (p0, trajLen) = (startPOI, nPOI)
    assert(key in query_id_dict)
    assert(p0 in poi_info.index)
    
    df_ = pd.DataFrame(index=np.arange(poi_info.shape[0]), columns=columns)
    poi_list = sorted(poi_info.index)
    
    qid = query_id_dict[key]
    df_['queryID'] = qid
    df_['label'] = np.random.rand(df_.shape[0]) # label for test data is arbitrary according to libsvm FAQ

    for i in range(df_.index.shape[0]):
        poi = poi_list[i]
        lon, lat = poi_info.loc[poi, 'poiLon'], poi_info.loc[poi, 'poiLat']
        pop, nvisit = poi_info.loc[poi, 'popularity'], poi_info.loc[poi, 'nVisit']
        cat, cluster = poi_info.loc[poi, 'poiCat'], poi_clusters.loc[poi, 'clusterID']
        duration = poi_info.loc[poi, 'avgDuration']
        idx = df_.index[i]
        df_.loc[idx, 'poiID'] = poi
        df_.set_value(idx, 'category', tuple((cat == np.array(cats)).astype(np.int) * 2 - 1))
        df_.set_value(idx, 'neighbourhood', tuple((cluster == np.array(clusters)).astype(np.int) * 2 - 1))
        df_.loc[idx, 'popularity'] = LOG_SMALL if pop < 1 else np.log10(pop)
        df_.loc[idx, 'nVisit'] = LOG_SMALL if nvisit < 1 else np.log10(nvisit)
        df_.loc[idx, 'avgDuration'] = LOG_SMALL if duration < 1 else np.log10(duration)
        df_.loc[idx, 'trajLen'] = trajLen
        df_.loc[idx, 'sameCatStart'] = 1 if cat == poi_info.loc[p0, 'poiCat'] else -1
        df_.loc[idx, 'distStart'] = poi_distmat.loc[poi, p0]
        df_.loc[idx, 'diffPopStart'] = pop - poi_info.loc[p0, 'popularity']
        df_.loc[idx, 'diffNVisitStart'] = nvisit - poi_info.loc[p0, 'nVisit']
        df_.loc[idx, 'diffDurationStart'] = duration - poi_info.loc[p0, 'avgDuration']
        df_.loc[idx, 'sameNeighbourhoodStart'] = 1 if cluster == poi_clusters.loc[p0, 'clusterID'] else -1
        
    return df_

Tune hyper-parameter


In [ ]:
%%script false
num_test = int(len(fake_labels) * 0.2)
best_tau = 0; best_C = 0
query_id_dict = {(tr[0], len(tr)): ix for ix, tr in enumerate(fake_labels)}

In [ ]:
#poi_info_new = calc_poi_info(sorted(trajid_set), traj_all, poi_all)  # Compute features on the original dataset

In [ ]:
%%script false
np.random.seed(0)
for C in C_SET:
    print('\n--------------- try_C: %f ---------------\n' % C); sys.stdout.flush() 
    F1_test = []; pF1_test = []; tau_test = []
    for t in range(MC_NITER):
        while True:
            indices = np.arange(len(fake_labels))
            np.random.shuffle(indices)
            test_ix = indices[:num_test]
            train_ix = indices[num_test:]
            train_labels = [fake_labels[ix] for ix in train_ix]
            test_labels  = [fake_labels[ix] for ix in test_ix]
            poi_set_ = {p for x in train_labels for p in x}
            if len(poi_set_) == len(poi_list): break
        train_df = gen_train_df_new(train_labels, poi_list, poi_info_new.copy(), query_id_dict.copy(), 
                                    poi_clusters=POI_CLUSTERS,cats=POI_CAT_LIST,clusters=POI_CLUSTER_LIST,n_jobs=N_JOBS)
        ranksvm = RankSVM(ranksvm_dir, useLinear=True)
        ranksvm.train(train_df, cost=C)
        
        predictions = dict()
        for label in test_labels:
            test_df = gen_test_df_new(label[0], len(label), poi_info_new.copy(), query_id_dict.copy(), 
                                      poi_clusters=POI_CLUSTERS, cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
            rank_df = ranksvm.predict(test_df)
            rank_df.sort_values(by='rank', ascending=False, inplace=True)
            y_pred = [label[0]] + [p for p in rank_df.index.tolist() if p != label[0]][:len(label)-1]
            predictions[(label[0], len(label))] = {'PRED': y_pred, 'REAL': label}
            
        F1, pF1, tau = evaluation(predictions)
        F1_test.append(F1); pF1_test.append(pF1); tau_test.append(tau)
    tau_mean = np.mean(tau_test)
    print('mean_tau: %.3f' % tau_mean)
    if tau_mean > best_tau:
        best_tau = tau_mean
        best_C = C
print('\nbest_tau: %.3f, best_C: %.3f' % (best_tau, best_C))

In [ ]:
#predictions = dict()

In [ ]:
%%script false
for i in range(len(fake_labels)):
    sys.stdout.write('%s ' % str(i+1))
    train_labels = fake_labels[:i] + fake_labels[i+1:]
    train_df = gen_train_df_new(train_labels, poi_list, poi_info_new.copy(), query_id_dict.copy(),
                                poi_clusters=POI_CLUSTERS, cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST, n_jobs=N_JOBS)
    ranksvm = RankSVM(ranksvm_dir, useLinear=True)
    ranksvm.train(train_df, cost=best_C)
    test_label = fake_labels[i]
    test_df = gen_test_df_new(test_label[0], len(test_label), poi_info_new.copy(), query_id_dict.copy(),
                              poi_clusters=POI_CLUSTERS, cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
    rank_df = ranksvm.predict(test_df)
    rank_df.sort_values(by='rank', ascending=False, inplace=True)
    y_pred = [test_label[0]] + [p for p in rank_df.index.tolist() if p != test_label[0]][:len(test_label)-1]
    predictions[(test_label[0], len(test_label))] = {'PRED': y_pred, 'REAL': test_label}

In [ ]:
#ret = evaluation(predictions)