Prune a real dataset such that there is only one trajectory for each query, then evaluate SSVM and RankSVM on the pruned dataset via leave-one-out cross validation.
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import os, sys, pickle, random
import pandas as pd
import numpy as np
import cvxopt
In [ ]:
random.seed(1234554321)
np.random.seed(123456789)
cvxopt.base.setseed(123456789)
Run notebook generated_data.ipynb to load the data (e.g. traj_dict, TRAJ_GROUP_DICT, poi_info_all, trajid_set_all) and define the helper functions used below.
In [ ]:
%run 'generated_data.ipynb'
In [ ]:
compute_new_features = True
In [ ]:
pruned_labels = []
pruned_trajid_set = []
Prune the dataset such that only the trajectory with the maximum total POI popularity for each query is kept.
In [ ]:
for query in sorted(TRAJ_GROUP_DICT.keys()):
    max_pop = 0
    max_tid = -1
    for tid in TRAJ_GROUP_DICT[query]:
        pop = np.sum([poi_info_all.loc[p, 'popularity'] for p in traj_dict[tid]])
        if pop > max_pop:
            max_pop = pop
            max_tid = tid
    assert(max_tid != -1)
    pruned_labels.append(traj_dict[max_tid])
    pruned_trajid_set.append(max_tid)
In [ ]:
#pruned_labels
POI and transition features are computed either from the pruned dataset (when compute_new_features is True) or from the original dataset.
In [ ]:
if compute_new_features:
    trajid_set = pruned_trajid_set.copy()  # use only the pruned dataset to compute POI/transition features
else:
    trajid_set = trajid_set_all.copy()     # use the original dataset to compute POI/transition features
In [ ]:
poi_info = calc_poi_info(sorted(trajid_set), traj_all, poi_all)
poi_list = sorted({p for t in pruned_labels for p in t})
poi_id_dict = {poi: ix for ix, poi in enumerate(poi_list)}
poi_id_rdict = {ix: poi for ix, poi in enumerate(poi_list)}
query_id_dict = {(t[0], len(t)): ix for ix, t in enumerate(pruned_labels)}
edge_features = calc_edge_features(list(trajid_set), poi_list, traj_dict, poi_info.copy())
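The helpers calc_poi_info and calc_edge_features are defined in generated_data.ipynb. As a rough, hypothetical illustration only (not the notebook's actual implementation), transition features between POI pairs could be as simple as a count matrix of observed transitions in the selected trajectories; count_transitions below is an assumed name used purely for this sketch.
# Hypothetical sketch of count-based transition features; the real
# calc_edge_features may use richer, factorised transition features.
def count_transitions(trajid_set, poi_list, traj_dict):
    poi_ix = {p: i for i, p in enumerate(poi_list)}
    counts = np.zeros((len(poi_list), len(poi_list)))
    for tid in trajid_set:
        traj = traj_dict[tid]
        for p, q in zip(traj[:-1], traj[1:]):   # consecutive POI pairs
            if p in poi_ix and q in poi_ix:
                counts[poi_ix[p], poi_ix[q]] += 1
    return counts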
In [ ]:
print(poi_info.shape[0], len(poi_list))
Train SSVM on the pruned dataset
In [ ]:
C = 0.3
In [ ]:
train_labels = pruned_labels.copy()
X_train, y_train, scaler_node = calc_train_data(train_labels, poi_list, poi_info.copy(),
edge_features.copy(), poi_id_dict.copy())
ssvm = train_ssvm(X_train, y_train, C)
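train_ssvm is also provided by generated_data.ipynb. A minimal sketch of how such a structured SVM could be trained, assuming the pystruct library and that each element of X_train is already a (node_features, edges, edge_features) graph in pystruct's format; this is an illustration under those assumptions, not the notebook's actual helper:
# Hypothetical sketch assuming pystruct; train_ssvm in generated_data.ipynb may differ.
from pystruct.models import EdgeFeatureGraphCRF
from pystruct.learners import OneSlackSSVM

def train_ssvm_sketch(X_train, y_train, C, n_node_features, n_edge_features, n_states):
    model = EdgeFeatureGraphCRF(n_states=n_states,
                                n_features=n_node_features,
                                n_edge_features=n_edge_features)
    learner = OneSlackSSVM(model, C=C, max_iter=1000, verbose=0)
    learner.fit(X_train, y_train)  # cutting-plane training of the structured SVM
    return learner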
In [ ]:
plot_obj_curve(ssvm)
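plot_obj_curve presumably plots the learner's objective values over training iterations; under the same pystruct assumption, a sketch could be:
# Hypothetical sketch: plot the objective values recorded by a pystruct learner.
def plot_obj_curve_sketch(ssvm):
    plt.plot(ssvm.objective_curve_, label='dual objective')
    if hasattr(ssvm, 'primal_objective_curve_'):
        plt.plot(ssvm.primal_objective_curve_, label='primal objective')
    plt.xlabel('iteration')
    plt.ylabel('objective value')
    plt.legend()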
Evaluate on the training set
In [ ]:
predictions = dict()
for label in train_labels:
    y_pred = predict(ssvm, label[0], len(label), poi_list, poi_info.copy(), edge_features.copy(),
                     scaler_node, poi_id_dict, poi_id_rdict)
    predictions[(label[0], len(label))] = {'PRED': y_pred, 'REAL': label}
In [ ]:
ret = evaluation(predictions)
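evaluation is defined in generated_data.ipynb and returns three aggregate scores (F1, pF1, tau). In this line of work these are typically POI-set F1, pairs-F1 (which also rewards correct visiting order), and Kendall's tau over POI rankings. A rough, hypothetical sketch of the first two per-trajectory scores (the actual helper may differ in detail):
# Hypothetical per-trajectory metrics; evaluation() may compute them differently.
def f1_traj(pred, real):
    # F1 on the sets of visited POIs, ignoring order
    overlap = len(set(pred) & set(real))
    return 2.0 * overlap / (len(pred) + len(real))

def pairs_f1_traj(pred, real):
    # F1 on ordered POI pairs, rewarding correct visiting order
    pairs = lambda t: {(t[i], t[j]) for i in range(len(t)) for j in range(i + 1, len(t))}
    p, r = pairs(pred), pairs(real)
    overlap = len(p & r)
    if overlap == 0:
        return 0.0
    precision, recall = overlap / len(p), overlap / len(r)
    return 2 * precision * recall / (precision + recall)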
In [ ]:
edge_features = np.zeros_like(edge_features) # turn off transition features
Choose hyper-parameter C
Choose the SSVM hyper-parameter C using Monte-Carlo cross validation
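The loop below relies on MC_PORTION, MC_NITER, C_SET (and later N_JOBS) being defined by generated_data.ipynb. If they are not, placeholder values along the following lines would be needed; these are assumptions for illustration, not the settings used to obtain the best_C values recorded further down.
# Placeholder values, only needed if generated_data.ipynb does not define them.
MC_PORTION = 0.1                               # fraction of queries held out per Monte-Carlo split
MC_NITER = 5                                   # number of Monte-Carlo repetitions
C_SET = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]   # candidate regularisation constants
N_JOBS = 4                                     # parallel workers for RankSVM feature generation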
In [ ]:
num_test = int(len(pruned_labels) * MC_PORTION)
best_tau = 0; best_C = 0
In [ ]:
np.random.seed(0)
for C in C_SET:
    print('\n--------------- try_C: %f ---------------\n' % C); sys.stdout.flush()
    F1_test = []; pF1_test = []; tau_test = []
    for t in range(MC_NITER):
        # resample the train/test split until every POI appears in some training trajectory
        while True:
            indices = np.arange(len(pruned_labels))
            np.random.shuffle(indices)
            test_ix = indices[:num_test]
            train_ix = indices[num_test:]
            train_labels = [pruned_labels[ix] for ix in train_ix]
            test_labels = [pruned_labels[ix] for ix in test_ix]
            poi_set_ = {p for x in train_labels for p in x}
            if len(poi_set_) == len(poi_list): break
        X_train, y_train, scaler_node = calc_train_data(train_labels, poi_list, poi_info.copy(),
                                                        edge_features.copy(), poi_id_dict.copy())
        ssvm = train_ssvm(X_train, y_train, C)
        predictions = dict()
        for label in test_labels:
            y_pred = predict(ssvm, label[0], len(label), poi_list, poi_info.copy(), edge_features.copy(),
                             scaler_node, poi_id_dict, poi_id_rdict)
            predictions[(label[0], len(label))] = {'PRED': y_pred, 'REAL': label}
        F1, pF1, tau = evaluation(predictions)
        F1_test.append(F1); pF1_test.append(pF1); tau_test.append(tau)
    mean_tau = np.mean(tau_test)
    print('mean_tau: %.3f' % mean_tau)
    if mean_tau > best_tau:
        best_tau = mean_tau
        best_C = C
print('\nbest_tau: %.3f, best_C: %.3f' % (best_tau, best_C))
Leave-one-out cross validation
In [ ]:
# best_C: 30  when using the original dataset to compute features
# best_C: 0.1 when using the original dataset to compute features with transition features turned off
# best_C: 0.3 when using the pruned dataset to compute features
# best_C: 0.1 when using the pruned dataset to compute features with transition features turned off
In [ ]:
predictions = dict()
In [ ]:
# make sure the POI features are the same for training and test
poi_info_ = poi_info
edge_features_ = edge_features
for i in range(len(pruned_labels)):
    sys.stdout.write('%s ' % str(i+1))
    train_labels = pruned_labels[:i] + pruned_labels[i+1:]
    test_label = pruned_labels[i]
    #trajid_set_ = trajid_set[:i] + trajid_set[i+1:]
    poi_list_ = sorted({p for x in train_labels for p in x})
    # skip queries whose start POI never occurs in the training trajectories
    if test_label[0] not in poi_list_: continue
    poi_id_dict_ = {poi: ix for ix, poi in enumerate(poi_list_)}
    poi_id_rdict_ = {ix: poi for ix, poi in enumerate(poi_list_)}
    #poi_info_ = calc_poi_info(trajid_set_, traj_all, poi_all)
    #edge_features_ = calc_edge_features(trajid_set_, poi_list, traj_dict, poi_info_.copy())
    X_train, y_train, scaler_node = calc_train_data(train_labels, poi_list_, poi_info_.copy(),
                                                    edge_features_.copy(), poi_id_dict_.copy())
    ssvm = train_ssvm(X_train, y_train, best_C)
    y_pred = predict(ssvm, test_label[0], len(test_label), poi_list_, poi_info_.copy(), edge_features_.copy(),
                     scaler_node, poi_id_dict_, poi_id_rdict_)
    predictions[(test_label[0], len(test_label))] = {'PRED': y_pred, 'REAL': test_label}
In [ ]:
len(predictions)
In [ ]:
ret = evaluation(predictions)
Choose hyper-parameter C
Choose the RankSVM hyper-parameter C using Monte-Carlo cross validation
In [ ]:
num_test = int(len(pruned_labels) * MC_PORTION)
best_tau = 0; best_C = 0
In [ ]:
np.random.seed(0)
for C in C_SET:
    print('\n--------------- try_C: %f ---------------\n' % C); sys.stdout.flush()
    F1_test = []; pF1_test = []; tau_test = []
    for t in range(MC_NITER):
        # resample the train/test split until every POI appears in some training trajectory
        while True:
            indices = np.arange(len(pruned_labels))
            np.random.shuffle(indices)
            test_ix = indices[:num_test]
            train_ix = indices[num_test:]
            train_labels = [pruned_labels[ix] for ix in train_ix]
            test_labels = [pruned_labels[ix] for ix in test_ix]
            poi_set_ = {p for x in train_labels for p in x}
            if len(poi_set_) == len(poi_list): break
        train_df = gen_train_df_new(train_labels, poi_list, poi_info.copy(), query_id_dict.copy(),
                                    poi_clusters=POI_CLUSTERS, cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST, n_jobs=N_JOBS)
        ranksvm = RankSVM(ranksvm_dir, useLinear=True)
        ranksvm.train(train_df, cost=C)
        predictions = dict()
        for label in test_labels:
            test_df = gen_test_df_new(label[0], len(label), poi_info.copy(), query_id_dict.copy(),
                                      poi_clusters=POI_CLUSTERS, cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
            rank_df = ranksvm.predict(test_df)
            rank_df.sort_values(by='rank', ascending=False, inplace=True)
            # start from the query's start POI, then take the top-ranked remaining POIs
            y_pred = [label[0]] + [p for p in rank_df.index.tolist() if p != label[0]][:len(label)-1]
            predictions[(label[0], len(label))] = {'PRED': y_pred, 'REAL': label}
        F1, pF1, tau = evaluation(predictions)
        F1_test.append(F1); pF1_test.append(pF1); tau_test.append(tau)
    tau_mean = np.mean(tau_test)
    print('mean_tau: %.3f' % tau_mean)
    if tau_mean > best_tau:
        best_tau = tau_mean
        best_C = C
print('\nbest_tau: %.3f, best_C: %.3f' % (best_tau, best_C))
Leave-one-out cross validation
In [ ]:
# best_C: 0.030 when computing the features on the original dataset
# best_C: 0.010 when computing the features on the pruned dataset
In [ ]:
predictions = dict()
In [ ]:
# make sure the POI features are the same for training and test
poi_info_ = poi_info
query_id_dict_ = query_id_dict
for i in range(len(pruned_labels)):
    sys.stdout.write('%s ' % str(i+1))
    train_labels = pruned_labels[:i] + pruned_labels[i+1:]
    #trajid_set_ = trajid_set[:i] + trajid_set[i+1:]
    test_label = pruned_labels[i]
    poi_list_ = sorted({p for x in train_labels for p in x})
    # skip queries whose start POI never occurs in the training trajectories
    if test_label[0] not in poi_list_: continue
    poi_id_dict_ = {poi: ix for ix, poi in enumerate(poi_list_)}
    poi_id_rdict_ = {ix: poi for ix, poi in enumerate(poi_list_)}
    #poi_info_ = calc_poi_info(trajid_set_, traj_all, poi_all)
    train_df = gen_train_df_new(train_labels, poi_list_, poi_info_.copy(), query_id_dict_.copy(),
                                poi_clusters=POI_CLUSTERS, cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST, n_jobs=N_JOBS)
    ranksvm = RankSVM(ranksvm_dir, useLinear=True)
    ranksvm.train(train_df, cost=best_C)
    test_df = gen_test_df_new(test_label[0], len(test_label), poi_info_.copy(), query_id_dict_.copy(),
                              poi_clusters=POI_CLUSTERS, cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
    rank_df = ranksvm.predict(test_df)
    rank_df.sort_values(by='rank', ascending=False, inplace=True)
    # start from the query's start POI, then take the top-ranked remaining POIs
    y_pred = [test_label[0]] + [p for p in rank_df.index.tolist() if p != test_label[0]][:len(test_label)-1]
    predictions[(test_label[0], len(test_label))] = {'PRED': y_pred, 'REAL': test_label}
In [ ]:
len(predictions)
In [ ]:
ret = evaluation(predictions)