Prune a real dataset such that there is only one trajectory for each query, then evaluate SSVM and RankSVM on the pruned dataset via leave-one-out cross validation.
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import os, sys, pickle, random
import pandas as pd
import numpy as np
import cvxopt
In [ ]:
random.seed(1234554321)
np.random.seed(123456789)
cvxopt.base.setseed(123456789)
Run notebook generated_data.ipynb to load the data (e.g. traj_dict, TRAJ_GROUP_DICT, poi_info_all, trajid_set_all) and define the helper functions used below.
In [ ]:
%run 'generated_data.ipynb'
In [ ]:
compute_new_features = True
In [ ]:
pruned_labels = []
pruned_trajid_set = []
Prune the dataset such that only the trajectory with the maximum total POI popularity for each query is kept.
In [ ]:
for query in sorted(TRAJ_GROUP_DICT.keys()):
    max_pop = 0
    max_tid = -1
    for tid in TRAJ_GROUP_DICT[query]:
        pop = np.sum([poi_info_all.loc[p, 'popularity'] for p in traj_dict[tid]])
        if pop > max_pop:
            max_pop = pop
            max_tid = tid
    assert(max_tid != -1)
    pruned_labels.append(traj_dict[max_tid])
    pruned_trajid_set.append(max_tid)
In [ ]:
#pruned_labels
POI and transition features are computed either from the pruned dataset (when compute_new_features is True) or from the original dataset.
In [ ]:
if compute_new_features:
    trajid_set = pruned_trajid_set.copy()  # use only the pruned dataset to compute POI/transition features
else:
    trajid_set = trajid_set_all.copy()     # use the original dataset to compute POI/transition features
In [ ]:
poi_info = calc_poi_info(sorted(trajid_set), traj_all, poi_all)
poi_list = sorted({p for t in pruned_labels for p in t})
poi_id_dict = {poi: ix for ix, poi in enumerate(poi_list)}
poi_id_rdict = {ix: poi for ix, poi in enumerate(poi_list)}
query_id_dict = {(t[0], len(t)): ix for ix, t in enumerate(pruned_labels)}
edge_features = calc_edge_features(list(trajid_set), poi_list, traj_dict, poi_info.copy())
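The helpers calc_poi_info and calc_edge_features are defined in generated_data.ipynb. As a rough, hypothetical illustration only (not the notebook's actual implementation), transition features between POI pairs could be as simple as a count matrix of observed transitions in the selected trajectories; count_transitions below is an assumed name used purely for this sketch.
# Hypothetical sketch of count-based transition features; the real
# calc_edge_features may use richer, factorised transition features.
def count_transitions(trajid_set, poi_list, traj_dict):
    poi_ix = {p: i for i, p in enumerate(poi_list)}
    counts = np.zeros((len(poi_list), len(poi_list)))
    for tid in trajid_set:
        traj = traj_dict[tid]
        for p, q in zip(traj[:-1], traj[1:]):   # consecutive POI pairs
            if p in poi_ix and q in poi_ix:
                counts[poi_ix[p], poi_ix[q]] += 1
    return counts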
In [ ]:
print(poi_info.shape[0], len(poi_list))
Train SSVM on the pruned dataset
In [ ]:
C = 0.3
In [ ]:
train_labels = pruned_labels.copy()
X_train, y_train, scaler_node = calc_train_data(train_labels, poi_list, poi_info.copy(),
edge_features.copy(), poi_id_dict.copy())
ssvm = train_ssvm(X_train, y_train, C)
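train_ssvm is also provided by generated_data.ipynb. A minimal sketch of how such a structured SVM could be trained, assuming the pystruct library and that each element of X_train is already a (node_features, edges, edge_features) graph in pystruct's format; this is an illustration under those assumptions, not the notebook's actual helper:
# Hypothetical sketch assuming pystruct; train_ssvm in generated_data.ipynb may differ.
from pystruct.models import EdgeFeatureGraphCRF
from pystruct.learners import OneSlackSSVM

def train_ssvm_sketch(X_train, y_train, C, n_node_features, n_edge_features, n_states):
    model = EdgeFeatureGraphCRF(n_states=n_states,
                                n_features=n_node_features,
                                n_edge_features=n_edge_features)
    learner = OneSlackSSVM(model, C=C, max_iter=1000, verbose=0)
    learner.fit(X_train, y_train)  # cutting-plane training of the structured SVM
    return learner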
In [ ]:
plot_obj_curve(ssvm)
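plot_obj_curve presumably plots the learner's objective values over training iterations; under the same pystruct assumption, a sketch could be:
# Hypothetical sketch: plot the objective values recorded by a pystruct learner.
def plot_obj_curve_sketch(ssvm):
    plt.plot(ssvm.objective_curve_, label='dual objective')
    if hasattr(ssvm, 'primal_objective_curve_'):
        plt.plot(ssvm.primal_objective_curve_, label='primal objective')
    plt.xlabel('iteration')
    plt.ylabel('objective value')
    plt.legend()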
Evaluate on the training set
In [ ]:
predictions = dict()
for label in train_labels:
    y_pred = predict(ssvm, label[0], len(label), poi_list, poi_info.copy(), edge_features.copy(),
                     scaler_node, poi_id_dict, poi_id_rdict)
    predictions[(label[0], len(label))] = {'PRED': y_pred, 'REAL': label}
In [ ]:
ret = evaluation(predictions)
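evaluation is defined in generated_data.ipynb and returns three aggregate scores (F1, pF1, tau). In this line of work these are typically POI-set F1, pairs-F1 (which also rewards correct visiting order), and Kendall's tau over POI rankings. A rough, hypothetical sketch of the first two per-trajectory scores (the actual helper may differ in detail):
# Hypothetical per-trajectory metrics; evaluation() may compute them differently.
def f1_traj(pred, real):
    # F1 on the sets of visited POIs, ignoring order
    overlap = len(set(pred) & set(real))
    return 2.0 * overlap / (len(pred) + len(real))

def pairs_f1_traj(pred, real):
    # F1 on ordered POI pairs, rewarding correct visiting order
    pairs = lambda t: {(t[i], t[j]) for i in range(len(t)) for j in range(i + 1, len(t))}
    p, r = pairs(pred), pairs(real)
    overlap = len(p & r)
    if overlap == 0:
        return 0.0
    precision, recall = overlap / len(p), overlap / len(r)
    return 2 * precision * recall / (precision + recall)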
In [ ]:
edge_features = np.zeros_like(edge_features) # turn off transition features
Choose hyper-parameter C
Choose the SSVM hyper-parameter C using Monte-Carlo cross validation
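The loop below relies on MC_PORTION, MC_NITER, C_SET (and later N_JOBS) being defined by generated_data.ipynb. If they are not, placeholder values along the following lines would be needed; these are assumptions for illustration, not the settings used to obtain the best_C values recorded further down.
# Placeholder values, only needed if generated_data.ipynb does not define them.
MC_PORTION = 0.1                               # fraction of queries held out per Monte-Carlo split
MC_NITER = 5                                   # number of Monte-Carlo repetitions
C_SET = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]   # candidate regularisation constants
N_JOBS = 4                                     # parallel workers for RankSVM feature generation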
In [ ]:
num_test = int(len(pruned_labels) * MC_PORTION)
best_tau = 0; best_C = 0
In [ ]:
np.random.seed(0)
for C in C_SET:
    print('\n--------------- try_C: %f ---------------\n' % C); sys.stdout.flush()
    F1_test = []; pF1_test = []; tau_test = []
    for t in range(MC_NITER):
        # resample the train/test split until every POI appears in some training trajectory
        while True:
            indices = np.arange(len(pruned_labels))
            np.random.shuffle(indices)
            test_ix = indices[:num_test]
            train_ix = indices[num_test:]
            train_labels = [pruned_labels[ix] for ix in train_ix]
            test_labels = [pruned_labels[ix] for ix in test_ix]
            poi_set_ = {p for x in train_labels for p in x}
            if len(poi_set_) == len(poi_list): break
        X_train, y_train, scaler_node = calc_train_data(train_labels, poi_list, poi_info.copy(),
                                                        edge_features.copy(), poi_id_dict.copy())
        ssvm = train_ssvm(X_train, y_train, C)
        predictions = dict()
        for label in test_labels:
            y_pred = predict(ssvm, label[0], len(label), poi_list, poi_info.copy(), edge_features.copy(),
                             scaler_node, poi_id_dict, poi_id_rdict)
            predictions[(label[0], len(label))] = {'PRED': y_pred, 'REAL': label}
        F1, pF1, tau = evaluation(predictions)
        F1_test.append(F1); pF1_test.append(pF1); tau_test.append(tau)
    mean_tau = np.mean(tau_test)
    print('mean_tau: %.3f' % mean_tau)
    if mean_tau > best_tau:
        best_tau = mean_tau
        best_C = C
print('\nbest_tau: %.3f, best_C: %.3f' % (best_tau, best_C))
Leave-one-out cross validation
In [ ]:
# best_C: 30  when using the original dataset to compute features
# best_C: 0.1 when using the original dataset to compute features with transition features turned off
# best_C: 0.3 when using the pruned dataset to compute features
# best_C: 0.1 when using the pruned dataset to compute features with transition features turned off
In [ ]:
predictions = dict()
In [ ]:
# make sure the POI features are the same for training and test
poi_info_ = poi_info
edge_features_ = edge_features
for i in range(len(pruned_labels)):
    sys.stdout.write('%s ' % str(i+1))
    train_labels = pruned_labels[:i] + pruned_labels[i+1:]
    test_label = pruned_labels[i]
    #trajid_set_ = trajid_set[:i] + trajid_set[i+1:]
    poi_list_ = sorted({p for x in train_labels for p in x})
    # skip queries whose start POI never occurs in the training trajectories
    if test_label[0] not in poi_list_: continue
    poi_id_dict_ = {poi: ix for ix, poi in enumerate(poi_list_)}
    poi_id_rdict_ = {ix: poi for ix, poi in enumerate(poi_list_)}
    #poi_info_ = calc_poi_info(trajid_set_, traj_all, poi_all)
    #edge_features_ = calc_edge_features(trajid_set_, poi_list, traj_dict, poi_info_.copy())
    X_train, y_train, scaler_node = calc_train_data(train_labels, poi_list_, poi_info_.copy(),
                                                    edge_features_.copy(), poi_id_dict_.copy())
    ssvm = train_ssvm(X_train, y_train, best_C)
    y_pred = predict(ssvm, test_label[0], len(test_label), poi_list_, poi_info_.copy(), edge_features_.copy(),
                     scaler_node, poi_id_dict_, poi_id_rdict_)
    predictions[(test_label[0], len(test_label))] = {'PRED': y_pred, 'REAL': test_label}
In [ ]:
len(predictions)
In [ ]:
ret = evaluation(predictions)
Choose hyper-parameter C
Choose the RankSVM hyper-parameter C using Monte-Carlo cross validation
In [ ]:
num_test = int(len(pruned_labels) * MC_PORTION)
best_tau = 0; best_C = 0
In [ ]:
np.random.seed(0)
for C in C_SET:
    print('\n--------------- try_C: %f ---------------\n' % C); sys.stdout.flush()
    F1_test = []; pF1_test = []; tau_test = []
    for t in range(MC_NITER):
        # resample the train/test split until every POI appears in some training trajectory
        while True:
            indices = np.arange(len(pruned_labels))
            np.random.shuffle(indices)
            test_ix = indices[:num_test]
            train_ix = indices[num_test:]
            train_labels = [pruned_labels[ix] for ix in train_ix]
            test_labels = [pruned_labels[ix] for ix in test_ix]
            poi_set_ = {p for x in train_labels for p in x}
            if len(poi_set_) == len(poi_list): break
        train_df = gen_train_df_new(train_labels, poi_list, poi_info.copy(), query_id_dict.copy(),
                                    poi_clusters=POI_CLUSTERS, cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST, n_jobs=N_JOBS)
        ranksvm = RankSVM(ranksvm_dir, useLinear=True)
        ranksvm.train(train_df, cost=C)
        predictions = dict()
        for label in test_labels:
            test_df = gen_test_df_new(label[0], len(label), poi_info.copy(), query_id_dict.copy(),
                                      poi_clusters=POI_CLUSTERS, cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
            rank_df = ranksvm.predict(test_df)
            rank_df.sort_values(by='rank', ascending=False, inplace=True)
            # start from the query's start POI, then take the top-ranked remaining POIs
            y_pred = [label[0]] + [p for p in rank_df.index.tolist() if p != label[0]][:len(label)-1]
            predictions[(label[0], len(label))] = {'PRED': y_pred, 'REAL': label}
        F1, pF1, tau = evaluation(predictions)
        F1_test.append(F1); pF1_test.append(pF1); tau_test.append(tau)
    tau_mean = np.mean(tau_test)
    print('mean_tau: %.3f' % tau_mean)
    if tau_mean > best_tau:
        best_tau = tau_mean
        best_C = C
print('\nbest_tau: %.3f, best_C: %.3f' % (best_tau, best_C))
Leave-one-out cross validation
In [ ]:
# best_C: 0.030 when computing the features on the original dataset
# best_C: 0.010 when computing the features on the pruned dataset
In [ ]:
predictions = dict()
In [ ]:
# make sure the POI features are the same for training and test
poi_info_ = poi_info
query_id_dict_ = query_id_dict
for i in range(len(pruned_labels)):
    sys.stdout.write('%s ' % str(i+1))
    train_labels = pruned_labels[:i] + pruned_labels[i+1:]
    #trajid_set_ = trajid_set[:i] + trajid_set[i+1:]
    test_label = pruned_labels[i]
    poi_list_ = sorted({p for x in train_labels for p in x})
    # skip queries whose start POI never occurs in the training trajectories
    if test_label[0] not in poi_list_: continue
    poi_id_dict_ = {poi: ix for ix, poi in enumerate(poi_list_)}
    poi_id_rdict_ = {ix: poi for ix, poi in enumerate(poi_list_)}
    #poi_info_ = calc_poi_info(trajid_set_, traj_all, poi_all)
    train_df = gen_train_df_new(train_labels, poi_list_, poi_info_.copy(), query_id_dict_.copy(),
                                poi_clusters=POI_CLUSTERS, cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST, n_jobs=N_JOBS)
    ranksvm = RankSVM(ranksvm_dir, useLinear=True)
    ranksvm.train(train_df, cost=best_C)
    test_df = gen_test_df_new(test_label[0], len(test_label), poi_info_.copy(), query_id_dict_.copy(),
                              poi_clusters=POI_CLUSTERS, cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
    rank_df = ranksvm.predict(test_df)
    rank_df.sort_values(by='rank', ascending=False, inplace=True)
    # start from the query's start POI, then take the top-ranked remaining POIs
    y_pred = [test_label[0]] + [p for p in rank_df.index.tolist() if p != test_label[0]][:len(test_label)-1]
    predictions[(test_label[0], len(test_label))] = {'PRED': y_pred, 'REAL': test_label}
In [ ]:
len(predictions)
In [ ]:
ret = evaluation(predictions)