The goal of this notebook is to design an experiment to check whether the multi-user, multi-label nature of our dataset is a problem for SSVM.
To achieve this goal, we use a trained SSVM to generate a set of self-consistent trajectories (one per start-POI/length query), then retrain SSVM (and a RankSVM baseline) on these generated labels and check how well they can be recovered.
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import os, pickle, random
import pandas as pd
import numpy as np
import cvxopt
import seaborn as sns
In [ ]:
random.seed(1234554321)
np.random.seed(123456789)
cvxopt.base.setseed(123456789)
Run notebook ssvm_ml.ipynb to load the shared data and helper functions.
In [ ]:
%run 'ssvm_ml.ipynb'
In [ ]:
#dump_vars = True
Load trained parameters and prediction results
In [ ]:
#fname = os.path.join(data_dir, 'ssvm-listViterbi-Glas.pkl')
fname = os.path.join(data_dir, 'ssvm-listViterbi-Osak.3.pkl')
In [ ]:
ssvm_lv = pickle.load(open(fname, 'rb')) # a dict: query -> {'PRED': trajectory, 'C': ssvm-c, 'W': model_params}
In [ ]:
query = (2, 5)
#query = (5, 4)
W = ssvm_lv[query]['W']
In [ ]:
ssvm_lv[query]
In [ ]:
#W = np.random.randn(W.shape[0]) # Use a random weight vector to generate trajectories
In [ ]:
#W
In [ ]:
#%%script false
trajid_set = set(trajid_set_all) - TRAJ_GROUP_DICT[query]
poi_set = {p for tid in trajid_set for p in traj_dict[tid] if len(traj_dict[tid]) >= 2}
poi_list = sorted(poi_set)
n_states = len(poi_set)
n_edge_features = 5
n_node_features = (len(W) - n_states * n_states * n_edge_features) // n_states
#print(len(W), n_states, n_node_features)
# per-state (non-shared) parameters:
#unary_params = W[:n_states * n_node_features].reshape(n_states, n_node_features)
#pw_params = W[n_states * n_node_features:].reshape((n_states, n_states, n_edge_features))
# shared parameters: the last n_edge_features entries of W are the pairwise weights, the rest are unary weights
unary_params = W[:-n_edge_features]
pw_params = W[-n_edge_features:]
# duplicate params so that inference procedures work the same way no matter params shared or not
unary_params = np.tile(unary_params, (n_states, 1))
pw_params = np.tile(pw_params, (n_states, n_states, 1))
poi_id_dict, poi_id_rdict = dict(), dict()
for idx, poi in enumerate(poi_list):
poi_id_dict[poi] = idx
poi_id_rdict[idx] = poi
print('Finished.')
In [ ]:
len(poi_list)
Compute feature scaling parameters
In [ ]:
#%%script false
poi_info = calc_poi_info(sorted(trajid_set), traj_all, poi_all)
traj_list = [traj_dict[k] for k in sorted(trajid_set) if len(traj_dict[k]) >= 2]
node_features_list = Parallel(n_jobs=N_JOBS)\
(delayed(calc_node_features)\
(tr[0], len(tr), poi_list, poi_info.copy(), poi_clusters=POI_CLUSTERS, \
cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST) for tr in traj_list)
edge_features = calc_edge_features(list(trajid_set), poi_list, traj_dict, poi_info.copy())
fdim = node_features_list[0].shape
X_node_all = np.vstack(node_features_list)
#scaler = MaxAbsScaler(copy=False)
scaler = MinMaxScaler(feature_range=(-1,1), copy=False)
scaler.fit(X_node_all)
# edge feature scaling
scaler_edge = MinMaxScaler(feature_range=(-1,1), copy=False)
fdim_edge = edge_features.shape
edge_features = scaler_edge.fit_transform(edge_features.reshape(fdim_edge[0]*fdim_edge[1], -1))
edge_features = edge_features.reshape(fdim_edge)
print('Finished.')
In [ ]:
print(poi_info.shape)
print(edge_features.shape)
Generating trajectories
In [ ]:
poi, L = query
X_node_test = calc_node_features(poi, L, poi_list, poi_info.copy(), poi_clusters=POI_CLUSTERS,
cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
X_node_test = scaler.transform(X_node_test) # feature scaling
unary_features = X_node_test
pw_features = edge_features.copy()
y_pred = do_inference_listViterbi(poi_id_dict[poi], L, len(poi_set),
unary_params, pw_params, unary_features, pw_features)
print([poi_id_rdict[p] for p in y_pred])
In [ ]:
#y_hat = [2, 1, 6, 21, 20]
y_hat = [2, 1, 6, 20, 21]  # candidate trajectory to score manually
#y_hat = [5, 7, 8, 6]
#y_hat = [5, 8, 7, 6]
In [ ]:
score = 0
y = [poi_id_dict[x] for x in y_hat]
for j in range(len(y)-1):
ss = y[j]
tt = y[j+1]
score += np.dot(pw_params[ss, tt], pw_features[ss, tt])
score += np.dot(unary_params[tt], unary_features[tt])
print(score)
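For reference, the score computed above is the SSVM score of the candidate trajectory $y$ under the shared parameters,
$$\mathrm{score}(y) = \sum_{j=1}^{|y|-1} \Big[ \langle w_{\mathrm{pw}},\, \phi_{\mathrm{pw}}(y_j, y_{j+1}) \rangle + \langle w_{\mathrm{unary}},\, \phi_{\mathrm{unary}}(y_{j+1}) \rangle \Big],$$
where the unary term of the start POI is omitted (the start is fixed by the query, so its contribution is constant); this should coincide with the objective maximised by do_inference_listViterbi.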
In [ ]:
[traj_dict[x] for x in TRAJ_GROUP_DICT[query]]
In [ ]:
#%%script false
lengths = [3, 4, 5, 6]  # optionally also 7
fake_labels = []
for poi in sorted(poi_list):
for L in lengths:
X_node_test = calc_node_features(poi, L, poi_list, poi_info.copy(), poi_clusters=POI_CLUSTERS, \
cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
X_node_test = scaler.transform(X_node_test) # feature scaling
unary_features = X_node_test
pw_features = edge_features.copy()
y_pred = do_inference_listViterbi(poi_id_dict[poi], L, len(poi_set),
unary_params, pw_params, unary_features, pw_features)
fake_labels.append([poi_id_rdict[p] for p in y_pred])
print('Finished.')
In [ ]:
len(fake_labels)
In [ ]:
#fname = 'fake_labels.pkl'
#if dump_vars == True: pickle.dump(fake_labels, open(fname, 'wb'))
In [ ]:
#vars_equal(pickle.load(open(fname, 'rb')), fake_labels)
Computing scaling parameters and training features/labels
In [ ]:
def calc_train_data(train_labels, poi_list, poi_info, edge_features, poi_id_dict):
node_features_all = Parallel(n_jobs=N_JOBS)\
(delayed(calc_node_features)\
(tr[0], len(tr), poi_list, poi_info, poi_clusters=POI_CLUSTERS, \
cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST) for tr in train_labels)
fdim_train = node_features_all[0].shape
X_node_train = np.vstack(node_features_all)
scaler_node = MinMaxScaler(feature_range=(-1,1), copy=False)
X_node_train = scaler_node.fit_transform(X_node_train)
X_node_train = X_node_train.reshape(-1, fdim_train[0], fdim_train[1])
assert(len(train_labels) == X_node_train.shape[0])
X_train = [(X_node_train[k, :, :], edge_features.copy(),
(poi_id_dict[train_labels[k][0]], len(train_labels[k]))) for k in range(len(train_labels))]
y_train = [np.array([poi_id_dict[k] for k in tr]) for tr in train_labels]
assert(len(X_train) == len(y_train))
return X_train, y_train, scaler_node
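For reference, a sketch of the training-example format produced by calc_train_data (shapes inferred from the cells above; this is what gets passed to OneSlackSSVM/MyModel below):
# X_train[k] = (node_features with shape (n_pois, n_node_features),
# edge_features with shape (n_pois, n_pois, n_edge_features),
# (start_poi_index, trajectory_length))
# y_train[k] = np.array of POI indices for the k-th trajectory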
Training on generated data
In [ ]:
def train_ssvm(X_train, y_train, C):
sm = MyModel(inference_fun=do_inference_listViterbi)
osssvm = OneSlackSSVM(model=sm, C=C, n_jobs=N_JOBS, verbose=0)
try:
osssvm.fit(X_train, y_train, initialize=True)
print('SSVM training finished.')
except Exception:
sys.stderr.write('SSVM training FAILED.\n')
return osssvm
Plot the primal and dual objective value curves
In [ ]:
def plot_obj_curve(ssvm):
plt.plot(ssvm.objective_curve_, label='dual')
plt.plot(ssvm.primal_objective_curve_, label='primal')
plt.legend()
Make prediction
In [ ]:
def predict(ssvm, ps, L, poi_list, poi_info, edge_features, scaler_node, poi_id_dict, poi_id_rdict):
X_node_test = calc_node_features(ps, L, poi_list, poi_info, poi_clusters=POI_CLUSTERS,
cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
X_node_test = scaler_node.transform(X_node_test)
X_test = [(X_node_test, edge_features, (poi_id_dict[ps], L))]
y_hat = ssvm.predict(X_test)
return np.array([poi_id_rdict[p] for p in y_hat[0]])
Compute evaluation metrics
In [ ]:
def evaluation(predictions):
F1_ssvm = []; pF1_ssvm = []; tau_ssvm = []
for key in sorted(predictions.keys()):
F1 = calc_F1(predictions[key]['REAL'], predictions[key]['PRED'])
pF1 = calc_pairsF1(predictions[key]['REAL'], predictions[key]['PRED'])
tau = calc_kendalltau(predictions[key]['REAL'], predictions[key]['PRED'])
F1_ssvm.append(F1); pF1_ssvm.append(pF1); tau_ssvm.append(tau)
F1_mean = np.mean(F1_ssvm); pF1_mean = np.mean(pF1_ssvm); tau_mean = np.mean(tau_ssvm)
print('F1 (%.3f, %.3f), pairsF1 (%.3f, %.3f), Tau (%.3f, %.3f)' % \
(F1_mean, np.std(F1_ssvm)/np.sqrt(len(F1_ssvm)), \
pF1_mean, np.std(pF1_ssvm)/np.sqrt(len(pF1_ssvm)), \
tau_mean, np.std(tau_ssvm)/np.sqrt(len(tau_ssvm))))
return F1_mean, pF1_mean, tau_mean
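A minimal usage sketch of evaluation, with a made-up predictions dict (reusing the trajectories scored earlier) to show the expected structure:
example_predictions = {(2, 5): {'REAL': [2, 1, 6, 21, 20], 'PRED': np.array([2, 1, 6, 20, 21])}}
#evaluation(example_predictions)  # would print the mean and standard error of F1, pairs-F1 and Kendall's tau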
Train on generated dataset
In [ ]:
#C = 0.3
In [ ]:
%%script false
train_labels = fake_labels.copy()
X_train, y_train, scaler_node = calc_train_data(train_labels, poi_list, poi_info.copy(),
edge_features.copy(), poi_id_dict.copy())
ssvm = train_ssvm(X_train, y_train, C)
In [ ]:
#plot_obj_curve(ssvm)
Evaluate on training set
In [ ]:
%%script false
predictions = dict()
for label in train_labels:
y_pred = predict(ssvm, label[0], len(label), poi_list, poi_info.copy(), edge_features.copy(),
scaler_node, poi_id_dict, poi_id_rdict)
predictions[(label[0], len(label))] = {'PRED': y_pred, 'REAL': label}
In [ ]:
%%script false
ret = evaluation(predictions)
FOR STEP 4: Turn off duration-related features
In [ ]:
#poi_info['avgDuration'] = 0.0
In [ ]:
# transition features: [poiCat, popularity, nVisit, avgDuration, clusterID]
#edge_features = edge_features[:, :, [0,1,2,4]]
Choose hyper-parameter C using Monte-Carlo cross validation
In [ ]:
%%script false
num_test = int(len(fake_labels) * MC_PORTION)
best_tau = 0; best_C = 0
In [ ]:
#edge_features = np.zeros_like(edge_features) # Turn off transition features
In [ ]:
%%script false
np.random.seed(0)
for C in C_SET:
print('\n--------------- try_C: %f ---------------\n' % C); sys.stdout.flush()
F1_test = []; pF1_test = []; tau_test = []
for t in range(MC_NITER):
while True:
indices = np.arange(len(fake_labels))
np.random.shuffle(indices)
test_ix = indices[:num_test]
train_ix = indices[num_test:]
train_labels = [fake_labels[ix] for ix in train_ix]
test_labels = [fake_labels[ix] for ix in test_ix]
poi_set_ = {p for x in train_labels for p in x}
if len(poi_set_) == len(poi_list): break
X_train, y_train, scaler_node = calc_train_data(train_labels, poi_list, poi_info.copy(),
edge_features.copy(), poi_id_dict.copy())
ssvm = train_ssvm(X_train, y_train, C)
predictions = dict()
for label in test_labels:
y_pred = predict(ssvm, label[0], len(label), poi_list, poi_info.copy(), edge_features.copy(),
scaler_node, poi_id_dict, poi_id_rdict)
predictions[(label[0], len(label))] = {'PRED': y_pred, 'REAL': label}
F1, pF1, tau = evaluation(predictions)
F1_test.append(F1); pF1_test.append(pF1); tau_test.append(tau)
mean_tau = np.mean(tau_test)
print('mean_tau: %.3f' % mean_tau)
if mean_tau > best_tau:
best_tau = mean_tau
best_C = C
print('\nbest_tau: %.3f, best_C: %.3f' % (best_tau, best_C))
In [ ]:
%%script false
predictions = dict()
In [ ]:
%%script false
for i in range(len(fake_labels)):
sys.stdout.write('%s ' % str(i+1))
train_labels = fake_labels[:i] + fake_labels[i+1:]
X_train, y_train, scaler_node = calc_train_data(train_labels, poi_list, poi_info.copy(),
edge_features.copy(), poi_id_dict.copy())
ssvm = train_ssvm(X_train, y_train, best_C)
test_label = fake_labels[i]
y_pred = predict(ssvm, test_label[0], len(test_label), poi_list, poi_info.copy(), edge_features.copy(),
scaler_node, poi_id_dict, poi_id_rdict)
predictions[(test_label[0], len(test_label))] = {'PRED': y_pred, 'REAL': test_label}
In [ ]:
%%script false
ret = evaluation(predictions)
In [ ]:
%%script false
fname = 'ssvm-orig-feature.pkl'
if dump_vars == True: pickle.dump(predictions, open(fname, 'wb'))
In [ ]:
%%script false
vars_equal(pickle.load(open(fname, 'rb')), predictions)
Turn off duration-related features one by one, and perform step 3 to check whether the duration-related features help.
Concretely, disable the duration-related POI and transition features from step 1 one by one, then run steps 1 to 3 again.
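As a quick reference, the two duration-related switches used in the cells below are (a sketch mirroring the actual code that follows):
#poi_info_new['avgDuration'] = 0.0  # zero out the POI average-duration feature
#edge_features_new = edge_features_new[:, :, [0, 1, 2, 4]]  # drop the avgDuration transition feature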
In [ ]:
#transmat_visit0, logbins_visit0 = gen_transmat_visit(trajid_set, traj_dict, poi_info)
In [ ]:
%%script false
poi_info_new = calc_poi_info(sorted(trajid_set), traj_all, poi_all)
edge_features_new = calc_edge_features(list(trajid_set), poi_list, traj_dict, poi_info_new.copy())
# set POI popularity and nvisit
poi_info_new['avgDuration'] = 0.0
poi_info_new['popularity'] = 1 # only a single user
poi_info_new['nVisit'] = 0
for label in fake_labels:
for p in label: poi_info_new.loc[p, 'nVisit'] += 1
# set popularity (drop it) and nvisit based transition features
# compute binning boundaries
poi_visits = poi_info_new.loc[poi_list, 'nVisit']
expo_visit1 = np.log10(max(1, min(poi_visits)))
expo_visit2 = np.log10(max(poi_visits))
nbins_visit = BIN_CLUSTER
logbins_visit = np.logspace(np.floor(expo_visit1), np.ceil(expo_visit2), nbins_visit+1)
logbins_visit[0] = 0 # deal with underflow
if not (logbins_visit[-1] > poi_info_new['nVisit'].max()):
logbins_visit[-1] = poi_info_new['nVisit'].max() + 1
# compute transition matrix between different nVist buckets
nbins = len(logbins_visit) - 1
transmat_visit_cnt = pd.DataFrame(data=np.zeros((nbins, nbins), dtype=float),
columns=np.arange(1, nbins+1), index=np.arange(1, nbins+1))
for t in fake_labels:
for pi in range(len(t)-1):
p1, p2 = t[pi], t[pi+1]
assert(p1 in poi_info_new.index and p2 in poi_info_new.index)
visit1 = poi_info_new.loc[p1, 'nVisit']
visit2 = poi_info_new.loc[p2, 'nVisit']
vc1, vc2 = np.digitize([visit1, visit2], logbins_visit)
if vc1 > 5 or vc2 > 5: print(p1, visit1, p2, visit2)
transmat_visit_cnt.loc[vc1, vc2] += 1
transmat_visit = normalise_transmat(transmat_visit_cnt)
# compute nvisit based transition features
poi_features = pd.DataFrame(data=np.zeros((len(poi_list), 1)), columns=['nVisit'], index=poi_list)
poi_features.index.name = 'poiID'
poi_features['nVisit'] = np.digitize(poi_info_new.loc[poi_list, 'nVisit'], logbins_visit)
for j in range(len(poi_list)): # NOTE: POI order
pj = poi_list[j]
visit = poi_features.loc[pj, 'nVisit']
for k in range(len(poi_list)): # NOTE: POI order
pk = poi_list[k]
#edge_features_new[j, k, 2] = np.log10(transmat_visit.loc[visit, poi_features.loc[pk, 'nVisit']])
edge_features_new[j, k, 2] = transmat_visit.loc[visit, poi_features.loc[pk, 'nVisit']]
# transition features: [poiCat, popularity, nVisit, avgDuration, clusterID]
edge_features_new = edge_features_new[:, :, [0, 1, 2, 4]]
# edge feature scaling
scaler_edge = MinMaxScaler(feature_range=(-1,1), copy=False)
fdim_edge = edge_features_new.shape
edge_features_new = scaler_edge.fit_transform(edge_features_new.reshape(fdim_edge[0]*fdim_edge[1], -1))
edge_features_new = edge_features_new.reshape(fdim_edge)
print('Finished.')
Plot the transition matrix
In [ ]:
#sns.heatmap(np.log10(transmat_visit0), cmap='BuGn')#, vmin=0, vmax=1)
In [ ]:
#sns.heatmap(np.log10(transmat_visit), cmap='BuGn')#, vmin=0, vmax=1)
Plot the histograms of the number of visits
In [ ]:
#print(logbins_visit0)
#poi_info['nVisit'].hist()
In [ ]:
#print(logbins_visit)
#poi_info_new['nVisit'].hist()
In [ ]:
%%script false
edge_features_new = np.zeros_like(edge_features_new) # Turn off transition features
In [ ]:
%%script false
num_test = int(len(fake_labels) * MC_PORTION)
best_tau = 0; best_C = 0
In [ ]:
%%script false
np.random.seed(0)
for C in C_SET:
print('\n--------------- try_C: %f ---------------\n' % C); sys.stdout.flush()
F1_test = []; pF1_test = []; tau_test = []
for t in range(MC_NITER):
while True:
indices = np.arange(len(fake_labels))
np.random.shuffle(indices)
test_ix = indices[:num_test]
train_ix = indices[num_test:]
train_labels = [fake_labels[ix] for ix in train_ix]
test_labels = [fake_labels[ix] for ix in test_ix]
poi_set_ = {p for x in train_labels for p in x}
if len(poi_set_) == len(poi_list): break
X_train, y_train, scaler_node = calc_train_data(train_labels, poi_list, poi_info_new.copy(),
edge_features_new.copy(), poi_id_dict.copy())
ssvm = train_ssvm(X_train, y_train, C)
predictions = dict()
for label in test_labels:
y_pred = predict(ssvm, label[0], len(label), poi_list, poi_info_new.copy(), edge_features_new.copy(),
scaler_node, poi_id_dict, poi_id_rdict)
predictions[(label[0], len(label))] = {'PRED': y_pred, 'REAL': label}
F1, pF1, tau = evaluation(predictions)
F1_test.append(F1); pF1_test.append(pF1); tau_test.append(tau)
tau_mean = np.mean(tau_test)
print('mean_tau: %.3f' % tau_mean)
if tau_mean > best_tau:
best_tau = tau_mean
best_C = C
print('\nbest_tau: %.3f, best_C: %.3f' % (best_tau, best_C))
In [ ]:
%%script false
predictions = dict()
In [ ]:
%%script false
for i in range(len(fake_labels)):
sys.stdout.write('%s ' % str(i+1))
train_labels = fake_labels[:i] + fake_labels[i+1:]
X_train, y_train, scaler_node = calc_train_data(train_labels, poi_list, poi_info_new.copy(),
edge_features_new.copy(), poi_id_dict.copy())
ssvm = train_ssvm(X_train, y_train, best_C)
test_label = fake_labels[i]
y_pred = predict(ssvm, test_label[0], len(test_label), poi_list, poi_info_new.copy(), edge_features_new.copy(),
scaler_node, poi_id_dict, poi_id_rdict)
predictions[(test_label[0], len(test_label))] = {'PRED': y_pred, 'REAL': test_label}
In [ ]:
%%script false
ret = evaluation(predictions)
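Run notebook baseline.ipynb to load the RankSVM baseline helpers used below.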
In [ ]:
%run 'baseline.ipynb'
In [ ]:
def gen_train_df_new(train_labels, poi_list, poi_info, query_id_dict, poi_clusters, cats, clusters, n_jobs=-1):
columns = DF_COLUMNS
poi_distmat = POI_DISTMAT
train_trajs = train_labels
qid_set = sorted(set(query_id_dict.values()))
query_id_rdict = dict()
for k, v in query_id_dict.items():
query_id_rdict[v] = k # qid --> (start, length)
train_df_list = Parallel(n_jobs=n_jobs)\
(delayed(gen_train_subdf_new)(poi, qid_set, poi_info, poi_clusters,
cats,clusters,query_id_rdict) for poi in poi_list)
assert(len(train_df_list) > 0)
df_ = pd.concat(train_df_list, ignore_index=True)  # concatenate the per-POI sub-frames (DataFrame.append is deprecated)
# set label
df_.set_index(['queryID', 'poiID'], inplace=True)
df_['label'] = 0
for t in train_trajs:
qid = query_id_dict[(t[0], len(t))]
for poi in t[1:]: # do NOT count the startPOI, it is given by the query
df_.loc[(qid, poi), 'label'] += 1
df_.reset_index(inplace=True)
return df_
In [ ]:
def gen_train_subdf_new(poi_id, query_id_set, poi_info, poi_clusters, cats, clusters, query_id_rdict):
assert(isinstance(cats, list))
assert(isinstance(clusters, list))
columns = DF_COLUMNS
poi_distmat = POI_DISTMAT
df_ = pd.DataFrame(index=np.arange(len(query_id_set)), columns=columns)
pop, nvisit = poi_info.loc[poi_id, 'popularity'], poi_info.loc[poi_id, 'nVisit']
cat, cluster = poi_info.loc[poi_id, 'poiCat'], poi_clusters.loc[poi_id, 'clusterID']
duration = poi_info.loc[poi_id, 'avgDuration']
for j in range(len(query_id_set)):
qid = query_id_set[j]
assert(qid in query_id_rdict) # qid --> (start, length)
(p0, trajLen) = query_id_rdict[qid]
idx = df_.index[j]
df_.loc[idx, 'poiID'] = poi_id
df_.loc[idx, 'queryID'] = qid
df_.at[idx, 'category'] = tuple((cat == np.array(cats)).astype(int) * 2 - 1)
df_.at[idx, 'neighbourhood'] = tuple((cluster == np.array(clusters)).astype(int) * 2 - 1)
df_.loc[idx, 'popularity'] = LOG_SMALL if pop < 1 else np.log10(pop)
df_.loc[idx, 'nVisit'] = LOG_SMALL if nvisit < 1 else np.log10(nvisit)
df_.loc[idx, 'avgDuration'] = LOG_SMALL if duration < 1 else np.log10(duration)
df_.loc[idx, 'trajLen'] = trajLen
df_.loc[idx, 'sameCatStart'] = 1 if cat == poi_info.loc[p0, 'poiCat'] else -1
df_.loc[idx, 'distStart'] = poi_distmat.loc[poi_id, p0]
df_.loc[idx, 'diffPopStart'] = pop - poi_info.loc[p0, 'popularity']
df_.loc[idx, 'diffNVisitStart'] = nvisit - poi_info.loc[p0, 'nVisit']
df_.loc[idx, 'diffDurationStart'] = duration - poi_info.loc[p0, 'avgDuration']
df_.loc[idx, 'sameNeighbourhoodStart'] = 1 if cluster == poi_clusters.loc[p0, 'clusterID'] else -1
return df_
In [ ]:
def gen_test_df_new(startPOI, nPOI, poi_info, query_id_dict, poi_clusters, cats, clusters):
assert(isinstance(cats, list))
assert(isinstance(clusters, list))
columns = DF_COLUMNS
poi_distmat = POI_DISTMAT
key = (p0, trajLen) = (startPOI, nPOI)
assert(key in query_id_dict)
assert(p0 in poi_info.index)
df_ = pd.DataFrame(index=np.arange(poi_info.shape[0]), columns=columns)
poi_list = sorted(poi_info.index)
qid = query_id_dict[key]
df_['queryID'] = qid
df_['label'] = np.random.rand(df_.shape[0]) # label for test data is arbitrary according to libsvm FAQ
for i in range(df_.index.shape[0]):
poi = poi_list[i]
lon, lat = poi_info.loc[poi, 'poiLon'], poi_info.loc[poi, 'poiLat']
pop, nvisit = poi_info.loc[poi, 'popularity'], poi_info.loc[poi, 'nVisit']
cat, cluster = poi_info.loc[poi, 'poiCat'], poi_clusters.loc[poi, 'clusterID']
duration = poi_info.loc[poi, 'avgDuration']
idx = df_.index[i]
df_.loc[idx, 'poiID'] = poi
df_.at[idx, 'category'] = tuple((cat == np.array(cats)).astype(int) * 2 - 1)
df_.at[idx, 'neighbourhood'] = tuple((cluster == np.array(clusters)).astype(int) * 2 - 1)
df_.loc[idx, 'popularity'] = LOG_SMALL if pop < 1 else np.log10(pop)
df_.loc[idx, 'nVisit'] = LOG_SMALL if nvisit < 1 else np.log10(nvisit)
df_.loc[idx, 'avgDuration'] = LOG_SMALL if duration < 1 else np.log10(duration)
df_.loc[idx, 'trajLen'] = trajLen
df_.loc[idx, 'sameCatStart'] = 1 if cat == poi_info.loc[p0, 'poiCat'] else -1
df_.loc[idx, 'distStart'] = poi_distmat.loc[poi, p0]
df_.loc[idx, 'diffPopStart'] = pop - poi_info.loc[p0, 'popularity']
df_.loc[idx, 'diffNVisitStart'] = nvisit - poi_info.loc[p0, 'nVisit']
df_.loc[idx, 'diffDurationStart'] = duration - poi_info.loc[p0, 'avgDuration']
df_.loc[idx, 'sameNeighbourhoodStart'] = 1 if cluster == poi_clusters.loc[p0, 'clusterID'] else -1
return df_
Tune hyper-parameter C for the RankSVM baseline
In [ ]:
%%script false
num_test = int(len(fake_labels) * 0.2)
best_tau = 0; best_C = 0
query_id_dict = {(tr[0], len(tr)): ix for ix, tr in enumerate(fake_labels)}
In [ ]:
#poi_info_new = calc_poi_info(sorted(trajid_set), traj_all, poi_all) # Compute features on the original dataset
In [ ]:
%%script false
np.random.seed(0)
for C in C_SET:
print('\n--------------- try_C: %f ---------------\n' % C); sys.stdout.flush()
F1_test = []; pF1_test = []; tau_test = []
for t in range(MC_NITER):
while True:
indices = np.arange(len(fake_labels))
np.random.shuffle(indices)
test_ix = indices[:num_test]
train_ix = indices[num_test:]
train_labels = [fake_labels[ix] for ix in train_ix]
test_labels = [fake_labels[ix] for ix in test_ix]
poi_set_ = {p for x in train_labels for p in x}
if len(poi_set_) == len(poi_list): break
train_df = gen_train_df_new(train_labels, poi_list, poi_info_new.copy(), query_id_dict.copy(),
poi_clusters=POI_CLUSTERS,cats=POI_CAT_LIST,clusters=POI_CLUSTER_LIST,n_jobs=N_JOBS)
ranksvm = RankSVM(ranksvm_dir, useLinear=True)
ranksvm.train(train_df, cost=C)
predictions = dict()
for label in test_labels:
test_df = gen_test_df_new(label[0], len(label), poi_info_new.copy(), query_id_dict.copy(),
poi_clusters=POI_CLUSTERS, cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
rank_df = ranksvm.predict(test_df)
rank_df.sort_values(by='rank', ascending=False, inplace=True)
y_pred = [label[0]] + [p for p in rank_df.index.tolist() if p != label[0]][:len(label)-1]
predictions[(label[0], len(label))] = {'PRED': y_pred, 'REAL': label}
F1, pF1, tau = evaluation(predictions)
F1_test.append(F1); pF1_test.append(pF1); tau_test.append(tau)
tau_mean = np.mean(tau_test)
print('mean_tau: %.3f' % tau_mean)
if tau_mean > best_tau:
best_tau = tau_mean
best_C = C
print('\nbest_tau: %.3f, best_C: %.3f' % (best_tau, best_C))
In [ ]:
#predictions = dict()
In [ ]:
%%script false
for i in range(len(fake_labels)):
sys.stdout.write('%s ' % str(i+1))
train_labels = fake_labels[:i] + fake_labels[i+1:]
train_df = gen_train_df_new(train_labels, poi_list, poi_info_new.copy(), query_id_dict.copy(),
poi_clusters=POI_CLUSTERS, cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST, n_jobs=N_JOBS)
ranksvm = RankSVM(ranksvm_dir, useLinear=True)
ranksvm.train(train_df, cost=best_C)
test_label = fake_labels[i]
test_df = gen_test_df_new(test_label[0], len(test_label), poi_info_new.copy(), query_id_dict.copy(),
poi_clusters=POI_CLUSTERS, cats=POI_CAT_LIST, clusters=POI_CLUSTER_LIST)
rank_df = ranksvm.predict(test_df)
rank_df.sort_values(by='rank', ascending=False, inplace=True)
y_pred = [test_label[0]] + [p for p in rank_df.index.tolist() if p != test_label[0]][:len(test_label)-1]
predictions[(test_label[0], len(test_label))] = {'PRED': y_pred, 'REAL': test_label}
In [ ]:
#ret = evaluation(predictions)