Leave-one-out Evaluation of SSVM

Leave-one-query-out evaluation of the (single-/multi-label) SSVM, with the regularisation parameter $C$ tuned by Monte-Carlo cross-validation on the remaining queries.
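
A minimal sketch of this protocol, assuming hypothetical helpers `mc_validate`, `train_model` and `evaluate_query` standing in for the actual SSVM training, prediction and scoring calls used later in this notebook:

In [ ]:
# Sketch only: mc_validate/train_model/evaluate_query are hypothetical stand-ins
# for the SSVM calls below, shown to outline the protocol.
def leave_one_query_out(queries, C_set, mc_niter, mc_portion):
    results = dict()
    for q in queries:
        rest = [p for p in queries if p != q]  # hold out one query at a time
        # inner loop: pick the C with the best Monte-Carlo cross-validation score
        best_C = max(C_set, key=lambda C: mc_validate(C, rest, mc_niter, mc_portion))
        model = train_model(best_C, rest)      # retrain on all remaining queries
        results[q] = evaluate_query(model, q)  # measure performance on the held-out query
    return results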


In [ ]:
%matplotlib inline

import os, sys
import numpy as np
import pandas as pd
import pickle as pkl
import random
import cvxopt
import matplotlib.pyplot as plt

In [ ]:
sys.path.append('src/')

In [ ]:
#import pyximport; pyximport.install()
from inference_lv import do_inference_list_viterbi

In [ ]:
from inference import do_inference_brute_force, do_inference_viterbi, do_inference_greedy
from shared import TrajData, evaluate, do_evaluation
from ssvm import SSVM

In [ ]:
random.seed(1234554321)
np.random.seed(123456789)
cvxopt.base.setseed(123456789)

In [ ]:
N_JOBS = 6         # number of parallel jobs
C_SET = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000, 3000]  # candidate regularisation parameters
MC_PORTION = 0.1   # portion of queries held out for validation in each Monte-Carlo iteration
MC_NITER = 5       # number of Monte-Carlo cross-validation iterations
SSVM_SHARE_PARAMS = True  # share params among POIs/transitions in SSVM
SSVM_MULTI_LABEL = True  # use multi-label SSVM

In [ ]:
dat_ix = 2  # dataset index (selects the suffix in dat_obj.dat_suffix)
data_dir = 'data/data-new'

In [ ]:
def normalised_hamming(seq1, seq2):
    """Normalised Hamming distance between two equal-length sequences."""
    assert(len(seq1) > 1)
    assert(len(seq2) > 1)
    assert(len(seq1) == len(seq2))
    length = len(seq1)
    indicators = [seq1[i] != seq2[i] for i in range(length)]
    return np.sum(indicators) / length
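
For equal-length sequences $s$ and $t$ of length $\ell$, this computes $d_H(s, t) = \frac{1}{\ell} \sum_{i=1}^{\ell} \mathbb{1}[s_i \neq t_i]$; the example below evaluates to $1/3$.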

In [ ]:
s1 = [1, 3, 5]
s2 = [1, 2, 5]

In [ ]:
normalised_hamming(s1, s2)

In [ ]:
def evaluate_hamming(dat_obj, recdict):
    """For each query, compute the minimum normalised Hamming distance between the
    top recommendation and any ground-truth trajectory, grouped by trajectory length."""
    assert(type(dat_obj) == TrajData)
    hamming_dict = dict()
    for key in sorted(recdict.keys()):
        assert(key in dat_obj.TRAJID_GROUP_DICT)
        y_true_list = [dat_obj.traj_dict[tid] for tid in dat_obj.TRAJID_GROUP_DICT[key]]
        y_hat_list = recdict[key]['PRED']
        distance_list = [normalised_hamming(y_hat_list[0], y_true) for y_true in y_true_list]
        length = key[1]  # queries are (startPOI, length) pairs
        distance = np.min(distance_list)
        try:
            hamming_dict[length].append(distance)
        except KeyError:
            hamming_dict[length] = [distance]
    return hamming_dict

In [ ]:
dat_obj = TrajData(dat_ix, data_dir=data_dir)

In [ ]:
dat_obj.dat_suffix

In [ ]:
queries = sorted(dat_obj.TRAJID_GROUP_DICT.keys())
len(queries)

In [ ]:
frank = 'rank-' + dat_obj.dat_suffix[dat_ix] + '.pkl'
fsp = 'ssvm-B-' + dat_obj.dat_suffix[dat_ix] + '.pkl'
fsrpath = 'ssvm-C-' + dat_obj.dat_suffix[dat_ix] + '.pkl'
fsr = 'ssvm-D-' + dat_obj.dat_suffix[dat_ix] + '.pkl'

rank_dict = pkl.load(open(os.path.join(data_dir, frank), 'rb'))
sp_dict = pkl.load(open(os.path.join(data_dir, fsp), 'rb'))
sr_dict = pkl.load(open(os.path.join(data_dir, fsr), 'rb'))
srpath_dict = pkl.load(open(os.path.join(data_dir, fsrpath), 'rb'))

In [ ]:
DEBUG = False
F1_rank, pF1_rank, Tau_rank = do_evaluation(dat_obj, rank_dict, debug=DEBUG)
F1_sp, pF1_sp, Tau_sp = do_evaluation(dat_obj, sp_dict, debug=DEBUG)
F1_sr, pF1_sr, Tau_sr = do_evaluation(dat_obj, sr_dict, debug=DEBUG)
F1_srpath, pF1_srpath, Tau_srpath = do_evaluation(dat_obj, srpath_dict, debug=DEBUG)

In [ ]:
DEBUG = True
do_evaluation(dat_obj, rank_dict, debug=DEBUG)
do_evaluation(dat_obj, sp_dict, debug=DEBUG)
do_evaluation(dat_obj, sr_dict, debug=DEBUG)
do_evaluation(dat_obj, srpath_dict, debug=DEBUG)

In [ ]:
rank_dist = evaluate_hamming(dat_obj, rank_dict)
sp_dist = evaluate_hamming(dat_obj, sp_dict)
sr_dist = evaluate_hamming(dat_obj, sr_dict)
srpath_dist = evaluate_hamming(dat_obj, srpath_dict)

In [ ]:
dist_dicts = [rank_dist, sp_dist, sr_dist, srpath_dist]
titles = ['rank', 'ssvm-B', 'ssvm-D', 'ssvm-C']
fig, axes = plt.subplots(nrows=1, ncols=len(dist_dicts), figsize=[15, 3])
for j in range(len(dist_dicts)):
    X = sorted(dist_dicts[j].keys())   # trajectory lengths
    Y = [dist_dicts[j][x] for x in X]  # distances for each length
    axes[j].boxplot(Y, labels=X, showmeans=True)
    axes[j].set_title(titles[j])

In [ ]:
[len(rank_dist[x]) for x in sorted(rank_dist.keys())]

In [ ]:
[len(sp_dist[x]) for x in sorted(sp_dist.keys())]

In [ ]:
[len(sr_dist[x]) for x in sorted(sr_dist.keys())]

In [ ]:
[len(srpath_dist[x]) for x in sorted(srpath_dist.keys())]

In [ ]:
# queries (length > 2, multiple ground truths) where ssvm-C/D beat rank and ssvm-B on all three metrics
for j in range(len(queries)):
    q = queries[j]
    if q[1] > 2 and len(dat_obj.TRAJID_GROUP_DICT[q]) > 1:
        if max(F1_sr[j], F1_srpath[j]) > max(F1_sp[j], F1_rank[j]) and \
        max(pF1_sr[j], pF1_srpath[j]) > max(pF1_sp[j], pF1_rank[j]) and \
        max(Tau_sr[j], Tau_srpath[j]) > max(Tau_sp[j], Tau_rank[j]):
            print(j, q)

In [ ]:
ix = 16
print([F1_rank[ix], pF1_rank[ix], Tau_rank[ix]])
print([F1_sp[ix], pF1_sp[ix], Tau_sp[ix]])
print([F1_sr[ix], pF1_sr[ix], Tau_sr[ix]])
print([F1_srpath[ix], pF1_srpath[ix], Tau_srpath[ix]])

In [ ]:
query = (10, 3)
dat_obj.TRAJID_GROUP_DICT[query]

In [ ]:
[dat_obj.traj_dict[tid] for tid in dat_obj.TRAJID_GROUP_DICT[query]]

In [ ]:
rank_dict[query]['PRED']

In [ ]:
sp_dict[query]['PRED']

In [ ]:
sr_dict[query]['PRED']

In [ ]:
srpath_dict[query]['PRED']

In [ ]:
%%script false
# group queries by their start POI
startdict = dict()
for q in queries:
    if q[0] in startdict: startdict[q[0]].append(q)
    else: startdict[q[0]] = [q]

In [ ]:
%%script false
%matplotlib inline
import matplotlib.pyplot as plt

X = sorted(startdict.keys())
Y = [len(startdict[q]) for q in X]

plt.plot(X, sorted(Y), '^')

In [ ]:
#basename = 'rand-' + dat_obj.dat_suffix[dat_ix] + '.pkl'
#basename = 'pop-' + dat_obj.dat_suffix[dat_ix] + '.pkl'
#basename = 'rank-' + dat_obj.dat_suffix[dat_ix] + '.pkl'
#basename = 'ssvm-A-' + dat_obj.dat_suffix[dat_ix] + '.pkl'
#basename = 'ssvm-B-' + dat_obj.dat_suffix[dat_ix] + '.pkl'
#basename = 'ssvm-A10-' + dat_obj.dat_suffix[dat_ix] + '.pkl'
#basename = 'ssvm-B10-' + dat_obj.dat_suffix[dat_ix] + '.pkl'
#basename = 'ssvm-C10-' + dat_obj.dat_suffix[dat_ix] + '.pkl'
basename = 'ssvm-D10-' + dat_obj.dat_suffix[dat_ix] + '.pkl'
#basename = 'ssvm-D15-' + dat_obj.dat_suffix[dat_ix] + '.pkl'
#basename = 'ssvm-D-' + dat_obj.dat_suffix[dat_ix] + '-new2.pkl'

recdict = pkl.load(open(os.path.join(dat_obj.data_dir, basename), 'rb'))
#recdict = {q:{k: (v if k != 'PRED' else [recdict[q]['PRED']]) for k,v in recdict[q].items()} for q in recdict}
#pkl.dump(recdict, open(os.path.join(dat_obj.data_dir, basename), 'wb'))
do_evaluation(dat_obj, recdict)

In [ ]:
recdict[query]

In [ ]:
recrank = pkl.load(open(os.path.join(dat_obj.data_dir, 'rank-Glas.pkl'), 'rb'))

In [ ]:
recrank[query]

In [ ]:
%%script false
# count recommendations that contain a repeated POI
cnt = 0
keys = sorted(recdict.keys())
for j in range(len(keys)):
    q = keys[j]
    for y in recdict[q]['PRED']:
        if len(y) > len(set(y)): cnt += 1; print(j); break
#print('total:', cnt)

In [ ]:
#len(set(recdict[keys[138]]['PRED'][0]))

In [ ]:
cfname = 'bestC-D-' + dat_obj.dat_suffix[dat_ix] + '.pkl'

In [ ]:
fssvm = os.path.join(data_dir, 'ssvm-D10-' + dat_obj.dat_suffix[dat_ix] + '.pkl')
fssvm

In [ ]:
cdict = pkl.load(open(os.path.join(data_dir, cfname), 'rb'))

In [ ]:
len(cdict)

In [ ]:
#cdict

In [ ]:
queries = sorted(dat_obj.TRAJID_GROUP_DICT.keys())
cnt = 1
recdict_ssvm = dict()
#for query in queries:
qix = np.random.choice(np.arange(len(queries)), 5, replace=False)  # sample 5 distinct queries for a quick run
for query in [queries[ix] for ix in qix]:
    if query not in cdict:
        sys.stderr.write('query %s does not exist in hyper-parameter dictionary.\n' % str(query))
        cnt += 1
        continue

    ps, L = query
    
    trajid_set_i = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[query]
    poi_info_i = dat_obj.calc_poi_info(list(trajid_set_i))  
    poi_set_i = {p for tid in trajid_set_i for p in dat_obj.traj_dict[tid] if len(dat_obj.traj_dict[tid]) >= 2}
    if ps not in poi_set_i: 
        sys.stderr.write('start POI of query %s does not exist in training set.\n' % str(query))
        continue
        
    
    best_C = cdict[query]
    print('\n--------------- %d/%d, Query: (%d, %d), Best_C: %f ---------------\n' % (cnt, len(queries), ps, L, best_C))
    sys.stdout.flush()
    
    ssvm = SSVM(inference_train=do_inference_list_viterbi, inference_pred=do_inference_list_viterbi, 
                dat_obj=dat_obj, share_params=SSVM_SHARE_PARAMS, multi_label=SSVM_MULTI_LABEL, 
                C=best_C, poi_info=poi_info_i, debug=True)
    if ssvm.train(sorted(trajid_set_i), n_jobs=N_JOBS) == True:
        y_hat_list = ssvm.predict(ps, L)
        print(cnt, y_hat_list)
        if y_hat_list is not None:
            recdict_ssvm[(ps, L)] = {'PRED': y_hat_list, 'MODEL': ssvm, 'C': ssvm.C}
    cnt += 1
#    break

In [ ]:
#do_evaluation(dat_obj, recdict_ssvm)

In [ ]:
pkl.dump(recdict_ssvm, open(fssvm, 'wb'))

In [ ]:
rec = pkl.load(open(fssvm, 'rb'))

In [ ]:
query = queries[0]
assert(query in rec)

In [ ]:
ssvm1_1 = rec[query]['MODEL']

In [ ]:
ssvm1_1.predict(1, 8)

Nested cross-validation: the outer loop is leave-one-query-out over the test queries, and the inner loop tunes $C$ by Monte-Carlo cross-validation on the remaining queries.
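
Distilled, the inner loop keeps the $C$ with the highest mean Kendall's $\tau$ over the Monte-Carlo splits; a sketch of the selection rule implemented in the next cell, with a hypothetical `mc_validate` standing in for its train/predict/score steps:

In [ ]:
# Sketch only: mc_validate is a hypothetical stand-in for one Monte-Carlo
# train/predict/score iteration of the cell below.
best_C, best_Tau = 1, 0
for ssvm_C in C_SET:
    mean_Tau = np.mean([mc_validate(ssvm_C) for _ in range(MC_NITER)])
    if mean_Tau > best_Tau:
        best_Tau, best_C = mean_Tau, ssvm_C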


In [ ]:
inference_methods = [do_inference_brute_force, do_inference_list_viterbi, do_inference_viterbi, do_inference_greedy]
methods_suffix = ['bruteForce', 'listViterbi', 'viterbi', 'greedy']

In [ ]:
method_ix_train = 1  # listViterbi
method_ix_pred = 1   # listViterbi

In [ ]:
%%script false
recdict_ssvm = dict()
cnt = 1
keys = sorted(dat_obj.TRAJID_GROUP_DICT.keys())

# outer loop to evaluate the test performance by cross validation
#for i in range(len(keys)):
for i in range(2):  # only the first two queries for a quick run; use the commented loop above for all
    ps, L = keys[i]
    best_C = 1
    #best_F1 = 0; best_pF1 = 0
    best_Tau = 0
    keys_cv = keys[:i] + keys[i+1:]
    
    # use all training+validation set to compute POI features, 
    # make sure features do NOT change for training and validation
    trajid_set_i = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
    poi_info_i = dat_obj.calc_poi_info(list(trajid_set_i))
    
    poi_set_i = {p for tid in trajid_set_i for p in dat_obj.traj_dict[tid] if len(dat_obj.traj_dict[tid]) >= 2}
    if ps not in poi_set_i: 
        sys.stderr.write('start POI of query %s does not exist in training set.\n' % str(keys[i]))
        continue
    
    # tune regularisation constant C
    for ssvm_C in C_SET:
        print('\n--------------- try_C: %f ---------------\n' % ssvm_C); sys.stdout.flush() 
        F1_ssvm = []; pF1_ssvm = []; Tau_ssvm = []        
        
        # inner loop: estimate the performance of a model with a given C by Monte-Carlo cross-validation
        for mc in range(MC_NITER):
            poi_list = []
            while True: # make sure the start POIs of the sampled test queries also appear in the training set
                rand_ix = np.arange(len(keys_cv)); np.random.shuffle(rand_ix)
                test_ix = rand_ix[:int(MC_PORTION*len(rand_ix))]
                assert(len(test_ix) > 0)
                trajid_set_train = set(dat_obj.trajid_set_all) - dat_obj.TRAJID_GROUP_DICT[keys[i]]
                for j in test_ix: 
                    trajid_set_train = trajid_set_train - dat_obj.TRAJID_GROUP_DICT[keys_cv[j]]
                poi_set = {p for tid in sorted(trajid_set_train) for p in dat_obj.traj_dict[tid] \
                           if len(dat_obj.traj_dict[tid]) >= 2}
                good_partition = True
                for j in test_ix: 
                    if keys_cv[j][0] not in poi_set: good_partition = False; break
                if good_partition == True: 
                    poi_list = sorted(poi_set)
                    break

            # train
            ssvm = SSVM(inference_train=inference_methods[method_ix_train], 
                        inference_pred=inference_methods[method_ix_pred], 
                        dat_obj=dat_obj, share_params=SSVM_SHARE_PARAMS, multi_label=SSVM_MULTI_LABEL, 
                        C=ssvm_C, poi_info=poi_info_i.loc[poi_list].copy())
            if ssvm.train(sorted(trajid_set_train), n_jobs=N_JOBS) == True:            
                for j in test_ix: # test
                    ps_cv, L_cv = keys_cv[j]
                    y_hat_list = ssvm.predict(ps_cv, L_cv)
                    if y_hat_list is not None:
                        F1, pF1, tau = evaluate(dat_obj, keys_cv[j], y_hat_list)
                        F1_ssvm.append(F1); pF1_ssvm.append(pF1); Tau_ssvm.append(tau)
            else: 
                for j in test_ix:
                    F1_ssvm.append(0); pF1_ssvm.append(0); Tau_ssvm.append(0)
        
        #mean_F1 = np.mean(F1_ssvm); mean_pF1 = np.mean(pF1_ssvm)
        mean_Tau = np.mean(Tau_ssvm)
        print('mean_Tau: %.3f' % mean_Tau)
        if mean_Tau > best_Tau:
            best_Tau = mean_Tau
            best_C = ssvm_C
            
    #best_C = recdict[(ps, L)]['C']
    print('\n--------------- %d/%d, Query: (%d, %d), Best_C: %f ---------------\n' % (cnt, len(keys), ps, L, best_C))
    sys.stdout.flush()
    
    
    # train model using all examples in training set and measure performance on test set
    ssvm = SSVM(inference_train=inference_methods[method_ix_train], inference_pred=inference_methods[method_ix_pred], 
                dat_obj=dat_obj, share_params=SSVM_SHARE_PARAMS, multi_label=SSVM_MULTI_LABEL, 
                C=best_C, poi_info=poi_info_i)#, debug=True)
    if ssvm.train(sorted(trajid_set_i), n_jobs=N_JOBS) == True:
        y_hat_list = ssvm.predict(ps, L)
        print(cnt, y_hat_list)
        if y_hat_list is not None:
            recdict_ssvm[(ps, L)] = {'PRED': y_hat_list, 'W': ssvm.osssvm.w, 'C': ssvm.C}
        
    cnt += 1; #print_progress(cnt, len(keys)); sys.stdout.flush()

In [ ]:
%%script false
do_evaluation(dat_obj, recdict_ssvm)

In [ ]:
%%script false
fssvm = os.path.join(dat_obj.data_dir, 'ssvm-' + methods_suffix[method_ix_train] + '-' + \
                     methods_suffix[method_ix_pred] + '-' + dat_obj.dat_suffix[dat_ix] + '-ml.pkl')
pkl.dump(recdict_ssvm, open(fssvm, 'wb'))