In [101]:
%matplotlib inline
import os
import re
import math
import random
import pickle
import pandas as pd
import numpy as np
import scipy.stats
#from numba import jit
from datetime import datetime
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
In [102]:
nfeatures = 8 # number of features
EPS = 1e-12 # smoothing constant, to deal with zero probabilities
random.seed(123456789) # control the random choice when splitting training/testing sets
In [103]:
data_dir = 'data/data-ijcai15'
#fvisit = os.path.join(data_dir, 'userVisits-Osak.csv')
#fcoord = os.path.join(data_dir, 'photoCoords-Osak.csv')
#fvisit = os.path.join(data_dir, 'userVisits-Glas.csv')
#fcoord = os.path.join(data_dir, 'photoCoords-Glas.csv')
#fvisit = os.path.join(data_dir, 'userVisits-Edin.csv')
#fcoord = os.path.join(data_dir, 'photoCoords-Edin.csv')
fvisit = os.path.join(data_dir, 'userVisits-Toro.csv')
fcoord = os.path.join(data_dir, 'photoCoords-Toro.csv')
In [104]:
suffix = fvisit.split('-')[-1].split('.')[0]
In [105]:
visits = pd.read_csv(fvisit, sep=';')
coords = pd.read_csv(fcoord, sep=';')
# merge data frames according to column 'photoID'
assert(visits.shape[0] == coords.shape[0])
traj = pd.merge(visits, coords, on='photoID')
traj.head()
Out[105]:
In [106]:
num_photo = traj['photoID'].unique().shape[0]
num_user = traj['userID'].unique().shape[0]
num_poi = traj['poiID'].unique().shape[0]
num_seq = traj['seqID'].unique().shape[0]
pd.DataFrame({'#photo': num_photo, '#user': num_user, '#poi': num_poi, '#seq': num_seq, \
'#photo/user': num_photo/num_user, '#seq/user': num_seq/num_user}, index=[str(suffix)])
Out[106]:
In [107]:
#plt.figure(figsize=[15, 5])
#plt.xlabel('Longitude')
#plt.ylabel('Latitude')
#plt.scatter(traj['photoLon'], traj['photoLat'], marker='+')
Compute POI (Longitude, Latitude) as the average coordinates of the assigned photos.
In [108]:
poi_coords = traj[['poiID', 'photoLon', 'photoLat']].groupby('poiID').mean()
poi_coords.reset_index(inplace=True)
poi_coords.rename(columns={'photoLon':'poiLon', 'photoLat':'poiLat'}, inplace=True)
Extract POI category and visiting frequency.
In [109]:
poi_catfreq = traj[['poiID', 'poiTheme', 'poiFreq']].groupby('poiID').first()
poi_catfreq.reset_index(inplace=True)
In [110]:
poi_all = pd.merge(poi_catfreq, poi_coords, on='poiID')
poi_all.set_index('poiID', inplace=True)
#poi_all.to_csv(fpoi, index=True)
In [111]:
seq_all = traj[['userID', 'seqID', 'poiID', 'dateTaken']].copy().groupby(['userID', 'seqID', 'poiID'])\
.agg([np.min, np.max, np.size])
seq_all.columns = seq_all.columns.droplevel()
seq_all.reset_index(inplace=True)
seq_all.rename(columns={'amin':'arrivalTime', 'amax':'departureTime', 'size':'#photo'}, inplace=True)
seq_all['poiDuration(sec)'] = seq_all['departureTime'] - seq_all['arrivalTime']
#seq_all.head()
In [112]:
seq_user = seq_all[['userID', 'seqID', 'poiID']].copy().groupby(['userID', 'seqID']).agg(np.size)
seq_user.reset_index(inplace=True)
seq_user.rename(columns={'poiID':'seqLen'}, inplace=True) # the count column keeps the name 'poiID' after agg(np.size)
seq_user.set_index('seqID', inplace=True)
#seq_user.head()
In [113]:
seq_len = seq_all[['seqID', 'poiID']].copy().groupby('seqID').agg(np.size)
seq_len.reset_index(inplace=True)
seq_len.rename(columns={'poiID':'seqLen'}, inplace=True)
#seq_len.head()
In [114]:
seq_stats = seq_all[['seqID', '#photo', 'poiDuration(sec)']].copy().groupby('seqID').agg(np.sum)
seq_stats.reset_index(inplace=True)
#seq_stats.rename(columns={'poiDuration(sec)':'totalPoiDuration(sec)'}, inplace=True)
seq_stats = pd.merge(seq_len, seq_stats, on='seqID')
seq_stats['poiDuration(sec)'] /= 60
seq_stats.rename(columns={'poiDuration(sec)':'totalPoiDuration(min)'}, inplace=True)
seq_stats.set_index('seqID', inplace=True)
#seq_stats.head()
In [115]:
#ax = seq_stats['seqLen'].hist(bins=50)
#ax.set_xlabel('sequence length')
#ax.set_ylim([0.1, 1e4])
#ax.set_yscale('log')
In [116]:
#ax = seq_stats['#photo'].hist(bins=50)
#ax.set_xlabel('#photo for sequence')
#ax.set_ylim([0.1, 1e4])
#ax.set_yscale('log')
In [117]:
#ax = seq_stats['totalPoiDuration(min)'].hist(bins=100)
#ax.set_xlabel('totalPoiDuration(min)')
#ax.set_ylim([0.1, 1e4])
#ax.set_yscale('log')
#ax.set_xscale('log')
Sequences with length {3, 4, 5}
In [118]:
#seq_stats = seq_stats[seq_stats['seqLen'].isin({3, 4, 5})]
#ax = seq_stats['totalPoiDuration(min)'].hist(bins=50)
#ax.set_xlabel('totalPoiDuration(min)')
#ax.set_ylim([0.1, 1e4])
#ax.set_yscale('log')
In [119]:
def extract_seq(seqid, seq_all):
seqi = seq_all[seq_all['seqID'] == seqid].copy()
seqi.sort(columns=['arrivalTime'], ascending=True, inplace=True)
return seqi['poiID'].tolist()
By design, $\text{Pr}(\text{POI}_i \to \text{POI}_j)$ should become larger if any of $\text{Pr}(\text{Cat}_i \to \text{Cat}_j)$, $\text{Pr}(\text{Pop}_i \to \text{Pop}_j)$ and $\text{Pr}(\text{Dist}_i \to \text{Dist}_j)$ becomes larger (with the other factors held fixed).
So how should these probabilities be combined?
Both addition and multiplication seem able to serve this purpose; what is the difference?
For the addition case,
\begin{equation} \text{Pr}(\text{POI}_i \to \text{POI}_j) = \frac{ \text{Pr}(\text{Cat}_{\text{POI}_i} \to \text{Cat}_{\text{POI}_j}) + \text{Pr}(\text{Pop}_{\text{POI}_i} \to \text{Pop}_{\text{POI}_j}) + \text{Pr}(\text{Dist}_{\text{POI}_{i-1} \to \text{POI}_i} \to \text{Dist}_{\text{POI}_{i} \to \text{POI}_j}) } {Z_i} \end{equation}where $\text{POI}_{i-1}$ is the direct predecessor of $\text{POI}_i$ in a trajectory and $Z_i$ is a normalizing constant.
For the multiplication case,
\begin{equation} \text{Pr}(\text{POI}_i \to \text{POI}_j) = \frac{ \text{Pr}(\text{Cat}_{\text{POI}_i} \to \text{Cat}_{\text{POI}_j}) \times \text{Pr}(\text{Pop}_{\text{POI}_i} \to \text{Pop}_{\text{POI}_j}) \times \text{Pr}(\text{Dist}_{\text{POI}_{i-1} \to \text{POI}_i} \to \text{Dist}_{\text{POI}_{i} \to \text{POI}_j}) } {Z_i} \end{equation}similarly, $\text{POI}_{i-1}$ is the direct predecessor of $\text{POI}_i$ in a trajectory and $Z_i$ is again a normalizing constant.
It is important to note that, by design, $\text{Pr}(\text{POI}_i \to \text{POI}_j)$ should be very small if any of $\text{Pr}(\text{Cat}_i \to \text{Cat}_j)$, $\text{Pr}(\text{Pop}_i \to \text{Pop}_j)$ and $\text{Pr}(\text{Dist}_i \to \text{Dist}_j)$ is very small. In the extreme case, if any of the three probabilities is $0$, then $\text{Pr}(\text{POI}_i \to \text{POI}_j)$ should also be $0$, because the event "transition from POI$_i$ to POI$_j$" is impossible.
From the equation of the addition case, it is clear that the addition rule contradicts this requirement, while the multiplication rule is consistent with it.
Intuitively, the addition rule can make an unlikely event appear much more likely; in particular, it can make an impossible event possible.
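As a quick illustration (toy numbers, not part of the original computation), suppose the category factor of a candidate transition is $0$ while the other two factors are moderately large; the addition rule still produces a sizeable unnormalised score, whereas the multiplication rule correctly yields $0$:
p_cat, p_pop, p_dist = 0.0, 0.6, 0.5  # hypothetical factor probabilities
score_add = p_cat + p_pop + p_dist    # 1.1 -- an impossible transition still scores highly
score_mul = p_cat * p_pop * p_dist    # 0.0 -- impossibility is preserved
print(score_add, score_mul)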
We model the transition probabilities between POI categories, i.e. $\text{Pr}(\text{Cat}_{\text{POI}_i} \to \text{Cat}_{\text{POI}_j})$.
We first count the number of transitions, then normalise each row; to deal with zeros, if a row contains a zero cell we add a small number to every cell of that row (i.e. $0.1$ times the minimum nonzero value of that row) before normalising.
In [120]:
def calc_poi_cat_transmat(seqid_set, poi_all, seq_all):
poi_cats = poi_all['poiTheme'].unique().tolist()
poi_cats.sort()
poi_cat_transmat = pd.DataFrame(data=np.zeros((len(poi_cats), len(poi_cats)), dtype=np.float), \
index=poi_cats, columns=poi_cats)
for seqid in seqid_set:
seq = extract_seq(seqid, seq_all)
for j in range(len(seq)-1):
poi1 = seq[j]
poi2 = seq[j+1]
cat1 = poi_all.loc[poi1, 'poiTheme']
cat2 = poi_all.loc[poi2, 'poiTheme']
poi_cat_transmat.loc[cat1, cat2] += 1
return poi_cat_transmat
In [121]:
def normalise_transmat(transmat):
assert(isinstance(transmat, pd.DataFrame))
for row in range(transmat.index.shape[0]):
nonzeroidx = np.nonzero(transmat.iloc[row])[0].tolist()
if len(nonzeroidx) < transmat.columns.shape[0]:
minv = np.min(transmat.iloc[row, nonzeroidx])
EPS = 0.1 * minv # row-specific smooth factor
#zeroidx = list(set(range(len(transmat.columns))) - set(nonzeroidx))
#transmat.iloc[row, zeroidx] = EPS
transmat.iloc[row] += EPS
rowsum = np.sum(transmat.iloc[row])
assert(rowsum > 0)
transmat.iloc[row] /= rowsum
return transmat
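A quick toy check of the row smoothing (a hypothetical cell, not from the original run): a row with a zero cell gets $0.1$ times its minimum nonzero count added to every cell before normalisation, while a row without zeros is simply normalised.
toy = pd.DataFrame([[4.0, 0.0], [3.0, 1.0]], index=['A', 'B'], columns=['A', 'B'])
print(normalise_transmat(toy))
# row 'A' becomes [4.4, 0.4]/4.8 ~ [0.917, 0.083], row 'B' becomes [0.75, 0.25]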
In [122]:
poi_cat_transmat = calc_poi_cat_transmat(seq_all['seqID'].unique(), poi_all, seq_all)
poi_cat_transmat
Out[122]:
In [123]:
poi_cat_transmat = normalise_transmat(poi_cat_transmat)
poi_cat_transmat
Out[123]:
In [124]:
poi_cat_transmat_log = np.log10(poi_cat_transmat)
poi_cat_transmat_log
Out[124]:
We model the transition probabilities between POI popularities, i.e. $\text{Pr}(\text{Pop}_{\text{POI}_i} \to \text{Pop}_{\text{POI}_j})$, after discretizing the POI popularities.
What are the general criteria for data discretization? See the Wikipedia article "Discretization of continuous features".
TODO: Improve the discretization using Fayyad's minimum description length principle (MDLP) described in this paper.
NOTE: MDLP is a supervised discretization method and requires class labels, which are not available here. We need an unsupervised discretization method; well-known options include equal width, equal frequency, clustering based, etc.
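For reference, pandas provides both of the simple strategies out of the box: pd.cut performs equal-width binning and pd.qcut performs equal-frequency (quantile) binning. A minimal sketch on the POI popularities (illustration only; the notebook builds its bins explicitly below, and this assumes the popularity values give distinct quantile edges):
pop = poi_all['poiFreq']
equal_width = pd.cut(pop, bins=9, labels=False)  # 9 bins of equal width over [min, max]
equal_freq = pd.qcut(pop, q=9, labels=False)     # ~equal number of POIs per bin
print(equal_width.value_counts().sort_index())
print(equal_freq.value_counts().sort_index())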
Try different discretization strategies and choose the best one using the following metrics:
actSeqRank(Top%) $= 100 \times \frac{\text{rank of actual sequence}~S_a}{\text{number of sequences enumerated w.r.t.}~S_a}$
rankedTop5(%) $= 100 \times \frac{\sum \delta(\text{rank of actual sequence} \le 5)}{\text{number of actual sequences}}$, where $\delta(\text{True}) = 1$ and $\delta(\text{False}) = 0$,
i.e. the percentage of actual sequences $S_a$ ranked in the top 5 among all sequences enumerated with respect to $S_a$.
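A compact sketch of how these two metrics can be computed from a results table with one row per actual sequence and columns 'actSeqRank' and '#enumSeq' (this mirrors the computation at the end of this notebook):
def rank_metrics(df):
    top_pct = 100 * df['actSeqRank'] / df['#enumSeq']  # actSeqRank(Top%) per actual sequence
    top5_pct = 100 * np.mean(df['actSeqRank'] <= 5)    # rankedTop5(%)
    return top_pct.mean(), top_pct.std(), top5_pct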
discretization strategy | actSeqRank(Top%) Toronto (smaller is better) | rankedTop5(%) Toronto (larger is better) | actSeqRank(Top%) Glasgow | rankedTop5(%) Glasgow |
--- | --- | --- | --- | --- |
quantile, nbins=2 | mean: 18.496, std: 24.746 | 131/266 = 49.25% | mean: 16.328, std: 19.763 | 55/100 = 55.00% |
quantile, nbins=3 | mean: 16.049, std: 19.873 | 131/266 = 49.25% | mean: 16.188, std: 18.117 | 52/100 = 52.00% |
quantile, nbins=4 | mean: 16.478, std: 21.002 | 132/266 = 49.62% | mean: 15.130, std: 18.643 | 58/100 = 58.00% |
quantile, nbins=5 | mean: 16.811, std: 22.159 | 127/266 = 47.74% | mean: 14.316, std: 17.294 | 61/100 = 61.00% |
quantile, nbins=6 | mean: 15.049, std: 18.722 | 128/266 = 48.12% | mean: 14.581, std: 17.522 | 58/100 = 58.00% |
quantile, nbins=7 | mean: 15.831, std: 20.140 | 130/266 = 48.87% | mean: 14.777, std: 17.175 | 59/100 = 59.00% |
quantile, nbins=8 | mean: 15.567, std: 19.811 | 135/266 = 50.75% | mean: 15.019, std: 17.101 | 56/100 = 56.00% |
quantile, nbins=9 | mean: 14.556, std: 17.937 | 138/266 = 51.88% | mean: 14.255, std: 16.184 | 64/100 = 64.00% |
quantile, nbins=10 | mean: 14.398, std: 17.694 | 135/266 = 50.75% | mean: 13.771, std: 16.043 | 62/100 = 62.00% |
quantile, nbins=11 | mean: 14.874, std: 18.915 | 138/266 = 51.88% | mean: 13.568, std: 15.522 | 63/100 = 63.00% |
quantile, nbins=12 | mean: 13.491, std: 16.760 | 145/266 = 54.51% | mean: 11.464, std: 13.896 | 66/100 = 66.00% |
quantile, nbins=13 | mean: 13.528, std: 16.964 | 144/266 = 54.14% | mean: 12.416, std: 13.081 | 63/100 = 63.00% |
quantile, nbins=14 | mean: 13.279, std: 15.960 | 142/266 = 53.38% | mean: 12.596, std: 14.259 | 65/100 = 65.00% |
quantile, nbins=15 | mean: 12.784, std: 15.796 | 151/266 = 56.77% | mean: 12.209, std: 13.527 | 66/100 = 66.00% |
equalWidth, nbins=2 | mean: 24.761, std: 29.356 | 115/266 = 43.23% | mean: 23.705, std: 24.304 | 47/100 = 47.00% |
equalWidth, nbins=3 | mean: 21.841, std: 28.366 | 127/266 = 47.74% | mean: 23.225, std: 25.065 | 47/100 = 47.00% |
equalWidth, nbins=4 | mean: 23.636, std: 29.252 | 120/266 = 45.11% | mean: 19.416, std: 22.059 | 53/100 = 53.00% |
equalWidth, nbins=5 | mean: 20.154, std: 26.713 | 125/266 = 46.99% | mean: 19.201, std: 23.250 | 54/100 = 54.00% |
equalWidth, nbins=6 | mean: 19.129, std: 25.530 | 125/266 = 46.99% | mean: 19.907, std: 23.132 | 53/100 = 53.00% |
equalWidth, nbins=7 | mean: 16.922, std: 20.619 | 126/266 = 47.37% | mean: 17.030, std: 19.898 | 57/100 = 57.00% |
equalWidth, nbins=8 | mean: 18.240, std: 22.739 | 121/266 = 45.49% | mean: 19.977, std: 23.118 | 54/100 = 54.00% |
equalWidth, nbins=9 | mean: 17.507, std: 22.659 | 129/266 = 48.50% | mean: 18.183, std: 22.388 | 55/100 = 55.00% |
equalWidth, nbins=10 | mean: 18.196, std: 23.321 | 125/266 = 46.99% | mean: 18.158, std: 21.773 | 52/100 = 52.00% |
equalWidth, nbins=11 | mean: 17.711, std: 23.398 | 131/266 = 49.25% | mean: 15.555, std: 19.722 | 60/100 = 60.00% |
equalWidth, nbins=12 | mean: 17.389, std: 22.718 | 132/266 = 49.62% | mean: 15.174, std: 17.188 | 60/100 = 60.00% |
equalWidth, nbins=13 | mean: 15.820, std: 20.856 | 141/266 = 53.01% | mean: 15.184, std: 18.837 | 61/100 = 61.00% |
equalWidth, nbins=14 | mean: 15.681, std: 19.559 | 136/266 = 51.13% | mean: 12.922, std: 15.119 | 63/100 = 63.00% |
equalWidth, nbins=15 | mean: 15.712, std: 19.794 | 136/266 = 51.13% | mean: 14.274, std: 16.828 | 61/100 = 61.00% |
another, bins=[0, 500, 1500, 10000] | mean: 15.977, std: 20.373 | 132/266 = 49.62% | mean: 20.427, std: 21.596 | 50/100 = 50.00% |
In [125]:
rank_mean_Toro = [18.496, 16.049, 16.478, 16.811, 15.049, 15.831, 15.567, 14.556, 14.398, 14.874, 13.491, 13.528, \
13.279, 12.784, 24.761, 21.841, 23.636, 20.154, 19.129, 16.922, 18.240, 17.507, 18.196, 17.711, \
17.389, 15.820, 15.681, 15.712, 15.977]
rank_std_Toro = [24.746, 19.873, 21.002, 22.159, 18.722, 20.140, 19.811, 17.937, 17.694, 18.915, 16.760, 16.964, \
15.960, 15.796, 29.356, 28.366, 29.252, 26.713, 25.530, 20.619, 22.739, 22.659, 23.321, 23.398, \
22.718, 20.856, 19.559, 19.794, 20.373]
rank_mean_Glas = [16.328, 16.188, 15.130, 14.316, 14.581, 14.777, 15.019, 14.255, 13.771, 13.568, 11.464, 12.416, \
12.596, 12.209, 23.705, 23.225, 19.416, 19.201, 19.907, 17.030, 19.977, 18.183, 18.158, 15.555, \
15.174, 15.184, 12.922, 14.274, 20.427]
rank_std_Glas = [19.763, 18.117, 18.643, 17.294, 17.522, 17.175, 17.101, 16.184, 16.043, 15.522, 13.896, 13.081, \
14.259, 13.527, 24.304, 25.065, 22.059, 23.250, 23.132, 19.898, 23.118, 22.388, 21.773, 19.722, \
17.188, 18.837, 15.119, 16.828, 21.596]
rank_top5_Toro = [49.25, 49.25, 49.62, 47.74, 48.12, 48.87, 50.75, 51.88, 50.75, 51.88, 54.51, 54.14, \
53.38, 56.77, 43.23, 47.74, 45.11, 46.99, 46.99, 47.37, 45.49, 48.50, 46.99, 49.25, \
49.62, 53.01, 51.13, 51.13, 49.62]
rank_top5_Glas = [55, 52, 58, 61, 58, 59, 56, 64, 62, 63, 66, 63, 65, 66, 47, 47, 53, 54, 53, 57, 54, 55, 52, 60, 60, \
61, 63, 61, 50]
#xlabels = ['qbins='+str(x) for x in range(2, 16)]
#xlabels.extend(['ebins='+str(x) for x in range(2, 16)])
#xlabels.append('another')
xlabels = [x for x in range(2, len(rank_mean_Toro)+2)]
In [126]:
plt.figure(figsize=[15, 10])
plt.xlim([0, len(rank_mean_Toro)+2])
plt.ylim([-20, 100])
plt.errorbar(xlabels, rank_mean_Toro, rank_std_Toro, linestyle='--', marker='s', label='errorbar_Toronto')
plt.errorbar(xlabels, rank_mean_Glas, rank_std_Glas, linestyle='--', marker='s', label='errorbar_Glasgow')
plt.plot(xlabels, rank_top5_Toro, linestyle='--', marker='s', label='top5_Toronto')
plt.plot(xlabels, rank_top5_Glas, linestyle='--', marker='s', label='top5_Glasgow')
plt.legend()
#idx = 10
idx = 7
plt.annotate('choose', xy=(xlabels[idx], rank_top5_Glas[idx]), xytext=(xlabels[idx], rank_top5_Glas[idx]+15), \
arrowprops=dict(facecolor='green', shrink=0.1))
Out[126]:
It can be seen from the above plot that discretization based on equal frequency (quantiles) performs better than discretization based on equal width. To balance complexity and accuracy, we choose "quantile, nbins=9".
In [127]:
poi_all['poiFreq'].get_values()
Out[127]:
In [128]:
poi_all['poiFreq'].describe()
Out[128]:
In [129]:
poi_all['poiFreq'].quantile([.25, .5, .75]).tolist()
Out[129]:
In [130]:
ax = poi_all['poiFreq'].hist(bins=10)
ax.set_xlabel('POI Popularity')
ax.set_ylabel('#POI')
Out[130]:
In [131]:
#plt.plot(np.ones(poi_all.index.shape[0]), np.sqrt(poi_all['poiFreq']), marker='+')
In [132]:
nbins = 9
quantiles = np.round(np.linspace(0, 1, nbins+1), 2)[1:-1]
quantiles
Out[132]:
In [133]:
bins_qt = [0]
bins_qt.extend(poi_all['poiFreq'].quantile(quantiles))
bins_qt.append(poi_all['poiFreq'].max() + 1)
bins_qt
Out[133]:
In [134]:
#nbins = 15
#inter = round((poi_all['poiFreq'].max() + 1) / nbins)
#bins_ew = [x*inter for x in range(nbins)]
#bins_ew.append(poi_all['poiFreq'].max() + 1)
#bins_ew
In [135]:
#bins = np.linspace(0, 10000, 11)
#bins = np.logspace(0, 4, 5)
#bins = [1, 100, 500, 1000, 2000, 5000]
#bins_ef = [0, 500, 1500, 10000]
In [136]:
bins_pop = bins_qt
#bins_pop = bins_ew
#bins_pop = bins_ef
In [137]:
ax = poi_all['poiFreq'].hist(bins=bins_pop)
ax.set_xlabel('POI Popularity')
ax.set_ylabel('#POI')
ax.set_xscale('log')
In [138]:
poi_all['popClass'] = np.digitize(poi_all['poiFreq'].get_values(), bins_pop)
#poi_all
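Note the indexing convention of np.digitize: with increasing bin edges, a value $v$ with $\text{bins}[k-1] \le v < \text{bins}[k]$ is assigned class $k$, so the classes here run from 1 to nbins. A small illustration with made-up edges:
toy_bins = [0, 10, 100, 1000]  # hypothetical bin edges
print(np.digitize([5, 10, 99, 500], toy_bins))  # -> [1 2 2 3]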
In [139]:
def calc_poi_pop_transmat(seqid_set, poi_all, seq_all):
pop_class = poi_all['popClass'].unique().tolist()
pop_class.sort()
poi_pop_transmat = pd.DataFrame(data=np.zeros((len(pop_class), len(pop_class)), dtype=np.float), \
index=pop_class, columns=pop_class)
for seqid in seqid_set:
seq = extract_seq(seqid, seq_all)
for j in range(len(seq)-1):
poi1 = seq[j]
poi2 = seq[j+1]
pc1 = poi_all.loc[poi1, 'popClass']
pc2 = poi_all.loc[poi2, 'popClass']
poi_pop_transmat.loc[pc1, pc2] += 1
return poi_pop_transmat
In [140]:
poi_pop_transmat = calc_poi_pop_transmat(seq_all['seqID'].unique(), poi_all, seq_all)
poi_pop_transmat
Out[140]:
In [141]:
poi_pop_transmat = normalise_transmat(poi_pop_transmat)
poi_pop_transmat
Out[141]:
In [142]:
poi_pop_transmat_log = np.log10(poi_pop_transmat)
poi_pop_transmat_log
Out[142]:
We model the transition probabilities between POI-pair distances, i.e. $\text{Pr}(\text{Dist}_{\text{POI}_{i-1} \to \text{POI}_i} \to \text{Dist}_{\text{POI}_{i} \to \text{POI}_j})$, after discretizing the POI-pair distances.
Compute the POI-pair distance for each pair that is observed in the dataset.
In [143]:
def calc_dist(longitude1, latitude1, longitude2, latitude2):
"""Calculate the distance (unit: km) between two places on earth"""
# convert degrees to radians
lon1 = math.radians(longitude1)
lat1 = math.radians(latitude1)
lon2 = math.radians(longitude2)
lat2 = math.radians(latitude2)
radius = 6371.009 # mean earth radius is 6371.009km, en.wikipedia.org/wiki/Earth_radius#Mean_radius
# The haversine formula, en.wikipedia.org/wiki/Great-circle_distance
dlon = math.fabs(lon1 - lon2)
dlat = math.fabs(lat1 - lat2)
return 2 * radius * math.asin( math.sqrt( \
(math.sin(0.5*dlat))**2 + math.cos(lat1) * math.cos(lat2) * (math.sin(0.5*dlon))**2 ))
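As a quick sanity check (not a cell from the original run), one degree of latitude should correspond to roughly 111 km:
print(calc_dist(0.0, 0.0, 0.0, 1.0))  # ~111.2 km, i.e. 2*pi*6371.009/360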
In [144]:
def calc_obs_poipair_distmat(seqid_set, poi_all, seq_all):
poi_distmat = pd.DataFrame(data=np.full((poi_all.shape[0], poi_all.shape[0]), np.nan, dtype=np.float), \
index=poi_all.index, columns=poi_all.index)
for seqid in seqid_set:
seq = extract_seq(seqid, seq_all)
if len(seq) < 2: continue
for j in range(len(seq)-1):
poi1 = seq[j]
poi2 = seq[j+1]
if np.isnan(poi_distmat.loc[poi1, poi2]):
dist = calc_dist(poi_all.loc[poi1, 'poiLon'], poi_all.loc[poi1, 'poiLat'], \
poi_all.loc[poi2, 'poiLon'], poi_all.loc[poi2, 'poiLat'])
poi_distmat.loc[poi1, poi2] = dist
poi_distmat.loc[poi2, poi1] = dist
return poi_distmat
In [145]:
poi_distmat = calc_obs_poipair_distmat(seq_all['seqID'].unique(), poi_all, seq_all)
#poi_distmat
We use the same metrics as described above to choose a discretization strategy for POI pair distances.
discretization strategy | actSeqRank(Top%) Toronto (smaller is better) | rankedTop5(%) Toronto (larger is better) | actSeqRank(Top%) Glasgow | rankedTop5(%) Glasgow |
--- | --- | --- | --- | --- |
quantile, nbins=2 | mean: 15.156, std: 16.911 | 120/266 = 45.11% | mean: 14.354, std: 16.541 | 60/100 = 60.00% |
quantile, nbins=3 | mean: 14.917, std: 17.484 | 130/266 = 48.87% | mean: 14.450, std: 16.173 | 60/100 = 60.00% |
quantile, nbins=4 | mean: 14.389, std: 17.527 | 139/266 = 52.26% | mean: 14.255, std: 16.184 | 64/100 = 64.00% |
quantile, nbins=5 | mean: 13.645, std: 16.550 | 148/266 = 55.64% | mean: 14.085, std: 16.159 | 63/100 = 63.00% |
quantile, nbins=6 | mean: 14.299, std: 17.550 | 138/266 = 51.88% | mean: 13.156, std: 14.436 | 63/100 = 63.00% |
quantile, nbins=7 | mean: 12.689, std: 14.674 | 140/266 = 52.63% | mean: 12.755, std: 13.681 | 62/100 = 62.00% |
quantile, nbins=8 | mean: 12.996, std: 15.606 | 143/266 = 53.76% | mean: 11.716, std: 13.003 | 64/100 = 64.00% |
quantile, nbins=9 | mean: 12.510, std: 14.946 | 140/266 = 52.63% | mean: 11.355, std: 12.893 | 66/100 = 66.00% |
quantile, nbins=10 | mean: 12.467, std: 14.549 | 146/266 = 54.89% | mean: 11.181, std: 12.107 | 68/100 = 68.00% |
quantile, nbins=11 | mean: 12.548, std: 14.758 | 143/266 = 53.76% | mean: 10.214, std: 10.872 | 68/100 = 68.00% |
quantile, nbins=12 | mean: 11.980, std: 13.883 | 147/266 = 55.26% | mean: 10.041, std: 10.895 | 73/100 = 73.00% |
quantile, nbins=13 | mean: 12.170, std: 13.983 | 140/266 = 52.63% | mean: 9.345, std: 9.759 | 73/100 = 73.00% |
quantile, nbins=14 | mean: 11.384, std: 12.787 | 153/266 = 57.52% | mean: 9.008, std: 9.444 | 73/100 = 73.00% |
quantile, nbins=15 | mean: 11.444, std: 12.888 | 151/266 = 56.77% | mean: 8.613, std: 9.052 | 75/100 = 75.00% |
quantile, nbins=16 | mean: 10.932, std: 12.621 | 156/266 = 58.65% | mean: 8.553, std: 9.336 | 74/100 = 74.00% |
quantile, nbins=17 | mean: 10.991, std: 12.950 | 156/266 = 58.65% | mean: 8.025, std: 8.502 | 79/100 = 79.00% |
quantile, nbins=18 | mean: 10.728, std: 12.285 | 155/266 = 58.27% | EMPTY ROW 13 | NaN |
quantile, nbins=19 | mean: 10.754, std: 12.368 | 150/266 = 56.39% | EMPTY ROW 11 | NaN |
quantile, nbins=20 | mean: 10.836, std: 12.383 | 152/266 = 57.14% | mean: 7.922, std: 8.702 | 79/100 = 79.00% |
quantile, nbins=21 | mean: 10.579, std: 12.301 | 162/266 = 60.90% | EMPTY ROW 15 | NaN |
equalWidth, nbins=2 | EMPTY LAST BIN | NaN | EMPTY LAST BIN | NaN |
equalWidth, nbins=3 | EMPTY LAST BIN | NaN | EMPTY LAST TWO BIN | NaN |
equalWidth, nbins=4 | EMPTY LAST TWO BIN | NaN | EMPTY LAST TWO BIN | NaN |
another, bins=[0, 2, 5, 100] | mean: 15.110, std: 16.204 | 117/266 = 43.98% | mean: 14.937, std: 16.630 | 60/100 = 60.00% |
In [146]:
rank_mean_Toro = [15.156, 14.917, 14.389, 13.645, 14.299, 12.689, 12.996, 12.510, 12.467, 12.548, 11.980, \
12.170, 11.384, 11.444, 10.932, 10.991, 10.836, 15.110]
rank_std_Toro = [16.911, 17.484, 17.527, 16.550, 17.550, 14.674, 15.606, 14.946, 14.549, 14.758, 13.883, \
13.983, 12.787, 12.888, 12.621, 12.950, 12.383, 16.204]
rank_mean_Glas = [14.354, 14.450, 14.255, 14.085, 13.156, 12.755, 11.716, 11.355, 11.181, 10.214, 10.041, \
9.345, 9.008, 8.613, 8.553, 8.025, 7.922, 14.937]
rank_std_Glas = [16.541, 16.173, 16.184, 16.159, 14.436, 13.681, 13.003, 12.893, 12.107, 10.872, 10.895, \
9.759, 9.444, 9.052, 9.336, 8.502, 8.702, 16.630]
rank_top5_Toro = [45.11, 48.87, 52.26, 55.64, 51.88, 52.63, 53.76, 52.63, 54.89, 53.76, 55.26, 52.63, 57.52, \
56.77, 58.65, 58.65, 57.14, 43.98]
rank_top5_Glas = [60, 60, 64, 63, 63, 62, 64, 66, 68, 68, 73, 73, 73, 75, 74, 79, 79, 60]
xlabels = [x for x in range(2, len(rank_mean_Toro)+2)]
In [147]:
plt.figure(figsize=[15, 10])
plt.xlim([0, len(rank_mean_Toro)+2])
plt.ylim([-20, 100])
plt.errorbar(xlabels, rank_mean_Toro, rank_std_Toro, linestyle='--', marker='s', label='errorbar_Toronto')
plt.errorbar(xlabels, rank_mean_Glas, rank_std_Glas, linestyle='--', marker='s', label='errorbar_Glasgow')
plt.plot(xlabels, rank_top5_Toro, linestyle='--', marker='s', label='top5_Toronto')
plt.plot(xlabels, rank_top5_Glas, linestyle='--', marker='s', label='top5_Glasgow')
plt.legend()
#idx = 10
idx = 8
plt.annotate('choose', xy=(xlabels[idx], rank_top5_Glas[idx]), xytext=(xlabels[idx], rank_top5_Glas[idx]+15), \
arrowprops=dict(facecolor='green', shrink=0.1))
Out[147]:
Rows that contain NaN were removed before plotting the curves; to balance complexity and accuracy, we choose "quantile, nbins=10".
In [148]:
#distdata = pd.Series([x for x in np.unique(poi_distmat.get_values().flatten()) if not np.isnan(x)])
distdata = pd.Series([poi_distmat.iloc[x, y] \
for x in range(poi_distmat.index.shape[0]) \
for y in range(x+1, poi_distmat.index.shape[0]) \
if not np.isnan(poi_distmat.iloc[x, y])])
In [149]:
distdata.describe()
Out[149]:
In [150]:
ax = distdata.hist(bins=20)
ax.set_xlabel('POI-Pair Distance (km)')
ax.set_ylabel('#POI-Pair')
Out[150]:
In [151]:
nbins = 10
quantiles = np.round(np.linspace(0, 1, nbins+1), 2)[1:-1]
quantiles
Out[151]:
In [152]:
bins_qt = [0]
bins_qt.extend(distdata.quantile(quantiles))
bins_qt.append(10*round(distdata.max()))
bins_qt
Out[152]:
In [153]:
#nbins = 4
#inter = round((round(distdata.max()) + 1) / nbins)
#maxdist = 30 # Toronto, maximum distance among all POI pairs
#maxdist = 46 # Glasgow
#inter = round(maxdist / nbins)
#bins_ew = [x*inter for x in range(nbins)]
#bins_ew.append(maxdist)
#bins_ew
In [154]:
#bins = np.linspace(0, 10, 7)
#bins = np.logspace(0, 2, 4)
#bins = [0, 1, 2, 3, 10]
#bins_a = [0, 2, 5, 100] # walk, ride, drive
In [155]:
#bins_ef = [0, 1.15, 2.25, 100]
In [156]:
bins_dist = bins_qt
#bins_dist = bins_ew
#bins_dist = bins_ef
#bins_dist = bins_a
In [157]:
ax = distdata.ix[np.nonzero(distdata)].hist(bins=bins_dist)
ax.set_xlabel('POI-Pair Distance (km)')
ax.set_ylabel('#POI-Pair')
ax.set_xscale('log')
In [158]:
poi_distclass_mat = pd.DataFrame(data=np.zeros((poi_all.shape[0], poi_all.shape[0]), dtype=np.int), \
index=poi_all.index, columns=poi_all.index)
In [159]:
for i in range(poi_all.index.shape[0]):
poi1 = poi_all.index[i]
for j in range(i+1, poi_all.index.shape[0]):
poi2 = poi_all.index[j]
dc = None
if np.isnan(poi_distmat.loc[poi1, poi2]):
dist = calc_dist(poi_all.loc[poi1, 'poiLon'], poi_all.loc[poi1, 'poiLat'], \
poi_all.loc[poi2, 'poiLon'], poi_all.loc[poi2, 'poiLat'])
dc = np.digitize([dist], bins_dist)[0]
else:
dc = np.digitize([poi_distmat.loc[poi1, poi2]], bins_dist)[0]
assert(dc is not None)
poi_distclass_mat.loc[poi1, poi2] = dc
poi_distclass_mat.loc[poi2, poi1] = dc
In [160]:
poi_distclass_mat
Out[160]:
Use the POI pairs observed in the dataset to compute the transition matrix between the different distance classes.
In [161]:
def calc_poipair_dist_transmat(seqid_set, poi_all, seq_all, poi_distclass_mat, bins_dist):
dist_class = list(range(1, len(bins_dist)))
poipair_dist_transmat = pd.DataFrame(data=np.zeros((len(dist_class), len(dist_class)), dtype=np.float), \
index=dist_class, columns=dist_class)
for seqid in seqid_set:
seq = extract_seq(seqid, seq_all)
if len(seq) < 3: continue
for j in range(1, len(seq)-1):
poi1 = seq[j-1]
poi2 = seq[j]
poi3 = seq[j+1]
dc1 = poi_distclass_mat.loc[poi1, poi2]
dc2 = poi_distclass_mat.loc[poi2, poi3]
poipair_dist_transmat.loc[dc1, dc2] += 1
return poipair_dist_transmat
In [162]:
poipair_dist_transmat = calc_poipair_dist_transmat(seq_all['seqID'].unique(), poi_all, seq_all, \
poi_distclass_mat, bins_dist)
poipair_dist_transmat
Out[162]:
In [163]:
poipair_dist_transmat = normalise_transmat(poipair_dist_transmat)
poipair_dist_transmat
Out[163]:
In [164]:
poipair_dist_transmat_log = np.log10(poipair_dist_transmat)
poipair_dist_transmat_log
Out[164]:
Log likelihood of trajectory $[\text{POI}_1, \text{POI}_2, \dots, \text{POI}_i, ..., \text{POI}_N]$ is defined as \begin{align} \text{logl} =& \sum_{i=1}^{N-1} \log(\text{Pr}(\text{Cat}_{\text{POI}_i} \to \text{Cat}_{\text{POI}_{i+1}})) + \sum_{i=1}^{N-1} \log(\text{Pr}(\text{Pop}_{\text{POI}_i} \to \text{Pop}_{\text{POI}_{i+1}})) + \sum_{i=2}^{N-1} \log(\text{Pr}(\text{Dist}_{\text{POI}_{i-1} \to \text{POI}_i} \to \text{Dist}_{\text{POI}_{i} \to \text{POI}_{i+1}})) \\ & + \log(\text{Pr}(\text{POI}_1)) \end{align}
where $\text{Pr}(\text{POI}_1)$ is the prior probability of $\text{POI}_1$ and we assume $\text{Pr}(\text{POI}_1) = 1.0$; base-10 logarithms are used here.
In [165]:
def calc_seq_loglikelihood(seq, poi_all, poi_cat_transmat_log, poi_pop_transmat_log, \
poi_distclass_mat, poipair_dist_transmat_log):
assert(len(seq) > 1)
cat1 = poi_all.loc[seq[0], 'poiTheme']
cat2 = poi_all.loc[seq[1], 'poiTheme']
pc1 = poi_all.loc[seq[0], 'popClass']
pc2 = poi_all.loc[seq[1], 'popClass']
logL = poi_cat_transmat_log.loc[cat1, cat2] + poi_pop_transmat_log.loc[pc1, pc2]
for j in range(1, len(seq)-1):
poi1 = seq[j-1]
poi2 = seq[j]
poi3 = seq[j+1]
cat2 = poi_all.loc[poi2, 'poiTheme']
cat3 = poi_all.loc[poi3, 'poiTheme']
pc2 = poi_all.loc[poi2, 'popClass']
pc3 = poi_all.loc[poi3, 'popClass']
dc12 = poi_distclass_mat.loc[poi1, poi2]
dc23 = poi_distclass_mat.loc[poi2, poi3]
logL += poi_cat_transmat_log.loc[cat2, cat3] + poi_pop_transmat_log.loc[pc2, pc3]
#print(seq, dc12, dc23)
logL += poipair_dist_transmat_log.loc[dc12, dc23]
return logL
Simple check.
In [166]:
seq1 = [10, 21, 28, 22]
d12 = calc_dist(poi_all.loc[10,'poiLon'], poi_all.loc[10,'poiLat'], poi_all.loc[21,'poiLon'], poi_all.loc[21, 'poiLat'])
d23 = calc_dist(poi_all.loc[21,'poiLon'], poi_all.loc[21,'poiLat'], poi_all.loc[28,'poiLon'], poi_all.loc[28, 'poiLat'])
d34 = calc_dist(poi_all.loc[28,'poiLon'], poi_all.loc[28,'poiLat'], poi_all.loc[22,'poiLon'], poi_all.loc[22, 'poiLat'])
In [167]:
print(d12, d23, d34)
print(bins_dist)
In [168]:
s1 = poi_cat_transmat_log.loc[poi_all.loc[10, 'poiTheme'], poi_all.loc[21, 'poiTheme']] + \
poi_cat_transmat_log.loc[poi_all.loc[21, 'poiTheme'], poi_all.loc[28, 'poiTheme']] + \
poi_cat_transmat_log.loc[poi_all.loc[28, 'poiTheme'], poi_all.loc[22, 'poiTheme']] + \
poi_pop_transmat_log.loc[poi_all.loc[10, 'popClass'], poi_all.loc[21, 'popClass']] + \
poi_pop_transmat_log.loc[poi_all.loc[21, 'popClass'], poi_all.loc[28, 'popClass']] + \
poi_pop_transmat_log.loc[poi_all.loc[28, 'popClass'], poi_all.loc[22, 'popClass']]
s2 = poipair_dist_transmat_log.loc[np.digitize([d12], bins_dist)[0], np.digitize([d23], bins_dist)[0]] + \
poipair_dist_transmat_log.loc[np.digitize([d23], bins_dist)[0], np.digitize([d34], bins_dist)[0]]
print(s1+s2)
In [169]:
calc_seq_loglikelihood([10, 21, 28, 22], poi_all, poi_cat_transmat_log, poi_pop_transmat_log, \
poi_distclass_mat, poipair_dist_transmat_log)
Out[169]:
In [170]:
def parse_seqstr(seqstr):
term = re.sub('[ \[\]]', '', seqstr).split(',')
return [int(x) for x in term]
To save computation, we consider unique travel sequences only (i.e. sequences from different users with the same list of POIs and the same visiting order are treated as one sequence), since no user-specific features are used here.
In [171]:
unique_seq = dict() # seq -> [(seqid, userid)]
In [172]:
for seqid in sorted(seq_all['seqID'].unique().tolist()):
seq = extract_seq(seqid, seq_all)
if str(seq) not in unique_seq:
unique_seq[str(seq)] = [(seqid, seq_user.loc[seqid])]
else:
unique_seq[str(seq)].append((seqid, seq_user.loc[seqid]))
In [173]:
unique_seq345 = [parse_seqstr(x) for x in sorted(unique_seq.keys()) if len(x.split(',')) in {3,4,5}]
In [174]:
unique_seq345_logL = pd.DataFrame(data=np.zeros((len(unique_seq345), 2), dtype=np.float), \
index=[str(x) for x in unique_seq345], columns=['logLikelihood', 'seqLen'])
unique_seq345_logL.index.name = 'actSeq'
In [175]:
for seq in unique_seq345:
assert(len(seq) in {3,4,5})
logL = calc_seq_loglikelihood(seq, poi_all, poi_cat_transmat_log, poi_pop_transmat_log, \
poi_distclass_mat, poipair_dist_transmat_log)
unique_seq345_logL.loc[str(seq), 'logLikelihood'] = logL
unique_seq345_logL.loc[str(seq), 'seqLen'] = len(seq)
#print('Sequence %-20s Log likelihood: %.3f' % (str(seq), logL))
In [176]:
print(unique_seq345_logL.index.shape[0])
unique_seq345_logL.head()
Out[176]:
In [177]:
unique_seq345_logL['seqLen'].hist(bins=10)
Out[177]:
Compute log likelihood of enumerated trajectories for all unique actual sequences of length {3, 4, 5}.
In [178]:
poi_list = poi_all.index.tolist()
#poi_list
Enumerate trajectories with the same (start, end) POIs and the same length (3, 4 or 5) as an actual sequence.
In [179]:
def enum_seq345(seq, poi_list):
assert(len(seq) in {3, 4, 5})
p0 = seq[0]
pN = seq[-1]
# enumerate sequences with length 3
if len(seq) == 3:
return [[p0, p, pN] \
for p in poi_list if p not in {p0, pN}]
# enumerate sequences with length 4
if len(seq) == 4:
return [[p0, p1, p2, pN] \
for p1 in poi_list if p1 not in {p0, pN} \
for p2 in poi_list if p2 not in {p0, p1, pN}]
# enumerate sequences with length 5
if len(seq) == 5:
return [[p0, p1, p2, p3, pN] \
for p1 in poi_list if p1 not in {p0, pN} \
for p2 in poi_list if p2 not in {p0, p1, pN} \
for p3 in poi_list if p3 not in {p0, p1, p2, pN}]
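An equivalent, more compact enumeration (a sketch, not the function used below) fixes the start and end POIs and takes ordered selections of the intermediate POIs; assuming the start and end POIs are distinct, with $M$ POIs this yields $(M-2)$, $(M-2)(M-3)$ and $(M-2)(M-3)(M-4)$ candidates for lengths 3, 4 and 5 respectively.
from itertools import permutations
def enum_seq345_alt(seq, poi_list):
    # candidate sequences with the same start/end POIs and the same length as seq
    p0, pN = seq[0], seq[-1]
    inner = [p for p in poi_list if p not in {p0, pN}]
    return [[p0] + list(mid) + [pN] for mid in permutations(inner, len(seq) - 2)]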
Compute the log likelihood of enumerated trajectories.
In [180]:
enum_logL_df = pd.DataFrame()
In [181]:
for seq in unique_seq345:
enum_seqs = enum_seq345(seq, poi_list)
df = pd.DataFrame(data=sorted([str(x) for x in enum_seqs]), columns=['enumSeq'])
df.set_index('enumSeq', inplace=True)
df['actSeq'] = str(seq)
enum_logL_df = enum_logL_df.append(df)
In [182]:
print(enum_logL_df.shape)
enum_logL_df.head()
Out[182]:
In [183]:
t1 = datetime.now()
logL = Parallel(n_jobs=-2)(delayed(calc_seq_loglikelihood)\
(seq, poi_all, poi_cat_transmat_log, poi_pop_transmat_log, poi_distclass_mat, poipair_dist_transmat_log)\
for seq in [parse_seqstr(x) for x in enum_logL_df.index])
print('%d seconds used' % (datetime.now()-t1).total_seconds()) # 930 seconds
In [184]:
enum_logL_df['enumSeqLogLikelihood'] = logL
In [185]:
#enum_logL_df.head(23)
For each actual sequence $S_a$, compare its log likelihood with the highest log likelihood among the sequences enumerated with respect to $S_a$, and record the log-likelihood rank of $S_a$.
In [186]:
df = pd.DataFrame(data=sorted([str(x) for x in unique_seq345]), columns=['actSeq'])
df.set_index('actSeq', inplace=True)
df['actSeqLogLikelihood'] = unique_seq345_logL.loc[df.index, 'logLikelihood']
df['enumSeq'] = ''
df['enumSeqLogLikelihood'] = 0
df['actSeqRank'] = 0
df['#enumSeq'] = 0
for seqstr in df.index:
sub_df = enum_logL_df[enum_logL_df['actSeq'] == seqstr].copy()
sub_df.reset_index(inplace=True)
sub_df.sort(columns=['enumSeqLogLikelihood'], ascending=False, inplace=True)
df.loc[seqstr, 'enumSeq'] = sub_df.iloc[0]['enumSeq']
df.loc[seqstr, 'enumSeqLogLikelihood'] = sub_df.iloc[0]['enumSeqLogLikelihood']
df.loc[seqstr, 'actSeqRank'] = 1 + np.nonzero(sub_df['enumSeq'] == seqstr)[0][0] # rank of actual sequence
df.loc[seqstr, '#enumSeq'] = sub_df.index.shape[0]
df['actSeqRank(Top%)'] = 100*df['actSeqRank']/df['#enumSeq']
In [187]:
#df
In [188]:
print('mean: %.3f, std: %.3f' % (round(df['actSeqRank(Top%)'].mean(),3), round(df['actSeqRank(Top%)'].std(),3)))
df['actSeqRank(Top%)'].describe()
Out[188]:
In [189]:
ntop = np.nonzero(df['actSeqRank'] <= 5)[0].shape[0]
print('%d/%d = %.2f%%' % (ntop, df.index.shape[0], 100*ntop/df.index.shape[0]))