Computation Steps:
In [104]:
%matplotlib inline
import os
import re
import math
import random
import pickle
import pandas as pd
import numpy as np
import scipy.stats
from datetime import datetime
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
In [105]:
nfeatures = 8 # number of features
EPS = 1e-12 # smoothing constant, to deal with zero probabilities
random.seed(123456789) # fix the random choices when splitting the training/testing set
In [106]:
data_dir = 'data/data-ijcai15'
#fvisit = os.path.join(data_dir, 'userVisits-Osak.csv')
#fcoord = os.path.join(data_dir, 'photoCoords-Osak.csv')
#fvisit = os.path.join(data_dir, 'userVisits-Glas.csv')
#fcoord = os.path.join(data_dir, 'photoCoords-Glas.csv')
#fvisit = os.path.join(data_dir, 'userVisits-Edin.csv')
#fcoord = os.path.join(data_dir, 'photoCoords-Edin.csv')
fvisit = os.path.join(data_dir, 'userVisits-Toro.csv')
fcoord = os.path.join(data_dir, 'photoCoords-Toro.csv')
In [107]:
suffix = fvisit.split('-')[-1].split('.')[0]
fenumseq = os.path.join(data_dir, 'enumSeqData-' + suffix + '.pkl')
In [108]:
visits = pd.read_csv(fvisit, sep=';')
coords = pd.read_csv(fcoord, sep=';')
# merge data frames according to column 'photoID'
assert(visits.shape[0] == coords.shape[0])
traj = pd.merge(visits, coords, on='photoID')
traj.head()
Out[108]:
In [109]:
num_photo = traj['photoID'].unique().shape[0]
num_user = traj['userID'].unique().shape[0]
num_poi = traj['poiID'].unique().shape[0]
num_seq = traj['seqID'].unique().shape[0]
pd.DataFrame({'#photo': num_photo, '#user': num_user, '#poi': num_poi, '#seq': num_seq, \
'#photo/user': num_photo/num_user, '#seq/user': num_seq/num_user}, index=[str(suffix)])
Out[109]:
In [110]:
plt.figure(figsize=[15, 5])
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.scatter(traj['photoLon'], traj['photoLat'], marker='+')
Out[110]:
Compute POI (Longitude, Latitude) as the average coordinates of the assigned photos.
In [111]:
poi_coords = traj[['poiID', 'photoLon', 'photoLat']].groupby('poiID').mean()
poi_coords.reset_index(inplace=True)
poi_coords.rename(columns={'photoLon':'poiLon', 'photoLat':'poiLat'}, inplace=True)
Extract POI category and visiting frequency.
In [112]:
poi_catfreq = traj[['poiID', 'poiTheme', 'poiFreq']].groupby('poiID').first()
poi_catfreq.reset_index(inplace=True)
In [113]:
poi_all = pd.merge(poi_catfreq, poi_coords, on='poiID')
poi_all.set_index('poiID', inplace=True)
#poi_all.to_csv(fpoi, index=True)
In [114]:
seq_all = traj[['userID', 'seqID', 'poiID', 'dateTaken']].copy().groupby(['userID', 'seqID', 'poiID'])\
.agg([np.min, np.max, np.size])
seq_all.columns = seq_all.columns.droplevel()
seq_all.reset_index(inplace=True)
seq_all.rename(columns={'amin':'arrivalTime', 'amax':'departureTime', 'size':'#photo'}, inplace=True)
seq_all['poiDuration(sec)'] = seq_all['departureTime'] - seq_all['arrivalTime']
seq_all.head()
Out[114]:
In [115]:
seq_user = seq_all[['userID', 'seqID', 'poiID']].copy().groupby(['userID', 'seqID']).agg(np.size)
seq_user.reset_index(inplace=True)
seq_user.rename(columns={'size':'seqLen'}, inplace=True)
seq_user.set_index('seqID', inplace=True)
seq_user.head()
Out[115]:
In [116]:
seq_len = seq_all[['seqID', 'poiID']].copy().groupby('seqID').agg(np.size)
seq_len.reset_index(inplace=True)
seq_len.rename(columns={'poiID':'seqLen'}, inplace=True)
seq_len.head()
Out[116]:
In [117]:
seq_stats = seq_all[['seqID', '#photo', 'poiDuration(sec)']].copy().groupby('seqID').agg(np.sum)
seq_stats.reset_index(inplace=True)
#seq_stats.rename(columns={'poiDuration(sec)':'totalPoiDuration(sec)'}, inplace=True)
seq_stats = pd.merge(seq_len, seq_stats, on='seqID')
seq_stats['poiDuration(sec)'] /= 60
seq_stats.rename(columns={'poiDuration(sec)':'totalPoiDuration(min)'}, inplace=True)
seq_stats.set_index('seqID', inplace=True)
seq_stats.head()
Out[117]:
In [118]:
ax = seq_stats['seqLen'].hist(bins=50)
ax.set_xlabel('sequence length')
ax.set_ylim([0.1, 1e4])
ax.set_yscale('log')
In [119]:
ax = seq_stats['#photo'].hist(bins=50)
ax.set_xlabel('#photo for sequence')
ax.set_ylim([0.1, 1e4])
ax.set_yscale('log')
In [120]:
ax = seq_stats['totalPoiDuration(min)'].hist(bins=100)
ax.set_xlabel('totalPoiDuration(min)')
ax.set_ylim([0.1, 1e4])
ax.set_yscale('log')
#ax.set_xscale('log')
Sequences with length {3, 4, 5}
In [121]:
seq_stats = seq_stats[seq_stats['seqLen'].isin({3, 4, 5})]
ax = seq_stats['totalPoiDuration(min)'].hist(bins=50)
ax.set_xlabel('totalPoiDuration(min)')
ax.set_ylim([0.1, 1e4])
ax.set_yscale('log')
Should we do some filtering (e.g. totalPoiDuration = 0 or too large, seqLen = 1, etc.)? An 8-hour split vs. a 6-hour split?
NOTE: All sequences are used in the next section (i.e. compute sequence log likelihood, enumerate sequences) without any filtering.
In [122]:
min_duration = 10 # 10 minutes
max_duration = 960 # 16 hours
In [123]:
seq_stats = seq_stats[seq_stats['totalPoiDuration(min)'] > min_duration]
seq_stats = seq_stats[seq_stats['totalPoiDuration(min)'] < max_duration]
ax = seq_stats['totalPoiDuration(min)'].hist(bins=50)
ax.set_xlabel('totalPoiDuration(min)')
ax.set_ylim([0.1, 1e4])
ax.set_yscale('log')
Group sequences with the same (start, end); some (start, end) pairs are more popular than others.
What should the grouping key be: (start POI category, end POI category) or (start POI, end POI)?
In [124]:
def extract_seq(seqid, seq_all):
    """Return the POIs of sequence `seqid`, ordered by arrival time."""
    seqi = seq_all[seq_all['seqID'] == seqid].copy()
    seqi.sort_values(by='arrivalTime', ascending=True, inplace=True)  # DataFrame.sort() was removed in newer pandas
    return seqi['poiID'].tolist()
In [125]:
seq_dict = dict()
for seqid in seq_stats.index:
assert(seq_stats.loc[seqid, 'seqLen'] >= 2)
seq = extract_seq(seqid, seq_all)
p0 = seq[0]
pN = seq[-1]
key = (p0, pN, len(seq))
if key not in seq_dict:
seq_dict[key] = []
seq_dict[key].append(seq)
In [126]:
tuples = sorted(seq_dict.keys())
group_size = np.array([len(seq_dict[x]) for x in tuples])
maxidx = group_size.argmax()
minidx = group_size.argmin()
print('max: %d sequences with (start, end, seqLen) %s' % (group_size[maxidx], str(tuples[maxidx])))
print('min: %d sequences with (start, end, seqLen) %s' % (group_size[minidx], str(tuples[minidx])))
ax = pd.Series(group_size).hist(bins=20)
ax.set_xlabel('#sequence with same (start, end, seqLen)')
ax.set_ylabel('#(start, end, seqLen) tuples')
Out[126]:
Split the sequences into a training set and a testing set. Which kind of split? Hold out one sequence from each (start, end) group? A minimal sketch of one possibility is given below.
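A minimal sketch of one such split, assuming we hold out one randomly chosen sequence from every (start, end, seqLen) group in seq_dict and keep single-sequence groups entirely for training. This is illustrative only; the rest of this notebook uses all sequences without any such split.
In [ ]:
# Sketch only: leave-one-out split per (start, end, seqLen) group, using the seed fixed above.
train_seqs = []
test_seqs = []
for key in sorted(seq_dict.keys()):
    group = seq_dict[key]
    if len(group) < 2:
        train_seqs.extend(group)  # too few sequences in this group to hold one out
        continue
    heldout = random.randrange(len(group))
    for i, seq in enumerate(group):
        (test_seqs if i == heldout else train_seqs).append(seq)
print(len(train_seqs), len(test_seqs))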
By design, $\text{Pr}(\text{POI}_i \to \text{POI}_j)$ should be bigger if any of $\text{Pr}(\text{Cat}_i \to \text{Cat}_j)$, $\text{Pr}(\text{Pop}_i \to \text{Pop}_j)$ and $\text{Pr}(\text{Dist}_i \to \text{Dist}_j)$ becomes bigger (with the other factors held constant).
So how should these probabilities be combined?
Both addition and multiplication seem able to serve this purpose; what is the difference?
For the addition case,
\begin{equation} \text{Pr}(\text{POI}_i \to \text{POI}_j) = \frac{ \text{Pr}(\text{Cat}_{\text{POI}_i} \to \text{Cat}_{\text{POI}_j}) + \text{Pr}(\text{Pop}_{\text{POI}_i} \to \text{Pop}_{\text{POI}_j}) + \text{Pr}(\text{Dist}_{\text{POI}_{i-1} \to \text{POI}_i} \to \text{Dist}_{\text{POI}_{i} \to \text{POI}_j}) } {Z_i} \end{equation}where $\text{POI}_{i-1}$ is the direct predecessor of $\text{POI}_i$ in a trajectory and $Z_i$ is a normalizing constant.
For the multiplication case,
\begin{equation} \text{Pr}(\text{POI}_i \to \text{POI}_j) = \frac{ \text{Pr}(\text{Cat}_{\text{POI}_i} \to \text{Cat}_{\text{POI}_j}) \times \text{Pr}(\text{Pop}_{\text{POI}_i} \to \text{Pop}_{\text{POI}_j}) \times \text{Pr}(\text{Dist}_{\text{POI}_{i-1} \to \text{POI}_i} \to \text{Dist}_{\text{POI}_{i} \to \text{POI}_j}) } {Z_i} \end{equation}similarly, $\text{POI}_{i-1}$ is the direct predecessor of $\text{POI}_i$ in a trajectory and $Z_i$ is again a normalizing constant.
It is important to note that, by design, $\text{Pr}(\text{POI}_i \to \text{POI}_j)$ should be very small if any of $\text{Pr}(\text{Cat}_i \to \text{Cat}_j)$, $\text{Pr}(\text{Pop}_i \to \text{Pop}_j)$ and $\text{Pr}(\text{Dist}_i \to \text{Dist}_j)$ is very small. In the extreme case, if any of the three probabilities is $0$, then $\text{Pr}(\text{POI}_i \to \text{POI}_j)$ should also be $0$, because the event "transition from POI$_i$ to POI$_j$" is impossible.
From the equations above, it is clear that the addition rule contradicts this fact, while the multiplication rule is consistent with it.
Intuitively, the addition rule can make an unlikely event appear much more likely; in particular, it can turn an impossible event into a possible one.
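A tiny numeric check of this argument, with made-up factor values (before normalising by $Z_i$): when one factor is zero, the multiplicative combination stays zero while the additive one does not.
In [ ]:
# Made-up numbers: one transition whose distance factor says it is impossible.
p_cat, p_pop, p_dist = 0.4, 0.5, 0.0

p_add = (p_cat + p_pop + p_dist) / 3.0   # simple average, i.e. addition rule before normalisation
p_mul = p_cat * p_pop * p_dist           # multiplication rule before normalisation

print('addition rule      :', p_add)     # 0.3 -> an impossible transition looks fairly likely
print('multiplication rule:', p_mul)     # 0.0 -> impossible transitions stay impossible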
In [127]:
def normalise_transmat(transmat):
    """Normalise each row of a transition count matrix into a probability distribution."""
    assert(isinstance(transmat, pd.DataFrame))
    for row in range(transmat.index.shape[0]):
        nonzeroidx = np.nonzero(transmat.iloc[row])[0].tolist()
        if len(nonzeroidx) < transmat.columns.shape[0]:
            # the row contains zeros: smooth the whole row with a row-specific factor
            # (this local EPS shadows the global EPS constant inside the function)
            minv = np.min(transmat.iloc[row, nonzeroidx])
            EPS = 0.1 * minv
            #zeroidx = list(set(range(len(transmat.columns))) - set(nonzeroidx))
            #transmat.iloc[row, zeroidx] = EPS
            transmat.iloc[row] += EPS
        rowsum = np.sum(transmat.iloc[row])
        assert(rowsum > 0)
        transmat.iloc[row] /= rowsum
    return transmat
In [128]:
def calc_poi_cat_transmat(seqid_set, poi_all, seq_all):
poi_cats = poi_all['poiTheme'].unique().tolist()
poi_cats.sort()
poi_cat_transmat = pd.DataFrame(data=np.zeros((len(poi_cats), len(poi_cats)), dtype=np.float), \
index=poi_cats, columns=poi_cats)
for seqid in seqid_set:
seq = extract_seq(seqid, seq_all)
for j in range(len(seq)-1):
poi1 = seq[j]
poi2 = seq[j+1]
cat1 = poi_all.loc[poi1, 'poiTheme']
cat2 = poi_all.loc[poi2, 'poiTheme']
poi_cat_transmat.loc[cat1, cat2] += 1
return poi_cat_transmat
In [129]:
poi_cat_transmat = calc_poi_cat_transmat(seq_all['seqID'].unique(), poi_all, seq_all)
poi_cat_transmat
Out[129]:
In [130]:
poi_cat_transmat = normalise_transmat(poi_cat_transmat)
poi_cat_transmat
Out[130]:
TODO: Improve the discretization using Fayyad's minimum description length principle (MDLP) described in this paper.
NOTE: MDLP is a supervised discretization method that requires some sort of class labels, which are not available here. We need an unsupervised discretization method; well-known ones include equal-width, equal-frequency and clustering-based binning.
What are the general criteria for data discretization? See Discretization of continuous features on Wikipedia.
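As a rough illustration of two unsupervised schemes on the POI popularity values, the sketch below compares equal-width and equal-frequency binning into 3 classes; neither is what is used below, where the bin edges are picked by hand.
In [ ]:
# Sketch: equal-width vs. equal-frequency discretisation of POI popularity into 3 classes.
pop = poi_all['poiFreq'].values

# equal-width: edges evenly spaced between the min and max popularity
width_edges = np.linspace(pop.min(), pop.max(), 4)[1:-1]
width_class = np.digitize(pop, width_edges)

# equal-frequency: edges at the 1/3 and 2/3 quantiles, so each class holds a similar number of POIs
freq_edges = np.percentile(pop, [100/3, 200/3])
freq_class = np.digitize(pop, freq_edges)

print(pd.Series(width_class).value_counts().sort_index())  # class sizes under equal-width binning
print(pd.Series(freq_class).value_counts().sort_index())   # class sizes under equal-frequency binning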
In [131]:
poi_all['poiFreq'].values
Out[131]:
In [132]:
poi_all['poiFreq'].describe()
Out[132]:
In [133]:
ax = poi_all['poiFreq'].hist(bins=10)
ax.set_xlabel('POI Popularity')
ax.set_ylabel('#POI')
Out[133]:
In [134]:
plt.plot(np.ones(poi_all.index.shape[0]), np.sqrt(poi_all['poiFreq']), marker='+')
Out[134]:
In [135]:
#bins = [1, 346, 936, 1874, 5000]
#bins = np.linspace(0, 10000, 11)
#bins = np.logspace(0, 4, 5)
#bins = [1, 100, 500, 1000, 2000, 5000]
bins = [1, 500, 1500, 10000]
In [136]:
ax = poi_all['poiFreq'].hist(bins=bins)
ax.set_xlabel('POI Popularity')
ax.set_ylabel('#POI')
ax.set_xscale('log')
In [137]:
poi_all['popClass'] = np.digitize(poi_all['poiFreq'].values, bins)
poi_all
Out[137]:
In [138]:
def calc_poi_pop_transmat(seqid_set, poi_all, seq_all):
pop_class = poi_all['popClass'].unique().tolist()
pop_class.sort()
poi_pop_transmat = pd.DataFrame(data=np.zeros((len(pop_class), len(pop_class)), dtype=np.float), \
index=pop_class, columns=pop_class)
for seqid in seqid_set:
seq = extract_seq(seqid, seq_all)
for j in range(len(seq)-1):
poi1 = seq[j]
poi2 = seq[j+1]
pc1 = poi_all.loc[poi1, 'popClass']
pc2 = poi_all.loc[poi2, 'popClass']
poi_pop_transmat.loc[pc1, pc2] += 1
return poi_pop_transmat
In [139]:
poi_pop_transmat = calc_poi_pop_transmat(seq_all['seqID'].unique(), poi_all, seq_all)
poi_pop_transmat
Out[139]:
In [140]:
poi_pop_transmat = normalise_transmat(poi_pop_transmat)
poi_pop_transmat
Out[140]:
In [141]:
def calc_poi_transmat(poi_all, seq_all, poi_cat_transmat, poi_pop_transmat):
poi_transmat = pd.DataFrame(data=np.zeros((poi_all.index.shape[0], poi_all.index.shape[0]), dtype=np.float), \
index=poi_all.index, columns=poi_all.index)
for poi1 in poi_all.index:
cat1 = poi_all.loc[poi1, 'poiTheme']
pc1 = poi_all.loc[poi1, 'popClass']
for poi2 in poi_all.index:
cat2 = poi_all.loc[poi2, 'poiTheme']
pc2 = poi_all.loc[poi2, 'popClass']
logP = math.log10(poi_cat_transmat.loc[cat1, cat2]) + math.log10(poi_pop_transmat.loc[pc1, pc2])
poi_transmat.loc[poi1, poi2] = 10 ** (logP)
return poi_transmat
In [142]:
poi_transmat = calc_poi_transmat(poi_all, seq_all, poi_cat_transmat, poi_pop_transmat)
poi_transmat
Out[142]:
In [143]:
poi_transmat.sum(axis=1)
Out[143]:
In [144]:
for poi1 in poi_transmat.index:
rowsum = poi_transmat.loc[poi1].sum()
#print(rowsum)
assert(rowsum > 0.)
#poi_transmat.loc[poi1] /= rowsum # OK
#print(poi_transmat.loc[poi1].sum())
logSum = math.log10(rowsum)
#print(logSum)
for poi2 in poi_transmat.columns:
logP = math.log10(poi_transmat.loc[poi1, poi2]) - logSum
poi_transmat.loc[poi1, poi2] = 10 ** (logP)
In [145]:
poi_transmat
Out[145]:
In [146]:
poi_transmat.sum(axis=1)
Out[146]:
In [147]:
def calc_dist(longitude1, latitude1, longitude2, latitude2):
"""Calculate the distance (unit: km) between two places on earth"""
# convert degrees to radians
lon1 = math.radians(longitude1)
lat1 = math.radians(latitude1)
lon2 = math.radians(longitude2)
lat2 = math.radians(latitude2)
radius = 6371.009 # mean earth radius is 6371.009km, en.wikipedia.org/wiki/Earth_radius#Mean_radius
# The haversine formula, en.wikipedia.org/wiki/Great-circle_distance
dlon = math.fabs(lon1 - lon2)
dlat = math.fabs(lat1 - lat2)
return 2 * radius * math.asin( math.sqrt( \
(math.sin(0.5*dlat))**2 + math.cos(lat1) * math.cos(lat2) * (math.sin(0.5*dlon))**2 ))
Compute the POI-pair distance for pairs observed in the dataset.
In [148]:
def calc_obs_poipair_distmat(seqid_set, poi_all, seq_all):
poi_distmat = pd.DataFrame(data=np.full((poi_all.shape[0], poi_all.shape[0]), np.nan, dtype=np.float), \
index=poi_all.index, columns=poi_all.index)
for seqid in seqid_set:
seq = extract_seq(seqid, seq_all)
if len(seq) < 2: continue
for j in range(len(seq)-1):
poi1 = seq[j]
poi2 = seq[j+1]
if np.isnan(poi_distmat.loc[poi1, poi2]):
dist = calc_dist(poi_all.loc[poi1, 'poiLon'], poi_all.loc[poi1, 'poiLat'], \
poi_all.loc[poi2, 'poiLon'], poi_all.loc[poi2, 'poiLat'])
poi_distmat.loc[poi1, poi2] = dist
poi_distmat.loc[poi2, poi1] = dist
return poi_distmat
In [149]:
poi_distmat = calc_obs_poipair_distmat(seq_all['seqID'].unique(), poi_all, seq_all)
poi_distmat
Out[149]:
In [150]:
#distdata = pd.Series([x for x in np.unique(poi_distmat.get_values().flatten()) if not np.isnan(x)])
distdata = pd.Series([poi_distmat.iloc[x, y] \
for x in range(poi_distmat.index.shape[0]) \
for y in range(x+1, poi_distmat.index.shape[0]) \
if not np.isnan(poi_distmat.iloc[x, y])])
In [151]:
distdata.describe()
Out[151]:
In [152]:
ax = distdata.hist(bins=10)
ax.set_xlabel('POI-Pair Distance (km)')
ax.set_ylabel('#POI-Pair')
Out[152]:
In [153]:
#bins = np.linspace(0, 10, 7)
#bins = np.logspace(0, 2, 4)
#bins = [0, 1, 2, 3, 10]
dist_bins = [0, 2, 5, 100] # walk, ride, drive
In [154]:
ax = distdata[distdata > 0].hist(bins=dist_bins) # keep nonzero distances (.ix was removed in newer pandas)
ax.set_xlabel('POI-Pair Distance (km)')
ax.set_ylabel('#POI-Pair')
ax.set_xscale('log')
In [155]:
poi_distclass_mat = pd.DataFrame(data=np.full((poi_all.shape[0], poi_all.shape[0]), np.nan, dtype=np.float), \
index=poi_all.index, columns=poi_all.index)
In [156]:
for i in range(poi_all.index.shape[0]):
poi1 = poi_all.index[i]
for j in range(i+1, poi_all.index.shape[0]):
poi2 = poi_all.index[j]
if np.isnan(poi_distmat.loc[poi1, poi2]): continue
dc = np.digitize([poi_distmat.loc[poi1, poi2]], dist_bins)[0]
poi_distclass_mat.loc[poi1, poi2] = dc
poi_distclass_mat.loc[poi2, poi1] = dc
In [157]:
poi_distclass_mat
Out[157]:
In [158]:
def calc_poipair_dist_transmat(seqid_set, poi_all, seq_all, poi_distclass_mat):
dist_class = [poi_distclass_mat.iloc[x, y] \
for x in range(poi_distclass_mat.index.shape[0]) \
for y in range(x+1, poi_distclass_mat.index.shape[0]) \
if not np.isnan(poi_distclass_mat.iloc[x, y])]
dist_class = np.unique(np.array(dist_class))
poipair_dist_transmat = pd.DataFrame(data=np.zeros((len(dist_class), len(dist_class)), dtype=np.float), \
index=dist_class, columns=dist_class)
for seqid in seqid_set:
seq = extract_seq(seqid, seq_all)
if len(seq) < 3: continue
for j in range(1, len(seq)-1):
poi1 = seq[j-1]
poi2 = seq[j]
poi3 = seq[j+1]
dc1 = int(poi_distclass_mat.loc[poi1, poi2])
dc2 = int(poi_distclass_mat.loc[poi2, poi3])
assert(not np.isnan(dc1))
assert(not np.isnan(dc2))
poipair_dist_transmat.loc[dc1, dc2] += 1
return poipair_dist_transmat
In [159]:
poipair_dist_transmat = calc_poipair_dist_transmat(seq_all['seqID'].unique(), poi_all, seq_all, poi_distclass_mat)
poipair_dist_transmat
Out[159]:
In [160]:
poipair_dist_transmat = normalise_transmat(poipair_dist_transmat)
poipair_dist_transmat
Out[160]:
In [161]:
def calc_seq_loglikelihood(seq, poi_all, poi_transmat, poi_distclass_mat, poipair_dist_transmat, dist_bins):
assert(len(seq) > 1)
logL = math.log10(poi_transmat.loc[seq[0], seq[1]])
for j in range(1, len(seq)-1):
poi1 = seq[j-1]
poi2 = seq[j]
poi3 = seq[j+1]
dc12 = poi_distclass_mat.loc[poi1, poi2]
dc23 = poi_distclass_mat.loc[poi2, poi3]
if np.isnan(dc12):
dist12 = calc_dist(poi_all.loc[poi1, 'poiLon'], poi_all.loc[poi1, 'poiLat'], \
poi_all.loc[poi2, 'poiLon'], poi_all.loc[poi2, 'poiLat'])
dc12 = np.digitize([dist12], dist_bins)[0]
if np.isnan(dc23):
dist23 = calc_dist(poi_all.loc[poi2, 'poiLon'], poi_all.loc[poi2, 'poiLat'], \
poi_all.loc[poi3, 'poiLon'], poi_all.loc[poi3, 'poiLat'])
dc23 = np.digitize([dist23], dist_bins)[0]
logL += math.log10(poi_transmat.loc[poi2, poi3])
logL += math.log10(poipair_dist_transmat.loc[dc12, dc23])
return logL
Simple check.
In [162]:
seq1 = [10, 21, 28, 22]
d12 = calc_dist(poi_all.loc[10,'poiLon'], poi_all.loc[10,'poiLat'], poi_all.loc[21,'poiLon'], poi_all.loc[21, 'poiLat'])
d23 = calc_dist(poi_all.loc[21,'poiLon'], poi_all.loc[21,'poiLat'], poi_all.loc[28,'poiLon'], poi_all.loc[28, 'poiLat'])
d34 = calc_dist(poi_all.loc[28,'poiLon'], poi_all.loc[28,'poiLat'], poi_all.loc[22,'poiLon'], poi_all.loc[22, 'poiLat'])
In [163]:
print(d12, d23, d34, dist_bins)
In [164]:
s1 = math.log10(poi_transmat.loc[10, 21]) + math.log10(poi_transmat.loc[21, 28]) + math.log10(poi_transmat.loc[28, 22])
s2 = math.log10(poipair_dist_transmat.loc[1, 1]) + math.log10(poipair_dist_transmat.loc[1, 1])
print(s1+s2)
In [165]:
calc_seq_loglikelihood([10, 21, 28, 22], poi_all, poi_transmat, poi_distclass_mat, poipair_dist_transmat, dist_bins)
Out[165]:
In [166]:
seqid_set = sorted(seq_all['seqID'].unique().tolist())
act_seq_logL = pd.DataFrame(data=np.zeros((len(seqid_set), 3), dtype=np.object), \
index=seqid_set, columns=['actSeq', 'logLikelihood', 'seqLen'])
act_seq_logL.index.name = 'seqID'
In [167]:
for seqid in seqid_set:
seq = extract_seq(seqid, seq_all)
logL = 0
if len(seq) < 2:
logL = math.log10(1)
else:
logL = calc_seq_loglikelihood(seq, poi_all, poi_transmat, poi_distclass_mat, poipair_dist_transmat, dist_bins)
act_seq_logL.loc[seqid, 'actSeq'] = str(seq)
act_seq_logL.loc[seqid, 'seqLen'] = len(seq)
act_seq_logL.loc[seqid, 'logLikelihood'] = logL
#print('Sequence %-20s Log likelihood: %.3f' % (str(seq), logL))
In [168]:
act_seq_logL.head()
Out[168]:
In [169]:
seq_345_logL = act_seq_logL[act_seq_logL['seqLen'].isin({3, 4, 5})].copy()
seq_345_logL.sort_values(by='actSeq', ascending=True, inplace=True)
In [170]:
seq_345_logL
Out[170]:
Construct distinct trajectories of length {3, 4, 5}.
In [171]:
seq_345 = dict()
In [172]:
for seqid in sorted(seq_all['seqID'].unique().tolist()):
seq = extract_seq(seqid, seq_all)
if len(seq) not in {3, 4, 5}: continue
if str(seq) not in seq_345:
seq_345[str(seq)] = [(seqid, seq_user.loc[seqid])]
else:
seq_345[str(seq)].append((seqid, seq_user.loc[seqid]))
In [173]:
distinct_seq_345 = sorted([seq_345[x][0][0] for x in sorted(seq_345.keys())])
In [174]:
print(len(distinct_seq_345))
#distinct_seq_345
In [175]:
poi_list = poi_all.index.tolist()
#poi_list
Enumerate trajectories with the same (start, end) and length (3, 4 or 5) as an actual sequence.
In [176]:
def enum_345_seq(seqid, poi_list, seq_all):
seq = extract_seq(seqid, seq_all)
assert(len(seq) in {3, 4, 5})
p0 = seq[0]
pN = seq[-1]
# enumerate sequences with length 3
if len(seq) == 3:
return [[p0, p, pN] \
for p in poi_list if p not in {p0, pN}]
# enumerate sequences with length 4
if len(seq) == 4:
return [[p0, p1, p2, pN] \
for p1 in poi_list if p1 not in {p0, pN} \
for p2 in poi_list if p2 not in {p0, p1, pN}]
# enumerate sequences with length 5
if len(seq) == 5:
return [[p0, p1, p2, p3, pN] \
for p1 in poi_list if p1 not in {p0, pN} \
for p2 in poi_list if p2 not in {p0, p1, pN} \
for p3 in poi_list if p3 not in {p0, p1, p2, pN}]
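The same enumeration can also be written once for any length using itertools.permutations over the interior POIs; a sketch that should yield the same candidate lists as enum_345_seq for lengths 3 to 5:
In [ ]:
from itertools import permutations

def enum_seq(seqid, poi_list, seq_all):
    """Enumerate all sequences with the same (start, end, length) as the actual sequence."""
    seq = extract_seq(seqid, seq_all)
    p0, pN = seq[0], seq[-1]
    interior = [p for p in poi_list if p not in {p0, pN}]
    # choose len(seq)-2 distinct interior POIs in every possible order
    return [[p0] + list(mid) + [pN] for mid in permutations(interior, len(seq) - 2)]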
Compute the log likelihood of the enumerated trajectories (or load it from the .pkl file if available).
In [177]:
enum_logL_dict = dict() # key: actual sequence, value: tuple(loglikelihood of actual sequence, enum sequences dataframe)
In [178]:
doEnum = True
In [179]:
if os.path.exists(fenumseq):
dict1 = pickle.load(open(fenumseq, 'rb'))
if (np.array(sorted(dict1.keys())) == np.array(sorted(seq_345.keys()))).all():
doEnum = False
enum_logL_dict = dict1
In [180]:
doEnum
Out[180]:
In [181]:
if doEnum:
for seqid in distinct_seq_345:
enum_seqs = enum_345_seq(seqid, poi_list, seq_all)
df = pd.DataFrame(data=np.array([str(x) for x in enum_seqs]))
df.columns = ['enumSeq']
df.set_index('enumSeq', inplace=True)
df['logLikelihood'] = 0
for seq in enum_seqs:
#print(seq)
logL = calc_seq_loglikelihood(seq, poi_all, poi_transmat, poi_distclass_mat, poipair_dist_transmat, dist_bins)
df.loc[str(seq), 'logLikelihood'] = logL
key = seq_345_logL.loc[seqid, 'actSeq']
        df.sort_values(by='logLikelihood', ascending=False, inplace=True) # sort by the log likelihood of the enumerated sequences
enum_logL_dict[key] = (seq_345_logL.loc[seqid, 'logLikelihood'], df)
pickle.dump(enum_logL_dict, open(fenumseq, 'wb'))
In [182]:
seqidx = 0
seqid = distinct_seq_345[seqidx]
act_seq = seq_345_logL.loc[seqid, 'actSeq']
print('Actual Sequence:', act_seq)
print('Loglikelihood:', enum_logL_dict[act_seq][0])
enum_logL_dict[act_seq][1].head()
Out[182]:
In [183]:
rank = 1 + np.nonzero(enum_logL_dict[act_seq][1].index == act_seq)[0][0]
print('rank of actual sequence: %d/%d' % (rank, enum_logL_dict[act_seq][1].index.shape[0]))
In [184]:
actseqs = sorted(enum_logL_dict.keys())
actseq_logL = [enum_logL_dict[x][0] for x in actseqs]
In [185]:
df = pd.DataFrame(actseqs)
df.columns = ['actSeq']
df['actSeqLogLikelihood'] = [enum_logL_dict[x][0] for x in actseqs]
df['enumSeq'] = [enum_logL_dict[x][1].index[0] for x in actseqs] # the enum sequence with highest log likelihood
df['enumSeqLogLikelihood'] = [enum_logL_dict[x][1].iloc[0]['logLikelihood'] for x in actseqs]
# rank of actual sequence
df['actSeqRank'] = [1+np.nonzero(enum_logL_dict[x][1].index == x)[0][0] for x in actseqs]
df['#enumSeq'] = [enum_logL_dict[x][1].index.shape[0] for x in actseqs]
df['actSeqRank(Top%)'] = 100*df['actSeqRank']/df['#enumSeq']
df
Out[185]:
In [186]:
enumseq_logL = [enum_logL_dict[x][1]['logLikelihood'] for x in actseqs]
In [187]:
ncol = 10
nrow = int(len(actseqs) / ncol)
if len(actseqs) % ncol > 0: nrow += 1
plt.figure(figsize=[20, nrow*5])
for k in range(nrow):
plt.subplot(nrow, 1, k+1)
plt.ylim([-16, 0])
if (k*ncol + ncol) > len(actseqs):
N = len(actseqs[k*ncol:])
xidx = range(1, N+1)
plt.boxplot(enumseq_logL[k*ncol:], labels=actseqs[k*ncol:])
plt.plot([], 'gD', xdata=xidx, ydata=actseq_logL[k*ncol:], label='actual trajectory')
else:
xidx = range(1, ncol+1)
plt.boxplot(enumseq_logL[k*ncol:(k+1)*ncol], labels=actseqs[k*ncol:(k+1)*ncol])
plt.plot([], 'gD', xdata=xidx, ydata=actseq_logL[k*ncol:(k+1)*ncol], label='actual trajectory')
#plt.xlabel('Actual Trajectory')
plt.ylabel('Log Likelihood')
plt.legend(numpoints=1)
In [188]:
def calc_F1score(seq_act, seq_rec):
assert(len(seq_act) > 0)
assert(len(seq_rec) > 0)
actset = set(seq_act)
recset = set(seq_rec)
intersect = actset & recset
recall = len(intersect) / len(seq_act)
precision = len(intersect) / len(seq_rec)
F1score = 2. * precision * recall / (precision + recall)
return F1score
In [189]:
def parse_seqstr(seqstr):
term = re.sub('[ \[\]]', '', seqstr).split(',')
return [int(x) for x in term]
In [190]:
parse_seqstr(str([3, 11, 27]))
Out[190]:
In [191]:
calc_F1score(parse_seqstr(str([3, 11, 27])), parse_seqstr(str([3, 5, 27])))
Out[191]:
In [192]:
calc_F1score(parse_seqstr(str([11, 29, 6, 23, 22])), parse_seqstr(str([11, 29, 6, 23, 22])))
Out[192]:
In [193]:
actseqs = sorted(enum_logL_dict.keys())
In [194]:
df1 = pd.DataFrame(actseqs)
df1.columns = ['actSeq']
df1['enumSeq#1'] = [enum_logL_dict[x][1].index[0] for x in actseqs] # the enum sequence with highest log likelihood
df1['F1-score#1'] = [calc_F1score(parse_seqstr(df1.loc[idx, 'actSeq']), parse_seqstr(df1.loc[idx, 'enumSeq#1'])) \
for idx in df1.index]
df1['enumSeq#2'] = [enum_logL_dict[x][1].index[1] for x in actseqs] # the enum sequence with second highest loglikelihood
df1['F1-score#2'] = [calc_F1score(parse_seqstr(df1.loc[idx, 'actSeq']), parse_seqstr(df1.loc[idx, 'enumSeq#2'])) \
for idx in df1.index]
df1['enumSeq#3'] = [enum_logL_dict[x][1].index[2] for x in actseqs] # the enum sequence with third highest loglikelihood
df1['F1-score#3'] = [calc_F1score(parse_seqstr(df1.loc[idx, 'actSeq']), parse_seqstr(df1.loc[idx, 'enumSeq#3'])) \
for idx in df1.index]
In [195]:
df1.head()
Out[195]:
Boxplot of F1-scores for the top-K (or top percent) enumerated trajectories by log likelihood.
In [196]:
topk = 10
In [197]:
enumseq_F1 = [[calc_F1score(parse_seqstr(x), parse_seqstr(enum_logL_dict[x][1].index[y])) \
for y in range(enum_logL_dict[x][1].index.shape[0]) if y < topk] \
# for y in range(enum_logL_dict[x][1].index.shape[0]) if y < enum_logL_dict[x][1].index.shape[0]*topk/100] \
for x in actseqs]
In [198]:
seq0 = actseqs[0]
print(seq0)
print(enum_logL_dict[seq0][1].index.tolist())
print(enumseq_F1[0])
print(len(enumseq_F1))
In [199]:
ncol = 10
nrow = int(len(actseqs) / ncol)
if len(actseqs) % ncol > 0: nrow += 1
plt.figure(figsize=[20, nrow*5])
for k in range(nrow):
plt.subplot(nrow, 1, k+1)
plt.ylim([0.1, 1.1])
if (k*ncol + ncol) > len(actseqs):
N = len(actseqs[k*ncol:])
xidx = range(1, N+1)
plt.boxplot(enumseq_F1[k*ncol:], labels=actseqs[k*ncol:])
else:
xidx = range(1, ncol+1)
plt.boxplot(enumseq_F1[k*ncol:(k+1)*ncol], labels=actseqs[k*ncol:(k+1)*ncol])
#plt.xlabel('Actual Trajectory')
plt.ylabel('F1 score')
Scatter plot of F1 score vs. log likelihood for enumerated trajectories.
In [200]:
print('topk =', topk)
enumseq_logL_list = [enum_logL_dict[x][1].iloc[i]['logLikelihood'] \
for x in actseqs \
for i in range(enum_logL_dict[x][1].shape[0]) if i < topk]
In [201]:
enumseq_F1_list = [calc_F1score(parse_seqstr(x), parse_seqstr(enum_logL_dict[x][1].index[y])) \
for x in actseqs \
for y in range(enum_logL_dict[x][1].index.shape[0]) if y < topk]
In [202]:
plt.figure(figsize=[10, 10])
plt.xlabel('Log Likelihood')
plt.ylabel('F1 score')
plt.scatter(enumseq_logL_list, enumseq_F1_list, marker='+')
Out[202]:
Log Likelihood Rank vs. F1 score Rank.
In [203]:
enumseq_logL_rank = [list(range(1, 1+enum_logL_dict[x][1].index.shape[0])) for x in actseqs]
# already sorted by loglikelihood
In [204]:
enumseq_F1_rank = []
for x in actseqs:
F1score = [calc_F1score(parse_seqstr(x), parse_seqstr(enum_logL_dict[x][1].index[y])) \
for y in range(enum_logL_dict[x][1].index.shape[0])]
idx = np.argsort(F1score)
rank = np.zeros(enum_logL_dict[x][1].index.shape[0], dtype=np.int)
for i in range(len(idx)):
rank[idx[i]] = i+1
enumseq_F1_rank.append(rank)
In [205]:
X = [y for x in enumseq_logL_rank for y in x]
Y = [y for x in enumseq_F1_rank for y in x]
plt.figure(figsize=[10, 10])
plt.xlabel('Log Likelihood Rank')
plt.ylabel('F1 score Rank')
plt.xlim([0, max(X)+1])
plt.ylim([0, max(Y)+1])
assert(max(X) == max(Y))
plt.scatter(X, Y, marker='+')
plt.plot([0, max(X)+1], [0, max(Y)+1], linestyle='--', color='g')
Out[205]:
Pearson Correlation Coefficient:
In [206]:
scipy.stats.pearsonr(X, Y)
Out[206]: