Trajectory Recommendation using a Markov Chain

1. Features

  1. POI category (a transition matrix between different categories)
  2. POI popularity (a transition matrix between different popularity classes)
  3. POI pair distance (a transition matrix between different distance classes)

Computation Steps:

  1. First compute the above features using a set of travelling sequences,
  2. Next compute a POI-to-POI transition matrix by combining the first two features,
  3. Then compute the log likelihood of (actual/enumerated) sequences using the POI-to-POI transition matrix and the POI pair distance transition matrix, and compare them.

2. Load Data


In [104]:
%matplotlib inline

import os
import re
import math
import random
import pickle
import pandas as pd
import numpy as np
import scipy.stats
from datetime import datetime
from joblib import Parallel, delayed
import matplotlib.pyplot as plt

In [105]:
nfeatures = 8 # number of features
EPS = 1e-12 # smoothing constant to avoid zero probabilities
random.seed(123456789) # control random choices when splitting training/testing set

In [106]:
data_dir = 'data/data-ijcai15'
#fvisit = os.path.join(data_dir, 'userVisits-Osak.csv')
#fcoord = os.path.join(data_dir, 'photoCoords-Osak.csv')
#fvisit = os.path.join(data_dir, 'userVisits-Glas.csv')
#fcoord = os.path.join(data_dir, 'photoCoords-Glas.csv')
#fvisit = os.path.join(data_dir, 'userVisits-Edin.csv')
#fcoord = os.path.join(data_dir, 'photoCoords-Edin.csv')
fvisit = os.path.join(data_dir, 'userVisits-Toro.csv')
fcoord = os.path.join(data_dir, 'photoCoords-Toro.csv')

In [107]:
suffix = fvisit.split('-')[-1].split('.')[0]
fenumseq = os.path.join(data_dir, 'enumSeqData-' + suffix + '.pkl')

In [108]:
visits = pd.read_csv(fvisit, sep=';')
coords = pd.read_csv(fcoord, sep=';')
# merge data frames according to column 'photoID'
assert(visits.shape[0] == coords.shape[0])
traj = pd.merge(visits, coords, on='photoID')
traj.head()


Out[108]:
photoID userID dateTaken poiID poiTheme poiFreq seqID photoLon photoLat
0 7941504100 10007579@N00 1346844688 30 Structure 1538 1 -79.380844 43.645641
1 4886005532 10012675@N05 1142731848 6 Cultural 986 2 -79.391525 43.654335
2 4886006468 10012675@N05 1142732248 6 Cultural 986 2 -79.391525 43.654335
3 4885404441 10012675@N05 1142732373 6 Cultural 986 2 -79.391525 43.654335
4 4886008334 10012675@N05 1142732445 6 Cultural 986 2 -79.391525 43.654335

In [109]:
num_photo = traj['photoID'].unique().shape[0]
num_user = traj['userID'].unique().shape[0]
num_poi = traj['poiID'].unique().shape[0]
num_seq = traj['seqID'].unique().shape[0]
pd.DataFrame({'#photo': num_photo, '#user': num_user, '#poi': num_poi, '#seq': num_seq, \
              '#photo/user': num_photo/num_user, '#seq/user': num_seq/num_user}, index=[str(suffix)])


Out[109]:
#photo #photo/user #poi #seq #seq/user #user
Toro 39419 28.257348 29 6057 4.341935 1395

In [110]:
plt.figure(figsize=[15, 5])
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.scatter(traj['photoLon'], traj['photoLat'], marker='+')


Out[110]:
<matplotlib.collections.PathCollection at 0x7f5e0097ebe0>

2.1 Compute POI Info

Compute POI (Longitude, Latitude) as the average coordinates of the assigned photos.


In [111]:
poi_coords = traj[['poiID', 'photoLon', 'photoLat']].groupby('poiID').mean()
poi_coords.reset_index(inplace=True)
poi_coords.rename(columns={'photoLon':'poiLon', 'photoLat':'poiLat'}, inplace=True)

Extract POI category and visiting frequency.


In [112]:
poi_catfreq = traj[['poiID', 'poiTheme', 'poiFreq']].groupby('poiID').first()
poi_catfreq.reset_index(inplace=True)

In [113]:
poi_all = pd.merge(poi_catfreq, poi_coords, on='poiID')
poi_all.set_index('poiID', inplace=True)
#poi_all.to_csv(fpoi, index=True)

2.2 Construct Travelling Sequences


In [114]:
seq_all = traj[['userID', 'seqID', 'poiID', 'dateTaken']].copy().groupby(['userID', 'seqID', 'poiID'])\
          .agg([np.min, np.max, np.size])
seq_all.columns = seq_all.columns.droplevel()
seq_all.reset_index(inplace=True)
seq_all.rename(columns={'amin':'arrivalTime', 'amax':'departureTime', 'size':'#photo'}, inplace=True)
seq_all['poiDuration(sec)'] = seq_all['departureTime'] - seq_all['arrivalTime']
seq_all.head()


Out[114]:
userID seqID poiID arrivalTime departureTime #photo poiDuration(sec)
0 10007579@N00 1 30 1346844688 1346844688 1 0
1 10012675@N05 2 6 1142731848 1142732445 4 597
2 10012675@N05 3 6 1142916492 1142916492 1 0
3 10012675@N05 4 13 1319327174 1319332848 9 5674
4 10014440@N06 5 24 1196128621 1196128878 3 257

In [115]:
seq_user = seq_all[['userID', 'seqID', 'poiID']].copy().groupby(['userID', 'seqID']).agg(np.size)
seq_user.reset_index(inplace=True)
seq_user.rename(columns={'size':'seqLen'}, inplace=True)
seq_user.set_index('seqID', inplace=True)
seq_user.head()


Out[115]:
userID poiID
seqID
1 10007579@N00 1
2 10012675@N05 1
3 10012675@N05 1
4 10012675@N05 1
5 10014440@N06 1

2.3 Compute Some Sequence Statistics


In [116]:
seq_len = seq_all[['seqID', 'poiID']].copy().groupby('seqID').agg(np.size)
seq_len.reset_index(inplace=True)
seq_len.rename(columns={'poiID':'seqLen'}, inplace=True)
seq_len.head()


Out[116]:
seqID seqLen
0 1 1
1 2 1
2 3 1
3 4 1
4 5 1

In [117]:
seq_stats = seq_all[['seqID', '#photo', 'poiDuration(sec)']].copy().groupby('seqID').agg(np.sum)
seq_stats.reset_index(inplace=True)
#seq_stats.rename(columns={'poiDuration(sec)':'totalPoiDuration(sec)'}, inplace=True)
seq_stats = pd.merge(seq_len, seq_stats, on='seqID')
seq_stats['poiDuration(sec)'] /= 60
seq_stats.rename(columns={'poiDuration(sec)':'totalPoiDuration(min)'}, inplace=True)
seq_stats.set_index('seqID', inplace=True)
seq_stats.head()


Out[117]:
seqLen #photo totalPoiDuration(min)
seqID
1 1 1 0.000000
2 1 4 9.950000
3 1 1 0.000000
4 1 9 94.566667
5 1 3 4.283333

In [118]:
ax = seq_stats['seqLen'].hist(bins=50)
ax.set_xlabel('sequence length')
ax.set_ylim([0.1, 1e4])
ax.set_yscale('log')



In [119]:
ax = seq_stats['#photo'].hist(bins=50)
ax.set_xlabel('#photo for sequence')
ax.set_ylim([0.1, 1e4])
ax.set_yscale('log')



In [120]:
ax = seq_stats['totalPoiDuration(min)'].hist(bins=100)
ax.set_xlabel('totalPoiDuration(min)')
ax.set_ylim([0.1, 1e4])
ax.set_yscale('log')
#ax.set_xscale('log')


Sequences with length {3, 4, 5}


In [121]:
seq_stats = seq_stats[seq_stats['seqLen'].isin({3, 4, 5})]
ax = seq_stats['totalPoiDuration(min)'].hist(bins=50)
ax.set_xlabel('totalPoiDuration(min)')
ax.set_ylim([0.1, 1e4])
ax.set_yscale('log')


Do some filtering (e.g. totalPoiDuration = 0 or too large, seqLen = 1, etc.)? An 8-hour split vs. a 6-hour split?

NOTE: All sequences are used in the next section (i.e. compute sequence log likelihood, enumerate sequences) without any filtering.


In [122]:
min_duration = 10  # 10 minutes
max_duration = 960 # 16 hours

In [123]:
seq_stats = seq_stats[seq_stats['totalPoiDuration(min)'] > min_duration]
seq_stats = seq_stats[seq_stats['totalPoiDuration(min)'] < max_duration]
ax = seq_stats['totalPoiDuration(min)'].hist(bins=50)
ax.set_xlabel('totalPoiDuration(min)')
ax.set_ylim([0.1, 1e4])
ax.set_yscale('log')


Sequences with the same (start, end): some (start, end) pairs are more popular than others.

What do we need: (start POI category, end POI category) or (start POI, end POI)?


In [124]:
def extract_seq(seqid, seq_all):
    seqi = seq_all[seq_all['seqID'] == seqid].copy()
    seqi.sort(columns=['arrivalTime'], ascending=True, inplace=True)
    return seqi['poiID'].tolist()

In [125]:
seq_dict = dict()
for seqid in seq_stats.index:
    assert(seq_stats.loc[seqid, 'seqLen'] >= 2)
    seq = extract_seq(seqid, seq_all)
    p0 = seq[0]
    pN = seq[-1]
    key = (p0, pN, len(seq))
    if key not in seq_dict:
        seq_dict[key] = []
    seq_dict[key].append(seq)

In [126]:
tuples = sorted(seq_dict.keys())
group_size = np.array([len(seq_dict[x]) for x in tuples])
maxidx = group_size.argmax()
minidx = group_size.argmin()
print('max: %d sequences with (start, end, seqLen) %s' % (group_size[maxidx], str(tuples[maxidx])))
print('min: %d sequences with (start, end, seqLen) %s' % (group_size[minidx], str(tuples[minidx])))
ax = pd.Series(group_size).hist(bins=20)
ax.set_xlabel('#sequence with same (start, end, seqLen)')
ax.set_ylabel('#(start, end, seqLen) tuples')


max: 5 sequences with (start, end, seqLen) (23, 21, 3)
min: 1 sequences with (start, end, seqLen) (3, 22, 3)
Out[126]:
<matplotlib.text.Text at 0x7f5e117ba5c0>

Split sequences into a training set and a testing set. Which kind of split? Hold out one sequence in each type of (start, end)?
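
One possible option, as a minimal sketch (assuming the seq_dict groups built above; the rest of this notebook uses all sequences without a split): hold out one randomly chosen sequence per (start, end, seqLen) group.

# Sketch only (not used below): leave-one-out split per (start, end, seqLen) group;
# groups containing a single sequence go entirely to the training set.
train_set, test_set = [], []
for key in sorted(seq_dict.keys()):
    seqs = seq_dict[key]
    if len(seqs) < 2:
        train_set.extend(seqs)
        continue
    i = random.randrange(len(seqs))
    test_set.append(seqs[i])
    train_set.extend(seqs[:i] + seqs[i+1:])
print(len(train_set), len(test_set))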

3. Compute Transition Probabilities

3.1 Basic Definitions

  • $\text{Pr}(\text{POI}_i \to \text{POI}_j)$: the transition probability from $\text{POI}_i$ to $\text{POI}_j$
  • $\text{Pr}(\text{Cat}_i \to \text{Cat}_j)$: the transition probability from a POI of category $\text{Cat}_i$ to a POI of category $\text{Cat}_j$
  • $\text{Pr}(\text{Pop}_i \to \text{Pop}_j)$: the transition probability from a POI of Popularity class $\text{Pop}_i$ to a POI of Popularity class $\text{Pop}_j$
  • $\text{Pr}(\text{Dist}_i \to \text{Dist}_j)$: the transition probability from a POI-POI pair with distance (between the two) class $\text{Dist}_i$ to a POI-POI pair with distance (between the two) class $\text{Dist}_j$

By design, $\text{Pr}(\text{POI}_i \to \text{POI}_j)$ should be bigger if any of $\text{Pr}(\text{Cat}_i \to \text{Cat}_j)$, $\text{Pr}(\text{Pop}_i \to \text{Pop}_j)$ and $\text{Pr}(\text{Dist}_i \to \text{Dist}_j)$ becomes bigger (if other factors stay the same).

So how should these probabilities be combined?
Both addition and multiplication seem able to serve this purpose; what is the difference?

The Addition Case

For the addition case,

\begin{equation} \text{Pr}(\text{POI}_i \to \text{POI}_j) = \frac{ \text{Pr}(\text{Cat}_{\text{POI}_i} \to \text{Cat}_{\text{POI}_j}) + \text{Pr}(\text{Pop}_{\text{POI}_i} \to \text{Pop}_{\text{POI}_j}) + \text{Pr}(\text{Dist}_{\text{POI}_{i-1} \to \text{POI}_i} \to \text{Dist}_{\text{POI}_{i} \to \text{POI}_j}) } {Z_i} \end{equation}

where $\text{POI}_{i-1}$ is the direct predecessor of $\text{POI}_i$ in a trajectory and $Z_i$ is a normalizing constant.

The Multiplication Case

For the multiplication case,

\begin{equation} \text{Pr}(\text{POI}_i \to \text{POI}_j) = \frac{ \text{Pr}(\text{Cat}_{\text{POI}_i} \to \text{Cat}_{\text{POI}_j}) \times \text{Pr}(\text{Pop}_{\text{POI}_i} \to \text{Pop}_{\text{POI}_j}) \times \text{Pr}(\text{Dist}_{\text{POI}_{i-1} \to \text{POI}_i} \to \text{Dist}_{\text{POI}_{i} \to \text{POI}_j}) } {Z_i} \end{equation}

similarly, $\text{POI}_{i-1}$ is the direct predecessor of $\text{POI}_i$ in a trajectory and $Z_i$ is again a normalizing constant.

The Difference between Addition and Multiplication

It is important to note that, by design, $\text{Pr}(\text{POI}_i \to \text{POI}_j)$ should be very small if any of $\text{Pr}(\text{Cat}_i \to \text{Cat}_j)$, $\text{Pr}(\text{Pop}_i \to \text{Pop}_j)$ and $\text{Pr}(\text{Dist}_i \to \text{Dist}_j)$ is very small. In the extreme case, if any of the three probabilities is $0$, then $\text{Pr}(\text{POI}_i \to \text{POI}_j)$ should be $0$, because the event "transition from POI$_i$ to POI$_j$" is impossible.

From the equation of the addition case, it is clear that the addition rule contradicts the above fact while the multiplication rule is consistent with it.

Intuitively, the addition rule can make an unlikely event much more likely; in particular, it can turn an impossible event into a possible one.
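
For example, suppose (with made-up numbers) $\text{Pr}(\text{Cat}_i \to \text{Cat}_j) = 0.5$, $\text{Pr}(\text{Pop}_i \to \text{Pop}_j) = 0.4$ and $\text{Pr}(\text{Dist}_i \to \text{Dist}_j) = 0$: the addition rule gives $(0.5 + 0.4 + 0)/Z_i > 0$, so the impossible transition still receives substantial probability, while the multiplication rule gives $0.5 \times 0.4 \times 0 = 0$ and keeps it impossible.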

3.2 Transition Probabilities between POI Categories


In [127]:
def normalise_transmat(transmat):
    assert(isinstance(transmat, pd.DataFrame))
    for row in range(transmat.index.shape[0]):
        nonzeroidx = np.nonzero(transmat.iloc[row])[0].tolist()
        if len(nonzeroidx) < transmat.columns.shape[0]:
            minv = np.min(transmat.iloc[row, nonzeroidx])
            EPS = 0.1 * minv  # row-specific smooth factor
            #zeroidx = list(set(range(len(transmat.columns))) - set(nonzeroidx))
            #transmat.iloc[row, zeroidx] = EPS
            transmat.iloc[row] += EPS
        rowsum = np.sum(transmat.iloc[row])
        assert(rowsum > 0)
        transmat.iloc[row] /= rowsum
    return transmat

In [128]:
def calc_poi_cat_transmat(seqid_set, poi_all, seq_all):
    poi_cats = poi_all['poiTheme'].unique().tolist()
    poi_cats.sort()
    poi_cat_transmat = pd.DataFrame(data=np.zeros((len(poi_cats), len(poi_cats)), dtype=np.float), \
                                    index=poi_cats, columns=poi_cats)
    for seqid in seqid_set:
        seq = extract_seq(seqid, seq_all)
        for j in range(len(seq)-1):
            poi1 = seq[j]
            poi2 = seq[j+1]
            cat1 = poi_all.loc[poi1, 'poiTheme']
            cat2 = poi_all.loc[poi2, 'poiTheme']
            poi_cat_transmat.loc[cat1, cat2] += 1
    return poi_cat_transmat

In [129]:
poi_cat_transmat = calc_poi_cat_transmat(seq_all['seqID'].unique(), poi_all, seq_all)
poi_cat_transmat


Out[129]:
Amusement Beach Cultural Shopping Sport Structure
Amusement 14 20 51 17 35 16
Beach 18 43 61 92 19 106
Cultural 46 70 40 72 22 94
Shopping 16 101 58 16 19 71
Sport 42 23 21 13 8 21
Structure 26 91 81 59 24 24

In [130]:
poi_cat_transmat = normalise_transmat(poi_cat_transmat)
poi_cat_transmat


Out[130]:
Amusement Beach Cultural Shopping Sport Structure
Amusement 0.091503 0.130719 0.333333 0.111111 0.228758 0.104575
Beach 0.053097 0.126844 0.179941 0.271386 0.056047 0.312684
Cultural 0.133721 0.203488 0.116279 0.209302 0.063953 0.273256
Shopping 0.056940 0.359431 0.206406 0.056940 0.067616 0.252669
Sport 0.328125 0.179688 0.164062 0.101562 0.062500 0.164062
Structure 0.085246 0.298361 0.265574 0.193443 0.078689 0.078689

3.3 Transition Probabilities between POI Popularity Classes

TODO: Improve the discretization using Fayyad's minimum description length principle (MDLP) described in this paper.

NOTE: MDLP is a supervised discretization method which requires some sort of class labels that are not available here. We need an unsupervised discretization method; well-known ones include equal width, equal frequency, clustering-based, etc.

3.3.1 Discretize POI Popularity

What are the general criteria for data discretization? See Discretization of continuous features on Wikipedia.
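
A minimal sketch of the equal-frequency alternative mentioned above, using pandas' qcut (an illustration only; the rest of this notebook keeps the manually chosen bins defined below):

# Sketch only: unsupervised equal-frequency (quantile) binning into 3 popularity classes.
eqfreq_class, eqfreq_bins = pd.qcut(poi_all['poiFreq'], q=3, labels=[1, 2, 3], retbins=True)
print(eqfreq_bins)                  # quantile-based bin edges
print(eqfreq_class.value_counts())  # roughly equal number of POIs per class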


In [131]:
poi_all['poiFreq'].get_values()


Out[131]:
array([3506,  609,  688, 3056,  986, 2064, 1736,  278,  346, 4142,  481,
        964,  141,  113, 3553,  808,   26,  111,   89, 3594, 3619, 1874,
       1028, 1701,  104,  631,  936,  744, 1538])

In [132]:
poi_all['poiFreq'].describe()


Out[132]:
count      29.000000
mean     1360.896552
std      1289.191534
min        26.000000
25%       346.000000
50%       936.000000
75%      1874.000000
max      4142.000000
Name: poiFreq, dtype: float64

In [133]:
ax = poi_all['poiFreq'].hist(bins=10)
ax.set_xlabel('POI Popularity')
ax.set_ylabel('#POI')


Out[133]:
<matplotlib.text.Text at 0x7f5e117bd320>

In [134]:
plt.plot(np.ones(poi_all.index.shape[0]), np.sqrt(poi_all['poiFreq']), marker='+')


Out[134]:
[<matplotlib.lines.Line2D at 0x7f5e0cbb2518>]

In [135]:
#bins = [1, 346, 936, 1874, 5000]
#bins = np.linspace(0, 10000, 11)
#bins = np.logspace(0, 4, 5)
#bins = [1, 100, 500, 1000, 2000, 5000]
bins = [1, 500, 1500, 10000]

In [136]:
ax = poi_all['poiFreq'].hist(bins=bins)
ax.set_xlabel('POI Popularity')
ax.set_ylabel('#POI')
ax.set_xscale('log')



In [137]:
poi_all['popClass'] = np.digitize(poi_all['poiFreq'].get_values(), bins)
poi_all


Out[137]:
poiTheme poiFreq poiLon poiLat popClass
poiID
1 Sport 3506 -79.379243 43.643183 3
2 Sport 609 -79.418634 43.632772 2
3 Sport 688 -79.380045 43.662175 2
4 Sport 3056 -79.389290 43.641297 3
6 Cultural 986 -79.392396 43.653662 2
7 Cultural 2064 -79.377327 43.647151 3
8 Cultural 1736 -79.385349 43.642385 3
9 Cultural 278 -79.339170 43.716447 1
10 Cultural 346 -79.361107 43.667067 1
11 Cultural 4142 -79.394458 43.667183 3
12 Cultural 481 -79.182211 43.820080 1
13 Cultural 964 -79.409364 43.678157 2
14 Amusement 141 -79.417001 43.633924 1
15 Amusement 113 -79.373573 43.619836 1
16 Amusement 3553 -79.387065 43.642849 3
17 Amusement 808 -79.416011 43.632563 2
18 Amusement 26 -79.450808 43.637183 1
19 Beach 111 -79.378227 43.621697 1
20 Beach 89 -79.462382 43.646557 1
21 Beach 3594 -79.380453 43.656274 3
22 Beach 3619 -79.383702 43.652478 3
23 Shopping 1874 -79.379884 43.653868 3
24 Shopping 1028 -79.382320 43.638621 2
25 Shopping 1701 -79.401168 43.654748 3
26 Shopping 104 -79.452683 43.725761 1
27 Shopping 631 -79.390935 43.670104 2
28 Structure 936 -79.381184 43.652181 2
29 Structure 744 -79.391265 43.662138 2
30 Structure 1538 -79.380584 43.645651 3

3.3.2 Compute Transition Probabilities


In [138]:
def calc_poi_pop_transmat(seqid_set, poi_all, seq_all):
    pop_class = poi_all['popClass'].unique().tolist()
    pop_class.sort()
    poi_pop_transmat = pd.DataFrame(data=np.zeros((len(pop_class), len(pop_class)), dtype=np.float), \
                                    index=pop_class, columns=pop_class)
    for seqid in seqid_set:
        seq = extract_seq(seqid, seq_all)
        for j in range(len(seq)-1):
            poi1 = seq[j]
            poi2 = seq[j+1]
            pc1 = poi_all.loc[poi1, 'popClass']
            pc2 = poi_all.loc[poi2, 'popClass']
            poi_pop_transmat.loc[pc1, pc2] += 1
    return poi_pop_transmat

In [139]:
poi_pop_transmat = calc_poi_pop_transmat(seq_all['seqID'].unique(), poi_all, seq_all)
poi_pop_transmat


Out[139]:
1 2 3
1 8 11 18
2 18 52 301
3 12 312 818

In [140]:
poi_pop_transmat = normalise_transmat(poi_pop_transmat)
poi_pop_transmat


Out[140]:
1 2 3
1 0.216216 0.297297 0.486486
2 0.048518 0.140162 0.811321
3 0.010508 0.273205 0.716287

3.4 Transition Probabilities between POIs
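
Here the POI-to-POI transition matrix combines only the category and popularity transition matrices by multiplication (the distance factor is incorporated separately when computing trajectory likelihood in Section 4); each row is normalised afterwards:

\begin{equation} \text{Pr}(\text{POI}_i \to \text{POI}_j) = \frac{ \text{Pr}(\text{Cat}_{\text{POI}_i} \to \text{Cat}_{\text{POI}_j}) \times \text{Pr}(\text{Pop}_{\text{POI}_i} \to \text{Pop}_{\text{POI}_j}) } {Z_i} \end{equation}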


In [141]:
def calc_poi_transmat(poi_all, seq_all, poi_cat_transmat, poi_pop_transmat):
    poi_transmat = pd.DataFrame(data=np.zeros((poi_all.index.shape[0], poi_all.index.shape[0]), dtype=np.float), \
                                index=poi_all.index, columns=poi_all.index)
    for poi1 in poi_all.index:
        cat1 = poi_all.loc[poi1, 'poiTheme']
        pc1  = poi_all.loc[poi1, 'popClass']
        for poi2 in poi_all.index:
            cat2 = poi_all.loc[poi2, 'poiTheme']
            pc2  = poi_all.loc[poi2, 'popClass']
            logP = math.log10(poi_cat_transmat.loc[cat1, cat2]) + math.log10(poi_pop_transmat.loc[pc1, pc2])
            poi_transmat.loc[poi1, poi2] = 10 ** (logP)
    return poi_transmat

In [142]:
poi_transmat = calc_poi_transmat(poi_all, seq_all, poi_cat_transmat, poi_pop_transmat)
poi_transmat


Out[142]:
poiID 1 2 3 4 6 7 8 9 10 11 ... 21 22 23 24 25 26 27 28 29 30
poiID
1 0.044768 0.017075 0.017075 0.044768 0.044823 0.117516 0.117516 0.001724 0.001724 0.117516 ... 0.128708 0.128708 0.072748 0.027747 0.072748 0.001067 0.027747 0.044823 0.044823 0.117516
2 0.050708 0.008760 0.008760 0.050708 0.022995 0.133107 0.133107 0.007960 0.007960 0.133107 ... 0.145784 0.145784 0.082400 0.014235 0.082400 0.004928 0.014235 0.022995 0.022995 0.133107
3 0.050708 0.008760 0.008760 0.050708 0.022995 0.133107 0.133107 0.007960 0.007960 0.133107 ... 0.145784 0.145784 0.082400 0.014235 0.082400 0.004928 0.014235 0.022995 0.022995 0.133107
4 0.044768 0.017075 0.017075 0.044768 0.044823 0.117516 0.117516 0.001724 0.001724 0.117516 ... 0.128708 0.128708 0.072748 0.027747 0.072748 0.001067 0.027747 0.044823 0.044823 0.117516
6 0.051887 0.008964 0.008964 0.051887 0.016298 0.094340 0.094340 0.005642 0.005642 0.094340 ... 0.165094 0.165094 0.169811 0.029336 0.169811 0.010155 0.029336 0.038300 0.038300 0.221698
7 0.045809 0.017472 0.017472 0.045809 0.031768 0.083289 0.083289 0.001222 0.001222 0.083289 ... 0.145756 0.145756 0.149921 0.057182 0.149921 0.002199 0.057182 0.074655 0.074655 0.195730
8 0.045809 0.017472 0.017472 0.045809 0.031768 0.083289 0.083289 0.001222 0.001222 0.083289 ... 0.145756 0.145756 0.149921 0.057182 0.149921 0.002199 0.057182 0.074655 0.074655 0.195730
9 0.031113 0.019013 0.019013 0.031113 0.034569 0.056568 0.056568 0.025141 0.025141 0.056568 ... 0.098994 0.098994 0.101823 0.062225 0.101823 0.045255 0.062225 0.081238 0.081238 0.132935
10 0.031113 0.019013 0.019013 0.031113 0.034569 0.056568 0.056568 0.025141 0.025141 0.056568 ... 0.098994 0.098994 0.101823 0.062225 0.101823 0.045255 0.062225 0.081238 0.081238 0.132935
11 0.045809 0.017472 0.017472 0.045809 0.031768 0.083289 0.083289 0.001222 0.001222 0.083289 ... 0.145756 0.145756 0.149921 0.057182 0.149921 0.002199 0.057182 0.074655 0.074655 0.195730
12 0.031113 0.019013 0.019013 0.031113 0.034569 0.056568 0.056568 0.025141 0.025141 0.056568 ... 0.098994 0.098994 0.101823 0.062225 0.101823 0.045255 0.062225 0.081238 0.081238 0.132935
13 0.051887 0.008964 0.008964 0.051887 0.016298 0.094340 0.094340 0.005642 0.005642 0.094340 ... 0.165094 0.165094 0.169811 0.029336 0.169811 0.010155 0.029336 0.038300 0.038300 0.221698
14 0.111288 0.068009 0.068009 0.111288 0.099099 0.162162 0.162162 0.072072 0.072072 0.162162 ... 0.063593 0.063593 0.054054 0.033033 0.054054 0.024024 0.033033 0.031090 0.031090 0.050874
15 0.111288 0.068009 0.068009 0.111288 0.099099 0.162162 0.162162 0.072072 0.072072 0.162162 ... 0.063593 0.063593 0.054054 0.033033 0.054054 0.024024 0.033033 0.031090 0.031090 0.050874
16 0.163857 0.062498 0.062498 0.163857 0.091068 0.238762 0.238762 0.003503 0.003503 0.238762 ... 0.093632 0.093632 0.079587 0.030356 0.079587 0.001168 0.030356 0.028570 0.028570 0.074906
17 0.185596 0.032063 0.032063 0.185596 0.046721 0.270440 0.270440 0.016173 0.016173 0.270440 ... 0.106055 0.106055 0.090147 0.015574 0.090147 0.005391 0.015574 0.014657 0.014657 0.084844
18 0.111288 0.068009 0.068009 0.111288 0.099099 0.162162 0.162162 0.072072 0.072072 0.162162 ... 0.063593 0.063593 0.054054 0.033033 0.054054 0.024024 0.033033 0.031090 0.031090 0.050874
19 0.027266 0.016663 0.016663 0.027266 0.053496 0.087539 0.087539 0.038906 0.038906 0.087539 ... 0.061708 0.061708 0.132026 0.080682 0.132026 0.058678 0.080682 0.092960 0.092960 0.152117
20 0.027266 0.016663 0.016663 0.027266 0.053496 0.087539 0.087539 0.038906 0.038906 0.087539 ... 0.061708 0.061708 0.132026 0.080682 0.132026 0.058678 0.080682 0.092960 0.092960 0.152117
21 0.040146 0.015312 0.015312 0.040146 0.049161 0.128889 0.128889 0.001891 0.001891 0.128889 ... 0.090856 0.090856 0.194391 0.074144 0.194391 0.002852 0.074144 0.085427 0.085427 0.223972
22 0.040146 0.015312 0.015312 0.040146 0.049161 0.128889 0.128889 0.001891 0.001891 0.128889 ... 0.090856 0.090856 0.194391 0.074144 0.194391 0.002852 0.074144 0.085427 0.085427 0.223972
23 0.048432 0.018473 0.018473 0.048432 0.056391 0.147846 0.147846 0.002169 0.002169 0.147846 ... 0.257456 0.257456 0.040785 0.015556 0.040785 0.000598 0.015556 0.069030 0.069030 0.180984
24 0.054858 0.009477 0.009477 0.054858 0.028930 0.167461 0.167461 0.010014 0.010014 0.167461 ... 0.291614 0.291614 0.046196 0.007981 0.046196 0.002763 0.007981 0.035415 0.035415 0.204996
25 0.048432 0.018473 0.018473 0.048432 0.056391 0.147846 0.147846 0.002169 0.002169 0.147846 ... 0.257456 0.257456 0.040785 0.015556 0.040785 0.000598 0.015556 0.069030 0.069030 0.180984
26 0.032894 0.020102 0.020102 0.032894 0.061364 0.100414 0.100414 0.044628 0.044628 0.100414 ... 0.174858 0.174858 0.027700 0.016928 0.027700 0.012311 0.016928 0.075118 0.075118 0.122920
27 0.054858 0.009477 0.009477 0.054858 0.028930 0.167461 0.167461 0.010014 0.010014 0.167461 ... 0.291614 0.291614 0.046196 0.007981 0.046196 0.002763 0.007981 0.035415 0.035415 0.204996
28 0.063842 0.011029 0.011029 0.063842 0.037223 0.215466 0.215466 0.012885 0.012885 0.215466 ... 0.242066 0.242066 0.156944 0.027113 0.156944 0.009385 0.027113 0.011029 0.011029 0.063842
29 0.063842 0.011029 0.011029 0.063842 0.037223 0.215466 0.215466 0.012885 0.012885 0.215466 ... 0.242066 0.242066 0.156944 0.027113 0.156944 0.009385 0.027113 0.011029 0.011029 0.063842
30 0.056364 0.021498 0.021498 0.056364 0.072556 0.190227 0.190227 0.002791 0.002791 0.190227 ... 0.213712 0.213712 0.138560 0.052849 0.138560 0.002033 0.052849 0.021498 0.021498 0.056364

29 rows × 29 columns


In [143]:
poi_transmat.sum(axis=1)


Out[143]:
poiID
1     1.576483
2     1.634392
3     1.634392
4     1.576483
6     1.657619
7     1.637397
8     1.637397
9     1.560811
10    1.560811
11    1.637397
12    1.560811
13    1.657619
14    1.885533
15    1.885533
16    1.998180
17    2.044765
18    1.885533
19    1.650642
20    1.650642
21    1.774917
22    1.774917
23    1.709562
24    1.786467
25    1.709562
26    1.625277
27    1.786467
28    1.979241
29    1.979241
30    1.884832
dtype: float64

In [144]:
for poi1 in poi_transmat.index:
    rowsum = poi_transmat.loc[poi1].sum()
    #print(rowsum)
    assert(rowsum > 0.)
    #poi_transmat.loc[poi1] /= rowsum # OK
    #print(poi_transmat.loc[poi1].sum())
    logSum = math.log10(rowsum)
    #print(logSum)
    for poi2 in poi_transmat.columns:
        logP = math.log10(poi_transmat.loc[poi1, poi2]) - logSum
        poi_transmat.loc[poi1, poi2] = 10 ** (logP)

In [145]:
poi_transmat


Out[145]:
poiID 1 2 3 4 6 7 8 9 10 11 ... 21 22 23 24 25 26 27 28 29 30
poiID
1 0.028397 0.010831 0.010831 0.028397 0.028432 0.074543 0.074543 0.001094 0.001094 0.074543 ... 0.081642 0.081642 0.046146 0.017601 0.046146 0.000677 0.017601 0.028432 0.028432 0.074543
2 0.031025 0.005360 0.005360 0.031025 0.014070 0.081441 0.081441 0.004870 0.004870 0.081441 ... 0.089198 0.089198 0.050416 0.008710 0.050416 0.003015 0.008710 0.014070 0.014070 0.081441
3 0.031025 0.005360 0.005360 0.031025 0.014070 0.081441 0.081441 0.004870 0.004870 0.081441 ... 0.089198 0.089198 0.050416 0.008710 0.050416 0.003015 0.008710 0.014070 0.014070 0.081441
4 0.028397 0.010831 0.010831 0.028397 0.028432 0.074543 0.074543 0.001094 0.001094 0.074543 ... 0.081642 0.081642 0.046146 0.017601 0.046146 0.000677 0.017601 0.028432 0.028432 0.074543
6 0.031302 0.005408 0.005408 0.031302 0.009832 0.056913 0.056913 0.003403 0.003403 0.056913 ... 0.099597 0.099597 0.102443 0.017698 0.102443 0.006126 0.017698 0.023105 0.023105 0.133745
7 0.027977 0.010671 0.010671 0.027977 0.019402 0.050867 0.050867 0.000746 0.000746 0.050867 ... 0.089017 0.089017 0.091560 0.034923 0.091560 0.001343 0.034923 0.045594 0.045594 0.119537
8 0.027977 0.010671 0.010671 0.027977 0.019402 0.050867 0.050867 0.000746 0.000746 0.050867 ... 0.089017 0.089017 0.091560 0.034923 0.091560 0.001343 0.034923 0.045594 0.045594 0.119537
9 0.019934 0.012182 0.012182 0.019934 0.022148 0.036243 0.036243 0.016108 0.016108 0.036243 ... 0.063425 0.063425 0.065237 0.039867 0.065237 0.028994 0.039867 0.052049 0.052049 0.085171
10 0.019934 0.012182 0.012182 0.019934 0.022148 0.036243 0.036243 0.016108 0.016108 0.036243 ... 0.063425 0.063425 0.065237 0.039867 0.065237 0.028994 0.039867 0.052049 0.052049 0.085171
11 0.027977 0.010671 0.010671 0.027977 0.019402 0.050867 0.050867 0.000746 0.000746 0.050867 ... 0.089017 0.089017 0.091560 0.034923 0.091560 0.001343 0.034923 0.045594 0.045594 0.119537
12 0.019934 0.012182 0.012182 0.019934 0.022148 0.036243 0.036243 0.016108 0.016108 0.036243 ... 0.063425 0.063425 0.065237 0.039867 0.065237 0.028994 0.039867 0.052049 0.052049 0.085171
13 0.031302 0.005408 0.005408 0.031302 0.009832 0.056913 0.056913 0.003403 0.003403 0.056913 ... 0.099597 0.099597 0.102443 0.017698 0.102443 0.006126 0.017698 0.023105 0.023105 0.133745
14 0.059022 0.036069 0.036069 0.059022 0.052558 0.086003 0.086003 0.038224 0.038224 0.086003 ... 0.033727 0.033727 0.028668 0.017519 0.028668 0.012741 0.017519 0.016489 0.016489 0.026981
15 0.059022 0.036069 0.036069 0.059022 0.052558 0.086003 0.086003 0.038224 0.038224 0.086003 ... 0.033727 0.033727 0.028668 0.017519 0.028668 0.012741 0.017519 0.016489 0.016489 0.026981
16 0.082003 0.031277 0.031277 0.082003 0.045576 0.119490 0.119490 0.001753 0.001753 0.119490 ... 0.046859 0.046859 0.039830 0.015192 0.039830 0.000584 0.015192 0.014298 0.014298 0.037487
17 0.090767 0.015681 0.015681 0.090767 0.022849 0.132260 0.132260 0.007909 0.007909 0.132260 ... 0.051867 0.051867 0.044087 0.007616 0.044087 0.002636 0.007616 0.007168 0.007168 0.041493
18 0.059022 0.036069 0.036069 0.059022 0.052558 0.086003 0.086003 0.038224 0.038224 0.086003 ... 0.033727 0.033727 0.028668 0.017519 0.028668 0.012741 0.017519 0.016489 0.016489 0.026981
19 0.016519 0.010095 0.010095 0.016519 0.032409 0.053033 0.053033 0.023570 0.023570 0.053033 ... 0.037384 0.037384 0.079985 0.048879 0.079985 0.035549 0.048879 0.056318 0.056318 0.092156
20 0.016519 0.010095 0.010095 0.016519 0.032409 0.053033 0.053033 0.023570 0.023570 0.053033 ... 0.037384 0.037384 0.079985 0.048879 0.079985 0.035549 0.048879 0.056318 0.056318 0.092156
21 0.022618 0.008627 0.008627 0.022618 0.027697 0.072617 0.072617 0.001065 0.001065 0.072617 ... 0.051189 0.051189 0.109521 0.041773 0.109521 0.001607 0.041773 0.048130 0.048130 0.126187
22 0.022618 0.008627 0.008627 0.022618 0.027697 0.072617 0.072617 0.001065 0.001065 0.072617 ... 0.051189 0.051189 0.109521 0.041773 0.109521 0.001607 0.041773 0.048130 0.048130 0.126187
23 0.028330 0.010806 0.010806 0.028330 0.032986 0.086482 0.086482 0.001269 0.001269 0.086482 ... 0.150597 0.150597 0.023857 0.009099 0.023857 0.000350 0.009099 0.040379 0.040379 0.105865
24 0.030708 0.005305 0.005305 0.030708 0.016194 0.093739 0.093739 0.005606 0.005606 0.093739 ... 0.163235 0.163235 0.025859 0.004467 0.025859 0.001546 0.004467 0.019824 0.019824 0.114749
25 0.028330 0.010806 0.010806 0.028330 0.032986 0.086482 0.086482 0.001269 0.001269 0.086482 ... 0.150597 0.150597 0.023857 0.009099 0.023857 0.000350 0.009099 0.040379 0.040379 0.105865
26 0.020239 0.012368 0.012368 0.020239 0.037756 0.061782 0.061782 0.027459 0.027459 0.061782 ... 0.107587 0.107587 0.017043 0.010415 0.017043 0.007575 0.010415 0.046218 0.046218 0.075630
27 0.030708 0.005305 0.005305 0.030708 0.016194 0.093739 0.093739 0.005606 0.005606 0.093739 ... 0.163235 0.163235 0.025859 0.004467 0.025859 0.001546 0.004467 0.019824 0.019824 0.114749
28 0.032256 0.005572 0.005572 0.032256 0.018807 0.108863 0.108863 0.006510 0.006510 0.108863 ... 0.122303 0.122303 0.079295 0.013699 0.079295 0.004742 0.013699 0.005572 0.005572 0.032256
29 0.032256 0.005572 0.005572 0.032256 0.018807 0.108863 0.108863 0.006510 0.006510 0.108863 ... 0.122303 0.122303 0.079295 0.013699 0.079295 0.004742 0.013699 0.005572 0.005572 0.032256
30 0.029904 0.011406 0.011406 0.029904 0.038495 0.100925 0.100925 0.001481 0.001481 0.100925 ... 0.113385 0.113385 0.073513 0.028039 0.073513 0.001078 0.028039 0.011406 0.011406 0.029904

29 rows × 29 columns


In [146]:
poi_transmat.sum(axis=1)


Out[146]:
poiID
1     1
2     1
3     1
4     1
6     1
7     1
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    1
21    1
22    1
23    1
24    1
25    1
26    1
27    1
28    1
29    1
30    1
dtype: float64

3.5 Transition Probabilities between POI Pair Distance Classes

TODO: Improve the distance calculation using the Google Maps distance API with different travel modes, as demonstrated here.

3.5.1 Compute POI Pair Distance


In [147]:
def calc_dist(longitude1, latitude1, longitude2, latitude2):
    """Calculate the distance (unit: km) between two places on earth"""
    # convert degrees to radians
    lon1 = math.radians(longitude1)
    lat1 = math.radians(latitude1)
    lon2 = math.radians(longitude2)
    lat2 = math.radians(latitude2)
    radius = 6371.009 # mean earth radius is 6371.009km, en.wikipedia.org/wiki/Earth_radius#Mean_radius
    # The haversine formula, en.wikipedia.org/wiki/Great-circle_distance
    dlon = math.fabs(lon1 - lon2)
    dlat = math.fabs(lat1 - lat2)
    return 2 * radius * math.asin( math.sqrt( \
               (math.sin(0.5*dlat))**2 + math.cos(lat1) * math.cos(lat2) * (math.sin(0.5*dlon))**2 ))
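
For reference, calc_dist above implements the haversine formula for great-circle distance, with latitudes/longitudes converted to radians and mean Earth radius $r = 6371.009$ km:

\begin{equation} d = 2 r \arcsin\left( \sqrt{ \sin^2\left(\frac{\Delta\text{lat}}{2}\right) + \cos(\text{lat}_1) \cos(\text{lat}_2) \sin^2\left(\frac{\Delta\text{lon}}{2}\right) } \right) \end{equation}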

Compute the POI-pair distance if the pair is observed in the dataset.


In [148]:
def calc_obs_poipair_distmat(seqid_set, poi_all, seq_all):
    poi_distmat = pd.DataFrame(data=np.full((poi_all.shape[0], poi_all.shape[0]), np.nan, dtype=np.float), \
                               index=poi_all.index, columns=poi_all.index)
    for seqid in seqid_set:
        seq = extract_seq(seqid, seq_all)
        if len(seq) < 2: continue
        for j in range(len(seq)-1):
            poi1 = seq[j]
            poi2 = seq[j+1]
            if np.isnan(poi_distmat.loc[poi1, poi2]):
                dist = calc_dist(poi_all.loc[poi1, 'poiLon'], poi_all.loc[poi1, 'poiLat'], \
                                 poi_all.loc[poi2, 'poiLon'], poi_all.loc[poi2, 'poiLat'])
                poi_distmat.loc[poi1, poi2] = dist
                poi_distmat.loc[poi2, poi1] = dist
    return poi_distmat

In [149]:
poi_distmat = calc_obs_poipair_distmat(seq_all['seqID'].unique(), poi_all, seq_all)
poi_distmat


Out[149]:
poiID 1 2 3 4 6 7 8 9 10 11 ... 21 22 23 24 25 26 27 28 29 30
poiID
1 NaN 3.374660 NaN 0.835143 NaN 0.467431 0.499234 NaN 3.030221 NaN ... 1.458962 1.094119 1.189309 0.564375 2.183018 NaN 3.137767 1.012704 2.319111 0.294867
2 3.374660 NaN NaN 2.544650 NaN NaN NaN NaN NaN NaN ... NaN NaN 3.901948 NaN NaN NaN NaN NaN NaN 3.380300
3 NaN NaN NaN 2.437755 1.372334 1.684876 2.241542 NaN NaN NaN ... 0.657012 1.117674 0.923802 2.625464 1.889309 NaN NaN 1.115082 0.902520 1.837980
4 0.835143 2.544650 2.437755 NaN 1.397423 1.161969 0.339397 NaN 3.653970 2.908190 ... 1.810747 1.322054 1.589528 0.634904 1.774903 NaN NaN 1.374757 NaN 0.851501
6 NaN NaN 1.372334 1.397423 NaN 1.412097 1.376158 NaN NaN 1.512543 ... 1.003735 0.711743 1.006873 1.858595 0.715967 NaN 1.831988 0.916935 0.946920 1.302621
7 0.467431 NaN 1.684876 1.161969 1.412097 NaN 0.835124 NaN NaN 2.619294 ... 1.045146 0.783543 0.774725 1.030026 2.095884 NaN 2.777055 0.639632 2.008633 0.310629
8 0.499234 NaN 2.241542 0.339397 1.376158 0.835124 NaN NaN NaN 2.853071 ... 1.593802 1.130084 1.350432 0.484327 NaN NaN 3.114721 1.139623 2.247435 0.528062
9 NaN NaN NaN NaN NaN NaN NaN NaN NaN 7.054715 ... NaN 7.963547 NaN NaN NaN NaN NaN NaN NaN NaN
10 3.030221 NaN NaN 3.653970 NaN NaN NaN NaN NaN NaN ... 1.965216 2.436217 NaN NaN NaN NaN NaN NaN 2.486915 NaN
11 NaN NaN NaN 2.908190 1.512543 2.619294 2.853071 7.054715 NaN NaN ... 1.655460 1.849905 1.888500 3.322623 1.484285 8.021136 0.431079 NaN 0.616912 2.641660
12 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
13 NaN NaN NaN 4.405230 3.046508 4.304368 4.421919 NaN NaN 1.710602 ... 3.365736 NaN 3.594007 4.905005 2.685135 NaN 1.731676 3.671584 2.300347 4.292378
14 NaN 0.183578 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN 3.719935 NaN NaN NaN 4.536767 NaN NaN NaN
15 2.635840 NaN NaN 2.700936 NaN 3.052298 NaN NaN NaN NaN ... NaN NaN NaN 2.204304 NaN NaN NaN NaN NaN NaN
16 0.630510 NaN 2.221955 0.248612 1.276547 0.918038 0.147439 NaN NaN 2.770357 ... 1.584746 1.104379 1.354671 0.605657 1.742983 NaN NaN 1.140469 2.171325 0.607493
17 NaN 0.212342 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN 3.415143 NaN NaN NaN NaN NaN NaN NaN NaN
18 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
19 2.390428 NaN NaN NaN NaN 2.831227 2.370718 NaN NaN 5.223634 ... NaN 3.450908 NaN 1.910472 4.112679 NaN NaN NaN NaN 2.670207
20 NaN NaN NaN NaN 5.686019 NaN NaN NaN NaN NaN ... NaN 6.364574 NaN 6.502507 NaN NaN NaN NaN NaN NaN
21 1.458962 NaN 0.657012 1.810747 1.003735 1.045146 1.593802 NaN 1.965216 1.655460 ... NaN 0.496441 0.271411 1.968623 1.675104 9.665746 1.753738 0.458902 1.087043 1.181324
22 1.094119 NaN 1.117674 1.322054 0.711743 0.783543 1.130084 7.963547 2.436217 1.849905 ... 0.496441 NaN 0.343851 1.544818 1.427662 9.857157 2.044387 0.205239 1.234510 0.799581
23 1.189309 3.901948 0.923802 1.589528 1.006873 0.774725 1.350432 NaN NaN 1.888500 ... 0.271411 0.343851 NaN 1.706658 1.715107 NaN 2.012280 0.214791 1.297656 0.915491
24 0.564375 NaN 2.625464 0.634904 1.858595 1.030026 0.484327 NaN NaN 3.322623 ... 1.968623 1.544818 1.706658 NaN 2.348482 NaN 3.568606 1.510533 2.712207 0.793996
25 2.183018 NaN 1.889309 1.774903 0.715967 2.095884 NaN NaN NaN 1.484285 ... 1.675104 1.427662 1.715107 2.348482 NaN NaN 1.895545 1.632878 1.144568 1.940634
26 NaN NaN NaN NaN NaN NaN NaN NaN NaN 8.021136 ... 9.665746 9.857157 NaN NaN NaN NaN NaN NaN NaN NaN
27 3.137767 NaN NaN NaN 1.831988 2.777055 3.114721 NaN NaN 0.431079 ... 1.753738 2.044387 2.012280 3.568606 1.895545 NaN NaN 2.141689 0.886073 2.843698
28 1.012704 NaN 1.115082 1.374757 0.916935 0.639632 1.139623 NaN NaN NaN ... 0.458902 0.205239 0.214791 1.510533 1.632878 NaN 2.141689 NaN 1.372439 0.727763
29 2.319111 NaN 0.902520 NaN 0.946920 2.008633 2.247435 NaN 2.486915 0.616912 ... 1.087043 1.234510 1.297656 2.712207 1.144568 NaN 0.886073 1.372439 NaN 2.024766
30 0.294867 3.380300 1.837980 0.851501 1.302621 0.310629 0.528062 NaN NaN 2.641660 ... 1.181324 0.799581 0.915491 0.793996 1.940634 NaN 2.843698 0.727763 2.024766 NaN

29 rows × 29 columns

3.5.2 Discretize POI Pair Distance


In [150]:
#distdata = pd.Series([x for x in np.unique(poi_distmat.get_values().flatten()) if not np.isnan(x)])
distdata = pd.Series([poi_distmat.iloc[x, y] \
                      for x in range(poi_distmat.index.shape[0]) \
                      for y in range(x+1, poi_distmat.index.shape[0]) \
                      if not np.isnan(poi_distmat.iloc[x, y])])

In [151]:
distdata.describe()


Out[151]:
count    179.000000
mean       2.069742
std        1.659092
min        0.147439
25%        0.975327
50%        1.710602
75%        2.655934
max        9.857157
dtype: float64

In [152]:
ax = distdata.hist(bins=10)
ax.set_xlabel('POI-Pair Distance (km)')
ax.set_ylabel('#POI-Pair')


Out[152]:
<matplotlib.text.Text at 0x7f5e14e4fac8>

In [153]:
#bins = np.linspace(0, 10, 7)
#bins = np.logspace(0, 2, 4)
#bins = [0, 1, 2, 3, 10]
dist_bins = [0, 2, 5, 100] # walk, ride, drive

In [154]:
ax = distdata.ix[np.nonzero(distdata)].hist(bins=dist_bins)
ax.set_xlabel('POI-Pair Distance (km)')
ax.set_ylabel('#POI-Pair')
ax.set_xscale('log')



In [155]:
poi_distclass_mat = pd.DataFrame(data=np.full((poi_all.shape[0], poi_all.shape[0]), np.nan, dtype=np.float), \
                                 index=poi_all.index, columns=poi_all.index)

In [156]:
for i in range(poi_all.index.shape[0]):
    poi1 = poi_all.index[i]
    for j in range(i+1, poi_all.index.shape[0]):
        poi2 = poi_all.index[j]
        if np.isnan(poi_distmat.loc[poi1, poi2]): continue
        dc = np.digitize([poi_distmat.loc[poi1, poi2]], dist_bins)[0]
        poi_distclass_mat.loc[poi1, poi2] = dc
        poi_distclass_mat.loc[poi2, poi1] = dc

In [157]:
poi_distclass_mat


Out[157]:
poiID 1 2 3 4 6 7 8 9 10 11 ... 21 22 23 24 25 26 27 28 29 30
poiID
1 NaN 2 NaN 1 NaN 1 1 NaN 2 NaN ... 1 1 1 1 2 NaN 2 1 2 1
2 2 NaN NaN 2 NaN NaN NaN NaN NaN NaN ... NaN NaN 2 NaN NaN NaN NaN NaN NaN 2
3 NaN NaN NaN 2 1 1 2 NaN NaN NaN ... 1 1 1 2 1 NaN NaN 1 1 1
4 1 2 2 NaN 1 1 1 NaN 2 2 ... 1 1 1 1 1 NaN NaN 1 NaN 1
6 NaN NaN 1 1 NaN 1 1 NaN NaN 1 ... 1 1 1 1 1 NaN 1 1 1 1
7 1 NaN 1 1 1 NaN 1 NaN NaN 2 ... 1 1 1 1 2 NaN 2 1 2 1
8 1 NaN 2 1 1 1 NaN NaN NaN 2 ... 1 1 1 1 NaN NaN 2 1 2 1
9 NaN NaN NaN NaN NaN NaN NaN NaN NaN 3 ... NaN 3 NaN NaN NaN NaN NaN NaN NaN NaN
10 2 NaN NaN 2 NaN NaN NaN NaN NaN NaN ... 1 2 NaN NaN NaN NaN NaN NaN 2 NaN
11 NaN NaN NaN 2 1 2 2 3 NaN NaN ... 1 1 1 2 1 3 1 NaN 1 2
12 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
13 NaN NaN NaN 2 2 2 2 NaN NaN 1 ... 2 NaN 2 2 2 NaN 1 2 2 2
14 NaN 1 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN 2 NaN NaN NaN 2 NaN NaN NaN
15 2 NaN NaN 2 NaN 2 NaN NaN NaN NaN ... NaN NaN NaN 2 NaN NaN NaN NaN NaN NaN
16 1 NaN 2 1 1 1 1 NaN NaN 2 ... 1 1 1 1 1 NaN NaN 1 2 1
17 NaN 1 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN 2 NaN NaN NaN NaN NaN NaN NaN NaN
18 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
19 2 NaN NaN NaN NaN 2 2 NaN NaN 3 ... NaN 2 NaN 1 2 NaN NaN NaN NaN 2
20 NaN NaN NaN NaN 3 NaN NaN NaN NaN NaN ... NaN 3 NaN 3 NaN NaN NaN NaN NaN NaN
21 1 NaN 1 1 1 1 1 NaN 1 1 ... NaN 1 1 1 1 3 1 1 1 1
22 1 NaN 1 1 1 1 1 3 2 1 ... 1 NaN 1 1 1 3 2 1 1 1
23 1 2 1 1 1 1 1 NaN NaN 1 ... 1 1 NaN 1 1 NaN 2 1 1 1
24 1 NaN 2 1 1 1 1 NaN NaN 2 ... 1 1 1 NaN 2 NaN 2 1 2 1
25 2 NaN 1 1 1 2 NaN NaN NaN 1 ... 1 1 1 2 NaN NaN 1 1 1 1
26 NaN NaN NaN NaN NaN NaN NaN NaN NaN 3 ... 3 3 NaN NaN NaN NaN NaN NaN NaN NaN
27 2 NaN NaN NaN 1 2 2 NaN NaN 1 ... 1 2 2 2 1 NaN NaN 2 1 2
28 1 NaN 1 1 1 1 1 NaN NaN NaN ... 1 1 1 1 1 NaN 2 NaN 1 1
29 2 NaN 1 NaN 1 2 2 NaN 2 1 ... 1 1 1 2 1 NaN 1 1 NaN 2
30 1 2 1 1 1 1 1 NaN NaN 2 ... 1 1 1 1 1 NaN 2 1 2 NaN

29 rows × 29 columns

3.5.3 Compute Transition Probabilities


In [158]:
def calc_poipair_dist_transmat(seqid_set, poi_all, seq_all, poi_distclass_mat):
    dist_class = [poi_distclass_mat.iloc[x, y] \
                  for x in range(poi_distclass_mat.index.shape[0]) \
                  for y in range(x+1, poi_distclass_mat.index.shape[0]) \
                  if not np.isnan(poi_distclass_mat.iloc[x, y])]
    dist_class = np.unique(np.array(dist_class))
    poipair_dist_transmat = pd.DataFrame(data=np.zeros((len(dist_class), len(dist_class)), dtype=np.float), \
                                         index=dist_class, columns=dist_class)
    for seqid in seqid_set:
        seq = extract_seq(seqid, seq_all)
        if len(seq) < 3: continue
        for j in range(1, len(seq)-1):
            poi1 = seq[j-1]
            poi2 = seq[j]
            poi3 = seq[j+1]
            dc1 = int(poi_distclass_mat.loc[poi1, poi2])
            dc2 = int(poi_distclass_mat.loc[poi2, poi3])
            assert(not np.isnan(dc1))
            assert(not np.isnan(dc2))
            poipair_dist_transmat.loc[dc1, dc2] += 1
    return poipair_dist_transmat

In [159]:
poipair_dist_transmat = calc_poipair_dist_transmat(seq_all['seqID'].unique(), poi_all, seq_all, poi_distclass_mat)
poipair_dist_transmat


Out[159]:
1.0 2.0 3.0
1 491 35 1
2 31 13 0
3 2 0 0

In [160]:
poipair_dist_transmat = normalise_transmat(poipair_dist_transmat)
poipair_dist_transmat


Out[160]:
1.0 2.0 3.0
1 0.931689 0.066414 0.001898
2 0.674322 0.298539 0.027140
3 0.846154 0.076923 0.076923

4. Compute Trajectory Likelihood
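
For a trajectory $(p_1, \dots, p_n)$, the log likelihood used below combines the POI-to-POI transition matrix with the distance-class transition matrix (base-10 logarithms, as in calc_seq_loglikelihood):

\begin{equation} \log L(p_1, \dots, p_n) = \log\text{Pr}(p_1 \to p_2) + \sum_{j=2}^{n-1} \left[ \log\text{Pr}(p_j \to p_{j+1}) + \log\text{Pr}(\text{Dist}_{p_{j-1} \to p_j} \to \text{Dist}_{p_j \to p_{j+1}}) \right] \end{equation}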


In [161]:
def calc_seq_loglikelihood(seq, poi_all, poi_transmat, poi_distclass_mat, poipair_dist_transmat, dist_bins):
    assert(len(seq) > 1)
    logL = math.log10(poi_transmat.loc[seq[0], seq[1]])
    for j in range(1, len(seq)-1):
        poi1 = seq[j-1]
        poi2 = seq[j]
        poi3 = seq[j+1]
        dc12 = poi_distclass_mat.loc[poi1, poi2]
        dc23 = poi_distclass_mat.loc[poi2, poi3]
        if np.isnan(dc12):
            dist12 = calc_dist(poi_all.loc[poi1, 'poiLon'], poi_all.loc[poi1, 'poiLat'], \
                               poi_all.loc[poi2, 'poiLon'], poi_all.loc[poi2, 'poiLat'])
            dc12 = np.digitize([dist12], dist_bins)[0]
        if np.isnan(dc23):
            dist23 = calc_dist(poi_all.loc[poi2, 'poiLon'], poi_all.loc[poi2, 'poiLat'], \
                               poi_all.loc[poi3, 'poiLon'], poi_all.loc[poi3, 'poiLat'])
            dc23 = np.digitize([dist23], dist_bins)[0]
        logL += math.log10(poi_transmat.loc[poi2, poi3])
        logL += math.log10(poipair_dist_transmat.loc[dc12, dc23])
    return logL

Simple check.


In [162]:
seq1 = [10, 21, 28, 22]
d12 = calc_dist(poi_all.loc[10,'poiLon'], poi_all.loc[10,'poiLat'], poi_all.loc[21,'poiLon'], poi_all.loc[21, 'poiLat'])
d23 = calc_dist(poi_all.loc[21,'poiLon'], poi_all.loc[21,'poiLat'], poi_all.loc[28,'poiLon'], poi_all.loc[28, 'poiLat'])
d34 = calc_dist(poi_all.loc[28,'poiLon'], poi_all.loc[28,'poiLat'], poi_all.loc[22,'poiLon'], poi_all.loc[22, 'poiLat'])

In [163]:
print(d12, d23, d34, dist_bins)


1.965215721038275 0.45890229014413647 0.20523940866045706 [0, 2, 5, 100]

In [164]:
s1 = math.log10(poi_transmat.loc[10, 21]) + math.log10(poi_transmat.loc[21, 28]) + math.log10(poi_transmat.loc[28, 22])
s2 = math.log10(poipair_dist_transmat.loc[1, 1]) + math.log10(poipair_dist_transmat.loc[1, 1])
print(s1+s2)


-3.489346149470383

In [165]:
calc_seq_loglikelihood([10, 21, 28, 22], poi_all, poi_transmat, poi_distclass_mat, poipair_dist_transmat, dist_bins)


Out[165]:
-3.489346149470383

4.1 Log Likelihood of Actual Trajectories


In [166]:
seqid_set = sorted(seq_all['seqID'].unique().tolist())
act_seq_logL = pd.DataFrame(data=np.zeros((len(seqid_set), 3), dtype=np.object), \
                            index=seqid_set, columns=['actSeq', 'logLikelihood', 'seqLen'])
act_seq_logL.index.name = 'seqID'

In [167]:
for seqid in seqid_set:
    seq = extract_seq(seqid, seq_all)
    logL = 0
    if len(seq) < 2: 
        logL = math.log10(1)
    else:
        logL = calc_seq_loglikelihood(seq, poi_all, poi_transmat, poi_distclass_mat, poipair_dist_transmat, dist_bins)
    act_seq_logL.loc[seqid, 'actSeq'] = str(seq)
    act_seq_logL.loc[seqid, 'seqLen'] = len(seq)
    act_seq_logL.loc[seqid, 'logLikelihood'] = logL
    #print('Sequence %-20s Log likelihood: %.3f' % (str(seq), logL))

In [168]:
act_seq_logL.head()


Out[168]:
actSeq logLikelihood seqLen
seqID
1 [30] 0 1
2 [6] 0 1
3 [6] 0 1
4 [13] 0 1
5 [24] 0 1

In [169]:
seq_345_logL = act_seq_logL[act_seq_logL['seqLen'].isin({3, 4, 5})].copy()
seq_345_logL.sort(columns=['actSeq'], ascending=True, inplace=True)

In [170]:
seq_345_logL


Out[170]:
actSeq logLikelihood seqLen
seqID
5927 [1, 30, 4] -2.682596 3
852 [1, 30, 7] -2.154322 3
2338 [1, 30, 7] -2.154322 3
4805 [1, 7, 28] -2.499418 3
1171 [1, 8, 24] -2.615213 3
1128 [10, 21, 28, 22] -3.489346 4
1089 [11, 13, 30] -3.763629 3
4973 [11, 19, 15] -5.114342 3
1123 [11, 22, 23, 21] -2.894671 4
5707 [11, 24, 16, 7] -4.168811 4
2331 [11, 27, 23] -4.222023 3
1310 [11, 27, 6] -3.278264 3
1940 [11, 29, 6, 23, 22] -4.970668 5
2604 [11, 29, 7] -3.481959 3
683 [13, 16, 29] -3.55381 3
5475 [13, 24, 16] -3.510603 3
3935 [13, 4, 16] -2.502124 3
5751 [14, 2, 17] -3.024283 3
1404 [15, 19, 1] -4.783978 3
1121 [16, 11, 13] -2.805966 3
515 [16, 11, 23] -2.132094 3
1225 [16, 17, 19] -4.936115 3
523 [16, 22, 23] -2.320441 3
5971 [16, 22, 29] -2.677521 3
1523 [16, 23, 28, 22] -3.767657 4
1385 [16, 24, 13] -4.786775 3
2275 [16, 30, 22, 6] -3.990581 4
1312 [16, 30, 28, 21] -4.343014 4
4408 [16, 30, 28, 22, 29] -5.691327 5
4284 [16, 4, 22, 28] -3.553297 4
... ... ... ...
1969 [7, 28, 29] -3.625783 3
778 [7, 28, 30] -2.86322 3
1118 [7, 30, 16, 28] -4.318185 4
2440 [7, 30, 16] -2.442738 3
783 [7, 30, 21, 23, 22] -3.742814 5
809 [7, 30, 21, 23, 28] -4.314476 5
1968 [7, 30, 22, 21, 23] -4.211454 5
162 [7, 30, 22] -1.89867 3
712 [7, 30, 23] -2.08686 3
3514 [7, 30, 6] -2.367825 3
544 [7, 30, 8] -1.949227 3
5856 [7, 30, 8] -1.949227 3
1078 [7, 8, 16] -2.557162 3
1949 [8, 16, 13] -3.751877 3
1378 [8, 16, 1] -2.349767 3
3626 [8, 16, 1] -2.349767 3
3403 [8, 16, 22] -2.592805 3
1912 [8, 16, 4, 1, 29] -6.651153 5
302 [8, 16, 4, 22] -3.468581 4
1275 [8, 16, 4] -2.349767 3
1934 [8, 16, 7] -2.186265 3
3402 [8, 19, 22] -4.836412 3
3767 [8, 22, 28, 16, 7] -4.839599 5
3960 [8, 24, 30] -2.427871 3
898 [8, 30, 16] -2.442738 3
813 [8, 30, 28, 23] -4.027582 4
867 [8, 30, 7] -1.949227 3
3155 [8, 4, 16] -2.410494 3
1536 [8, 4, 16] -2.410494 3
1335 [8, 4, 16] -2.410494 3

309 rows × 3 columns

4.2 Log Likelihood of Enumerated Trajectories

Construct distinct trajectories of length {3, 4, 5}.


In [171]:
seq_345 = dict()

In [172]:
for seqid in sorted(seq_all['seqID'].unique().tolist()):
    seq = extract_seq(seqid, seq_all)
    if len(seq) not in {3, 4, 5}: continue
    if str(seq) not in seq_345:
        seq_345[str(seq)] = [(seqid, seq_user.loc[seqid])]
    else:
        seq_345[str(seq)].append((seqid, seq_user.loc[seqid]))

In [173]:
distinct_seq_345 = sorted([seq_345[x][0][0] for x in sorted(seq_345.keys())])

In [174]:
print(len(distinct_seq_345))
#distinct_seq_345


266

In [175]:
poi_list = poi_all.index.tolist()
#poi_list

Enumerate trajectories with the same (start, end) and length (3, 4 or 5) as an actual sequence.


In [176]:
def enum_345_seq(seqid, poi_list, seq_all): 
    seq = extract_seq(seqid, seq_all)
    assert(len(seq) in {3, 4, 5})
    p0 = seq[0]
    pN = seq[-1]
    
    # enumerate sequences with length 3
    if len(seq) == 3:
        return [[p0, p, pN] \
                for p in poi_list if p not in {p0, pN}]
    
    # enumerate sequences with length 4
    if len(seq) == 4:
        return [[p0, p1, p2, pN] \
                for p1 in poi_list if p1 not in {p0, pN} \
                for p2 in poi_list if p2 not in {p0, p1, pN}]
    
    # enumerate sequences with length 5
    if len(seq) == 5:
        return [[p0, p1, p2, p3, pN] \
                for p1 in poi_list if p1 not in {p0, pN} \
                for p2 in poi_list if p2 not in {p0, p1, pN} \
                for p3 in poi_list if p3 not in {p0, p1, p2, pN}]

Compute (or load from .pkl file if possible) the log likelihood of enumerated trajectories.


In [177]:
enum_logL_dict = dict() # key: actual sequence, value: tuple(loglikelihood of actual sequence, enum sequences dataframe)

In [178]:
doEnum = True

In [179]:
if os.path.exists(fenumseq):
    dict1 = pickle.load(open(fenumseq, 'rb'))
    if (np.array(sorted(dict1.keys())) == np.array(sorted(seq_345.keys()))).all():
        doEnum = False
        enum_logL_dict = dict1

In [180]:
doEnum


Out[180]:
False

In [181]:
if doEnum:
    for seqid in distinct_seq_345:
        enum_seqs = enum_345_seq(seqid, poi_list, seq_all)
        df = pd.DataFrame(data=np.array([str(x) for x in enum_seqs]))
        df.columns = ['enumSeq']
        df.set_index('enumSeq', inplace=True)
        df['logLikelihood'] = 0
        for seq in enum_seqs:
            #print(seq)
            logL = calc_seq_loglikelihood(seq, poi_all, poi_transmat, poi_distclass_mat, poipair_dist_transmat, dist_bins)
            df.loc[str(seq), 'logLikelihood'] = logL
        key = seq_345_logL.loc[seqid, 'actSeq']
        df.sort(columns=['logLikelihood'], ascending=False, inplace=True) # sort by the loglikelihood of enum sequences
        enum_logL_dict[key] = (seq_345_logL.loc[seqid, 'logLikelihood'], df)
    pickle.dump(enum_logL_dict, open(fenumseq, 'wb'))

In [182]:
seqidx = 0
seqid = distinct_seq_345[seqidx]
act_seq = seq_345_logL.loc[seqid, 'actSeq']
print('Actual Sequence:', act_seq)
print('Loglikelihood:', enum_logL_dict[act_seq][0])
enum_logL_dict[act_seq][1].head()


Actual Sequence: [7, 11, 27]
Loglikelihood: -2.92158949476864
Out[182]:
logLikelihood
enumSeq
[7, 21, 27] -2.460358
[7, 11, 27] -2.921589
[7, 25, 27] -3.250408
[7, 29, 27] -3.375546
[7, 6, 27] -3.494974

In [183]:
rank = 1 + np.nonzero(enum_logL_dict[act_seq][1].index == act_seq)[0][0]
print('rank of actual sequence: %d/%d' % (rank, enum_logL_dict[act_seq][1].index.shape[0]))


rank of actual sequence: 2/27

4.3 Compare the Log Likelihood of Actual and Enumerated Trajectories


In [184]:
actseqs = sorted(enum_logL_dict.keys())
actseq_logL = [enum_logL_dict[x][0] for x in actseqs]

In [185]:
df = pd.DataFrame(actseqs)
df.columns = ['actSeq']
df['actSeqLogLikelihood'] = [enum_logL_dict[x][0] for x in actseqs]
df['enumSeq'] = [enum_logL_dict[x][1].index[0] for x in actseqs] # the enum sequence with highest log likelihood
df['enumSeqLogLikelihood'] = [enum_logL_dict[x][1].iloc[0]['logLikelihood'] for x in actseqs]
# rank of actual sequence
df['actSeqRank'] = [1+np.nonzero(enum_logL_dict[x][1].index == x)[0][0] for x in actseqs]
df['#enumSeq'] = [enum_logL_dict[x][1].index.shape[0] for x in actseqs]
df['actSeqRank(Top%)'] = 100*df['actSeqRank']/df['#enumSeq']
df


Out[185]:
actSeq actSeqLogLikelihood enumSeq enumSeqLogLikelihood actSeqRank #enumSeq actSeqRank(Top%)
0 [1, 30, 4] -2.682596 [1, 16, 4] -1.943463 2 27 7.407407
1 [1, 30, 7] -2.154322 [1, 16, 7] -1.779961 2 27 7.407407
2 [1, 7, 28] -2.499418 [1, 22, 28] -2.436397 3 27 11.111111
3 [1, 8, 24] -2.615213 [1, 22, 24] -2.497915 3 27 11.111111
4 [10, 21, 28, 22] -3.489346 [10, 21, 25, 22] -3.041884 20 702 2.849003
5 [11, 13, 30] -3.763629 [11, 21, 30] -1.980241 17 27 62.962963
6 [11, 19, 15] -5.114342 [11, 2, 15] -4.508219 10 27 37.037037
7 [11, 22, 23, 21] -2.894671 [11, 22, 23, 21] -2.894671 1 702 0.142450
8 [11, 24, 16, 7] -4.168811 [11, 22, 30, 7] -3.006971 99 702 14.102564
9 [11, 27, 23] -4.222023 [11, 21, 23] -2.041759 18 27 66.666667
10 [11, 27, 6] -3.278264 [11, 30, 6] -2.508229 13 27 48.148148
11 [11, 29, 6, 23, 22] -4.970668 [11, 23, 21, 25, 22] -3.735348 499 17550 2.843305
12 [11, 29, 7] -3.481959 [11, 30, 7] -2.089631 15 27 55.555556
13 [13, 16, 29] -3.553810 [13, 22, 29] -2.490469 10 27 37.037037
14 [13, 24, 16] -3.510603 [13, 1, 16] -2.502124 12 27 44.444444
15 [13, 4, 16] -2.502124 [13, 1, 16] -2.502124 2 27 7.407407
16 [14, 2, 17] -3.024283 [14, 1, 17] -2.999148 3 27 11.111111
17 [15, 19, 1] -4.783978 [15, 7, 1] -2.789820 27 27 100.000000
18 [16, 11, 13] -2.805966 [16, 11, 13] -2.805966 1 27 3.703704
19 [16, 11, 23] -2.132094 [16, 7, 23] -1.991690 3 27 11.111111
20 [16, 17, 19] -4.936115 [16, 24, 19] -3.859599 6 27 22.222222
21 [16, 22, 23] -2.320441 [16, 7, 23] -1.991690 4 27 14.814815
22 [16, 22, 29] -2.677521 [16, 11, 29] -2.434898 3 27 11.111111
23 [16, 23, 28, 22] -3.767657 [16, 7, 23, 22] -2.844602 77 702 10.968661
24 [16, 24, 13] -4.786775 [16, 11, 13] -2.805966 16 27 59.259259
25 [16, 30, 22, 6] -3.990581 [16, 7, 30, 6] -3.321223 42 702 5.982906
26 [16, 30, 28, 21] -4.343014 [16, 8, 25, 21] -2.844602 149 702 21.225071
27 [16, 30, 28, 22, 29] -5.691327 [16, 8, 23, 22, 29] -4.192915 611 17550 3.481481
28 [16, 4, 22, 28] -3.553297 [16, 8, 21, 28] -3.352238 10 702 1.424501
29 [16, 4, 22] -2.204984 [16, 7, 22] -2.003925 5 27 18.518519
... ... ... ... ... ... ... ...
236 [7, 23, 28, 30] -3.984990 [7, 23, 22, 30] -2.820918 83 702 11.823362
237 [7, 27, 30] -2.922141 [7, 21, 30] -1.980241 13 27 48.148148
238 [7, 28, 22, 30] -3.214104 [7, 23, 22, 30] -2.820918 16 702 2.279202
239 [7, 28, 23] -2.472579 [7, 22, 23] -2.041759 5 27 18.518519
240 [7, 28, 29] -3.625783 [7, 21, 29] -2.398840 8 27 29.629630
241 [7, 28, 30] -2.863220 [7, 21, 30] -1.980241 12 27 44.444444
242 [7, 30, 16, 28] -4.318185 [7, 23, 22, 28] -3.239517 85 702 12.108262
243 [7, 30, 16] -2.442738 [7, 1, 16] -2.410494 3 27 11.111111
244 [7, 30, 21, 23, 22] -3.742814 [7, 23, 21, 25, 22] -3.735348 2 17550 0.011396
245 [7, 30, 21, 23, 28] -4.314476 [7, 22, 25, 21, 28] -4.242984 16 17550 0.091168
246 [7, 30, 22, 21, 23] -4.211454 [7, 22, 25, 21, 23] -3.885903 25 17550 0.142450
247 [7, 30, 22] -1.898670 [7, 23, 22] -1.891204 2 27 7.407407
248 [7, 30, 23] -2.086860 [7, 22, 23] -2.041759 3 27 11.111111
249 [7, 30, 6] -2.367825 [7, 30, 6] -2.367825 1 27 3.703704
250 [7, 30, 8] -1.949227 [7, 30, 8] -1.949227 1 27 3.703704
251 [7, 8, 16] -2.557162 [7, 1, 16] -2.410494 4 27 14.814815
252 [8, 16, 13] -3.751877 [8, 11, 13] -3.176862 7 27 25.925926
253 [8, 16, 1] -2.349767 [8, 16, 1] -2.349767 1 27 3.703704
254 [8, 16, 22] -2.592805 [8, 25, 22] -1.891204 10 27 37.037037
255 [8, 16, 4, 1, 29] -6.651153 [8, 22, 25, 21, 29] -4.242984 1522 17550 8.672365
256 [8, 16, 4, 22] -3.468581 [8, 21, 25, 22] -2.894671 39 702 5.555556
257 [8, 16, 4] -2.349767 [8, 16, 4] -2.349767 1 27 3.703704
258 [8, 16, 7] -2.186265 [8, 30, 7] -1.949227 3 27 11.111111
259 [8, 19, 22] -4.836412 [8, 25, 22] -1.891204 19 27 70.370370
260 [8, 22, 28, 16, 7] -4.839599 [8, 25, 22, 30, 7] -3.847648 313 17550 1.783476
261 [8, 24, 30] -2.427871 [8, 21, 30] -1.980241 6 27 22.222222
262 [8, 30, 16] -2.442738 [8, 1, 16] -2.410494 3 27 11.111111
263 [8, 30, 28, 23] -4.027582 [8, 25, 22, 23] -2.882436 76 702 10.826211
264 [8, 30, 7] -1.949227 [8, 30, 7] -1.949227 1 27 3.703704
265 [8, 4, 16] -2.410494 [8, 1, 16] -2.410494 2 27 7.407407

266 rows × 7 columns


In [186]:
enumseq_logL = [enum_logL_dict[x][1]['logLikelihood'] for x in actseqs]

In [187]:
ncol = 10
nrow = int(len(actseqs) / 10)
if len(actseqs) % 10 > 0: nrow += 1
plt.figure(figsize=[20, nrow*5])
for k in range(nrow):
    plt.subplot(nrow, 1, k+1)
    plt.ylim([-16, 0])
    if (k*ncol + ncol) > len(actseqs):
        N = len(actseqs[k*ncol:])
        xidx = range(1, N+1)
        plt.boxplot(enumseq_logL[k*ncol:], labels=actseqs[k*ncol:])
        plt.plot([], 'gD', xdata=xidx, ydata=actseq_logL[k*ncol:], label='actual trajectory')
    else:
        xidx = range(1, ncol+1)
        plt.boxplot(enumseq_logL[k*ncol:(k+1)*ncol], labels=actseqs[k*ncol:(k+1)*ncol])
        plt.plot([], 'gD', xdata=xidx, ydata=actseq_logL[k*ncol:(k+1)*ncol], label='actual trajectory')
    #plt.xlabel('Actual Trajectory')
    plt.ylabel('Log Likelihood')
    plt.legend(numpoints=1)


4.4 Compute the F1-score of Enumerated Trajectories


In [188]:
def calc_F1score(seq_act, seq_rec):
    assert(len(seq_act) > 0)
    assert(len(seq_rec) > 0)
    actset = set(seq_act)
    recset = set(seq_rec)
    intersect = actset & recset
    recall = len(intersect) / len(seq_act)
    precision = len(intersect) / len(seq_rec)
    F1score = 2. * precision * recall / (precision + recall)
    return F1score
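
Here recall is the fraction of POIs in the actual sequence that also appear in the recommended sequence, precision is the fraction of POIs in the recommended sequence that appear in the actual one, and

\begin{equation} \text{F1} = \frac{2 \times \text{precision} \times \text{recall}}{\text{precision} + \text{recall}} \end{equation}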

In [189]:
def parse_seqstr(seqstr):
    term = re.sub('[ \[\]]', '', seqstr).split(',')
    return [int(x) for x in term]

In [190]:
parse_seqstr(str([3, 11, 27]))


Out[190]:
[3, 11, 27]

In [191]:
calc_F1score(parse_seqstr(str([3, 11, 27])), parse_seqstr(str([3, 5, 27])))


Out[191]:
0.6666666666666666

In [192]:
calc_F1score(parse_seqstr(str([11, 29, 6, 23, 22])), parse_seqstr(str([11, 29, 6, 23, 22])))


Out[192]:
1.0

In [193]:
actseqs = sorted(enum_logL_dict.keys())

In [194]:
df1 = pd.DataFrame(actseqs)
df1.columns = ['actSeq']
df1['enumSeq#1'] = [enum_logL_dict[x][1].index[0] for x in actseqs] # the enum sequence with highest log likelihood
df1['F1-score#1'] = [calc_F1score(parse_seqstr(df1.loc[idx, 'actSeq']), parse_seqstr(df1.loc[idx, 'enumSeq#1'])) \
                     for idx in df1.index]
df1['enumSeq#2'] = [enum_logL_dict[x][1].index[1] for x in actseqs] # the enum sequence with second highest loglikelihood
df1['F1-score#2'] = [calc_F1score(parse_seqstr(df1.loc[idx, 'actSeq']), parse_seqstr(df1.loc[idx, 'enumSeq#2'])) \
                     for idx in df1.index]
df1['enumSeq#3'] = [enum_logL_dict[x][1].index[2] for x in actseqs] # the enum sequence with third highest loglikelihood
df1['F1-score#3'] = [calc_F1score(parse_seqstr(df1.loc[idx, 'actSeq']), parse_seqstr(df1.loc[idx, 'enumSeq#3'])) \
                     for idx in df1.index]

In [195]:
df1.head()


Out[195]:
actSeq enumSeq#1 F1-score#1 enumSeq#2 F1-score#2 enumSeq#3 F1-score#3
0 [1, 30, 4] [1, 16, 4] 0.666667 [1, 30, 4] 1.000000 [1, 7, 4] 0.666667
1 [1, 30, 7] [1, 16, 7] 0.666667 [1, 30, 7] 1.000000 [1, 22, 7] 0.666667
2 [1, 7, 28] [1, 22, 28] 0.666667 [1, 21, 28] 0.666667 [1, 7, 28] 1.000000
3 [1, 8, 24] [1, 22, 24] 0.666667 [1, 21, 24] 0.666667 [1, 8, 24] 1.000000
4 [10, 21, 28, 22] [10, 21, 25, 22] 0.750000 [10, 21, 23, 22] 0.750000 [10, 21, 30, 22] 0.750000

Boxplot of F1-scores for the enumerated trajectories with the top-K (or top percent) log likelihood.


In [196]:
topk = 10

In [197]:
enumseq_F1 = [[calc_F1score(parse_seqstr(x), parse_seqstr(enum_logL_dict[x][1].index[y])) \
               for y in range(enum_logL_dict[x][1].index.shape[0]) if y < topk] \
#              for y in range(enum_logL_dict[x][1].index.shape[0]) if y < enum_logL_dict[x][1].index.shape[0]*topk/100] \
               for x in actseqs]

In [198]:
seq0 = actseqs[0]
print(seq0)
print(enum_logL_dict[seq0][1].index.tolist())
print(enumseq_F1[0])
print(len(enumseq_F1))


[1, 30, 4]
['[1, 16, 4]', '[1, 30, 4]', '[1, 7, 4]', '[1, 8, 4]', '[1, 22, 4]', '[1, 21, 4]', '[1, 17, 4]', '[1, 23, 4]', '[1, 25, 4]', '[1, 28, 4]', '[1, 6, 4]', '[1, 11, 4]', '[1, 24, 4]', '[1, 29, 4]', '[1, 13, 4]', '[1, 27, 4]', '[1, 2, 4]', '[1, 3, 4]', '[1, 15, 4]', '[1, 14, 4]', '[1, 18, 4]', '[1, 10, 4]', '[1, 19, 4]', '[1, 12, 4]', '[1, 9, 4]', '[1, 20, 4]', '[1, 26, 4]']
[0.6666666666666666, 1.0, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666, 0.6666666666666666]
266

In [199]:
ncol = 10
nrow = int(len(actseqs) / 10)
if len(actseqs) % 10 > 0: nrow += 1
plt.figure(figsize=[20, nrow*5])
for k in range(nrow):
    plt.subplot(nrow, 1, k+1)
    plt.ylim([0.1, 1.1])
    if (k*ncol + ncol) > len(actseqs):
        N = len(actseqs[k*ncol:])
        xidx = range(1, N+1)
        plt.boxplot(enumseq_F1[k*ncol:], labels=actseqs[k*ncol:])
    else:
        xidx = range(1, ncol+1)
        plt.boxplot(enumseq_F1[k*ncol:(k+1)*ncol], labels=actseqs[k*ncol:(k+1)*ncol])
    #plt.xlabel('Actual Trajectory')
    plt.ylabel('F1 score')


Scatter plot of F1 score vs. log likelihood for enumerated trajectories.


In [200]:
print('topk =', topk)
enumseq_logL_list = [enum_logL_dict[x][1].iloc[i]['logLikelihood'] \
                     for x in actseqs \
                     for i in range(enum_logL_dict[x][1].shape[0]) if i < topk]


topk = 10

In [201]:
enumseq_F1_list = [calc_F1score(parse_seqstr(x), parse_seqstr(enum_logL_dict[x][1].index[y])) \
                   for x in actseqs \
                   for y in range(enum_logL_dict[x][1].index.shape[0]) if y < topk]

In [202]:
plt.figure(figsize=[10, 10])
plt.xlabel('Log Likelihood')
plt.ylabel('F1 score')
plt.scatter(enumseq_logL_list, enumseq_F1_list, marker='+')


Out[202]:
<matplotlib.collections.PathCollection at 0x7f5e12a14160>

Log Likelihood Rank vs. F1 score Rank.


In [203]:
enumseq_logL_rank = [list(range(1, 1+enum_logL_dict[x][1].index.shape[0])) for x in actseqs] 
# already sorted by loglikelihood

In [204]:
enumseq_F1_rank = []
for x in actseqs:
    F1score = [calc_F1score(parse_seqstr(x), parse_seqstr(enum_logL_dict[x][1].index[y])) \
               for y in range(enum_logL_dict[x][1].index.shape[0])]
    idx = np.argsort(F1score)
    rank = np.zeros(enum_logL_dict[x][1].index.shape[0], dtype=np.int)
    for i in range(len(idx)):
        rank[idx[i]] = i+1
    enumseq_F1_rank.append(rank)

In [205]:
X = [y for x in enumseq_logL_rank for y in x]
Y = [y for x in enumseq_F1_rank for y in x]
plt.figure(figsize=[10, 10])
plt.xlabel('Log Likelihood Rank')
plt.ylabel('F1 score Rank')
plt.xlim([0, max(X)+1])
plt.ylim([0, max(Y)+1])
assert(max(X) == max(Y))
plt.scatter(X, Y, marker='+')
plt.plot([0, max(X)+1], [0, max(Y)+1], linestyle='--', color='g')


Out[205]:
[<matplotlib.lines.Line2D at 0x7f5e12998080>]

Pearson Correlation Coefficient:


In [206]:
scipy.stats.pearsonr(X, Y)


Out[206]:
(-0.19296050323235645, 0.0)