NOTE: Before running this notebook, please run script src/ijcai15_setup.py
to setup data properly.
The states of the Markov chain (MC) corresponds to the categories of POIs, there is a special state "REST" which represents that people are having rests after some travelling.
Simulate trajectories using the transition matrix of the MC, when choosing a specific POI within a certain category, use the following rules:
For user $u$ and POI $p$, define
Travel History: \begin{equation*} S_u = \{(p_1, t_{p_1}^a, t_{p_1}^d), \dots, (p_n, t_{p_n}^a, t_{p_n}^d)\} \end{equation*} where $t_{p_i}^a$ is the arrival time and $t_{p_i}^d$ the departure time of user $u$ at POI $p_i$
Travel Sequences: split $S_u$ if \begin{equation*} |t_{p_i}^d - t_{p_{i+1}}^a| > \tau ~(\text{e.g.}~ \tau = 8 ~\text{hours}) \end{equation*}
POI Popularity: \begin{equation*} Pop(p) = \sum_{u \in U} \sum_{p_i \in S_u} \delta(p_i == p) \end{equation*}
In [372]:
%matplotlib inline
import os
import math
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
In [373]:
random.seed(123456789)
In [374]:
data_dir = 'data/data-ijcai15'
#fvisit = os.path.join(data_dir, 'userVisits-Osak.csv')
#fcoord = os.path.join(data_dir, 'photoCoords-Osak.csv')
#fvisit = os.path.join(data_dir, 'userVisits-Glas.csv')
#fcoord = os.path.join(data_dir, 'photoCoords-Glas.csv')
#fvisit = os.path.join(data_dir, 'userVisits-Edin.csv')
#fcoord = os.path.join(data_dir, 'photoCoords-Edin.csv')
fvisit = os.path.join(data_dir, 'userVisits-Toro.csv')
fcoord = os.path.join(data_dir, 'photoCoords-Toro.csv')
In [375]:
suffix = fvisit.split('-')[-1].split('.')[0]
In [376]:
visits = pd.read_csv(fvisit, sep=';')
visits.head()
Out[376]:
In [377]:
coords = pd.read_csv(fcoord, sep=';')
coords.head()
Out[377]:
In [378]:
# merge data frames according to column 'photoID'
assert(visits.shape[0] == coords.shape[0])
traj = pd.merge(visits, coords, on='photoID')
traj.head()
Out[378]:
In [379]:
num_photo = traj['photoID'].unique().shape[0]
num_user = traj['userID'].unique().shape[0]
num_seq = traj['seqID'].unique().shape[0]
num_poi = traj['poiID'].unique().shape[0]
pd.DataFrame([num_photo, num_user, num_seq, num_poi, num_photo/num_user, num_seq/num_user], \
index = ['#photo', '#user', '#seq', '#poi', '#photo/user', '#seq/user'], columns=[str(suffix)])
Out[379]:
Compute POI (Longitude, Latitude) as the average coordinates of the assigned photos.
In [380]:
poi_coords = traj[['poiID', 'photoLon', 'photoLat']].groupby('poiID').agg(np.mean)
poi_coords.reset_index(inplace=True)
poi_coords.rename(columns={'photoLon':'poiLon', 'photoLat':'poiLat'}, inplace=True)
poi_coords.head()
Out[380]:
Extract POI category and visiting frequency.
In [381]:
poi_catfreq = traj[['poiID', 'poiTheme', 'poiFreq']].groupby('poiID').first()
poi_catfreq.reset_index(inplace=True)
poi_catfreq.head()
Out[381]:
In [382]:
poi_all = pd.merge(poi_catfreq, poi_coords, on='poiID')
poi_all.set_index('poiID', inplace=True)
poi_all.head()
Out[382]:
In [383]:
seq_all = traj[['userID', 'seqID', 'poiID', 'dateTaken']].copy()\
.groupby(['userID', 'seqID', 'poiID']).agg([np.min, np.max])
seq_all.columns = seq_all.columns.droplevel()
seq_all.reset_index(inplace=True)
seq_all.rename(columns={'amin':'arrivalTime', 'amax':'departureTime'}, inplace=True)
seq_all['poiDuration(sec)'] = seq_all['departureTime'] - seq_all['arrivalTime']
seq_all.head()
Out[383]:
In [384]:
seq_start = seq_all[['userID', 'seqID', 'arrivalTime']].copy().groupby(['userID', 'seqID']).agg(np.min)
seq_start.rename(columns={'arrivalTime':'startTime'}, inplace=True)
seq_start.reset_index(inplace=True)
seq_start.head()
Out[384]:
In [385]:
seq_end = seq_all[['userID', 'seqID', 'departureTime']].copy().groupby(['userID', 'seqID']).agg(np.max)
seq_end.rename(columns={'departureTime':'endTime'}, inplace=True)
seq_end.reset_index(inplace=True)
seq_end.head()
Out[385]:
In [386]:
assert(seq_start.shape[0] == seq_end.shape[0])
user_seqs = pd.merge(seq_start, seq_end, on=['userID', 'seqID'])
user_seqs.head()
#user_seqs.loc[0, 'seqID']
#user_seqs['userID'].iloc[-1]
Out[386]:
Generate the extended transition matrix of POI category for actual trajectories with a special category REST.
For a specific user, if the time gap between the earlier sequence and the latter sequence is less than 'timeGap' (e.g. 24 hours), then add a REST state between the two sequences, otherwise, add a REST to REST transition after the earlier sequence.
In [387]:
def generate_ext_transmat(poi_all, seq_all, user_seqs, timeGap):
"""Calculate the extended transition matrix of POI category for actual trajectories with a special category REST.
For a specific user, if the time gap between the earlier sequence and the latter sequence is less than 'timeGap',
then add a REST state between the two sequences, otherwise,
add a REST to REST transition after the earlier sequence.
"""
assert(timeGap > 0)
states = poi_all['poiTheme'].unique().tolist()
states.sort()
states.append('REST')
ext_transmat = pd.DataFrame(data=np.zeros((len(states), len(states)), dtype=np.float64), \
index=states, columns=states)
for user in user_seqs['userID'].unique():
sequ = user_seqs[user_seqs['userID'] == user].copy()
sequ.sort(columns=['startTime'], ascending=True, inplace=True)
prev_seqEndTime = None
prev_endPOICat = None
# sequence with length 1 should be considered
for i in range(len(sequ.index)):
idx = sequ.index[i]
seqid = sequ.loc[idx, 'seqID']
seq = seq_all[seq_all['seqID'] == seqid].copy()
seq.sort(columns=['arrivalTime'], ascending=True, inplace=True)
for j in range(len(seq.index)-1):
poi1 = seq.loc[seq.index[j], 'poiID']
poi2 = seq.loc[seq.index[j+1], 'poiID']
cat1 = poi_all.loc[poi1, 'poiTheme']
cat2 = poi_all.loc[poi2, 'poiTheme']
ext_transmat.loc[cat1, cat2] += 1
# REST state
if i > 0:
startTime = sequ.loc[idx, 'startTime']
assert(prev_seqEndTime is not None)
assert(startTime >= prev_seqEndTime)
ext_transmat.loc[prev_endPOICat, 'REST'] += 1 # POI-->REST
if startTime - prev_seqEndTime < timeGap: # REST-->POI
poi0 = seq.loc[seq.index[0], 'poiID']
startPOICat = poi_all.loc[poi0, 'poiTheme']
ext_transmat.loc['REST', startPOICat] += 1
else: # REST-->REST
ext_transmat.loc['REST', 'REST'] += 1
# memorise info of previous sequence
prev_seqEndTime = sequ.loc[idx, 'endTime']
poiN = seq.loc[seq.index[-1], 'poiID']
prev_endPOICat = poi_all.loc[poiN, 'poiTheme']
# normalize each row to get the transition probability from cati to catj
for r in ext_transmat.index:
rowsum = ext_transmat.ix[r].sum()
if rowsum == 0: continue # deal with lack of data
ext_transmat.loc[r] /= rowsum
return ext_transmat
In [388]:
timeGap = 24 * 60 * 60 # 24 hours
In [389]:
trans_mat = generate_ext_transmat(poi_all, seq_all, user_seqs, timeGap)
trans_mat
Out[389]:
In [390]:
#trans_mat.columns[-1]
#trans_mat.loc['Sport']
#np.array(trans_mat.loc['Sport'])
#np.array(trans_mat.loc['Sport']).sum()
When choosing a specific POI within a certain POI category, consider two types of rules:
In [391]:
def calc_dist(longitude1, latitude1, longitude2, latitude2):
"""Calculate the distance (unit: km) between two places on earth"""
# convert degrees to radians
lon1 = math.radians(longitude1)
lat1 = math.radians(latitude1)
lon2 = math.radians(longitude2)
lat2 = math.radians(latitude2)
radius = 6371.009 # mean earth radius is 6371.009km, en.wikipedia.org/wiki/Earth_radius#Mean_radius
# The haversine formula, en.wikipedia.org/wiki/Great-circle_distance
dlon = math.fabs(lon1 - lon2)
dlat = math.fabs(lat1 - lat2)
return 2 * radius * math.asin( math.sqrt( \
(math.sin(0.5*dlat))**2 + math.cos(lat1) * math.cos(lat2) * (math.sin(0.5*dlon))**2 ))
Distance based rules
In [392]:
def rule_NN(current_poi, next_poi_cat, poi_all, randomized):
"""
choosing a specific POI within a category.
if randomized == True,
return a random POI choosing with probability proportional to the reciprocal of its distance to current POI
otherwise, return the Nearest Neighbor of the current POI
"""
assert(current_poi in poi_all.index)
assert(next_poi_cat in poi_all['poiTheme'].unique())
poi_index = None
if poi_all.loc[current_poi, 'poiTheme'] == next_poi_cat:
poi_index = [x for x in poi_all[poi_all['poiTheme'] == next_poi_cat].index if x != current_poi]
else:
poi_index = poi_all[poi_all['poiTheme'] == next_poi_cat].index
probs = np.zeros(len(poi_index), dtype=np.float64)
for i in range(len(poi_index)):
dist = calc_dist(poi_all.loc[current_poi, 'poiLon'], poi_all.loc[current_poi, 'poiLat'], \
poi_all.loc[poi_index[i],'poiLon'], poi_all.loc[poi_index[i],'poiLat'])
assert(dist > 0.)
probs[i] = 1. / dist
idx = None
if randomized == True:
probs /= np.sum(probs) # normalise
sample = np.random.multinomial(1, probs) # catgorical/multinoulli distribution, multinomial distribution (n=1)
for j in range(len(sample)):
if sample[j] == 1:
idx = j
break
else:
idx = probs.argmax()
assert(idx is not None)
return poi_index[idx]
POI Popularity based rules
In [393]:
def rule_Pop(current_poi, next_poi_cat, poi_all, randomized):
"""
choosing a specific POI within a category.
if randomized == True,
returen a random POI choosing with probability proportional to its popularity
otherwise, return the The most Popular POI
"""
assert(current_poi in poi_all.index)
assert(next_poi_cat in poi_all['poiTheme'].unique())
poi_index = None
if poi_all.loc[current_poi, 'poiTheme'] == next_poi_cat:
poi_index = [x for x in poi_all[poi_all['poiTheme'] == next_poi_cat].index if x != current_poi]
else:
poi_index = poi_all[poi_all['poiTheme'] == next_poi_cat].index
probs = np.zeros(len(poi_index), dtype=np.float64)
for i in range(len(poi_index)):
probs[i] = poi_all.loc[poi_index[i],'poiFreq']
idx = None
if randomized == True:
probs /= np.sum(probs) # normalise
sample = np.random.multinomial(1, probs) # catgorical/multinoulli distribution, multinomial distribution (n=1)
for j in range(len(sample)):
if sample[j] == 1:
idx = j
break
else:
idx = probs.argmax()
assert(idx is not None)
return poi_index[idx]
In [394]:
def extract_seq(seqid_set, seq_all):
"""Extract the actual sequences (i.e. a list of POI) from a set of sequence ID"""
seq_dict = dict()
for seqid in seqid_set:
seqi = seq_all[seq_all['seqID'] == seqid].copy()
seqi.sort(columns=['arrivalTime'], ascending=True, inplace=True)
seq_dict[seqid] = seqi['poiID'].tolist()
return seq_dict
In [395]:
all_seqid = seq_all['seqID'].unique()
In [396]:
all_seq_dict = extract_seq(all_seqid, seq_all)
In [397]:
def choose_start_poi(all_seq_dict, seqLen):
"""choose the first POI in a random actual sequence"""
assert(seqLen > 0)
while True:
seqid = random.choice(sorted(all_seq_dict.keys()))
if len(all_seq_dict[seqid]) > seqLen:
return all_seq_dict[seqid][0]
In [398]:
obs_mat = trans_mat.copy() * 0
obs_mat
Out[398]:
In [399]:
prefer_NN_over_Pop = True
randomized = True
N = 1000 # number of observations
In [400]:
prevpoi = choose_start_poi(all_seq_dict, 1)
prevcat = poi_all.loc[prevpoi, 'poiTheme']
nextpoi = None
nextcat = None
print('(%s, POI %d)->' % (prevcat, prevpoi))
n = 0
while n < N:
# choose the next POI category
# catgorical/multinoulli distribution, special case of multinomial distribution (n=1)
sample = np.random.multinomial(1, np.array(trans_mat.loc[prevcat]))
nextcat = None
for j in range(len(sample)):
if sample[j] == 1: nextcat = trans_mat.columns[j]
assert(nextcat is not None)
obs_mat.loc[prevcat, nextcat] += 1
# choose the next POI
if nextcat == 'REST':
nextpoi = choose_start_poi(all_seq_dict, 1) # restart
print('(REST)->')
else:
if prefer_NN_over_Pop == True:
nextpoi = rule_NN(prevpoi, nextcat, poi_all, randomized)
else:
nextpoi = rule_Pop(prevpoi, nextcat, poi_all, randomized)
print('(%s, POI %d)->' % (nextcat, nextpoi))
prevcat = nextcat
prevpoi = nextpoi
n += 1
In [401]:
obs_mat
Out[401]:
In [402]:
# MEL estimation
est_mat = obs_mat.copy()
for r in est_mat.index:
rowsum = est_mat.ix[r].sum()
if rowsum == 0: continue # deal with lack of data
est_mat.loc[r] /= rowsum
In [403]:
est_mat
Out[403]:
In [404]:
trans_mat
Out[404]: