In [1]:
%matplotlib inline
import os, sys, time
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
In [2]:
def print_progress(cnt, total):
"""Display a progress bar"""
assert(cnt > 0 and total > 0 and cnt <= total)
length = 80
ratio = cnt / total
n = int(length * ratio)
sys.stdout.write('\r[%-80s] %d%%' % ('-'*n, int(ratio*100)))
sys.stdout.flush()
In [3]:
data_dir = '../data'
fvisits = os.path.join(data_dir, 'userVisits-Melb-1.csv')
fpoi = os.path.join(data_dir, 'poi-Melb-all.csv')
fpoi_new = os.path.join(data_dir, 'poi-Melb-1.csv')
fphoto = os.path.join(data_dir, 'Melb_photos_bigbox.csv')
ftraj_all = os.path.join(data_dir, 'traj-all-Melb-1.csv')
ftraj_noshort = os.path.join(data_dir, 'traj-noshort-Melb.csv')
ftraj_nofew = os.path.join(data_dir, 'traj-nofew-Melb.csv')
fvisits2 = os.path.join(data_dir, 'userVisits-Melb-allPOI.csv')
fpoi2 = os.path.join(data_dir, 'costProfCat-MelbPOI-all.csv')
In [4]:
poi_df = pd.read_csv(fpoi)
poi_df.head()
Out[4]:
In [5]:
poi_df.drop(['poiURL', 'poiName'], axis=1, inplace=True)
In [6]:
poi_df.set_index('poiID', inplace=True)
In [7]:
print('#POIs:', poi_df.shape[0])
In [8]:
len(poi_df['poiTheme'].unique())
Out[8]:
In [9]:
photo_df = pd.read_csv(fphoto, skipinitialspace=True, parse_dates=[2])
photo_df.head()
Out[9]:
Remove photos with low accuracies (accuracy $< 16$).
In [10]:
print(photo_df['Accuracy'].unique())
photo_df = photo_df[photo_df['Accuracy'] == 16]
print(photo_df['Accuracy'].unique())
Remove columns that will not be used.
In [11]:
photo_df.drop(['Accuracy', 'URL', 'Marker(photo=0 video=1)'], axis=1, inplace=True)
Convert datatime to unix epoch.
In [12]:
photo_df['dateTaken'] = photo_df['Timestamp'].apply(lambda x: x.timestamp())
photo_df.drop('Timestamp', axis=1, inplace=True)
photo_df['dateTaken'] = photo_df['dateTaken'].astype(np.int)
Rename columns.
In [13]:
photo_df.rename(columns={'Photo_ID':'photoID', 'User_ID':'userID', 'Longitude':'photoLon', 'Latitude':'photoLat'}, \
inplace=True)
photo_df.head()
Out[13]:
In [14]:
photo_df.shape
Out[14]:
In [15]:
print('#Photos:', photo_df['photoID'].unique().shape[0])
print('#Users:', photo_df['userID'].unique().shape[0])
In [16]:
photo_df.set_index('photoID', inplace=True)
photo_df['poiID'] = -1
photo_df['trajID'] = -1
photo_df.head()
Out[16]:
Generate travel history for each user from the photos taken by him/her.
In [17]:
def calc_dist_vec(longitudes1, latitudes1, longitudes2, latitudes2):
"""Calculate the distance (unit: km) between two places on earth, vectorised"""
# convert degrees to radians
lng1 = np.radians(longitudes1)
lat1 = np.radians(latitudes1)
lng2 = np.radians(longitudes2)
lat2 = np.radians(latitudes2)
radius = 6371.0088 # mean earth radius, en.wikipedia.org/wiki/Earth_radius#Mean_radius
# The haversine formula, en.wikipedia.org/wiki/Great-circle_distance
dlng = np.fabs(lng1 - lng2)
dlat = np.fabs(lat1 - lat2)
dist = 2 * radius * np.arcsin( np.sqrt(
(np.sin(0.5*dlat))**2 + np.cos(lat1) * np.cos(lat2) * (np.sin(0.5*dlng))**2 ))
return dist
Sanity check.
In [18]:
calc_dist_vec(poi_df.loc[0, 'poiLon'], poi_df.loc[0, 'poiLat'], poi_df.loc[0, 'poiLon'], poi_df.loc[0, 'poiLat'])
Out[18]:
Filtering out photos that leads to super fast speeds.
In [19]:
SUPER_FAST = 150 / (60 * 60) # 150 km/h
In [20]:
filter_tags = pd.Series(data=np.zeros(photo_df.shape[0], dtype=np.bool), index=photo_df.index)
In [21]:
cnt = 0
total = photo_df['userID'].unique().shape[0]
for user in sorted(photo_df['userID'].unique().tolist()):
udf = photo_df[photo_df['userID'] == user].copy()
udf.sort_values(by='dateTaken', ascending=True, inplace=True)
udists = calc_dist_vec(udf['photoLon'][:-1].values, udf['photoLat'][:-1].values, \
udf['photoLon'][1: ].values, udf['photoLat'][1: ].values)
assert(udists.shape[0] == udf.shape[0]-1)
superfast = np.zeros(udf.shape[0]-1, dtype=np.bool)
for i in range(udf.shape[0]-1):
ix1 = udf.index[i]
ix2 = udf.index[i+1]
dtime = udf.loc[ix2, 'dateTaken'] - udf.loc[ix1, 'dateTaken']
assert(dtime >= 0)
if dtime == 0: superfast[i] = True
speed = udists[i] / dtime
if speed > SUPER_FAST: superfast[i] = True
for j in range(superfast.shape[0]-1):
if superfast[j] and superfast[j+1]: # jx0-->SUPER_FAST-->jx-->SUPER_FAST-->jx1: remove photo jx
jx = udf.index[j+1]
filter_tags.loc[jx] = True
cnt += 1; print_progress(cnt, total)
In [22]:
for jx in filter_tags.index:
if filter_tags.loc[jx] == True:
photo_df.drop(jx, axis=0, inplace=True)
In [23]:
photo_df.shape
Out[23]:
Distance between POIs.
In [24]:
poi_distmat = pd.DataFrame(data=np.zeros((poi_df.shape[0], poi_df.shape[0]), dtype=np.float), \
index=poi_df.index, columns=poi_df.index)
In [25]:
for ix in poi_df.index:
poi_distmat.loc[ix] = calc_dist_vec(poi_df.loc[ix, 'poiLon'], poi_df.loc[ix, 'poiLat'], \
poi_df['poiLon'], poi_df['poiLat'])
Distance between photos and POIs.
In [26]:
photo_poi_distmat = pd.DataFrame(data=np.zeros((photo_df.shape[0], poi_df.shape[0]), dtype=np.float), \
index=photo_df.index, columns=poi_df.index)
In [27]:
for i in range(photo_df.shape[0]):
ix = photo_df.index[i]
photo_poi_distmat.loc[ix] = calc_dist_vec(photo_df.loc[ix, 'photoLon'], photo_df.loc[ix, 'photoLat'], \
poi_df['poiLon'], poi_df['poiLat'])
print_progress(i+1, photo_df.shape[0])
"Map a photo to a POI if their coordinates differ by $<200$m based on the Haversine formula" according to the IJCAI15 paper.
In [28]:
DIST_MAX = 0.2 # 0.2km
Time gap is $8$ hours according to the IJCAI15 paper.
In [29]:
TIME_GAP = 8 * 60 * 60 # 8 hours
In [30]:
users = sorted(photo_df['userID'].unique().tolist())
Map photo to the closest POI.
In [31]:
traj_greedy = photo_df.copy()
In [32]:
cnt = 0
for ix in traj_greedy.index:
min_ix = photo_poi_distmat.loc[ix].idxmin()
if photo_poi_distmat.loc[ix, min_ix] > DIST_MAX: # photo is taken at position far from any POI, do NOT use it
pass
else:
traj_greedy.loc[ix, 'poiID'] = poi_df.index[min_ix] # map photo to the closest POI
# all POIs that are very close to a photo are an option to map
#photo_df.loc[ix, 'poiID'] = str(poi_df.index[~(dists > dist_max)].tolist())
cnt += 1; print_progress(cnt, traj_greedy.shape[0])
Build trajectories.
In [33]:
traj_greedy = traj_greedy[traj_greedy['poiID'] != -1]
In [34]:
tid = 0
cnt = 0
for user in users:
udf = traj_greedy[traj_greedy['userID'] == user].copy()
udf.sort_values(by='dateTaken', ascending=True, inplace=True)
if udf.shape[0] == 0:
cnt += 1; print_progress(cnt, len(users))
continue
traj_greedy.loc[udf.index[0], 'trajID'] = tid
for i in range(1, udf.shape[0]):
ix1 = udf.index[i-1]
ix2 = udf.index[i]
if udf.loc[ix2, 'dateTaken'] - udf.loc[ix1, 'dateTaken'] > TIME_GAP:
tid += 1
traj_greedy.loc[ix2, 'trajID'] = tid
else:
traj_greedy.loc[ix2, 'trajID'] = tid
tid += 1 # for trajectories of the next user
cnt += 1; print_progress(cnt, len(users))
Given a sequence of photos and a set of POIs, map the sequence of photos to the set of POI such that the total cost is minimised, i.e.,
\begin{equation} \text{minimize} \sum_i \text{distance}(\text{photo}_i, \text{POI}_i) + \alpha \sum_i \text{distance}(\text{POI}_i, \text{POI}_{i+1}) \end{equation}where $\text{photo}_i$ is mapped to $\text{POI}_i$, $\alpha$ is a trade-off parameter.
In [35]:
map_dp = False
In [36]:
def decode_photo_seq(photo_seq, poi_distmat, photo_poi_distmat, DIST_MAX, ALPHA=1):
"""
Map a sequence of photos to a set of POI such that the total cost, i.e.
cost = sum(distance(photo_i, POI_i)) + sum(distance(POI_i, POI_{i+1}))
is minimised.
Implemented using DP.
"""
assert(len(photo_seq) > 0)
assert(DIST_MAX > 0)
if len(photo_seq) == 1: # only one POI in this sequence
ix = photo_seq[0]
assert(ix in photo_poi_distmat.index)
return [photo_poi_distmat.loc[ix].idxmin()]
# set of POIs that are close to any photo in the input sequence of photos
poi_t = []
for jx in photo_seq:
poi_t = poi_t + poi_distmat.index[~(photo_poi_distmat.loc[jx] > DIST_MAX)].tolist()
columns = sorted(set(poi_t))
# cost_df.iloc[i, j] stores the minimum cost of photo sequence [..., 'photo_i'] among all
# possible POI sequences end with 'POI_j', 'photo_i' was mapped to 'POI_j'
cost_df = pd.DataFrame(data=np.zeros((len(photo_seq), len(columns)), dtype=np.float), \
index=photo_seq, columns=columns)
# trace_df.iloc[i, j] stores the (previous) 'POI_k' such that the cost of POI sequence
# [... --> 'POI_k' (prev POI) --> 'POI_j' (current POI)] is cost_df.iloc[i, j]
trace_df = pd.DataFrame(data=np.zeros((len(photo_seq), len(columns)), dtype=np.int), \
index=photo_seq, columns=columns)
# NO predecessor for the start POI
trace_df.iloc[0] = -1
# costs for the first row are just the distances (or np.inf) from the first photo to all POIs
for kx in cost_df.columns:
ix = photo_seq[0]
dist = photo_poi_distmat.loc[ix, kx]
cost_df.loc[ix, kx] = np.inf if dist > DIST_MAX else dist
# compute minimum costs recursively
for i in range(1, len(photo_seq)):
ix = cost_df.index[i]
prev = cost_df.index[i-1]
for jx in cost_df.columns:
# distance(photo_i, POI_j) + alpha * distance(POI_k, POI_j) + previous cost
# if distance(photo_i, POI_j) <= DIST_MAX else np.inf
costs = [np.inf if photo_poi_distmat.loc[ix, jx] > DIST_MAX else \
photo_poi_distmat.loc[ix, jx] + ALPHA * poi_distmat.loc[kx, jx] + cost_df.loc[prev, kx] \
for kx in cost_df.columns]
min_idx = np.argmin(costs)
cost_df.loc[ix, jx] = costs[min_idx]
trace_df.loc[ix, jx] = cost_df.columns[min_idx]
# trace back
pN = cost_df.loc[cost_df.index[-1]].idxmin() # the end POI
seq_reverse = [pN] # sequence of POI in reverse order
row_idx = trace_df.shape[0] - 1 # trace back from the last row of trace_df
while (row_idx > 0): # the first row are all -1
ix = trace_df.index[row_idx]
jx = seq_reverse[-1]
poi = trace_df.loc[ix, jx]
seq_reverse.append(poi)
row_idx -= 1
return seq_reverse[::-1] # reverse the sequence
Build travel sequences and map photos to POIs.
In [37]:
if map_dp == True:
traj_dp = photo_df.copy()
ALPHA = 1
In [39]:
if map_dp == True:
tid = 0
cnt = 0
for user in users:
udf = traj_dp[traj_dp['userID'] == user].copy()
udf.sort_values(by='dateTaken', ascending=True, inplace=True)
# filtering out photos that are far from all POIs
for ix in udf.index:
if photo_poi_distmat.loc[ix].min() > DIST_MAX:
udf.drop(ix, axis=0, inplace=True)
if udf.shape[0] == 0:
cnt += 1; print_progress(cnt, len(users))
continue
photo_seq = [udf.index[0]]
for i in range(1, udf.shape[0]):
ix1 = photo_seq[-1]
ix2 = udf.index[i]
if udf.loc[ix2, 'dateTaken'] - udf.loc[ix1, 'dateTaken'] > TIME_GAP:
assert(len(photo_seq) > 0)
poi_seq = decode_photo_seq(photo_seq, poi_distmat, photo_poi_distmat, DIST_MAX, ALPHA)
assert(len(poi_seq) == len(photo_seq))
for j in range(len(poi_seq)):
jx = photo_seq[j]
poi = poi_seq[j]
traj_dp.loc[jx, 'poiID'] = poi
traj_dp.loc[jx, 'trajID'] = tid
tid += 1
photo_seq.clear()
photo_seq.append(ix2)
else:
photo_seq.append(ix2)
assert(len(photo_seq) > 0)
poi_seq = decode_photo_seq(photo_seq, poi_distmat, photo_poi_distmat, DIST_MAX, ALPHA)
assert(len(poi_seq) == len(photo_seq))
for j in range(len(poi_seq)):
jx = photo_seq[j]
poi = poi_seq[j]
traj_dp.loc[jx, 'poiID'] = poi
traj_dp.loc[jx, 'trajID'] = tid
tid += 1
cnt += 1; print_progress(cnt, len(users))
Compute the total cost of trajectories.
In [40]:
def calc_cost(traj_df, poi_distmat, photo_poi_distmat):
cost = 0
traj_df = traj_df[traj_df['poiID'] != -1]
for tid in sorted(traj_df['trajID'].unique().tolist()):
tdf = traj_df[traj_df['trajID'] == tid].copy()
tdf.sort_values(by='dateTaken', ascending=True, inplace=True)
cost += np.trace(photo_poi_distmat.loc[tdf.index, tdf['poiID']])
cost += np.trace(poi_distmat.loc[tdf['poiID'][:-1].values, tdf['poiID'][1:].values])
return cost
In [41]:
calc_cost(traj_greedy, poi_distmat, photo_poi_distmat)
Out[41]:
In [42]:
if map_dp == True:
calc_cost(traj_dp, poi_distmat, photo_poi_distmat)
In [43]:
traj_greedy['trajID'].max()
Out[43]:
In [44]:
if map_dp == True:
traj_dp['trajID'].max()
Compare trajectories interactively.
In [45]:
compare_interactive = False
In [46]:
if compare_interactive == True and map_dp == True:
for tid in sorted(traj_dp['trajID'].unique().tolist()):
if tid == -1: continue
tdf1 = traj_dp[traj_dp['trajID'] == tid].copy()
tdf1.sort_values(by='dateTaken', ascending=True, inplace=True)
tdf2 = traj_greedy[traj_greedy['trajID'] == tid].copy()
tdf2.sort_values(by='dateTaken', ascending=True, inplace=True)
print(tdf1)
print(tdf2)
input('Press any key to continue...')
Save trajectories and related POIs to files.
In [47]:
visits = traj_greedy[traj_greedy['poiID'] != -1]
#visits = traj_dp[traj_dp['poiID'] != -1]
In [48]:
visits.head()
Out[48]:
Save visits data.
In [49]:
uservisits = visits.copy()
uservisits.rename(columns={'trajID':'seqID'}, inplace=True)
uservisits.to_csv(fvisits, index=False)
Save POIs to CSV file.
In [50]:
poiix = sorted(visits['poiID'].unique().tolist())
In [51]:
poi_df.loc[poiix].to_csv(fpoi_new, index=True)
Count the number of photos taken at each POI.
In [52]:
poifreq = visits[['poiID', 'dateTaken']].copy().groupby('poiID').agg(np.size)
poifreq.rename(columns={'dateTaken':'poiFreq'}, inplace=True)
Save data in file format like IJCAI datasets: user visits data, POI related data.
In [53]:
visits_df = visits.copy()
visits_df.reset_index(inplace=True)
visits_df.drop(['photoLon', 'photoLat'], axis=1, inplace=True)
visits_df['dateTaken'] = visits_df['dateTaken'].astype(np.int)
visits_df.rename(columns={'trajID':'seqID'}, inplace=True)
visits_df['poiTheme'] = poi_df.loc[visits_df['poiID'], 'poiTheme'].tolist()
visits_df['poiFreq'] = poifreq.loc[visits_df['poiID'], 'poiFreq'].astype(np.int).tolist()
Sort photos by date taken.
In [55]:
visits_df.sort_values(by='dateTaken', ascending=True, inplace=True)
In [54]:
visits_df.head()
Out[54]:
Save visits data.
In [56]:
cols = ['photoID', 'userID', 'dateTaken', 'poiID', 'poiTheme', 'poiFreq', 'seqID']
visits_df.to_csv(fvisits2, sep=';', quoting=2, columns=cols, index=False)
In [57]:
fvisits2
Out[57]:
POI related data: cost=POI-POI distance (meters), profit=frequency (#photos taken at the second POI).
In [58]:
costprofit_df = pd.DataFrame(columns=['from', 'to', 'cost', 'profit', 'category'])
In [59]:
pois = sorted(visits_df['poiID'].unique())
In [60]:
for poi1 in pois:
for poi2 in pois:
if poi1 == poi2: continue
ix = costprofit_df.shape[0]
costprofit_df.loc[ix, 'from'] = poi1
costprofit_df.loc[ix, 'to'] = poi2
costprofit_df.loc[ix, 'cost'] = poi_distmat.loc[poi1, poi2] * 1000 # meters
costprofit_df.loc[ix, 'profit'] = poifreq.loc[poi2, 'poiFreq']
costprofit_df.loc[ix, 'category'] = poi_df.loc[poi2, 'poiTheme']
In [61]:
costprofit_df.head()
Out[61]:
Save POI related data.
In [62]:
cols = ['from', 'to', 'cost', 'profit', 'category']
costprofit_df.to_csv(fpoi2, sep=';', quoting=2, columns=cols, index=False)