Generate Trajectories from Flickr Photos Taken in Melbourne


In [1]:
%matplotlib inline

import os, sys, time
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

In [2]:
def print_progress(cnt, total):
    """Display a progress bar"""
    assert(cnt > 0 and total > 0 and cnt <= total)
    length = 80
    ratio = cnt / total
    n = int(length * ratio)
    sys.stdout.write('\r[%-80s] %d%%' % ('-'*n, int(ratio*100)))
    sys.stdout.flush()

In [3]:
data_dir = '../data'
fvisits = os.path.join(data_dir, 'userVisits-Melb-1.csv')
fpoi = os.path.join(data_dir, 'poi-Melb-all.csv')
fpoi_new = os.path.join(data_dir, 'poi-Melb-1.csv')
fphoto = os.path.join(data_dir, 'Melb_photos_bigbox.csv')
ftraj_all = os.path.join(data_dir, 'traj-all-Melb-1.csv')
ftraj_noshort = os.path.join(data_dir, 'traj-noshort-Melb.csv')
ftraj_nofew = os.path.join(data_dir, 'traj-nofew-Melb.csv')

fvisits2 = os.path.join(data_dir, 'userVisits-Melb-allPOI.csv')
fpoi2 = os.path.join(data_dir, 'costProfCat-MelbPOI-all.csv')

1. Load POI Data


In [4]:
poi_df = pd.read_csv(fpoi)
poi_df.head()


Out[4]:
poiID poiName poiTheme poiLat poiLon poiURL
0 0 Arts Precinct City precincts -37.82167 144.96778 https://en.wikipedia.org/wiki/Melbourne_Arts_P...
1 1 Docklands City precincts -37.81700 144.94600 https://en.wikipedia.org/wiki/Docklands,_Victoria
2 2 Government Precinct City precincts -37.81190 144.97300 https://en.wikipedia.org/wiki/Spring_Street,_M...
3 3 Little Italy City precincts -37.79972 144.96694 https://en.wikipedia.org/wiki/Little_Italy,_Me...
4 4 RMIT City City precincts -37.80778 144.96333 https://en.wikipedia.org/wiki/City_campus_of_t...

In [5]:
poi_df.drop(['poiURL', 'poiName'], axis=1, inplace=True)

In [6]:
poi_df.set_index('poiID', inplace=True)

In [7]:
print('#POIs:', poi_df.shape[0])


#POIs: 88

In [8]:
len(poi_df['poiTheme'].unique())


Out[8]:
9

2. Load Photo Data


In [9]:
photo_df = pd.read_csv(fphoto, skipinitialspace=True, parse_dates=[2])
photo_df.head()


Out[9]:
Photo_ID User_ID Timestamp Longitude Latitude Accuracy URL Marker(photo=0 video=1)
0 5703013770 25287507@N02 2011-05-09 19:19:58 144.604775 -37.878579 16 http://www.flickr.com/photos/25287507@N02/5703... 0
1 5653121597 59335517@N02 2011-04-10 13:27:37 145.033779 -37.822310 16 http://www.flickr.com/photos/59335517@N02/5653... 0
2 5522325184 26303188@N00 2011-03-13 20:44:24 144.981122 -37.824344 14 http://www.flickr.com/photos/26303188@N00/5522... 0
3 7978703060 82732068@N02 2012-07-14 12:29:43 145.947854 -38.479344 15 http://www.flickr.com/photos/82732068@N02/7978... 0
4 174030514 19677632@N00 2004-08-01 19:28:40 145.533485 -37.949003 12 http://www.flickr.com/photos/19677632@N00/1740... 0

Remove photos with low accuracies (accuracy $< 16$).


In [10]:
print(photo_df['Accuracy'].unique())
photo_df = photo_df[photo_df['Accuracy'] == 16]
print(photo_df['Accuracy'].unique())


[16 14 15 12 11 13  8 10  9  3  5  7  6  4  1  2]
[16]

Remove columns that will not be used.


In [11]:
photo_df.drop(['Accuracy', 'URL', 'Marker(photo=0 video=1)'], axis=1, inplace=True)

Convert datatime to unix epoch.


In [12]:
photo_df['dateTaken'] = photo_df['Timestamp'].apply(lambda x: x.timestamp())
photo_df.drop('Timestamp', axis=1, inplace=True)
photo_df['dateTaken'] = photo_df['dateTaken'].astype(np.int)

Rename columns.


In [13]:
photo_df.rename(columns={'Photo_ID':'photoID', 'User_ID':'userID', 'Longitude':'photoLon', 'Latitude':'photoLat'}, \
                inplace=True)
photo_df.head()


Out[13]:
photoID userID photoLon photoLat dateTaken
0 5703013770 25287507@N02 144.604775 -37.878579 1304932798
1 5653121597 59335517@N02 145.033779 -37.822310 1302406057
5 9588963220 67774014@N00 144.965060 -37.815725 1377408461
9 6191232325 63488421@N08 144.666981 -37.922733 1316741616
10 6644759687 10559879@N00 144.961177 -37.812759 1325813367

In [14]:
photo_df.shape


Out[14]:
(94142, 5)

In [15]:
print('#Photos:', photo_df['photoID'].unique().shape[0])
print('#Users:', photo_df['userID'].unique().shape[0])


#Photos: 94142
#Users: 1659

In [16]:
photo_df.set_index('photoID', inplace=True)
photo_df['poiID'] = -1
photo_df['trajID'] = -1
photo_df.head()


Out[16]:
userID photoLon photoLat dateTaken poiID trajID
photoID
5703013770 25287507@N02 144.604775 -37.878579 1304932798 -1 -1
5653121597 59335517@N02 145.033779 -37.822310 1302406057 -1 -1
9588963220 67774014@N00 144.965060 -37.815725 1377408461 -1 -1
6191232325 63488421@N08 144.666981 -37.922733 1316741616 -1 -1
6644759687 10559879@N00 144.961177 -37.812759 1325813367 -1 -1

3. Map Photos to POIs & Build Trajectories

Generate travel history for each user from the photos taken by him/her.


In [17]:
def calc_dist_vec(longitudes1, latitudes1, longitudes2, latitudes2):
    """Calculate the distance (unit: km) between two places on earth, vectorised"""
    # convert degrees to radians
    lng1 = np.radians(longitudes1)
    lat1 = np.radians(latitudes1)
    lng2 = np.radians(longitudes2)
    lat2 = np.radians(latitudes2)
    radius = 6371.0088 # mean earth radius, en.wikipedia.org/wiki/Earth_radius#Mean_radius

    # The haversine formula, en.wikipedia.org/wiki/Great-circle_distance
    dlng = np.fabs(lng1 - lng2)
    dlat = np.fabs(lat1 - lat2)
    dist =  2 * radius * np.arcsin( np.sqrt( 
                (np.sin(0.5*dlat))**2 + np.cos(lat1) * np.cos(lat2) * (np.sin(0.5*dlng))**2 ))
    return dist

Sanity check.


In [18]:
calc_dist_vec(poi_df.loc[0, 'poiLon'], poi_df.loc[0, 'poiLat'], poi_df.loc[0, 'poiLon'], poi_df.loc[0, 'poiLat'])


Out[18]:
0.0

Filtering out photos that leads to super fast speeds.


In [19]:
SUPER_FAST = 150 / (60 * 60)  # 150 km/h

In [20]:
filter_tags = pd.Series(data=np.zeros(photo_df.shape[0], dtype=np.bool), index=photo_df.index)

In [21]:
cnt = 0
total = photo_df['userID'].unique().shape[0]
for user in sorted(photo_df['userID'].unique().tolist()):
    udf = photo_df[photo_df['userID'] == user].copy()
    udf.sort_values(by='dateTaken', ascending=True, inplace=True)
    udists = calc_dist_vec(udf['photoLon'][:-1].values, udf['photoLat'][:-1].values, \
                           udf['photoLon'][1: ].values, udf['photoLat'][1: ].values)
    assert(udists.shape[0] == udf.shape[0]-1)
    superfast = np.zeros(udf.shape[0]-1, dtype=np.bool)
    for i in range(udf.shape[0]-1):
        ix1 = udf.index[i]
        ix2 = udf.index[i+1]
        dtime = udf.loc[ix2, 'dateTaken'] - udf.loc[ix1, 'dateTaken']
        assert(dtime >= 0)
        if dtime == 0: superfast[i] = True
        speed = udists[i] / dtime
        if speed > SUPER_FAST: superfast[i] = True
    for j in range(superfast.shape[0]-1):
        if superfast[j] and superfast[j+1]:  # jx0-->SUPER_FAST-->jx-->SUPER_FAST-->jx1: remove photo jx
            jx = udf.index[j+1]
            filter_tags.loc[jx] = True
    cnt += 1; print_progress(cnt, total)


[--------------------------------------------------------------------------------] 100%

In [22]:
for jx in filter_tags.index:
    if filter_tags.loc[jx] == True:
        photo_df.drop(jx, axis=0, inplace=True)

In [23]:
photo_df.shape


Out[23]:
(92758, 6)

Distance between POIs.


In [24]:
poi_distmat = pd.DataFrame(data=np.zeros((poi_df.shape[0], poi_df.shape[0]), dtype=np.float), \
                           index=poi_df.index, columns=poi_df.index)

In [25]:
for ix in poi_df.index:
    poi_distmat.loc[ix] = calc_dist_vec(poi_df.loc[ix, 'poiLon'], poi_df.loc[ix, 'poiLat'], \
                                        poi_df['poiLon'], poi_df['poiLat'])

Distance between photos and POIs.


In [26]:
photo_poi_distmat = pd.DataFrame(data=np.zeros((photo_df.shape[0], poi_df.shape[0]), dtype=np.float), \
                                 index=photo_df.index, columns=poi_df.index)

In [27]:
for i in range(photo_df.shape[0]):
    ix = photo_df.index[i]
    photo_poi_distmat.loc[ix] = calc_dist_vec(photo_df.loc[ix, 'photoLon'], photo_df.loc[ix, 'photoLat'], \
                                              poi_df['poiLon'], poi_df['poiLat'])
    print_progress(i+1, photo_df.shape[0])


[--------------------------------------------------------------------------------] 100%

"Map a photo to a POI if their coordinates differ by $<200$m based on the Haversine formula" according to the IJCAI15 paper.


In [28]:
DIST_MAX = 0.2  # 0.2km

Time gap is $8$ hours according to the IJCAI15 paper.


In [29]:
TIME_GAP = 8 * 60 * 60  # 8 hours

In [30]:
users = sorted(photo_df['userID'].unique().tolist())

3.1 Map Photos to POIs: Approach I - Greedy

Map photo to the closest POI.


In [31]:
traj_greedy = photo_df.copy()

In [32]:
cnt = 0
for ix in traj_greedy.index:
    min_ix = photo_poi_distmat.loc[ix].idxmin()
    if photo_poi_distmat.loc[ix, min_ix] > DIST_MAX:  # photo is taken at position far from any POI, do NOT use it
        pass
    else:
        traj_greedy.loc[ix, 'poiID'] = poi_df.index[min_ix]  # map photo to the closest POI
        # all POIs that are very close to a photo are an option to map
        #photo_df.loc[ix, 'poiID'] = str(poi_df.index[~(dists > dist_max)].tolist())
    cnt += 1; print_progress(cnt, traj_greedy.shape[0])


[--------------------------------------------------------------------------------] 100%

Build trajectories.


In [33]:
traj_greedy = traj_greedy[traj_greedy['poiID'] != -1]

In [34]:
tid = 0
cnt = 0
for user in users:
    udf = traj_greedy[traj_greedy['userID'] == user].copy()
    udf.sort_values(by='dateTaken', ascending=True, inplace=True)
    if udf.shape[0] == 0: 
        cnt += 1; print_progress(cnt, len(users))
        continue
    
    traj_greedy.loc[udf.index[0], 'trajID'] = tid
    for i in range(1, udf.shape[0]):
        ix1 = udf.index[i-1]
        ix2 = udf.index[i]
        if udf.loc[ix2, 'dateTaken'] - udf.loc[ix1, 'dateTaken'] > TIME_GAP:
            tid += 1
            traj_greedy.loc[ix2, 'trajID'] = tid
        else:
            traj_greedy.loc[ix2, 'trajID'] = tid
    tid += 1  # for trajectories of the next user
    cnt += 1; print_progress(cnt, len(users))


[--------------------------------------------------------------------------------] 100%

3.2 Map Photos to POIs: Approach II - Dynamic Programming

Given a sequence of photos and a set of POIs, map the sequence of photos to the set of POI such that the total cost is minimised, i.e.,

\begin{equation} \text{minimize} \sum_i \text{distance}(\text{photo}_i, \text{POI}_i) + \alpha \sum_i \text{distance}(\text{POI}_i, \text{POI}_{i+1}) \end{equation}

where $\text{photo}_i$ is mapped to $\text{POI}_i$, $\alpha$ is a trade-off parameter.


In [35]:
map_dp = False

In [36]:
def decode_photo_seq(photo_seq, poi_distmat, photo_poi_distmat, DIST_MAX, ALPHA=1):
    """
    Map a sequence of photos to a set of POI such that the total cost, i.e.
    cost = sum(distance(photo_i, POI_i)) + sum(distance(POI_i, POI_{i+1})) 
    is minimised.
    Implemented using DP.
    """
    assert(len(photo_seq) > 0)
    assert(DIST_MAX > 0)
    
    if len(photo_seq) == 1:  # only one POI in this sequence
        ix = photo_seq[0]
        assert(ix in photo_poi_distmat.index)
        return [photo_poi_distmat.loc[ix].idxmin()]
    
    # set of POIs that are close to any photo in the input sequence of photos
    poi_t = []
    for jx in photo_seq:
        poi_t = poi_t + poi_distmat.index[~(photo_poi_distmat.loc[jx] > DIST_MAX)].tolist()
    columns = sorted(set(poi_t))
    
    # cost_df.iloc[i, j] stores the minimum cost of photo sequence [..., 'photo_i'] among all 
    # possible POI sequences end with 'POI_j', 'photo_i' was mapped to 'POI_j'
    cost_df = pd.DataFrame(data=np.zeros((len(photo_seq), len(columns)), dtype=np.float), \
                           index=photo_seq, columns=columns)
    
    # trace_df.iloc[i, j] stores the (previous) 'POI_k' such that the cost of POI sequence 
    # [... --> 'POI_k' (prev POI) --> 'POI_j' (current POI)] is cost_df.iloc[i, j]
    trace_df = pd.DataFrame(data=np.zeros((len(photo_seq), len(columns)), dtype=np.int), \
                            index=photo_seq, columns=columns)
    # NO predecessor for the start POI
    trace_df.iloc[0] = -1
    
    # costs for the first row are just the distances (or np.inf) from the first photo to all POIs
    for kx in cost_df.columns:
        ix = photo_seq[0]
        dist = photo_poi_distmat.loc[ix, kx]
        cost_df.loc[ix, kx] = np.inf if dist > DIST_MAX else dist
    
    # compute minimum costs recursively
    for i in range(1, len(photo_seq)):
        ix = cost_df.index[i]
        prev = cost_df.index[i-1]
        for jx in cost_df.columns:
            # distance(photo_i, POI_j) + alpha * distance(POI_k, POI_j) + previous cost
            # if distance(photo_i, POI_j) <= DIST_MAX else np.inf
            costs = [np.inf if photo_poi_distmat.loc[ix, jx] > DIST_MAX else \
                     photo_poi_distmat.loc[ix, jx] + ALPHA * poi_distmat.loc[kx, jx] + cost_df.loc[prev, kx] \
                     for kx in cost_df.columns]
            min_idx = np.argmin(costs)
            cost_df.loc[ix, jx] = costs[min_idx]
            trace_df.loc[ix, jx] = cost_df.columns[min_idx]
    
    # trace back
    pN = cost_df.loc[cost_df.index[-1]].idxmin()  # the end POI
    seq_reverse = [pN]  # sequence of POI in reverse order
    row_idx = trace_df.shape[0] - 1  # trace back from the last row of trace_df
    while (row_idx > 0):  # the first row are all -1
        ix = trace_df.index[row_idx]
        jx = seq_reverse[-1]
        poi = trace_df.loc[ix, jx]
        seq_reverse.append(poi)
        row_idx -= 1
        
    return seq_reverse[::-1]  # reverse the sequence

Build travel sequences and map photos to POIs.


In [37]:
if map_dp == True:
    traj_dp = photo_df.copy()
    ALPHA = 1

In [39]:
if map_dp == True:
    tid = 0
    cnt = 0
    for user in users:
        udf = traj_dp[traj_dp['userID'] == user].copy()
        udf.sort_values(by='dateTaken', ascending=True, inplace=True)

        # filtering out photos that are far from all POIs
        for ix in udf.index:
            if photo_poi_distmat.loc[ix].min() > DIST_MAX:
                udf.drop(ix, axis=0, inplace=True)

        if udf.shape[0] == 0: 
            cnt += 1; print_progress(cnt, len(users))
            continue

        photo_seq = [udf.index[0]]
        for i in range(1, udf.shape[0]):
            ix1 = photo_seq[-1]
            ix2 = udf.index[i]
            if udf.loc[ix2, 'dateTaken'] - udf.loc[ix1, 'dateTaken'] > TIME_GAP:
                assert(len(photo_seq) > 0)
                poi_seq = decode_photo_seq(photo_seq, poi_distmat, photo_poi_distmat, DIST_MAX, ALPHA)
                assert(len(poi_seq) == len(photo_seq))
                for j in range(len(poi_seq)):
                    jx = photo_seq[j]
                    poi = poi_seq[j]
                    traj_dp.loc[jx, 'poiID'] = poi
                    traj_dp.loc[jx, 'trajID'] = tid
                tid += 1
                photo_seq.clear()
                photo_seq.append(ix2)
            else:
                photo_seq.append(ix2)

        assert(len(photo_seq) > 0)
        poi_seq = decode_photo_seq(photo_seq, poi_distmat, photo_poi_distmat, DIST_MAX, ALPHA)
        assert(len(poi_seq) == len(photo_seq))
        for j in range(len(poi_seq)):
            jx = photo_seq[j]
            poi = poi_seq[j]
            traj_dp.loc[jx, 'poiID'] = poi
            traj_dp.loc[jx, 'trajID'] = tid
        tid += 1

        cnt += 1; print_progress(cnt, len(users))

Compute the total cost of trajectories.


In [40]:
def calc_cost(traj_df, poi_distmat, photo_poi_distmat):
    cost = 0
    traj_df = traj_df[traj_df['poiID'] != -1]
    for tid in sorted(traj_df['trajID'].unique().tolist()):
        tdf = traj_df[traj_df['trajID'] == tid].copy()
        tdf.sort_values(by='dateTaken', ascending=True, inplace=True)
        cost += np.trace(photo_poi_distmat.loc[tdf.index, tdf['poiID']])
        cost += np.trace(poi_distmat.loc[tdf['poiID'][:-1].values, tdf['poiID'][1:].values])
    return cost

In [41]:
calc_cost(traj_greedy, poi_distmat, photo_poi_distmat)


Out[41]:
3936.8923548191797

In [42]:
if map_dp == True:
    calc_cost(traj_dp, poi_distmat, photo_poi_distmat)

In [43]:
traj_greedy['trajID'].max()


Out[43]:
5105

In [44]:
if map_dp == True:
    traj_dp['trajID'].max()

Compare trajectories interactively.


In [45]:
compare_interactive = False

In [46]:
if compare_interactive == True and map_dp == True:
    for tid in sorted(traj_dp['trajID'].unique().tolist()):
        if tid == -1: continue
        tdf1 = traj_dp[traj_dp['trajID'] == tid].copy()
        tdf1.sort_values(by='dateTaken', ascending=True, inplace=True)
        tdf2 = traj_greedy[traj_greedy['trajID'] == tid].copy()
        tdf2.sort_values(by='dateTaken', ascending=True, inplace=True)
        print(tdf1)
        print(tdf2)
        input('Press any key to continue...')

4. Save Trajectory Data

Save trajectories and related POIs to files.


In [47]:
visits = traj_greedy[traj_greedy['poiID'] != -1]
#visits = traj_dp[traj_dp['poiID'] != -1]

In [48]:
visits.head()


Out[48]:
userID photoLon photoLat dateTaken poiID trajID
photoID
9588963220 67774014@N00 144.965060 -37.815725 1377408461 8 3333
703949177 79925938@N00 144.993074 -37.846570 1183135485 20 3975
775049707 79925938@N00 144.964085 -37.815018 1183823546 18 3977
8687823797 35558720@N03 144.960168 -37.823471 1364043697 25 1487
2676185 79925938@N00 144.969438 -37.816501 1104369016 71 3919

Save visits data.


In [49]:
uservisits = visits.copy()
uservisits.rename(columns={'trajID':'seqID'}, inplace=True)
uservisits.to_csv(fvisits, index=False)

Save POIs to CSV file.


In [50]:
poiix = sorted(visits['poiID'].unique().tolist())

In [51]:
poi_df.loc[poiix].to_csv(fpoi_new, index=True)

Count the number of photos taken at each POI.


In [52]:
poifreq = visits[['poiID', 'dateTaken']].copy().groupby('poiID').agg(np.size)
poifreq.rename(columns={'dateTaken':'poiFreq'}, inplace=True)

Save data in file format like IJCAI datasets: user visits data, POI related data.


In [53]:
visits_df = visits.copy()
visits_df.reset_index(inplace=True)
visits_df.drop(['photoLon', 'photoLat'], axis=1, inplace=True)
visits_df['dateTaken'] = visits_df['dateTaken'].astype(np.int)
visits_df.rename(columns={'trajID':'seqID'}, inplace=True)
visits_df['poiTheme'] = poi_df.loc[visits_df['poiID'], 'poiTheme'].tolist()
visits_df['poiFreq'] = poifreq.loc[visits_df['poiID'], 'poiFreq'].astype(np.int).tolist()

Sort photos by date taken.


In [55]:
visits_df.sort_values(by='dateTaken', ascending=True, inplace=True)

In [54]:
visits_df.head()


Out[54]:
photoID userID dateTaken poiID seqID poiTheme poiFreq
0 9588963220 67774014@N00 1377408461 8 3333 Shopping 209
1 703949177 79925938@N00 1183135485 20 3975 Shopping 653
2 775049707 79925938@N00 1183823546 18 3977 Shopping 659
3 8687823797 35558720@N03 1364043697 25 1487 Entertainment 1671
4 2676185 79925938@N00 1104369016 71 3919 Parks and spaces 1693

Save visits data.


In [56]:
cols = ['photoID', 'userID', 'dateTaken', 'poiID', 'poiTheme', 'poiFreq', 'seqID']
visits_df.to_csv(fvisits2, sep=';', quoting=2, columns=cols, index=False)

In [57]:
fvisits2


Out[57]:
'../data/userVisits-Melb-allPOI.csv'

POI related data: cost=POI-POI distance (meters), profit=frequency (#photos taken at the second POI).


In [58]:
costprofit_df = pd.DataFrame(columns=['from', 'to', 'cost', 'profit', 'category'])

In [59]:
pois = sorted(visits_df['poiID'].unique())

In [60]:
for poi1 in pois:
    for poi2 in pois:
        if poi1 == poi2: continue
        ix = costprofit_df.shape[0]
        costprofit_df.loc[ix, 'from'] = poi1
        costprofit_df.loc[ix, 'to'] = poi2
        costprofit_df.loc[ix, 'cost'] = poi_distmat.loc[poi1, poi2] * 1000  # meters
        costprofit_df.loc[ix, 'profit'] = poifreq.loc[poi2, 'poiFreq']
        costprofit_df.loc[ix, 'category'] = poi_df.loc[poi2, 'poiTheme']

In [61]:
costprofit_df.head()


Out[61]:
from to cost profit category
0 0 1 1982.34 139 City precincts
1 0 2 1179.18 304 City precincts
2 0 3 2441.85 138 City precincts
3 0 4 1593.2 349 City precincts
4 0 5 1249.32 70 City precincts

Save POI related data.


In [62]:
cols = ['from', 'to', 'cost', 'profit', 'category']
costprofit_df.to_csv(fpoi2, sep=';', quoting=2, columns=cols, index=False)