Path analysis


In [159]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from pylab import *

# import igraph as ig # Need to install this in your virtual environment
from re import sub

import editdistance # Needs to be installed. Usage: editdistance.eval('banana', 'bahama')
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import dendrogram, linkage

# import seaborn as sns

In [2]:
import sys
sys.path.append('../../src/')
from utils.database import dbutils

conn = dbutils.connect()
cursor = conn.cursor()

In [16]:
# Helper function for making summary tables/distributions
def frequency(dataframe,columnname):
    out = dataframe[columnname].value_counts().to_frame()
    out.columns = ['frequency']
    out.index.name = columnname
    out.reset_index(inplace=True)
    out.sort_values('frequency',inplace=True,ascending=False)
    out['cumulative'] = out['frequency'].cumsum()/out['frequency'].sum()
    out['ccdf'] = 1 - out['cumulative']
    return out

In [3]:
nodes = pd.read_sql('select * from optourism.firenze_card_locations', con=conn)

In [177]:
# df = pd.read_csv('../src/output/firenzedata_feature_extracted.csv')
# df['museum_id'].replace(to_replace=39,value=38,inplace=True)
# df['date'] = pd.to_datetime(df['entry_time'], format='%Y-%m-%d %H:%M:%S').dt.date
# df['hour'] = pd.to_datetime(df['date']) + pd.to_timedelta(pd.to_datetime(df['entry_time'], format='%Y-%m-%d %H:%M:%S').dt.hour, unit='h')

In [178]:
# df.columns

In [179]:
# df.iloc[:,[0,2,3,9,10,11,12,13,14,15,16,17]].head()

In [180]:
# frequency(df,'is_in_museum_37')

In [181]:
# frequency(df,'entrances_per_card_per_museum')

In [182]:
# frequency(df,'total_duration_card_use')

In [183]:
df = pd.read_sql('select * from optourism.firenze_card_logs', con=conn)
df['museum_id'].replace(to_replace=39,value=38,inplace=True)
df['short_name'] = df['museum_id'].replace(dict(zip(nodes['museum_id'],nodes['short_name'])))
df['string'] = df['museum_id'].replace(dict(zip(nodes['museum_id'],nodes['string'])))
df['date'] = pd.to_datetime(df['entry_time'], format='%Y-%m-%d %H:%M:%S').dt.date
df['hour'] = pd.to_datetime(df['date']) + pd.to_timedelta(pd.to_datetime(df['entry_time'], format='%Y-%m-%d %H:%M:%S').dt.hour, unit='h')
df['total_people'] = df['total_adults'] + df['minors']

In [198]:
df.head()


Out[198]:
user_id museum_name entry_time adults_first_use adults_reuse total_adults minors museum_id short_name string date hour total_people
0 2089098 Palazzo Pitti 2016-09-19 14:49:00 0 1 1 0 38 Pitti P 2016-09-19 2016-09-19 14:00:00 1
1 2089099 Palazzo Pitti 2016-09-19 14:49:00 0 1 1 0 38 Pitti P 2016-09-19 2016-09-19 14:00:00 1
2 2083344 Palazzo Pitti 2016-09-19 14:57:00 0 1 1 0 38 Pitti P 2016-09-19 2016-09-19 14:00:00 1
3 2083335 Palazzo Pitti 2016-09-19 14:57:00 0 1 1 0 38 Pitti P 2016-09-19 2016-09-19 14:00:00 1
4 2083304 Palazzo Pitti 2016-09-19 14:58:00 0 1 1 0 38 Pitti P 2016-09-19 2016-09-19 14:00:00 1

I propose distinguishing paths from flows. A path is an itinerary; the flow is the number of people who take that path. E.g., a family or a tour group produces one path, but adds multiple people to the overall flow.
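
As a minimal sketch of the distinction (not meant to run at this point in the notebook: it assumes df5, the per-card path strings built below, and approximates a card's party size by the largest group seen on any of its entries):

In [ ]:
# Hedged sketch: separate itineraries (paths) from the people moving along them (flow).
party_size = df.groupby('user_id')['total_people'].max()  # approximate people per card
paths = df5['path']                                        # one itinerary string per card
path_counts = paths.value_counts()                         # number of cards taking each path
flow_per_path = party_size.groupby(paths).sum()            # people contributed by each path (the flow)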

We now build a transition graph, a directed graph where an edge represents a person going from one museum to another within the same day.

We also produce the transition matrix, a row-normalized n-by-n matrix of the frequency of transitions from the row node to the column node. If you take a vector of the current volume at each location and multiply it by the transition matrix, you get a prediction of the number of people at each node at the next time step. This prediction can be refined with corrections for daily and weekly patterns.
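
Here is a minimal sketch of that matrix and the one-step prediction, assuming the 'from'/'to' columns of df4 built in the next section (the current-volume vector below is just the total people seen per museum, purely for illustration):

In [ ]:
# Hedged sketch: row-normalized transition matrix from within-day museum-to-museum moves.
moves = df4[df4['from'] != 'source']                                  # drop the dummy day-start node
counts = moves.groupby(['from','to']).size().unstack(fill_value=0)   # raw transition counts
names = sorted(set(counts.index) | set(counts.columns))
counts = counts.reindex(index=names, columns=names, fill_value=0)    # make it square
P = counts.div(counts.sum(axis=1), axis=0).fillna(0)                 # P[i,j] = Pr(next = j | now = i)
v_now = df4.groupby('short_name')['total_people'].sum().reindex(names).fillna(0)  # current volumes (illustrative)
v_next = pd.Series(v_now.values.dot(P.values), index=names)          # predicted volume at each node at the next step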

Path analysis

To make paths:

We want a dataframe with the user, the museum they came from, the museum they went to, the number of people on the card, and the entry time at the next museum.

We will drop much of this data in creating paths, which will be concatenations of single-character codes for each museum.

To track the first visit of each day, we add a dummy "source" node from which everybody starts each day. We give it the character code " " (a space), so we can later split each card's full path with split(" ").


In [189]:
df4 = df.groupby(['user_id','entry_time','date','hour','short_name','string']).sum()['total_people'].to_frame() # Need to group in this order to be correct further down
df4.reset_index(inplace=True)
df4['from'] = 'source' # Initialize 'from' with the dummy 'source' node (start of a card's day)
df4['to'] = df4['short_name'] # 'to' is the row's own museum
make_link = (df4['user_id'].shift(1)==df4['user_id'])&(df4['date'].shift(1)==df4['date']) # Rows that continue the same card's same-day path
df4.loc[make_link,'from'] = df4['short_name'].shift(1)[make_link] # Overwrite 'source' with the previous museum
df4['s'] = ' ' # Single-character version of 'from': ' ' is the code for the dummy source node
df4['t'] = df4['string'] # Single-character code for the row's own museum
df4.loc[make_link,'s'] = df4['string'].shift(1)[make_link]
# Concatenating the source column alone is not enough: it drops the last museum of each day's path.
# So, for the last entry of a card's day, make 'path' hold two characters (source code plus this museum's code).
df4['path'] = df4['s']
last_of_day = df4['from'].shift(-1)=='source' # The next row starts a new day, so this row ends one
df4.loc[last_of_day,'path'] = df4['path'] + df4['t']
# The trick above misses the very last row of the data, so handle it separately:
df4.loc[df4.index[-1],'path'] = df4['s'].iloc[-1] + df4['t'].iloc[-1]
df4.head()


Out[189]:
user_id entry_time date hour short_name string total_people from to s t path
0 1459702 2016-06-22 10:04:00 2016-06-22 2016-06-22 10:00:00 Uffizi U 1 source Uffizi U
1 1459702 2016-06-22 14:26:00 2016-06-22 2016-06-22 14:00:00 M. Casa Dante 3 1 Uffizi M. Casa Dante U 3 U
2 1459702 2016-06-22 15:49:00 2016-06-22 2016-06-22 15:00:00 Accademia A 1 M. Casa Dante Accademia 3 A 3A
3 1459702 2016-06-23 09:43:00 2016-06-23 2016-06-23 09:00:00 Opera del Duomo D 1 source Opera del Duomo D
4 1459702 2016-06-23 11:14:00 2016-06-23 2016-06-23 11:00:00 M. Galileo G 1 Opera del Duomo M. Galileo D G D

In [190]:
df5 = df4.groupby('user_id')['path'].sum().to_frame() # sum() on strings concatenates 
df5.head()


Out[190]:
path
user_id
1459702 U3A DGVBC
1473903 U D2V PX
1473904 U D2V PX
1473905 N cDAC
1473906 PSUV AIMC2 cl

In [191]:
df6 = df5['path'].apply(lambda x: pd.Series(x.strip().split(' '))) # Now split along strings. Takes a few seconds.
df6.head() # Note: 4 columns is correct: the Firenze card is valid for *72 hours from first use*, not from midnight of the day of first use, so one card can span 4 calendar days!


Out[191]:
0 1 2 3
user_id
1459702 U3A DGVBC NaN NaN
1473903 U D2V PX NaN
1473904 U D2V PX NaN
1473905 N cDAC NaN NaN
1473906 PSUV AIMC2 cl NaN

In [187]:
# df6.head(50) # Data stories just fall out! People traveling together, splitting off, etc. We assume this but strong coupling is hard to ignore.

In [192]:
# Ordered paths
fr1 = frequency(df5,'path')
fr1.head(20) # INSIGHT: the top 15 paths are permutations of Duomo, Uffizi, Accademia. But they are a very small fraction of the total.


Out[192]:
path frequency cumulative ccdf
0 D 337 0.006604 0.993396
1 U 218 0.010876 0.989124
2 AU 201 0.014815 0.985185
3 A 193 0.018597 0.981403
4 DA 189 0.022300 0.977700
5 ADU 186 0.025945 0.974055
6 UA 172 0.029316 0.970684
7 AD 168 0.032608 0.967392
8 DAU 145 0.035449 0.964551
9 U A 115 0.037703 0.962297
10 UDA 106 0.039780 0.960220
11 DU 95 0.041641 0.958359
12 D A 89 0.043385 0.956615
13 D AU 86 0.045071 0.954929
14 DA U 86 0.046756 0.953244
15 D U 79 0.048304 0.951696
16 P 77 0.049813 0.950187
17 AD U 77 0.051322 0.948678
18 A U 70 0.052693 0.947307
19 D UA 60 0.053869 0.946131

In [46]:
fr1.iloc[0:50].plot.bar(x='path',y='frequency',figsize=(24,10))
plt.title('Most common total Firenze card paths (ordered)')
plt.xlabel('x = Encoded path')
plt.ylabel('Number of cards with total path x')
# plt.yscale('log')
plt.show()



In [22]:
# nodes # To make a legend

In [43]:
df7 = df5['path'].apply(lambda x: ''.join(sorted(list(sub(' ','',x))))).to_frame()
df7.head()


Out[43]:
path
user_id
1459702 3ABCDGUV
1473903 2DPUVX
1473904 2DPUVX
1473905 ACDNc
1473906 2ACIMPSUVcl

In [44]:
fr2 = frequency(df7,'path')
fr2.head()


Out[44]:
path frequency cumulative ccdf
0 ADU 1280 0.025083 0.974917
1 ADPU 782 0.040407 0.959593
2 AU 558 0.051341 0.948659
3 AD 503 0.061198 0.938802
4 D 337 0.067802 0.932198

In [47]:
fr2.iloc[0:50].plot.bar(x='path',y='frequency',figsize=(24,10))
plt.title('Most common set of museums visited on Firenze card (unordered paths)')
plt.xlabel('x = Set of encoded museums')
plt.ylabel('Number of cards with total set x')
plt.show()



In [193]:
# How many cards have differing numbers of minors across their entries?
df8 = df.groupby(['user_id','entry_time','short_name'])['minors'].sum().reset_index()[['user_id','minors']].groupby('user_id').nunique()['minors'].to_frame()
df8.columns = ['unique_counts_of_minors']
df8.head()


Out[193]:
unique_counts_of_minors
user_id
1459702 1
1473903 1
1473904 1
1473905 1
1473906 2

In [194]:
frequency(df8, 'unique_counts_of_minors')


Out[194]:
unique_counts_of_minors frequency cumulative ccdf
0 1 49214 0.964394 0.035606
1 2 1745 0.998589 0.001411
2 3 72 1.000000 0.000000

In [195]:
df.set_index('user_id').loc[df8[df8['unique_counts_of_minors']>2].index][['short_name','total_adults','minors']].head()


Out[195]:
short_name total_adults minors
user_id
2030240 M. San Marco 1 0
2030240 Accademia 1 0
2030240 Accademia 0 1
2030240 Opera del Duomo 1 0
2030240 Opera del Duomo 0 1

In [196]:
df5[(df5.index<2030243)&(df5.index>2030238)]


Out[196]:
path
user_id
2030239 P VTCD N
2030240 MADNVTU 3GC
2030241 MADNVTU 3GC
2030242 UAX

In [197]:
# What are the variable numbers of minors? Is it always just 0 vs 1? No, we see more variety. 
df[df['user_id'].isin(df8[df8['unique_counts_of_minors']>2].index)][['user_id','entry_time','short_name','minors']].groupby(['user_id','entry_time','short_name'])['minors'].sum().reset_index()[['user_id','minors']].groupby('user_id')['minors'].value_counts().head(20)


Out[197]:
user_id  minors
2030240  0         5
         2         4
         1         1
2033164  0         7
         1         3
         2         1
2039260  0         2
         1         1
         2         1
2043368  0         6
         1         1
         2         1
2044949  0         4
         1         1
         2         1
2046939  0         4
         1         1
         2         1
2047406  0         3
         1         1
Name: minors, dtype: int64

In [ ]:
cards = df.groupby('user_id').agg({'short_name':'nunique',
                                  'total_adults':'sum',
                                  'minors':'max',
                                  })

In [172]:
# cards = df.groupby('user_id').agg({# 'entrances_per_card_per_museum':'sum', 
#                                    'museum_id':'nunique', # Should be equal to sum of set of is_in columns
#                                    # 'total_duration_card_use':'max', 
#                                    'total_adults':'sum', # This sum should be equal to 'entrances_per_card_per_museum'
#                                    'is_in_museum_1':'max',
#                                    'is_in_museum_2':'max',
#                                    'is_in_museum_3':'max',
#                                    'is_in_museum_4':'max',
#                                    'is_in_museum_5':'max',
#                                    'is_in_museum_6':'max',
#                                    'is_in_museum_7':'max',
#                                    'is_in_museum_8':'max',
#                                    'is_in_museum_9':'max',
#                                    'is_in_museum_10':'max',
#                                    'is_in_museum_11':'max',
#                                    'is_in_museum_12':'max',
#                                    'is_in_museum_13':'max',
#                                    'is_in_museum_14':'max',
#                                    'is_in_museum_15':'max',
#                                    'is_in_museum_16':'max',
#                                    'is_in_museum_17':'max',
#                                    'is_in_museum_18':'max',
#                                    'is_in_museum_19':'max',
#                                    'is_in_museum_20':'max',
#                                    'is_in_museum_21':'max',
#                                    'is_in_museum_22':'max',
#                                    'is_in_museum_23':'max',
#                                    'is_in_museum_24':'max',
#                                    'is_in_museum_25':'max',
#                                    'is_in_museum_26':'max',
#                                    'is_in_museum_27':'max',
#                                    'is_in_museum_28':'max',
#                                    'is_in_museum_29':'max',
#                                    'is_in_museum_30':'max',
#                                    'is_in_museum_31':'max',
#                                    'is_in_museum_32':'max',
#                                    'is_in_museum_33':'max',
#                                    'is_in_museum_34':'max',
#                                    'is_in_museum_35':'max',
#                                    'is_in_museum_36':'max',
#                                    'is_in_museum_37':'max',
#                                    'is_in_museum_38':'max'
#                                   })

In [173]:
# # Reorder correctly
# cards = cards[[# 'entrances_per_card_per_museum', 
#                'museum_id', 
#                # 'total_duration_card_use', 
#                'total_adults', 
#                'is_in_museum_1',
#                'is_in_museum_2',
#                'is_in_museum_3',
#                'is_in_museum_4',
#                'is_in_museum_5',
#                'is_in_museum_6',
#                'is_in_museum_7',
#                'is_in_museum_8',
#                'is_in_museum_9',
#                'is_in_museum_10',
#                'is_in_museum_11',
#                'is_in_museum_12',
#                'is_in_museum_13',
#                'is_in_museum_14',
#                'is_in_museum_15',
#                'is_in_museum_16',
#                'is_in_museum_17',
#                'is_in_museum_18',
#                'is_in_museum_19',
#                'is_in_museum_20',
#                'is_in_museum_21',
#                'is_in_museum_22',
#                'is_in_museum_23',
#                'is_in_museum_24',
#                'is_in_museum_25',
#                'is_in_museum_26',
#                'is_in_museum_27',
#                'is_in_museum_28',
#                'is_in_museum_29',
#                'is_in_museum_30',
#                'is_in_museum_31',
#                'is_in_museum_32',
#                'is_in_museum_33',
#                'is_in_museum_34',
#                'is_in_museum_35',
#                'is_in_museum_36',
#                'is_in_museum_37',
#                'is_in_museum_38'    
# ]]

In [174]:
# # Rename appropriately
# cards.columns = [# 'entrances',
#                  'museums_visited',
#                  # 'use_duration',
#                  'entrances_2',
#                  'visited_museum_1',
#                  'visited_museum_2',
#                  'visited_museum_3',
#                  'visited_museum_4',
#                  'visited_museum_5',
#                  'visited_museum_6',
#                  'visited_museum_7',
#                  'visited_museum_8',
#                  'visited_museum_9',
#                  'visited_museum_10',
#                  'visited_museum_11',
#                  'visited_museum_12',
#                  'visited_museum_13',
#                  'visited_museum_14',
#                  'visited_museum_15',
#                  'visited_museum_16',
#                  'visited_museum_17',
#                  'visited_museum_18',
#                  'visited_museum_19',
#                  'visited_museum_20',
#                  'visited_museum_21',
#                  'visited_museum_22',
#                  'visited_museum_23',
#                  'visited_museum_24',
#                  'visited_museum_25',
#                  'visited_museum_26',
#                  'visited_museum_27',
#                  'visited_museum_28',
#                  'visited_museum_29',
#                  'visited_museum_30',
#                  'visited_museum_31',
#                  'visited_museum_32',
#                  'visited_museum_33',
#                  'visited_museum_34',
#                  'visited_museum_35',
#                  'visited_museum_36',
#                  'visited_museum_37',
#                  'visited_museum_38']

In [175]:
cards.head(20)


Out[175]:
museums_visited entrances_2 visited_museum_1 visited_museum_2 visited_museum_3 visited_museum_4 visited_museum_5 visited_museum_6 visited_museum_7 visited_museum_8 ... visited_museum_29 visited_museum_30 visited_museum_31 visited_museum_32 visited_museum_33 visited_museum_34 visited_museum_35 visited_museum_36 visited_museum_37 visited_museum_38
user_id
1459702 8 8 1 0 1 0 0 0 0 0 ... 1 0 0 1 0 0 0 0 0 0
1473903 6 6 0 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
1473904 6 6 0 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
1473905 5 5 1 0 1 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
1473906 11 11 1 1 0 1 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 1
1473907 11 11 1 1 0 1 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 1
1474634 7 7 0 1 1 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 1 0
1474636 7 7 0 1 1 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 1 0
2014298 3 3 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
2016016 4 4 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2016021 4 4 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
2016022 4 4 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
2016024 3 3 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
2017368 2 2 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2017369 2 2 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2017450 3 3 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
2017451 2 2 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
2017452 2 2 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
2017453 7 7 0 0 1 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 1
2017454 8 8 1 1 1 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 1 1

20 rows × 40 columns


In [176]:
pd.DataFrame([cards.iloc[:,1:39].sum(axis=1),cards['entrances_2']]) # Check: the row sums come out at roughly double 'entrances_2' because columns 1:39 also include the 'entrances_2' column itself


Out[176]:
user_id 1459702 1473903 1473904 1473905 1473906 1473907 1474634 1474636 2014298 2016016 ... 2095756 2095757 2095758 2095759 2095760 2095763 2095764 2095765 2095766 2095767
Unnamed 0 16 11 11 10 20 20 14 14 5 8 ... 4 4 1 1 1 2 2 2 2 2
entrances_2 8 6 6 5 11 11 7 7 3 4 ... 2 2 1 1 1 1 1 1 1 1

2 rows × 51031 columns


In [ ]:
# Two tasks. First, do clustering on these people,

In [161]:
X = cards.iloc[:,1:39].as_matrix() # NOTE: columns 1:39 include 'entrances_2' and stop before 'visited_museum_38'; iloc[:,2:40] would give just the 0/1 indicators. (.as_matrix() is deprecated in newer pandas; .values is equivalent.)

In [171]:
Z = linkage(y=X, method='single', metric='jaccard')


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-171-5887c53fbde0> in <module>()
----> 1 Z = linkage(y=X, method='single', metric='jaccard')

/home/mmalik/.local/lib/python2.7/site-packages/scipy/cluster/hierarchy.pyc in linkage(y, method, metric)
    669                          'matrix looks suspiciously like an uncondensed '
    670                          'distance matrix')
--> 671         y = distance.pdist(y, metric)
    672     else:
    673         raise ValueError("`y` must be 1 or 2 dimensional.")

/home/mmalik/.local/lib/python2.7/site-packages/scipy/spatial/distance.pyc in pdist(X, metric, p, w, V, VI)
   1375 
   1376     m, n = s
-> 1377     dm = np.zeros((m * (m - 1)) // 2, dtype=np.double)
   1378 
   1379     # validate input for multi-args metrics

MemoryError: 

In [ ]:
pdist()
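
The MemoryError above is a matter of scale, not logic: pdist has to allocate a condensed distance matrix of m*(m-1)/2 doubles, and with m ≈ 51,000 cards that is on the order of 1.3 billion entries, roughly 10 GB. One hedged workaround is to subsample before clustering; the sample size and the switch to 'average' linkage below are illustrative choices, not part of the analysis above.

In [ ]:
# Hedged sketch: cluster a random subsample of cards so the condensed distance matrix fits in memory.
rng = np.random.RandomState(0)
idx = rng.choice(X.shape[0], size=5000, replace=False)   # hypothetical sample size
D = pdist(X[idx].astype(bool), metric='jaccard')         # condensed pairwise Jaccard distances
Z_sample = linkage(D, method='average')                  # 'average' chains less than 'single' on binary data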

In [166]:
df['museum_name'].nunique()


Out[166]:
40

In [167]:
df['museum_id'].nunique()


Out[167]:
40

In [168]:
df['short_name'].nunique()


Out[168]:
40

In [18]:
df7 = df5['path'].apply(lambda x: pd.Series(len(sub(' ','',x)))) # Number of visits per card: strip the day-separator spaces and count the museum codes

In [19]:
df7.head()


Out[19]:
0
user_id
1459702 8
1473903 6
1473904 6
1473905 5
1473906 11

In [20]:
df7.sort_values(0,ascending=False).head(10)


Out[20]:
0
user_id
2066305 32
2066304 32
2057462 30
2057463 30
2086996 27
2086995 27
2075456 26
2075457 26
2030826 26
2075874 25

In [21]:
df6.loc[df7.sort_values(0,ascending=False).head(10).index]


Out[21]:
0 1 2 3
user_id
2066305 PLYsG3VT Dcl2m9bC7EB Ud0MgqOXIanNA NaN
2066304 PLYsG3VT Dcl2m9bC7EB Ud0MgqOXIanNA NaN
2057462 Ml2cDNbLPVT UBC7amXIdS A0GH3sY6n NaN
2057463 Ml2cDNbLPVT UBC7amXIdS A0GH3sY6n NaN
2086996 PbFHsC7E3GNV DBU2mAIX M0cln9d NaN
2086995 PbFHsC7E3GNV DBU2mAIX M0cln9d NaN
2075456 UGaB3Fp Dl2cm0MOgXIANV dPLCT NaN
2075457 UGaB3Fp Dl2cm0MOgXIANV dPLCT NaN
2030826 sC7SFmT D2lcdNnIA MEbPV HG9aU
2075874 ND MqgOXI02ma AcVTBC73UG PYb

In [22]:
fr2 = frequency(df7,0)
fr2.head()


Out[22]:
0 frequency cumulative ccdf
0 6 5907 0.115753 0.884247
1 5 5632 0.226117 0.773883
2 7 5521 0.334307 0.665693
3 4 5516 0.442398 0.557602
4 8 4858 0.537595 0.462405

In [23]:
f, ax = plt.subplots(figsize=(6,5), dpi=300)
ax.stem(fr2[0],fr2['frequency'], linestyle='steps--')
# yscale('log')
# xscale('log')
ax.set_title('Number of museum visits by Florence Card')
ax.set_ylabel('Frequency')
ax.set_xlabel('Number of museums')
plt.show()
# NOTE: This is the number of *visits*, not people on those cards!! 
# (And, not number of museums visited, this counts multiple visits to the same museum as distinct)



In [24]:
df8 = df.groupby(['user_id','short_name','entry_time']).sum()['total_adults'].to_frame()
df8.head()


Out[24]:
total_adults
user_id short_name entry_time
1459702 Accademia 2016-06-22 15:49:00 1
M. Bargello 2016-06-23 13:41:00 1
M. Casa Dante 2016-06-22 14:26:00 1
M. Galileo 2016-06-23 11:14:00 1
M. Palazzo Vecchio 2016-06-23 12:57:00 1

In [25]:
# Cards with more than one entrance to same museum
df9 = df.groupby(['user_id','short_name']).sum()['total_adults'].to_frame()
df9.columns = ['number_of_entries']
df9[df9['number_of_entries']>1].head(50)


Out[25]:
number_of_entries
user_id short_name
2044654 M. Santa Maria Novella 2
Opera del Duomo 2
Uffizi 2
2045231 Accademia 2
2046704 Accademia 2
2052035 M. Galileo 2
2059646 Santa Croce 2
2059934 M. Palazzo Vecchio 2
2062498 M. Archeologico 2
2065347 M. Palazzo Vecchio 2
2065878 Cappelle Medicee 2
2067085 San Lorenzo 2
2069270 M. Bargello 2
2069663 M. Antropologia 2
2078714 M. Novecento 2
2088128 Pitti 2
2091065 Accademia 2
Opera del Duomo 2
Uffizi 2

In [26]:
df8.shape[0] # Number of entries


Out[26]:
360534

In [27]:
df9.shape[0] # 12 repeat visits. Negligible.


Out[27]:
360522

In [28]:
df9[df9['number_of_entries']==1].shape[0]


Out[28]:
360503

In [29]:
df9[df9['number_of_entries']==2].shape[0]


Out[29]:
19

In [30]:
df9[df9['number_of_entries']>2]


Out[30]:
number_of_entries
user_id short_name

In [18]:
# # This is the number of people who entered on each card entry, not the number of repeat entries! 
# frequency(df.groupby(['user_id','short_name',]).count()['entry_time'].to_frame(),'entry_time')

In [32]:
df9 = df7.reset_index()
df10 = df8.reset_index()
df11 = df9.merge(df10).groupby('user_id').sum()
df11.columns = ['visits','total_people']
df11['persons_per_visit'] = df11['total_people']/df11['visits']
df11.head()


Out[32]:
visits total_people persons_per_visit
user_id
1459702 64 8 0.125000
1473903 36 6 0.166667
1473904 36 6 0.166667
1473905 25 5 0.200000
1473906 121 11 0.090909

In [33]:
# df11[df11['persons_per_visit']>1].plot.scatter(x='visits',y='persons_per_visit')

We now want the following: a measure of similarity between adjacent rows, for detecting people traveling together (on the assumption that people traveling together bought their Firenze cards consecutively, and so appear in adjacent rows).

This is simplest to do naively: nothing statistical, just fuzzy matching through edit distance, which is the number of operations (insertions, deletions, substitutions) needed to change one string into another (or, operations on list elements to change one list into another).

Since each card covers up to three days of visits (spread over as many as four calendar dates), and since we want slight deviations in otherwise identical large itineraries to count for less, we compute the following: a column with the edit distance between each pair of corresponding days in adjacent rows, summed over the days, followed by a column with the total number of visits per row.
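
As a toy illustration of the day-by-day sum (the daily strings below are made up, using the same museum encoding):

In [ ]:
# Hedged illustration: a one-museum deviation on day 2 adds 1 to the summed distance,
# no matter how long the rest of the itinerary is.
days_a = ['U3A', 'DGVBC', '', '']
days_b = ['U3A', 'DGVB',  '', '']
sum(editdistance.eval(a, b) for a, b in zip(days_a, days_b))  # -> 1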


In [34]:
# edit = pdist(X, lambda u, v: np.sqrt(((u-v)**2).sum()))

In [35]:
df6.fillna('',inplace=True)
df6.iloc[0:10]


Out[35]:
0 1 2 3
user_id
1459702 U3A DGVBC
1473903 U D2V PX
1473904 U D2V PX
1473905 N cDAC
1473906 PSUV AIMC2 cl
1473907 PSUV AIMC2 cl
1474634 2D U MABm
1474636 2D U MABm
2014298 PN A
2016016 DV AU

In [36]:
def editdist(pair):
    return editdistance.eval(pair[0],pair[1])

In [37]:
df7 = pd.concat([df6,df6.shift()],axis=1)

In [38]:
df7.columns = ['0','1','2','3','0+','1+','2+','3+']
df7.head()


Out[38]:
0 1 2 3 0+ 1+ 2+ 3+
user_id
1459702 U3A DGVBC NaN NaN NaN NaN
1473903 U D2V PX U3A DGVBC
1473904 U D2V PX U D2V PX
1473905 N cDAC U D2V PX
1473906 PSUV AIMC2 cl N cDAC

In [39]:
# df8 = df7.iloc[:,[0,4,1,5,2,6,3,7]]
# df8.columns = ['0','0+','1','1+','2','2+','3','3+']
# df8.columns = ['0','0+','1','1+','2','2+','3','3+']
# df8.head()

In [40]:
df7['total_edit_distance'] = df7[['0','0+']].apply(editdist,axis=1) + df7[['1','1+']].apply(editdist,axis=1) + df7[['2','2+']].apply(editdist,axis=1) + df7[['3','3+']].apply(editdist,axis=1)
df7.head()


Exception TypeError: "object of type 'float' has no len()" in 'editdistance.bycython.eval' ignored
Exception TypeError: "object of type 'float' has no len()" in 'editdistance.bycython.eval' ignored
Exception TypeError: "object of type 'float' has no len()" in 'editdistance.bycython.eval' ignored
Exception TypeError: "object of type 'float' has no len()" in 'editdistance.bycython.eval' ignored
Out[40]:
0 1 2 3 0+ 1+ 2+ 3+ total_edit_distance
user_id
1459702 U3A DGVBC NaN NaN NaN NaN 0
1473903 U D2V PX U3A DGVBC 7
1473904 U D2V PX U D2V PX 0
1473905 N cDAC U D2V PX 6
1473906 PSUV AIMC2 cl N cDAC 10

In [41]:
df7['len'] = df7['0'].str.len() + df7['1'].str.len() + df7['2'].str.len() + df7['3'].str.len()
df7['len+'] = df7['0+'].str.len() + df7['1+'].str.len() + df7['2+'].str.len() + df7['3+'].str.len()
df7['len_tot'] = df7['len'] + df7['len+']
df7.head()


Out[41]:
0 1 2 3 0+ 1+ 2+ 3+ total_edit_distance len len+ len_tot
user_id
1459702 U3A DGVBC NaN NaN NaN NaN 0 8 NaN NaN
1473903 U D2V PX U3A DGVBC 7 6 8.0 14.0
1473904 U D2V PX U D2V PX 0 6 6.0 12.0
1473905 N cDAC U D2V PX 6 5 6.0 11.0
1473906 PSUV AIMC2 cl N cDAC 10 11 5.0 16.0

In [42]:
fr3 = frequency(df7[df7['total_edit_distance']==0],'len_tot')
fr3


Out[42]:
len_tot frequency cumulative ccdf
0 12.0 2494 0.118221 0.881779
1 8.0 2384 0.231229 0.768771
2 10.0 2375 0.343809 0.656191
3 14.0 2248 0.450370 0.549630
4 16.0 1976 0.544037 0.455963
5 6.0 1792 0.628982 0.371018
6 18.0 1635 0.706485 0.293515
7 20.0 1399 0.772801 0.227199
8 22.0 1126 0.826176 0.173824
9 4.0 1039 0.875427 0.124573
10 24.0 796 0.913159 0.086841
11 26.0 575 0.940415 0.059585
12 2.0 431 0.960846 0.039154
13 28.0 354 0.977626 0.022374
14 30.0 200 0.987107 0.012893
15 32.0 130 0.993269 0.006731
16 34.0 69 0.996540 0.003460
17 36.0 37 0.998294 0.001706
18 38.0 14 0.998957 0.001043
19 40.0 8 0.999336 0.000664
20 44.0 5 0.999573 0.000427
21 42.0 5 0.999810 0.000190
22 60.0 1 0.999858 0.000142
23 64.0 1 0.999905 0.000095
24 54.0 1 0.999953 0.000047
25 52.0 1 1.000000 0.000000


In [44]:
df8 = df7.reset_index(inplace=False)
df8.reset_index(inplace=True)
df8.head()


Out[44]:
index user_id 0 1 2 3 0+ 1+ 2+ 3+ total_edit_distance len len+ len_tot
0 0 1459702 U3A DGVBC NaN NaN NaN NaN 0 8 NaN NaN
1 1 1473903 U D2V PX U3A DGVBC 7 6 8.0 14.0
2 2 1473904 U D2V PX U D2V PX 0 6 6.0 12.0
3 3 1473905 N cDAC U D2V PX 6 5 6.0 11.0
4 4 1473906 PSUV AIMC2 cl N cDAC 10 11 5.0 16.0

In [47]:
# df7[df7['total_edit_distance']==0].hist('len_tot',bins=100, grid=False, figsize=[16,8])
f, ax = plt.subplots(figsize=(12,5), dpi=300)
ax.stem(fr3['len_tot']/2,fr3['frequency'], linestyle='steps--')
# yscale('log')
# xscale('log')
ax.set_title('Number of museums in perfectly matched consecutive paths')
ax.set_ylabel('Number of cards')
ax.set_xlabel('Number of museums')
plt.show()
# NOTE: This is the number of *visits*, not people on those cards!! 
# (And, not number of museums visited, this counts multiple visits to the same museum as distinct)



In [ ]:
# df8.hist('user_id',bins=1000,figsize=[8,8])

In [ ]:
# df8[df8['user_id']>1500000].hist('user_id',bins=1000,figsize=[8,8])

In [ ]:
# df8.plot.scatter(x='index',y='total_edit_distance',figsize=[16,16], c=2+(df8['total_edit_distance']>0))
# sns.jointplot(x="index", y="total_edit_distance", data=df8)#, hue=(df9['total_edit_distance']==0))
# sns.jointplot(x="index", y="total_edit_distance", data=df8, kind='hex')

In [ ]:


In [ ]:
sns.jointplot(x="total_edit_distance", y="len_tot", data=df8)

In [ ]:
sns.jointplot(x="total_edit_distance", y="len_tot", data=df8, kind='hex')

In [ ]:
sns.jointplot(x="total_edit_distance", y="len_tot", data=df8, kind='kde')

Now, we need to extract runs of consecutive rows with zero edit distance.


In [ ]:
df8['dist_gt_0'] = 1*(df8['total_edit_distance'] != 0)
# df8['offset'] = 1*(df8['zero_dist'] + df8['zero_dist'].shift()==0)
df8['group'] = df8['dist_gt_0'].cumsum() # Running count of non-matching rows; consecutive matching cards share a group id
df8.head(50)

In [ ]:
df9 = df8[['group','user_id']].groupby('group').count()
df9.columns = ['people']
df9.head()

In [ ]:
frequency(df9,'people')

In [3]:
# # The code below was my attempt to get a node for starting the day and ending the day from the paths. 
# # The problem is that this gives the number of _cards_, not number of people! I had to go back to the
# # dynamic edgelist construction anyway. 
# df6.head()

In [4]:
# df9 = df5['s2'].apply(lambda x: pd.Series(x.strip().split(' ')))
# df9.fillna(' ',inplace=True)
# df9['0_first'] = df9[0].apply(lambda x: pd.Series(x[0]))
# df9['0_last'] = df9[0].apply(lambda x: pd.Series(x[-1]))
# df9['0_len'] = df9[0].apply(lambda x: pd.Series(len(x)))
# df9['1_first'] = df9[1].apply(lambda x: pd.Series(x[0]))
# df9['1_last'] = df9[1].apply(lambda x: pd.Series(x[-1]))
# df9['1_len'] = df9[1].apply(lambda x: pd.Series(len(x)))
# df9['2_first'] = df9[2].apply(lambda x: pd.Series(x[0]))
# df9['2_last'] = df9[2].apply(lambda x: pd.Series(x[-1]))
# df9['2_len'] = df9[2].apply(lambda x: pd.Series(len(x)))
# df9['3_first'] = df9[3].apply(lambda x: pd.Series(x[0]))
# df9['3_last'] = df9[3].apply(lambda x: pd.Series(x[-1]))
# df9['3_len'] = df9[3].apply(lambda x: pd.Series(len(x)))
# df9.head()

In [5]:
# df9.replace(' ',np.nan,inplace=True)
# df9.head()

In [6]:
# from_home = frequency(df9[['0_first','1_first','2_first','3_first']].stack().to_frame(),0)[[0,'frequency']]
# from_home.columns = ['0','from_home']
# from_home.set_index('0',inplace=True)
# from_home.head()

In [7]:
# only = frequency(pd.concat(
#     [df9[(df9['0_len']==1)&(df9['0_first'].notnull())]['0_first'], 
#      df9[(df9['1_len']==1)&(df9['1_first'].notnull())]['1_first'], 
#      df9[(df9['2_len']==1)&(df9['2_first'].notnull())]['2_first'], 
#      df9[(df9['3_len']==1)&(df9['3_first'].notnull())]['3_first']
#     ],axis=0).to_frame()
# ,0)[[0,'frequency']]
# only.columns = ['0','only']
# only.set_index('0',inplace=True)
# only.head()

In [8]:
# to_home = frequency(df9[['0_last','1_last','2_last','3_last']].stack().to_frame(),0)[[0,'frequency']]
# to_home.columns = ['0','to_home']
# to_home.set_index('0',inplace=True)
# to_home.head()

In [9]:
# from_to_home = nodes.set_index('string')['short_name'].to_frame().join([from_home,to_home,only])
# from_to_home.set_index('short_name',inplace=True)
# from_to_home.columns = ['home_to_node','node_to_home','only_visit_of_day']
# # from_to_home['from_home'] = from_to_home['from_home_incl_only'] - from_to_home['only_visit_of_day']
# # from_to_home['to_home'] = from_to_home['to_home_incl_only'] - from_to_home['only_visit_of_day']
# from_to_home.head()

In [10]:
# from_to_home['home_to_node'].sort_values(ascending=False).to_frame().head(20)

In [11]:
# from_to_home['node_to_home'].sort_values(ascending=False).to_frame().head(20)

In [12]:
# from_to_home.reset_index(inplace=True)

In [13]:
# from_to_home

In [14]:
# supp_edges = pd.DataFrame({'from':['home']*from_to_home.shape[0] + from_to_home['short_name'].tolist(),
#                           'to':from_to_home['short_name'].tolist() + ['home']*from_to_home.shape[0],
#                           'weight':from_to_home['home_to_node'].tolist() + from_to_home['node_to_home'].tolist() })

In [15]:
# supp_edges.dropna(how='any',inplace=True)
# supp_edges

In [ ]:
frequency(df6,0).head()

In [ ]:
frequency(df6,1).head()

In [ ]:
frequency(df6,2).head()

In [ ]:
frequency(df6,3).head()

Now, I want a set of scatterplots between these frequencies.
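
One way to get those scatterplots would be to merge the per-day frequency tables on the path string and plot one day's frequency against another's; a hedged sketch (the column names and the log scale are illustrative choices):

In [ ]:
# Hedged sketch: day-1 vs day-2 frequency of each daily path string.
f0 = frequency(df6, 0).rename(columns={0: 'path', 'frequency': 'day1_freq'})[['path','day1_freq']]
f1 = frequency(df6, 1).rename(columns={1: 'path', 'frequency': 'day2_freq'})[['path','day2_freq']]
m = f0.merge(f1, on='path', how='inner')
m.plot.scatter(x='day1_freq', y='day2_freq', loglog=True, figsize=(6,6))
plt.title('Daily path frequency: day 1 vs day 2')
plt.show()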


In [33]:
pt = pd.concat([frequency(df6,0),frequency(df6,1),frequency(df6,2),frequency(df6,3)])
pt['daily_path'] = pt[0].replace(np.nan, '', regex=True) + pt[1].replace(np.nan, '', regex=True) + pt[2].replace(np.nan, '', regex=True) + pt[3].replace(np.nan, '', regex=True)
pt.drop([0,1,2,3,'ccdf','cumulative'],axis=1,inplace=True)
pt.head()


Out[33]:
frequency daily_path
0 3572 D
1 2317 U
2 1585 P
3 1486 A
4 1232 V

In [34]:
pt2 = pt.groupby('daily_path').sum()
pt2.sort_values('frequency', inplace=True, ascending=False)
pt2.head()


Out[34]:
frequency
daily_path
D 5932
P 5643
U 5420
A 3604
V 1909

In [35]:
pt2[pt2['frequency']>200].plot.bar(figsize=(16,8))
plt.title('Most common daily Firenze card paths across all days')
plt.xlabel('x = Encoded path')
plt.ylabel('Number of cards with daily path x')
# plt.yscale('log')
plt.show()



In [36]:
nodes.head()


Out[36]:
museum_name longitude latitude museum_id short_name string
0 Basilica di Santa Croce 11.262598 43.768754 1 Santa Croce C
1 Basilica San Lorenzo 11.254430 43.774932 2 San Lorenzo 2
2 Battistero di San Giovanni 11.254966 43.773131 3 Opera del Duomo D
3 Biblioteca Medicea Laurenziana 11.253924 43.774799 4 Laurenziana l
4 Cappella Brancacci 11.243859 43.768334 5 Brancacci b

In [37]:
# For reference, here are the displayed museums
# nodes[['string','short_name']].set_index('string').reindex(['D','P','U','A','V','T','N','C','G','B','S','c','m','M','b','Y','2'])
nodes[nodes['string'].isin(['D','P','U','A','V','T','N','C','G','B','S','c','m','M','b','Y','2'])][['string','short_name']]


Out[37]:
string short_name
0 C Santa Croce
1 2 San Lorenzo
2 D Opera del Duomo
4 b Brancacci
5 c Cappelle Medicee
9 U Uffizi
10 A Accademia
19 M M. San Marco
20 N M. Santa Maria Novella
23 G M. Galileo
26 B M. Bargello
40 V M. Palazzo Vecchio
48 m Palazzo Medici
49 S Palazzo Strozzi
50 T Torre di Palazzo Vecchio
51 Y V. Bardini
62 P Pitti

In [38]:
df6[pd.isnull(df6[0].str[0])].head()


Out[38]:
0 1 2 3
user_id

In [26]:
df6.to_csv('encoded_paths.csv')

In [27]:
nodes.to_csv('encoded_paths_legend.csv')

In [ ]:


In [39]:
df6.values


Out[39]:
array([['U3A', 'DGVBC', nan, nan],
       ['U', 'D2V', 'PX', nan],
       ['U', 'D2V', 'PX', nan],
       ..., 
       ['D', nan, nan, nan],
       ['D', nan, nan, nan],
       ['D', nan, nan, nan]], dtype=object)

In [ ]:


In [ ]: