Path analysis


In [159]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from pylab import *

# import igraph as ig # Need to install this in your virtual environment
from re import sub

import editdistance # Needs to be installed. Usage: editdistance.eval('banana', 'bahama')
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import dendrogram, linkage

# import seaborn as sns

In [2]:
import sys
sys.path.append('../../src/')
from utils.database import dbutils

conn = dbutils.connect()
cursor = conn.cursor()

In [16]:
# Helper function for making summary tables/distributions
def frequency(dataframe,columnname):
    out = dataframe[columnname].value_counts().to_frame()
    out.columns = ['frequency']
    out.index.name = columnname
    out.reset_index(inplace=True)
    out.sort_values('frequency',inplace=True,ascending=False)
    out['cumulative'] = out['frequency'].cumsum()/out['frequency'].sum()
    out['ccdf'] = 1 - out['cumulative']
    return out

In [3]:
nodes = pd.read_sql('select * from optourism.firenze_card_locations', con=conn)

In [177]:
# df = pd.read_csv('../src/output/firenzedata_feature_extracted.csv')
# df['museum_id'].replace(to_replace=39,value=38,inplace=True)
# df['date'] = pd.to_datetime(df['entry_time'], format='%Y-%m-%d %H:%M:%S').dt.date
# df['hour'] = pd.to_datetime(df['date']) + pd.to_timedelta(pd.to_datetime(df['entry_time'], format='%Y-%m-%d %H:%M:%S').dt.hour, unit='h')

In [178]:
# df.columns

In [179]:
# df.iloc[:,[0,2,3,9,10,11,12,13,14,15,16,17]].head()

In [180]:
# frequency(df,'is_in_museum_37')

In [181]:
# frequency(df,'entrances_per_card_per_museum')

In [182]:
# frequency(df,'total_duration_card_use')

In [183]:
df = pd.read_sql('select * from optourism.firenze_card_logs', con=conn)
df['museum_id'].replace(to_replace=39,value=38,inplace=True)
df['short_name'] = df['museum_id'].replace(dict(zip(nodes['museum_id'],nodes['short_name'])))
df['string'] = df['museum_id'].replace(dict(zip(nodes['museum_id'],nodes['string'])))
df['date'] = pd.to_datetime(df['entry_time'], format='%Y-%m-%d %H:%M:%S').dt.date
df['hour'] = pd.to_datetime(df['date']) + pd.to_timedelta(pd.to_datetime(df['entry_time'], format='%Y-%m-%d %H:%M:%S').dt.hour, unit='h')
df['total_people'] = df['total_adults'] + df['minors']

In [198]:
df.head()


Out[198]:
user_id museum_name entry_time adults_first_use adults_reuse total_adults minors museum_id short_name string date hour total_people
0 2089098 Palazzo Pitti 2016-09-19 14:49:00 0 1 1 0 38 Pitti P 2016-09-19 2016-09-19 14:00:00 1
1 2089099 Palazzo Pitti 2016-09-19 14:49:00 0 1 1 0 38 Pitti P 2016-09-19 2016-09-19 14:00:00 1
2 2083344 Palazzo Pitti 2016-09-19 14:57:00 0 1 1 0 38 Pitti P 2016-09-19 2016-09-19 14:00:00 1
3 2083335 Palazzo Pitti 2016-09-19 14:57:00 0 1 1 0 38 Pitti P 2016-09-19 2016-09-19 14:00:00 1
4 2083304 Palazzo Pitti 2016-09-19 14:58:00 0 1 1 0 38 Pitti P 2016-09-19 2016-09-19 14:00:00 1

I propose distinguishing paths from flows. A path is an itinerary; the flow is the number of people who take that path. E.g., a family or a tour group produces one path, but adds multiple people to the overall flow.
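
As a minimal sketch of the distinction (not meant to run at this point in the notebook: it assumes df5, the per-card path strings built below, and approximates a card's party size by the largest group seen on any of its entries):

In [ ]:
# Hedged sketch: separate itineraries (paths) from the people moving along them (flow).
party_size = df.groupby('user_id')['total_people'].max()  # approximate people per card
paths = df5['path']                                        # one itinerary string per card
path_counts = paths.value_counts()                         # number of cards taking each path
flow_per_path = party_size.groupby(paths).sum()            # people contributed by each path (the flow)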

We now build a transition graph, a directed graph where an edge represents a person going from one museum to another within the same day.

We also produce the transition matrix, a row-normalized n-by-n matrix of the frequency of transitions from the row node to the column node. If you take a vector of the current volume at each location and multiply it by the transition matrix, you get a prediction of the number of people at each node at the next time step. This prediction can be refined with corrections for daily and weekly patterns.
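
Here is a minimal sketch of that matrix and the one-step prediction, assuming the 'from'/'to' columns of df4 built in the next section (the current-volume vector below is just the total people seen per museum, purely for illustration):

In [ ]:
# Hedged sketch: row-normalized transition matrix from within-day museum-to-museum moves.
moves = df4[df4['from'] != 'source']                                  # drop the dummy day-start node
counts = moves.groupby(['from','to']).size().unstack(fill_value=0)   # raw transition counts
names = sorted(set(counts.index) | set(counts.columns))
counts = counts.reindex(index=names, columns=names, fill_value=0)    # make it square
P = counts.div(counts.sum(axis=1), axis=0).fillna(0)                 # P[i,j] = Pr(next = j | now = i)
v_now = df4.groupby('short_name')['total_people'].sum().reindex(names).fillna(0)  # current volumes (illustrative)
v_next = pd.Series(v_now.values.dot(P.values), index=names)          # predicted volume at each node at the next step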

Path analysis

To make paths:

We want a dataframe with the user, the museum they came from, the museum they went to, the number of people on the card, and the entry time at the next museum.

We will drop much of this data in creating paths, which will be concatenations of single-character codes for each museum.

To track the first visit of each day, we add a dummy "source" node from which everybody starts each day. We give it the character code " " (a space), so we can later split each card's full path with split(" ").


In [189]:
df4 = df.groupby(['user_id','entry_time','date','hour','short_name','string']).sum()['total_people'].to_frame() # Need to group in this order to be correct further down
df4.reset_index(inplace=True)
df4['from'] = 'source' # Initialize 'from' with the dummy 'source' node (start of a card's day)
df4['to'] = df4['short_name'] # 'to' is the row's own museum
make_link = (df4['user_id'].shift(1)==df4['user_id'])&(df4['date'].shift(1)==df4['date']) # Rows that continue the same card's same-day path
df4.loc[make_link,'from'] = df4['short_name'].shift(1)[make_link] # Overwrite 'source' with the previous museum
df4['s'] = ' ' # Single-character version of 'from': ' ' is the code for the dummy source node
df4['t'] = df4['string'] # Single-character code for the row's own museum
df4.loc[make_link,'s'] = df4['string'].shift(1)[make_link]
# Concatenating the source column alone is not enough: it drops the last museum of each day's path.
# So, for the last entry of a card's day, make 'path' hold two characters (source code plus this museum's code).
df4['path'] = df4['s']
last_of_day = df4['from'].shift(-1)=='source' # The next row starts a new day, so this row ends one
df4.loc[last_of_day,'path'] = df4['path'] + df4['t']
# The trick above misses the very last row of the data, so handle it separately:
df4.loc[df4.index[-1],'path'] = df4['s'].iloc[-1] + df4['t'].iloc[-1]
df4.head()


Out[189]:
user_id entry_time date hour short_name string total_people from to s t path
0 1459702 2016-06-22 10:04:00 2016-06-22 2016-06-22 10:00:00 Uffizi U 1 source Uffizi U
1 1459702 2016-06-22 14:26:00 2016-06-22 2016-06-22 14:00:00 M. Casa Dante 3 1 Uffizi M. Casa Dante U 3 U
2 1459702 2016-06-22 15:49:00 2016-06-22 2016-06-22 15:00:00 Accademia A 1 M. Casa Dante Accademia 3 A 3A
3 1459702 2016-06-23 09:43:00 2016-06-23 2016-06-23 09:00:00 Opera del Duomo D 1 source Opera del Duomo D
4 1459702 2016-06-23 11:14:00 2016-06-23 2016-06-23 11:00:00 M. Galileo G 1 Opera del Duomo M. Galileo D G D

In [190]:
df5 = df4.groupby('user_id')['path'].sum().to_frame() # sum() on strings concatenates 
df5.head()


Out[190]:
path
user_id
1459702 U3A DGVBC
1473903 U D2V PX
1473904 U D2V PX
1473905 N cDAC
1473906 PSUV AIMC2 cl

In [191]:
df6 = df5['path'].apply(lambda x: pd.Series(x.strip().split(' '))) # Now split along strings. Takes a few seconds.
df6.head() # Note: 4 columns is correct: the Firenze card is valid for *72 hours from first use*, not from midnight of the day of first use, so one card can span 4 calendar days!


Out[191]:
0 1 2 3
user_id
1459702 U3A DGVBC NaN NaN
1473903 U D2V PX NaN
1473904 U D2V PX NaN
1473905 N cDAC NaN NaN
1473906 PSUV AIMC2 cl NaN

In [187]:
# df6.head(50) # Data stories just fall out! People traveling together, splitting off, etc. We assume this but strong coupling is hard to ignore.

In [192]:
# Ordered paths
fr1 = frequency(df5,'path')
fr1.head(20) # INSIGHT: the top 15 paths are permutations of Duomo, Uffizi, Accademia. But they are a very small fraction of the total.


Out[192]:
path frequency cumulative ccdf
0 D 337 0.006604 0.993396
1 U 218 0.010876 0.989124
2 AU 201 0.014815 0.985185
3 A 193 0.018597 0.981403
4 DA 189 0.022300 0.977700
5 ADU 186 0.025945 0.974055
6 UA 172 0.029316 0.970684
7 AD 168 0.032608 0.967392
8 DAU 145 0.035449 0.964551
9 U A 115 0.037703 0.962297
10 UDA 106 0.039780 0.960220
11 DU 95 0.041641 0.958359
12 D A 89 0.043385 0.956615
13 D AU 86 0.045071 0.954929
14 DA U 86 0.046756 0.953244
15 D U 79 0.048304 0.951696
16 P 77 0.049813 0.950187
17 AD U 77 0.051322 0.948678
18 A U 70 0.052693 0.947307
19 D UA 60 0.053869 0.946131

In [46]:
fr1.iloc[0:50].plot.bar(x='path',y='frequency',figsize=(24,10))
plt.title('Most common total Firenze card paths (ordered)')
plt.xlabel('x = Encoded path')
plt.ylabel('Number of cards with total path x')
# plt.yscale('log')
plt.show()



In [22]:
# nodes # To make a legend

In [43]:
df7 = df5['path'].apply(lambda x: ''.join(sorted(list(sub(' ','',x))))).to_frame()
df7.head()


Out[43]:
path
user_id
1459702 3ABCDGUV
1473903 2DPUVX
1473904 2DPUVX
1473905 ACDNc
1473906 2ACIMPSUVcl

In [44]:
fr2 = frequency(df7,'path')
fr2.head()


Out[44]:
path frequency cumulative ccdf
0 ADU 1280 0.025083 0.974917
1 ADPU 782 0.040407 0.959593
2 AU 558 0.051341 0.948659
3 AD 503 0.061198 0.938802
4 D 337 0.067802 0.932198

In [47]:
fr2.iloc[0:50].plot.bar(x='path',y='frequency',figsize=(24,10))
plt.title('Most common set of museums visited on Firenze card (unordered paths)')
plt.xlabel('x = Set of encoded museums')
plt.ylabel('Number of cards with total set x')
plt.show()



In [193]:
# How many cards have differing numbers of minors across their entries?
df8 = df.groupby(['user_id','entry_time','short_name'])['minors'].sum().reset_index()[['user_id','minors']].groupby('user_id').nunique()['minors'].to_frame()
df8.columns = ['unique_counts_of_minors']
df8.head()


Out[193]:
unique_counts_of_minors
user_id
1459702 1
1473903 1
1473904 1
1473905 1
1473906 2

In [194]:
frequency(df8, 'unique_counts_of_minors')


Out[194]:
unique_counts_of_minors frequency cumulative ccdf
0 1 49214 0.964394 0.035606
1 2 1745 0.998589 0.001411
2 3 72 1.000000 0.000000

In [195]:
df.set_index('user_id').loc[df8[df8['unique_counts_of_minors']>2].index][['short_name','total_adults','minors']].head()


Out[195]:
short_name total_adults minors
user_id
2030240 M. San Marco 1 0
2030240 Accademia 1 0
2030240 Accademia 0 1
2030240 Opera del Duomo 1 0
2030240 Opera del Duomo 0 1

In [196]:
df5[(df5.index<2030243)&(df5.index>2030238)]


Out[196]:
path
user_id
2030239 P VTCD N
2030240 MADNVTU 3GC
2030241 MADNVTU 3GC
2030242 UAX

In [197]:
# What are the variable numbers of minors? Is it always just 0 vs 1? No, we see more variety. 
df[df['user_id'].isin(df8[df8['unique_counts_of_minors']>2].index)][['user_id','entry_time','short_name','minors']].groupby(['user_id','entry_time','short_name'])['minors'].sum().reset_index()[['user_id','minors']].groupby('user_id')['minors'].value_counts().head(20)


Out[197]:
user_id  minors
2030240  0         5
         2         4
         1         1
2033164  0         7
         1         3
         2         1
2039260  0         2
         1         1
         2         1
2043368  0         6
         1         1
         2         1
2044949  0         4
         1         1
         2         1
2046939  0         4
         1         1
         2         1
2047406  0         3
         1         1
Name: minors, dtype: int64

In [ ]:
cards = df.groupby('user_id').agg({'short_name':'nunique',
                                  'total_adults':'sum',
                                  'minors':'max',
                                  })

In [172]:
# cards = df.groupby('user_id').agg({# 'entrances_per_card_per_museum':'sum', 
#                                    'museum_id':'nunique', # Should be equal to sum of set of is_in columns
#                                    # 'total_duration_card_use':'max', 
#                                    'total_adults':'sum', # This sum should be equal to 'entrances_per_card_per_museum'
#                                    'is_in_museum_1':'max',
#                                    'is_in_museum_2':'max',
#                                    'is_in_museum_3':'max',
#                                    'is_in_museum_4':'max',
#                                    'is_in_museum_5':'max',
#                                    'is_in_museum_6':'max',
#                                    'is_in_museum_7':'max',
#                                    'is_in_museum_8':'max',
#                                    'is_in_museum_9':'max',
#                                    'is_in_museum_10':'max',
#                                    'is_in_museum_11':'max',
#                                    'is_in_museum_12':'max',
#                                    'is_in_museum_13':'max',
#                                    'is_in_museum_14':'max',
#                                    'is_in_museum_15':'max',
#                                    'is_in_museum_16':'max',
#                                    'is_in_museum_17':'max',
#                                    'is_in_museum_18':'max',
#                                    'is_in_museum_19':'max',
#                                    'is_in_museum_20':'max',
#                                    'is_in_museum_21':'max',
#                                    'is_in_museum_22':'max',
#                                    'is_in_museum_23':'max',
#                                    'is_in_museum_24':'max',
#                                    'is_in_museum_25':'max',
#                                    'is_in_museum_26':'max',
#                                    'is_in_museum_27':'max',
#                                    'is_in_museum_28':'max',
#                                    'is_in_museum_29':'max',
#                                    'is_in_museum_30':'max',
#                                    'is_in_museum_31':'max',
#                                    'is_in_museum_32':'max',
#                                    'is_in_museum_33':'max',
#                                    'is_in_museum_34':'max',
#                                    'is_in_museum_35':'max',
#                                    'is_in_museum_36':'max',
#                                    'is_in_museum_37':'max',
#                                    'is_in_museum_38':'max'
#                                   })

In [173]:
# # Reorder correctly
# cards = cards[[# 'entrances_per_card_per_museum', 
#                'museum_id', 
#                # 'total_duration_card_use', 
#                'total_adults', 
#                'is_in_museum_1',
#                'is_in_museum_2',
#                'is_in_museum_3',
#                'is_in_museum_4',
#                'is_in_museum_5',
#                'is_in_museum_6',
#                'is_in_museum_7',
#                'is_in_museum_8',
#                'is_in_museum_9',
#                'is_in_museum_10',
#                'is_in_museum_11',
#                'is_in_museum_12',
#                'is_in_museum_13',
#                'is_in_museum_14',
#                'is_in_museum_15',
#                'is_in_museum_16',
#                'is_in_museum_17',
#                'is_in_museum_18',
#                'is_in_museum_19',
#                'is_in_museum_20',
#                'is_in_museum_21',
#                'is_in_museum_22',
#                'is_in_museum_23',
#                'is_in_museum_24',
#                'is_in_museum_25',
#                'is_in_museum_26',
#                'is_in_museum_27',
#                'is_in_museum_28',
#                'is_in_museum_29',
#                'is_in_museum_30',
#                'is_in_museum_31',
#                'is_in_museum_32',
#                'is_in_museum_33',
#                'is_in_museum_34',
#                'is_in_museum_35',
#                'is_in_museum_36',
#                'is_in_museum_37',
#                'is_in_museum_38'    
# ]]

In [174]:
# # Rename appropriately
# cards.columns = [# 'entrances',
#                  'museums_visited',
#                  # 'use_duration',
#                  'entrances_2',
#                  'visited_museum_1',
#                  'visited_museum_2',
#                  'visited_museum_3',
#                  'visited_museum_4',
#                  'visited_museum_5',
#                  'visited_museum_6',
#                  'visited_museum_7',
#                  'visited_museum_8',
#                  'visited_museum_9',
#                  'visited_museum_10',
#                  'visited_museum_11',
#                  'visited_museum_12',
#                  'visited_museum_13',
#                  'visited_museum_14',
#                  'visited_museum_15',
#                  'visited_museum_16',
#                  'visited_museum_17',
#                  'visited_museum_18',
#                  'visited_museum_19',
#                  'visited_museum_20',
#                  'visited_museum_21',
#                  'visited_museum_22',
#                  'visited_museum_23',
#                  'visited_museum_24',
#                  'visited_museum_25',
#                  'visited_museum_26',
#                  'visited_museum_27',
#                  'visited_museum_28',
#                  'visited_museum_29',
#                  'visited_museum_30',
#                  'visited_museum_31',
#                  'visited_museum_32',
#                  'visited_museum_33',
#                  'visited_museum_34',
#                  'visited_museum_35',
#                  'visited_museum_36',
#                  'visited_museum_37',
#                  'visited_museum_38']

In [175]:
cards.head(20)


Out[175]:
museums_visited entrances_2 visited_museum_1 visited_museum_2 visited_museum_3 visited_museum_4 visited_museum_5 visited_museum_6 visited_museum_7 visited_museum_8 ... visited_museum_29 visited_museum_30 visited_museum_31 visited_museum_32 visited_museum_33 visited_museum_34 visited_museum_35 visited_museum_36 visited_museum_37 visited_museum_38
user_id
1459702 8 8 1 0 1 0 0 0 0 0 ... 1 0 0 1 0 0 0 0 0 0
1473903 6 6 0 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
1473904 6 6 0 1 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
1473905 5 5 1 0 1 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
1473906 11 11 1 1 0 1 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 1
1473907 11 11 1 1 0 1 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 1
1474634 7 7 0 1 1 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 1 0
1474636 7 7 0 1 1 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 1 0
2014298 3 3 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
2016016 4 4 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2016021 4 4 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
2016022 4 4 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
2016024 3 3 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
2017368 2 2 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2017369 2 2 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2017450 3 3 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
2017451 2 2 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
2017452 2 2 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
2017453 7 7 0 0 1 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 1
2017454 8 8 1 1 1 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 1 1

20 rows × 40 columns


In [176]:
pd.DataFrame([cards.iloc[:,1:39].sum(axis=1),cards['entrances_2']]) # Check: the row sums come out at roughly double 'entrances_2' because columns 1:39 also include the 'entrances_2' column itself


Out[176]:
user_id 1459702 1473903 1473904 1473905 1473906 1473907 1474634 1474636 2014298 2016016 ... 2095756 2095757 2095758 2095759 2095760 2095763 2095764 2095765 2095766 2095767
Unnamed 0 16 11 11 10 20 20 14 14 5 8 ... 4 4 1 1 1 2 2 2 2 2
entrances_2 8 6 6 5 11 11 7 7 3 4 ... 2 2 1 1 1 1 1 1 1 1

2 rows × 51031 columns


In [ ]:
# Two tasks. First, do clustering on these people,

In [161]:
X = cards.iloc[:,1:39].as_matrix() # NOTE: columns 1:39 include 'entrances_2' and stop before 'visited_museum_38'; iloc[:,2:40] would give just the 0/1 indicators. (.as_matrix() is deprecated in newer pandas; .values is equivalent.)

In [171]:
Z = linkage(y=X, method='single', metric='jaccard')


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-171-5887c53fbde0> in <module>()
----> 1 Z = linkage(y=X, method='single', metric='jaccard')

/home/mmalik/.local/lib/python2.7/site-packages/scipy/cluster/hierarchy.pyc in linkage(y, method, metric)
    669                          'matrix looks suspiciously like an uncondensed '
    670                          'distance matrix')
--> 671         y = distance.pdist(y, metric)
    672     else:
    673         raise ValueError("`y` must be 1 or 2 dimensional.")

/home/mmalik/.local/lib/python2.7/site-packages/scipy/spatial/distance.pyc in pdist(X, metric, p, w, V, VI)
   1375 
   1376     m, n = s
-> 1377     dm = np.zeros((m * (m - 1)) // 2, dtype=np.double)
   1378 
   1379     # validate input for multi-args metrics

MemoryError: 

In [ ]:
pdist()
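
The MemoryError above is a matter of scale, not logic: pdist has to allocate a condensed distance matrix of m*(m-1)/2 doubles, and with m ≈ 51,000 cards that is on the order of 1.3 billion entries, roughly 10 GB. One hedged workaround is to subsample before clustering; the sample size and the switch to 'average' linkage below are illustrative choices, not part of the analysis above.

In [ ]:
# Hedged sketch: cluster a random subsample of cards so the condensed distance matrix fits in memory.
rng = np.random.RandomState(0)
idx = rng.choice(X.shape[0], size=5000, replace=False)   # hypothetical sample size
D = pdist(X[idx].astype(bool), metric='jaccard')         # condensed pairwise Jaccard distances
Z_sample = linkage(D, method='average')                  # 'average' chains less than 'single' on binary data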

In [166]:
df['museum_name'].nunique()


Out[166]:
40

In [167]:
df['museum_id'].nunique()


Out[167]:
40

In [168]:
df['short_name'].nunique()


Out[168]:
40

In [18]:
df7 = df5['path'].apply(lambda x: pd.Series(len(sub(' ','',x)))) # Number of visits per card: strip the day-separator spaces and count the museum codes

In [19]:
df7.head()


Out[19]:
0
user_id
1459702 8
1473903 6
1473904 6
1473905 5
1473906 11

In [20]:
df7.sort_values(0,ascending=False).head(10)


Out[20]:
0
user_id
2066305 32
2066304 32
2057462 30
2057463 30
2086996 27
2086995 27
2075456 26
2075457 26
2030826 26
2075874 25

In [21]:
df6.loc[df7.sort_values(0,ascending=False).head(10).index]


Out[21]:
0 1 2 3
user_id
2066305 PLYsG3VT Dcl2m9bC7EB Ud0MgqOXIanNA NaN
2066304 PLYsG3VT Dcl2m9bC7EB Ud0MgqOXIanNA NaN
2057462 Ml2cDNbLPVT UBC7amXIdS A0GH3sY6n NaN
2057463 Ml2cDNbLPVT UBC7amXIdS A0GH3sY6n NaN
2086996 PbFHsC7E3GNV DBU2mAIX M0cln9d NaN
2086995 PbFHsC7E3GNV DBU2mAIX M0cln9d NaN
2075456 UGaB3Fp Dl2cm0MOgXIANV dPLCT NaN
2075457 UGaB3Fp Dl2cm0MOgXIANV dPLCT NaN
2030826 sC7SFmT D2lcdNnIA MEbPV HG9aU
2075874 ND MqgOXI02ma AcVTBC73UG PYb

In [22]:
fr2 = frequency(df7,0)
fr2.head()


Out[22]:
0 frequency cumulative ccdf
0 6 5907 0.115753 0.884247
1 5 5632 0.226117 0.773883
2 7 5521 0.334307 0.665693
3 4 5516 0.442398 0.557602
4 8 4858 0.537595 0.462405

In [23]:
f, ax = plt.subplots(figsize=(6,5), dpi=300)
ax.stem(fr2[0],fr2['frequency'], linestyle='steps--')
# yscale('log')
# xscale('log')
ax.set_title('Number of museum visits by Florence Card')
ax.set_ylabel('Frequency')
ax.set_xlabel('Number of museums')
plt.show()
# NOTE: This is the number of *visits*, not people on those cards!! 
# (And, not number of museums visited, this counts multiple visits to the same museum as distinct)



In [24]:
df8 = df.groupby(['user_id','short_name','entry_time']).sum()['total_adults'].to_frame()
df8.head()


Out[24]:
total_adults
user_id short_name entry_time
1459702 Accademia 2016-06-22 15:49:00 1
M. Bargello 2016-06-23 13:41:00 1
M. Casa Dante 2016-06-22 14:26:00 1
M. Galileo 2016-06-23 11:14:00 1
M. Palazzo Vecchio 2016-06-23 12:57:00 1

In [25]:
# Cards with more than one entrance to same museum
df9 = df.groupby(['user_id','short_name']).sum()['total_adults'].to_frame()
df9.columns = ['number_of_entries']
df9[df9['number_of_entries']>1].head(50)


Out[25]:
number_of_entries
user_id short_name
2044654 M. Santa Maria Novella 2
Opera del Duomo 2
Uffizi 2
2045231 Accademia 2
2046704 Accademia 2
2052035 M. Galileo 2
2059646 Santa Croce 2
2059934 M. Palazzo Vecchio 2
2062498 M. Archeologico 2
2065347 M. Palazzo Vecchio 2
2065878 Cappelle Medicee 2
2067085 San Lorenzo 2
2069270 M. Bargello 2
2069663 M. Antropologia 2
2078714 M. Novecento 2
2088128 Pitti 2
2091065 Accademia 2
Opera del Duomo 2
Uffizi 2

In [26]:
df8.shape[0] # Number of entries


Out[26]:
360534

In [27]:
df9.shape[0] # 12 repeat visits. Negligible.


Out[27]:
360522

In [28]:
df9[df9['number_of_entries']==1].shape[0]


Out[28]:
360503

In [29]:
df9[df9['number_of_entries']==2].shape[0]


Out[29]:
19

In [30]:
df9[df9['number_of_entries']>2]


Out[30]:
number_of_entries
user_id short_name

In [18]:
# # This is the number of people who entered on each card entry, not the number of repeat entries! 
# frequency(df.groupby(['user_id','short_name',]).count()['entry_time'].to_frame(),'entry_time')

In [32]:
df9 = df7.reset_index()
df10 = df8.reset_index()
df11 = df9.merge(df10).groupby('user_id').sum()
df11.columns = ['visits','total_people']
df11['persons_per_visit'] = df11['total_people']/df11['visits']
df11.head()


Out[32]:
visits total_people persons_per_visit
user_id
1459702 64 8 0.125000
1473903 36 6 0.166667
1473904 36 6 0.166667
1473905 25 5 0.200000
1473906 121 11 0.090909

In [33]:
# df11[df11['persons_per_visit']>1].plot.scatter(x='visits',y='persons_per_visit')

We now want the following: a measure of similarity between adjacent rows, for detecting people traveling together (on the assumption that people traveling together bought their Firenze cards consecutively, and so appear in adjacent rows).

This is simplest to do naively: nothing statistical, just fuzzy matching through edit distance, which is the number of operations (insertions, deletions, substitutions) needed to change one string into another (or, operations on list elements to change one list into another).

Since each card covers up to three days of visits (spread over as many as four calendar dates), and since we want slight deviations in otherwise identical large itineraries to count for less, we compute the following: a column with the edit distance between each pair of corresponding days in adjacent rows, summed over the days, followed by a column with the total number of visits per row.
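
As a toy illustration of the day-by-day sum (the daily strings below are made up, using the same museum encoding):

In [ ]:
# Hedged illustration: a one-museum deviation on day 2 adds 1 to the summed distance,
# no matter how long the rest of the itinerary is.
days_a = ['U3A', 'DGVBC', '', '']
days_b = ['U3A', 'DGVB',  '', '']
sum(editdistance.eval(a, b) for a, b in zip(days_a, days_b))  # -> 1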


In [34]:
# edit = pdist(X, lambda u, v: np.sqrt(((u-v)**2).sum()))

In [35]:
df6.fillna('',inplace=True)
df6.iloc[0:10]


Out[35]:
0 1 2 3
user_id
1459702 U3A DGVBC
1473903 U D2V PX
1473904 U D2V PX
1473905 N cDAC
1473906 PSUV AIMC2 cl
1473907 PSUV AIMC2 cl
1474634 2D U MABm
1474636 2D U MABm
2014298 PN A
2016016 DV AU

In [36]:
def editdist(pair):
    return editdistance.eval(pair[0],pair[1])

In [37]:
df7 = pd.concat([df6,df6.shift()],axis=1)

In [38]:
df7.columns = ['0','1','2','3','0+','1+','2+','3+']
df7.head()


Out[38]:
0 1 2 3 0+ 1+ 2+ 3+
user_id
1459702 U3A DGVBC NaN NaN NaN NaN
1473903 U D2V PX U3A DGVBC
1473904 U D2V PX U D2V PX
1473905 N cDAC U D2V PX
1473906 PSUV AIMC2 cl N cDAC

In [39]:
# df8 = df7.iloc[:,[0,4,1,5,2,6,3,7]]
# df8.columns = ['0','0+','1','1+','2','2+','3','3+']
# df8.columns = ['0','0+','1','1+','2','2+','3','3+']
# df8.head()

In [40]:
df7['total_edit_distance'] = df7[['0','0+']].apply(editdist,axis=1) + df7[['1','1+']].apply(editdist,axis=1) + df7[['2','2+']].apply(editdist,axis=1) + df7[['3','3+']].apply(editdist,axis=1)
df7.head()


Exception TypeError: "object of type 'float' has no len()" in 'editdistance.bycython.eval' ignored
Exception TypeError: "object of type 'float' has no len()" in 'editdistance.bycython.eval' ignored
Exception TypeError: "object of type 'float' has no len()" in 'editdistance.bycython.eval' ignored
Exception TypeError: "object of type 'float' has no len()" in 'editdistance.bycython.eval' ignored
Out[40]:
0 1 2 3 0+ 1+ 2+ 3+ total_edit_distance
user_id
1459702 U3A DGVBC NaN NaN NaN NaN 0
1473903 U D2V PX U3A DGVBC 7
1473904 U D2V PX U D2V PX 0
1473905 N cDAC U D2V PX 6
1473906 PSUV AIMC2 cl N cDAC 10

In [41]:
df7['len'] = df7['0'].str.len() + df7['1'].str.len() + df7['2'].str.len() + df7['3'].str.len()
df7['len+'] = df7['0+'].str.len() + df7['1+'].str.len() + df7['2+'].str.len() + df7['3+'].str.len()
df7['len_tot'] = df7['len'] + df7['len+']
df7.head()


Out[41]:
0 1 2 3 0+ 1+ 2+ 3+ total_edit_distance len len+ len_tot
user_id
1459702 U3A DGVBC NaN NaN NaN NaN 0 8 NaN NaN
1473903 U D2V PX U3A DGVBC 7 6 8.0 14.0
1473904 U D2V PX U D2V PX 0 6 6.0 12.0
1473905 N cDAC U D2V PX 6 5 6.0 11.0
1473906 PSUV AIMC2 cl N cDAC 10 11 5.0 16.0

In [42]:
fr3 = frequency(df7[df7['total_edit_distance']==0],'len_tot')
fr3


Out[42]:
len_tot frequency cumulative ccdf
0 12.0 2494 0.118221 0.881779
1 8.0 2384 0.231229 0.768771
2 10.0 2375 0.343809 0.656191
3 14.0 2248 0.450370 0.549630
4 16.0 1976 0.544037 0.455963
5 6.0 1792 0.628982 0.371018
6 18.0 1635 0.706485 0.293515
7 20.0 1399 0.772801 0.227199
8 22.0 1126 0.826176 0.173824
9 4.0 1039 0.875427 0.124573
10 24.0 796 0.913159 0.086841
11 26.0 575 0.940415 0.059585
12 2.0 431 0.960846 0.039154
13 28.0 354 0.977626 0.022374
14 30.0 200 0.987107 0.012893
15 32.0 130 0.993269 0.006731
16 34.0 69 0.996540 0.003460
17 36.0 37 0.998294 0.001706
18 38.0 14 0.998957 0.001043
19 40.0 8 0.999336 0.000664
20 44.0 5 0.999573 0.000427
21 42.0 5 0.999810 0.000190
22 60.0 1 0.999858 0.000142
23 64.0 1 0.999905 0.000095
24 54.0 1 0.999953 0.000047
25 52.0 1 1.000000 0.000000


In [44]:
df8 = df7.reset_index(inplace=False)
df8.reset_index(inplace=True)
df8.head()


Out[44]:
index user_id 0 1 2 3 0+ 1+ 2+ 3+ total_edit_distance len len+ len_tot
0 0 1459702 U3A DGVBC NaN NaN NaN NaN 0 8 NaN NaN
1 1 1473903 U D2V PX U3A DGVBC 7 6 8.0 14.0
2 2 1473904 U D2V PX U D2V PX 0 6 6.0 12.0
3 3 1473905 N cDAC U D2V PX 6 5 6.0 11.0
4 4 1473906 PSUV AIMC2 cl N cDAC 10 11 5.0 16.0

In [47]:
# df7[df7['total_edit_distance']==0].hist('len_tot',bins=100, grid=False, figsize=[16,8])
f, ax = plt.subplots(figsize=(12,5), dpi=300)
ax.stem(fr3['len_tot']/2,fr3['frequency'], linestyle='steps--')
# yscale('log')
# xscale('log')
ax.set_title('Number of museums in perfectly matched consecutive paths')
ax.set_ylabel('Number of cards')
ax.set_xlabel('Number of museums')
plt.show()
# NOTE: This is the number of *visits*, not people on those cards!! 
# (And, not number of museums visited, this counts multiple visits to the same museum as distinct)



In [ ]:
# df8.hist('user_id',bins=1000,figsize=[8,8])

In [ ]:
# df8[df8['user_id']>1500000].hist('user_id',bins=1000,figsize=[8,8])

In [ ]:
# df8.plot.scatter(x='index',y='total_edit_distance',figsize=[16,16], c=2+(df8['total_edit_distance']>0))
# sns.jointplot(x="index", y="total_edit_distance", data=df8)#, hue=(df9['total_edit_distance']==0))
# sns.jointplot(x="index", y="total_edit_distance", data=df8, kind='hex')

In [ ]:


In [ ]:
sns.jointplot(x="total_edit_distance", y="len_tot", data=df8)

In [ ]:
sns.jointplot(x="total_edit_distance", y="len_tot", data=df8, kind='hex')

In [ ]:
sns.jointplot(x="total_edit_distance", y="len_tot", data=df8, kind='kde')

Now, we need to extract runs of consecutive rows with zero edit distance.


In [ ]:
df8['dist_gt_0'] = 1*(df8['total_edit_distance'] != 0)
# df8['offset'] = 1*(df8['zero_dist'] + df8['zero_dist'].shift()==0)
df8['group'] = df8['dist_gt_0'].cumsum() # Running count of non-matching rows; consecutive matching cards share a group id
df8.head(50)

In [ ]:
df9 = df8[['group','user_id']].groupby('group').count()
df9.columns = ['people']
df9.head()

In [ ]:
frequency(df9,'people')

In [3]:
# # The code below was my attempt to get a node for starting the day and ending the day from the paths. 
# # The problem is that this gives the number of _cards_, not number of people! I had to go back to the
# # dynamic edgelist construction anyway. 
# df6.head()

In [4]:
# df9 = df5['s2'].apply(lambda x: pd.Series(x.strip().split(' ')))
# df9.fillna(' ',inplace=True)
# df9['0_first'] = df9[0].apply(lambda x: pd.Series(x[0]))
# df9['0_last'] = df9[0].apply(lambda x: pd.Series(x[-1]))
# df9['0_len'] = df9[0].apply(lambda x: pd.Series(len(x)))
# df9['1_first'] = df9[1].apply(lambda x: pd.Series(x[0]))
# df9['1_last'] = df9[1].apply(lambda x: pd.Series(x[-1]))
# df9['1_len'] = df9[1].apply(lambda x: pd.Series(len(x)))
# df9['2_first'] = df9[2].apply(lambda x: pd.Series(x[0]))
# df9['2_last'] = df9[2].apply(lambda x: pd.Series(x[-1]))
# df9['2_len'] = df9[2].apply(lambda x: pd.Series(len(x)))
# df9['3_first'] = df9[3].apply(lambda x: pd.Series(x[0]))
# df9['3_last'] = df9[3].apply(lambda x: pd.Series(x[-1]))
# df9['3_len'] = df9[3].apply(lambda x: pd.Series(len(x)))
# df9.head()

In [5]:
# df9.replace(' ',np.nan,inplace=True)
# df9.head()

In [6]:
# from_home = frequency(df9[['0_first','1_first','2_first','3_first']].stack().to_frame(),0)[[0,'frequency']]
# from_home.columns = ['0','from_home']
# from_home.set_index('0',inplace=True)
# from_home.head()

In [7]:
# only = frequency(pd.concat(
#     [df9[(df9['0_len']==1)&(df9['0_first'].notnull())]['0_first'], 
#      df9[(df9['1_len']==1)&(df9['1_first'].notnull())]['1_first'], 
#      df9[(df9['2_len']==1)&(df9['2_first'].notnull())]['2_first'], 
#      df9[(df9['3_len']==1)&(df9['3_first'].notnull())]['3_first']
#     ],axis=0).to_frame()
# ,0)[[0,'frequency']]
# only.columns = ['0','only']
# only.set_index('0',inplace=True)
# only.head()

In [8]:
# to_home = frequency(df9[['0_last','1_last','2_last','3_last']].stack().to_frame(),0)[[0,'frequency']]
# to_home.columns = ['0','to_home']
# to_home.set_index('0',inplace=True)
# to_home.head()

In [9]:
# from_to_home = nodes.set_index('string')['short_name'].to_frame().join([from_home,to_home,only])
# from_to_home.set_index('short_name',inplace=True)
# from_to_home.columns = ['home_to_node','node_to_home','only_visit_of_day']
# # from_to_home['from_home'] = from_to_home['from_home_incl_only'] - from_to_home['only_visit_of_day']
# # from_to_home['to_home'] = from_to_home['to_home_incl_only'] - from_to_home['only_visit_of_day']
# from_to_home.head()

In [10]:
# from_to_home['home_to_node'].sort_values(ascending=False).to_frame().head(20)

In [11]:
# from_to_home['node_to_home'].sort_values(ascending=False).to_frame().head(20)

In [12]:
# from_to_home.reset_index(inplace=True)

In [13]:
# from_to_home

In [14]:
# supp_edges = pd.DataFrame({'from':['home']*from_to_home.shape[0] + from_to_home['short_name'].tolist(),
#                           'to':from_to_home['short_name'].tolist() + ['home']*from_to_home.shape[0],
#                           'weight':from_to_home['home_to_node'].tolist() + from_to_home['node_to_home'].tolist() })

In [15]:
# supp_edges.dropna(how='any',inplace=True)
# supp_edges

In [ ]:
frequency(df6,0).head()

In [ ]:
frequency(df6,1).head()

In [ ]:
frequency(df6,2).head()

In [ ]:
frequency(df6,3).head()

Now, I want a set of scatterplots between these frequencies.
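
One way to get those scatterplots would be to merge the per-day frequency tables on the path string and plot one day's frequency against another's; a hedged sketch (the column names and the log scale are illustrative choices):

In [ ]:
# Hedged sketch: day-1 vs day-2 frequency of each daily path string.
f0 = frequency(df6, 0).rename(columns={0: 'path', 'frequency': 'day1_freq'})[['path','day1_freq']]
f1 = frequency(df6, 1).rename(columns={1: 'path', 'frequency': 'day2_freq'})[['path','day2_freq']]
m = f0.merge(f1, on='path', how='inner')
m.plot.scatter(x='day1_freq', y='day2_freq', loglog=True, figsize=(6,6))
plt.title('Daily path frequency: day 1 vs day 2')
plt.show()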


In [33]:
pt = pd.concat([frequency(df6,0),frequency(df6,1),frequency(df6,2),frequency(df6,3)])
pt['daily_path'] = pt[0].replace(np.nan, '', regex=True) + pt[1].replace(np.nan, '', regex=True) + pt[2].replace(np.nan, '', regex=True) + pt[3].replace(np.nan, '', regex=True)
pt.drop([0,1,2,3,'ccdf','cumulative'],axis=1,inplace=True)
pt.head()


Out[33]:
frequency daily_path
0 3572 D
1 2317 U
2 1585 P
3 1486 A
4 1232 V

In [34]:
pt2 = pt.groupby('daily_path').sum()
pt2.sort_values('frequency', inplace=True, ascending=False)
pt2.head()


Out[34]:
frequency
daily_path
D 5932
P 5643
U 5420
A 3604
V 1909

In [35]:
pt2[pt2['frequency']>200].plot.bar(figsize=(16,8))
plt.title('Most common daily Firenze card paths across all days')
plt.xlabel('x = Encoded path')
plt.ylabel('Number of cards with daily path x')
# plt.yscale('log')
plt.show()



In [36]:
nodes.head()


Out[36]:
museum_name longitude latitude museum_id short_name string
0 Basilica di Santa Croce 11.262598 43.768754 1 Santa Croce C
1 Basilica San Lorenzo 11.254430 43.774932 2 San Lorenzo 2
2 Battistero di San Giovanni 11.254966 43.773131 3 Opera del Duomo D
3 Biblioteca Medicea Laurenziana 11.253924 43.774799 4 Laurenziana l
4 Cappella Brancacci 11.243859 43.768334 5 Brancacci b

In [37]:
# For reference, here are the displayed museums
# nodes[['string','short_name']].set_index('string').reindex(['D','P','U','A','V','T','N','C','G','B','S','c','m','M','b','Y','2'])
nodes[nodes['string'].isin(['D','P','U','A','V','T','N','C','G','B','S','c','m','M','b','Y','2'])][['string','short_name']]


Out[37]:
string short_name
0 C Santa Croce
1 2 San Lorenzo
2 D Opera del Duomo
4 b Brancacci
5 c Cappelle Medicee
9 U Uffizi
10 A Accademia
19 M M. San Marco
20 N M. Santa Maria Novella
23 G M. Galileo
26 B M. Bargello
40 V M. Palazzo Vecchio
48 m Palazzo Medici
49 S Palazzo Strozzi
50 T Torre di Palazzo Vecchio
51 Y V. Bardini
62 P Pitti

In [38]:
df6[pd.isnull(df6[0].str[0])].head()


Out[38]:
0 1 2 3
user_id

In [26]:
df6.to_csv('encoded_paths.csv')

In [27]:
nodes.to_csv('encoded_paths_legend.csv')

In [ ]:


In [39]:
df6.values


Out[39]:
array([['U3A', 'DGVBC', nan, nan],
       ['U', 'D2V', 'PX', nan],
       ['U', 'D2V', 'PX', nan],
       ..., 
       ['D', nan, nan, nan],
       ['D', nan, nan, nan],
       ['D', nan, nan, nan]], dtype=object)

In [ ]:


In [ ]: