In [16]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from pylab import *
import igraph as ig # Need to install this in your virtual environment
from re import sub
import editdistance # Needs to be installed
from scipy.spatial.distance import pdist, squareform
# Usage:
# editdistance.eval('banana', 'bahama')
import seaborn as sns
In [17]:
import sys
sys.path.append('../../src/')
from utils.database import dbutils
conn = dbutils.connect()
cursor = conn.cursor()
In [3]:
nodes = pd.read_sql('select * from optourism.firenze_card_locations', con=conn)
In [4]:
df = pd.read_sql('select * from optourism.firenze_card_logs', con=conn)
df['museum_id'].replace(to_replace=39,value=38,inplace=True)
df['short_name'] = df['museum_id'].replace(dict(zip(nodes['museum_id'],nodes['short_name'])))
df['string'] = df['museum_id'].replace(dict(zip(nodes['museum_id'],nodes['string'])))
df['date'] = pd.to_datetime(df['entry_time'], format='%Y-%m-%d %H:%M:%S').dt.date
df['hour'] = pd.to_datetime(df['date']) + pd.to_timedelta(pd.to_datetime(df['entry_time'], format='%Y-%m-%d %H:%M:%S').dt.hour, unit='h')
df['total_people'] = df['total_adults'] + df['minors']
In [5]:
# Helper function for making summary tables/distributions
def frequency(dataframe, columnname):
    out = dataframe[columnname].value_counts().to_frame()
    out.columns = ['frequency']
    out.index.name = columnname
    out.reset_index(inplace=True)
    out.sort_values('frequency', inplace=True, ascending=False)
    out['cumulative'] = out['frequency'].cumsum()/out['frequency'].sum()
    out['ccdf'] = 1 - out['cumulative']
    return out
I propose distinguishing paths from flows. A path is an itinerary; a flow is the number of people who take that path. E.g., a family or a tour group produces one path, but adds multiple people to the overall flow.
We now build a transition graph, a directed graph where an edge represents a person going from one museum to another within the same day.
We also produce the transition matrix, a row-normalized n-by-n matrix of the frequency of transitions from the row node to the column node. If you take a vector of the current volumes at each location and multiply it by the transition matrix, you get a prediction for the volume at each node at the next time step. This prediction can be refined with corrections for daily/weekly patterns and such.
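To make the prediction concrete, here is a minimal sketch (assuming the 'from'/'to' edge list that df4 acquires in the cells below; trans, volumes, and predicted are illustrative names, not variables used elsewhere in this notebook):
In [ ]:
# Sketch: row-normalized museum-to-museum transition matrix from the edge list,
# then one step of volume propagation. Assumes df4 has been built as below.
counts = df4[df4['from']!='source'].groupby(['from','to']).size().unstack(fill_value=0) # transition counts
trans = counts.div(counts.sum(axis=1), axis=0) # Row-normalize: each row sums to 1
volumes = pd.Series(1.0, index=trans.index) # Stand-in for current volume at each node
predicted = volumes.dot(trans) # Expected volume at each node at the next step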
To make paths:
We want a dataframe with the user, the museum they went from, the museum they went to, the number of people on the card, and the time of entry to the next museum.
We will drop much of this data in creating paths, which will be concatenations of single-character codes for each museum.
To mark the first visit of each day, we add a dummy "source" node that everybody starts each day from. We give it the character code " ", so we can later split(" ") along it (see the toy example below).
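A toy example of the encoding (the museum codes here are made up for illustration):
In [ ]:
# One card over two days: each visit appends one character, and each new day
# is prefixed with the ' ' source marker, so a card's whole history is one string.
s2 = ' DUA PV'
print(s2.strip().split(' ')) # ['DUA', 'PV'] -- one string per day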
In [6]:
df4 = df.groupby(['user_id','entry_time','date','hour','museum_name','short_name','string']).sum()['total_people'].to_frame() # Need to group in this order to be correct further down
df4.reset_index(inplace=True)
In [7]:
df4['from'] = 'source' # Initialize 'from' column to the dummy 'source' node
df4['to'] = df4['short_name'] # 'to' is the row's own museum
In [8]:
make_link = (df4['user_id'].shift(1)==df4['user_id'])&(df4['date'].shift(1)==df4['date']) # Rows whose previous row is the same card on the same day
df4.loc[make_link, 'from'] = df4['short_name'].shift(1)[make_link] # Overwrite 'source' with the previous museum
In [9]:
df4['s'] = ' ' # Single-character analogue of 'from': ' ' is the source marker
df4['t'] = df4['string'] # Single-character code of the row's own museum
df4.loc[make_link, 's'] = df4['string'].shift(1)[make_link]
In [10]:
# Concatenating the source column is not enough: it leaves out the last place in the path.
# Need to add a second 'source' column that, for the last item in a day's path, contains two characters.
df4['s2'] = df4['s']
is_last = df4['from'].shift(-1)=='source' # True at the last visit of each card-day
df4.loc[is_last, 's2'] = (df4['s2'] + df4['t'])[is_last]
# Note: the above trick doesn't work for the last row of data. So, do this as well:
df4.loc[df4.index[-1], 's2'] = df4['s'].iloc[-1] + df4['t'].iloc[-1]
In [11]:
df5 = df4.groupby('user_id')['s2'].sum().to_frame() # sum() on strings concatenates
df5.head()
Out[11]:
In [12]:
df6 = df5['s2'].apply(lambda x: pd.Series(x.strip().split(' '))) # Now split along spaces. Takes a few seconds.
df6.head() # Note: 4 columns is correct; the Firenze card is valid *72 hours from first use*, not from midnight of the day of first use!
Out[12]:
In [13]:
df6.head(50) # Data stories just fall out: people traveling together, splitting off, etc. We can only assume this, but the strong coupling is hard to ignore.
Out[13]:
In [14]:
fr1 = frequency(df5,'s2')
In [15]:
# INSIGHT: the top 15 paths are permutations of Duomo, Uffizi, Accademia.
fr1.head(50)
Out[15]:
In [16]:
fr1.iloc[0:50].plot.bar(x='s2',y='frequency',figsize=(24,10))
plt.title('Most common total Firenze card paths')
plt.xlabel('x = Encoded path')
plt.ylabel('Number of cards with total path x')
# plt.yscale('log')
plt.show()
In [17]:
nodes
Out[17]:
In [18]:
df7 = df5['s2'].apply(lambda x: pd.Series(len(sub(' ','',x)))) # Visits per card: drop the day separators and count the remaining characters
In [19]:
df7.head()
Out[19]:
In [20]:
df7.sort_values(0,ascending=False).head(10)
Out[20]:
In [21]:
df6.loc[df7.sort_values(0,ascending=False).head(10).index]
Out[21]:
In [22]:
fr2 = frequency(df7,0)
fr2.head()
Out[22]:
In [23]:
f, ax = plt.subplots(figsize=(6,5), dpi=300)
ax.stem(fr2[0],fr2['frequency'], linestyle='steps--')
# yscale('log')
# xscale('log')
ax.set_title('Number of museum visits per Firenze card')
ax.set_ylabel('Frequency')
ax.set_xlabel('Number of museums')
plt.show()
# NOTE: This is the number of *visits*, not people on those cards!!
# (And, not number of museums visited, this counts multiple visits to the same museum as distinct)
In [24]:
df8 = df.groupby(['user_id','short_name','entry_time']).sum()['total_adults'].to_frame()
df8.head()
Out[24]:
In [25]:
# Cards with more than one entrance to same museum
df9 = df.groupby(['user_id','short_name']).sum()['total_adults'].to_frame()
df9.columns = ['number_of_entries']
df9[df9['number_of_entries']>1].head(50)
Out[25]:
In [26]:
df8.shape[0] # Number of entries
Out[26]:
In [27]:
df9.shape[0] # 12 repeat visits. Negligible.
Out[27]:
In [28]:
df9[df9['number_of_entries']==1].shape[0]
Out[28]:
In [29]:
df9[df9['number_of_entries']==2].shape[0]
Out[29]:
In [30]:
df9[df9['number_of_entries']>2]
Out[30]:
In [18]:
# # This is the number of people who entered on each card entry, not the number of repeat entries!
# frequency(df.groupby(['user_id','short_name',]).count()['entry_time'].to_frame(),'entry_time')
In [32]:
df9 = df7.reset_index()
df10 = df8.reset_index()
df11 = df9.merge(df10).groupby('user_id').sum()
df11.columns = ['visits','total_people']
df11['persons_per_visit'] = df11['total_people']/df11['visits']
df11.head()
Out[32]:
In [33]:
# df11[df11['persons_per_visit']>1].plot.scatter(x='visits',y='persons_per_visit')
We now want the following: a measure of similarity between adjacent rows, for detecting people traveling together (on the assumption that they bought their Firenze cards consecutively).
This is simplest to do naively: nothing statistical, just fuzzy matching via edit distance, which is the number of operations (insertions, deletions, substitutions) needed to change one string into another (or operations on list elements to change one list into another).
Since a card's visits span up to four days, and since we want slight deviations in otherwise identical large itineraries to count less, we calculate the following: a column with the edit distance between each corresponding pair of days in adjacent rows, summed over the days, followed by a column with the total number of visits per row.
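A quick check of the measure on hypothetical day strings (values made up; the real computation is below):
In [ ]:
card_a = ['DUA','PV','',''] # Hypothetical day columns of two adjacent cards
card_b = ['DUA','P','','']
print(sum(editdistance.eval(a,b) for a,b in zip(card_a,card_b))) # 1: one deletion turns 'PV' into 'P'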
In [34]:
# edit = pdist(X, lambda u, v: np.sqrt(((u-v)**2).sum()))
In [35]:
df6.fillna('',inplace=True)
df6.iloc[0:10]
Out[35]:
In [36]:
def editdist(pair):
    return editdistance.eval(pair[0], pair[1])
In [37]:
df7 = pd.concat([df6,df6.shift()],axis=1) # Pair each card's day columns with the previous card's (shifted one row down)
In [38]:
df7.columns = ['0','1','2','3','0+','1+','2+','3+']
df7.head()
Out[38]:
In [39]:
# df8 = df7.iloc[:,[0,4,1,5,2,6,3,7]]
# df8.columns = ['0','0+','1','1+','2','2+','3','3+']
# df8.head()
In [40]:
# Sum of per-day edit distances between each card and the previous card
df7['total_edit_distance'] = df7[['0','0+']].apply(editdist,axis=1) + df7[['1','1+']].apply(editdist,axis=1) + df7[['2','2+']].apply(editdist,axis=1) + df7[['3','3+']].apply(editdist,axis=1)
df7.head()
Out[40]:
In [41]:
df7['len'] = df7['0'].str.len() + df7['1'].str.len() + df7['2'].str.len() + df7['3'].str.len()
df7['len+'] = df7['0+'].str.len() + df7['1+'].str.len() + df7['2+'].str.len() + df7['3+'].str.len()
df7['len_tot'] = df7['len'] + df7['len+']
df7.head()
Out[41]:
In [42]:
fr3 = frequency(df7[df7['total_edit_distance']==0],'len_tot')
fr3
Out[42]:
In [44]:
df8 = df7.reset_index(inplace=False) # First reset recovers user_id as a column
df8.reset_index(inplace=True) # Second reset adds a positional 'index' column
df8.head()
Out[44]:
In [47]:
# df7[df7['total_edit_distance']==0].hist('len_tot',bins=100, grid=False, figsize=[16,8])
f, ax = plt.subplots(figsize=(12,5), dpi=300)
ax.stem(fr3['len_tot']/2,fr3['frequency'], linestyle='steps--')
# yscale('log')
# xscale('log')
ax.set_title('Number of museums in perfectly matched consecutive paths')
ax.set_ylabel('Number of cards')
ax.set_xlabel('Number of museums')
plt.show()
# NOTE: This is the number of *visits*, not people on those cards!!
# (And, not number of museums visited, this counts multiple visits to the same museum as distinct)
In [ ]:
# df8.hist('user_id',bins=1000,figsize=[8,8])
In [ ]:
# df8[df8['user_id']>1500000].hist('user_id',bins=1000,figsize=[8,8])
In [ ]:
# df8.plot.scatter(x='index',y='total_edit_distance',figsize=[16,16], c=2+(df8['total_edit_distance']>0))
# sns.jointplot(x="index", y="total_edit_distance", data=df8)#, hue=(df9['total_edit_distance']==0))
# sns.jointplot(x="index", y="total_edit_distance", data=df8, kind='hex')
In [ ]:
sns.jointplot(x="total_edit_distance", y="len_tot", data=df8)
In [ ]:
sns.jointplot(x="total_edit_distance", y="len_tot", data=df8, kind='hex')
In [ ]:
sns.jointplot(x="total_edit_distance", y="len_tot", data=df8, kind='kde')
Now we need to extract runs of consecutive rows with zero edit distance.
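The trick, on a toy series (values made up): cumulative-summing the "distance > 0" indicator gives each nonzero row, plus the run of zero-distance rows that follows it, a shared label.
In [ ]:
d = pd.Series([3,0,0,1,0,2]) # Stand-in for total_edit_distance
print((1*(d != 0)).cumsum().tolist()) # [1, 1, 1, 2, 2, 3]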
In [ ]:
df8['dist_gt_0'] = 1*(df8['total_edit_distance'] != 0) # Indicator: this card's itinerary differs from the previous card's
# df8['offset'] = 1*(df8['zero_dist'] + df8['zero_dist'].shift()==0)
df8['group'] = df8['dist_gt_0'].cumsum() # Each zero-distance run shares the label of the row that starts it
df8.head(50)
In [ ]:
df9 = df8[['group','user_id']].groupby('group').count()
df9.columns = ['people'] # Counts cards per matched group (a proxy for party size)
df9.head()
In [ ]:
frequency(df9,'people')
In [3]:
# # The code below was my attempt to get a node for starting the day and ending the day from the paths.
# # The problem is that this gives the number of _cards_, not number of people! I had to go back to the
# # dynamic edgelist construction anyway.
# df6.head()
In [4]:
# df9 = df5['s2'].apply(lambda x: pd.Series(x.strip().split(' ')))
# df9.fillna(' ',inplace=True)
# df9['0_first'] = df9[0].apply(lambda x: pd.Series(x[0]))
# df9['0_last'] = df9[0].apply(lambda x: pd.Series(x[-1]))
# df9['0_len'] = df9[0].apply(lambda x: pd.Series(len(x)))
# df9['1_first'] = df9[1].apply(lambda x: pd.Series(x[0]))
# df9['1_last'] = df9[1].apply(lambda x: pd.Series(x[-1]))
# df9['1_len'] = df9[1].apply(lambda x: pd.Series(len(x)))
# df9['2_first'] = df9[2].apply(lambda x: pd.Series(x[0]))
# df9['2_last'] = df9[2].apply(lambda x: pd.Series(x[-1]))
# df9['2_len'] = df9[2].apply(lambda x: pd.Series(len(x)))
# df9['3_first'] = df9[3].apply(lambda x: pd.Series(x[0]))
# df9['3_last'] = df9[3].apply(lambda x: pd.Series(x[-1]))
# df9['3_len'] = df9[3].apply(lambda x: pd.Series(len(x)))
# df9.head()
In [5]:
# df9.replace(' ',np.nan,inplace=True)
# df9.head()
In [6]:
# from_home = frequency(df9[['0_first','1_first','2_first','3_first']].stack().to_frame(),0)[[0,'frequency']]
# from_home.columns = ['0','from_home']
# from_home.set_index('0',inplace=True)
# from_home.head()
In [7]:
# only = frequency(pd.concat(
# [df9[(df9['0_len']==1)&(df9['0_first'].notnull())]['0_first'],
# df9[(df9['1_len']==1)&(df9['1_first'].notnull())]['1_first'],
# df9[(df9['2_len']==1)&(df9['2_first'].notnull())]['2_first'],
# df9[(df9['3_len']==1)&(df9['3_first'].notnull())]['3_first']
# ],axis=0).to_frame()
# ,0)[[0,'frequency']]
# only.columns = ['0','only']
# only.set_index('0',inplace=True)
# only.head()
In [8]:
# to_home = frequency(df9[['0_last','1_last','2_last','3_last']].stack().to_frame(),0)[[0,'frequency']]
# to_home.columns = ['0','to_home']
# to_home.set_index('0',inplace=True)
# to_home.head()
In [9]:
# from_to_home = nodes.set_index('string')['short_name'].to_frame().join([from_home,to_home,only])
# from_to_home.set_index('short_name',inplace=True)
# from_to_home.columns = ['home_to_node','node_to_home','only_visit_of_day']
# # from_to_home['from_home'] = from_to_home['from_home_incl_only'] - from_to_home['only_visit_of_day']
# # from_to_home['to_home'] = from_to_home['to_home_incl_only'] - from_to_home['only_visit_of_day']
# from_to_home.head()
In [10]:
# from_to_home['home_to_node'].sort_values(ascending=False).to_frame().head(20)
In [11]:
# from_to_home['node_to_home'].sort_values(ascending=False).to_frame().head(20)
In [12]:
# from_to_home.reset_index(inplace=True)
In [13]:
# from_to_home
In [14]:
# supp_edges = pd.DataFrame({'from':['home']*from_to_home.shape[0] + from_to_home['short_name'].tolist(),
# 'to':from_to_home['short_name'].tolist() + ['home']*from_to_home.shape[0],
# 'weight':from_to_home['home_to_node'].tolist() + from_to_home['node_to_home'].tolist() })
In [15]:
# supp_edges.dropna(how='any',inplace=True)
# supp_edges
In [ ]:
frequency(df6,0).head()
In [ ]:
frequency(df6,1).head()
In [ ]:
frequency(df6,2).head()
In [ ]:
frequency(df6,3).head()
Now I want a set of scatterplots between these frequencies; a sketch of one follows.
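Here is a sketch of one such scatterplot (not computed in this notebook; 'path', 'day1', and 'day2' are illustrative column names): the frequency of each daily path on day 1 against its frequency on day 2.
In [ ]:
d0 = frequency(df6,0).rename(columns={0:'path','frequency':'day1'})[['path','day1']]
d1 = frequency(df6,1).rename(columns={1:'path','frequency':'day2'})[['path','day2']]
d0.merge(d1,on='path').plot.scatter(x='day1',y='day2',loglog=True,figsize=(6,6))
plt.show()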
In [33]:
pt = pd.concat([frequency(df6,0),frequency(df6,1),frequency(df6,2),frequency(df6,3)])
pt['daily_path'] = pt[0].replace(np.nan, '', regex=True) + pt[1].replace(np.nan, '', regex=True) + pt[2].replace(np.nan, '', regex=True) + pt[3].replace(np.nan, '', regex=True)
pt.drop([0,1,2,3,'ccdf','cumulative'],axis=1,inplace=True)
pt.head()
Out[33]:
In [34]:
pt2 = pt.groupby('daily_path').sum()
pt2.sort_values('frequency', inplace=True, ascending=False)
pt2.head()
Out[34]:
In [35]:
pt2[pt2['frequency']>200].plot.bar(figsize=(16,8))
plt.title('Most common daily Firenze card paths across all days')
plt.xlabel('x = Encoded path')
plt.ylabel('Number of cards with daily path x')
# plt.yscale('log')
plt.show()
In [36]:
nodes.head()
Out[36]:
In [37]:
# For reference, here are the displayed museums
# nodes[['string','short_name']].set_index('string').reindex(['D','P','U','A','V','T','N','C','G','B','S','c','m','M','b','Y','2'])
nodes[nodes['string'].isin(['D','P','U','A','V','T','N','C','G','B','S','c','m','M','b','Y','2'])][['string','short_name']]
Out[37]:
In [38]:
df6[pd.isnull(df6[0].str[0])].head()
Out[38]:
In [26]:
df6.to_csv('encoded_paths.csv')
In [27]:
nodes.to_csv('encoded_paths_legend.csv')
In [39]:
df6.values
Out[39]: