Path analysis


In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from pylab import *

import igraph as ig # Need to install this in your virtual environment

import psycopg2
from re import sub

import editdistance # Needs to be installed

# from pymining import seqmining

In [3]:
# import os
# import sys
# sys.path.append('/home/mmalik/optourism-repo' + "/pipeline")
# from firenzecard_analyzer import *

In [2]:
# TODO: connect with dbutils
# Open the PostgreSQL connection that the pd.read_sql calls below pass as `con`.
# NOTE(review): the connection string is blank here (presumably scrubbed before
# sharing) -- supply it from configuration/environment rather than hard-coding it.
conn_str = ""
conn = psycopg2.connect(conn_str)
cursor = conn.cursor()  # NOTE(review): not referenced by any cell visible in this notebook

Then, load the data (takes a few moments):


In [5]:
# Museum lookup table. Its museum_id, short_name and single-character `string`
# columns are used below to label each row of the visit log.
nodes = pd.read_sql('select * from optourism.firenze_card_locations', con=conn)
nodes.head()


Out[5]:
museum_name longitude latitude museum_id short_name string
0 Basilica di Santa Croce 11.262598 43.768754 1 Santa Croce C
1 Basilica San Lorenzo 11.254430 43.774932 2 San Lorenzo 2
2 Battistero di San Giovanni 11.254966 43.773131 3 Opera del Duomo D
3 Biblioteca Medicea Laurenziana 11.253924 43.774799 4 Laurenziana l
4 Cappella Brancacci 11.243859 43.768334 5 Brancacci b

In [16]:
# Load every card swipe, then derive the label and time-bucket columns used below.
df = pd.read_sql('select * from optourism.firenze_card_logs', con=conn)

# Fold museum_id 39 into 38 (presumably two ids for the same site -- TODO confirm).
# The result is assigned back instead of calling replace(inplace=True) on the
# df['museum_id'] accessor: that chained in-place form mutates a temporary under
# pandas copy-on-write and would silently leave df unchanged.
df['museum_id'] = df['museum_id'].replace(to_replace=39, value=38)

# Translate ids to the display name and the single-character path code from `nodes`.
# replace() leaves ids that are missing from `nodes` untouched.
df['short_name'] = df['museum_id'].replace(dict(zip(nodes['museum_id'], nodes['short_name'])))
df['string'] = df['museum_id'].replace(dict(zip(nodes['museum_id'], nodes['string'])))

# Parse the timestamp once and reuse it for both derived columns.
entry_ts = pd.to_datetime(df['entry_time'], format='%Y-%m-%d %H:%M:%S')
df['date'] = entry_ts.dt.date
# 'hour' is the entry time truncated to the start of its hour.
df['hour'] = pd.to_datetime(df['date']) + pd.to_timedelta(entry_ts.dt.hour, unit='h')
df.head()


Out[16]:
user_id museum_name entry_time adults_first_use adults_reuse total_adults minors museum_id short_name string date hour
0 2089098 Palazzo Pitti 2016-09-19 14:49:00 0 1 1 0 38 Pitti P 2016-09-19 2016-09-19 14:00:00
1 2089099 Palazzo Pitti 2016-09-19 14:49:00 0 1 1 0 38 Pitti P 2016-09-19 2016-09-19 14:00:00
2 2083344 Palazzo Pitti 2016-09-19 14:57:00 0 1 1 0 38 Pitti P 2016-09-19 2016-09-19 14:00:00
3 2083335 Palazzo Pitti 2016-09-19 14:57:00 0 1 1 0 38 Pitti P 2016-09-19 2016-09-19 14:00:00
4 2083304 Palazzo Pitti 2016-09-19 14:58:00 0 1 1 0 38 Pitti P 2016-09-19 2016-09-19 14:00:00

In [17]:
# Helper function for making summary tables/distributions
def frequency(dataframe,columnname):
    """Tabulate how often each distinct value of one column occurs.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to summarise.
    columnname : hashable
        Label of the column to count (missing values are not counted).

    Returns
    -------
    pandas.DataFrame
        One row per distinct value, most frequent first, with columns
        `columnname` (the value), 'frequency' (its count), 'cumulative'
        (running share of all counted rows) and 'ccdf' (1 - cumulative).
    """
    counts = dataframe[columnname].value_counts().to_frame()
    counts.columns = ['frequency']
    counts.index.name = columnname

    # Move the counted values out of the index into an ordinary column, then rank.
    table = counts.reset_index().sort_values('frequency', ascending=False)

    total = table['frequency'].sum()
    table['cumulative'] = table['frequency'].cumsum()/total
    table['ccdf'] = 1 - table['cumulative']
    return table

I propose distinguishing paths from flows. A path is an itinerary, and the flow is the number of people who take that path. E.g., a family or a tour group produces one path, but adds multiple people to the overall flow.

We now build a transition graph, a directed graph where an edge represents a person going from one museum to another within the same day.

We also produce the transition matrix, a row-normalized n-by-n matrix of the frequency of transition from the row node to the column node. If you take a vector of the current volumes in each location, and multiply that by the transition matrix, you get a prediction for the number of people at each node at the next time step. This prediction can be refined with corrections for daily/weekly patterns and such.

Path analysis

To make paths:

We want a dataframe with user, the museum they went from and the museum they went to, the number of people on the card, and the time of entry to the next museum.

We will drop much of this data in creating paths, which will be concatenations of single-character codes for each museum.

To track the first visit per day, we add a dummy "source" node that everybody starts each day from. We give it the character code " ", and can then split(" ") along it.


In [18]:
# Collapse the log to one row per card swipe. The key order is deliberate:
# grouping sorts by user_id and then entry_time, and the shift()-based linking
# further down relies on each card's rows being adjacent and chronological.
path_keys = ['user_id', 'entry_time', 'date', 'hour', 'museum_name', 'short_name', 'string']
df4 = df.groupby(path_keys).sum()
df4['total_people'] = df4['total_adults'].add(df4['minors'])
df4.head()


Out[18]:
adults_first_use adults_reuse total_adults minors museum_id total_people
user_id entry_time date hour museum_name short_name string
1459702 2016-06-22 10:04:00 2016-06-22 2016-06-22 10:00:00 Galleria degli Uffizi Uffizi U 1 0 1 0 10 1
2016-06-22 14:26:00 2016-06-22 2016-06-22 14:00:00 Museo Casa Dante M. Casa Dante 3 0 1 1 0 15 1
2016-06-22 15:49:00 2016-06-22 2016-06-22 15:00:00 Galleria dell'Accademia di Firenze Accademia A 0 1 1 0 11 1
2016-06-23 09:43:00 2016-06-23 2016-06-23 09:00:00 Battistero di San Giovanni Opera del Duomo D 0 1 1 0 3 1
2016-06-23 11:14:00 2016-06-23 2016-06-23 11:00:00 Museo Galileo M. Galileo G 0 1 1 0 29 1

In [19]:
# Turn the grouping keys back into ordinary columns and discard the per-swipe
# counters that are no longer needed now that total_people exists.
df4 = df4.reset_index().drop(
    ['adults_first_use', 'adults_reuse', 'total_adults', 'minors', 'museum_id'],
    axis=1,
)
df4.head(10)


Out[19]:
user_id entry_time date hour museum_name short_name string total_people
0 1459702 2016-06-22 10:04:00 2016-06-22 2016-06-22 10:00:00 Galleria degli Uffizi Uffizi U 1
1 1459702 2016-06-22 14:26:00 2016-06-22 2016-06-22 14:00:00 Museo Casa Dante M. Casa Dante 3 1
2 1459702 2016-06-22 15:49:00 2016-06-22 2016-06-22 15:00:00 Galleria dell'Accademia di Firenze Accademia A 1
3 1459702 2016-06-23 09:43:00 2016-06-23 2016-06-23 09:00:00 Battistero di San Giovanni Opera del Duomo D 1
4 1459702 2016-06-23 11:14:00 2016-06-23 2016-06-23 11:00:00 Museo Galileo M. Galileo G 1
5 1459702 2016-06-23 12:57:00 2016-06-23 2016-06-23 12:00:00 Museo di Palazzo Vecchio M. Palazzo Vecchio V 1
6 1459702 2016-06-23 13:41:00 2016-06-23 2016-06-23 13:00:00 Museo Nazionale del Bargello M. Bargello B 1
7 1459702 2016-06-23 15:05:00 2016-06-23 2016-06-23 15:00:00 Basilica di Santa Croce Santa Croce C 1
8 1473903 2016-06-19 11:24:00 2016-06-19 2016-06-19 11:00:00 Galleria degli Uffizi Uffizi U 1
9 1473903 2016-06-20 12:05:00 2016-06-20 2016-06-20 12:00:00 Battistero di San Giovanni Opera del Duomo D 1

In [20]:
# Check which columns remain after the drop above.
df4.columns


Out[20]:
Index([u'user_id', u'entry_time', u'date', u'hour', u'museum_name',
       u'short_name', u'string', u'total_people'],
      dtype='object')

In [21]:
# Edge-list columns: every row starts out as an edge leaving the dummy 'source' node;
# the next cell overwrites 'from' where a previous visit exists on the same day.
df4['from'] = 'source' # Initialize 'from' column with 'source'
df4['to'] = df4['short_name'] # 'to' is this row's own museum (its short_name)
df4.head(10)


Out[21]:
user_id entry_time date hour museum_name short_name string total_people from to
0 1459702 2016-06-22 10:04:00 2016-06-22 2016-06-22 10:00:00 Galleria degli Uffizi Uffizi U 1 source Uffizi
1 1459702 2016-06-22 14:26:00 2016-06-22 2016-06-22 14:00:00 Museo Casa Dante M. Casa Dante 3 1 source M. Casa Dante
2 1459702 2016-06-22 15:49:00 2016-06-22 2016-06-22 15:00:00 Galleria dell'Accademia di Firenze Accademia A 1 source Accademia
3 1459702 2016-06-23 09:43:00 2016-06-23 2016-06-23 09:00:00 Battistero di San Giovanni Opera del Duomo D 1 source Opera del Duomo
4 1459702 2016-06-23 11:14:00 2016-06-23 2016-06-23 11:00:00 Museo Galileo M. Galileo G 1 source M. Galileo
5 1459702 2016-06-23 12:57:00 2016-06-23 2016-06-23 12:00:00 Museo di Palazzo Vecchio M. Palazzo Vecchio V 1 source M. Palazzo Vecchio
6 1459702 2016-06-23 13:41:00 2016-06-23 2016-06-23 13:00:00 Museo Nazionale del Bargello M. Bargello B 1 source M. Bargello
7 1459702 2016-06-23 15:05:00 2016-06-23 2016-06-23 15:00:00 Basilica di Santa Croce Santa Croce C 1 source Santa Croce
8 1473903 2016-06-19 11:24:00 2016-06-19 2016-06-19 11:00:00 Galleria degli Uffizi Uffizi U 1 source Uffizi
9 1473903 2016-06-20 12:05:00 2016-06-20 2016-06-20 12:00:00 Battistero di San Giovanni Opera del Duomo D 1 source Opera del Duomo

In [22]:
# A row continues a path when the row directly above it belongs to the same card
# on the same calendar day; only those rows get a real origin instead of 'source'.
same_card = df4['user_id'].shift(1) == df4['user_id']
same_day = df4['date'].shift(1) == df4['date']
make_link = same_card & same_day # Row indexes at which to overwrite 'source'

# Write through .loc: the chained form df4['from'][make_link] = ... assigns into a
# temporary Series and is not guaranteed to reach df4 (SettingWithCopy / copy-on-write).
df4.loc[make_link, 'from'] = df4['short_name'].shift(1)[make_link]
df4.head(50)


Out[22]:
user_id entry_time date hour museum_name short_name string total_people from to
0 1459702 2016-06-22 10:04:00 2016-06-22 2016-06-22 10:00:00 Galleria degli Uffizi Uffizi U 1 source Uffizi
1 1459702 2016-06-22 14:26:00 2016-06-22 2016-06-22 14:00:00 Museo Casa Dante M. Casa Dante 3 1 Uffizi M. Casa Dante
2 1459702 2016-06-22 15:49:00 2016-06-22 2016-06-22 15:00:00 Galleria dell'Accademia di Firenze Accademia A 1 M. Casa Dante Accademia
3 1459702 2016-06-23 09:43:00 2016-06-23 2016-06-23 09:00:00 Battistero di San Giovanni Opera del Duomo D 1 source Opera del Duomo
4 1459702 2016-06-23 11:14:00 2016-06-23 2016-06-23 11:00:00 Museo Galileo M. Galileo G 1 Opera del Duomo M. Galileo
5 1459702 2016-06-23 12:57:00 2016-06-23 2016-06-23 12:00:00 Museo di Palazzo Vecchio M. Palazzo Vecchio V 1 M. Galileo M. Palazzo Vecchio
6 1459702 2016-06-23 13:41:00 2016-06-23 2016-06-23 13:00:00 Museo Nazionale del Bargello M. Bargello B 1 M. Palazzo Vecchio M. Bargello
7 1459702 2016-06-23 15:05:00 2016-06-23 2016-06-23 15:00:00 Basilica di Santa Croce Santa Croce C 1 M. Bargello Santa Croce
8 1473903 2016-06-19 11:24:00 2016-06-19 2016-06-19 11:00:00 Galleria degli Uffizi Uffizi U 1 source Uffizi
9 1473903 2016-06-20 12:05:00 2016-06-20 2016-06-20 12:00:00 Battistero di San Giovanni Opera del Duomo D 1 source Opera del Duomo
10 1473903 2016-06-20 15:44:00 2016-06-20 2016-06-20 15:00:00 Basilica San Lorenzo San Lorenzo 2 1 Opera del Duomo San Lorenzo
11 1473903 2016-06-20 17:34:00 2016-06-20 2016-06-20 17:00:00 Museo di Palazzo Vecchio M. Palazzo Vecchio V 1 San Lorenzo M. Palazzo Vecchio
12 1473903 2016-06-21 11:22:00 2016-06-21 2016-06-21 11:00:00 Palazzo Pitti Pitti P 1 source Pitti
13 1473903 2016-06-21 15:35:00 2016-06-21 2016-06-21 15:00:00 Museo Archeologico Nazionale di Firenze M. Archeologico X 1 Pitti M. Archeologico
14 1473904 2016-06-19 11:24:00 2016-06-19 2016-06-19 11:00:00 Galleria degli Uffizi Uffizi U 1 source Uffizi
15 1473904 2016-06-20 12:05:00 2016-06-20 2016-06-20 12:00:00 Battistero di San Giovanni Opera del Duomo D 1 source Opera del Duomo
16 1473904 2016-06-20 15:44:00 2016-06-20 2016-06-20 15:00:00 Basilica San Lorenzo San Lorenzo 2 1 Opera del Duomo San Lorenzo
17 1473904 2016-06-20 17:34:00 2016-06-20 2016-06-20 17:00:00 Museo di Palazzo Vecchio M. Palazzo Vecchio V 1 San Lorenzo M. Palazzo Vecchio
18 1473904 2016-06-21 11:22:00 2016-06-21 2016-06-21 11:00:00 Palazzo Pitti Pitti P 1 source Pitti
19 1473904 2016-06-21 15:35:00 2016-06-21 2016-06-21 15:00:00 Museo Archeologico Nazionale di Firenze M. Archeologico X 1 Pitti M. Archeologico
20 1473905 2016-07-01 13:56:00 2016-07-01 2016-07-01 13:00:00 Museo di Santa Maria Novella M. Santa Maria Novella N 1 source M. Santa Maria Novella
21 1473905 2016-07-02 11:21:00 2016-07-02 2016-07-02 11:00:00 Cappelle Medicee Cappelle Medicee c 1 source Cappelle Medicee
22 1473905 2016-07-02 12:07:00 2016-07-02 2016-07-02 12:00:00 Battistero di San Giovanni Opera del Duomo D 1 Cappelle Medicee Opera del Duomo
23 1473905 2016-07-02 13:29:00 2016-07-02 2016-07-02 13:00:00 Galleria dell'Accademia di Firenze Accademia A 1 Opera del Duomo Accademia
24 1473905 2016-07-02 15:06:00 2016-07-02 2016-07-02 15:00:00 Basilica di Santa Croce Santa Croce C 1 Accademia Santa Croce
25 1473906 2016-07-23 09:38:00 2016-07-23 2016-07-23 09:00:00 Palazzo Pitti Pitti P 1 source Pitti
26 1473906 2016-07-23 15:10:00 2016-07-23 2016-07-23 15:00:00 Palazzo Strozzi Palazzo Strozzi S 1 Pitti Palazzo Strozzi
27 1473906 2016-07-23 16:30:00 2016-07-23 2016-07-23 16:00:00 Galleria degli Uffizi Uffizi U 1 Palazzo Strozzi Uffizi
28 1473906 2016-07-23 19:15:00 2016-07-23 2016-07-23 19:00:00 Museo di Palazzo Vecchio M. Palazzo Vecchio V 1 Uffizi M. Palazzo Vecchio
29 1473906 2016-07-24 09:10:00 2016-07-24 2016-07-24 09:00:00 Galleria dell'Accademia di Firenze Accademia A 1 source Accademia
30 1473906 2016-07-24 10:48:00 2016-07-24 2016-07-24 10:00:00 Museo degli Innocenti M. Innocenti I 1 Accademia M. Innocenti
31 1473906 2016-07-24 11:58:00 2016-07-24 2016-07-24 11:00:00 Museo di San Marco M. San Marco M 2 M. Innocenti M. San Marco
32 1473906 2016-07-24 14:19:00 2016-07-24 2016-07-24 14:00:00 Basilica di Santa Croce Santa Croce C 1 M. San Marco Santa Croce
33 1473906 2016-07-24 16:29:00 2016-07-24 2016-07-24 16:00:00 Basilica San Lorenzo San Lorenzo 2 1 Santa Croce San Lorenzo
34 1473906 2016-07-25 09:43:00 2016-07-25 2016-07-25 09:00:00 Cappelle Medicee Cappelle Medicee c 1 source Cappelle Medicee
35 1473906 2016-07-25 10:06:00 2016-07-25 2016-07-25 10:00:00 Biblioteca Medicea Laurenziana Laurenziana l 1 Cappelle Medicee Laurenziana
36 1473907 2016-07-23 09:38:00 2016-07-23 2016-07-23 09:00:00 Palazzo Pitti Pitti P 1 source Pitti
37 1473907 2016-07-23 15:10:00 2016-07-23 2016-07-23 15:00:00 Palazzo Strozzi Palazzo Strozzi S 1 Pitti Palazzo Strozzi
38 1473907 2016-07-23 16:30:00 2016-07-23 2016-07-23 16:00:00 Galleria degli Uffizi Uffizi U 1 Palazzo Strozzi Uffizi
39 1473907 2016-07-23 19:15:00 2016-07-23 2016-07-23 19:00:00 Museo di Palazzo Vecchio M. Palazzo Vecchio V 1 Uffizi M. Palazzo Vecchio
40 1473907 2016-07-24 09:10:00 2016-07-24 2016-07-24 09:00:00 Galleria dell'Accademia di Firenze Accademia A 1 source Accademia
41 1473907 2016-07-24 10:48:00 2016-07-24 2016-07-24 10:00:00 Museo degli Innocenti M. Innocenti I 1 Accademia M. Innocenti
42 1473907 2016-07-24 11:58:00 2016-07-24 2016-07-24 11:00:00 Museo di San Marco M. San Marco M 1 M. Innocenti M. San Marco
43 1473907 2016-07-24 14:19:00 2016-07-24 2016-07-24 14:00:00 Basilica di Santa Croce Santa Croce C 1 M. San Marco Santa Croce
44 1473907 2016-07-24 16:29:00 2016-07-24 2016-07-24 16:00:00 Basilica San Lorenzo San Lorenzo 2 1 Santa Croce San Lorenzo
45 1473907 2016-07-25 09:43:00 2016-07-25 2016-07-25 09:00:00 Cappelle Medicee Cappelle Medicee c 1 source Cappelle Medicee
46 1473907 2016-07-25 10:06:00 2016-07-25 2016-07-25 10:00:00 Biblioteca Medicea Laurenziana Laurenziana l 1 Cappelle Medicee Laurenziana
47 1474634 2016-06-09 13:36:00 2016-06-09 2016-06-09 13:00:00 Basilica San Lorenzo San Lorenzo 2 1 source San Lorenzo
48 1474634 2016-06-09 14:07:00 2016-06-09 2016-06-09 14:00:00 Battistero di San Giovanni Opera del Duomo D 1 San Lorenzo Opera del Duomo
49 1474634 2016-06-10 16:02:00 2016-06-10 2016-06-10 16:00:00 Galleria degli Uffizi Uffizi U 1 source Uffizi

In [23]:
# Same edge list as 'from'/'to', but using the single-character museum codes.
df4['s'] = ' ' # Origin code defaults to ' ', the code of the dummy source node
df4['t'] = df4['string'] # Destination code is this row's own museum code
# .loc instead of chained df4['s'][make_link] = ..., which assigns into a temporary
# Series and is not guaranteed to modify df4.
df4.loc[make_link, 's'] = df4['string'].shift(1)[make_link]
df4.head()


Out[23]:
user_id entry_time date hour museum_name short_name string total_people from to s t
0 1459702 2016-06-22 10:04:00 2016-06-22 2016-06-22 10:00:00 Galleria degli Uffizi Uffizi U 1 source Uffizi U
1 1459702 2016-06-22 14:26:00 2016-06-22 2016-06-22 14:00:00 Museo Casa Dante M. Casa Dante 3 1 Uffizi M. Casa Dante U 3
2 1459702 2016-06-22 15:49:00 2016-06-22 2016-06-22 15:00:00 Galleria dell'Accademia di Firenze Accademia A 1 M. Casa Dante Accademia 3 A
3 1459702 2016-06-23 09:43:00 2016-06-23 2016-06-23 09:00:00 Battistero di San Giovanni Opera del Duomo D 1 source Opera del Duomo D
4 1459702 2016-06-23 11:14:00 2016-06-23 2016-06-23 11:00:00 Museo Galileo M. Galileo G 1 Opera del Duomo M. Galileo D G

In [24]:
# Concatenating the source column is not enough, it leaves out the last place in the path.
# Need to add a second 'source' column that, for the last item in a day's path, contains two characters.
df4['s2'] = df4['s']

# A row closes a day's path when the following row starts from 'source'. shift(-1)
# leaves NaN on the final row of the table, which also closes a path, so that gap is
# filled with 'source'. This replaces the former separate patch
# `df4.iloc[-1:]['s2'] = ...`, which assigned into a temporary slice and could be
# silently dropped, losing the last museum of the last card.
ends_day = df4['from'].shift(-1).fillna('source') == 'source'

# .loc instead of chained df4['s2'][mask] = ... so the write is guaranteed to reach df4.
df4.loc[ends_day, 's2'] = (df4['s'] + df4['t'])[ends_day]
df4.tail()


Out[24]:
user_id entry_time date hour museum_name short_name string total_people from to s t s2
360529 2095763 2016-09-30 17:31:00 2016-09-30 2016-09-30 17:00:00 Museo di Palazzo Vecchio M. Palazzo Vecchio V 1 source M. Palazzo Vecchio V V
360530 2095764 2016-09-30 17:31:00 2016-09-30 2016-09-30 17:00:00 Museo di Palazzo Vecchio M. Palazzo Vecchio V 1 source M. Palazzo Vecchio V V
360531 2095765 2016-09-30 17:24:00 2016-09-30 2016-09-30 17:00:00 Battistero di San Giovanni Opera del Duomo D 2 source Opera del Duomo D D
360532 2095766 2016-09-30 17:55:00 2016-09-30 2016-09-30 17:00:00 Battistero di San Giovanni Opera del Duomo D 1 source Opera del Duomo D D
360533 2095767 2016-09-30 17:55:00 2016-09-30 2016-09-30 17:00:00 Battistero di San Giovanni Opera del Duomo D 1 source Opera del Duomo D D

In [25]:
# Check the head as well: the last row of each day should carry two characters in 's2'.
df4.head()


Out[25]:
user_id entry_time date hour museum_name short_name string total_people from to s t s2
0 1459702 2016-06-22 10:04:00 2016-06-22 2016-06-22 10:00:00 Galleria degli Uffizi Uffizi U 1 source Uffizi U
1 1459702 2016-06-22 14:26:00 2016-06-22 2016-06-22 14:00:00 Museo Casa Dante M. Casa Dante 3 1 Uffizi M. Casa Dante U 3 U
2 1459702 2016-06-22 15:49:00 2016-06-22 2016-06-22 15:00:00 Galleria dell'Accademia di Firenze Accademia A 1 M. Casa Dante Accademia 3 A 3A
3 1459702 2016-06-23 09:43:00 2016-06-23 2016-06-23 09:00:00 Battistero di San Giovanni Opera del Duomo D 1 source Opera del Duomo D
4 1459702 2016-06-23 11:14:00 2016-06-23 2016-06-23 11:00:00 Museo Galileo M. Galileo G 1 Opera del Duomo M. Galileo D G D

In [26]:
# Build one encoded string per card by concatenating its 's2' fragments in row order.
# Each day's fragment begins with ' ' (the source code), so days end up space-separated.
df5 = df4.groupby('user_id')['s2'].sum().to_frame() # sum() on strings concatenates 
df5.head()


Out[26]:
s2
user_id
1459702 U3A DGVBC
1473903 U D2V PX
1473904 U D2V PX
1473905 N cDAC
1473906 PSUV AIMC2 cl

In [27]:
# Split each card's string on the ' ' day separator into one column per active day.
# The vectorised .str accessor replaces apply(lambda x: pd.Series(...)), which built a
# separate Series object for every card and was the slow step in this notebook.
# expand=True pads short rows with None; fillna turns those into NaN so the table
# looks the same as before to the cells that follow.
df6 = df5['s2'].str.strip().str.split(' ', expand=True).fillna(value=np.nan)
df6.head() # Note: 4 columns is correct, Firenze card is *72 hours from first use*, not from midnight of the day of first use!


Out[27]:
0 1 2 3
user_id
1459702 U3A DGVBC NaN NaN
1473903 U D2V PX NaN
1473904 U D2V PX NaN
1473905 N cDAC NaN NaN
1473906 PSUV AIMC2 cl NaN

In [28]:
# Look at a larger sample: consecutive card ids often share identical encoded paths
# (visible in the output below), which suggests those cards were used together.
df6.head(50) # Data stories just fall out! People traveling together, splitting off, etc. We assume this but strong coupling is hard to ignore.


Out[28]:
0 1 2 3
user_id
1459702 U3A DGVBC NaN NaN
1473903 U D2V PX NaN
1473904 U D2V PX NaN
1473905 N cDAC NaN NaN
1473906 PSUV AIMC2 cl NaN
1473907 PSUV AIMC2 cl NaN
1474634 2D U MABm NaN
1474636 2D U MABm NaN
2014298 PN A NaN NaN
2016016 DV AU NaN NaN
2016021 A DU P NaN
2016022 A DU P NaN
2016024 PN A NaN NaN
2017368 AU NaN NaN NaN
2017369 AU NaN NaN NaN
2017450 UP A NaN NaN
2017451 UP NaN NaN NaN
2017452 UP NaN NaN NaN
2017453 U D PAVT B
2017454 UDC2cmA P NaN NaN
2017455 UDC2cmA P NGVT NaN
2017456 DUG 2cXm ABVP NaN
2017457 U D PAVT NaN
2017458 DUG 2cXm ABVP NaN
2017459 UAD NaN NaN NaN
2017460 USP Dl2CA NBV NaN
2017461 USP Dl2CA NBV NaN
2017462 USP Dl2CA NBV NaN
2017463 USP Dl2CA NBV NaN
2017464 CA UD bPmMG B
2017465 CA NaN NaN NaN
2017466 UCA MD m2cP NaN
2017467 UCA MD m2cP NaN
2017468 DmA P UVTG NaN
2017469 UDA NaN NaN NaN
2017470 UDA NaN NaN NaN
2017471 DA 0cl2mN Pb NaN
2017472 DA 0cl2mN Pb NaN
2017473 UP A NaN NaN
2017474 UP A NaN NaN
2017475 UP A NaN NaN
2017476 UGA P NaN NaN
2017477 UP A NaN NaN
2017478 UP A NaN NaN
2017479 UP A NaN NaN
2017480 UG DA L P
2017481 UG DA L P
2017482 UGVT cl2AC PN NaN
2017483 UAD NaN NaN NaN
2017484 UAD NaN NaN NaN

In [29]:
# Most common encoded paths on each card's first day with a visit (column 0).
frequency(df6,0).head()


Out[29]:
0 frequency cumulative ccdf
0 D 3572 0.069997 0.930003
1 U 2317 0.115400 0.884600
2 P 1585 0.146460 0.853540
3 A 1486 0.175580 0.824420
4 V 1232 0.199722 0.800278

In [30]:
# Most common encoded paths on each card's second day with a visit (column 1).
frequency(df6,1).head()


Out[30]:
1 frequency cumulative ccdf
0 U 1790 0.039562 0.960438
1 P 1503 0.072780 0.927220
2 D 1454 0.104915 0.895085
3 A 1174 0.130862 0.869138
4 UP 987 0.152676 0.847324

In [31]:
# Most common encoded paths on each card's third day with a visit (column 2).
frequency(df6,2).head()


Out[31]:
2 frequency cumulative ccdf
0 P 2009 0.070289 0.929711
1 U 982 0.104646 0.895354
2 D 749 0.130852 0.869148
3 A 693 0.155098 0.844902
4 C 525 0.173466 0.826534

In [32]:
# Most common encoded paths on each card's fourth day with a visit (column 3).
frequency(df6,3).head()


Out[32]:
3 frequency cumulative ccdf
0 P 546 0.091167 0.908833
1 U 331 0.146435 0.853565
2 B 260 0.189848 0.810152
3 N 255 0.232426 0.767574
4 A 251 0.274336 0.725664

Now, I want a set of scatterplots between these frequencies.


In [33]:
# Stack the per-day frequency tables into one long table of (daily path, count).
# The day columns are taken from df6 rather than hard-coded as 0..3, so a dataset
# with fewer active days does not raise KeyError and one with more is not
# silently truncated.
day_cols = list(df6.columns)
pt = pd.concat([frequency(df6, day) for day in day_cols])

# After stacking, each row has a path in exactly one day column and NaN in the rest;
# blanking the NaNs and concatenating merges them into a single 'daily_path' column.
pt['daily_path'] = ''
for day in day_cols:
    pt['daily_path'] = pt['daily_path'] + pt[day].fillna('')

pt = pt.drop(day_cols + ['ccdf', 'cumulative'], axis=1)
pt.head()


Out[33]:
frequency daily_path
0 3572 D
1 2317 U
2 1585 P
3 1486 A
4 1232 V

In [34]:
# Add up the counts of the same encoded path across all day columns and rank them.
pt2 = (
    pt.groupby('daily_path')
      .sum()
      .sort_values('frequency', ascending=False)
)
pt2.head()


Out[34]:
frequency
daily_path
D 5932
P 5643
U 5420
A 3604
V 1909

In [35]:
# Bar chart of the daily paths taken more than 200 times (threshold only keeps the
# chart readable).
common_paths = pt2[pt2['frequency'] > 200]
ax = common_paths.plot.bar(figsize=(16, 8))
ax.set_title('Most common daily Firenze card paths across all days')
ax.set_xlabel('x = Encoded path')
ax.set_ylabel('Number of cards with daily path x')
# ax.set_yscale('log')  # optional: log scale to see the tail
plt.show()



In [36]:
# Reminder of the lookup table that maps single-character codes to museums.
nodes.head()


Out[36]:
museum_name longitude latitude museum_id short_name string
0 Basilica di Santa Croce 11.262598 43.768754 1 Santa Croce C
1 Basilica San Lorenzo 11.254430 43.774932 2 San Lorenzo 2
2 Battistero di San Giovanni 11.254966 43.773131 3 Opera del Duomo D
3 Biblioteca Medicea Laurenziana 11.253924 43.774799 4 Laurenziana l
4 Cappella Brancacci 11.243859 43.768334 5 Brancacci b

In [37]:
# For reference, here are the displayed museums
# (rows come back in `nodes` order; use
#  nodes[['string','short_name']].set_index('string').reindex(displayed_codes)
#  to list them in the order given here instead)
displayed_codes = ['D','P','U','A','V','T','N','C','G','B','S','c','m','M','b','Y','2']
nodes.loc[nodes['string'].isin(displayed_codes), ['string', 'short_name']]


Out[37]:
string short_name
0 C Santa Croce
1 2 San Lorenzo
2 D Opera del Duomo
4 b Brancacci
5 c Cappelle Medicee
9 U Uffizi
10 A Accademia
19 M M. San Marco
20 N M. Santa Maria Novella
23 G M. Galileo
26 B M. Bargello
40 V M. Palazzo Vecchio
48 m Palazzo Medici
49 S Palazzo Strozzi
50 T Torre di Palazzo Vecchio
51 Y V. Bardini
62 P Pitti

In [38]:
# Sanity check: list cards whose first-day path has no first character (i.e. column 0
# is missing or empty). The empty result below means every card has a first-day path.
df6[pd.isnull(df6[0].str[0])].head()


Out[38]:
0 1 2 3
user_id

In [26]:
# Save the per-card, per-day encoded paths to the current working directory.
# NOTE(review): this cell's execution count (26) is lower than that of the cells above
# it, so it was run out of order; re-run the notebook top to bottom before using the file.
df6.to_csv('encoded_paths.csv')

In [27]:
# Save the museum lookup table next to it so the single-character codes can be decoded.
nodes.to_csv('encoded_paths_legend.csv')

In [ ]:


In [39]:
# Raw object array behind the path table (NaN marks days without visits).
df6.values


Out[39]:
array([['U3A', 'DGVBC', nan, nan],
       ['U', 'D2V', 'PX', nan],
       ['U', 'D2V', 'PX', nan],
       ..., 
       ['D', nan, nan, nan],
       ['D', nan, nan, nan],
       ['D', nan, nan, nan]], dtype=object)

In [ ]:


In [ ]: