Network Analysis

First, import relevant libraries:


In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from pylab import *

import igraph as ig

In [2]:
import sys
sys.path.append('../../src/')
from utils.database import dbutils

conn = dbutils.connect()
cursor = conn.cursor()

Then, load the data (takes a few moments):


In [3]:
df = pd.read_sql('select * from optourism.firenze_card_logs', con=conn)
df.head()


Out[3]:
user_id museum_name entry_time adults_first_use adults_reuse total_adults minors museum_id
0 2047835 Battistero di San Giovanni 2016-08-01 08:37:00 0 1 1 0 2
1 2047834 Battistero di San Giovanni 2016-08-01 08:37:00 0 1 1 0 2
2 2067906 Battistero di San Giovanni 2016-08-01 08:39:00 0 1 1 0 2
3 2067905 Battistero di San Giovanni 2016-08-01 08:40:00 0 1 1 0 2
4 2068678 Battistero di San Giovanni 2016-08-01 08:42:00 0 1 1 0 2

In [4]:
def frequency(dataframe,columnname):
    out = dataframe[columnname].value_counts().to_frame()
    out.columns = ['frequency']
    out.index.name = columnname
    out.reset_index(inplace=True)
    out = out.sort_values(columnname)
    out['cumulative'] = out['frequency'].cumsum()/out['frequency'].sum()
    out['ccdf'] = 1 - out['cumulative']
    return out

In [6]:
(df['adults_first_use'] + df['adults_reuse'] != df['total_adults']).sum() # Check to make sure the columns add up


Out[6]:
0

In [7]:
(df['total_adults'] > 1).sum() # Check to make sure there is never more than 1 adult per card, acc


Out[7]:
0

In [9]:
fr1 = frequency(df,'minors')
fr1.head() # Only 1 child max per card, which is about 10% of the cases


Out[9]:
minors frequency cumulative ccdf
0 0 360541 0.907898 0.092102
1 1 36575 1.000000 0.000000

In [26]:
# Now, do some people visit the same museum more than once?
fr2 = frequency(df.groupby(['museum_name','user_id'])['total_adults'].count().to_frame(),'total_adults')
fr2.head(20) # Yes! Some up to 20?!? Let's investigate more.


Out[26]:
total_adults frequency cumulative ccdf
0 1 333456 0.924926 0.075074
1 2 19488 0.978980 0.021020
2 3 5914 0.995384 0.004616
3 4 1512 0.999578 0.000422
4 5 111 0.999886 0.000114
5 6 19 0.999939 0.000061
6 7 7 0.999958 0.000042
10 8 2 0.999964 0.000036
7 10 6 0.999981 0.000019
9 12 2 0.999986 0.000014
13 13 1 0.999989 0.000011
8 14 2 0.999994 0.000006
12 15 1 0.999997 0.000003
11 20 1 1.000000 0.000000

In [25]:
df1 = df.groupby(['museum_name','user_id'])['total_adults'].count().to_frame()
df1[df1['total_adults']>5]


Out[25]:
total_adults
museum_name user_id
Basilica di Santa Croce 2070787 6
Battistero di San Giovanni 2040932 6
2043298 8
2043961 8
2055516 6
2058028 15
2058342 6
2059220 10
2062281 6
2063201 6
2063677 6
2068023 7
2070787 6
2073716 10
2078165 7
2078556 6
2081333 6
Biblioteca Medicea Laurenziana 2042664 7
Galleria degli Uffizi 2064117 6
2067390 7
2068648 20
2070787 6
Galleria dell'Accademia di Firenze 2070787 6
2078823 12
Museo Galileo 2070787 6
Museo Nazionale del Bargello 2049800 14
2061143 10
2064542 10
2064714 10
2067390 7
2069827 13
2081898 6
Museo di Palazzo Vecchio 2065730 10
2067390 7
Museo di San Marco 2017844 6
2070787 6
Museo di Santa Maria Novella 2070787 6
2073033 7
2075251 14
Palazzo Pitti 2 Ð Giardino di Boboli, Museo degli Argenti, Museo delle Porcellan 2070787 6
2080579 12

In [24]:
# For Battistero di San Giovanni, it is expected, since that includes all Duomo locations. 
# But for others it's unexpected. Let's take a look at the most egregious case.
df[(df['user_id']==2068648) & (df['museum_name']=='Galleria degli Uffizi')]


Out[24]:
user_id museum_name entry_time adults_first_use adults_reuse total_adults minors museum_id
21430 2068648 Galleria degli Uffizi 2016-08-06 16:29:00 0 0 0 1 9
21431 2068648 Galleria degli Uffizi 2016-08-06 16:29:00 0 0 0 1 9
21432 2068648 Galleria degli Uffizi 2016-08-06 16:29:00 0 0 0 1 9
21433 2068648 Galleria degli Uffizi 2016-08-06 16:29:00 0 0 0 1 9
21434 2068648 Galleria degli Uffizi 2016-08-06 16:29:00 0 0 0 1 9
21435 2068648 Galleria degli Uffizi 2016-08-06 16:29:00 0 0 0 1 9
21436 2068648 Galleria degli Uffizi 2016-08-06 16:29:00 0 0 0 1 9
21437 2068648 Galleria degli Uffizi 2016-08-06 16:29:00 0 0 0 1 9
21439 2068648 Galleria degli Uffizi 2016-08-06 16:29:00 0 0 0 1 9
21440 2068648 Galleria degli Uffizi 2016-08-06 16:29:00 0 0 0 1 9
21441 2068648 Galleria degli Uffizi 2016-08-06 16:29:00 0 0 0 1 9
21442 2068648 Galleria degli Uffizi 2016-08-06 16:29:00 0 1 1 0 9
21443 2068648 Galleria degli Uffizi 2016-08-06 16:29:00 0 0 0 1 9
21444 2068648 Galleria degli Uffizi 2016-08-06 16:29:00 0 0 0 1 9
21445 2068648 Galleria degli Uffizi 2016-08-06 16:29:00 0 0 0 1 9
21446 2068648 Galleria degli Uffizi 2016-08-06 16:29:00 0 0 0 1 9
21447 2068648 Galleria degli Uffizi 2016-08-06 16:29:00 0 0 0 1 9
21448 2068648 Galleria degli Uffizi 2016-08-06 16:29:00 0 0 0 1 9
21449 2068648 Galleria degli Uffizi 2016-08-06 16:29:00 0 0 0 1 9
21450 2068648 Galleria degli Uffizi 2016-08-06 16:29:00 0 0 0 1 9

In [32]:
# Suggests we need deduplication. But let's make sure.
df2 = df.groupby(['user_id','museum_name','entry_time']).count()
df2[(df2['total_adults']>1)|(df2['minors']>1)].head(50)


Out[32]:
adults_first_use adults_reuse total_adults minors museum_id
user_id museum_name entry_time
1473906 Museo di San Marco 2016-07-24 11:58:00 2 2 2 2 2
2017453 Torre di Palazzo Vecchio 2016-06-17 20:04:00 2 2 2 2 2
2017468 Battistero di San Giovanni 2016-06-16 10:46:00 3 3 3 3 3
Galleria degli Uffizi 2016-06-18 10:45:00 3 3 3 3 3
Galleria dell'Accademia di Firenze 2016-06-16 12:56:00 3 3 3 3 3
Museo Galileo 2016-06-18 15:56:00 3 3 3 3 3
Museo di Palazzo Vecchio 2016-06-18 13:17:00 3 3 3 3 3
Palazzo Medici Riccardi 2016-06-16 12:00:00 3 3 3 3 3
Palazzo Pitti 2 Ð Giardino di Boboli, Museo degli Argenti, Museo delle Porcellan 2016-06-17 12:34:00 3 3 3 3 3
Torre di Palazzo Vecchio 2016-06-18 14:58:00 3 3 3 3 3
2017470 Battistero di San Giovanni 2016-06-16 12:07:00 3 3 3 3 3
Galleria degli Uffizi 2016-06-16 09:41:00 3 3 3 3 3
Galleria dell'Accademia di Firenze 2016-06-16 14:19:00 3 3 3 3 3
2017487 Battistero di San Giovanni 2016-06-15 13:13:00 3 3 3 3 3
Galleria degli Uffizi 2016-06-15 11:26:00 3 3 3 3 3
Galleria dell'Accademia di Firenze 2016-06-15 16:10:00 3 3 3 3 3
Museo Galileo 2016-06-14 12:14:00 3 3 3 3 3
Museo Nazionale del Bargello 2016-06-14 14:04:00 3 3 3 3 3
Museo di Palazzo Vecchio 2016-06-14 21:35:00 3 3 3 3 3
2017489 Galleria degli Uffizi 2016-06-14 12:03:00 2 2 2 2 2
Museo Galileo 2016-06-15 13:46:00 2 2 2 2 2
2017800 Battistero di San Giovanni 2016-06-20 14:22:00 2 2 2 2 2
2017818 Museo Nazionale del Bargello 2016-06-22 10:24:00 3 3 3 3 3
Palazzo Pitti 2 Ð Giardino di Boboli, Museo degli Argenti, Museo delle Porcellan 2016-06-24 10:58:00 3 3 3 3 3
2017821 Museo Galileo 2016-06-22 14:33:00 2 2 2 2 2
2017822 Battistero di San Giovanni 2016-06-22 08:46:00 2 2 2 2 2
Museo Nazionale del Bargello 2016-06-21 14:12:00 2 2 2 2 2
Museo di Palazzo Vecchio 2016-06-22 17:43:00 2 2 2 2 2
Torre di Palazzo Vecchio 2016-06-22 18:10:00 2 2 2 2 2
2017844 Museo di San Marco 2016-06-20 11:13:00 6 6 6 6 6
2019284 Basilica di Santa Croce 2016-07-03 15:07:00 2 2 2 2 2
Battistero di San Giovanni 2016-07-04 11:43:00 2 2 2 2 2
Palazzo Pitti 2 Ð Giardino di Boboli, Museo degli Argenti, Museo delle Porcellan 2016-07-02 12:48:00 2 2 2 2 2
2019298 Battistero di San Giovanni 2016-06-20 17:06:00 2 2 2 2 2
2019299 Battistero di San Giovanni 2016-06-20 17:06:00 2 2 2 2 2
2024301 Museo Galileo 2016-06-01 11:28:00 2 2 2 2 2
2024302 Museo Galileo 2016-06-01 11:28:00 2 2 2 2 2
2027552 Basilica San Lorenzo 2016-06-02 16:32:00 3 3 3 3 3
Basilica di Santa Croce 2016-06-04 09:29:00 3 3 3 3 3
Battistero di San Giovanni 2016-06-02 13:08:00 3 3 3 3 3
Casa Buonarroti 2016-06-04 10:16:00 3 3 3 3 3
Galleria degli Uffizi 2016-06-02 09:15:00 3 3 3 3 3
Galleria dell'Accademia di Firenze 2016-06-02 15:46:00 3 3 3 3 3
La Specola 2016-06-03 12:42:00 3 3 3 3 3
Museo Casa Dante 2016-06-04 10:47:00 3 3 3 3 3
Museo Galileo 2016-06-03 16:58:00 3 3 3 3 3
Museo Novecento 2016-06-03 18:30:00 3 3 3 3 3
Museo di Palazzo Vecchio 2016-06-03 15:23:00 3 3 3 3 3
Museo di Santa Maria Novella 2016-06-02 17:37:00 3 3 3 3 3
Palazzo Pitti 2 Ð Giardino di Boboli, Museo degli Argenti, Museo delle Porcellan 2016-06-03 09:37:00 3 3 3 3 3

In [33]:
df3 = df2[(df2['total_adults']>1)|(df2['minors']>1)]
df3.head()


Out[33]:
adults_first_use adults_reuse total_adults minors museum_id
user_id museum_name entry_time
1473906 Museo di San Marco 2016-07-24 11:58:00 2 2 2 2 2
2017453 Torre di Palazzo Vecchio 2016-06-17 20:04:00 2 2 2 2 2
2017468 Battistero di San Giovanni 2016-06-16 10:46:00 3 3 3 3 3
Galleria degli Uffizi 2016-06-18 10:45:00 3 3 3 3 3
Galleria dell'Accademia di Firenze 2016-06-16 12:56:00 3 3 3 3 3

In [38]:
df3.shape # How many unique user_id / museum_name / entry_time rows


Out[38]:
(27055, 5)

In [40]:
df3['adults_first_use'].sum() # How many entries there are


Out[40]:
63637

In [41]:
df3['adults_first_use'].sum() - df3.shape[0] # How many, then, are greater than 1: number of duplicate records


Out[41]:
36582

In [39]:
df.shape # How many records there were initially


Out[39]:
(397116, 8)

In [42]:
df.shape[0] - (df3['adults_first_use'].sum() - df3.shape[0]) # Should be the result when deduplicated


Out[42]:
360534

In [10]:
f, ax = plt.subplots(figsize=(6,5), dpi=300)
ax.stem(fr1['minors']+1,fr1['frequency'], linestyle='steps--')
yscale('log')
xscale('log')
ax.set_title('Distribution of number of minors')
ax.set_ylabel('Entrances')
ax.set_xlabel('Number of minors')
plt.show()



In [17]:
f, ax = plt.subplots(figsize=(6,5), dpi=300)
ax.plot(frc2['calls'],frc2['cumulative'])
# yscale('log')
xscale('log')
# ylim([.7,1.01])
ax.set_title('CDF of calls per person among Italians who have made calls in Florence city')
ax.set_ylabel('Proportion of users making x or fewer calls')
ax.set_xlabel('Number of calls')
# axvline(4.5) # Our cutoff
# axhline(.1)
plt.show()



In [18]:
f, ax = plt.subplots(figsize=(6,5), dpi=300)
ax.plot(frc2['calls'],frc2['ccdf'])
yscale('log')
xscale('log')
ax.set_title('CCDF of calls per person among Italians who have made calls in Florence city')
ax.set_ylabel('Proportion of users making at least x calls')
ax.set_xlabel('Number of calls')
plt.show()



In [19]:
f, ax = plt.subplots(figsize=(6,5), dpi=300)
ax.stem(frc_uc['calls_in_florence_comune'],frc_uc['frequency'], linestyle='steps--')
yscale('log')
xscale('log')
ax.set_title('Calls per person among Italians in Florence city')
ax.set_ylabel('Number of people making x calls')
ax.set_xlabel('Number of calls')
plt.show()



In [32]:
f, ax = plt.subplots(figsize=(6,5), dpi=300)
ax.plot(frc_uc['calls_in_florence_comune'],frc_uc['cumulative'])
# yscale('log')
xscale('log')
ylim([.75,1.01])
ax.set_title('CDF of calls per person among Italians in Florence city')
ax.set_ylabel('Proportion of users making x or fewer calls')
ax.set_xlabel('Number of calls')
plt.show()



In [21]:
f, ax = plt.subplots(figsize=(6,5), dpi=300)
ax.plot(frc_uc['calls_in_florence_comune'],frc_uc['ccdf'])
yscale('log')
xscale('log')
ax.set_title('CCDF of calls per person among Italians in Florence city')
ax.set_ylabel('Proportion of users making at least x calls')
ax.set_xlabel('Number of calls')
plt.show()



In [22]:
f, ax = plt.subplots(figsize=(6,5), dpi=300)
ax.stem(frc_dc['days_active_in_florence_comune'],frc_dc['frequency'], linestyle='steps--')
yscale('log')
xscale('log')
ax.set_title('Days active per person among Italians in Florence city')
ax.set_ylabel('Number of people with x days active')
ax.set_xlabel('Days active')
plt.show()



In [33]:
f, ax = plt.subplots(figsize=(6,5), dpi=300)
ax.plot(frc_dc['days_active_in_florence_comune'],frc_dc['cumulative'])
# yscale('log')
xscale('log')
ylim([.86,1.01])
ax.set_title('CDF of days active per person among Italians in Florence city')
ax.set_ylabel('Proportion of users active on x or fewer days')
ax.set_xlabel('Number of days active')
plt.show()



In [24]:
f, ax = plt.subplots(figsize=(6,5), dpi=300)
ax.plot(frc_dc['days_active_in_florence_comune'],frc_dc['ccdf'])
yscale('log')
xscale('log')
ax.set_title('CCDF of days active per person among Italians in Florence city')
ax.set_ylabel('Proportion of users active on at least x days')
ax.set_xlabel('Number of days a')
plt.show()



In [25]:
dfc['mean_calls_per_day'] = dfc['calls']/dfc['days_active']
dfc[dfc['calls_in_florence_comune']>0].head()


Out[25]:
cust_id home_region calls days_active days_active_in_florence_comune calls_in_florence_comune mean_calls_per_day
1 25440450 Campania 52588 48 20 120 1095.583333
2 29613826 Lazio 52546 46 26 320 1142.304348
6 2267462 Sicilia 35447 29 9 46 1222.310345
8 28437176 Puglia 31993 22 3 185 1454.227273
12 12558421 Toscana 26848 49 11 169 547.918367

In [26]:
print dfc[dfc['calls_in_florence_comune']>0]['mean_calls_per_day'].max()
print dfc[dfc['calls_in_florence_comune']>0]['mean_calls_per_day'].min()
print dfc[dfc['calls_in_florence_comune']>0]['mean_calls_per_day'].mean()
print dfc[dfc['calls_in_florence_comune']>0]['mean_calls_per_day'].median()
print dfc[dfc['calls_in_florence_comune']>0]['mean_calls_per_day'].std()
print dfc[dfc['calls_in_florence_comune']>0]['mean_calls_per_day'].std()*2+dfc[dfc['calls_in_florence_comune']>0]['mean_calls_per_day'].mean()
print dfc[dfc['calls_in_florence_comune']>0]['mean_calls_per_day'].std()*3+dfc[dfc['calls_in_florence_comune']>0]['mean_calls_per_day'].mean()


1461.88888889
1.0
16.2813373574
12.0
16.6621865933
49.6057105439
66.2678971372

In [27]:
dfc[(dfc['calls_in_florence_comune']>0)&(dfc['mean_calls_per_day']<1000)].plot.hist(y='mean_calls_per_day', logy=True, figsize=(15,10), bins=200)
plt.ylabel('Frequency')
plt.xlabel('Average calls per active day')
plt.axvline(150,color="black")
# plt.xlim([0,1000])
plt.title('Average calls per active day by foreign SIM cards who were in Florence city')


Out[27]:
<matplotlib.text.Text at 0x7f3e15b7dad0>

In [28]:
# dfc.plot.scatter(x='calls',y='days_active',figsize=(15,10),logy=True,logx=True)

In [41]:
f, ax = plt.subplots(figsize=(6,5), dpi=300)
ax.scatter(x=dfc['calls'],y=dfc['days_active'],s=.1)
yscale('log')
xscale('log')
ax.set_title('Calls by days active among Italians')
ax.set_xlabel('Calls')
ax.set_ylabel('Days active')
y=[1/10000, 1*200]
x2=[150/10000, 150*200]
plt.plot(x2,y,color='black',linewidth=.25,ls='dashed')
ax.axvline(4.5,color='black',linewidth=.25,ls='dashed')
ylim([.9,90])
xlim([.9,10*10000])
plt.show()



In [30]:
f, ax = plt.subplots(figsize=(6,5), dpi=300)
ax.scatter(x=dfc['calls'],y=dfc['calls_in_florence_comune'],s=.1)
yscale('log')
xscale('log')
ax.set_title('Total calls vs calls in Florence city, for Italians with calls in Florence')
ax.set_xlabel('Total calls')
ax.set_ylabel('Calls in Florence city')
xlim([.9,10*10000])
ylim([.9,10*10000])
plt.show()



In [44]:
f, ax = plt.subplots(figsize=(6,5), dpi=300)
ax.scatter(x=dfc['days_active'],y=dfc['days_active_in_florence_comune'],s=.1)
yscale('log')
xscale('log')
ax.set_title('Total days active vs days active in Florence city, for Italians with calls in Florence')
ax.set_xlabel('Total days active')
ax.set_ylabel('Days active in Florence city')
ylim([.9,80])
xlim([.9,80])
plt.show()



In [ ]: