notebook.community

Edit and run



In [ ]:

    
#############################################
#### Exploratory Analysis of FirenzeCard data
#############################################

# import libraries
import os
import sys
sys.path.append('../src/')
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline  
import pandas as pd
import psycopg2
from features.firenzecard import *



In [ ]:

    
# The database connection is left disabled here; the analysis reads the
# pre-extracted feature table from CSV instead.
# connection = connect(host='', port=)

FEATURE_CSV_PATH = '../src/output/firenzedata_feature_extracted.csv'
df = pd.read_csv(FEATURE_CSV_PATH)



In [ ]:

    
########################
# Card Usage Behaviour
########################

# Number of cards: every distinct `user_id` in the data is counted as one card.
distinct_cards = df['user_id'].unique()
print('How many Firenzecards are there?', len(distinct_cards))



In [ ]:

    
# How many cards were activated?
# The count is printed explicitly: a notebook only renders a bare expression
# when it is the last statement of the cell, so it was previously discarded.
# NOTE(review): this counts rows with adults_first_use == 1 - presumably one
# such row per card; confirm against the feature extraction.
print('How many cards were activated?', len(df[df['adults_first_use'] == 1]))

# What is the most common day of activation?
day_of_activation, plot_url_activation = plot_day_of_activation(df, plotname='DOA')
plot_url_activation



In [ ]:

    
# Distinct users per total-usage bucket (buckets are disjoint, not cumulative):
#   up to 24h, 24-48h and 48-72h, in that order.
hours_used = df['total_duration_card_use']

used_up_to_24h = hours_used <= 24
used_24_to_48h = (hours_used > 24) & (hours_used <= 48)
used_48_to_72h = (hours_used > 48) & (hours_used <= 72)

for bucket_mask in (used_up_to_24h, used_24_to_48h, used_48_to_72h):
    print(len(df[bucket_mask].user_id.unique()))



In [ ]:

    
# Museums visited per card / per day.
# The helper returns two values; the second one is shown as the cell output.
total_museums_per_card, plot_url1 = plot_museums_visited_per_card(
    df,
    plotname1='Number-museums-per-card'
)
plot_url1



In [ ]:

    
########################
# Popular Museums
########################

# Aggregate entries per museum to see which museums are the most popular;
# the second return value is shown as the cell output.
popular_museums, plot_url2 = plot_museum_aggregate_entries(
    df,
    plotname='PM'
)
plot_url2



In [ ]:

    
########################
# State Museum Visits
########################

# This step needs a database `connection`, but the only assignment to that
# name in the notebook is commented out. On a fresh kernel the call therefore
# raised a NameError; skip with a message instead so that
# "Restart & Run All" can complete.
if 'connection' in globals():
    national_museum_entries, plot_url3 = plot_national_museum_entries(
        connection, export_to_csv=True, export_path='../src/output/')
else:
    national_museum_entries, plot_url3 = None, None
    print('Skipping state museum entries: no database `connection` is defined.')

# Kept as the last top-level expression so the notebook renders it
# (a None value renders nothing).
plot_url3



In [ ]:

    
# How many cards are entering museums with minors? What proportion of all cards is this?
minors = df[df['is_card_with_minors'] == 1]
minors = minors.groupby('user_id').size().to_frame()  # one row per user_id

# The proportion asked for above was never computed. The total uses the same
# "distinct user_id" definition as the overall card count.
total_cards = len(df['user_id'].unique())
if total_cards:
    # float() keeps the division exact under Python 2 as well.
    share_with_minors = float(len(minors)) / total_cards
else:
    share_with_minors = float('nan')  # no cards at all: proportion undefined
print('Proportion of all cards with minors:', share_with_minors)

# Number of cards with minors (cell output, as before).
len(minors)



In [ ]:

    
##############################
# Entries in Museums over time
##############################

# Names passed to the entries-per-timedelta helper in the following cells.
# The final entry, 'All Museums', is also the key those cells read the
# result back with.
museum_list = [
    'Santa Croce',
    'Opera del Duomo',
    'Uffizi',
    'Accademia',
    'M. Casa Dante',
    'M. Palazzo Vecchio',
    'M. Galileo',
    'M. Bargello',
    'San Lorenzo',
    'M. Archeologico',
    'Pitti',
    'Cappelle Medicee',
    'M. Santa Maria Novella',
    'M. San Marco',
    'Laurenziana',
    'M. Innocenti',
    'Palazzo Strozzi',
    'Palazzo Medici',
    'Torre di Palazzo Vecchio',
    'Brancacci',
    'M. Opificio',
    'La Specola',
    'Orto Botanico',
    'V. Bardini',
    'M. Stefano Bardini',
    'M. Antropologia',
    'M. Ebraico',
    'M. Marini',
    'Casa Buonarroti',
    'M. Horne',
    'M. Ferragamo',
    'M. Novecento',
    'M. Palazzo Davanzati',
    'M. Geologia',
    'M. Civici Fiesole',
    'M. Stibbert',
    'M. Mineralogia',
    'M. Preistoria',
    'M. Calcio',
    'Primo Conti',
    'All Museums',
]



In [ ]:

    
# Museum entries per calendar date between 2016-06-01 and 2016-09-30
# (plotting and CSV export switched off).
# Note: `plot_urls` is reassigned by each of the per-timedelta cells.
df_date, plot_urls = get_museum_entries_per_timedelta_and_plot(
    df,
    museum_list,
    timedelta='date',
    start_date='2016-06-01',
    end_date='2016-09-30',
    plot=False,
    export_to_csv=False,
    export_path='../src/output/'
)

# Keep the 'All Museums' entry under its own name and preview it.
df2_date = df_date['All Museums']
df2_date.head()



In [ ]:

    
# Museum entries per hour between 2016-06-01 and 2016-09-30
# (plotting and CSV export switched off).
# Note: `plot_urls` is reassigned by each of the per-timedelta cells.
df_hour, plot_urls = get_museum_entries_per_timedelta_and_plot(
    df,
    museum_list,
    timedelta='hour',
    start_date='2016-06-01',
    end_date='2016-09-30',
    plot=False,
    export_to_csv=False,
    export_path='../src/output/'
)

# Keep the 'All Museums' entry under its own name and preview it.
df2_hour = df_hour['All Museums']
df2_hour.head()



In [ ]:

    
# Museum entries per day of week between 2016-06-01 and 2016-09-30
# (plotting and CSV export switched off).
# Note: `plot_urls` is reassigned by each of the per-timedelta cells.
df_dow, plot_urls = get_museum_entries_per_timedelta_and_plot(
    df,
    museum_list,
    timedelta='day_of_week',
    start_date='2016-06-01',
    end_date='2016-09-30',
    plot=False,
    export_to_csv=False,
    export_path='../src/output/'
)

# Keep the 'All Museums' entry under its own name and preview it.
df2_dow = df_dow['All Museums']
df2_dow.head()



In [ ]:

    
# Timeline of usage (per avg hour, calendar day, calendar month, weekday) - segment per museum
# NOTE(review): the inputs are passed as (hour, date, day-of-week) while the
# results are unpacked as (hour, day-of-week, date) - confirm this matches
# the helper's signature and return order.
(mean_entries_hour,
 mean_entries_dow,
 mean_entries_date) = get_timelines_of_usage(df2_hour, df2_date, df2_dow)
# To inspect: mean_entries_hour.head(), mean_entries_dow.head(), mean_entries_date.head()



In [ ]:

    
# Daily museum entries: plot from the per-date result and show the second
# return value as the cell output.
date, date_url = plot_timeseries_button_plot(
    df_date,
    timedelta='date',
    plotname='timeseries'
)
date_url



In [ ]:

    
# Hourly museum entries: plot from the per-hour result and show the second
# return value as the cell output.
# NOTE(review): this reuses plotname='timeseries' from the daily plot -
# confirm the two plots do not overwrite each other.
hour, hour_url = plot_timeseries_button_plot(
    df_hour,
    timedelta='hour',
    plotname='timeseries'
)
hour_url



In [ ]:

    
# Day-of-week museum entries: plot from the per-weekday result and show the
# second return value as the cell output.
# NOTE(review): plotname='testing' looks like a leftover placeholder - confirm.
dow, dow_url = plot_timeseries_button_plot(
    df_dow,
    timedelta='day_of_week',
    plotname='testing'
)
dow_url



In [ ]:

    
##########################
## Geographic Timeseries map
##########################

# Which museums are full, and which are rather empty, at different times of the day?
# Are they located next to each other?

# The Mapbox token can now be supplied through the MAPBOX_ACCESS_TOKEN
# environment variable, so personal tokens do not need to be pasted into the
# notebook.
# NOTE(review): the literal below is kept only as a backward-compatible
# fallback; it is committed in plain text and should be rotated and removed.
mapbox_access_token = os.environ.get(
    'MAPBOX_ACCESS_TOKEN',
    'pk.eyJ1IjoiY2hlbHNlYXBsb3RseSIsImEiOiJjaXFqeXVzdDkwMHFrZnRtOGtlMGtwcGs4In0.SLidkdBMEap9POJGIe1eGw')

data, geomap_plot_url = plot_geomap_timeseries(df, df2_hour, date_to_plot='2016-07-10',
                           plotname='map-test', mapbox_access_token=mapbox_access_token,
                           min_timedelta=7, max_timedelta=23)
geomap_plot_url



In [ ]:

    
######################
# Museum timeseries correlations
######################

# Distinct museum ids found in the data, handed to the helper as `lst`.
lst = list(df.museum_id.unique())

# Spearman correlation on the hourly 'All Museums' frame; pairs beyond +/-0.7
# come back as `high_corr` / `inverse_corr`, and results are exported as CSV.
# NOTE(review): timedeltamin/timedeltamax are supplied although
# timedelta_subset is False - presumably they are ignored; confirm.
corr_matrix, high_corr, inverse_corr = get_correlation_matrix(
    df=df2_hour,
    lst=lst,
    corr_method='spearman',
    timedelta='hour',
    timedelta_subset=False,
    timedeltamin=0,
    timedeltamax=3,
    below_threshold=-0.7,
    above_threshold=0.7,
    export_to_csv=True,
    export_path='../src/output/'
)

inverse_corr.head()



In [ ]: