In [ ]:
#############################################
#### Exploratory Analysis of FirenzeCard data
#############################################
# import libraries
import sys
sys.path.append('../src/')
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import psycopg2
from features.firenzecard import *
In [ ]:
# Load the feature-extracted FirenzeCard data from CSV instead of querying the database.
# The database connection is left disabled here:
# connection = connect(host='', port=)
# NOTE(review): `connection` is still passed to plot_national_museum_entries in the
# "State Museum Visits" cell; unless the star import provides it, that cell needs
# the line above re-enabled with real credentials - confirm.
df = pd.read_csv(filepath_or_buffer='../src/output/firenzedata_feature_extracted.csv')
In [ ]:
########################
# Card Usage Behaviour
########################
# Number of cards = number of distinct values in the `user_id` column.
distinct_card_ids = df['user_id'].unique()
print('How many Firenzecards are there?', len(distinct_card_ids))
In [ ]:
# How many cards were activated?
# Counted as the rows where `adults_first_use` equals 1 (presumably one such row
# per activated card - confirm against the feature extraction).
# The count is printed explicitly: a notebook cell only auto-displays its LAST
# expression, so a bare `len(...)` in the middle of the cell is silently discarded.
activated_rows = df[df['adults_first_use'] == 1]
print('How many cards were activated?', len(activated_rows))

# What is the most common day of activation?
day_of_activation, plot_url_activation = plot_day_of_activation(df, plotname='DOA')
plot_url_activation  # last expression of the cell, so the notebook displays it
In [ ]:
# Distinct cards per total-usage bucket (buckets are disjoint, not cumulative).
# Printed in order: 24h or less, then 24 - 48h, then 48 - 72h.
usage_duration = df['total_duration_card_use']
usage_buckets = (
    usage_duration <= 24,
    (usage_duration > 24) & (usage_duration <= 48),
    (usage_duration > 48) & (usage_duration <= 72),
)
for in_bucket in usage_buckets:
    # Select the rows of this bucket, then count the distinct card ids among them.
    print(len(df[in_bucket].user_id.unique()))
In [ ]:
# Museums visited per card (the original note also mentions a per-day view).
# The helper returns a pair: the computed result and a plot URL.
total_museums_per_card, plot_url1 = plot_museums_visited_per_card(
    df,
    plotname1='Number-museums-per-card',
)
plot_url1  # last expression of the cell, so the notebook displays it
In [ ]:
########################
# Popular Museums
########################
# Which museums are the most popular?
# The helper returns a pair: the computed result and a plot URL.
popular_museums, plot_url2 = plot_museum_aggregate_entries(
    df,
    plotname='PM',
)
plot_url2  # last expression of the cell, so the notebook displays it
In [ ]:
########################
# State Museum Visits
########################
# Unlike the other cells, this one works from a database connection rather than `df`,
# and it is asked to export its result as CSV under ../src/output/.
# NOTE(review): `connection` is not assigned anywhere in the visible notebook (the
# connect(...) call in the data-loading cell is commented out); unless the star
# import provides it, this cell raises NameError - confirm.
national_museum_entries, plot_url3 = plot_national_museum_entries(
    connection,
    export_to_csv=True,
    export_path='../src/output/',
)
plot_url3  # last expression of the cell, so the notebook displays it
In [ ]:
# How many cards are entering museums with minors? What proportion of all cards is this?
# Keep only the rows flagged as "card with minors", then collapse them to one row
# per card (`user_id`) so each card is counted once.
minors = df[df['is_card_with_minors'] == 1]
minors = minors.groupby('user_id').size().to_frame()

# Answer the second question as well: share of all distinct cards.
# Guard against an empty dataset so the cell never divides by zero.
total_cards = len(df['user_id'].unique())
minors_share = float(len(minors)) / total_cards if total_cards else float('nan')
print('Proportion of all cards entering with minors:', minors_share)

len(minors)  # last expression of the cell: number of cards with minors
In [ ]:
##############################
# Entries in Museums over time
##############################
# Names passed to the entries-per-timedelta helper in the cells below.
# The final entry, 'All Museums', is the key those cells later use to select
# the combined result.
museum_list = [
    'Santa Croce',
    'Opera del Duomo',
    'Uffizi',
    'Accademia',
    'M. Casa Dante',
    'M. Palazzo Vecchio',
    'M. Galileo',
    'M. Bargello',
    'San Lorenzo',
    'M. Archeologico',
    'Pitti',
    'Cappelle Medicee',
    'M. Santa Maria Novella',
    'M. San Marco',
    'Laurenziana',
    'M. Innocenti',
    'Palazzo Strozzi',
    'Palazzo Medici',
    'Torre di Palazzo Vecchio',
    'Brancacci',
    'M. Opificio',
    'La Specola',
    'Orto Botanico',
    'V. Bardini',
    'M. Stefano Bardini',
    'M. Antropologia',
    'M. Ebraico',
    'M. Marini',
    'Casa Buonarroti',
    'M. Horne',
    'M. Ferragamo',
    'M. Novecento',
    'M. Palazzo Davanzati',
    'M. Geologia',
    'M. Civici Fiesole',
    'M. Stibbert',
    'M. Mineralogia',
    'M. Preistoria',
    'M. Calcio',
    'Primo Conti',
    'All Museums',
]
In [ ]:
# Museum entries per calendar date between 2016-06-01 and 2016-09-30.
# Plotting and CSV export are both switched off for this call.
date_query = dict(
    timedelta='date',
    start_date='2016-06-01',
    end_date='2016-09-30',
    plot=False,
    export_to_csv=False,
    export_path='../src/output/',
)
df_date, plot_urls = get_museum_entries_per_timedelta_and_plot(df, museum_list, **date_query)

# Keep the combined 'All Museums' result for the later timeline cell and preview it.
df2_date = df_date['All Museums']
df2_date.head()
In [ ]:
# Museum entries per hour between 2016-06-01 and 2016-09-30.
# Plotting and CSV export are both switched off for this call.
hour_query = dict(
    timedelta='hour',
    start_date='2016-06-01',
    end_date='2016-09-30',
    plot=False,
    export_to_csv=False,
    export_path='../src/output/',
)
df_hour, plot_urls = get_museum_entries_per_timedelta_and_plot(df, museum_list, **hour_query)

# Keep the combined 'All Museums' result (reused by the timeline, geomap and
# correlation cells) and preview it.
df2_hour = df_hour['All Museums']
df2_hour.head()
In [ ]:
# Museum entries per day of week between 2016-06-01 and 2016-09-30.
# Plotting and CSV export are both switched off for this call.
dow_query = dict(
    timedelta='day_of_week',
    start_date='2016-06-01',
    end_date='2016-09-30',
    plot=False,
    export_to_csv=False,
    export_path='../src/output/',
)
df_dow, plot_urls = get_museum_entries_per_timedelta_and_plot(df, museum_list, **dow_query)

# Keep the combined 'All Museums' result for the later timeline cell and preview it.
df2_dow = df_dow['All Museums']
df2_dow.head()
In [ ]:
# Usage timelines (per avg hour, calendar day, calendar month, weekday) - segment per museum.
# Inputs are the 'All Museums' results from the three cells above.
# NOTE(review): arguments are passed as (hour, date, day_of_week) while the results
# are unpacked as (hour, day_of_week, date) - confirm against get_timelines_of_usage.
timelines = get_timelines_of_usage(df2_hour, df2_date, df2_dow)
mean_entries_hour, mean_entries_dow, mean_entries_date = timelines
# To preview the results:
# mean_entries_hour.head(), mean_entries_dow.head(), mean_entries_date.head()
In [ ]:
# Button plot of museum entries per calendar date.
date, date_url = plot_timeseries_button_plot(
    df_date,
    timedelta='date',
    plotname='timeseries',
)
date_url  # last expression of the cell, so the notebook displays it
In [ ]:
# Button plot of museum entries per hour.
# NOTE(review): uses the same plotname ('timeseries') as the daily-entries cell;
# if plotname identifies the saved plot, this call may replace the daily one - confirm.
hour, hour_url = plot_timeseries_button_plot(
    df_hour,
    timedelta='hour',
    plotname='timeseries',
)
hour_url  # last expression of the cell, so the notebook displays it
In [ ]:
# Button plot of museum entries per day of week.
dow, dow_url = plot_timeseries_button_plot(
    df_dow,
    timedelta='day_of_week',
    plotname='testing',
)
dow_url  # last expression of the cell, so the notebook displays it
In [ ]:
##########################
## Geographic Timeseries map
##########################
# Which museums are full, and which are rather empty, at different times of the day?
# Are they located next to each other?
import os

# Allow the Mapbox credential to be supplied through the MAPBOX_ACCESS_TOKEN
# environment variable instead of editing the notebook. When the variable is
# unset or empty, fall back to the token that was previously hard-coded so the
# cell keeps working exactly as before.
# NOTE(review): the fallback token is still committed to source - consider removing
# it once every environment sets MAPBOX_ACCESS_TOKEN.
mapbox_token = os.environ.get('MAPBOX_ACCESS_TOKEN') or (
    'pk.eyJ1IjoiY2hlbHNlYXBsb3RseSIsImEiOiJjaXFqeXVzdDkwMHFrZnRtOGtlMGtwcGs4In0.SLidkdBMEap9POJGIe1eGw'
)

# Map for a single day (2016-07-10) built from the raw data plus the hourly
# 'All Museums' result; min/max timedelta are presumably hours of the day - confirm.
data, geomap_plot_url = plot_geomap_timeseries(
    df,
    df2_hour,
    date_to_plot='2016-07-10',
    plotname='map-test',
    mapbox_access_token=mapbox_token,
    min_timedelta=7,
    max_timedelta=23,
)
geomap_plot_url  # last expression of the cell, so the notebook displays it
In [ ]:
######################
# Museum timeseries correlations
######################
# Distinct museum ids found in the data, passed to the helper as `lst`.
lst = list(df['museum_id'].unique())

# Spearman correlation on the hourly 'All Museums' result; the helper is also
# asked to export its output as CSV under ../src/output/.
correlation_options = dict(
    corr_method='spearman',
    timedelta='hour',
    timedelta_subset=False,
    timedeltamin=0,
    timedeltamax=3,
    below_threshold=-0.7,
    above_threshold=0.7,
    export_to_csv=True,
    export_path='../src/output/',
)
corr_matrix, high_corr, inverse_corr = get_correlation_matrix(
    df=df2_hour,
    lst=lst,
    **correlation_options
)
inverse_corr.head()  # last expression of the cell, so the notebook displays it
In [ ]: