In [1]:
#### Notebook to generate FirenzeCard analysis
#### Timeseries, and summary statistics
import sys
sys.path.append('../src/')
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import psycopg2
from features.firenzecard import *
from IPython.core.debugger import Tracer
from scipy.stats import norm
from sklearn.neighbors import KernelDensity
In [26]:
def frequency(dataframe, columnname, sort_by='frequency'):
    """
    Build a frequency table for one discrete column.

    :param dataframe: a pandas dataframe
    :param columnname: a single column, with discrete (including integer) values
    :param sort_by: 'frequency' (default) orders rows from most to least common,
        so 'cumulative' is the share covered by the top-k values. 'value' orders
        rows by the column's own values, so 'cumulative' and 'ccdf' are the
        empirical CDF and CCDF.
    :return: a dataframe with columns [columnname, 'frequency', 'cumulative',
        'ccdf'], suitable for plotting the empirical PMF and, with
        sort_by='value', the empirical CDF or CCDF
    :raises ValueError: if sort_by is neither 'frequency' nor 'value'
    """
    if sort_by not in ('frequency', 'value'):
        raise ValueError("sort_by must be 'frequency' or 'value'")

    # value_counts drops missing values, so the shares below are relative to
    # the non-null observations only.
    counts = dataframe[columnname].value_counts()
    out = counts.rename('frequency').rename_axis(columnname).reset_index()

    # A stable sort keeps tied rows in the order value_counts produced them.
    if sort_by == 'value':
        out = out.sort_values(columnname, kind='mergesort')
    else:
        out = out.sort_values('frequency', ascending=False, kind='mergesort')
    out = out.reset_index(drop=True)

    out['cumulative'] = out['frequency'].cumsum() / out['frequency'].sum()
    out['ccdf'] = 1 - out['cumulative']
    return out
In [24]:
# Load the feature-extracted FirenzeCard records, then list the available columns.
df = pd.read_csv(filepath_or_buffer='../src/output/firenzedata_feature_extracted.csv')
df.columns
Out[24]:
In [10]:
# One row per (user_id, entry_time): the largest time gap since the previous
# museum, the summed adult and people counts, and the largest museum_id.
# Rows with any missing aggregate, or with a non-positive gap, are discarded.
df1 = (
    df.groupby(['user_id', 'entry_time'])
    .agg({'time_since_previous_museum': 'max',
          'total_adults': 'sum',
          'total_people': 'sum',
          'museum_id': 'max'})
    .dropna(how="any")
    .loc[lambda grouped: grouped['time_since_previous_museum'] > 0]
    .reset_index()
)
# Earlier variant, kept for reference:
# df1 = df.groupby(['user_id','entry_time','museum_id']).agg({'time_until_next_museum':['min','max'], 'total_adults':['sum'], 'total_people':['sum']})
# df1[df1.time_until_next_museum['min']!=df1.time_until_next_museum['max']] # All have minors
In [11]:
# Preview the first rows of the per-card, per-entry aggregate.
df1.head()
Out[11]:
In [12]:
# Preview entries where a single card swipe admitted more than one person.
df1.loc[df1['total_people'] > 1].head(20)
Out[12]:
In [13]:
# df[(df['user_id']==2017470)].sort_values('entry_time')
In [14]:
# Person-level sample: each row's time gap is repeated once per person on that
# entry (row labels are repeated total_people times, then looked up).
x1 = df1['time_since_previous_museum'].loc[np.repeat(df1.index.values, df1['total_people'])]
x1.head()
Out[14]:
In [15]:
# Card-level sample: one time gap per row of df1 (no repetition per person).
x2 = df1['time_since_previous_museum']
In [16]:
# Number of (user_id, entry_time) rows kept after filtering, and column count.
df1.shape
Out[16]:
In [17]:
# Length of the person-level sample (each df1 row repeated total_people times).
x1.shape
Out[17]:
In [18]:
# Length of the card-level sample; one value per df1 row.
x2.shape
Out[18]:
In [28]:
# Histogram of time_since_previous_museum (hours) in 15-minute bins.
# x2 holds one gap per card entry, while x1 repeats each gap once per person
# admitted on that entry. The card-level series therefore carries the
# 'Firenze Cards' label and the person-level series carries
# 'Total People on Visit' (the two were previously attached to the wrong data).
# NOTE(review): barmode='stack' adds the two series together, so bar heights
# are cards + people; confirm that stacking (rather than overlay) is intended.
trace1 = go.Histogram(x=x2,
                      xbins=dict(start=np.min(x2),
                                 size=0.25, end=np.max(x2)),
                      marker=dict(color='#CC171D'),
                      name='Firenze Cards'
                      )
trace2 = go.Histogram(x=x1,
                      xbins=dict(start=np.min(x1),
                                 size=0.25, end=np.max(x1)),
                      marker=dict(color='#1789CC'),
                      name='Total People on Visit'
                      )
layout = go.Layout(
    title="Time Between Museum Visits",
    titlefont=dict(size=28),
    barmode='stack',
    legend=dict(
        x=0.8,
        y=0.9,
        traceorder='normal',
        font=dict(
            family='sans-serif',
            size=16,
            color='#000'
        ),
        bgcolor='#FFFFFF',
        bordercolor='#E2E2E2',
        borderwidth=2
    ),
    width=1200,
    height=800,
    xaxis=dict(
        title='Time Gap in Hours (15 minute bins)',
        titlefont=dict(size=20),
        nticks=32,
        ticks='outside',
        tickfont=dict(size=16)
    ),
    yaxis=dict(
        title='Number of People-Visits with Given Time Gap',
        titlefont=dict(size=20),
        ticks='outside',
        tickfont=dict(size=16)
    )
)
fig = go.Figure(data=go.Data([trace1, trace2]), layout=layout)
py.iplot(fig, filename='TUNME', sharing='private', auto_open=False)
Out[28]:
In [185]:
# Distribution of party size per (card, entry time, museum) visit.
# The column is selected before summing so only total_people is aggregated,
# rather than summing every column (including non-numeric ones) and then
# discarding all but one.
fr = frequency(
    df.groupby(['user_id', 'entry_time', 'museum_id'])['total_people'].sum().to_frame(),
    'total_people',
)
fr
Out[185]:
In [192]:
# 100 minus the summed row-to-row increments of 'cumulative' (in percent).
# The first row has no predecessor, so its increment is NaN and is skipped by sum().
100 - fr['cumulative'].diff().mul(100).sum()
Out[192]:
In [193]:
# Row-to-row increase of the cumulative share, expressed in percent.
fr['cumulative'].diff() * 100
Out[193]:
In [ ]:
# # Card use count
# total_card_use_count = pd.DataFrame(df.groupby('user_id', as_index=True).size().rename('total_card_use_count'))
# df = pd.merge(total_card_use_count.reset_index(), df, on=['user_id'], how='inner')
# trace = go.Histogram(x=df['total_card_use_count'], xbins=dict(start=np.min(df['total_card_use_count']), size=1, end=np.max(df['total_card_use_count'])),
# marker=dict(color='rgb(0, 0, 100)'))
# layout = go.Layout(
# title="Card use count",
# legend=dict(
# x=1,
# y=1,
# traceorder='normal',
# font=dict(
# family='sans-serif',
# size=12,
# color='#000'
# ),
# bgcolor='#E2E2E2',
# bordercolor='#FFFFFF',
# borderwidth=2
# )
# )
# fig = go.Figure(data=go.Data([trace]), layout=layout)
# py.iplot(fig, filename='CUC', sharing='private', auto_open=False)
In [29]:
# Total people and adults per (card, entry time, museum) visit.
# Only the two needed columns are summed; summing the whole frame first would
# also aggregate unrelated and non-numeric columns before throwing them away.
df2 = (
    df.groupby(['user_id', 'entry_time', 'museum_id'])[['total_people', 'total_adults']]
    .sum()
    .reset_index()
)
df2.head()
Out[29]:
In [41]:
# Per-card summary: number of visit rows (non-null museum_id count), total
# people admitted, and total adults admitted.
# NOTE(review): 'museums_visited' is a count of visit rows, not of distinct
# museums; confirm a card cannot appear at the same museum more than once.
df3 = (
    df2.groupby('user_id')
    .agg({'museum_id': 'count', 'total_people': 'sum', 'total_adults': 'sum'})
    [['museum_id', 'total_people', 'total_adults']]
)
df3.columns = ['museums_visited', 'total_entries', 'adult_entries']
df3.head()
Out[41]:
In [201]:
# Histogram of df3['museums_visited']: how many museum visits each card recorded.
# Bins are half a unit wide and offset by a quarter so integer counts sit mid-bin.
x = df3['museums_visited']
trace1 = go.Histogram(
    x=x,
    xbins={'start': np.min(x) - .25, 'size': .5, 'end': np.max(x) + .25},
    marker={'color': '#CC171D'},
    name='Museums visited',
)
# trace2 = go.Histogram(x=adult, xbins=dict(start=np.min(adult), size=1, end=np.max(adult)),
#                       marker=dict(color='rgb(0, 100, 0)'),
#                       name = 'Adults')
layout = go.Layout(
    title="Number of Museums Visited per Card",
    titlefont={'size': 28},
    # legend=dict(
    #     traceorder='normal',
    #     font=dict(
    #         family='sans-serif',
    #         size=12,
    #         color='#000'
    #     ),
    #     bgcolor='#CC171D',
    #     bordercolor='#FFFFFF',
    #     borderwidth=0
    # ),
    width=1200,
    height=800,
    xaxis={
        'title': 'Number of Museums Visited',
        'titlefont': {'size': 20},
        'range': [0.75, 32.25],
        'nticks': 32,
        'ticks': 'outside',
        'tickfont': {'size': 16},
    },
    yaxis={
        'title': 'Number of Cards',
        'titlefont': {'size': 20},
        'ticks': 'outside',
        'tickfont': {'size': 16},
    },
)
fig = go.Figure(data=go.Data([trace1]), layout=layout)
py.iplot(fig, filename='MPC_2', sharing='private', auto_open=False)
Out[201]:
In [204]:
# Hours between first and last use of each card.
# df is loaded from CSV, so the duration column arrives as text; convert it to
# timedeltas before taking the per-card maximum. Without this, max() compares
# strings lexicographically and the division by a Timedelta raises.
# NOTE(review): assumes the column holds timedelta-like values (e.g.
# '1 days 02:30:00'); confirm against the feature-extraction output.
x = (
    pd.to_timedelta(df['total_duration_card_use'])
    .groupby(df['user_id'])
    .max()
    / pd.Timedelta('1 hour')
)
x.head()
Out[204]:
In [101]:
# kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(x.as_matrix()c
# kde
Out[101]:
In [208]:
# Histogram of card usage duration in hours, 15-minute bins, for cards with a
# strictly positive duration. Bin limits are taken from the unfiltered series x.
trace = go.Histogram(
    x=x[x > 0],
    xbins={'start': np.min(x), 'size': 1.0 / 4.0, 'end': np.max(x)},
    marker={'color': '#CC171D'},
)
layout = go.Layout(
    title="Duration of Card Usage",
    titlefont={'size': 28},
    width=1200,
    height=800,
    xaxis={
        'title': 'Hours Between First and Last Use of Card (bins of 15 minutes)',
        'titlefont': {'size': 20},
        'range': [0, 72],
        'nticks': 32,
        'ticks': 'outside',
        'tickfont': {'size': 16},
    },
    yaxis={
        'title': 'Number of Cards',
        'titlefont': {'size': 20},
        'ticks': 'outside',
        'tickfont': {'size': 16},
    },
)
fig = go.Figure(data=go.Data([trace]), layout=layout)
py.iplot(fig, filename='DOU_2', sharing='private', auto_open=False)
Out[208]:
In [196]:
# df['time_until_next_museum'] = df['time_until_next_museum'].apply(
# lambda x: pd.Timedelta(x) / pd.Timedelta('1 hour'))
# trace = go.Histogram(x=df['time_until_next_museum'], xbins=dict(start=np.min(x), size=0.25, end=np.max(x)),
# marker=dict(color='rgb(0, 0, 100)'))
# layout = go.Layout(
# title=""
# )
# fig = go.Figure(data=go.Data([trace]), layout=layout)
# py.iplot(fig, filename='TUNM', sharing='private', auto_open=False)
In [15]:
# Histogram of Monthly total museum entry data - from florence card
#https://plot.ly/~qiweihan/110
In [194]:
# Histogram of Monthly total museum entry data - from National Museums
# Comparison of PROPORTION of Firenze card entries with museum totals (pie chart?)
%load_ext sql
#TODO: connect with dbutils
conn_str = ""
conn = psycopg2.connect(conn_str)
c = conn.cursor()
test = get_national_museums(conn, export_to_csv=True, export_path='../src/output/')
test = test[(test['visit_month'] == 'June') | (test['visit_month'] == 'July') |
(test['visit_month'] == 'August') | (test['visit_month'] == 'September')]
In [195]:
# Bar chart of total visitors per place from the national museums table.
# NOTE(review): the title 'Stacked subplots' looks like a leftover placeholder;
# confirm the intended chart title.
trace = Bar(x=test['place'], y=test['total_visitors'])
fig = go.Figure(data=go.Data([trace]))
fig['layout'].update(title='Stacked subplots', width=900, height=800)
py.iplot(fig, filename='State Museum Entries', sharing='private')
Out[195]:
In [18]:
# # Which museums are most popular? number of entries per museum, per date
# trace = Bar(
# x=df[],
# y=df[],
# )
# fig = go.Figure(data=go.Data([trace]))
# fig['layout'].update(height=800, width=900, title='Stacked subplots')
# py.iplot(fig, filename='', sharing='private')
In [19]:
# Timeline of usage(per avg hour, calendar day, calendar month, weekday) - segment per museum
In [ ]:
# # Daytype of activation of card
# # get day of week for first use for every user
# trace = go.Histogram(x=df[''], xbins=dict(start=np.min(x), size=0.25, end=np.max(x)),
# marker=dict(color='rgb(0, 0, 100)'))
# layout = go.Layout(
# title=""
# )
# fig = go.Figure(data=go.Data([trace]), layout=layout)
# py.iplot(fig, filename='daytype of activation', sharing='private', auto_open=False)
In [27]:
# Weekday code -> name (0 = Monday ... 6 = Sunday).
dotw = {0: 'Monday',
        1: 'Tuesday',
        2: 'Wednesday',
        3: 'Thursday',
        4: 'Friday',
        5: 'Saturday',
        6: 'Sunday'}
# Day of the week on which each card was first used.
# Weekday codes are categorical, so they must not be averaged: a card with
# flagged rows on Sunday (6) and Monday (0) would average to 3 ('Thursday'),
# and any non-integer mean would map to NaN and silently drop the card.
# Take the weekday of the earliest flagged row instead.
# NOTE(review): assumes entry_time sorts chronologically as stored; confirm.
x = (
    df[df['adults_first_use'] == 1]
    .sort_values('entry_time', kind='mergesort')
    .groupby('user_id')['day_of_week']
    .first()
    .map(dotw)
    .to_frame()
)
fr2 = frequency(x, 'day_of_week')[['day_of_week', 'frequency']]
fr2
Out[27]:
In [28]:
# Bar chart of the weekday on which each Firenze Card was activated.
# Counts are read from fr2 (the weekday frequency table) instead of being typed
# in by hand, so the chart cannot drift from the data; a weekday with no
# activations is plotted as 0.
day_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
activations = fr2.set_index('day_of_week')['frequency'].reindex(day_order, fill_value=0)
trace = go.Bar(x=day_order,
               y=activations.tolist(),
               marker=dict(color='#CC171D'))
layout = go.Layout(
    title="Day of Firenze Card Activation",
    titlefont=dict(size=28),
    width=1200,
    height=800,
    xaxis=dict(
        title='Day of the Week',
        titlefont=dict(size=20),
        nticks=7,
        ticks='outside',
        tickfont=dict(size=16)
    ),
    yaxis=dict(
        title='Number of Cards Activated',
        titlefont=dict(size=20),
        ticks='outside',
        tickfont=dict(size=16)
    )
)
fig = go.Figure(data=go.Data([trace]), layout=layout)
py.iplot(fig, filename='daytype of activation', sharing='private', auto_open=False)
Out[28]:
In [ ]: