In [1]:
#### Notebook to generate FirenzeCard analysis
#### Timeseries, and summary statistics
import sys
sys.path.append('../src/')
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline  
import psycopg2
from features.firenzecard import *
from IPython.core.debugger import Tracer

from scipy.stats import norm
from sklearn.neighbors import KernelDensity

In [26]:
def frequency(dataframe,columnname):
    """
    :param dataframe: a pandas dataframe
    :param columnname: a single column, with discrete (including integer) values
    :return: a frequency table, suitable for plotting the empirical PMF, empirical CDF, or empirical CCDF
    """
    out = dataframe[columnname].value_counts().to_frame()
    out.columns = ['frequency']
    out.index.name = columnname
    out.reset_index(inplace=True)
    out.sort_values('frequency',inplace=True,ascending=False)
    out['cumulative'] = out['frequency'].cumsum()/out['frequency'].sum()
    out['ccdf'] = 1 - out['cumulative']
    return out

In [24]:
df = pd.read_csv('../src/output/firenzedata_feature_extracted.csv')
df.columns


Out[24]:
Index([u'user_id', u'museum_id', u'entrances_per_card_per_museum',
       u'museum_name', u'longitude', u'latitude', u'short_name', u'string',
       u'entry_time', u'adults_first_use', u'adults_reuse', u'total_adults',
       u'minors', u'time', u'date', u'hour', u'day_of_week', u'total_people',
       u'time_since_previous_museum', u'total_duration_card_use',
       u'entry_is_adult', u'is_card_with_minors', u'is_in_museum_1',
       u'is_in_museum_2', u'is_in_museum_3', u'is_in_museum_4',
       u'is_in_museum_5', u'is_in_museum_6', u'is_in_museum_7',
       u'is_in_museum_8', u'is_in_museum_9', u'is_in_museum_10',
       u'is_in_museum_11', u'is_in_museum_12', u'is_in_museum_13',
       u'is_in_museum_14', u'is_in_museum_15', u'is_in_museum_16',
       u'is_in_museum_17', u'is_in_museum_18', u'is_in_museum_19',
       u'is_in_museum_20', u'is_in_museum_21', u'is_in_museum_22',
       u'is_in_museum_23', u'is_in_museum_24', u'is_in_museum_25',
       u'is_in_museum_26', u'is_in_museum_27', u'is_in_museum_28',
       u'is_in_museum_29', u'is_in_museum_30', u'is_in_museum_31',
       u'is_in_museum_32', u'is_in_museum_33', u'is_in_museum_34',
       u'is_in_museum_35', u'is_in_museum_36', u'is_in_museum_37',
       u'is_in_museum_38', u'is_in_museum_39'],
      dtype='object')

In [10]:
df1 = df.groupby(['user_id','entry_time']).agg({'time_since_previous_museum':'max', 'total_adults':'sum', 'total_people':'sum', 'museum_id':'max'})
# df1 = df.groupby(['user_id','entry_time','museum_id']).agg({'time_until_next_museum':['min','max'], 'total_adults':['sum'], 'total_people':['sum']})
df1.dropna(how="any",inplace=True)
df1 = df1[df1['time_since_previous_museum']>0]
df1.reset_index(inplace=True)
# df1[df1.time_until_next_museum['min']!=df1.time_until_next_museum['max']] # All have minors

In [11]:
df1.head()


Out[11]:
user_id entry_time total_people time_since_previous_museum total_adults museum_id
0 1459702 2016-06-22 14:26:00 1 4.36667 1 15
1 1459702 2016-06-22 15:49:00 1 1.38333 1 11
2 1459702 2016-06-23 09:43:00 1 17.9 1 3
3 1459702 2016-06-23 11:14:00 1 1.51667 1 29
4 1459702 2016-06-23 12:57:00 1 1.71667 1 23

In [12]:
df1[df1.total_people>1].head(20)


Out[12]:
user_id entry_time total_people time_since_previous_museum total_adults museum_id
26 1473906 2016-07-24 11:58:00 2 1.16667 1 25
76 2017453 2016-06-17 20:04:00 2 0.783333 1 41
189 2017468 2016-06-16 12:00:00 3 1.23333 1 37
190 2017468 2016-06-16 12:56:00 3 0.933333 1 11
191 2017468 2016-06-17 12:34:00 3 23.6333 1 38
192 2017468 2016-06-18 10:45:00 3 22.1833 1 10
193 2017468 2016-06-18 13:17:00 3 2.53333 1 23
194 2017468 2016-06-18 14:58:00 3 1.68333 1 41
195 2017468 2016-06-18 15:56:00 3 0.966667 1 29
198 2017470 2016-06-16 12:07:00 3 2.43333 1 3
199 2017470 2016-06-16 14:19:00 3 2.2 1 11
260 2017487 2016-06-14 14:04:00 3 1.83333 1 32
261 2017487 2016-06-14 21:35:00 3 7.51667 1 23
262 2017487 2016-06-15 11:26:00 3 13.85 1 10
263 2017487 2016-06-15 13:13:00 3 1.78333 1 3
264 2017487 2016-06-15 16:10:00 3 2.95 1 11
270 2017489 2016-06-15 13:46:00 2 25.7167 1 29
317 2017800 2016-06-20 14:22:00 2 21.5 1 3
433 2017818 2016-06-24 10:58:00 3 22.6667 1 38
453 2017821 2016-06-22 14:33:00 2 2.23333 1 29

In [13]:
# df[(df['user_id']==2017470)].sort_values('entry_time')

In [14]:
x1 = df1.loc[np.repeat(df1.index.values,df1['total_people'])]['time_since_previous_museum']
x1.head()


Out[14]:
0    4.36667
1    1.38333
2       17.9
3    1.51667
4    1.71667
Name: time_since_previous_museum, dtype: object

In [15]:
x2 = df1['time_since_previous_museum']

In [16]:
df1.shape


Out[16]:
(309330, 6)

In [17]:
x1.shape


Out[17]:
(340810,)

In [18]:
x2.shape


Out[18]:
(309330,)

In [28]:
# time_until_next_museum
trace1 = go.Histogram(x=x1, 
                     xbins=dict(start=np.min(x1), 
                                size=0.25, end=np.max(x1)), 
                     marker=dict(color='#CC171D'), 
                     name='Firenze Cards'
                    )
trace2 = go.Histogram(x=x2, 
                     xbins=dict(start=np.min(x2), 
                                size=0.25, end=np.max(x2)), 
                     marker=dict(color='#1789CC'), 
                     name='Total People on Visit'
                    )

layout = go.Layout(
    title="Time Between Museum Visits",
    titlefont=dict(size=28),
    barmode='stack',
    legend=dict(
        x=0.8,
        y=0.9,
        traceorder='normal',
        font=dict(
            family='sans-serif',
            size=16,
            color='#000'
        ),
        bgcolor='#FFFFFF',
        bordercolor='#E2E2E2',
        borderwidth=2
    ),
    width=1200,
    height=800,
    xaxis=dict(
        title='Time Gap in Hours (15 minute bins)',
        titlefont=dict(size=20),
        nticks=32,
        ticks='outside',
        tickfont=dict(size=16)
    ),
    yaxis=dict(
        title='Number of People-Visits with Given Time Gap',
        titlefont=dict(size=20),
        ticks='outside',
        tickfont=dict(size=16)
    )
)
fig = go.Figure(data=go.Data([trace1,trace2]), layout=layout)
py.iplot(fig, filename='TUNME', sharing='private', auto_open=False)


Out[28]:

In [185]:
fr = frequency(df.groupby(['user_id','entry_time','museum_id']).sum()['total_people'].to_frame(),'total_people')
fr


Out[185]:
total_people frequency cumulative ccdf
0 1 333331 0.924974 0.075026
1 2 19464 0.978985 0.021015
2 3 5909 0.995382 0.004618
3 4 1512 0.999578 0.000422
4 5 111 0.999886 0.000114
5 6 19 0.999939 0.000061
6 7 7 0.999958 0.000042
7 10 6 0.999975 0.000025
8 14 2 0.999981 0.000019
9 12 2 0.999986 0.000014
10 8 2 0.999992 0.000008
11 20 1 0.999994 0.000006
12 15 1 0.999997 0.000003
13 13 1 1.000000 0.000000

In [192]:
100-(fr['cumulative'].sub(fr['cumulative'].shift())*100).sum()


Out[192]:
92.497391555299018

In [193]:
fr['cumulative'].sub(fr['cumulative'].shift())*100


Out[193]:
0          NaN
1     5.401145
2     1.639713
3     0.419571
4     0.030802
5     0.005272
6     0.001942
7     0.001665
8     0.000555
9     0.000555
10    0.000555
11    0.000277
12    0.000277
13    0.000277
Name: cumulative, dtype: float64

In [ ]:
# # Card use count
# total_card_use_count = pd.DataFrame(df.groupby('user_id', as_index=True).size().rename('total_card_use_count'))
# df = pd.merge(total_card_use_count.reset_index(), df, on=['user_id'], how='inner')

# trace = go.Histogram(x=df['total_card_use_count'], xbins=dict(start=np.min(df['total_card_use_count']), size=1, end=np.max(df['total_card_use_count'])),
#                      marker=dict(color='rgb(0, 0, 100)'))
# layout = go.Layout(
#     title="Card use count",
#     legend=dict(
#         x=1,
#         y=1,
#         traceorder='normal',
#         font=dict(
#             family='sans-serif',
#             size=12,
#             color='#000'
#         ),
#         bgcolor='#E2E2E2',
#         bordercolor='#FFFFFF',
#         borderwidth=2
#     )
# )
# fig = go.Figure(data=go.Data([trace]), layout=layout)
# py.iplot(fig, filename='CUC', sharing='private', auto_open=False)

In [29]:
df2 = df.groupby(['user_id','entry_time','museum_id']).sum()[['total_people','total_adults']].reset_index()
df2.head()


Out[29]:
user_id entry_time museum_id total_people total_adults
0 1459702 2016-06-22 10:04:00 10 1 1
1 1459702 2016-06-22 14:26:00 15 1 1
2 1459702 2016-06-22 15:49:00 11 1 1
3 1459702 2016-06-23 09:43:00 3 1 1
4 1459702 2016-06-23 11:14:00 29 1 1

In [41]:
df3 = df2[['user_id','museum_id']].groupby('user_id').count().join(df2[['user_id','total_people','total_adults']].groupby('user_id').sum())
df3.columns = ['museums_visited','total_entries','adult_entries']
df3.head()


Out[41]:
museums_visited total_entries adult_entries
user_id
1459702 8 8 8
1473903 6 6 6
1473904 6 6 6
1473905 5 5 5
1473906 11 12 11

In [201]:
# Frequency plot of number of unique museums visited per card
x = df3['museums_visited']
trace1 = go.Histogram(x=x, xbins=dict(start=np.min(x)-.25, size=.5, end=np.max(x)+.25),
                     marker=dict(color='#CC171D'),
                     name = 'Museums visited')

# trace2 = go.Histogram(x=adult, xbins=dict(start=np.min(adult), size=1, end=np.max(adult)),
#                      marker=dict(color='rgb(0, 100, 0)'),
#                      name = 'Adults')

layout = go.Layout(
    title="Number of Museums Visited per Card",
    titlefont=dict(size=28),
#     legend=dict(
#         traceorder='normal',
#         font=dict(
#             family='sans-serif',
#             size=12,
#             color='#000'
#         ),
#         bgcolor='#CC171D',
#         bordercolor='#FFFFFF',
#         borderwidth=0
#     ),
    width=1200,
    height=800,
    xaxis=dict(
        title='Number of Museums Visited',
        titlefont=dict(size=20),
        range=[0.75,32.25],
        nticks=32,
        ticks='outside',
        tickfont=dict(size=16)
    ), 
    yaxis=dict(
        title='Number of Cards',
        titlefont=dict(size=20),
        ticks='outside',
        tickfont=dict(size=16)
    )
)
fig = go.Figure(data=go.Data([trace1]), layout=layout)
py.iplot(fig, filename='MPC_2', sharing='private', auto_open=False)


Out[201]:

In [204]:
x = df.groupby('user_id')['total_duration_card_use'].max()/pd.Timedelta('1 hour')
x.head()


Out[204]:
user_id
1459702    29.016667
1473903    52.183333
1473904    52.183333
1473905    25.166667
1473906    48.466667
Name: total_duration_card_use, dtype: float64

In [101]:
# kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(x.as_matrix()c
# kde


Out[101]:
KernelDensity(algorithm='auto', atol=0, bandwidth=0.75, breadth_first=True,
       kernel='gaussian', leaf_size=40, metric='euclidean',
       metric_params=None, rtol=0)

In [208]:
trace = go.Histogram(x=x[x>0],
                     xbins=dict(start=np.min(x), size=1.0/4.0,
                                end=np.max(x)),
                     marker=dict(color='#CC171D'))

layout = go.Layout(
    title="Duration of Card Usage",
    titlefont=dict(size=28),
    width=1200,
    height=800,
    xaxis=dict(
        title='Hours Between First and Last Use of Card (bins of 15 minutes)',
        titlefont=dict(size=20),
        range=[0,72],
        nticks=32,
        ticks='outside',
        tickfont=dict(size=16)
    ),
    yaxis=dict(
        title='Number of Cards',
        titlefont=dict(size=20),
        ticks='outside',
        tickfont=dict(size=16)
    )
)
fig = go.Figure(data=go.Data([trace]), layout=layout)
py.iplot(fig, filename='DOU_2', sharing='private', auto_open=False)


Out[208]:

In [196]:
# df['time_until_next_museum'] = df['time_until_next_museum'].apply(
#     lambda x: pd.Timedelta(x) / pd.Timedelta('1 hour'))
# trace = go.Histogram(x=df['time_until_next_museum'], xbins=dict(start=np.min(x), size=0.25, end=np.max(x)),
#                      marker=dict(color='rgb(0, 0, 100)'))
# layout = go.Layout(
#     title=""
# )
# fig = go.Figure(data=go.Data([trace]), layout=layout)
# py.iplot(fig, filename='TUNM', sharing='private', auto_open=False)

In [15]:
# Histogram of Monthly total museum entry data - from florence card 
#https://plot.ly/~qiweihan/110

In [194]:
# Histogram of Monthly total museum entry data - from National Museums
# Comparison of PROPORTION of Firenze card entries with museum totals (pie chart?)
%load_ext sql
#TODO: connect with dbutils
conn_str = ""
conn = psycopg2.connect(conn_str)
c = conn.cursor()

test = get_national_museums(conn, export_to_csv=True, export_path='../src/output/')
test = test[(test['visit_month'] == 'June') | (test['visit_month'] == 'July') | 
            (test['visit_month'] == 'August') | (test['visit_month'] == 'September')]


The sql extension is already loaded. To reload it, use:
  %reload_ext sql

In [195]:
trace = Bar(
        x=test['place'],
        y=test['total_visitors'],
        )

fig = go.Figure(data=go.Data([trace]))
fig['layout'].update(height=800, width=900, title='Stacked subplots')

py.iplot(fig, filename='State Museum Entries', sharing='private')


Out[195]:

In [18]:
# # Which museums are most popular? number of entries per museum, per date
# trace = Bar(
#         x=df[],
#         y=df[],
#         )

# fig = go.Figure(data=go.Data([trace]))
# fig['layout'].update(height=800, width=900, title='Stacked subplots')

# py.iplot(fig, filename='', sharing='private')

In [19]:
# Timeline of usage(per avg hour, calendar day, calendar month, weekday) - segment per museum

In [ ]:
# # Daytype of activation of card
# # get day of week for first use for every user
# trace = go.Histogram(x=df[''], xbins=dict(start=np.min(x), size=0.25, end=np.max(x)),
#                      marker=dict(color='rgb(0, 0, 100)'))
# layout = go.Layout(
#     title=""
# )
# fig = go.Figure(data=go.Data([trace]), layout=layout)
# py.iplot(fig, filename='daytype of activation', sharing='private', auto_open=False)

In [27]:
dotw = {0:'Monday',
       1:'Tuesday',
       2:'Wednesday',
       3:'Thursday',
       4:'Friday',
       5:'Saturday',
       6:'Sunday'}
x = df[df['adults_first_use']==1][['user_id','day_of_week']].groupby('user_id').mean()['day_of_week'].map(dotw).to_frame()
fr2 = frequency(x,'day_of_week')[['day_of_week','frequency']]
fr2


Out[27]:
day_of_week frequency
0 Tuesday 9959
1 Wednesday 8150
2 Friday 7601
3 Thursday 7592
4 Monday 6439
5 Saturday 5648
6 Sunday 4919

In [28]:
# Frequency plot of number of unique museums visited per card
trace = go.Bar(x=['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Satuday'], 
               y=[4919,6439,9959,8150,8150,7601,5648],
               marker=dict(color='#CC171D'))

layout = go.Layout(
    title="Day of Firenze Card Activation",
    titlefont=dict(size=28),
    width=1200,
    height=800,
    xaxis=dict(
        title='Day of the Week',
        titlefont=dict(size=20),
        nticks=7,
        ticks='outside',
        tickfont=dict(size=16)
    ),
    yaxis=dict(
        title='Number of Cards Activated',
        titlefont=dict(size=20),
        ticks='outside',
        tickfont=dict(size=16)
    )
)
fig = go.Figure(data=go.Data([trace]), layout=layout)
py.iplot(fig, filename='daytype of activation', sharing='private', auto_open=False)


Out[28]:

In [ ]: