notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
from fbprophet import Prophet
import matplotlib.pyplot as plt
import matplotlib.dates as mdates



In [2]:

    
train = pd.read_csv('ts_PassengerTraffic_Train.csv')
test = pd.read_csv('ts_PassengerTraffic_Test.csv')
print(train.shape)
train.head()









    



(18288, 3)






    Out[2]:







  
    
      
      ID
      Datetime
      Count
    
  
  
    
      0
      0
      25-08-2012 00:00
      8
    
    
      1
      1
      25-08-2012 01:00
      2
    
    
      2
      2
      25-08-2012 02:00
      6
    
    
      3
      3
      25-08-2012 03:00
      2
    
    
      4
      4
      25-08-2012 04:00
      2



In [3]:

    
test.head()









    Out[3]:







  
    
      
      ID
      Datetime
    
  
  
    
      0
      18288
      26-09-2014 00:00
    
    
      1
      18289
      26-09-2014 01:00
    
    
      2
      18290
      26-09-2014 02:00
    
    
      3
      18291
      26-09-2014 03:00
    
    
      4
      18292
      26-09-2014 04:00



In [4]:

    
# convert object to datetime format
train['Datetime'] = pd.to_datetime(train['Datetime'], format = '%d-%m-%Y %H:%M')
test['Datetime'] = pd.to_datetime(test['Datetime'], format = '%d-%m-%Y %H:%M')
train.head()









    Out[4]:







  
    
      
      ID
      Datetime
      Count
    
  
  
    
      0
      0
      2012-08-25 00:00:00
      8
    
    
      1
      1
      2012-08-25 01:00:00
      2
    
    
      2
      2
      2012-08-25 02:00:00
      6
    
    
      3
      3
      2012-08-25 03:00:00
      2
    
    
      4
      4
      2012-08-25 04:00:00
      2



In [5]:

    
train['hour'] = train['Datetime'].dt.hour
train['YearMonth'] = train['Datetime'].dt.year.astype(str).str.cat(train['Datetime'].dt.month.astype(str), sep='-')
train.head()









    Out[5]:







  
    
      
      ID
      Datetime
      Count
      hour
      YearMonth
    
  
  
    
      0
      0
      2012-08-25 00:00:00
      8
      0
      2012-8
    
    
      1
      1
      2012-08-25 01:00:00
      2
      1
      2012-8
    
    
      2
      2
      2012-08-25 02:00:00
      6
      2
      2012-8
    
    
      3
      3
      2012-08-25 03:00:00
      2
      3
      2012-8
    
    
      4
      4
      2012-08-25 04:00:00
      2
      4
      2012-8



In [40]:

    
ax = train[['Count', 'YearMonth']].plot(kind='bar', title ="Passenger Count per Month", 
                                        figsize=(15, 10), legend=True, fontsize=12)
ax.set_xlabel("Year-Month", fontsize=12)
ax.set_ylabel("Passenger Count", fontsize=12) 
ax.xaxis.set_major_locator(mdates.YearLocator(1))   # set xticks
ax.xaxis.set_major_formatter(mdates.DateFormatter('%y')) 
plt.setp(ax.xaxis.get_majorticklabels())
plt.show()



In [81]:

    
import plotly.offline as plty
import plotly.graph_objs as go
plty.init_notebook_mode()

# daily trend - it's growing with the population capacity
daily_count = train.groupby(train['Datetime'].dt.date)['Count'].sum()
plty.iplot([go.Scatter(
    x=daily_count.index,
    y=daily_count
)])



In [82]:

    
import plotly.offline as plty
import plotly.graph_objs as go
plty.init_notebook_mode()

# monthly trend
monthly_count = train.groupby('YearMonth')['Count'].sum()
print(monthly_count.head())
plty.iplot([go.Scatter(
    x=monthly_count.index,
    y=monthly_count
)])









    











    



YearMonth
2012-10     8174
2012-11    11396
2012-12    11666
2012-8       496
2012-9      3200
Name: Count, dtype: int64



In [6]:

    
# The data has so much noise, we can aggregate the data
## hourly average fraction
hourly_frac = train.groupby(['hour']).mean()/np.sum(train.groupby(['hour']).mean())
hourly_frac.drop(['ID'], axis = 1, inplace = True)
hourly_frac.columns = ['fraction']
hourly_frac.head()



In [7]:

    
train.index = train.Datetime
train.drop(['ID','hour','Datetime', 'YearMonth'], axis = 1, inplace = True)
train.head()









    Out[7]:







  
    
      
      Count
    
    
      Datetime
      
    
  
  
    
      2012-08-25 00:00:00
      8
    
    
      2012-08-25 01:00:00
      2
    
    
      2012-08-25 02:00:00
      6
    
    
      2012-08-25 03:00:00
      2
    
    
      2012-08-25 04:00:00
      2



In [8]:

    
daily_train = train.resample('D').sum()  # sum up daily counts
daily_train.head()



In [9]:

    
# Prophet needs to name the datetime as 'ds', and name the label as 'y'
daily_train['ds'] = daily_train.index
daily_train['y'] = daily_train.Count
daily_train.drop('Count', inplace=True, axis=1)
daily_train.head()



In [10]:

    
# fit the prophet model
import warnings
warnings.filterwarnings("ignore")

m = Prophet(daily_seasonality = True, seasonality_prior_scale=0.1)  # seasonality_prior_scale decides the influence of seasonality
m.fit(daily_train)









    Out[10]:





<fbprophet.forecaster.Prophet at 0x1b3b71fa518>

For all the param in Prophet(), check https://github.com/facebook/prophet/blob/master/python/fbprophet/forecaster.py



In [11]:

    
periods = int(test.shape[0]/24)
print(periods)
future = m.make_future_dataframe(periods=periods)
forecast = m.predict(future)



In [12]:

    
forecast.head()









    Out[12]:







  
    
      
      ds
      trend
      yhat_lower
      yhat_upper
      trend_lower
      trend_upper
      additive_terms
      additive_terms_lower
      additive_terms_upper
      daily
      ...
      weekly
      weekly_lower
      weekly_upper
      yearly
      yearly_lower
      yearly_upper
      multiplicative_terms
      multiplicative_terms_lower
      multiplicative_terms_upper
      yhat
    
  
  
    
      0
      2012-08-25
      -410.029546
      -2275.674516
      158.687953
      -410.029546
      -410.029546
      -692.834321
      -692.834321
      -692.834321
      -266.593483
      ...
      -994.397544
      -994.397544
      -994.397544
      568.156706
      568.156706
      568.156706
      0.0
      0.0
      0.0
      -1102.863868
    
    
      1
      2012-08-26
      -401.962915
      -2490.313263
      -57.849586
      -401.962915
      -401.962915
      -851.021967
      -851.021967
      -851.021967
      -266.593483
      ...
      -1143.331390
      -1143.331390
      -1143.331390
      558.902906
      558.902906
      558.902906
      0.0
      0.0
      0.0
      -1252.984882
    
    
      2
      2012-08-27
      -393.896284
      -914.491577
      1574.258615
      -393.896284
      -393.896284
      737.414257
      737.414257
      737.414257
      -266.593483
      ...
      457.205322
      457.205322
      457.205322
      546.802418
      546.802418
      546.802418
      0.0
      0.0
      0.0
      343.517973
    
    
      3
      2012-08-28
      -385.829652
      -677.520338
      1706.761873
      -385.829652
      -385.829652
      873.955761
      873.955761
      873.955761
      -266.593483
      ...
      607.880644
      607.880644
      607.880644
      532.668600
      532.668600
      532.668600
      0.0
      0.0
      0.0
      488.126108
    
    
      4
      2012-08-29
      -377.763021
      -808.976597
      1685.047381
      -377.763021
      -377.763021
      807.214035
      807.214035
      807.214035
      -266.593483
      ...
      556.474077
      556.474077
      556.474077
      517.333442
      517.333442
      517.333442
      0.0
      0.0
      0.0
      429.451014
    
  

5 rows × 22 columns



In [13]:

    
# see forcast components - trend, yearly seasonality, and weekly seasonality of the time series
fig1 = m.plot_components(forecast)



In [118]:

    
# See forecasts
firg2 = m.plot(forecast)



In [14]:

    
# Use the above fraction to convert daily forecast to hourly forecast
forecast.head()









    Out[14]:







  
    
      
      ds
      trend
      yhat_lower
      yhat_upper
      trend_lower
      trend_upper
      additive_terms
      additive_terms_lower
      additive_terms_upper
      daily
      ...
      weekly
      weekly_lower
      weekly_upper
      yearly
      yearly_lower
      yearly_upper
      multiplicative_terms
      multiplicative_terms_lower
      multiplicative_terms_upper
      yhat
    
  
  
    
      0
      2012-08-25
      -410.029546
      -2275.674516
      158.687953
      -410.029546
      -410.029546
      -692.834321
      -692.834321
      -692.834321
      -266.593483
      ...
      -994.397544
      -994.397544
      -994.397544
      568.156706
      568.156706
      568.156706
      0.0
      0.0
      0.0
      -1102.863868
    
    
      1
      2012-08-26
      -401.962915
      -2490.313263
      -57.849586
      -401.962915
      -401.962915
      -851.021967
      -851.021967
      -851.021967
      -266.593483
      ...
      -1143.331390
      -1143.331390
      -1143.331390
      558.902906
      558.902906
      558.902906
      0.0
      0.0
      0.0
      -1252.984882
    
    
      2
      2012-08-27
      -393.896284
      -914.491577
      1574.258615
      -393.896284
      -393.896284
      737.414257
      737.414257
      737.414257
      -266.593483
      ...
      457.205322
      457.205322
      457.205322
      546.802418
      546.802418
      546.802418
      0.0
      0.0
      0.0
      343.517973
    
    
      3
      2012-08-28
      -385.829652
      -677.520338
      1706.761873
      -385.829652
      -385.829652
      873.955761
      873.955761
      873.955761
      -266.593483
      ...
      607.880644
      607.880644
      607.880644
      532.668600
      532.668600
      532.668600
      0.0
      0.0
      0.0
      488.126108
    
    
      4
      2012-08-29
      -377.763021
      -808.976597
      1685.047381
      -377.763021
      -377.763021
      807.214035
      807.214035
      807.214035
      -266.593483
      ...
      556.474077
      556.474077
      556.474077
      517.333442
      517.333442
      517.333442
      0.0
      0.0
      0.0
      429.451014
    
  

5 rows × 22 columns



In [15]:

    
test = test.rename(index=str, columns={'Datetime': 'ds'})
test.head()









    Out[15]:







  
    
      
      ID
      ds
    
  
  
    
      0
      18288
      2014-09-26 00:00:00
    
    
      1
      18289
      2014-09-26 01:00:00
    
    
      2
      18290
      2014-09-26 02:00:00
    
    
      3
      18291
      2014-09-26 03:00:00
    
    
      4
      18292
      2014-09-26 04:00:00



In [16]:

    
for df in [test, forecast]:
    df['hour'] = df.ds.dt.hour
    df['day'] = df.ds.dt.day
    df['month'] = df.ds.dt.month
    df['year'] = df.ds.dt.year
    
test_forecast = pd.merge(test,forecast, on=['day','month','year'], how='left')
test_forecast.head()









    Out[16]:







  
    
      
      ID
      ds_x
      hour_x
      day
      month
      year
      ds_y
      trend
      yhat_lower
      yhat_upper
      ...
      weekly_lower
      weekly_upper
      yearly
      yearly_lower
      yearly_upper
      multiplicative_terms
      multiplicative_terms_lower
      multiplicative_terms_upper
      yhat
      hour_y
    
  
  
    
      0
      18288
      2014-09-26 00:00:00
      0
      26
      9
      2014
      2014-09-26
      10794.886613
      10368.431501
      12710.470877
      ...
      156.033434
      156.033434
      829.138519
      829.138519
      829.138519
      0.0
      0.0
      0.0
      11513.465083
      0
    
    
      1
      18289
      2014-09-26 01:00:00
      1
      26
      9
      2014
      2014-09-26
      10794.886613
      10368.431501
      12710.470877
      ...
      156.033434
      156.033434
      829.138519
      829.138519
      829.138519
      0.0
      0.0
      0.0
      11513.465083
      0
    
    
      2
      18290
      2014-09-26 02:00:00
      2
      26
      9
      2014
      2014-09-26
      10794.886613
      10368.431501
      12710.470877
      ...
      156.033434
      156.033434
      829.138519
      829.138519
      829.138519
      0.0
      0.0
      0.0
      11513.465083
      0
    
    
      3
      18291
      2014-09-26 03:00:00
      3
      26
      9
      2014
      2014-09-26
      10794.886613
      10368.431501
      12710.470877
      ...
      156.033434
      156.033434
      829.138519
      829.138519
      829.138519
      0.0
      0.0
      0.0
      11513.465083
      0
    
    
      4
      18292
      2014-09-26 04:00:00
      4
      26
      9
      2014
      2014-09-26
      10794.886613
      10368.431501
      12710.470877
      ...
      156.033434
      156.033434
      829.138519
      829.138519
      829.138519
      0.0
      0.0
      0.0
      11513.465083
      0
    
  

5 rows × 29 columns



In [18]:

    
test_forecast.head()









    Out[18]:







  
    
      
      ID
      ds_x
      hour_x
      day
      month
      year
      ds_y
      trend
      yhat_lower
      yhat_upper
      ...
      weekly_lower
      weekly_upper
      yearly
      yearly_lower
      yearly_upper
      multiplicative_terms
      multiplicative_terms_lower
      multiplicative_terms_upper
      yhat
      hour_y
    
  
  
    
      0
      18288
      2014-09-26 00:00:00
      0
      26
      9
      2014
      2014-09-26
      10794.886613
      10368.431501
      12710.470877
      ...
      156.033434
      156.033434
      829.138519
      829.138519
      829.138519
      0.0
      0.0
      0.0
      11513.465083
      0
    
    
      1
      18289
      2014-09-26 01:00:00
      1
      26
      9
      2014
      2014-09-26
      10794.886613
      10368.431501
      12710.470877
      ...
      156.033434
      156.033434
      829.138519
      829.138519
      829.138519
      0.0
      0.0
      0.0
      11513.465083
      0
    
    
      2
      18290
      2014-09-26 02:00:00
      2
      26
      9
      2014
      2014-09-26
      10794.886613
      10368.431501
      12710.470877
      ...
      156.033434
      156.033434
      829.138519
      829.138519
      829.138519
      0.0
      0.0
      0.0
      11513.465083
      0
    
    
      3
      18291
      2014-09-26 03:00:00
      3
      26
      9
      2014
      2014-09-26
      10794.886613
      10368.431501
      12710.470877
      ...
      156.033434
      156.033434
      829.138519
      829.138519
      829.138519
      0.0
      0.0
      0.0
      11513.465083
      0
    
    
      4
      18292
      2014-09-26 04:00:00
      4
      26
      9
      2014
      2014-09-26
      10794.886613
      10368.431501
      12710.470877
      ...
      156.033434
      156.033434
      829.138519
      829.138519
      829.138519
      0.0
      0.0
      0.0
      11513.465083
      0
    
  

5 rows × 29 columns



In [20]:

    
test_forecast = pd.merge(test_forecast, hourly_frac, left_on='hour_x', right_index=True, how='left')
test_forecast.head()









    Out[20]:







  
    
      
      ID
      ds_x
      hour_x
      day
      month
      year
      ds_y
      trend
      yhat_lower
      yhat_upper
      ...
      weekly_upper
      yearly
      yearly_lower
      yearly_upper
      multiplicative_terms
      multiplicative_terms_lower
      multiplicative_terms_upper
      yhat
      hour_y
      fraction
    
  
  
    
      0
      18288
      2014-09-26 00:00:00
      0
      26
      9
      2014
      2014-09-26
      10794.886613
      10368.431501
      12710.470877
      ...
      156.033434
      829.138519
      829.138519
      829.138519
      0.0
      0.0
      0.0
      11513.465083
      0
      0.044287
    
    
      1
      18289
      2014-09-26 01:00:00
      1
      26
      9
      2014
      2014-09-26
      10794.886613
      10368.431501
      12710.470877
      ...
      156.033434
      829.138519
      829.138519
      829.138519
      0.0
      0.0
      0.0
      11513.465083
      0
      0.035343
    
    
      2
      18290
      2014-09-26 02:00:00
      2
      26
      9
      2014
      2014-09-26
      10794.886613
      10368.431501
      12710.470877
      ...
      156.033434
      829.138519
      829.138519
      829.138519
      0.0
      0.0
      0.0
      11513.465083
      0
      0.029911
    
    
      3
      18291
      2014-09-26 03:00:00
      3
      26
      9
      2014
      2014-09-26
      10794.886613
      10368.431501
      12710.470877
      ...
      156.033434
      829.138519
      829.138519
      829.138519
      0.0
      0.0
      0.0
      11513.465083
      0
      0.024714
    
    
      4
      18292
      2014-09-26 04:00:00
      4
      26
      9
      2014
      2014-09-26
      10794.886613
      10368.431501
      12710.470877
      ...
      156.033434
      829.138519
      829.138519
      829.138519
      0.0
      0.0
      0.0
      11513.465083
      0
      0.020802
    
  

5 rows × 30 columns



In [21]:

    
test_forecast['prediction'] = test_forecast['yhat']*test_forecast['fraction']



In [22]:

    
test_forecast.columns









    Out[22]:





Index(['ID', 'ds_x', 'hour_x', 'day', 'month', 'year', 'ds_y', 'trend',
       'yhat_lower', 'yhat_upper', 'trend_lower', 'trend_upper',
       'additive_terms', 'additive_terms_lower', 'additive_terms_upper',
       'daily', 'daily_lower', 'daily_upper', 'weekly', 'weekly_lower',
       'weekly_upper', 'yearly', 'yearly_lower', 'yearly_upper',
       'multiplicative_terms', 'multiplicative_terms_lower',
       'multiplicative_terms_upper', 'yhat', 'hour_y', 'fraction',
       'prediction'],
      dtype='object')



In [23]:

    
cols = ['ID','hour_x','prediction']
test_forecast= test_forecast[cols]
test_forecast.head()



In [24]:

    
test_forecast.isnull().sum()/test_forecast.shape[0]









    Out[24]:





ID            0.0
hour_x        0.0
prediction    0.0
dtype: float64

	ID	Datetime	Count
0	0	25-08-2012 00:00	8
1	1	25-08-2012 01:00	2
2	2	25-08-2012 02:00	6
3	3	25-08-2012 03:00	2
4	4	25-08-2012 04:00	2

	ID	Datetime
0	18288	26-09-2014 00:00
1	18289	26-09-2014 01:00
2	18290	26-09-2014 02:00
3	18291	26-09-2014 03:00
4	18292	26-09-2014 04:00

	ID	Datetime	Count
0	0	2012-08-25 00:00:00	8
1	1	2012-08-25 01:00:00	2
2	2	2012-08-25 02:00:00	6
3	3	2012-08-25 03:00:00	2
4	4	2012-08-25 04:00:00	2

	ID	Datetime	Count	hour	YearMonth
0	0	2012-08-25 00:00:00	8	0	2012-8
1	1	2012-08-25 01:00:00	2	1	2012-8
2	2	2012-08-25 02:00:00	6	2	2012-8
3	3	2012-08-25 03:00:00	2	3	2012-8
4	4	2012-08-25 04:00:00	2	4	2012-8

	Count
Datetime
2012-08-25 00:00:00	8
2012-08-25 01:00:00	2
2012-08-25 02:00:00	6
2012-08-25 03:00:00	2
2012-08-25 04:00:00	2

	Count
Datetime
2012-08-25	76
2012-08-26	88
2012-08-27	62
2012-08-28	58
2012-08-29	60

	ds	trend	yhat_lower	yhat_upper	trend_lower	trend_upper	additive_terms	additive_terms_lower	additive_terms_upper	daily	...	weekly	weekly_lower	weekly_upper	yearly	yearly_lower	yearly_upper	yhat
0	2012-08-25	-410.029546	-2275.674516	158.687953	-410.029546	-410.029546	-692.834321	-692.834321	-692.834321	-266.593483	...	-994.397544	-994.397544	-994.397544	568.156706	568.156706	568.156706	-1102.863868
1	2012-08-26	-401.962915	-2490.313263	-57.849586	-401.962915	-401.962915	-851.021967	-851.021967	-851.021967	-266.593483	...	-1143.331390	-1143.331390	-1143.331390	558.902906	558.902906	558.902906	-1252.984882
2	2012-08-27	-393.896284	-914.491577	1574.258615	-393.896284	-393.896284	737.414257	737.414257	737.414257	-266.593483	...	457.205322	457.205322	457.205322	546.802418	546.802418	546.802418	343.517973
3	2012-08-28	-385.829652	-677.520338	1706.761873	-385.829652	-385.829652	873.955761	873.955761	873.955761	-266.593483	...	607.880644	607.880644	607.880644	532.668600	532.668600	532.668600	488.126108
4	2012-08-29	-377.763021	-808.976597	1685.047381	-377.763021	-377.763021	807.214035	807.214035	807.214035	-266.593483	...	556.474077	556.474077	556.474077	517.333442	517.333442	517.333442	429.451014

	ID	ds
0	18288	2014-09-26 00:00:00
1	18289	2014-09-26 01:00:00
2	18290	2014-09-26 02:00:00
3	18291	2014-09-26 03:00:00
4	18292	2014-09-26 04:00:00

	ID	ds_x	hour_x	day	month	year	ds_y	trend	yhat_lower	yhat_upper	...	weekly_lower	weekly_upper	yearly	yearly_lower	yearly_upper	yhat
0	18288	2014-09-26 00:00:00	0	26	9	2014	2014-09-26	10794.886613	10368.431501	12710.470877	...	156.033434	156.033434	829.138519	829.138519	829.138519	11513.465083
1	18289	2014-09-26 01:00:00	1	26	9	2014	2014-09-26	10794.886613	10368.431501	12710.470877	...	156.033434	156.033434	829.138519	829.138519	829.138519	11513.465083
2	18290	2014-09-26 02:00:00	2	26	9	2014	2014-09-26	10794.886613	10368.431501	12710.470877	...	156.033434	156.033434	829.138519	829.138519	829.138519	11513.465083
3	18291	2014-09-26 03:00:00	3	26	9	2014	2014-09-26	10794.886613	10368.431501	12710.470877	...	156.033434	156.033434	829.138519	829.138519	829.138519	11513.465083
4	18292	2014-09-26 04:00:00	4	26	9	2014	2014-09-26	10794.886613	10368.431501	12710.470877	...	156.033434	156.033434	829.138519	829.138519	829.138519	11513.465083

	ID	hour_x	prediction
0	18288	0	509.892083
1	18289	1	406.920558
2	18290	2	344.380127
3	18291	3	284.548996
4	18292	4	239.505750