In [1]:
import pandas as pd
import numpy as np
from fbprophet import Prophet
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

In [2]:
train = pd.read_csv('ts_PassengerTraffic_Train.csv')
test = pd.read_csv('ts_PassengerTraffic_Test.csv')
print(train.shape)
train.head()


(18288, 3)
Out[2]:
ID Datetime Count
0 0 25-08-2012 00:00 8
1 1 25-08-2012 01:00 2
2 2 25-08-2012 02:00 6
3 3 25-08-2012 03:00 2
4 4 25-08-2012 04:00 2

In [3]:
test.head()


Out[3]:
ID Datetime
0 18288 26-09-2014 00:00
1 18289 26-09-2014 01:00
2 18290 26-09-2014 02:00
3 18291 26-09-2014 03:00
4 18292 26-09-2014 04:00

In [4]:
# convert object to datetime format
train['Datetime'] = pd.to_datetime(train['Datetime'], format = '%d-%m-%Y %H:%M')
test['Datetime'] = pd.to_datetime(test['Datetime'], format = '%d-%m-%Y %H:%M')
train.head()


Out[4]:
ID Datetime Count
0 0 2012-08-25 00:00:00 8
1 1 2012-08-25 01:00:00 2
2 2 2012-08-25 02:00:00 6
3 3 2012-08-25 03:00:00 2
4 4 2012-08-25 04:00:00 2

In [5]:
train['hour'] = train['Datetime'].dt.hour
train['YearMonth'] = train['Datetime'].dt.year.astype(str).str.cat(train['Datetime'].dt.month.astype(str), sep='-')
train.head()


Out[5]:
ID Datetime Count hour YearMonth
0 0 2012-08-25 00:00:00 8 0 2012-8
1 1 2012-08-25 01:00:00 2 1 2012-8
2 2 2012-08-25 02:00:00 6 2 2012-8
3 3 2012-08-25 03:00:00 2 3 2012-8
4 4 2012-08-25 04:00:00 2 4 2012-8

In [40]:
ax = train[['Count', 'YearMonth']].plot(kind='bar', title ="Passenger Count per Month", 
                                        figsize=(15, 10), legend=True, fontsize=12)
ax.set_xlabel("Year-Month", fontsize=12)
ax.set_ylabel("Passenger Count", fontsize=12) 
ax.xaxis.set_major_locator(mdates.YearLocator(1))   # set xticks
ax.xaxis.set_major_formatter(mdates.DateFormatter('%y')) 
plt.setp(ax.xaxis.get_majorticklabels())
plt.show()



In [81]:
import plotly.offline as plty
import plotly.graph_objs as go
plty.init_notebook_mode()

# daily trend - it's growing with the population capacity
daily_count = train.groupby(train['Datetime'].dt.date)['Count'].sum()
plty.iplot([go.Scatter(
    x=daily_count.index,
    y=daily_count
)])



In [82]:
import plotly.offline as plty
import plotly.graph_objs as go
plty.init_notebook_mode()

# monthly trend
monthly_count = train.groupby('YearMonth')['Count'].sum()
print(monthly_count.head())
plty.iplot([go.Scatter(
    x=monthly_count.index,
    y=monthly_count
)])


YearMonth
2012-10     8174
2012-11    11396
2012-12    11666
2012-8       496
2012-9      3200
Name: Count, dtype: int64

In [6]:
# The data has so much noise, we can aggregate the data
## hourly average fraction
hourly_frac = train.groupby(['hour']).mean()/np.sum(train.groupby(['hour']).mean())
hourly_frac.drop(['ID'], axis = 1, inplace = True)
hourly_frac.columns = ['fraction']
hourly_frac.head()


Out[6]:
fraction
hour
0 0.044287
1 0.035343
2 0.029911
3 0.024714
4 0.020802

In [7]:
train.index = train.Datetime
train.drop(['ID','hour','Datetime', 'YearMonth'], axis = 1, inplace = True)
train.head()


Out[7]:
Count
Datetime
2012-08-25 00:00:00 8
2012-08-25 01:00:00 2
2012-08-25 02:00:00 6
2012-08-25 03:00:00 2
2012-08-25 04:00:00 2

In [8]:
daily_train = train.resample('D').sum()  # sum up daily counts
daily_train.head()


Out[8]:
Count
Datetime
2012-08-25 76
2012-08-26 88
2012-08-27 62
2012-08-28 58
2012-08-29 60

In [9]:
# Prophet needs to name the datetime as 'ds', and name the label as 'y'
daily_train['ds'] = daily_train.index
daily_train['y'] = daily_train.Count
daily_train.drop('Count', inplace=True, axis=1)
daily_train.head()


Out[9]:
ds y
Datetime
2012-08-25 2012-08-25 76
2012-08-26 2012-08-26 88
2012-08-27 2012-08-27 62
2012-08-28 2012-08-28 58
2012-08-29 2012-08-29 60

In [10]:
# fit the prophet model
import warnings
warnings.filterwarnings("ignore")

m = Prophet(daily_seasonality = True, seasonality_prior_scale=0.1)  # seasonality_prior_scale decides the influence of seasonality
m.fit(daily_train)


Out[10]:
<fbprophet.forecaster.Prophet at 0x1b3b71fa518>

In [11]:
periods = int(test.shape[0]/24)
print(periods)
future = m.make_future_dataframe(periods=periods)
forecast = m.predict(future)


213

In [12]:
forecast.head()


Out[12]:
ds trend yhat_lower yhat_upper trend_lower trend_upper additive_terms additive_terms_lower additive_terms_upper daily ... weekly weekly_lower weekly_upper yearly yearly_lower yearly_upper multiplicative_terms multiplicative_terms_lower multiplicative_terms_upper yhat
0 2012-08-25 -410.029546 -2275.674516 158.687953 -410.029546 -410.029546 -692.834321 -692.834321 -692.834321 -266.593483 ... -994.397544 -994.397544 -994.397544 568.156706 568.156706 568.156706 0.0 0.0 0.0 -1102.863868
1 2012-08-26 -401.962915 -2490.313263 -57.849586 -401.962915 -401.962915 -851.021967 -851.021967 -851.021967 -266.593483 ... -1143.331390 -1143.331390 -1143.331390 558.902906 558.902906 558.902906 0.0 0.0 0.0 -1252.984882
2 2012-08-27 -393.896284 -914.491577 1574.258615 -393.896284 -393.896284 737.414257 737.414257 737.414257 -266.593483 ... 457.205322 457.205322 457.205322 546.802418 546.802418 546.802418 0.0 0.0 0.0 343.517973
3 2012-08-28 -385.829652 -677.520338 1706.761873 -385.829652 -385.829652 873.955761 873.955761 873.955761 -266.593483 ... 607.880644 607.880644 607.880644 532.668600 532.668600 532.668600 0.0 0.0 0.0 488.126108
4 2012-08-29 -377.763021 -808.976597 1685.047381 -377.763021 -377.763021 807.214035 807.214035 807.214035 -266.593483 ... 556.474077 556.474077 556.474077 517.333442 517.333442 517.333442 0.0 0.0 0.0 429.451014

5 rows × 22 columns


In [13]:
# see forcast components - trend, yearly seasonality, and weekly seasonality of the time series
fig1 = m.plot_components(forecast)



In [118]:
# See forecasts
firg2 = m.plot(forecast)



In [14]:
# Use the above fraction to convert daily forecast to hourly forecast
forecast.head()


Out[14]:
ds trend yhat_lower yhat_upper trend_lower trend_upper additive_terms additive_terms_lower additive_terms_upper daily ... weekly weekly_lower weekly_upper yearly yearly_lower yearly_upper multiplicative_terms multiplicative_terms_lower multiplicative_terms_upper yhat
0 2012-08-25 -410.029546 -2275.674516 158.687953 -410.029546 -410.029546 -692.834321 -692.834321 -692.834321 -266.593483 ... -994.397544 -994.397544 -994.397544 568.156706 568.156706 568.156706 0.0 0.0 0.0 -1102.863868
1 2012-08-26 -401.962915 -2490.313263 -57.849586 -401.962915 -401.962915 -851.021967 -851.021967 -851.021967 -266.593483 ... -1143.331390 -1143.331390 -1143.331390 558.902906 558.902906 558.902906 0.0 0.0 0.0 -1252.984882
2 2012-08-27 -393.896284 -914.491577 1574.258615 -393.896284 -393.896284 737.414257 737.414257 737.414257 -266.593483 ... 457.205322 457.205322 457.205322 546.802418 546.802418 546.802418 0.0 0.0 0.0 343.517973
3 2012-08-28 -385.829652 -677.520338 1706.761873 -385.829652 -385.829652 873.955761 873.955761 873.955761 -266.593483 ... 607.880644 607.880644 607.880644 532.668600 532.668600 532.668600 0.0 0.0 0.0 488.126108
4 2012-08-29 -377.763021 -808.976597 1685.047381 -377.763021 -377.763021 807.214035 807.214035 807.214035 -266.593483 ... 556.474077 556.474077 556.474077 517.333442 517.333442 517.333442 0.0 0.0 0.0 429.451014

5 rows × 22 columns


In [15]:
test = test.rename(index=str, columns={'Datetime': 'ds'})
test.head()


Out[15]:
ID ds
0 18288 2014-09-26 00:00:00
1 18289 2014-09-26 01:00:00
2 18290 2014-09-26 02:00:00
3 18291 2014-09-26 03:00:00
4 18292 2014-09-26 04:00:00

In [16]:
for df in [test, forecast]:
    df['hour'] = df.ds.dt.hour
    df['day'] = df.ds.dt.day
    df['month'] = df.ds.dt.month
    df['year'] = df.ds.dt.year
    
test_forecast = pd.merge(test,forecast, on=['day','month','year'], how='left')
test_forecast.head()


Out[16]:
ID ds_x hour_x day month year ds_y trend yhat_lower yhat_upper ... weekly_lower weekly_upper yearly yearly_lower yearly_upper multiplicative_terms multiplicative_terms_lower multiplicative_terms_upper yhat hour_y
0 18288 2014-09-26 00:00:00 0 26 9 2014 2014-09-26 10794.886613 10368.431501 12710.470877 ... 156.033434 156.033434 829.138519 829.138519 829.138519 0.0 0.0 0.0 11513.465083 0
1 18289 2014-09-26 01:00:00 1 26 9 2014 2014-09-26 10794.886613 10368.431501 12710.470877 ... 156.033434 156.033434 829.138519 829.138519 829.138519 0.0 0.0 0.0 11513.465083 0
2 18290 2014-09-26 02:00:00 2 26 9 2014 2014-09-26 10794.886613 10368.431501 12710.470877 ... 156.033434 156.033434 829.138519 829.138519 829.138519 0.0 0.0 0.0 11513.465083 0
3 18291 2014-09-26 03:00:00 3 26 9 2014 2014-09-26 10794.886613 10368.431501 12710.470877 ... 156.033434 156.033434 829.138519 829.138519 829.138519 0.0 0.0 0.0 11513.465083 0
4 18292 2014-09-26 04:00:00 4 26 9 2014 2014-09-26 10794.886613 10368.431501 12710.470877 ... 156.033434 156.033434 829.138519 829.138519 829.138519 0.0 0.0 0.0 11513.465083 0

5 rows × 29 columns


In [18]:
test_forecast.head()


Out[18]:
ID ds_x hour_x day month year ds_y trend yhat_lower yhat_upper ... weekly_lower weekly_upper yearly yearly_lower yearly_upper multiplicative_terms multiplicative_terms_lower multiplicative_terms_upper yhat hour_y
0 18288 2014-09-26 00:00:00 0 26 9 2014 2014-09-26 10794.886613 10368.431501 12710.470877 ... 156.033434 156.033434 829.138519 829.138519 829.138519 0.0 0.0 0.0 11513.465083 0
1 18289 2014-09-26 01:00:00 1 26 9 2014 2014-09-26 10794.886613 10368.431501 12710.470877 ... 156.033434 156.033434 829.138519 829.138519 829.138519 0.0 0.0 0.0 11513.465083 0
2 18290 2014-09-26 02:00:00 2 26 9 2014 2014-09-26 10794.886613 10368.431501 12710.470877 ... 156.033434 156.033434 829.138519 829.138519 829.138519 0.0 0.0 0.0 11513.465083 0
3 18291 2014-09-26 03:00:00 3 26 9 2014 2014-09-26 10794.886613 10368.431501 12710.470877 ... 156.033434 156.033434 829.138519 829.138519 829.138519 0.0 0.0 0.0 11513.465083 0
4 18292 2014-09-26 04:00:00 4 26 9 2014 2014-09-26 10794.886613 10368.431501 12710.470877 ... 156.033434 156.033434 829.138519 829.138519 829.138519 0.0 0.0 0.0 11513.465083 0

5 rows × 29 columns


In [20]:
test_forecast = pd.merge(test_forecast, hourly_frac, left_on='hour_x', right_index=True, how='left')
test_forecast.head()


Out[20]:
ID ds_x hour_x day month year ds_y trend yhat_lower yhat_upper ... weekly_upper yearly yearly_lower yearly_upper multiplicative_terms multiplicative_terms_lower multiplicative_terms_upper yhat hour_y fraction
0 18288 2014-09-26 00:00:00 0 26 9 2014 2014-09-26 10794.886613 10368.431501 12710.470877 ... 156.033434 829.138519 829.138519 829.138519 0.0 0.0 0.0 11513.465083 0 0.044287
1 18289 2014-09-26 01:00:00 1 26 9 2014 2014-09-26 10794.886613 10368.431501 12710.470877 ... 156.033434 829.138519 829.138519 829.138519 0.0 0.0 0.0 11513.465083 0 0.035343
2 18290 2014-09-26 02:00:00 2 26 9 2014 2014-09-26 10794.886613 10368.431501 12710.470877 ... 156.033434 829.138519 829.138519 829.138519 0.0 0.0 0.0 11513.465083 0 0.029911
3 18291 2014-09-26 03:00:00 3 26 9 2014 2014-09-26 10794.886613 10368.431501 12710.470877 ... 156.033434 829.138519 829.138519 829.138519 0.0 0.0 0.0 11513.465083 0 0.024714
4 18292 2014-09-26 04:00:00 4 26 9 2014 2014-09-26 10794.886613 10368.431501 12710.470877 ... 156.033434 829.138519 829.138519 829.138519 0.0 0.0 0.0 11513.465083 0 0.020802

5 rows × 30 columns


In [21]:
test_forecast['prediction'] = test_forecast['yhat']*test_forecast['fraction']

In [22]:
test_forecast.columns


Out[22]:
Index(['ID', 'ds_x', 'hour_x', 'day', 'month', 'year', 'ds_y', 'trend',
       'yhat_lower', 'yhat_upper', 'trend_lower', 'trend_upper',
       'additive_terms', 'additive_terms_lower', 'additive_terms_upper',
       'daily', 'daily_lower', 'daily_upper', 'weekly', 'weekly_lower',
       'weekly_upper', 'yearly', 'yearly_lower', 'yearly_upper',
       'multiplicative_terms', 'multiplicative_terms_lower',
       'multiplicative_terms_upper', 'yhat', 'hour_y', 'fraction',
       'prediction'],
      dtype='object')

In [23]:
cols = ['ID','hour_x','prediction']
test_forecast= test_forecast[cols]
test_forecast.head()


Out[23]:
ID hour_x prediction
0 18288 0 509.892083
1 18289 1 406.920558
2 18290 2 344.380127
3 18291 3 284.548996
4 18292 4 239.505750

In [24]:
test_forecast.isnull().sum()/test_forecast.shape[0]


Out[24]:
ID            0.0
hour_x        0.0
prediction    0.0
dtype: float64