In [ ]:
import graphlab as gl
# Load the pre-aggregated global daily earthquake statistics as a
# GraphLab TimeSeries (one row per day; columns include 'count',
# 'avg_mag', 'max_mag' -- see print_rows output below).
daily_stats = gl.load_timeseries('working_data/global_daily_stats.ts')
# Quick sanity check: size and time coverage of the series.
print "Number of rows:", len(daily_stats)
print "Start:", daily_stats.min_time
print "End:", daily_stats.max_time
daily_stats.print_rows(3)
The first point in this dataset is our forecasting goal. Pretend it's March 15, and we don't know the count of earthquakes for March 16th.
In [ ]:
# Held-out "future" data (starting March 16) used later to score the
# forecasts -- it is NOT used for training.
daily_update = gl.load_timeseries('working_data/global_daily_update.ts')
daily_update.print_rows()
In [ ]:
daily_stats.to_sframe().show()
In [ ]:
import matplotlib.pyplot as plt
# Render inline in the notebook with the interactive backend.
%matplotlib notebook
plt.style.use('ggplot')
fig, ax = plt.subplots()
# Plot the daily earthquake count over the full history.
ax.plot(daily_stats['time'], daily_stats['count'], color='dodgerblue')
ax.set_xlabel('Date')
ax.set_ylabel('Number of earthquakes')
# Slant the date tick labels so they don't overlap.
fig.autofmt_xdate()
fig.show()
In [ ]:
# Baseline forecast: the historical mean daily count. Any model worth
# keeping should beat this.
baseline_forecast = daily_stats['count'].mean()
print baseline_forecast
In [ ]:
# Autoregressive features: yesterday's and the day-before's counts.
# TimeSeries.shift(k) moves values forward k rows, so the first k rows
# of each lag column are missing.
daily_stats['lag1_count'] = daily_stats.shift(1)['count']
daily_stats['lag2_count'] = daily_stats.shift(2)['count']
daily_stats.print_rows(3)
In [ ]:
# Drop the first two rows (missing lag values), then fit an AR(2)-style
# model: a plain linear regression of today's count on the two lags.
train_counts = daily_stats[2:].to_sframe()
# l2_penalty=0 means ordinary least squares (no regularization).
ar_model = gl.linear_regression.create(train_counts, target='count',
features=['lag1_count', 'lag2_count'],
l2_penalty=0., validation_set=None,
verbose=False)
print ar_model
In [ ]:
train_counts.tail(5).print_rows()
In [ ]:
## Construct the input dataset first: for tomorrow's forecast, lag1 is
## the most recent observed count and lag2 is the one before it.
sf_forecast = gl.SFrame({'lag1_count': [daily_stats['count'][-1]],
'lag2_count': [daily_stats['count'][-2]]})
## Compute the model's forecast (single-row prediction).
ar_forecast = ar_model.predict(sf_forecast)
print ar_forecast[0]
In [ ]:
# Break the timestamp index into calendar components so the tree model
# can pick up seasonal/trend effects. Yields columns 'date.year',
# 'date.month', 'date.day'.
date_parts = daily_stats.index.split_datetime(column_name_prefix='date',
limit=['year', 'month', 'day'])
To forecast tomorrow's earthquake count:
In [ ]:
# Additional lagged predictors: yesterday's average and maximum magnitude.
daily_stats['lag1_avg_mag'] = daily_stats.shift(1)['avg_mag']
daily_stats['lag1_max_mag'] = daily_stats.shift(1)['max_mag']
In [ ]:
# Flatten the TimeSeries to an SFrame and append the calendar columns
# computed above.
sf_train = daily_stats.to_sframe()
sf_train = sf_train.add_columns(date_parts)
sf_train.print_rows(3)
In [ ]:
# Features for the gradient-boosted trees model: lagged magnitudes and
# count, plus the calendar parts.
feature_list = ['lag1_avg_mag', 'lag1_max_mag', 'lag1_count',
'date.year', 'date.month', 'date.day']
# Remove the row with no lagged features.
sf_train = sf_train[1:]
gbt_model = gl.boosted_trees_regression.create(sf_train, target='count',
features=feature_list,
max_iterations=20,
validation_set=None,
verbose=False)
print gbt_model
In [ ]:
## Prepend the last couple rows of the training data, so shift(1) has
## real history to draw from for the first update row.
ts_forecast = daily_stats[daily_update.column_names()][-2:].union(daily_update)
## Create the lagged features (first row will be missing, as in training).
ts_forecast['lag1_avg_mag'] = ts_forecast.shift(1)['avg_mag']
ts_forecast['lag1_max_mag'] = ts_forecast.shift(1)['max_mag']
ts_forecast['lag1_count'] = ts_forecast.shift(1)['count']
## Split the timestamp into date parts (same prefix/columns as training).
new_date_parts = ts_forecast.index.split_datetime(column_name_prefix='date',
limit=['year', 'month', 'day'])
## Add the date parts to the dataset.
sf_forecast = ts_forecast.to_sframe().add_columns(new_date_parts)
sf_forecast.print_rows(3)
In [ ]:
# Predict for every row; rows 0-1 are the prepended history, so index 2
# is the forecast for the first update day (March 16).
gbt_forecast = gbt_model.predict(sf_forecast)
gbt_forecast[2]
In [ ]:
print "Actual value for March 16:", daily_update['count'][0]
print "\nBaseline forecast:", baseline_forecast
print "AR model forecast:", ar_forecast[0]
print "GBT forecast:", gbt_forecast[2], "\t(*** winner ***)"