1. Load and inspect the data: daily global earthquakes

Load the main dataset: Feb. 2, 2013 - Mar. 15, 2016



In [ ]:

    
import graphlab as gl

daily_stats = gl.load_timeseries('working_data/global_daily_stats.ts')

print "Number of rows:", len(daily_stats)
print "Start:", daily_stats.min_time
print "End:", daily_stats.max_time
daily_stats.print_rows(3)

Load the recent data: Mar. 16, 2016 - Mar. 22, 2016

The first point in this dataset is our forecasting goal. Pretend it's March 15, and we don't know the count of earthquakes for March 16th.



In [ ]:

    
daily_update = gl.load_timeseries('working_data/global_daily_update.ts')
daily_update.print_rows()

Visualize the data with GraphLab Canvas



In [ ]:

    
daily_stats.to_sframe().show()

Visualize the data with matplotlib



In [ ]:

    
import matplotlib.pyplot as plt
%matplotlib notebook
plt.style.use('ggplot')

fig, ax = plt.subplots()
ax.plot(daily_stats['time'], daily_stats['count'], color='dodgerblue')
ax.set_xlabel('Date')
ax.set_ylabel('Number of earthquakes')
fig.autofmt_xdate()
fig.show()

2. A naive baseline: the grand mean



In [ ]:

    
baseline_forecast = daily_stats['count'].mean()

print baseline_forecast

3. The autoregressive model

Create lagged features



In [ ]:

    
daily_stats['lag1_count'] = daily_stats.shift(1)['count']
daily_stats['lag2_count'] = daily_stats.shift(2)['count']

daily_stats.print_rows(3)

Train the model



In [ ]:

    
train_counts = daily_stats[2:].to_sframe()

ar_model = gl.linear_regression.create(train_counts, target='count',
                                        features=['lag1_count', 'lag2_count'],
                                        l2_penalty=0., validation_set=None,
                                        verbose=False)

print ar_model



In [ ]:

    
train_counts.tail(5).print_rows()

Get a forecast from the model



In [ ]:

    
## Construct the input dataset first.
sf_forecast = gl.SFrame({'lag1_count': [daily_stats['count'][-1]],
                         'lag2_count': [daily_stats['count'][-2]]})

## Compute the model's forecast
ar_forecast = ar_model.predict(sf_forecast)
print ar_forecast[0]

4. The gradient-boosted trees model

Split the timestamp into parts



In [ ]:

    
date_parts = daily_stats.index.split_datetime(column_name_prefix='date',
                                        limit=['year', 'month', 'day'])

Create lags for observed features

To forecast tomorrow's earthqauke count:

we do know what the date will be, so no need to lag,
we don't know what the max and average magnitude will be, so we need to lag.



In [ ]:

    
daily_stats['lag1_avg_mag'] = daily_stats.shift(1)['avg_mag']
daily_stats['lag1_max_mag'] = daily_stats.shift(1)['max_mag']



In [ ]:

    
sf_train = daily_stats.to_sframe()
sf_train = sf_train.add_columns(date_parts)

sf_train.print_rows(3)

Train the model



In [ ]:

    
feature_list = ['lag1_avg_mag', 'lag1_max_mag', 'lag1_count',
                'date.year', 'date.month', 'date.day']

# Remove the row with no lagged features.
sf_train = sf_train[1:]

gbt_model = gl.boosted_trees_regression.create(sf_train, target='count',
                                               features=feature_list,
                                               max_iterations=20,
                                               validation_set=None,
                                               verbose=False)

print gbt_model

Compute the model's forecast



In [ ]:

    
## Prepend the last couple rows of the training data.
ts_forecast = daily_stats[daily_update.column_names()][-2:].union(daily_update)

## Create the lagged features.
ts_forecast['lag1_avg_mag'] = ts_forecast.shift(1)['avg_mag']
ts_forecast['lag1_max_mag'] = ts_forecast.shift(1)['max_mag']
ts_forecast['lag1_count'] = ts_forecast.shift(1)['count']

## Split the timestamp into date parts.
new_date_parts = ts_forecast.index.split_datetime(column_name_prefix='date',
                                        limit=['year', 'month', 'day'])

## Add the date parts to the dataset.
sf_forecast = ts_forecast.to_sframe().add_columns(new_date_parts)

sf_forecast.print_rows(3)



In [ ]:

    
gbt_forecast = gbt_model.predict(sf_forecast)
gbt_forecast[2]

5. And the winner is...



In [ ]:

    
print "Actual value for March 16:", daily_update['count'][0]
print "\nBaseline forecast:", baseline_forecast
print "AR model forecast:", ar_forecast[0]
print "GBT forecast:", gbt_forecast[2], "\t(*** winner ***)"