1. Load and inspect the data: Oklahoma earthquake stats


In [ ]:
import graphlab as gl

# Daily earthquake statistics for Oklahoma as a GraphLab Create
# TimeSeries -- presumably one row per day; verify with print_rows below.
okla_daily = gl.load_timeseries('working_data/ok_daily_stats.ts')

# Sanity check: row count, the covered time span, and a peek at the schema.
print "Number of rows:", len(okla_daily)
print "Start:", okla_daily.min_time
print "End:", okla_daily.max_time
okla_daily.print_rows(3)

In [ ]:
import matplotlib.pyplot as plt
%matplotlib notebook
plt.style.use('ggplot')

fig, ax = plt.subplots()
ax.plot(okla_daily['time'], okla_daily['count'], color='dodgerblue')
ax.set_ylabel('Number of quakes')
ax.set_xlabel('Date')
fig.autofmt_xdate()
fig.show()

2. Let the toolkit choose the model


In [ ]:
from graphlab.toolkits import anomaly_detection

# Let the toolkit pick an appropriate anomaly detection model for the
# 'count' column automatically.
model = anomaly_detection.create(okla_daily, features=['count'])

# The model's repr reports which detector was chosen and its settings.
print model

3. The simple thresholding model


In [ ]:
# Baseline model: flag any day with at least `threshold` quakes.
threshold = 5
anomaly_mask = okla_daily['count'] >= threshold

# Accumulate each model's per-day scores in one table, starting with the
# raw counts and this simple boolean threshold flag.
anomaly_scores = okla_daily[['count']]
anomaly_scores['threshold_score'] = anomaly_mask

In [ ]:
# Inspect the scores for the most recent days.
anomaly_scores.tail(8).print_rows()

4. The moving Z-score model


In [ ]:
from graphlab.toolkits.anomaly_detection import moving_zscore

# Score each day's count by its z-score relative to a trailing 30-day
# window; min_observations=15 presumably suppresses scores until enough
# history exists -- the head() inspection below should confirm this.
zscore_model = moving_zscore.create(okla_daily, feature='count',
                                    window_size=30,
                                    min_observations=15)

print zscore_model

In [ ]:
# Scores at the end of the series, where a full moving window is available.
zscore_model.scores.tail(3)

In [ ]:
# Scores at the start of the series -- expect missing values here, since
# the moving window has too few observations yet.
zscore_model.scores.head(3)

In [ ]:
# Copy the moving z-score model's anomaly scores into the combined table.
anomaly_scores['outlier_score'] = zscore_model.scores['anomaly_score']
anomaly_scores.tail(5).print_rows()

In [ ]:
# Two stacked panels on a shared date axis: raw counts on top,
# moving z-score outlier scores below.
fig, ax = plt.subplots(2, sharex=True)

panels = [('count', 'dodgerblue', '# quakes'),
          ('outlier_score', 'orchid', 'outlier score')]
for panel_ax, (col, color, label) in zip(ax, panels):
    panel_ax.plot(anomaly_scores['time'], anomaly_scores[col], color=color)
    panel_ax.set_ylabel(label)

ax[1].set_xlabel('Date')
fig.autofmt_xdate()
fig.show()

5. The Bayesian changepoint model


In [ ]:
from graphlab.toolkits.anomaly_detection import bayesian_changepoints

# Bayesian online changepoint detection on the daily counts.
# NOTE(review): expected_runlength looks like the a-priori expected gap
# (in days) between changepoints, and lag=7 delays each score by a week
# to stabilize it -- confirm both against the toolkit documentation.
changept_model = bayesian_changepoints.create(okla_daily, feature='count',
                                              expected_runlength=2000, lag=7)

print changept_model

In [ ]:
# Add the changepoint scores (probabilities) to the combined table.
anomaly_scores['changepoint_score'] = changept_model.scores['changepoint_score']
anomaly_scores.head(5)

In [ ]:
# Three stacked panels on a shared date axis: raw counts, moving z-score
# outlier scores, and Bayesian changepoint scores.
fig, ax = plt.subplots(3, sharex=True)

panels = [('count', 'dodgerblue', '# quakes'),
          ('outlier_score', 'orchid', 'outlier score'),
          ('changepoint_score', 'orchid', 'changepoint score')]
for panel_ax, (col, color, label) in zip(ax, panels):
    panel_ax.plot(anomaly_scores['time'], anomaly_scores[col], color=color)
    panel_ax.set_ylabel(label)

ax[2].set_xlabel('Date')
fig.autofmt_xdate()
fig.show()

6. How to use the anomaly scores

Option 1: choose an anomaly threshold a priori

  • Slightly better than choosing a threshold in the original feature space.
  • For Bayesian changepoint detection, where the scores are probabilities, there is a natural threshold of 0.5.

In [ ]:
# Changepoint scores are probabilities, so 0.5 is a natural cutoff.
threshold = 0.5
anom_mask = anomaly_scores['changepoint_score'] >= threshold

anomalies = anomaly_scores[anom_mask]

print "Number of anomalies:", len(anomalies)
anomalies.head(5)

Option 2: choose the top-k anomalies

If you have a fixed budget for investigating and acting on anomalies, this is a good way to go.


In [ ]:
# Convert to an SFrame to use topk: keep only the 5 days with the
# highest changepoint scores.
anomalies = anomaly_scores.to_sframe().topk('changepoint_score', k=5)

print "Number of anomalies:", len(anomalies)
anomalies.head(5)

Option 3: look at the anomaly distribution and choose a threshold


In [ ]:
# Visualize the score distribution (GraphLab Canvas) to choose a cutoff.
anomaly_scores['changepoint_score'].show()

In [ ]:
# Threshold chosen by eye from the score distribution above.
threshold = 0.072
anom_mask = anomaly_scores['changepoint_score'] >= threshold

anomalies = anomaly_scores[anom_mask]

print "Number of anomalies:", len(anomalies)
anomalies.head(5)

Option 4: get fancy with plotting


In [ ]:
from interactive_plot import LineDrawer

# Three stacked panels (counts, outlier scores, changepoint scores) with
# interactive lines. `guide_lines` collects one vertical line per panel
# and `threshold_lines` one horizontal line per panel; both lists are
# handed to LineDrawer, whose behavior is defined in the project-local
# interactive_plot module -- presumably drag-to-set thresholds.
fig, ax = plt.subplots(3, sharex=True)
guide_lines = []
threshold_lines = []

p = ax[0].plot(anomaly_scores['time'], anomaly_scores['count'],
               color='dodgerblue')
ax[0].set_ylabel('# quakes')

# Vertical guide line, initially parked at the left edge of the data.
line, = ax[0].plot((anomaly_scores.min_time, anomaly_scores.min_time),
                   ax[0].get_ylim(), lw=1, ls='--', color='black')
guide_lines.append(line)

ax[1].plot(anomaly_scores['time'], anomaly_scores['outlier_score'],
           color='orchid')
ax[1].set_ylabel('outlier score')
line, = ax[1].plot((anomaly_scores.min_time, anomaly_scores.min_time),
                   ax[1].get_ylim(), lw=1, ls='--', color='black')
guide_lines.append(line)

ax[2].plot(anomaly_scores['time'], anomaly_scores['changepoint_score'],
           color='orchid')
ax[2].set_ylabel('changepoint score')
ax[2].set_xlabel('Date')
# Changepoint scores are probabilities, so this panel's guide line spans
# a fixed (0, 1) range rather than the axis limits.
line, = ax[2].plot((anomaly_scores.min_time, anomaly_scores.min_time), (0., 1.),
                   lw=1, ls='--', color='black')
guide_lines.append(line)

# One horizontal threshold line per panel, initially flat at y=0,
# spanning the full time range of the data.
for a in ax:
    line, = a.plot(anomaly_scores.range, (0., 0.), lw=1, ls='--',
                   color='black')
    threshold_lines.append(line)

# Hand the score columns and the line artists to the interactive helper.
plot_scores = anomaly_scores[['count', 'outlier_score', 'changepoint_score']]
interactive_thresholder = LineDrawer(plot_scores, guide_lines, threshold_lines)
interactive_thresholder.connect()
fig.autofmt_xdate()
fig.show()

In [ ]:
# Rows flagged anomalous under the interactively chosen thresholds.
interactive_thresholder.anoms.print_rows(10)

7. Updating the model with new data


In [ ]:
# Newly arrived daily stats, to be scored with the existing model.
okla_new = gl.load_timeseries('working_data/ok_daily_update.ts')
okla_new.print_rows(20)

Why do we want to update the model, rather than training a new one?

  1. Because the model's parameters already incorporate everything learned from the data seen so far; retraining from scratch would discard that information.
  2. Updating spares us the drudgery of manually prepending the old data to the new batch in order to obtain final scores for the lagged observations at the end of the previous data set.

In [ ]:
# NOTE(review): update() appears to return a new model that has absorbed
# the new observations rather than mutating changept_model in place --
# confirm against the toolkit documentation.
changept_model2 = changept_model.update(okla_new)

print changept_model2

In [ ]:
# Scores for the new data, including final scores for the lagged tail of
# the previous data set.
changept_model2.scores.print_rows(20)