In [ ]:
import graphlab as gl
okla_daily = gl.load_timeseries('working_data/ok_daily_stats.ts')
print "Number of rows:", len(okla_daily)
print "Start:", okla_daily.min_time
print "End:", okla_daily.max_time
okla_daily.print_rows(3)
In [ ]:
import matplotlib.pyplot as plt
%matplotlib notebook
plt.style.use('ggplot')
fig, ax = plt.subplots()
ax.plot(okla_daily['time'], okla_daily['count'], color='dodgerblue')
ax.set_ylabel('Number of quakes')
ax.set_xlabel('Date')
fig.autofmt_xdate()
fig.show()
In [ ]:
from graphlab.toolkits import anomaly_detection
model = anomaly_detection.create(okla_daily, features=['count'])
print model
In [ ]:
threshold = 5
anomaly_mask = okla_daily['count'] >= threshold
anomaly_scores = okla_daily[['count']]
anomaly_scores['threshold_score'] = anomaly_mask
In [ ]:
anomaly_scores.tail(8).print_rows()
In [ ]:
from graphlab.toolkits.anomaly_detection import moving_zscore
zscore_model = moving_zscore.create(okla_daily, feature='count',
window_size=30,
min_observations=15)
print zscore_model
In [ ]:
zscore_model.scores.tail(3)
In [ ]:
zscore_model.scores.head(3)
In [ ]:
anomaly_scores['outlier_score'] = zscore_model.scores['anomaly_score']
anomaly_scores.tail(5).print_rows()
In [ ]:
fig, ax = plt.subplots(2, sharex=True)
ax[0].plot(anomaly_scores['time'], anomaly_scores['count'], color='dodgerblue')
ax[0].set_ylabel('# quakes')
ax[1].plot(anomaly_scores['time'], anomaly_scores['outlier_score'], color='orchid')
ax[1].set_ylabel('outlier score')
ax[1].set_xlabel('Date')
fig.autofmt_xdate()
fig.show()
In [ ]:
from graphlab.toolkits.anomaly_detection import bayesian_changepoints
changept_model = bayesian_changepoints.create(okla_daily, feature='count',
expected_runlength=2000, lag=7)
print changept_model
In [ ]:
anomaly_scores['changepoint_score'] = changept_model.scores['changepoint_score']
anomaly_scores.head(5)
In [ ]:
fig, ax = plt.subplots(3, sharex=True)
ax[0].plot(anomaly_scores['time'], anomaly_scores['count'], color='dodgerblue')
ax[0].set_ylabel('# quakes')
ax[1].plot(anomaly_scores['time'], anomaly_scores['outlier_score'], color='orchid')
ax[1].set_ylabel('outlier score')
ax[2].plot(anomaly_scores['time'], anomaly_scores['changepoint_score'], color='orchid')
ax[2].set_ylabel('changepoint score')
ax[2].set_xlabel('Date')
fig.autofmt_xdate()
fig.show()
In [ ]:
threshold = 0.5
anom_mask = anomaly_scores['changepoint_score'] >= threshold
anomalies = anomaly_scores[anom_mask]
print "Number of anomalies:", len(anomalies)
anomalies.head(5)
If you have a fixed budget for investigating and acting on anomalies, this is a good way to go.
In [ ]:
anomalies = anomaly_scores.to_sframe().topk('changepoint_score', k=5)
print "Number of anomalies:", len(anomalies)
anomalies.head(5)
In [ ]:
anomaly_scores['changepoint_score'].show()
In [ ]:
threshold = 0.072
anom_mask = anomaly_scores['changepoint_score'] >= threshold
anomalies = anomaly_scores[anom_mask]
print "Number of anomalies:", len(anomalies)
anomalies.head(5)
In [ ]:
from interactive_plot import LineDrawer
fig, ax = plt.subplots(3, sharex=True)
guide_lines = []
threshold_lines = []
p = ax[0].plot(anomaly_scores['time'], anomaly_scores['count'],
color='dodgerblue')
ax[0].set_ylabel('# quakes')
line, = ax[0].plot((anomaly_scores.min_time, anomaly_scores.min_time),
ax[0].get_ylim(), lw=1, ls='--', color='black')
guide_lines.append(line)
ax[1].plot(anomaly_scores['time'], anomaly_scores['outlier_score'],
color='orchid')
ax[1].set_ylabel('outlier score')
line, = ax[1].plot((anomaly_scores.min_time, anomaly_scores.min_time),
ax[1].get_ylim(), lw=1, ls='--', color='black')
guide_lines.append(line)
ax[2].plot(anomaly_scores['time'], anomaly_scores['changepoint_score'],
color='orchid')
ax[2].set_ylabel('changepoint score')
ax[2].set_xlabel('Date')
line, = ax[2].plot((anomaly_scores.min_time, anomaly_scores.min_time), (0., 1.),
lw=1, ls='--', color='black')
guide_lines.append(line)
for a in ax:
line, = a.plot(anomaly_scores.range, (0., 0.), lw=1, ls='--',
color='black')
threshold_lines.append(line)
plot_scores = anomaly_scores[['count', 'outlier_score', 'changepoint_score']]
interactive_thresholder = LineDrawer(plot_scores, guide_lines, threshold_lines)
interactive_thresholder.connect()
fig.autofmt_xdate()
fig.show()
In [ ]:
interactive_thresholder.anoms.print_rows(10)
In [ ]:
okla_new = gl.load_timeseries('working_data/ok_daily_update.ts')
okla_new.print_rows(20)
Why do we want to update the model, rather than training a new one?
In [ ]:
changept_model2 = changept_model.update(okla_new)
print changept_model2
In [ ]:
changept_model2.scores.print_rows(20)