Anomaly detection identifies data points that do not fit well with the rest of the data. In this notebook we demonstrate how to perform unsupervised anomaly detection using a recurrent neural network (RNN) model.
We use one of the datasets from the Numenta Anomaly Benchmark (NAB) (link) for the demo: the NYC taxi passengers dataset, which contains 10320 records, each giving the total number of NYC taxi passengers at a corresponding timestamp. We train an RNN to learn from the 50 previous values and predict the next value; data points whose actual values are distant from the predicted values are considered anomalies (the distance threshold can be adjusted as needed).
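The detection rule itself is easy to state in code. Below is a minimal sketch in plain NumPy (the function name flag_anomalies and its signature are illustrative, not part of the notebook's pipeline):
In [ ]:
import numpy as np

def flag_anomalies(actual, predicted, threshold):
    # a point is an anomaly when its absolute prediction error
    # meets or exceeds the chosen distance threshold
    errors = np.abs(np.asarray(actual) - np.asarray(predicted))
    return errors >= threshold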
In [2]:
import os
from zoo.common.nncontext import *
sc = init_nncontext("Anomaly Detection Example")
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
%pylab inline
import seaborn
import matplotlib.dates as md
from matplotlib import pyplot as plt
from sklearn import preprocessing
In [3]:
from zoo.pipeline.api.keras.layers import Dense, Dropout, LSTM
from zoo.pipeline.api.keras.models import Sequential
In [4]:
try:
    dataset_path = os.getenv("ANALYTICS_ZOO_HOME") + "/bin/data/NAB/nyc_taxi/nyc_taxi.csv"
    df = pd.read_csv(dataset_path)
except Exception as e:
    print("nyc_taxi.csv doesn't exist")
    print("you can run $ANALYTICS_ZOO_HOME/bin/data/NAB/nyc_taxi/get_nyc_taxi.sh to download nyc_taxi.csv")
Each record is in the format (timestamp, value). Timestamps range from 2014-07-01 to 2015-01-31.
In [5]:
print(df.info())
In [6]:
# check the timestamp format and frequency
print(df['timestamp'].head(10))
In [7]:
# check the mean passenger count
print(df['value'].mean())
In [8]:
# change the type of timestamp column for plotting
df['datetime'] = pd.to_datetime(df['timestamp'])
# visualisation of the number of taxi passengers over time
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df['datetime'], df['value'], color='blue', linewidth=0.6)
ax.set_title('NYC taxi passengers throughout time')
plt.xlabel('datetime')
plt.xticks(rotation=45)
plt.ylabel('The Number of NYC taxi passengers')
plt.show()
In [9]:
# the hours when people are awake (6:00-00:00)
df['hours'] = df['datetime'].dt.hour
df['awake'] = (((df['hours'] >= 6) & (df['hours'] <= 23)) | (df['hours'] == 0)).astype(int)
In [10]:
# creation of 2 distinct categories that seem useful (sleeping time and awake time)
df['categories'] = df['awake']
a = df.loc[df['categories'] == 0, 'value']
b = df.loc[df['categories'] == 1, 'value']
fig, ax = plt.subplots()
a_heights, a_bins = np.histogram(a)
b_heights, b_bins = np.histogram(b, bins=a_bins)
width = (a_bins[1] - a_bins[0])/6
ax.bar(a_bins[:-1], a_heights*100/a.count(), width=width, facecolor='yellow', label='Sleeping time')
ax.bar(b_bins[:-1]+width, (b_heights*100/b.count()), width=width, facecolor='red', label ='Awake time')
ax.set_title('Histogram of NYC taxi passengers in different categories')
plt.xlabel('The number of NYC taxi passengers')
plt.ylabel('Percentage of records (%)')
plt.legend()
plt.show()
In [11]:
df['awake'].head(4)
Out[11]:
In [12]:
df['timestamp'].head(4)
Out[12]:
From the above results, we can conclude that the records are sampled at regular 30-minute intervals, and that the derived awake flag lines up with the timestamps as intended.
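As a quick check of that conclusion (not part of the original notebook), differencing consecutive timestamps should show a single 30-minute step if the sampling is perfectly regular:
In [ ]:
# if sampling is regular, this prints one unique interval: 0 days 00:30:00
print(df['datetime'].diff().dropna().value_counts())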
In [13]:
#select and standardize data
data_n = df[['value', 'hours', 'awake']]
standard_scaler = preprocessing.StandardScaler()
np_scaled = standard_scaler.fit_transform(data_n)
data_n = pd.DataFrame(np_scaled)
#important parameters and train/test size
prediction_time = 1
testdatasize = 1000
unroll_length = 50
testdatacut = testdatasize + unroll_length + 1
#train data: everything except the last testdatacut rows; targets are column 0 of data_n (the scaled 'value')
x_train = data_n[0:-prediction_time-testdatacut].to_numpy()
y_train = data_n[prediction_time:-testdatacut][0].to_numpy()
#test data: the last testdatacut rows, offset by prediction_time
x_test = data_n[-testdatacut:-prediction_time].to_numpy()
y_test = data_n[prediction_time-testdatacut:][0].to_numpy()
For example, if unroll_length=5, the data
[[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
[10]
...
]
will be unrolled into the following sequences
[[ 1, 2, 3, 4, 5]
[ 2, 3, 4, 5, 6]
[ 3, 4, 5, 6, 7]
[ 4, 5, 6, 7, 8]
[ 5, 6, 7, 8, 9]
[ 6, 7, 8, 9, 10]
...
]
In [14]:
#unroll: create a sequence of 50 previous data points for each data point
def unroll(data, sequence_length=24):
    result = []
    for index in range(len(data) - sequence_length):
        result.append(data[index: index + sequence_length])
    return np.asarray(result)
# adapt the datasets for the sequence data shape
x_train = unroll(x_train, unroll_length)
x_test = unroll(x_test, unroll_length)
y_train = y_train[-x_train.shape[0]:]
y_test = y_test[-x_test.shape[0]:]
# see the shape
print("x_train", x_train.shape)
print("y_train", y_train.shape)
print("x_test", x_test.shape)
print("y_test", y_test.shape)
In [15]:
# Build the model: two stacked LSTM layers with dropout, then a dense output
model = Sequential()
model.add(LSTM(
    input_shape=(x_train.shape[1], x_train.shape[-1]),
    output_dim=20,
    return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(
    10,
    return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(output_dim=1))
model.compile(loss='mse', optimizer='rmsprop')
In [16]:
%%time
# Train the model
print("Training begins.")
model.fit(
    x_train,
    y_train,
    batch_size=1024,
    nb_epoch=20)
print("Training completed.")
In [17]:
# compute the differences between predictions and actual test values
diff = []
ratio = []
predictions = model.predict(x_test)
# predict returns the results as a distributed RDD; collect them to the driver
p = predictions.collect()
for u in range(len(y_test)):
    pr = p[u][0]
    ratio.append((y_test[u] / pr) - 1)
    diff.append(abs(y_test[u] - pr))
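The loop above can also be written in vectorized form; this sketch assumes p is a list of length-1 arrays, which is what collect() returns here:
In [ ]:
p_arr = np.asarray(p).reshape(-1)
diff_vec = np.abs(y_test - p_arr)   # same values as diff
ratio_vec = (y_test / p_arr) - 1    # same values as ratio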
In [18]:
# plot the predicted values and actual values (for the test data)
fig, axs = plt.subplots()
axs.plot(p,color='red', label='predicted values')
axs.plot(y_test,color='blue', label='actual values')
axs.set_title('the predicted values and actual values (for the test data)')
plt.xlabel('test data index')
plt.ylabel('number of taxi passengers after scaling')
plt.legend(loc='upper left')
plt.show()
In [19]:
# an estimate of the fraction of anomalies in the dataset
outliers_fraction = 0.01
# select the most distant prediction/reality data points as anomalies
diff = pd.Series(diff)
number_of_outliers = int(outliers_fraction*len(diff))
threshold = diff.nlargest(number_of_outliers).min()
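To make the threshold rule concrete, here is a toy illustration with made-up numbers: nlargest(n).min() returns the n-th largest error, so (absent ties) exactly the n most distant points end up at or above the threshold:
In [ ]:
s = pd.Series([0.1, 0.5, 3.2, 0.2, 4.1, 0.3])
thr = s.nlargest(3).min()      # third-largest value: 0.5
print(thr, (s >= thr).sum())   # 0.5 3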
In [20]:
# plot the difference and the threshold (for the test data)
fig, axs = plt.subplots()
axs.plot(diff,color='blue', label='diff')
axs.set_title('the difference between the predicted values and actual values with the threshold line')
plt.hlines(threshold, 0, 1000, color='red', label='threshold')
plt.xlabel('test data index')
plt.ylabel('difference value after scaling')
plt.legend(loc='upper left')
plt.show()
In [21]:
# label the test part: 1 where the prediction error exceeds the threshold
test = (diff >= threshold).astype(int)
# the training part, where we made no predictions (predicting on training data risks overfitting): no anomaly labels
complement = pd.Series(0, index=np.arange(len(data_n) - testdatasize))
last_train_data = (df['datetime'].tolist())[-testdatasize]
# append the test labels to the training part so the labels cover the whole dataset
df['anomaly27'] = complement.append(test, ignore_index=True)
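Note that Series.append was removed in pandas 2.0; if you run this notebook with a recent pandas, an equivalent using pd.concat is:
In [ ]:
# drop-in replacement for Series.append on pandas >= 2.0
df['anomaly27'] = pd.concat([complement, test], ignore_index=True)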
In [22]:
# visualisation of anomalies throughout time
fig, ax = plt.subplots(figsize=(12, 5))
a = df.loc[df['anomaly27'] == 1, ['datetime', 'value']]  # anomalous points
ax.plot(df['datetime'], df['value'], color='blue', label='normal values', linewidth=0.6)
ax.scatter(a['datetime'].tolist(), a['value'], color='red', label='anomalies')
ax.set_title('the number of NYC taxi passengers throughout time (with anomalies scattered)')
max_value = df['value'].max()
min_value = df['value'].min()
plt.vlines(last_train_data, min_value, max_value, color='black', linestyles = "dashed", label='test begins')
plt.xlabel('datetime')
plt.xticks(rotation=45)
plt.ylabel('The number of NYC taxi passengers')
plt.legend(loc='upper left')
plt.show()