detect_anoms <- function(data, k = 0.49, alpha = 0.05, num_obs_per_period = NULL,
                         use_decomp = TRUE, use_esd = FALSE, one_tail = TRUE,
                         upper_tail = TRUE, verbose = FALSE)
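For orientation, here is a minimal, hypothetical invocation of the function, assuming `data` is a two-column data frame (timestamps first, then counts); the series and parameter values below are made up for illustration:

In [ ]:
# Hypothetical example input: two weeks of hourly counts with one injected spike
raw <- data.frame(timestamp = 1:(24 * 14),
                  count = rpois(24 * 14, lambda = 100))
raw$count[100] <- 500
res <- detect_anoms(raw, k = 0.02, alpha = 0.05, num_obs_per_period = 24)
res$anoms   # timestamps/indices flagged as anomalous (NULL if none)
res$stl     # smoothed seasonal + trend component ("expected values")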
The generalized ESD (extreme Studentized deviate) test is used to detect one or more outliers in a univariate data set that follows an approximately normal distribution.
The generalized ESD test is defined for the hypotheses:
H0: There are no outliers in the data set
Ha: There are up to r outliers in the data set
We test the null hypothesis that the data has no outliers vs. the alternative hypothesis that there are at most k outliers (for some user-specified value of k).
To test a data set S with n elements, we generate k test statistics $G_1, G_2, \ldots, G_k$, where each $G_j$ is a two-tailed Grubbs' statistic, defined as follows:

$$S_1 = S$$

$$S_{j+1} = S_j \setminus \{x_j\}, \quad \text{where } x_j \text{ is the element of } S_j \text{ that maximizes } |x_i - \bar{x}_j|$$

$$G_j = \frac{\max_{x_i \in S_j} |x_i - \bar{x}_j|}{s_j}$$

where $\bar{x}_j$ is the mean of $S_j$ and $s_j$ is the standard deviation of $S_j$. For a single Grubbs' test, the null hypothesis is rejected if $G > G_{crit}$, where

$$G_{crit} = \frac{n-1}{\sqrt{n}} \sqrt{\frac{t_{crit}^2}{n - 2 + t_{crit}^2}}$$

and $t_{crit}$ is the critical value of the t distribution $T(n-2)$ at significance level $\alpha/n$ (one-tailed; $\alpha/(2n)$ for the two-tailed test).
Essentially you run k separate Grubbs' tests, testing whether $G_j > G_{j,\mathrm{crit}}$, where $G_{j,\mathrm{crit}}$ is $G_{crit}$ as described above but adjusted for the reduced sample size, i.e. with n replaced by $n - j + 1$. Now let r be the largest value of $j \le k$ such that $G_j > G_{j,\mathrm{crit}}$. We then conclude there are r outliers, namely $x_1, \ldots, x_r$; if $r = 0$, there are no outliers.
Note that if $G_j > G_{j,\mathrm{crit}}$ and $h < j$, then both $x_h$ and $x_j$ are outliers, even if $G_h \le G_{h,\mathrm{crit}}$.
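As a concrete illustration of the procedure above, here is a minimal, self-contained sketch of the plain (non-seasonal) two-tailed generalized ESD test in R; the function name `gesd` and the example data are made up:

In [ ]:
# Minimal two-tailed generalized ESD sketch (illustrative only)
gesd <- function(x, k, alpha = 0.05) {
  n <- length(x)
  remaining   <- seq_along(x)   # original indices still in S_j
  outlier_idx <- integer(k)
  num_anoms   <- 0L
  for (j in 1:k) {
    xs    <- x[remaining]
    ares  <- abs(xs - mean(xs)) / sd(xs)   # |x_i - xbar_j| / s_j
    G     <- max(ares)                     # Grubbs' statistic G_j
    worst <- which.max(ares)
    outlier_idx[j] <- remaining[worst]
    remaining      <- remaining[-worst]    # S_{j+1} = S_j minus the extreme point
    # Critical value for the j-th test (sample size n - j + 1)
    p   <- 1 - alpha / (2 * (n - j + 1))
    t   <- qt(p, df = n - j - 1)
    lam <- t * (n - j) / sqrt((n - j - 1 + t^2) * (n - j + 1))
    if (G > lam) num_anoms <- j            # r = largest j with G_j > G_{j,crit}
  }
  if (num_anoms > 0) outlier_idx[1:num_anoms] else integer(0)
}

set.seed(1)
x <- c(rnorm(50), 9, -8)   # two injected outliers
gesd(x, k = 5)             # positions 51 and 52 should be flagged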
In [ ]:
# Check that the input 'num_obs_per_period' (the number of observations in each period) was supplied
if(is.null(num_obs_per_period)){
  stop("must supply period length for time series decomposition")
}
num_obs <- nrow(data)
# Check to make sure we have at least two periods worth of data for anomaly context
if(num_obs < num_obs_per_period * 2){
  stop("Anom detection needs at least 2 periods worth of data")
}
# Check whether the timestamps are POSIX dates
posix_timestamp <- class(data[[1L]])[1L] == "POSIXlt"
# Handling NAs: pad the series with sentinel NAs and run-length encode is.na().
# Leading/trailing NA runs merge with the sentinels, so more than three runs
# means the series contains interior (non-leading) NAs.
if (length(rle(is.na(c(NA, data[[2L]], NA)))$values) > 3){
  stop("Data contains non-leading NAs. We suggest replacing NAs with interpolated values (see na.approx in Zoo package).")
} else {
  data <- na.omit(data)
}
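To see why this run-length check works, a quick illustration (the vectors are made up):

In [ ]:
x_ok  <- c(NA, NA, 3, 4, 5)   # leading NAs only
x_bad <- c(1, NA, 3, 4, 5)    # interior (non-leading) NA

# The sentinel NAs merge with any leading/trailing NA runs, so a clean series
# always encodes to exactly three runs: TRUE, FALSE, TRUE
length(rle(is.na(c(NA, x_ok,  NA)))$values)   # 3 -> passes
length(rle(is.na(c(NA, x_bad, NA)))$values)   # 5 -> triggers the stop()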
In [ ]:
# Step 1: Decompose the data. This returns a univariate remainder which will be used for anomaly detection.
data_decomp <- stl(ts(data[[2L]], frequency = num_obs_per_period),
                   s.window = "periodic", robust = TRUE)
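For readers unfamiliar with stl(), a quick look at what the decomposition returns, using the built-in co2 series as a stand-in for real data:

In [ ]:
# stl() returns an object whose $time.series matrix holds three columns:
# "seasonal", "trend" and "remainder" (seasonal + trend + remainder = data)
fit <- stl(co2, s.window = "periodic", robust = TRUE)
head(fit$time.series)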
In [ ]:
# Remove the seasonal component and the median of the data to create the univariate remainder.
# (Using the median of the series rather than the STL trend keeps the baseline robust
# to long anomalous runs; this robustness is the "hybrid" in S-H-ESD.)
data <- data.frame(timestamp = data[[1L]],
                   count = data[[2L]] - data_decomp$time.series[, "seasonal"] - median(data[[2L]]))
# Store the smoothed seasonal component plus the trend component for use in
# determining the "expected values" option
data_decomp <- data.frame(timestamp = data[[1L]],
                          count = as.numeric(trunc(data_decomp$time.series[, "trend"] +
                                                   data_decomp$time.series[, "seasonal"])))
if(posix_timestamp){
  data_decomp <- format_timestamp(data_decomp)
}
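Continuing the co2 illustration from above, the same construction of the ESD input and the expected values would look like this (illustrative only):

In [ ]:
# Hybrid remainder fed to ESD: raw series minus seasonal component minus global median
esd_input <- as.numeric(co2 - fit$time.series[, "seasonal"]) - median(co2)
# "Expected values": trend plus seasonal, truncated to integers as above
expected  <- as.numeric(trunc(fit$time.series[, "trend"] + fit$time.series[, "seasonal"]))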
In [ ]:
# Maximum number of outliers that S-H-ESD can detect (e.g. 49% of the data)
max_outliers <- trunc(num_obs * k)
if(max_outliers == 0){
  stop(paste0("With longterm=TRUE, AnomalyDetection splits the data into 2 week periods by default.
              You have ", num_obs, " observations in a period, which is too few. Set a higher piecewise_median_period_weeks."))
}
# Robust location (median) and scale (MAD) estimators, used in place of the mean and sd
func_ma    <- match.fun(median)
func_sigma <- match.fun(mad)
## Define values and vectors.
# R_idx stores the timestamps (or indices) of up to max_outliers candidate anomalies
n <- length(data[[2L]])
if (posix_timestamp){
  R_idx <- as.POSIXlt(data[[1L]][1L:max_outliers], tz = "UTC")
} else {
  R_idx <- 1L:max_outliers
}
# Initialise the number of detected anomalies to zero
num_anoms <- 0L
# Compute the test statistic until r = max_outliers values have been
# removed from the sample
for (i in 1L:max_outliers){
  if(verbose) message(paste(i, "/", max_outliers, "completed"))

  # Compute deviations from the median: signed for the one-tailed test,
  # absolute for the two-tailed test
  if(one_tail){
    if(upper_tail){
      ares <- data[[2L]] - func_ma(data[[2L]])
    } else {
      ares <- func_ma(data[[2L]]) - data[[2L]]
    }
  } else {
    ares <- abs(data[[2L]] - func_ma(data[[2L]]))
  }

  # Protect against a constant time series
  data_sigma <- func_sigma(data[[2L]])
  if(data_sigma == 0)
    break

  # Studentize the deviations by dividing by the MAD (median absolute deviation)
  ares <- ares/data_sigma
  R <- max(ares)
  temp_max_idx <- which(ares == R)[1L]
  R_idx[i] <- data[[1L]][temp_max_idx]

  # Remove the most extreme point from the data set before the next iteration
  data <- data[-which(data[[1L]] == R_idx[i]), ]

  ## Compute the critical value lambda_i
  if(one_tail){
    p <- 1 - alpha/(n-i+1)
  } else {
    p <- 1 - alpha/(2*(n-i+1))
  }
  t <- qt(p, (n-i-1L))
  lam <- t*(n-i) / sqrt((n-i-1+t^2)*(n-i+1))

  # r is the largest i for which the statistic exceeds the critical value
  if(R > lam)
    num_anoms <- i
}
# Keep only the detected anomalies, if any
if(num_anoms > 0) {
  R_idx <- R_idx[1L:num_anoms]
} else {
  R_idx <- NULL
}
# Output list containing the anomalies and their positions, plus the decomposition
anom_list <- list(anoms = R_idx, stl = data_decomp)
anom_list contains the detected anomalies along with their respective positions, returned in list format.
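The critical value computed inside the loop is the standard generalized ESD threshold $\lambda_i$; written out for the two-tailed case:

$$\lambda_i = \frac{(n-i)\, t_{p,\, n-i-1}}{\sqrt{\left(n-i-1+t_{p,\, n-i-1}^2\right)\left(n-i+1\right)}}, \qquad p = 1 - \frac{\alpha}{2(n-i+1)}$$

where $t_{p,\nu}$ is the $p$-quantile of the t distribution with $\nu$ degrees of freedom (hence `qt(p, n-i-1)` in the code); the one-tailed variant drops the factor of 2 in $p$.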