Basado en Shirin Glander Pag personal :https://shiring.github.io/about
In [1]:
options(repr.plot.width=4, repr.plot.height=3)
In [2]:
unzip("creditcard.csv.zip")
In [3]:
creditcard <- read.csv("creditcard.csv")
In [4]:
str(creditcard)
In [5]:
library(dplyr)
library(ggplot2)
In [6]:
creditcard %>%
ggplot(aes(x = Class)) +
geom_bar(color = "navy", fill = "deepskyblue4")
In [7]:
summary(creditcard$Time)
In [8]:
# separate transactions by day
creditcard$day <- ifelse(creditcard$Time > 3600 * 24, "day2", "day1")
In [9]:
# make transaction relative to day
creditcard$Time_day <- ifelse(creditcard$day == "day2", creditcard$Time - 86400, creditcard$Time)
In [10]:
summary(creditcard[creditcard$day == "day1", ]$Time_day)
In [11]:
summary(creditcard[creditcard$day == "day2", ]$Time_day)
In [12]:
# bin transactions according to time of day
creditcard$Time <- as.factor(ifelse(creditcard$Time_day <= 38138, "gr1", # mean 1st Qu.
ifelse(creditcard$Time_day <= 52327, "gr2", # mean mean
ifelse(creditcard$Time_day <= 69580, "gr3", # mean 3rd Qu
"gr4"))))
In [13]:
creditcard %>%
ggplot(aes(x = day)) +
geom_bar(color = "navy", fill = "deepskyblue4")
In [14]:
creditcard <- select(creditcard, -Time_day, -day)
In [15]:
# convert class variable to factor
creditcard$Class <- factor(creditcard$Class)
creditcard %>%
ggplot(aes(x = Time)) +
geom_bar(color = "navy", fill = "deepskyblue4") +
facet_wrap( ~ Class, scales = "free", ncol = 2)
In [16]:
summary(creditcard[creditcard$Class == "0", ]$Amount)
In [17]:
summary(creditcard[creditcard$Class == "1", ]$Amount)
In [18]:
creditcard %>%
ggplot(aes(x = Amount)) +
geom_histogram(color = "navy", fill = "deepskyblue4") +
facet_wrap( ~ Class, scales = "free", ncol = 2)
In [50]:
library(h2o)
h2o.init(nthreads = 1)
In [20]:
# convert data to H2OFrame
creditcard_hf <- as.h2o(creditcard)
In [21]:
splits <- h2o.splitFrame(creditcard_hf,
ratios = c(0.4, 0.4),
seed = 42)
train_unsupervised <- splits[[1]]
train_supervised <- splits[[2]]
test <- splits[[3]]
response <- "Class"
features <- setdiff(colnames(train_unsupervised), response)
In [22]:
model_nn <- h2o.deeplearning(x = features,
training_frame = train_unsupervised,
model_id = "model_nn",
autoencoder = TRUE,
reproducible = TRUE, #slow - turn off for real problems
ignore_const_cols = FALSE,
seed = 42,
hidden = c(10, 2, 10),
epochs = 100,
activation = "Tanh")
In [23]:
h2o.saveModel(model_nn, path='C:/Users/admin/Desktop/GitHub/Mineria.de.Datos.Aplicada/Detención Anomalias',force = TRUE)
In [24]:
model_nn <- h2o.loadModel("model_nn/model_nn")
model_nn
In [25]:
#Convert to autoencoded representation
test_autoenc <- h2o.predict(model_nn, test)
In [26]:
train_features <- h2o.deepfeatures(model_nn, train_unsupervised, layer = 2) %>%
as.data.frame() %>%
mutate(Class = as.vector(train_unsupervised[, 31]))
In [27]:
ggplot(train_features, aes(x = DF.L2.C1, y = DF.L2.C2, color = Class)) +
geom_point(alpha = 0.1)
In [28]:
# let's take the third hidden layer
train_features <- h2o.deepfeatures(model_nn, train_unsupervised, layer = 3) %>%
as.data.frame() %>%
mutate(Class = as.factor(as.vector(train_unsupervised[, 31]))) %>%
as.h2o()
features_dim <- setdiff(colnames(train_features), response)
In [29]:
model_nn_dim <- h2o.deeplearning(y = response,
x = features_dim,
training_frame = train_features,
reproducible = TRUE, #slow - turn off for real problems
balance_classes = TRUE,
ignore_const_cols = FALSE,
seed = 42,
hidden = c(10, 2, 10),
epochs = 100,
activation = "Tanh")
In [30]:
h2o.saveModel(model_nn_dim, path="model_nn_dim", force = TRUE)
In [31]:
model_nn_dim <- h2o.loadModel("model_nn_dim/DeepLearning_model_R_1493672949388_1")
model_nn_dim
In [32]:
test_dim <- h2o.deepfeatures(model_nn, test, layer = 3)
h2o.predict(model_nn_dim, test_dim) %>%
as.data.frame() %>%
mutate(actual = as.vector(test[, 31])) %>%
group_by(actual, predict) %>%
summarise(n = n()) %>%
mutate(freq = n / sum(n))
In [33]:
anomaly <- h2o.anomaly(model_nn, test) %>%
as.data.frame() %>%
tibble::rownames_to_column() %>%
mutate(Class = as.vector(test[, 31]))
In [34]:
mean_mse <- anomaly %>%
group_by(Class) %>%
summarise(mean = mean(Reconstruction.MSE))
In [35]:
ggplot(anomaly, aes(x = as.numeric(rowname), y = Reconstruction.MSE, color = as.factor(Class))) +
geom_point(alpha = 0.3) +
geom_hline(data = mean_mse, aes(yintercept = mean, color = Class)) +
scale_color_brewer(palette = "Set1") +
labs(x = "instance number",
color = "Class")
In [36]:
anomaly <- anomaly %>%
mutate(outlier = ifelse(Reconstruction.MSE > 0.02, "outlier", "no_outlier"))
In [37]:
anomaly %>%
group_by(Class, outlier) %>%
summarise(n = n()) %>%
mutate(freq = n / sum(n))
In [38]:
model_nn_2 <- h2o.deeplearning(y = response,
x = features,
training_frame = train_supervised,
pretrained_autoencoder = "model_nn",
reproducible = TRUE, #slow - turn off for real problems
balance_classes = TRUE,
ignore_const_cols = FALSE,
seed = 42,
hidden = c(10, 2, 10),
epochs = 100,
activation = "Tanh")
In [39]:
h2o.saveModel(model_nn_2, path="model_nn_2", force = TRUE)
In [40]:
model_nn_2 <- h2o.loadModel("model_nn_2/DeepLearning_model_R_1493672949388_23")
model_nn_2
In [41]:
pred <- as.data.frame(h2o.predict(object = model_nn_2, newdata = test)) %>%
mutate(actual = as.vector(test[, 31]))
In [42]:
pred %>%
group_by(actual, predict) %>%
summarise(n = n()) %>%
mutate(freq = n / sum(n))
In [43]:
pred %>%
ggplot(aes(x = actual, fill = predict)) +
geom_bar() +
scale_fill_brewer(palette = "Set1") +
facet_wrap( ~ actual, scales = "free", ncol = 2)
In [44]:
library(ROCR)
# http://stackoverflow.com/questions/24563061/computing-integral-of-a-line-plot-in-r
line_integral <- function(x, y) {
dx <- diff(x)
end <- length(y)
my <- (y[1:(end - 1)] + y[2:end]) / 2
sum(dx * my)
}
In [45]:
prediction_obj <- prediction(pred$p1, pred$actual)
par(mfrow = c(1, 2))
par(mar = c(5.1,4.1,4.1,2.1))
# precision-recall curve
perf1 <- performance(prediction_obj, measure = "prec", x.measure = "rec")
x <- perf1@x.values[[1]]
y <- perf1@y.values[[1]]
y[1] <- 0
options(repr.plot.width=8, repr.plot.height=4)
plot(perf1, main = paste("Area bajo la \n Curva Precision-Recall :\n", round(abs(line_integral(x,y)), digits = 3)))
In [46]:
# sensitivity-specificity curve
perf2 <- performance(prediction_obj, measure = "sens", x.measure = "spec")
x <- perf2@x.values[[1]]
y <- perf2@y.values[[1]]
y[1] <- 0
plot(perf2, main = paste("Area bajo la curva\nSensitivity-Specificity :\n", round(abs(line_integral(x,y)), digits = 3)))
In [47]:
thresholds <- seq(from = 0, to = 1, by = 0.1)
pred_thresholds <- data.frame(actual = pred$actual)
for (threshold in thresholds) {
prediction <- ifelse(pred$p1 > threshold, 1, 0)
prediction_true <- ifelse(pred_thresholds$actual == prediction, TRUE, FALSE)
pred_thresholds <- cbind(pred_thresholds, prediction_true)
}
In [48]:
colnames(pred_thresholds)[-1] <- thresholds
library(tidyverse)
pred_thresholds %>%
gather(x, y, 2:ncol(pred_thresholds)) %>%
group_by(actual, x, y) %>%
summarise(n = n()) %>%
ggplot(aes(x = as.numeric(x), y = n, color = actual)) +
geom_vline(xintercept = 0.6, alpha = 0.5) +
geom_line() +
geom_point(alpha = 0.5) +
facet_wrap(actual ~ y, scales = "free", ncol = 2) +
labs(x = "prediction threshold",
y = "number of instances")
In [49]:
pred %>%
mutate(predict = ifelse(pred$p1 > 0.6, 1, 0)) %>%
group_by(actual, predict) %>%
summarise(n = n()) %>%
mutate(freq = n / sum(n))
In [ ]:
In [ ]:
In [ ]: