In [1]:
library(ggplot2)
library(dplyr)
library(reshape2)
library(Matrix)
library(gridExtra)
# You may need to set these to your own local paths.
project_directory <- file.path(Sys.getenv("GIT_REPO_LOC"), "MicrocreditLRVB/inst/simulated_data")
raw_data_directory <- file.path(Sys.getenv("GIT_REPO_LOC"), "microcredit_vb/data")
In [2]:
# Read in the data that was produced by the R script.
csv_data <- read.csv(file.path(raw_data_directory, "microcredit_data_processed.csv"))
# The number of distinct groups.
n_g <- max(csv_data$site)
# Get the observations and the total number of observations.
y <- csv_data$profit
y_g <- as.integer(csv_data$site)
# The x array will indicate which rows should also get the
# treatment effect. The model accomodates generic x, but for
# this application we only need indicators.
x <- cbind(rep(1, length(y)), as.numeric(csv_data$treatment))
data_df <- data.frame(x=x, y=y, y_g=y_g)
data_df_transform <-
data_df %>%
mutate(zero_y=abs(y) < 1e-8)
# Note that 1e-8 appears to be a good filter -- we don't have any
# nearly zero entries until ~1e-1, above which they're probably actual nonzero values.
min(abs(filter(data_df_transform, !zero_y)$y))
In [4]:
#######################
# Look at qqplots and raw distributions with outlier trimming. Even after trimming, they do not look very normal.
trim_level <- 0.3
y_quantiles <-
filter(data_df_transform, !zero_y) %>%
group_by(y_g) %>%
summarize(qlower=quantile(y, trim_level), qupper=quantile(y, 1 - trim_level))
data_df_trim <-
filter(data_df_transform, !zero_y) %>%
inner_join(y_quantiles, by="y_g") %>%
filter(y < qupper & y > qlower) %>%
group_by(y_g) %>%
arrange(y) %>%
mutate(q=(1:length(y)) / length(y), norm=qnorm(q))
# Qqplots
ggplot(filter(data_df_trim)) +
geom_point(aes(x=norm, y=y)) +
facet_grid(y_g ~ ., scales="free")
ggplot(filter(data_df_trim)) +
geom_histogram(aes(x=y, y=..ndensity..), bins=20) +
facet_grid(~ y_g, scales="free") +
geom_vline(aes(xintercept=0))
In [ ]:
###################
# Look at cumulative distributions to check for power law behavior.
# If it is a power law, the cumulative distribution will be a straight line
# with slope given by the power law coefficient plus one.
quantile_list <- list()
for (group in 1:max(data_df_transform$y_g)) { for (y_sign in c(-1, 1)) { for (arm in c(0, 1)) {
rows <- with(data_df_transform, (y_g == group) & (!zero_y) & (y * y_sign > 0) & (x.2 == arm))
if (sum(rows) > 0) {
quantile_df <- data.frame(y=sort(y_sign * data_df_transform[rows, ]$y),
quantile=(sum(rows):1) / sum(rows),
group=group, y_sign=y_sign, arm=arm)
quantile_list[[length(quantile_list) + 1]] <- quantile_df
}
}}}
quantile_df <- do.call(rbind, quantile_list)
ggplot(quantile_df) +
geom_point(aes(x=y_sign * log10(y), y=log10(quantile), color=factor(arm))) +
facet_grid(group ~ y_sign) +
ggtitle("Overlaid arms")
ggplot(quantile_df) +
geom_point(aes(x=log10(y), y=log10(quantile), color=factor(y_sign))) +
facet_grid(group ~ arm) +
ggtitle("Overlaid signs")
ggplot(quantile_df) +
geom_point(aes(x=log10(y), y=log10(quantile), color=paste(arm, y_sign))) +
facet_grid(group ~ .) +
ggtitle("Everything overlaid")
In [ ]:
##########################################
# Execute box-cox transforms and save.
# A good reference for the Box-Cox transform:
# https://www.ime.usp.br/~abe/lista/pdfm9cJKUmFZp.pdf
library(MASS)
data_df_transform <-
data_df %>%
mutate(zero_y=abs(y) < 1e-8)
# Non-zero values of y_trans will be sent in the loop below.
data_df_transform$y_trans <- 0.0
data_df_transform$lambda <- NaN
for (group in 1:max(y_g)) { for (y_sign in c(-1, 1)) {
rows <- with(data_df_transform, (y_g == group) & (!zero_y) & (y * y_sign > 0))
bc_y <- y_sign * data_df_transform[rows, ]$y
if (length(bc_y) > 0) {
# The MASS boxcox function is pretty primitive. Better to do it yourself with optim.
bc <- boxcox(bc_y ~ 1, plotit=FALSE, lambda=seq(-1, 1, 0.001))
lambda <- bc$x[which.max(bc$y)]
if (abs(lambda) < 0.001) {
lambda <- 0
}
if (lambda == 0) {
y_trans <- log(bc_y)
} else {
y_trans <- ((bc_y ^ lambda) - 1) / lambda
}
qqnorm(y_trans, main=lambda)
#readline(prompt="Press [enter] to continue")
data_df_transform[rows, "y_trans"] <- y_sign * y_trans
data_df_transform[rows, "lambda"] <- lambda
}
}}
ggplot(filter(data_df_transform, !zero_y)) +
geom_histogram(aes(x=y_trans, y=..density..), bins=100) +
facet_grid(y_g ~ .)
# Look at the lambdas chosen. They are similar mostly except for group 2.
mutate(data_df_transform, y_pos=y > 0) %>%
filter(!zero_y) %>%
group_by(y_g, y_pos) %>%
summarize(lambda=unique(lambda))