In [48]:
# Preliminaries
library(dplyr)
library(magrittr)
library(MASS)
library(ggplot2)
set.seed(1)
n <- 10 ^ 3
mu <- matrix(c(0, 0))
rho <- 0.5
sigma1 <- 0.5
sigma2 <- 2
Sigma <- matrix(c(sigma1 ^ 2,
rho * sigma1 * sigma2,
rho * sigma1 * sigma2,
sigma2 ^ 2),
c(2, 2))
simulatedXandC <- mvrnorm(n = n, mu = mu, Sigma = Sigma)
simulatedError <- rnorm(n = n)
simulatedZ <- round(runif(n))
group <- round(runif(n))
In [49]:
df <- data.frame(X = simulatedXandC[, 1],
C = simulatedXandC[, 2],
error = simulatedError)
df %<>%
mutate(Y = 10 + 1 * X + 5 * C + error)
fit <- lm(Y ~ X, data = df)
summary(fit)
fit <- lm(Y ~ X + C, data = df)
summary(fit)
Out[49]:
Out[49]:
In [109]:
# Regression discontinuity design
library(rdd)
df <- data.frame(X = simulatedXandC[, 1],
C = simulatedXandC[, 2],
T = (simulatedXandC[, 1] > 0),
error = simulatedError)
df %<>%
mutate(Y = 10 + X ^ 3 + 5 * T + 1 * C + error)
fit <- RDestimate(Y ~ X, data = df)
summary(fit)
ggplot(data = df, aes(x = X, y = Y, color = (X > 0))) +
geom_point() +
geom_smooth(method = lm,
se = FALSE)
In [110]:
# no bunching
df %>%
ggplot(aes(x = X)) +
geom_histogram(breaks = seq(-2, 2, 0.5), colour="white", fill = "#00B39F")
# bunching
df %>%
mutate(rand = runif(n()),
X = ifelse(rand < 0.33 & X < 0, -X, X)) %>%
ggplot(aes(x = X)) +
geom_histogram(breaks = seq(-2, 2, 0.5), colour="white", fill = "#00B39F")
In [119]:
#
df %>%
ggplot(aes(x = error, color = (X > 0))) +
geom_density()
#
df %>%
ggplot(aes(x = C, color = (X > 0))) +
geom_density()
In [103]:
# Difference-in-differences
df <- data.frame(time = 10 * round(runif(n), 1) - 5,
treatment = group,
error = simulatedError)
df %<>%
mutate(Y = 1 - time + 5 * treatment + 7 * (time >= 0) * treatment + error)
fit <- lm(Y ~ time + treatment + I((time >= 0) * treatment), data = df)
summary(fit)
dfMean <- df %>%
group_by(time, treatment) %>%
summarise(Y = mean(Y))
ggplot(data = df, aes(x = time, y = Y, color = factor(treatment + 2 * (time >= 0)))) +
geom_point() +
geom_smooth(method = lm,
se = FALSE) +
theme(legend.position = "none") +
scale_color_manual(values = c("#F05253", "#2A73CC", "#F05253", "#2A73CC"))
Out[103]:
In [108]:
# parallel trends
#9772B2
df %>%
filter(time < 0) %>%
ggplot(aes(x = time, y = Y, color = factor(treatment + 2 * (time >= 0)))) +
geom_point() +
geom_smooth(method = lm,
se = FALSE) +
theme(legend.position = "none") +
scale_color_manual(values = c("#F05253", "#2A73CC", "#F05253", "#2A73CC"))
df %>%
mutate(Y = ifelse(treatment, Y - 2 * time, Y)) %>%
filter(time < 0) %>%
ggplot(aes(x = time, y = Y, color = factor(treatment + 2 * (time >= 0)))) +
geom_point() +
geom_smooth(method = lm,
se = FALSE) +
theme(legend.position = "none") +
scale_color_manual(values = c("#F05253", "#2A73CC", "#F05253", "#2A73CC"))
In [6]:
# Fixed-effects regression
df <- data.frame(X = simulatedXandC[, 1],
C = round(pmin(2, pmax(-2, simulatedXandC[, 2]))),
error = simulatedError)
df %<>%
mutate(Y = 10 + 1 * X + 5 * C + error)
fit <- lm(Y ~ X, data = df)
summary(fit)
fit <- lm(Y ~ X + factor(C), data = df)
summary(fit)
Out[6]:
Out[6]:
In [7]:
# Instrumental variables
df <- data.frame(Z = simulatedZ,
X = simulatedXandC[, 1] + simulatedZ,
C = simulatedXandC[, 2],
error = simulatedError)
df %<>%
mutate(Y = 10 + 1.0 * X - 2 * C + error)
# Naive regression produces biased estimates
fit <- lm(Y ~ X, data = df)
summary(fit, vcov = sandwich)
# IV recovers the underlying model
fit <- ivreg(Y ~ X | Z, data = df)
summary(fit, vcov = sandwich, df = Inf, diagnostics = TRUE)
Out[7]:
Out[7]: