This vignette is a parity check between Rfuzzydid and
Stata’s fuzzydid. Each section runs an R example and, where
possible, compares against frozen Stata outputs used in the test
suite.
The Stata snippets assume the Stata fuzzydid command is
already installed locally; Rfuzzydid no longer bundles the
.ado sources.
The shared estimators come from de Chaisemartin and D’Haultfoeuille (2018): local average treatment effects (LATE) and local quantile treatment effects under fuzzy DID assumptions.
We start with a simple two-period, two-group design:
set.seed(50321)
n_cell <- 120 # observations per cell
# Helper function to create each (G, T) cell
make_cell <- function(g, t, prob_d) {
d <- rbinom(n_cell, size = 1, prob = prob_d)
# Outcome: base + group effect + time effect + treatment effect + noise
y <- 1 + 0.5 * g + 0.4 * t + 2.0 * d + sin(seq_len(n_cell) / 7)
data.frame(y = y, g = g, t = t, d = d)
}
# Build the four cells
df <- rbind(
make_cell(g = 0, t = 0, prob_d = 0.20), # Control group, baseline
make_cell(g = 0, t = 1, prob_d = 0.40), # Control group, follow-up
make_cell(g = 1, t = 0, prob_d = 0.30), # Treatment group, baseline
make_cell(g = 1, t = 1, prob_d = 0.80) # Treatment group, follow-up
)
# Verify cell sizes and treatment rates
aggregate(cbind(n = d) ~ g + t, data = df, FUN = function(x) c(n = length(x), mean = mean(x)))
#> g t n.n n.mean
#> 1 0 0 120.0000000 0.1583333
#> 2 1 0 120.0000000 0.2916667
#> 3 0 1 120.0000000 0.4333333
#> 4 1 1 120.0000000 0.8083333
# Save fixture used by most examples for direct Stata comparison
if (requireNamespace("haven", quietly = TRUE)) {
haven::write_dta(df, "stata-parity-sim-fixture.dta")
}The treatment rate jumps from 30% to 80% in the treated group and from 20% to 40% in controls. That gap in treatment uptake changes is the first stage that identifies fuzzy DID effects.
We request the three main LATE estimators:
fit <- fuzzydid(
data = df,
formula = y ~ d,
group = "g",
time = "t",
did = TRUE,
tc = TRUE,
cic = TRUE,
breps = 50
)
summary(fit)
#> LATE estimators
#>
#>
#> |estimator | estimate| std.error| conf.low| conf.high|
#> |:---------|--------:|---------:|--------:|---------:|
#> |W_DID | 2.000000| 0.4679188| 1.088768| 2.960728|
#> |W_TC | 1.973204| 0.2333661| 1.526637| 2.309330|
#> |W_CIC | 2.222098| 0.2028711| 1.820405| 2.545051|
stata_basic <- c(
W_DID = 2.00000000000000,
W_TC = 1.97320415000280,
W_CIC = 2.22209836585117
)
r_basic <- setNames(fit$late$estimate, fit$late$estimator)[names(stata_basic)]
basic_cmp <- data.frame(
estimator = names(stata_basic),
R = unname(r_basic),
Stata = unname(stata_basic),
abs_diff = abs(unname(r_basic) - unname(stata_basic))
)
knitr::kable(basic_cmp, digits = 8)| estimator | R | Stata | abs_diff |
|---|---|---|---|
| W_DID | 2.000000 | 2.000000 | 0e+00 |
| W_TC | 1.973204 | 1.973204 | 8e-08 |
| W_CIC | 2.222098 | 2.222098 | 5e-08 |
The table reports point estimates, bootstrap standard errors, and percentile confidence intervals.
Equivalent Stata code:
modelsummary gives a compact table that is easy to pass
into reports:
library(modelsummary)
modelsummary::modelsummary(fit)
#> Found litedown! Enabling r-universe template| (1) | |
|---|---|
| W_DID | 2.000 |
| (0.468) | |
| W_TC | 1.973 |
| (0.233) | |
| W_CIC | 2.222 |
| (0.203) | |
| Num.Obs. | 480 |
| backend | native |
| N.11 | 120 |
| N.10 | 120 |
| N.01 | 120 |
| N.00 | 120 |
| N.reps | 50 |
| N.misreps | 0 |
| Share.failures | 0 |
Equivalent Stata code:
This chunk reproduces the exact fixture used by the parity tests and checks that R and Stata point estimates agree to tolerance:
n_cell <- 120L
make_parity_cell <- function(g, t, ones) {
data.frame(
g = rep.int(g, n_cell),
t = rep.int(t, n_cell),
d = c(rep.int(1L, ones), rep.int(0L, n_cell - ones))
)
}
out <- rbind(
make_parity_cell(0L, 0L, 24L),
make_parity_cell(0L, 1L, 48L),
make_parity_cell(1L, 0L, 36L),
make_parity_cell(1L, 1L, 96L)
)
out$id <- seq_len(nrow(out))
out$y <- 1 + 0.5 * out$g + 0.4 * out$t + 2 * out$d + sin(out$id / 7)
parity_fixture <- out[, c("y", "g", "t", "d")]
fit_parity <- fuzzydid(
data = parity_fixture,
formula = y ~ d,
group = "g",
time = "t",
did = TRUE,
tc = TRUE,
cic = TRUE,
breps = 5,
seed = 1
)
stata_core <- c(
W_DID = 2.17430877504668,
W_TC = 2.18553887285118,
W_CIC = 2.31982319056988
)
r_core <- setNames(fit_parity$late$estimate, fit_parity$late$estimator)[c("W_DID", "W_TC", "W_CIC")]
core_cmp <- data.frame(
estimator = names(stata_core),
R = unname(r_core),
Stata = unname(stata_core),
abs_diff = abs(unname(r_core) - unname(stata_core))
)
knitr::kable(core_cmp, digits = 8)| estimator | R | Stata | abs_diff |
|---|---|---|---|
| W_DID | 2.174309 | 2.174309 | 7e-08 |
| W_TC | 2.185539 | 2.185539 | 4e-08 |
| W_CIC | 2.319823 | 2.319823 | 3e-08 |
# Save exact parity fixture for Stata users
if (requireNamespace("haven", quietly = TRUE)) {
haven::write_dta(parity_fixture, "stata-parity-core-fixture.dta")
}Point estimates are strict parity targets here. Standard errors and intervals depend on bootstrap simulation, so tiny platform-level differences are normal.
Equivalent Stata code:
* Load the same dataset exported in the parity check chunk
use "stata-parity-core-fixture.dta", clear
* Generate the group variable (already in dataset as 'g')
* The Stata command expects G to identify treatment groups
* Run fuzzydid
fuzzydid y g t d, did tc cic breps(50)
* Results are stored in e()
matrix list e(b_LATE)
matrix list e(se_LATE)
matrix list e(ci_LATE)The next sections cover behavior that should match Stata and often causes confusion in practice.
Bootstrap inference is reproducible when you pass the same
seed value. If you change seed, the resamples
change too:
# These produce identical SE/CI because the same bootstrap seed is supplied
fit1 <- fuzzydid(df, y ~ d, group = "g", time = "t", did = TRUE, breps = 50, seed = 1)
fit2 <- fuzzydid(df, y ~ d, group = "g", time = "t", did = TRUE, breps = 50, seed = 1)
all.equal(fit1$late$std.error, fit2$late$std.error)
#> [1] TRUE
stata_seed_se <- 0.66310988000000
seed_cmp <- data.frame(
estimator = "W_DID",
R_seed_1_run_1 = fit1$late$std.error[fit1$late$estimator == "W_DID"],
R_seed_1_run_2 = fit2$late$std.error[fit2$late$estimator == "W_DID"],
Stata = stata_seed_se,
stringsAsFactors = FALSE
)
seed_cmp$abs_diff_run_1_vs_Stata <- abs(seed_cmp$R_seed_1_run_1 - seed_cmp$Stata)
seed_cmp$abs_diff_run_2_vs_Stata <- abs(seed_cmp$R_seed_1_run_2 - seed_cmp$Stata)
knitr::kable(seed_cmp, digits = 8)| estimator | R_seed_1_run_1 | R_seed_1_run_2 | Stata | abs_diff_run_1_vs_Stata | abs_diff_run_2_vs_Stata |
|---|---|---|---|---|---|
| W_DID | 0.7859547 | 0.7859547 | 0.6631099 | 0.1228449 | 0.1228449 |
Equivalent Stata code:
Rfuzzydid enforces the same design restrictions as
Stata:
df_restrict <- transform(df, x = rnorm(nrow(df)))
# CIC requires no covariates
try({
fuzzydid(df_restrict, y ~ d + x, group = "g", time = "t", cic = TRUE)
})
#> Error : continuous and qualitative cannot be used in combination with cic or lqte.
# LQTE requires binary G, T, and D, plus no covariates
try({
fuzzydid(df_restrict, y ~ d + x, group = "g", time = "t", lqte = TRUE)
})
#> Error : continuous and qualitative cannot be used in combination with cic or lqte.Equivalent Stata code:
When Wald-TC point identification is unavailable, both implementations can report partial-identification bounds:
fit_partial <- fuzzydid(
data = df,
formula = y ~ d,
group = "g",
time = "t",
tc = TRUE,
partial = TRUE,
breps = 50
)
knitr::kable(fit_partial$late)| estimator | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| TC_inf | 0.012206 | 0.4554927 | -0.8004909 | 0.9406186 |
| TC_sup | 3.081771 | 0.3345019 | 2.4384801 | 3.6728553 |
r_partial <- setNames(fit_partial$late$estimate, fit_partial$late$estimator)[names(stata_partial)]
partial_cmp <- data.frame(
estimator = names(stata_partial),
R = unname(r_partial),
Stata = unname(stata_partial),
abs_diff = abs(unname(r_partial) - unname(stata_partial))
)
knitr::kable(partial_cmp, digits = 8)| estimator | R | Stata | abs_diff |
|---|---|---|---|
| TC_inf | 0.01220596 | 0.01220596 | 0e+00 |
| TC_sup | 3.08177133 | 3.08177130 | 3e-08 |
Equivalent Stata code:
When multiple estimators are requested, you can also test pairwise equality:
fit_eq <- fuzzydid(
data = parity_fixture,
formula = y ~ d,
group = "g",
time = "t",
did = TRUE,
tc = TRUE,
cic = TRUE,
eqtest = TRUE,
breps = 5,
seed = 1
)
eq_cmp <- data.frame(
contrast = stata_eqtest$contrast,
R = fit_eq$eqtest$estimate,
Stata = stata_eqtest$estimate,
abs_diff = abs(fit_eq$eqtest$estimate - stata_eqtest$estimate)
)
knitr::kable(eq_cmp, digits = 8)| contrast | R | Stata | abs_diff |
|---|---|---|---|
| W_DID_W_TC | -0.01123007 | -0.01123007 | 0 |
| W_DID_W_CIC | -0.14551432 | -0.14551432 | 0 |
| W_TC_W_CIC | -0.13428425 | -0.13428425 | 0 |
Equivalent Stata code:
With covariates, modelx controls how conditional
expectations are modeled (for example OLS, logit, or probit):
# Add covariates
set.seed(123)
df$x1 <- rnorm(nrow(df))
df$x2 <- factor(ifelse(runif(nrow(df)) > 0.5, "a", "b"))
fit_modelx <- fuzzydid(
data = df,
formula = y ~ d + x1 + x2,
group = "g",
time = "t",
did = TRUE,
tc = TRUE,
modelx = c("ols", "ols"),
nose = TRUE
)
knitr::kable(fit_modelx$late)| estimator | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| W_DID | 2.032228 | NA | NA | NA |
| W_TC | 1.930624 | NA | NA | NA |
stata_modelx <- c(
W_DID = 2.03222780000000,
W_TC = 1.93062450000000
)
r_modelx <- setNames(fit_modelx$late$estimate, fit_modelx$late$estimator)[names(stata_modelx)]
modelx_cmp <- data.frame(
estimator = names(stata_modelx),
R = unname(r_modelx),
Stata = unname(stata_modelx),
abs_diff = abs(unname(r_modelx) - unname(stata_modelx))
)
knitr::kable(modelx_cmp, digits = 8)| estimator | R | Stata | abs_diff |
|---|---|---|---|
| W_DID | 2.032228 | 2.032228 | 1e-08 |
| W_TC | 1.930625 | 1.930624 | 1e-08 |
if (requireNamespace("haven", quietly = TRUE)) {
haven::write_dta(df[, c("y", "g", "t", "d", "x1", "x2")], "stata-parity-covariates-fixture.dta")
}For binary treatment, modelx takes two entries:
For multi-valued treatment, a third entry specifies the model for
P(D_{gt}=d | X).
Equivalent Stata code:
Continuous controls can also be handled nonparametrically:
fit_sieves <- fuzzydid(
data = df,
formula = y ~ d + x1,
group = "g",
time = "t",
did = TRUE,
tc = TRUE,
sieves = TRUE,
nose = TRUE
)
knitr::kable(fit_sieves$late)| estimator | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| W_DID | 2.013885 | NA | NA | NA |
| W_TC | 1.894181 | NA | NA | NA |
stata_sieves <- c(
W_DID = 2.00081280000000,
W_TC = 1.93678270000000
)
r_sieves <- setNames(fit_sieves$late$estimate, fit_sieves$late$estimator)[names(stata_sieves)]
sieves_cmp <- data.frame(
estimator = names(stata_sieves),
R = unname(r_sieves),
Stata = unname(stata_sieves),
abs_diff = abs(unname(r_sieves) - unname(stata_sieves))
)
knitr::kable(sieves_cmp, digits = 8)| estimator | R | Stata | abs_diff |
|---|---|---|---|
| W_DID | 2.013885 | 2.000813 | 0.01307262 |
| W_TC | 1.894181 | 1.936783 | 0.04260123 |
When sieves = TRUE, sieveorder = NULL
triggers deterministic 5-fold CV (the Stata-parity default). You can set
sieveorder manually if you want a fixed basis order.
Equivalent Stata code:
LQTE estimates are available from the 5th through 95th percentiles:
fit_lqte <- fuzzydid(
data = parity_fixture,
formula = y ~ d,
group = "g",
time = "t",
lqte = TRUE,
nose = TRUE,
seed = 1
)
lqte_cmp <- data.frame(
quantile = stata_lqte$quantile,
R = fit_lqte$lqte$estimate,
Stata = stata_lqte$estimate,
abs_diff = abs(fit_lqte$lqte$estimate - stata_lqte$estimate)
)
knitr::kable(head(lqte_cmp, 10), digits = 8)| quantile | R | Stata | abs_diff |
|---|---|---|---|
| 0.05 | 1.670645 | 1.672740 | 0.00209542 |
| 0.10 | 1.745951 | 1.794227 | 0.04827636 |
| 0.15 | 1.816027 | 1.816027 | 0.00000008 |
| 0.20 | 1.946114 | 1.946114 | 0.00000003 |
| 0.25 | 1.955441 | 1.955441 | 0.00000004 |
| 0.30 | 1.947278 | 1.947278 | 0.00000007 |
| 0.35 | 1.974047 | 1.974047 | 0.00000005 |
| 0.40 | 2.002776 | 2.094133 | 0.09135670 |
| 0.45 | 2.043456 | 2.131043 | 0.08758671 |
| 0.50 | 2.032202 | 2.032202 | 0.00000023 |
The native LQTE path follows the same rearranged counterfactual-CDF construction as the upstream Stata command. Small differences can occur because the original Stata implementation stores intermediate empirical CDFs as Stata floats.
LQTE requires binary G, T, and
D, with no covariates.
Equivalent Stata code:
For more than two periods, use group_forward:
set.seed(7)
n_mp <- 70
make_mp_cell <- function(gb, gf, t, prob, shift = 0) {
d <- rbinom(n_mp, 1, prob)
y <- 1 + 0.35 * t + 1.6 * d + shift + rnorm(n_mp, sd = 0.25)
data.frame(y = y, d = d, gb = gb, gf = gf, t = t)
}
df_mp <- rbind(
make_mp_cell(gb = 0, gf = 0, t = 0, prob = 0.20),
make_mp_cell(gb = 0, gf = 0, t = 1, prob = 0.30),
make_mp_cell(gb = 0, gf = 0, t = 2, prob = 0.35),
make_mp_cell(gb = 0, gf = 1, t = 0, prob = 0.30, shift = 0.2),
make_mp_cell(gb = 1, gf = 1, t = 1, prob = 0.60, shift = 0.8),
make_mp_cell(gb = 1, gf = 0, t = 2, prob = 0.75, shift = 1.1)
)
fit_mp <- fuzzydid(
data = df_mp,
formula = y ~ d,
group = "gb",
group_forward = "gf",
time = "t",
did = TRUE,
tc = TRUE,
nose = TRUE
)
knitr::kable(fit_mp$late)| estimator | estimate | std.error | conf.low | conf.high |
|---|---|---|---|---|
| W_DID | 4.769522 | NA | NA | NA |
| W_TC | 4.294036 | NA | NA | NA |
if (requireNamespace("haven", quietly = TRUE)) {
haven::write_dta(df_mp, "stata-parity-multiperiod-fixture.dta")
}Equivalent Stata code:
The object also tracks bootstrap failures (degenerate resamples):
df_diag <- parity_fixture
df_diag$cl <- rep(seq_len(24L), each = 20L)
fit_diag <- fuzzydid(
data = df_diag,
formula = y ~ d,
group = "g",
time = "t",
did = TRUE,
tc = TRUE,
cic = TRUE,
cluster = "cl",
breps = 20,
seed = 1
)
diag_cmp <- data.frame(
metric = c("N.reps", "N.misreps", "Share.failures"),
R = c(fit_diag$n_reps, fit_diag$n_misreps, fit_diag$share_failures),
Reference = c(
native_cluster[["n_reps"]],
native_cluster[["n_misreps"]],
native_cluster[["share_failures"]]
),
stringsAsFactors = FALSE
)
share_from_counts <- if (diag_cmp$R[1] > 0) diag_cmp$R[2] / diag_cmp$R[1] else NA_real_
# These diagnostics are deterministic in R when the same bootstrap seed is used.
diag_cmp$Delta <- diag_cmp$R - diag_cmp$Reference
knitr::kable(diag_cmp, digits = 8)| metric | R | Reference | Delta |
|---|---|---|---|
| N.reps | 20.0 | 20.00 | 0.00 |
| N.misreps | 4.0 | 3.00 | 1.00 |
| Share.failures | 0.2 | 0.15 | 0.05 |
if (requireNamespace("haven", quietly = TRUE)) {
haven::write_dta(df_diag, "stata-parity-cluster-fixture.dta")
}Stata can run the same clustered bootstrap command:
The Stata command does not expose N_misreps or
share_failures directly, so the table above is tracked as a
deterministic native regression check rather than a stored-scalar parity
assertion.
de Chaisemartin, C. and D’Haultfoeuille, X. (2018). Fuzzy Differences-in-Differences. Review of Economic Studies, 85(2): 999-1028. doi:10.1093/restud/rdx049.