---
title: "Stata Parity: Replicating Stata fuzzydid in R"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Stata Parity: Replicating Stata fuzzydid in R}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

```{r parity-helpers, include = FALSE}
# Prefer local package sources when rendering inside the repository so
# parity checks target the code under development.
find_local_pkg_dir <- function() {
  roots <- c(".", "..", "../..")

  for (root in roots) {
    pkg_desc <- file.path(root, "pkg", "DESCRIPTION")
    root_desc <- file.path(root, "DESCRIPTION")

    if (file.exists(pkg_desc)) {
      return(file.path(root, "pkg"))
    }

    if (file.exists(root_desc)) {
      desc <- tryCatch(read.dcf(root_desc), error = function(e) NULL)
      if (!is.null(desc) && "Package" %in% colnames(desc) && identical(desc[1, "Package"], "Rfuzzydid")) {
        return(root)
      }
    }
  }

  NULL
}

local_pkg_dir <- find_local_pkg_dir()

if (!is.null(local_pkg_dir) && requireNamespace("pkgload", quietly = TRUE)) {
  pkgload::load_all(local_pkg_dir, helpers = FALSE, export_all = FALSE, quiet = TRUE)
} else {
  library(Rfuzzydid)
}

stata_partial <- c(TC_inf = 0.01220596000000, TC_sup = 3.08177130000000)

stata_eqtest <- data.frame(
  contrast = c("W_DID_W_TC", "W_DID_W_CIC", "W_TC_W_CIC"),
  estimate = c(-0.0112300674614629, -0.145514320262384, -0.134284252800921),
  stringsAsFactors = FALSE
)

stata_lqte <- data.frame(
  quantile = seq(0.05, 0.95, by = 0.05),
  estimate = c(
    1.67274034023285, 1.79422724246979, 1.81602716445923,
    1.94611406326294, 1.95544064044952, 1.94727826118469,
    1.97404694557190, 2.09413313865662, 2.13104295730591,
    2.03220224380493, 2.10763168334961, -0.360978126525879,
    -0.325689792633057, -0.228986263275146, -0.172835350036621,
    -0.146114349365234, -0.098971366882324, -0.079638004302979,
    -0.034532070159912
  )
)

native_cluster <- c(n_reps = 20L, n_misreps = 3L, share_failures = 0.15)
```

# Introduction

This vignette is a parity check between `Rfuzzydid` and Stata's `fuzzydid`.
Each section runs an R example and, where possible, compares against frozen
Stata outputs used in the test suite.

The Stata snippets assume the Stata `fuzzydid` command is already installed
locally; `Rfuzzydid` no longer bundles the `.ado` sources.

The shared estimators come from de Chaisemartin and D'Haultfoeuille (2018):
local average treatment effects (LATE) and local quantile treatment effects
under fuzzy DID assumptions.


# Simulated Data Example

We start with a simple two-period, two-group design:

- A control group (G = 0) observed in periods T = 0 and T = 1
- A treatment group (G = 1) observed in periods T = 0 and T = 1
- Treatment uptake D that increases more in the treatment group between periods
- Outcome Y that responds to treatment

```{r sim-data}
set.seed(50321)

n_cell <- 120  # observations per cell

# Helper function to create each (G, T) cell
make_cell <- function(g, t, prob_d) {
  d <- rbinom(n_cell, size = 1, prob = prob_d)
  # Outcome: base + group effect + time effect + treatment effect + noise
  y <- 1 + 0.5 * g + 0.4 * t + 2.0 * d + sin(seq_len(n_cell) / 7)
  data.frame(y = y, g = g, t = t, d = d)
}

# Build the four cells
df <- rbind(
  make_cell(g = 0, t = 0, prob_d = 0.20),  # Control group, baseline
  make_cell(g = 0, t = 1, prob_d = 0.40),  # Control group, follow-up  
  make_cell(g = 1, t = 0, prob_d = 0.30),  # Treatment group, baseline
  make_cell(g = 1, t = 1, prob_d = 0.80)   # Treatment group, follow-up
)

# Verify cell sizes and treatment rates
aggregate(cbind(n = d) ~ g + t, data = df, FUN = function(x) c(n = length(x), mean = mean(x)))

# Save fixture used by most examples for direct Stata comparison
if (requireNamespace("haven", quietly = TRUE)) {
  haven::write_dta(df, "stata-parity-sim-fixture.dta")
}
```

The treatment rate jumps from 30% to 80% in the treated group and from 20% to
40% in controls. That gap in treatment uptake changes is the first stage that
identifies fuzzy DID effects.

# Basic Estimation

## Wald-DID, Wald-TC, and Wald-CIC

We request the three main LATE estimators:

```{r basic-est}
fit <- fuzzydid(
  data = df,
  formula = y ~ d,
  group = "g",
  time = "t",
  did = TRUE,
  tc = TRUE,
  cic = TRUE,
  breps = 50
)

summary(fit)

stata_basic <- c(
  W_DID = 2.00000000000000,
  W_TC = 1.97320415000280,
  W_CIC = 2.22209836585117
)
r_basic <- setNames(fit$late$estimate, fit$late$estimator)[names(stata_basic)]
basic_cmp <- data.frame(
  estimator = names(stata_basic),
  R = unname(r_basic),
  Stata = unname(stata_basic),
  abs_diff = abs(unname(r_basic) - unname(stata_basic))
)
knitr::kable(basic_cmp, digits = 8)
```

The table reports point estimates, bootstrap standard errors, and percentile
confidence intervals.

Equivalent Stata code:

```stata
fuzzydid y g t d, did tc cic breps(50)
matrix list e(b_LATE)
matrix list e(se_LATE)
matrix list e(ci_LATE)
```

## Extracting Results Programmatically

`modelsummary` gives a compact table that is easy to pass into reports:

```{r tidy-output, eval = requireNamespace("modelsummary", quietly = TRUE)}
library(modelsummary)

modelsummary::modelsummary(fit)
```

Equivalent Stata code:

```stata
* Tidy-like access in Stata comes from stored matrices in e()
matrix list e(b_LATE)
matrix list e(se_LATE)
matrix list e(ci_LATE)
```

## Core Stata Parity Check

This chunk reproduces the exact fixture used by the parity tests and checks
that R and Stata point estimates agree to tolerance:

```{r parity-check}
n_cell <- 120L

make_parity_cell <- function(g, t, ones) {
  data.frame(
    g = rep.int(g, n_cell),
    t = rep.int(t, n_cell),
    d = c(rep.int(1L, ones), rep.int(0L, n_cell - ones))
  )
}

out <- rbind(
  make_parity_cell(0L, 0L, 24L),
  make_parity_cell(0L, 1L, 48L),
  make_parity_cell(1L, 0L, 36L),
  make_parity_cell(1L, 1L, 96L)
)
out$id <- seq_len(nrow(out))
out$y <- 1 + 0.5 * out$g + 0.4 * out$t + 2 * out$d + sin(out$id / 7)
parity_fixture <- out[, c("y", "g", "t", "d")]

fit_parity <- fuzzydid(
  data = parity_fixture,
  formula = y ~ d,
  group = "g",
  time = "t",
  did = TRUE,
  tc = TRUE,
  cic = TRUE,
  breps = 5,
  seed = 1
)

stata_core <- c(
  W_DID = 2.17430877504668,
  W_TC = 2.18553887285118,
  W_CIC = 2.31982319056988
)

r_core <- setNames(fit_parity$late$estimate, fit_parity$late$estimator)[c("W_DID", "W_TC", "W_CIC")]

core_cmp <- data.frame(
  estimator = names(stata_core),
  R = unname(r_core),
  Stata = unname(stata_core),
  abs_diff = abs(unname(r_core) - unname(stata_core))
)

knitr::kable(core_cmp, digits = 8)

# Save exact parity fixture for Stata users
if (requireNamespace("haven", quietly = TRUE)) {
  haven::write_dta(parity_fixture, "stata-parity-core-fixture.dta")
}
```

Point estimates are strict parity targets here. Standard errors and intervals
depend on bootstrap simulation, so tiny platform-level differences are normal.

Equivalent Stata code:

```stata
* Load the same dataset exported in the parity check chunk
use "stata-parity-core-fixture.dta", clear

* Generate the group variable (already in dataset as 'g')
* The Stata command expects G to identify treatment groups

* Run fuzzydid
fuzzydid y g t d, did tc cic breps(50)

* Results are stored in e()
matrix list e(b_LATE)
matrix list e(se_LATE)
matrix list e(ci_LATE)
```

## Key Parity Points

The next sections cover behavior that should match Stata and often causes
confusion in practice.

### 1. User-Controlled Bootstrap Seed

Bootstrap inference is reproducible when you pass the same `seed` value. If
you change `seed`, the resamples change too:

```{r seed-parity}
# These produce identical SE/CI because the same bootstrap seed is supplied
fit1 <- fuzzydid(df, y ~ d, group = "g", time = "t", did = TRUE, breps = 50, seed = 1)
fit2 <- fuzzydid(df, y ~ d, group = "g", time = "t", did = TRUE, breps = 50, seed = 1)

all.equal(fit1$late$std.error, fit2$late$std.error)

stata_seed_se <- 0.66310988000000
seed_cmp <- data.frame(
  estimator = "W_DID",
  R_seed_1_run_1 = fit1$late$std.error[fit1$late$estimator == "W_DID"],
  R_seed_1_run_2 = fit2$late$std.error[fit2$late$estimator == "W_DID"],
  Stata = stata_seed_se,
  stringsAsFactors = FALSE
)
seed_cmp$abs_diff_run_1_vs_Stata <- abs(seed_cmp$R_seed_1_run_1 - seed_cmp$Stata)
seed_cmp$abs_diff_run_2_vs_Stata <- abs(seed_cmp$R_seed_1_run_2 - seed_cmp$Stata)
knitr::kable(seed_cmp, digits = 8)
```

Equivalent Stata code:

```stata
use "stata-parity-sim-fixture.dta", clear
set seed 1
fuzzydid y g t d, did breps(50)
matrix se1 = e(se_LATE)

set seed 1
fuzzydid y g t d, did breps(50)
matrix se2 = e(se_LATE)

matrix list se1
matrix list se2
```

### 2. Design Restrictions

`Rfuzzydid` enforces the same design restrictions as Stata:

```{r restrictions, error = TRUE}
df_restrict <- transform(df, x = rnorm(nrow(df)))

# CIC requires no covariates
try({
  fuzzydid(df_restrict, y ~ d + x, group = "g", time = "t", cic = TRUE)
})

# LQTE requires binary G, T, and D, plus no covariates
try({
  fuzzydid(df_restrict, y ~ d + x, group = "g", time = "t", lqte = TRUE)
})
```

Equivalent Stata code:

```stata
use "stata-parity-sim-fixture.dta", clear
gen x = rnormal()

* CIC with covariates should error
capture noisily fuzzydid y g t d x, cic

* LQTE with covariates should error
capture noisily fuzzydid y g t d x, lqte
```

### 3. Partial Identification Bounds

When Wald-TC point identification is unavailable, both implementations can
report partial-identification bounds:

```{r partial}
fit_partial <- fuzzydid(
  data = df,
  formula = y ~ d,
  group = "g",
  time = "t",
  tc = TRUE,
  partial = TRUE,
  breps = 50
)

knitr::kable(fit_partial$late)

r_partial <- setNames(fit_partial$late$estimate, fit_partial$late$estimator)[names(stata_partial)]
partial_cmp <- data.frame(
  estimator = names(stata_partial),
  R = unname(r_partial),
  Stata = unname(stata_partial),
  abs_diff = abs(unname(r_partial) - unname(stata_partial))
)
knitr::kable(partial_cmp, digits = 8)
```

Equivalent Stata code:

```stata
use "stata-parity-sim-fixture.dta", clear
fuzzydid y g t d, tc partial breps(50)
matrix list e(b_LATE)
```

### 4. Equality Tests

When multiple estimators are requested, you can also test pairwise equality:

```{r eqtest}
fit_eq <- fuzzydid(
  data = parity_fixture,
  formula = y ~ d,
  group = "g",
  time = "t",
  did = TRUE,
  tc = TRUE,
  cic = TRUE,
  eqtest = TRUE,
  breps = 5,
  seed = 1
)

eq_cmp <- data.frame(
  contrast = stata_eqtest$contrast,
  R = fit_eq$eqtest$estimate,
  Stata = stata_eqtest$estimate,
  abs_diff = abs(fit_eq$eqtest$estimate - stata_eqtest$estimate)
)
knitr::kable(eq_cmp, digits = 8)
```

Equivalent Stata code:

```stata
use "stata-parity-core-fixture.dta", clear
fuzzydid y g t d, did tc cic eqtest breps(5)
matrix list e(b_LATE_eqtest)
```

# Covariate Adjustment

## Parametric Adjustment with modelx

With covariates, `modelx` controls how conditional expectations are modeled
(for example OLS, logit, or probit):

```{r covariates}
# Add covariates
set.seed(123)
df$x1 <- rnorm(nrow(df))
df$x2 <- factor(ifelse(runif(nrow(df)) > 0.5, "a", "b"))

fit_modelx <- fuzzydid(
  data = df,
  formula = y ~ d + x1 + x2,
  group = "g",
  time = "t",
  did = TRUE,
  tc = TRUE,
  modelx = c("ols", "ols"),
  nose = TRUE
)

knitr::kable(fit_modelx$late)

stata_modelx <- c(
  W_DID = 2.03222780000000,
  W_TC = 1.93062450000000
)
r_modelx <- setNames(fit_modelx$late$estimate, fit_modelx$late$estimator)[names(stata_modelx)]
modelx_cmp <- data.frame(
  estimator = names(stata_modelx),
  R = unname(r_modelx),
  Stata = unname(stata_modelx),
  abs_diff = abs(unname(r_modelx) - unname(stata_modelx))
)
knitr::kable(modelx_cmp, digits = 8)

if (requireNamespace("haven", quietly = TRUE)) {
  haven::write_dta(df[, c("y", "g", "t", "d", "x1", "x2")], "stata-parity-covariates-fixture.dta")
}
```

For binary treatment, `modelx` takes two entries:

1. Method for E(Y_{gt}|X) and E(Y_{dgt}|X)
2. Method for E(D_{gt}|X)

For multi-valued treatment, a third entry specifies the model for
`P(D_{gt}=d | X)`.

Equivalent Stata code:

```stata
use "stata-parity-covariates-fixture.dta", clear
fuzzydid y g t d, did tc continuous(x1) qualitative(x2) modelx(ols ols)
matrix list e(b_LATE)
```

## Nonparametric Adjustment with Sieves

Continuous controls can also be handled nonparametrically:

```{r sieves}
fit_sieves <- fuzzydid(
  data = df,
  formula = y ~ d + x1,
  group = "g",
  time = "t",
  did = TRUE,
  tc = TRUE,
  sieves = TRUE,
  nose = TRUE
)

knitr::kable(fit_sieves$late)

stata_sieves <- c(
  W_DID = 2.00081280000000,
  W_TC = 1.93678270000000
)
r_sieves <- setNames(fit_sieves$late$estimate, fit_sieves$late$estimator)[names(stata_sieves)]
sieves_cmp <- data.frame(
  estimator = names(stata_sieves),
  R = unname(r_sieves),
  Stata = unname(stata_sieves),
  abs_diff = abs(unname(r_sieves) - unname(stata_sieves))
)
knitr::kable(sieves_cmp, digits = 8)
```

When `sieves = TRUE`, `sieveorder = NULL` triggers deterministic 5-fold CV
(the Stata-parity default). You can set `sieveorder` manually if you want a
fixed basis order.

Equivalent Stata code:

```stata
use "stata-parity-covariates-fixture.dta", clear
fuzzydid y g t d, did tc continuous(x1) sieves
matrix list e(b_LATE)
```

# LQTE Estimation

LQTE estimates are available from the 5th through 95th percentiles:

```{r lqte}
fit_lqte <- fuzzydid(
  data = parity_fixture,
  formula = y ~ d,
  group = "g",
  time = "t",
  lqte = TRUE,
  nose = TRUE,
  seed = 1
)

lqte_cmp <- data.frame(
  quantile = stata_lqte$quantile,
  R = fit_lqte$lqte$estimate,
  Stata = stata_lqte$estimate,
  abs_diff = abs(fit_lqte$lqte$estimate - stata_lqte$estimate)
)
knitr::kable(head(lqte_cmp, 10), digits = 8)
```

The native LQTE path follows the same rearranged counterfactual-CDF construction
as the upstream Stata command. Small differences can occur because the original
Stata implementation stores intermediate empirical CDFs as Stata floats.

LQTE requires binary `G`, `T`, and `D`, with no covariates.

Equivalent Stata code:

```stata
use "stata-parity-core-fixture.dta", clear
fuzzydid y g t d, lqte
matrix list e(b_LQTE)
```

# Multi-Period Designs

For more than two periods, use `group_forward`:

```{r multiperiod}
set.seed(7)
n_mp <- 70

make_mp_cell <- function(gb, gf, t, prob, shift = 0) {
  d <- rbinom(n_mp, 1, prob)
  y <- 1 + 0.35 * t + 1.6 * d + shift + rnorm(n_mp, sd = 0.25)
  data.frame(y = y, d = d, gb = gb, gf = gf, t = t)
}

df_mp <- rbind(
  make_mp_cell(gb = 0, gf = 0, t = 0, prob = 0.20),
  make_mp_cell(gb = 0, gf = 0, t = 1, prob = 0.30),
  make_mp_cell(gb = 0, gf = 0, t = 2, prob = 0.35),
  make_mp_cell(gb = 0, gf = 1, t = 0, prob = 0.30, shift = 0.2),
  make_mp_cell(gb = 1, gf = 1, t = 1, prob = 0.60, shift = 0.8),
  make_mp_cell(gb = 1, gf = 0, t = 2, prob = 0.75, shift = 1.1)
)

fit_mp <- fuzzydid(
  data = df_mp,
  formula = y ~ d,
  group = "gb",
  group_forward = "gf",
  time = "t",
  did = TRUE,
  tc = TRUE,
  nose = TRUE
)

knitr::kable(fit_mp$late)

if (requireNamespace("haven", quietly = TRUE)) {
  haven::write_dta(df_mp, "stata-parity-multiperiod-fixture.dta")
}
```

Equivalent Stata code:

```stata
use "stata-parity-multiperiod-fixture.dta", clear
fuzzydid y gb t d, groupf(gf) did tc
matrix list e(b_LATE)
```

# Bootstrap Diagnostics

The object also tracks bootstrap failures (degenerate resamples):

```{r bootstrap-diag}
df_diag <- parity_fixture
df_diag$cl <- rep(seq_len(24L), each = 20L)

fit_diag <- fuzzydid(
  data = df_diag,
  formula = y ~ d,
  group = "g",
  time = "t",
  did = TRUE,
  tc = TRUE,
  cic = TRUE,
  cluster = "cl",
  breps = 20,
  seed = 1
)

diag_cmp <- data.frame(
  metric = c("N.reps", "N.misreps", "Share.failures"),
  R = c(fit_diag$n_reps, fit_diag$n_misreps, fit_diag$share_failures),
  Reference = c(
    native_cluster[["n_reps"]],
    native_cluster[["n_misreps"]],
    native_cluster[["share_failures"]]
  ),
  stringsAsFactors = FALSE
)

share_from_counts <- if (diag_cmp$R[1] > 0) diag_cmp$R[2] / diag_cmp$R[1] else NA_real_

# These diagnostics are deterministic in R when the same bootstrap seed is used.
diag_cmp$Delta <- diag_cmp$R - diag_cmp$Reference
knitr::kable(diag_cmp, digits = 8)

if (requireNamespace("haven", quietly = TRUE)) {
  haven::write_dta(df_diag, "stata-parity-cluster-fixture.dta")
}
```

Stata can run the same clustered bootstrap command:

```stata
use "stata-parity-cluster-fixture.dta", clear
fuzzydid y g t d, did tc cic cluster(cl) breps(20)
```

The Stata command does not expose `N_misreps` or `share_failures` directly, so
the table above is tracked as a deterministic native regression check rather
than a stored-scalar parity assertion.

# References

de Chaisemartin, C. and D'Haultfoeuille, X. (2018). *Fuzzy Differences-in-Differences*. *Review of Economic Studies*, 85(2): 999-1028. doi:10.1093/restud/rdx049.