--- title: "Reproducing the Examples from Amorim & Ospina (2021)" author: "Leila D. Amorim & Raydonal Ospina" date: "`r Sys.Date()`" output: rmarkdown::html_vignette: toc: true toc_depth: 3 vignette: > %\VignetteIndexEntry{Reproducing the Examples from Amorim & Ospina (2021)} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include=FALSE} knitr::opts_chunk$set(collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 4) library(prLogistic) ``` This vignette reproduces the numerical examples from: > Amorim, L. D. & Ospina, R. (2021). Prevalence ratio estimation using R. > *Anais da Academia Brasileira de Ciências*, **93**(4), e20190316. > doi: [10.1590/0001-3765202120190316](https://doi.org/10.1590/0001-3765202120190316) Each section corresponds to one of the datasets used in the paper. --- ## Example 1 — Low Birth Weight (LBW): clustered binary data ### Data 244 mothers followed during two pregnancies in Salvador, Brazil. Outcome: low birth weight (< 2500 g). Clustering: two births per mother. ```{r lbw-data} data(LBW) cat("n obs =", nrow(LBW), "| mothers =", length(unique(LBW$ID)), "\n") cat("Prevalence of low birth weight:", round(mean(LBW$low == "Low"), 3), "\n\n") table(LBW$low, LBW$smoke) ``` ### Independent GLM (ignoring clustering) ```{r lbw-glm} LBW$low_bin <- as.integer(LBW$low == "Low") LBW$smoke_bin <- as.integer(LBW$smoke == "Yes") LBW$race_bin <- as.integer(LBW$race == "Non-white") fit_lbw_glm <- glm(low_bin ~ smoke_bin + race_bin + age, family = binomial, data = LBW) cat("--- Conditional PR (GLM) ---\n") prLogisticDelta(fit_lbw_glm, standardisation = "conditional") ``` ```{r lbw-glm-marg} cat("--- Marginal PR (GLM) ---\n") prLogisticDelta(fit_lbw_glm, standardisation = "marginal") ``` ### GEE — accounting for within-mother correlation ```{r lbw-gee, eval=requireNamespace("geepack", quietly=TRUE)} library(geepack) fit_lbw_gee <- geeglm(low_bin ~ smoke_bin + race_bin + age, family = binomial, id = ID, corstr = "exchangeable", data = LBW) cat("--- Marginal PR (GEE, exchangeable) ---\n") prLogisticGEE(fit_lbw_gee) ``` ### Mixed model (random intercept per mother) ```{r lbw-glmer, eval=requireNamespace("lme4", quietly=TRUE)} library(lme4) fit_lbw_ml <- glmer(low_bin ~ smoke_bin + race_bin + age + (1 | ID), family = binomial, data = LBW) cat("--- Marginal PR (glmer) ---\n") prLogisticDelta(fit_lbw_ml, standardisation = "marginal") ``` --- ## Example 2 — Thailand Education Study: multilevel data ### Data 8582 students in 411 schools. Outcome: grade repetition (`rgi`). ```{r thai-data} data(Thailand) cat("n =", nrow(Thailand), "| schools =", length(unique(Thailand$schoolid)), "\n") cat("Prevalence of grade repetition:", round(mean(Thailand$rgi == "Yes"), 3), "\n\n") table(Thailand$rgi, Thailand$sex) ``` ### Independent GLM ```{r thai-glm} Thailand$rgi_bin <- as.integer(Thailand$rgi == "Yes") Thailand$sex_bin <- as.integer(Thailand$sex == "Boy") Thailand$pped_bin <- as.integer(Thailand$pped == "Yes") fit_thai_glm <- glm(rgi_bin ~ sex_bin + pped_bin, family = binomial, data = Thailand) cat("--- Conditional PR (GLM) ---\n") prLogisticDelta(fit_thai_glm, standardisation = "conditional") ``` ### Mixed model ```{r thai-glmer, eval=requireNamespace("lme4", quietly=TRUE)} fit_thai_ml <- glmer(rgi_bin ~ sex_bin + pped_bin + (1 | schoolid), family = binomial, data = Thailand) cat("--- Marginal PR (glmer) ---\n") prLogisticDelta(fit_thai_ml, standardisation = "marginal") ``` --- ## Example 3 — Toenail Infection Trial: longitudinal data ### Data 294 patients measured at up to 7 visits. Outcome: moderate/severe nail separation. ```{r toenail-data} data(Toenail) cat("n obs =", nrow(Toenail), "| patients =", length(unique(Toenail$ID)), "\n") Toenail$resp_bin <- as.integer(Toenail$Response == "Moderate/severe") Toenail$trt_bin <- as.integer(Toenail$Treatment == "Terbinafine") cat("Overall prevalence:", round(mean(Toenail$resp_bin), 3), "\n") ``` ### GEE ```{r toenail-gee, eval=requireNamespace("geepack", quietly=TRUE)} fit_toe_gee <- geeglm(resp_bin ~ trt_bin + Month, family = binomial, id = ID, corstr = "exchangeable", data = Toenail) cat("--- Marginal PR (GEE) ---\n") prLogisticGEE(fit_toe_gee) ``` --- ## Example 4 — UIS Drug Treatment Study ### Data 575 patients in a drug rehabilitation study. Outcome: drug-free at 6 months. ```{r uis-data} data(UIS) cat("n =", nrow(UIS), "\n") cat("Prevalence drug-free:", round(mean(UIS$drugFree == "Yes"), 3), "\n\n") table(UIS$drugFree, UIS$trt) ``` ### GLM — independent observations ```{r uis-glm} UIS$drugFree_bin <- as.integer(UIS$drugFree == "Yes") fit_uis <- glm(drugFree_bin ~ trt + Age + DrugUse + race + site, family = binomial, data = UIS) cat("--- Conditional PR ---\n") res_uis_cond <- prLogisticDelta(fit_uis, standardisation = "conditional") print(res_uis_cond) cat("\n--- Marginal PR ---\n") res_uis_marg <- prLogisticDelta(fit_uis, standardisation = "marginal") print(res_uis_marg) ``` ### OR vs PR comparison ```{r uis-or-pr} OR <- exp(coef(fit_uis)[-1]) PR_cond <- coef(res_uis_cond) PR_marg <- coef(res_uis_marg) comp <- data.frame( OR = round(OR, 3), PR_cond = round(PR_cond, 3), PR_marg = round(PR_marg, 3) ) print(comp) ``` ### Bootstrap CIs ```{r uis-boot, cache=TRUE} set.seed(2024) res_boot <- prLogisticBootCond(fit_uis, data = UIS, R = 499) print(res_boot) ``` --- ## Example 5 — Downer Cow Survival ### Data 216 downer cattle. Outcome: survival to discharge. ```{r downer-data} data(downer) cat("n =", nrow(downer), "\n") cat("Survival prevalence:", round(mean(downer$Survival == "Survived"), 3), "\n\n") table(downer$Survival, downer$Myopathy) ``` ### GLM ```{r downer-glm} downer$surv_bin <- as.integer(downer$Survival == "Survived") fit_downer <- glm(surv_bin ~ Myopathy + AST + CK + Calving, family = binomial, data = downer) cat("--- Conditional PR ---\n") prLogisticDelta(fit_downer, standardisation = "conditional") ``` --- ## Example 6 — Titanic Survival ### Data 1307 passengers. Outcome: survival. Overall survival rate ≈ 38%. ```{r titanic-data} data(titanic) cat("n =", nrow(titanic), "\n") cat("Survival rate:", round(mean(titanic$survived == "Yes"), 3), "\n\n") table(titanic$survived, titanic$sex) ``` ### GLM — OR vs PR ```{r titanic-glm} titanic$surv_bin <- as.integer(titanic$survived == "Yes") fit_titanic <- glm(surv_bin ~ sex + pclass, family = binomial, data = titanic) # Odds Ratios (what logistic gives directly) cat("--- Odds Ratios ---\n") print(round(exp(cbind(OR = coef(fit_titanic), confint.default(fit_titanic))), 3)) cat("\n--- Conditional Prevalence Ratios ---\n") res_tit <- prLogisticDelta(fit_titanic, standardisation = "conditional") print(res_tit) cat("\n--- Marginal Prevalence Ratios ---\n") prLogisticDelta(fit_titanic, standardisation = "marginal") ``` ### Forest plot ```{r titanic-plot, fig.cap="Forest plot: conditional PR for Titanic survival"} plot(res_tit, main = "Titanic: Conditional Prevalence Ratios (95% CI)") ``` ### Key comparison for Titanic ```{r titanic-comparison} OR_sex <- exp(coef(fit_titanic)["sexMale"]) PR_sex <- coef(res_tit)["sexMale"] cat(sprintf( "Being male:\n OR = %.2f (%.0f%% overestimate over PR)\n PR = %.2f\n", OR_sex, (OR_sex / PR_sex - 1) * 100, PR_sex )) ``` --- ## Summary The table below shows that OR consistently overstates PR when prevalence is above ~10%: ```{r summary-table} results <- data.frame( Dataset = c("LBW (GLM)", "Thailand (GLM)", "UIS", "downer", "Titanic"), Prevalence = c(0.18, 0.16, 0.43, 0.50, 0.38), Predictor = c("smoke", "sex (Boy)", "trt (Long)", "Myopathy (Yes)", "sex (Male)"), OR = c( round(exp(coef(fit_lbw_glm)["smoke_bin"]), 2), round(exp(coef(fit_thai_glm)["sex_bin"]), 2), round(exp(coef(fit_uis)["trtLong"]), 2), round(exp(coef(fit_downer)["MyopathyYes"]), 2), round(exp(coef(fit_titanic)["sexMale"]), 2) ), PR_cond = c( round(coef(prLogisticDelta(fit_lbw_glm))["smoke_bin"], 2), round(coef(prLogisticDelta(fit_thai_glm))["sex_bin"], 2), round(coef(res_uis_cond)["trtLong"], 2), round(coef(prLogisticDelta(fit_downer))["MyopathyYes"], 2), round(coef(res_tit)["sexMale"], 2) ) ) results$OR_over_PR <- round(results$OR / results$PR_cond, 2) print(results) ``` As prevalence increases, the ratio OR/PR grows — confirming that OR is a poor proxy for PR in common-outcome studies. --- ## Session information ```{r session} sessionInfo() ```