In this vignette we will explain how some functions of the package
are used in order to estimate a contingency table. We will work on the
eusilc
dataset of the laeken
package. All the
functions presented in the following are explained in the proposed
manuscript by Raphaël Jauslin and Yves Tillé (2021) doi:10.1016/j.jspi.2022.12.003.
We will estimate the contingency table when the factor variable which
represents the economic status pl030
is crossed with a
discretized version of the equivalized household income
eqIncome
. In order to discretize the equivalized income, we
calculate percentiles (0.15,0.30,0.45,0.60,0.75,0.90) of the variable
and define the category as intervals between the values.
library(laeken)
library(sampling)
library(StratifiedSampling)
#> Loading required package: Matrix
data("eusilc")
eusilc <- na.omit(eusilc)
N <- nrow(eusilc)
# Xm are the matching variables and id are identity of the units
Xm <- eusilc[,c("hsize","db040","age","rb090","pb220a")]
Xmcat <- do.call(cbind,apply(Xm[,c(2,4,5)],MARGIN = 2,FUN = disjunctive))
Xm <- cbind(Xmcat,Xm[,-c(2,4,5)])
id <- eusilc$rb030
# categorial income splitted by the percentile
c_income <- eusilc$eqIncome
q <- quantile(eusilc$eqIncome, probs = seq(0, 1, 0.15))
c_income[which(eusilc$eqIncome <= q[2])] <- "(0,15]"
c_income[which(q[2] < eusilc$eqIncome & eusilc$eqIncome <= q[3])] <- "(15,30]"
c_income[which(q[3] < eusilc$eqIncome & eusilc$eqIncome <= q[4])] <- "(30,45]"
c_income[which(q[4] < eusilc$eqIncome & eusilc$eqIncome <= q[5])] <- "(45,60]"
c_income[which(q[5] < eusilc$eqIncome & eusilc$eqIncome <= q[6])] <- "(60,75]"
c_income[which(q[6] < eusilc$eqIncome & eusilc$eqIncome <= q[7])] <- "(75,90]"
c_income[which( eusilc$eqIncome > q[7] )] <- "(90,100]"
# variable of interests
Y <- data.frame(ecostat = eusilc$pl030)
Z <- data.frame(c_income = c_income)
# put same rownames
rownames(Xm) <- rownames(Y) <- rownames(Z)<- id
YZ <- table(cbind(Y,Z))
addmargins(YZ)
#> c_income
#> ecostat (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 409 616 722 807 935 1025 648 5162
#> 2 189 181 205 184 165 154 82 1160
#> 3 137 90 72 75 59 52 33 518
#> 4 210 159 103 95 74 49 46 736
#> 5 470 462 492 477 459 435 351 3146
#> 6 57 25 28 30 17 11 10 178
#> 7 344 283 194 149 106 91 40 1207
#> Sum 1816 1816 1816 1817 1815 1817 1210 12107
Here we set up the sampling designs and define all the quantities we will need for the rest of the vignette. The sample are selected with simple random sampling without replacement and the weights are equal to the inverse of the inclusion probabilities.
# size of sample
n1 <- 1000
n2 <- 500
# samples
s1 <- srswor(n1,N)
s2 <- srswor(n2,N)
# extract matching units
X1 <- Xm[s1 == 1,]
X2 <- Xm[s2 == 1,]
# extract variable of interest
Y1 <- data.frame(Y[s1 == 1,])
colnames(Y1) <- colnames(Y)
Z2 <- as.data.frame(Z[s2 == 1,])
colnames(Z2) <- colnames(Z)
# extract correct identities
id1 <- id[s1 == 1]
id2 <- id[s2 == 1]
# put correct rownames
rownames(Y1) <- id1
rownames(Z2) <- id2
# here weights are inverse of inclusion probabilities
d1 <- rep(N/n1,n1)
d2 <- rep(N/n2,n2)
# disjunctive form
Y_dis <- sampling::disjunctive(as.matrix(Y))
Z_dis <- sampling::disjunctive(as.matrix(Z))
Y1_dis <- Y_dis[s1 ==1,]
Z2_dis <- Z_dis[s2 ==1,]
Then the harmonization step must be performed. The
harmonize
function returns the harmonized weights. If by
chance the true population totals are known, it is possible to use these
instead of the estimate made within the function.
re <- harmonize(X1,d1,id1,X2,d2,id2)
# if we want to use the population totals to harmonize we can use
re <- harmonize(X1,d1,id1,X2,d2,id2,totals = c(N,colSums(Xm)))
w1 <- re$w1
w2 <- re$w2
colSums(Xm)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
colSums(w1*X1)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
colSums(w2*X2)
#> 1 2 3 4 5 6 7 8 9 10 11
#> 476 887 2340 763 1880 1021 2244 1938 558 6263 5844
#> 12 13 14 hsize age
#> 11073 283 751 36380 559915
The statistical matching is done by using the otmatch
function. The estimation of the contingency table is calculated by
extracting the id1
units (respectively id2
units) and by using the function tapply
with the correct
weights.
# Optimal transport matching
object <- otmatch(X1,id1,X2,id2,w1,w2)
head(object[,1:3])
#> id1 id2 weight
#> 401 401 254002 4.6897016
#> 401.1 401 338203 1.2225508
#> 401.2 401 520204 8.3685001
#> 404 404 128203 7.4585822
#> 404.1 404 129304 1.4732214
#> 404.2 404 345204 0.3303463
Y1_ot <- cbind(X1[as.character(object$id1),],y = Y1[as.character(object$id1),])
Z2_ot <- cbind(X2[as.character(object$id2),],z = Z2[as.character(object$id2),])
YZ_ot <- tapply(object$weight,list(Y1_ot$y,Z2_ot$z),sum)
# transform NA into 0
YZ_ot[is.na(YZ_ot)] <- 0
# result
round(addmargins(YZ_ot),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 664.752 739.638 1016.183 763.107 721.479 971.998 471.848 5349.005
#> 2 173.976 155.584 110.832 153.651 141.377 204.326 101.979 1041.726
#> 3 105.741 89.723 60.746 97.024 63.562 92.840 24.214 533.849
#> 4 158.217 110.751 199.985 112.196 95.874 43.539 16.814 737.375
#> 5 501.775 354.176 597.302 495.710 377.382 265.385 371.108 2962.837
#> 6 0.000 20.910 30.913 26.154 25.471 39.062 12.287 154.797
#> 7 200.786 186.903 170.232 221.771 156.036 293.921 97.763 1327.412
#> Sum 1805.247 1657.686 2186.193 1869.612 1581.180 1911.071 1096.012 12107.000
As you can see from the previous section, the optimal transport
results generally do not have a one-to-one match, meaning that for every
unit in sample 1, we have more than one unit with weights not equal to 0
in sample 2. The bsmatch
function creates a one-to-one
match by selecting a balanced stratified sampling to obtain a data.frame
where each unit in sample 1 has only one imputed unit from sample 2.
# Balanced Sampling
BS <- bsmatch(object)
head(BS$object[,1:3])
#> id1 id2 weight
#> 401.2 401 520204 8.368500
#> 404 404 128203 7.458582
#> 1604.1 1604 346803 5.162819
#> 1 2404 2404 12.494322
#> 2502.1 2502 254301 6.673364
#> 2505 2505 517303 12.065209
Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 702.807 757.839 988.954 742.316 747.296 901.918 507.874 5349.005
#> 2 146.707 149.430 128.279 143.933 163.601 210.191 99.585 1041.726
#> 3 107.077 86.191 62.112 96.616 72.315 85.654 23.885 533.849
#> 4 160.604 122.115 184.879 123.801 85.967 46.142 13.868 737.375
#> 5 492.239 364.711 598.014 534.895 369.306 239.095 364.576 2962.837
#> 6 0.000 36.935 34.838 12.382 23.839 37.354 9.450 154.797
#> 7 178.657 209.746 173.591 212.002 142.691 316.860 93.865 1327.412
#> Sum 1788.091 1726.966 2170.666 1865.944 1605.016 1837.214 1113.103 12107.000
# With Z2 as auxiliary information for stratified balanced sampling.
BS <- bsmatch(object,Z2)
Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum
#> 1 657.740 731.212 1019.177 733.526 730.351 978.206 498.793 5349.005
#> 2 197.392 173.803 106.704 154.185 127.148 195.200 87.293 1041.726
#> 3 117.703 87.281 62.356 91.397 60.404 89.065 25.644 533.849
#> 4 133.951 108.105 202.113 121.487 98.109 59.742 13.868 737.375
#> 5 488.592 345.554 629.447 505.639 363.751 257.436 372.419 2962.837
#> 6 0.000 24.473 34.838 26.333 23.839 35.864 9.450 154.797
#> 7 215.996 182.906 136.943 236.239 160.274 305.875 89.178 1327.412
#> Sum 1811.373 1653.334 2191.577 1868.806 1563.875 1921.388 1096.646 12107.000
# split the weight by id1
q_l <- split(object$weight,f = object$id1)
# normalize in each id1
q_l <- lapply(q_l, function(x){x/sum(x)})
q <- as.numeric(do.call(c,q_l))
Z_pred <- t(q*disjunctive(object$id1))%*%disjunctive(Z2[as.character(object$id2),])
colnames(Z_pred) <- levels(factor(Z2$c_income))
head(Z_pred)
#> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100]
#> [1,] 0 0.0000000 0.3283932 0.58599854 0.0000000 0.08560829 0
#> [2,] 0 0.5351022 0.3355042 0.02370009 0.1056935 0.00000000 0
#> [3,] 0 0.5665677 0.0000000 0.00000000 0.4334323 0.00000000 0
#> [4,] 0 0.0000000 0.0000000 0.00000000 0.0000000 1.00000000 0
#> [5,] 0 0.5382098 0.0000000 0.46179017 0.0000000 0.00000000 0
#> [6,] 0 1.0000000 0.0000000 0.00000000 0.0000000 0.00000000 0