Statistical Matching using Optimal Transport

Introduction

In this vignette we will explain how some functions of the package are used in order to estimate a contingency table. We will work on the eusilc dataset of the laeken package. All the functions presented in the following are explained in the proposed manuscript by Raphaël Jauslin and Yves Tillé (2021) doi:10.1016/j.jspi.2022.12.003.

Contingency table

We will estimate the contingency table when the factor variable which represents the economic status pl030 is crossed with a discretized version of the equivalized household income eqIncome. In order to discretize the equivalized income, we calculate percentiles (0.15,0.30,0.45,0.60,0.75,0.90) of the variable and define the category as intervals between the values.

library(laeken)
library(sampling)
library(StratifiedSampling)
#> Loading required package: Matrix

data("eusilc")
eusilc <- na.omit(eusilc)
N <- nrow(eusilc)


# Xm are the matching variables and id are identity of the units
Xm <- eusilc[,c("hsize","db040","age","rb090","pb220a")]
Xmcat <- do.call(cbind,apply(Xm[,c(2,4,5)],MARGIN = 2,FUN = disjunctive))
Xm <- cbind(Xmcat,Xm[,-c(2,4,5)])
id <- eusilc$rb030


# categorial income splitted by the percentile
c_income  <- eusilc$eqIncome
q <- quantile(eusilc$eqIncome, probs = seq(0, 1, 0.15))
c_income[which(eusilc$eqIncome <= q[2])] <- "(0,15]"
c_income[which(q[2] < eusilc$eqIncome & eusilc$eqIncome <= q[3])] <- "(15,30]"
c_income[which(q[3] < eusilc$eqIncome & eusilc$eqIncome <= q[4])] <- "(30,45]"
c_income[which(q[4] < eusilc$eqIncome & eusilc$eqIncome <= q[5])] <- "(45,60]"
c_income[which(q[5] < eusilc$eqIncome & eusilc$eqIncome <= q[6])] <- "(60,75]"
c_income[which(q[6] < eusilc$eqIncome & eusilc$eqIncome <= q[7])] <- "(75,90]"
c_income[which(  eusilc$eqIncome > q[7] )] <- "(90,100]"

# variable of interests
Y <- data.frame(ecostat = eusilc$pl030)
Z <- data.frame(c_income = c_income)

# put same rownames
rownames(Xm) <- rownames(Y) <- rownames(Z)<- id

YZ <- table(cbind(Y,Z))
addmargins(YZ)
#>        c_income
#> ecostat (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100]   Sum
#>     1      409     616     722     807     935    1025      648  5162
#>     2      189     181     205     184     165     154       82  1160
#>     3      137      90      72      75      59      52       33   518
#>     4      210     159     103      95      74      49       46   736
#>     5      470     462     492     477     459     435      351  3146
#>     6       57      25      28      30      17      11       10   178
#>     7      344     283     194     149     106      91       40  1207
#>     Sum   1816    1816    1816    1817    1815    1817     1210 12107

Sampling schemes

Here we set up the sampling designs and define all the quantities we will need for the rest of the vignette. The sample are selected with simple random sampling without replacement and the weights are equal to the inverse of the inclusion probabilities.


# size of sample
n1 <- 1000
n2 <- 500

# samples
s1 <- srswor(n1,N)
s2 <- srswor(n2,N)
  
# extract matching units
X1 <- Xm[s1 == 1,]
X2 <- Xm[s2 == 1,]
  
# extract variable of interest
Y1 <- data.frame(Y[s1 == 1,])
colnames(Y1) <- colnames(Y)
Z2 <- as.data.frame(Z[s2 == 1,])
colnames(Z2) <- colnames(Z)
  
# extract correct identities
id1 <- id[s1 == 1]
id2 <- id[s2 == 1]
  
# put correct rownames
rownames(Y1) <- id1
rownames(Z2) <- id2
  
# here weights are inverse of inclusion probabilities
d1 <- rep(N/n1,n1)
d2 <- rep(N/n2,n2)
  
# disjunctive form
Y_dis <- sampling::disjunctive(as.matrix(Y))
Z_dis <- sampling::disjunctive(as.matrix(Z))
  
Y1_dis <- Y_dis[s1 ==1,]
Z2_dis <- Z_dis[s2 ==1,]

Harmonization

Then the harmonization step must be performed. The harmonize function returns the harmonized weights. If by chance the true population totals are known, it is possible to use these instead of the estimate made within the function.



re <- harmonize(X1,d1,id1,X2,d2,id2)  

# if we want to use the population totals to harmonize we can use 
re <- harmonize(X1,d1,id1,X2,d2,id2,totals = c(N,colSums(Xm)))

w1 <- re$w1
w2 <- re$w2

colSums(Xm)
#>      1      2      3      4      5      6      7      8      9     10     11 
#>    476    887   2340    763   1880   1021   2244   1938    558   6263   5844 
#>     12     13     14  hsize    age 
#>  11073    283    751  36380 559915
colSums(w1*X1)
#>      1      2      3      4      5      6      7      8      9     10     11 
#>    476    887   2340    763   1880   1021   2244   1938    558   6263   5844 
#>     12     13     14  hsize    age 
#>  11073    283    751  36380 559915
colSums(w2*X2)
#>      1      2      3      4      5      6      7      8      9     10     11 
#>    476    887   2340    763   1880   1021   2244   1938    558   6263   5844 
#>     12     13     14  hsize    age 
#>  11073    283    751  36380 559915

Optimal transport matching

The statistical matching is done by using the otmatch function. The estimation of the contingency table is calculated by extracting the id1 units (respectively id2 units) and by using the function tapply with the correct weights.


# Optimal transport matching
object <- otmatch(X1,id1,X2,id2,w1,w2)
head(object[,1:3])
#>         id1    id2   weight
#> 202     202   2801 3.816626
#> 202.1   202 551001 7.458110
#> 1401   1401 133802 9.384434
#> 1604   1604 167801 7.016546
#> 1604.1 1604 258203 5.097148
#> 1903   1903 303202 7.785325

Y1_ot <- cbind(X1[as.character(object$id1),],y = Y1[as.character(object$id1),])
Z2_ot <- cbind(X2[as.character(object$id2),],z = Z2[as.character(object$id2),])
YZ_ot <- tapply(object$weight,list(Y1_ot$y,Z2_ot$z),sum)

# transform NA into 0
YZ_ot[is.na(YZ_ot)] <- 0

# result
round(addmargins(YZ_ot),3)
#>       (0,15]  (15,30]  (30,45]  (45,60]  (60,75]  (75,90] (90,100]       Sum
#> 1    707.389  863.781  646.338  831.064  812.230  836.188  602.869  5299.858
#> 2    188.524  189.014  134.221  151.914  227.806  150.646   89.866  1131.990
#> 3     35.317   99.326   76.017   48.439   72.604   86.125   74.055   491.883
#> 4     67.534   77.334  150.105   73.640   45.420   75.103   38.897   528.033
#> 5    364.334  339.338  554.635  466.888  338.280  678.597  345.293  3087.365
#> 6     25.031   22.548   35.370   58.257    7.818   15.778   17.966   182.768
#> 7    221.405  199.874  224.169  115.657  245.251  265.628  113.118  1385.102
#> Sum 1609.535 1791.214 1820.854 1745.859 1749.410 2108.065 1282.064 12107.000

Balanced sampling

As you can see from the previous section, the optimal transport results generally do not have a one-to-one match, meaning that for every unit in sample 1, we have more than one unit with weights not equal to 0 in sample 2. The bsmatch function creates a one-to-one match by selecting a balanced stratified sampling to obtain a data.frame where each unit in sample 1 has only one imputed unit from sample 2.


# Balanced Sampling 
BS <- bsmatch(object)
head(BS$object[,1:3])
#>        id1    id2    weight
#> 202.1  202 551001  7.458110
#> 1401  1401 133802  9.384434
#> 1604  1604 167801  7.016546
#> 1903  1903 303202  7.785325
#> 2003  2003 470603 10.291664
#> 2502  2502 323402 12.098266


Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#>       (0,15]  (15,30]  (30,45]  (45,60]  (60,75]  (75,90] (90,100]       Sum
#> 1    684.406  833.248  659.891  816.898  863.414  856.922  585.080  5299.858
#> 2    173.283  188.159  123.456  158.824  246.468  147.742   94.058  1131.990
#> 3     40.172   87.287   84.030   49.171   72.110   87.781   71.331   491.883
#> 4     49.878   79.118  157.640   70.620   45.900   89.409   35.468   528.033
#> 5    379.469  324.008  570.219  466.501  316.299  654.691  376.178  3087.365
#> 6     26.532   22.569   43.714   52.430   12.324   10.895   14.304   182.768
#> 7    242.655  189.056  237.988   94.647  208.435  320.367   91.954  1385.102
#> Sum 1596.397 1723.444 1876.938 1709.091 1764.950 2167.807 1268.373 12107.000

# With Z2 as auxiliary information for stratified balanced sampling.
BS <- bsmatch(object,Z2)

Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#>       (0,15]  (15,30]  (30,45]  (45,60]  (60,75]  (75,90] (90,100]       Sum
#> 1    731.233  899.316  648.513  849.054  811.372  810.830  549.540  5299.858
#> 2    175.333  192.336  141.168  176.120  205.356  156.722   84.954  1131.990
#> 3     28.059   89.968   69.062   75.780   72.110   73.460   83.444   491.883
#> 4     74.943   67.023  156.310   69.983   47.878   64.389   47.507   528.033
#> 5    327.032  347.086  521.409  431.167  367.510  715.015  378.147  3087.365
#> 6     26.532   11.260   43.714   52.430   12.324   22.204   14.304   182.768
#> 7    242.241  175.852  241.964   97.050  235.899  268.121  123.974  1385.102
#> Sum 1605.374 1782.841 1822.139 1751.585 1752.450 2110.740 1281.872 12107.000

Prediction


# split the weight by id1
q_l <- split(object$weight,f = object$id1)
# normalize in each id1
q_l <- lapply(q_l, function(x){x/sum(x)})
q <- as.numeric(do.call(c,q_l))
  
Z_pred <- t(q*disjunctive(object$id1))%*%disjunctive(Z2[as.character(object$id2),])
colnames(Z_pred) <- levels(factor(Z2$c_income))
head(Z_pred)
#>         (0,15] (15,30]   (30,45]   (45,60]   (60,75] (75,90]  (90,100]
#> [1,] 0.0000000       0 0.0000000 0.0000000 0.6614887       0 0.3385113
#> [2,] 0.0000000       0 1.0000000 0.0000000 0.0000000       0 0.0000000
#> [3,] 0.5792243       0 0.0000000 0.0000000 0.0000000       0 0.4207757
#> [4,] 0.0000000       0 0.3527935 0.6472065 0.0000000       0 0.0000000
#> [5,] 0.0000000       0 0.0000000 1.0000000 0.0000000       0 0.0000000
#> [6,] 0.0000000       0 1.0000000 0.0000000 0.0000000       0 0.0000000