Statistical Matching using Optimal Transport

Raphaël Jauslin

2021-05-20

Introduction

In this vignette we will explain how some functions of the package are used in order to estimate a contingency table. We will work on the eusilc dataset of the laeken package. All the functions presented in the following are explained in the proposed manuscript by Raphaël Jauslin and Yves Tillé (2021) <arXiv:2105.08379>.

Contingency table

We will estimate the contingency table when the factor variable which represents the economic status pl030 is crossed with a discretized version of the equivalized household income eqIncome. In order to discretize the equivalized income, we calculate percentiles (0.15,0.30,0.45,0.60,0.75,0.90) of the variable and define the category as intervals between the values.

library(laeken)
library(sampling)
library(StratifiedSampling)
#> Le chargement a nécessité le package : Matrix

data("eusilc")
eusilc <- na.omit(eusilc)
N <- nrow(eusilc)


# Xm are the matching variables and id are identity of the units
Xm <- eusilc[,c("hsize","db040","age","rb090","pb220a")]
Xmcat <- do.call(cbind,apply(Xm[,c(2,4,5)],MARGIN = 2,FUN = disjunctive))
Xm <- cbind(Xmcat,Xm[,-c(2,4,5)])
id <- eusilc$rb030


# categorial income splitted by the percentile
c_income  <- eusilc$eqIncome
q <- quantile(eusilc$eqIncome, probs = seq(0, 1, 0.15))
c_income[which(eusilc$eqIncome <= q[2])] <- "(0,15]"
c_income[which(q[2] < eusilc$eqIncome & eusilc$eqIncome <= q[3])] <- "(15,30]"
c_income[which(q[3] < eusilc$eqIncome & eusilc$eqIncome <= q[4])] <- "(30,45]"
c_income[which(q[4] < eusilc$eqIncome & eusilc$eqIncome <= q[5])] <- "(45,60]"
c_income[which(q[5] < eusilc$eqIncome & eusilc$eqIncome <= q[6])] <- "(60,75]"
c_income[which(q[6] < eusilc$eqIncome & eusilc$eqIncome <= q[7])] <- "(75,90]"
c_income[which(  eusilc$eqIncome > q[7] )] <- "(90,100]"

# variable of interests
Y <- data.frame(ecostat = eusilc$pl030)
Z <- data.frame(c_income = c_income)

# put same rownames
rownames(Xm) <- rownames(Y) <- rownames(Z)<- id

YZ <- table(cbind(Y,Z))
addmargins(YZ)
#>        c_income
#> ecostat (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100]   Sum
#>     1      409     616     722     807     935    1025      648  5162
#>     2      189     181     205     184     165     154       82  1160
#>     3      137      90      72      75      59      52       33   518
#>     4      210     159     103      95      74      49       46   736
#>     5      470     462     492     477     459     435      351  3146
#>     6       57      25      28      30      17      11       10   178
#>     7      344     283     194     149     106      91       40  1207
#>     Sum   1816    1816    1816    1817    1815    1817     1210 12107

Sampling schemes

Here we set up the sampling designs and define all the quantities we will need for the rest of the vignette. The sample are selected with simple random sampling without replacement and the weights are equal to the inverse of the inclusion probabilities.


# size of sample
n1 <- 1000
n2 <- 500

# samples
s1 <- srswor(n1,N)
s2 <- srswor(n2,N)
  
# extract matching units
X1 <- Xm[s1 == 1,]
X2 <- Xm[s2 == 1,]
  
# extract variable of interest
Y1 <- data.frame(Y[s1 == 1,])
colnames(Y1) <- colnames(Y)
Z2 <- as.data.frame(Z[s2 == 1,])
colnames(Z2) <- colnames(Z)
  
# extract correct identities
id1 <- id[s1 == 1]
id2 <- id[s2 == 1]
  
# put correct rownames
rownames(Y1) <- id1
rownames(Z2) <- id2
  
# here weights are inverse of inclusion probabilities
d1 <- rep(N/n1,n1)
d2 <- rep(N/n2,n2)
  
# disjunctive form
Y_dis <- sampling::disjunctive(as.matrix(Y))
Z_dis <- sampling::disjunctive(as.matrix(Z))
  
Y1_dis <- Y_dis[s1 ==1,]
Z2_dis <- Z_dis[s2 ==1,]

Harmonization

Then the harmonization step must be performed. The harmonize function returns the harmonized weights. If by chance the true population totals are known, it is possible to use these instead of the estimate made within the function.



re <- harmonize(X1,d1,id1,X2,d2,id2)  

# if we want to use the population totals to harmonize we can use 
re <- harmonize(X1,d1,id1,X2,d2,id2,totals = c(N,colSums(Xm)))

w1 <- re$w1
w2 <- re$w2

colSums(Xm)
#>      1      2      3      4      5      6      7      8      9     10     11 
#>    476    887   2340    763   1880   1021   2244   1938    558   6263   5844 
#>     12     13     14  hsize    age 
#>  11073    283    751  36380 559915
colSums(w1*X1)
#>      1      2      3      4      5      6      7      8      9     10     11 
#>    476    887   2340    763   1880   1021   2244   1938    558   6263   5844 
#>     12     13     14  hsize    age 
#>  11073    283    751  36380 559915
colSums(w2*X2)
#>      1      2      3      4      5      6      7      8      9     10     11 
#>    476    887   2340    763   1880   1021   2244   1938    558   6263   5844 
#>     12     13     14  hsize    age 
#>  11073    283    751  36380 559915

Optimal transport matching

The statistical matching is done by using the otmatch function. The estimation of the contingency table is calculated by extracting the id1 units (respectively id2 units) and by using the function tapply with the correct weights.


# Optimal transport matching
object <- otmatch(X1,id1,X2,id2,w1,w2)
head(object[,1:3])
#>         id1    id2     weight
#> 1      2002   2002 12.3693242
#> 2403   2403  18503  6.6549822
#> 2403.1 2403 391904  7.0634999
#> 2404   2404 256903  0.7315227
#> 2404.1 2404 286302 12.9999855
#> 2505   2505 361005 12.9288819

Y1_ot <- cbind(X1[as.character(object$id1),],y = Y1[as.character(object$id1),])
Z2_ot <- cbind(X2[as.character(object$id2),],z = Z2[as.character(object$id2),])
YZ_ot <- tapply(object$weight,list(Y1_ot$y,Z2_ot$z),sum)

# transform NA into 0
YZ_ot[is.na(YZ_ot)] <- 0

# result
round(addmargins(YZ_ot),3)
#>       (0,15]  (15,30]  (30,45]  (45,60]  (60,75]  (75,90] (90,100]       Sum
#> 1    590.712  842.961  691.578  913.959  642.791  763.596  579.974  5025.572
#> 2     92.120  172.447  177.192  178.544  156.887  130.758   94.599  1002.547
#> 3     44.705   49.082   72.319   99.973   59.875   98.070   77.787   501.812
#> 4    157.860  141.832  135.632  145.374  147.062   98.797   38.622   865.179
#> 5    490.223  391.945  514.436  493.816  489.049  484.409  491.583  3355.461
#> 6     21.357   68.943   20.831   10.583   23.219    9.848   21.952   176.734
#> 7    139.597  210.281  226.080  256.982  184.846   74.058   87.854  1179.696
#> Sum 1536.574 1877.491 1838.068 2099.231 1703.730 1659.536 1392.370 12107.000

Balanced sampling

As you can see from the previous section, the optimal transport results generally do not have a one-to-one match, meaning that for every unit in sample 1, we have more than one unit with weights not equal to 0 in sample 2. The bsmatch function creates a one-to-one match by selecting a balanced stratified sampling to obtain a data.frame where each unit in sample 1 has only one imputed unit from sample 2.


# Balanced Sampling 
BS <- bsmatch(object)
head(BS$object[,1:3])
#>         id1    id2     weight
#> 1      2002   2002 12.3693242
#> 2403   2403  18503  6.6549822
#> 2404   2404 256903  0.7315227
#> 2505   2505 361005 12.9288819
#> 3901   3901 474502 13.1517446
#> 4101.1 4101 138501  1.6895103


Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#>       (0,15]  (15,30]  (30,45]  (45,60]  (60,75]  (75,90] (90,100]       Sum
#> 1    621.099  800.005  660.437  889.889  647.441  788.650  618.051  5025.572
#> 2    100.353  158.434  172.335  184.415  166.926  119.849  100.235  1002.547
#> 3     61.437   62.601   86.312   93.052   37.187   99.268   61.955   501.812
#> 4    149.533  114.650  138.389  165.151  123.416  108.776   65.264   865.179
#> 5    494.525  355.216  494.427  490.951  542.947  493.790  483.605  3355.461
#> 6     23.058   68.806   25.641   12.413   24.311   10.763   11.743   176.734
#> 7    145.609  200.086  237.068  267.377  188.187   60.880   80.490  1179.696
#> Sum 1595.614 1759.797 1814.609 2103.248 1730.415 1681.976 1421.342 12107.000

# With Z2 as auxiliary information for stratified balanced sampling.
BS <- bsmatch(object,Z2)

Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
round(addmargins(YZ_bs),3)
#>       (0,15]  (15,30]  (30,45]  (45,60]  (60,75]  (75,90] (90,100]       Sum
#> 1    651.249  862.018  647.975  885.562  679.848  772.688  526.232  5025.572
#> 2     87.458  170.447  182.069  171.690  157.381  120.034  113.468  1002.547
#> 3     48.814   24.211   72.143  107.549   61.131  101.476   86.488   501.812
#> 4    137.241  164.263  137.980  139.333  125.073  136.224   25.065   865.179
#> 5    455.864  388.036  536.370  525.841  471.763  469.060  508.527  3355.461
#> 6     23.058   68.806   25.641    0.000   23.053    0.000   36.176   176.734
#> 7    133.574  212.685  230.697  267.377  193.994   60.880   80.490  1179.696
#> Sum 1537.257 1890.466 1832.874 2097.351 1712.243 1660.362 1376.446 12107.000

Prediction


# split the weight by id1
q_l <- split(object$weight,f = object$id1)
# normalize in each id1
q_l <- lapply(q_l, function(x){x/sum(x)})
q <- as.numeric(do.call(c,q_l))
  
Z_pred <- t(q*disjunctive(object$id1))%*%disjunctive(Z2[as.character(object$id2),])
colnames(Z_pred) <- levels(factor(Z2$c_income))
head(Z_pred)
#>      (0,15]   (15,30] (30,45]   (45,60]  (60,75] (75,90] (90,100]
#> [1,]      0 0.0000000       0 0.0000000 0.000000       0        1
#> [2,]      0 0.5148893       0 0.4851107 0.000000       0        0
#> [3,]      0 0.0000000       0 0.0000000 0.000000       1        0
#> [4,]      0 0.0000000       0 1.0000000 0.000000       0        0
#> [5,]      0 0.0000000       0 0.0000000 0.000000       0        1
#> [6,]      0 0.8656330       0 0.0000000 0.134367       0        0