In this tutorial, alternate maximization (AM) is used in the first step of the two-step estimation method, and the information criterion (IC) method is adopted to choose the number of factors.
The package can be loaded with the command:
First, we generate the data with homogeneous normal variables.
Then, we set the algorithm parameters and fit the model.
# Load the GFM package used throughout this tutorial.
library(GFM)
# Simulate homogeneous normal data so that `dat` exists before it is used.
# NOTE(review): n, p, q and rho are assumptions chosen to mirror the later
# examples (q = 2 matches the number of factors fitted next) -- confirm the
# intended simulation settings.
dat <- gendata(seed = 1, n = 100, p = 100, type = "homonorm", q = 2, rho = 3)
# Obtain the observed data
XList <- dat$XList # the data in the form of a list of matrices
str(XList)
X <- dat$X # the data in matrix form
# Set the variables' type; "gaussian" denotes continuous variables.
types <- "gaussian"
Third, we fit the GFM model with a user-specified number of factors.
# Fit the GFM by alternate maximization ("AM") with the number of
# factors fixed at q = 2.
gfm1 <- gfm(XList, types, q = 2, algorithm = "AM", verbose = FALSE)
# Canonical correlations between the estimates (hH, hB) and the
# corresponding quantities stored in `dat` (H0, B0).
measurefun(gfm1$hH, dat$H0, type = "ccor")
measurefun(gfm1$hB, dat$B0, type = "ccor")
The number of factors can also be determined in a data-driven manner.
First, we generate the data with heterogeneous normal variables and set the parameters of the algorithm.
# Simulate data of type "heternorm" (heterogeneous normal variables).
dat <- gendata(type = "heternorm", n = 100, p = 100, q = 2, rho = 1, seed = 1)
# Extract the observed data in both representations.
XList <- dat$XList # list-of-matrices form
str(XList)
X <- dat$X # single-matrix form
# Variable type passed to the model; "gaussian" denotes continuous variables.
types <- "gaussian"
Third, we fit the GFM model with a user-specified number of factors and compare the results with those of linear factor models.
library(ggplot2)

# GFM fit by alternate maximization with q = 2 factors.
gfm1 <- gfm(XList, types, q = 2, algorithm = "AM", verbose = FALSE)
# Canonical correlations of the GFM estimates with dat$H0 and dat$B0.
corH_gfm <- measurefun(gfm1$hH, dat$H0, type = "ccor")
corB_gfm <- measurefun(gfm1$hB, dat$B0, type = "ccor")

# Linear factor model fit on the matrix-form data, measured the same way.
lfm1 <- Factorm(X, q = 2)
corH_lfm <- measurefun(lfm1$hH, dat$H0, type = "ccor")
corB_lfm <- measurefun(lfm1$hB, dat$B0, type = "ccor")

# One row per (quantity, method) pair, in the order
# factors/GFM, factors/LFM, loadings/GFM, loadings/LFM.
df1 <- data.frame(
  CCor = c(corH_gfm, corH_lfm, corB_gfm, corB_lfm),
  Method = factor(rep(c("GFM", "LFM"), times = 2)),
  Quantity = factor(rep(c("factors", "loadings"), each = 2))
)
# Side-by-side bars of the canonical correlations, grouped by quantity.
ggplot(df1, aes(x = Quantity, y = CCor, fill = Method)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.5)
The number of factors can also be determined in a data-driven manner.
First, we generate the data with count (Poisson) variables and set the parameters of the algorithm.
# Settings forwarded to gendata() as its q and p arguments.
q <- 3
p <- 200
# Simulate data of type "pois" (count variables).
dat <- gendata(type = "pois", n = 200, p = p, q = q, rho = 4, seed = 1)
# Extract the observed data in both representations.
XList <- dat$XList # list-of-matrices form
str(XList)
X <- dat$X # single-matrix form
# Variable type passed to the model: "poisson" for the count data above.
types <- "poisson"
Second, we fit the GFM models and use the IC method to select the number of factors from the candidate set 1 to 6.
# Time the IC-based selection of the number of factors over the
# candidate set 1:6; the selection result is kept in `hq`.
system.time({
  hq <- chooseFacNumber(
    XList, types,
    select_method = "IC",
    q_set = 1:6,
    parallelList = list(parallel = TRUE)
  )
})
Third, we compare the results with those of linear factor models.
# Fit the GFM on the current (Poisson) data with q = 3 factors, the same
# number used for the linear factor model below. The earlier `gfm1` object
# was fitted to a different data set, so it must not be compared with the
# current dat$H0 / dat$B0.
gfm_pois <- gfm(XList, types, algorithm = "AM", q = 3, verbose = FALSE)
# measure the performance of GFM estimators in terms of canonical correlations
corH_gfm <- measurefun(gfm_pois$hH, dat$H0, type = "ccor")
corB_gfm <- measurefun(gfm_pois$hB, dat$B0, type = "ccor")
# Linear factor model on the matrix-form data, measured the same way.
lfm1 <- Factorm(X, q = 3)
corH_lfm <- measurefun(lfm1$hH, dat$H0, type = "ccor")
corB_lfm <- measurefun(lfm1$hB, dat$B0, type = "ccor")
library(ggplot2)
# One row per (quantity, method) pair, in the order
# factors/GFM, factors/LFM, loadings/GFM, loadings/LFM.
df1 <- data.frame(
  CCor = c(corH_gfm, corH_lfm, corB_gfm, corB_lfm),
  Method = factor(rep(c("GFM", "LFM"), times = 2)),
  Quantity = factor(c(rep("factors", 2), rep("loadings", 2)))
)
# Side-by-side bars of the canonical correlations, grouped by quantity.
ggplot(data = df1, aes(x = Quantity, y = CCor, fill = Method)) +
  geom_bar(position = "dodge", stat = "identity", width = 0.5)
First, we generate the data with a mixture of count (Poisson) and binomial variables and set the parameters of the algorithm. Then we fit the GFM model with a user-specified number of factors.
# Simulate data of type "pois_bino".
dat <- gendata(type = "pois_bino", n = 200, p = 200, q = 2, rho = 2, seed = 1)
# Extract the observed data in both representations.
XList <- dat$XList # list-of-matrices form
str(XList)
X <- dat$X # single-matrix form
# Variable types are taken from the simulated data object itself.
types <- dat$types
# Frequency tables of the first and the 200th column of the data matrix.
table(dat$X[, 1])
table(dat$X[, 200])
# Fit the GFM by alternate maximization with q = 2 supplied by the user.
gfm2 <- gfm(XList, types, q = 2, algorithm = "AM", verbose = FALSE)
# Canonical correlations of the estimates with dat$H0 and dat$B0.
measurefun(gfm2$hH, dat$H0, type = "ccor")
measurefun(gfm2$hB, dat$B0, type = "ccor")
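Third, we select the number of factors automatically with the IC method, measure the performance of the GFM estimators, and compare the results with those of linear factor models.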
# IC-based selection of the number of factors over the candidates 1:4.
hq <- chooseFacNumber(
  XList, types,
  q_set = 1:4,
  select_method = "IC",
  verbose = FALSE,
  parallelList = list(parallel = TRUE)
)
# Canonical correlations of the gfm2 estimates with dat$H0 and dat$B0,
# stored for the later comparison.
corH_gfm <- measurefun(gfm2$hH, dat$H0, type = "ccor")
corB_gfm <- measurefun(gfm2$hB, dat$B0, type = "ccor")
Compare with linear factor models
# Linear factor model on the matrix-form data. Use q = 2 so that it matches
# the number of factors used to generate the data and to fit `gfm2`;
# a different q would make the comparison below inconsistent.
lfm1 <- Factorm(dat$X, q = 2)
# Canonical correlations of the LFM estimates with dat$H0 and dat$B0.
corH_lfm <- measurefun(lfm1$hH, dat$H0, type = "ccor")
corB_lfm <- measurefun(lfm1$hB, dat$B0, type = "ccor")
library(ggplot2)
# One row per (quantity, method) pair, in the order
# factors/GFM, factors/LFM, loadings/GFM, loadings/LFM.
df1 <- data.frame(
  CCor = c(corH_gfm, corH_lfm, corB_gfm, corB_lfm),
  Method = factor(rep(c("GFM", "LFM"), times = 2)),
  Quantity = factor(c(rep("factors", 2), rep("loadings", 2)))
)
# Side-by-side bars of the canonical correlations, grouped by quantity.
ggplot(data = df1, aes(x = Quantity, y = CCor, fill = Method)) +
  geom_bar(position = "dodge", stat = "identity", width = 0.5)
# Print the R version, platform, and attached/loaded packages of this session.
sessionInfo()
#> R version 4.4.2 (2024-10-31)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.1 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] rmarkdown_2.29
#>
#> loaded via a namespace (and not attached):
#> [1] digest_0.6.37 R6_2.5.1 fastmap_1.2.0 xfun_0.49
#> [5] maketools_1.3.1 cachem_1.1.0 knitr_1.49 htmltools_0.5.8.1
#> [9] buildtools_1.0.0 lifecycle_1.0.4 cli_3.6.3 sass_0.4.9
#> [13] jquerylib_0.1.4 compiler_4.4.2 sys_3.4.3 tools_4.4.2
#> [17] evaluate_1.0.1 bslib_0.8.0 yaml_2.3.10 jsonlite_1.8.9
#> [21] rlang_1.1.4