---
title: 'GFM: alternate maximization and information criterion'
author: "Wei Liu"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{GFM: alternate maximization and information criterion}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

In this tutorial, we use the alternate maximization (AM) algorithm in the first step of the two-step estimation method, and we adopt the information criterion (IC) method to choose the number of factors.

## Fit GFM model using simulated data

The package can be loaded with the command:

```{r eval=FALSE}
library("GFM")
set.seed(1) # set a random seed for reproducibility.
```

### GFM can handle data with homogeneous normal variables

First, we generate the data with homogeneous normal variables.

```{r eval=FALSE}
## Homogeneous normal variables
dat <- gendata(q = 2, n = 100, p = 100, rho = 3)
```

Second, we extract the observed data and set the variable type.

```{r eval=FALSE}
# Obtain the observed data
XList <- dat$XList # this is the data in the form of a matrix list.
str(XList)
X <- dat$X # this is the data in the form of a matrix.
# set the variable type; 'gaussian' indicates continuous variables.
types <- 'gaussian'
```

Third, we fit the GFM model with a user-specified number of factors.

```{r eval=FALSE}
# specify q = 2
gfm1 <- gfm(XList, types, algorithm = "AM", q = 2, verbose = FALSE)
# measure the performance of GFM estimators in terms of canonical correlations
measurefun(gfm1$hH, dat$H0, type = 'ccor')
measurefun(gfm1$hB, dat$B0, type = 'ccor')
```

The number of factors can also be determined in a data-driven manner.

```{r eval=FALSE}
# select q automatically
hq <- chooseFacNumber(
  XList, types,
  select_method = 'IC', q_set = 1:6,
  verbose = FALSE, parallelList = list(parallel = TRUE)
)
hq
```

### GFM outperforms LFM in analyzing data with heterogeneous normal variables

First, we generate the data with heterogeneous normal variables and set the parameters of the algorithm.

```{r eval=FALSE}
dat <- gendata(
  seed = 1, n = 100, p = 100,
  type = 'heternorm', q = 2, rho = 1
)
# Obtain the observed data
XList <- dat$XList # this is the data in the form of a matrix list.
str(XList)
X <- dat$X # this is the data in the form of a matrix.
# set the variable type; 'gaussian' indicates continuous variables.
types <- 'gaussian'
```

Second, we fit the GFM model with a user-specified number of factors and compare the results with those of the linear factor model (LFM).

```{r eval=FALSE}
# specify q = 2
gfm1 <- gfm(XList, types, algorithm = "AM", q = 2, verbose = FALSE)
# measure the performance of GFM estimators in terms of canonical correlations
corH_gfm <- measurefun(gfm1$hH, dat$H0, type = 'ccor')
corB_gfm <- measurefun(gfm1$hB, dat$B0, type = 'ccor')

# fit the linear factor model with the same number of factors
lfm1 <- Factorm(X, q = 2)
corH_lfm <- measurefun(lfm1$hH, dat$H0, type = 'ccor')
corB_lfm <- measurefun(lfm1$hB, dat$B0, type = 'ccor')

# bar chart of canonical correlations, grouped by estimated quantity
library(ggplot2)
df1 <- data.frame(
  CCor = c(corH_gfm, corH_lfm, corB_gfm, corB_lfm),
  Method = factor(rep(c('GFM', "LFM"), times = 2)),
  Quantity = factor(c(rep('factors', 2), rep("loadings", 2)))
)
ggplot(data = df1, aes(x = Quantity, y = CCor, fill = Method)) +
  geom_bar(position = "dodge", stat = "identity", width = 0.5)
```

The number of factors can also be determined in a data-driven manner.

```{r eval=FALSE}
# select q automatically
hq <- chooseFacNumber(
  XList, types,
  select_method = 'IC', q_set = 1:6,
  verbose = FALSE, parallelList = list(parallel = TRUE)
)
```

### GFM outperforms LFM in analyzing data with count (Poisson) variables

First, we generate the data with count (Poisson) variables and set the parameters of the algorithm.

```{r eval=FALSE}
q <- 3
p <- 200
dat <- gendata(seed = 1, n = 200, p = p, type = 'pois', q = q, rho = 4)
# Obtain the observed data
XList <- dat$XList # this is the data in the form of a matrix list.
str(XList)
X <- dat$X # this is the data in the form of a matrix.
# set the variable type; 'poisson' indicates count variables.
types <- 'poisson'
```

Second, we fit the GFM model given the true number of factors.

```{r eval=FALSE}
# q was set to the true number of factors when the data were generated
system.time(
  gfm1 <- gfm(XList, types, algorithm = "AM", q = q, verbose = FALSE)
)
```

```{r eval=FALSE}
# select q automatically and time the search
system.time(
  hq <- chooseFacNumber(
    XList, types,
    q_set = 1:6, select_method = "IC",
    parallelList = list(parallel = TRUE)
  )
)
```

Third, we compare the results with those of the linear factor model (LFM).

```{r eval=FALSE}
# measure the performance of GFM estimators in terms of canonical correlations
corH_gfm <- measurefun(gfm1$hH, dat$H0, type = 'ccor')
corB_gfm <- measurefun(gfm1$hB, dat$B0, type = 'ccor')

# fit the linear factor model with the same number of factors as GFM
lfm1 <- Factorm(X, q = q)
corH_lfm <- measurefun(lfm1$hH, dat$H0, type = 'ccor')
corB_lfm <- measurefun(lfm1$hB, dat$B0, type = 'ccor')

# bar chart of canonical correlations, grouped by estimated quantity
library(ggplot2)
df1 <- data.frame(
  CCor = c(corH_gfm, corH_lfm, corB_gfm, corB_lfm),
  Method = factor(rep(c('GFM', "LFM"), times = 2)),
  Quantity = factor(c(rep('factors', 2), rep("loadings", 2)))
)
ggplot(data = df1, aes(x = Quantity, y = CCor, fill = Method)) +
  geom_bar(position = "dodge", stat = "identity", width = 0.5)
```

### GFM outperforms LFM in analyzing data with mixed types of count and categorical variables

First, we generate the data with a mix of count (Poisson) and categorical (binomial) variables and set the parameters of the algorithm. Then we fit the GFM model with a user-specified number of factors.

```{r eval=FALSE}
dat <- gendata(seed = 1, n = 200, p = 200, type = 'pois_bino', q = 2, rho = 2)
# Obtain the observed data
XList <- dat$XList # this is the data in the form of a matrix list.
str(XList)
X <- dat$X # this is the data in the form of a matrix.
# the variable types are taken from the generated data object.
types <- dat$types
# tabulate the first and the last (200th) column of the data matrix
table(dat$X[, 1])
table(dat$X[, 200])

# user-specified q = 2
gfm2 <- gfm(XList, types, algorithm = "AM", q = 2, verbose = FALSE)
measurefun(gfm2$hH, dat$H0, type = 'ccor')
measurefun(gfm2$hB, dat$B0, type = 'ccor')
```

Second, we select the number of factors automatically and record the canonical correlations of the GFM estimators.

```{r eval=FALSE}
# select q automatically
hq <- chooseFacNumber(
  XList, types,
  select_method = 'IC', q_set = 1:4,
  verbose = FALSE, parallelList = list(parallel = TRUE)
)
# measure the performance of GFM estimators in terms of canonical correlations
corH_gfm <- measurefun(gfm2$hH, dat$H0, type = 'ccor')
corB_gfm <- measurefun(gfm2$hB, dat$B0, type = 'ccor')
```

Third, we compare the results with those of the linear factor model (LFM).

```{r eval=FALSE}
# use q = 2, matching both the data-generating value and the GFM fit above,
# so that the two methods are compared on equal terms
lfm1 <- Factorm(dat$X, q = 2)
corH_lfm <- measurefun(lfm1$hH, dat$H0, type = 'ccor')
corB_lfm <- measurefun(lfm1$hB, dat$B0, type = 'ccor')

# bar chart of canonical correlations, grouped by estimated quantity
library(ggplot2)
df1 <- data.frame(
  CCor = c(corH_gfm, corH_lfm, corB_gfm, corB_lfm),
  Method = factor(rep(c('GFM', "LFM"), times = 2)),
  Quantity = factor(c(rep('factors', 2), rep("loadings", 2)))
)
ggplot(data = df1, aes(x = Quantity, y = CCor, fill = Method)) +
  geom_bar(position = "dodge", stat = "identity", width = 0.5)
```

## Session information

```{r}
sessionInfo()
```