A quick tour of ppgmmga

Introduction

An R package implementing a Projection Pursuit algorithm based on finite Gaussian Mixtures Models for density estimation using Genetic Algorithms (PPGMMGA) to maximise a Negentropy index. The PPGMMGA algorithm provides a method to visualise high-dimensional data in a lower-dimensional space, with special reference to reveal clustering structures.

library(ppgmmga)
##    ___  ___  ___ ___ _  __ _  ___ ____ _
##   / _ \/ _ \/ _ `/  ' \/  ' \/ _ `/ _ `/
##  / .__/ .__/\_, /_/_/_/_/_/_/\_, /\_,_/ 
## /_/  /_/   /___/            /___/       version 1.3

Banknote data

library(mclust)
data("banknote")
X <- banknote[,-1]
Class <- banknote$Status
table(Class)
## Class
## counterfeit     genuine 
##         100         100
clPairs(X, classification = Class, 
        symbols = ppgmmga.options("classPlotSymbols"),
        colors = ppgmmga.options("classPlotColors"))

1-dimensional PPGMMGA

PP1D <- ppgmmga(data = X, d = 1, seed = 1)
PP1D
## Call:
## ppgmmga(data = X, d = 1, seed = 1)
## 
## 'ppgmmga' object containing: 
## [1] "data"       "d"          "approx"     "GMM"        "GA"        
## [6] "Negentropy" "basis"      "Z"
summary(PP1D)
## ── ppgmmga ───────────────────────────── 
## 
## Data dimensions               = 200 x 6 
## Data transformation           = center & scale 
## Projection subspace dimension = 1 
## GMM density estimate          = (VEE,4)
## Negentropy approximation      = UT 
## GA optimal negentropy         = 0.6345935 
## GA encoded basis solution: 
##            x1       x2       x3       x4       x5
## [1,] 3.268902 2.373044 1.051365 0.313128 0.531718
## 
## Estimated projection basis: 
##                 PP1
## Length   -0.0119653
## Left     -0.0934775
## Right     0.1602105
## Bottom    0.5740698
## Top       0.3450346
## Diagonal -0.7189203
## 
## Monte Carlo Negentropy approximation check: 
##                            UT
## Approx Negentropy 0.634593544
## MC Negentropy     0.633614256
## MC se             0.002249545
## Relative accuracy 1.001545559
plot(PP1D)

plot(PP1D, class = Class)

2-dimensional PPGMMGA

PP2D <- ppgmmga(data = X, d = 2, seed = 1)
summary(PP2D)
## ── ppgmmga ───────────────────────────── 
## 
## Data dimensions               = 200 x 6 
## Data transformation           = center & scale 
## Projection subspace dimension = 2 
## GMM density estimate          = (VEE,4)
## Negentropy approximation      = UT 
## GA optimal negentropy         = 1.13624 
## GA encoded basis solution: 
##            x1       x2       x3       x4      x5      x6      x7      x8
## [1,] 2.268667 2.929821 1.061407 1.084929 0.30443 3.85462 0.98329 1.11377
##            x9      x10
## [1,] 0.167174 1.668403
## 
## Estimated projection basis: 
##                 PP1        PP2
## Length   -0.0372687 -0.0718319
## Left      0.0312555 -0.1198116
## Right    -0.1548079  0.0630092
## Bottom   -0.0856931  0.8639049
## Top      -0.1024990  0.4603727
## Diagonal  0.9776601  0.1350576
## 
## Monte Carlo Negentropy approximation check: 
##                            UT
## Approx Negentropy 1.136240194
## MC Negentropy     1.137260367
## MC se             0.003527379
## Relative accuracy 0.999102956
summary(PP2D$GMM)
## ------------------------------------------------------- 
## Density estimation via Gaussian finite mixture modeling 
## ------------------------------------------------------- 
## 
## Mclust VEE (ellipsoidal, equal shape and orientation) model with 4 components: 
## 
##  log-likelihood   n df       BIC       ICL
##       -1191.595 200 51 -2653.405 -2666.898
plot(PP2D$GA)

plot(PP2D)

plot(PP2D, class = Class, drawAxis = FALSE)

3-dimensional PPGMMGA

PP3D <- ppgmmga(data = X, d = 3, 
                center = TRUE, scale = FALSE, 
                gatype = "gaisl", 
                options = ppgmmga.options(numIslands = 2),
                seed = 1)
summary(PP3D)
## ── ppgmmga ───────────────────────────── 
## 
## Data dimensions               = 200 x 6 
## Data transformation           = center 
## Projection subspace dimension = 3 
## GMM density estimate          = (VVE,3)
## Negentropy approximation      = UT 
## GA optimal negentropy         = 1.16915 
## GA encoded basis solution: 
##            x1      x2       x3       x4       x5       x6       x7       x8
## [1,] 4.338173 2.52915 1.092234 1.076827 0.831164 4.978505 2.007004 2.077824
##            x9      x10  ...      x14      x15
## [1,] 1.994252 2.210178       1.57216 2.527153
## 
## Estimated projection basis: 
##                 PP1        PP2        PP3
## Length   -0.3089258  0.5132932 -0.5708323
## Left     -0.1213218 -0.1762688 -0.3272492
## Right     0.3028257  0.4912820 -0.3875035
## Bottom    0.2419392  0.3734296  0.4166151
## Top       0.2647285  0.4937701  0.3427949
## Diagonal -0.8182461  0.2843287  0.3547153
## 
## Monte Carlo Negentropy approximation check: 
##                            UT
## Approx Negentropy 1.169149622
## MC Negentropy     1.174923148
## MC se             0.004308964
## Relative accuracy 0.995086040
plot(PP3D$GA)

plot(PP3D)

plot(PP3D, class = Class)

plot(PP3D, dim = c(1,2))

plot(PP3D, dim = c(1,3), class = Class)

# A rotating 3D plot can be obtained using
if(!require("msir")) install.packages("msir")
msir::spinplot(PP3D$Z, markby = Class, 
               pch.points = c(20,17),
               col.points = ppgmmga.options("classPlotColors")[1:2])


References

Scrucca L, Serafini A (2019). “Projection pursuit based on Gaussian mixtures and evolutionary algorithms.” Journal of Computational and Graphical Statistics, 28(4), 847–860. https://doi.org/10.1080/10618600.2019.1598871.


sessionInfo()
## R version 4.4.2 (2024-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.1 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] mclust_6.1.1   ppgmmga_1.3    knitr_1.49     rmarkdown_2.29
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6      jsonlite_1.8.9    compiler_4.4.2    crayon_1.5.3     
##  [5] Rcpp_1.0.13-1     GA_3.2.4          jquerylib_0.1.4   scales_1.3.0     
##  [9] yaml_2.3.10       fastmap_1.2.0     ggplot2_3.5.1     R6_2.5.1         
## [13] labeling_0.4.3    iterators_1.0.14  tibble_3.2.1      maketools_1.3.1  
## [17] munsell_0.5.1     bslib_0.8.0       pillar_1.9.0      rlang_1.1.4      
## [21] utf8_1.2.4        cachem_1.1.0      xfun_0.49         sass_0.4.9       
## [25] sys_3.4.3         cli_3.6.3         withr_3.0.2       magrittr_2.0.3   
## [29] digest_0.6.37     foreach_1.5.2     grid_4.4.2        lifecycle_1.0.4  
## [33] vctrs_0.6.5       evaluate_1.0.1    glue_1.8.0        farver_2.1.2     
## [37] codetools_0.2-20  buildtools_1.0.0  fansi_1.0.6       colorspace_2.1-1 
## [41] pkgconfig_2.0.3   tools_4.4.2       htmltools_0.5.8.1