MacroPCA examples

Introduction

This file contains some examples of the functions related to the MacroPCA routine. More specifically, MacroPCA, MacroPCApredict and cellMap will be illustrated.

library("cellWise")
library("gridExtra") # has grid.arrange()

Small generated example

We will first look at a small artificial example.

set.seed(12345) # for reproducibility
n <- 50; d <- 10
A <- matrix(0.9, d, d); diag(A) = 1 
round(A,1) # true covariance matrix
##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]  1.0  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9   0.9
##  [2,]  0.9  1.0  0.9  0.9  0.9  0.9  0.9  0.9  0.9   0.9
##  [3,]  0.9  0.9  1.0  0.9  0.9  0.9  0.9  0.9  0.9   0.9
##  [4,]  0.9  0.9  0.9  1.0  0.9  0.9  0.9  0.9  0.9   0.9
##  [5,]  0.9  0.9  0.9  0.9  1.0  0.9  0.9  0.9  0.9   0.9
##  [6,]  0.9  0.9  0.9  0.9  0.9  1.0  0.9  0.9  0.9   0.9
##  [7,]  0.9  0.9  0.9  0.9  0.9  0.9  1.0  0.9  0.9   0.9
##  [8,]  0.9  0.9  0.9  0.9  0.9  0.9  0.9  1.0  0.9   0.9
##  [9,]  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9  1.0   0.9
## [10,]  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9   1.0
library(MASS) # only needed for the following line:
x <- mvrnorm(n, rep(0,d), A)
x[sample(1:(n * d), 50, FALSE)] <- NA
x[sample(1:(n * d), 25, FALSE)] <- 10
x[sample(1:(n * d), 25, FALSE)] <- -10
x <- cbind(1:n, x)
# When not specifying MacroPCApars all defaults are used:
MacroPCA.out <- MacroPCA(x, k = 0)
##  
##  The input data has 50 rows and 11 columns.
##  
##  The data contained 1 columns that were identical to the case number
##  (number of the row).
##  Their column names are:
##  
## [1] X1
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 10 columns:
##  
##  [1] X2  X3  X4  X5  X6  X7  X8  X9  X10 X11
##  
##  The final data set we will analyze has 50 rows and 10 columns.
##  
## 
## Initial eigenvalues:
##  5.744158 0.09480572 0.07982848 0.05220849 0.05079981 0.04060717 0.03697148 0.02380588 0.0204317 0.01334948

## 
## The cumulative percentage of explained variability 
## by the first 10 components is:
##   PC1   PC2   PC3   PC4   PC5   PC6   PC7   PC8   PC9  PC10 
##  93.3  94.8  96.1  97.0  97.8  98.5  99.1  99.5  99.8 100.0 
## 
## Based on explained variability >= 80% one would select k = 1.
## 
## Please use this information and the scree plot to select a value of k
## and rerun MacroPCA with it.
MacroPCA.out <- MacroPCA(x, k = 1)
##  
##  The input data has 50 rows and 11 columns.
##  
##  The data contained 1 columns that were identical to the case number
##  (number of the row).
##  Their column names are:
##  
## [1] X1
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 10 columns:
##  
##  [1] X2  X3  X4  X5  X6  X7  X8  X9  X10 X11
##  
##  The final data set we will analyze has 50 rows and 10 columns.
##  
## 
## Initial eigenvalues:
##  5.682148 0.0936197 0.07770635 0.05754376 0.05269723 0.03632379 0.02764428 0.02274197 0.02064873 0.008519079 
## 
## XciSVD$rank, Xcih.SVD$rank, k and kmax:  10 10 1 10 
## 
## Performed an extra reweighting step because k = 1 < rank = 10 .
## 
## The final step used eigenvectors of MCD scatter.
cellMap(MacroPCA.out$stdResid)

# Red cells have higher value than predicted, blue cells lower,
# white cells are missing values, and all other cells are yellow.

TopGear dataset

The Top Gear data contains information on 297 cars.

library(robustHD)
data(TopGear)
dim(TopGear)
myTopGear = TopGear
rownames(myTopGear) = paste(myTopGear[,1],myTopGear[,2]) 
# These rownames are make and model of the cars.
rownames(myTopGear)[165] = "Mercedes-Benz G" # name was too long
myTopGear = myTopGear[,-31] # removes subjective variable `Verdict'

# Transform some skewed variables:
transTG = myTopGear
transTG$Price = log(myTopGear$Price)
transTG$Displacement = log(myTopGear$Displacement)
transTG$BHP = log(myTopGear$BHP)
transTG$Torque = log(myTopGear$Torque)
transTG$TopSpeed = log(myTopGear$TopSpeed)

# Check the data:
checkData = checkDataSet(transTG, silent = TRUE)
##  
##  The final data set we will analyze has 296 rows and 11 columns.
## 
# With option silent = FALSE we obtain more information:
# 
# The input data has 297 rows and 31 columns.
# 
# The input data contained 19 non-numeric columns (variables).
# Their column names are:
#   
#  [1] Maker         Model              Type               Fuel              
#  [5] DriveWheel    AdaptiveHeadlights AdjustableSteering AlarmSystem       
#  [9] Automatic     Bluetooth          ClimateControl     CruiseControl     
# [13] ElectricSeats Leather            ParkingSensors     PowerSteering     
# [17] SatNav        ESP                Origin            
# 
# These columns will be ignored in the analysis.
# We continue with the remaining 12 numeric columns:
#   
# [1] Price Cylinders Displacement BHP   Torque Acceleration TopSpeed    
# [8] MPG   Weight    Length       Width Height      
# 
# The data contained 1 rows with over 50% of NAs.
# Their row names are:
#   
#   [1] Citroen C5 Tourer
# 
# These rows will be ignored in the analysis.
# We continue with the remaining 296 rows:
#   
#   [1] Alfa Romeo Giulietta             Alfa Romeo MiTo                 
# .......
# [295] Volvo XC70                       Volvo XC90                      
# 
# The data contained 1 columns with zero or tiny median absolute deviation.
# Their column names are:
# 
# [1] Cylinders
# 
# These columns will be ignored in the analysis.
# We continue with the remaining 11 columns:
# 
# [1] Price  Displacement BHP   Torque Acceleration TopSpeed MPG         
# [8] Weight Length       Width Height      
# 
# The final data set we will analyze has 296 rows and 11 columns.

# The remainder of the dataset:
remTG = checkData$remX
dim(remTG) 
## [1] 296  11
colSums(is.na(remTG)) # we still have quite a few NA's, especially in Weight:
##        Price Displacement          BHP       Torque Acceleration     TopSpeed 
##            0            8            3            3            0            3 
##          MPG       Weight       Length        Width       Height 
##           11           32           10           15           10
# Check robust scale of the variables:
locscale = estLocScale(remTG)
round(locscale$scale,2)
##        Price Displacement          BHP       Torque Acceleration     TopSpeed 
##         0.60         0.38         0.59         0.60         3.51         0.17 
##          MPG       Weight       Length        Width       Height 
##        17.08       381.02       420.89        91.95       131.82
# The scales are clearly different, so we will standardize before PCA.
# This is the argument scale=TRUE in MacroPCApars below.

# Small option lists (MacroPCA will automatically extend them with the
# default choices for the other options):
DDCpars <- list(fastDDC = FALSE, silent = TRUE)
MacroPCApars <- list(DDCpars = DDCpars, scale = TRUE, silent = TRUE)
# Note that MacroPCA needs DDCpars because it first runs DDC.

# To choose the number k of principal components we can run MacroPCA with k=0:
MacroPCAtransTG0 <- MacroPCA(transTG,k=0,MacroPCApars=MacroPCApars)
##  
##  The final data set we will analyze has 296 rows and 11 columns.
## 

## 
## The cumulative percentage of explained variability 
## by the first 10 components is:
##  PC1  PC2  PC3  PC4  PC5  PC6  PC7  PC8  PC9 PC10 
## 70.3 83.4 90.4 93.3 95.3 96.7 97.8 98.7 99.3 99.7 
## 
## Based on explained variability >= 80% one would select k = 2.
## 
## Please use this information and the scree plot to select a value of k
## and rerun MacroPCA with it.
MacroPCAtransTG <- MacroPCA(transTG,k=2,MacroPCApars=MacroPCApars) # takes about 1 second.
##  
##  The final data set we will analyze has 296 rows and 11 columns.
## 
names(MacroPCAtransTG)
##  [1] "MacroPCApars" "remX"         "DDC"          "scaleX"       "k"           
##  [6] "loadings"     "eigenvalues"  "center"       "alpha"        "h"           
## [11] "It"           "diff"         "X.NAimp"      "scores"       "OD"          
## [16] "cutoffOD"     "SD"           "cutoffSD"     "highOD"       "highSD"      
## [21] "residScale"   "stdResid"     "indcells"     "NAimp"        "Cellimp"     
## [26] "Fullimp"
MacroPCAtransTG$MacroPCApars # these are all the options used, starting with DDCpars:
## $DDCpars
## $DDCpars$fracNA
## [1] 0.5
## 
## $DDCpars$numDiscrete
## [1] 3
## 
## $DDCpars$precScale
## [1] 1e-12
## 
## $DDCpars$cleanNAfirst
## [1] "automatic"
## 
## $DDCpars$tolProb
## [1] 0.99
## 
## $DDCpars$corrlim
## [1] 0.5
## 
## $DDCpars$combinRule
## [1] "wmean"
## 
## $DDCpars$returnBigXimp
## [1] FALSE
## 
## $DDCpars$silent
## [1] TRUE
## 
## $DDCpars$nLocScale
## [1] 25000
## 
## $DDCpars$fastDDC
## [1] FALSE
## 
## $DDCpars$standType
## [1] "1stepM"
## 
## $DDCpars$corrType
## [1] "gkwls"
## 
## $DDCpars$transFun
## [1] "wrap"
## 
## $DDCpars$nbngbrs
## [1] 100
## 
## 
## $kmax
## [1] 10
## 
## $alpha
## [1] 0.5
## 
## $scale
## [1] TRUE
## 
## $maxdir
## [1] 250
## 
## $distprob
## [1] 0.99
## 
## $silent
## [1] TRUE
## 
## $maxiter
## [1] 20
## 
## $tol
## [1] 0.005
## 
## $bigOutput
## [1] TRUE
MacroPCAtransTG$cutoffOD # cutoff value for orthogonal distances OD:
## [1] 1.737869
MacroPCAtransTG$cutoffSD # cutoff value for score distances OD:
## [1] 3.034854
length(MacroPCAtransTG$indcells) # list of flagged cells
## [1] 126

Also run ICPCA (iterative classical PCA):

ICPCAtransTG <- ICPCA(remTG,k=2,scale=TRUE,tolProb=0.99)
names(ICPCAtransTG)
##  [1] "scaleX"      "k"           "loadings"    "eigenvalues" "center"     
##  [6] "covmatrix"   "It"          "diff"        "X.NAimp"     "scores"     
## [11] "OD"          "cutoffOD"    "SD"          "cutoffSD"    "highOD"     
## [16] "highSD"      "residScale"  "stdResid"    "indcells"
# Compare imputed values for missings:
MacroPCAtransTG$X.NAimp[c(94,176),8]
##  Ford Kuga Mini Coupe 
##   1654.907   1125.878
ICPCAtransTG$X.NAimp[c(94,176),8] 
##  Ford Kuga Mini Coupe 
##   1774.286   1001.698
# CAR MAGAZINE: Ford Kuga Kuga 2.0 TDCi weighs 1605kg 
# CAR MAGAZINE: MINI Coupe 1.6T Cooper  weighs 1165kg
# Make untransformed submatrix of X for labeling the cells 
# in the residual plot:
tempTG = myTopGear[checkData$rowInAnalysis,checkData$colInAnalysis]
dim(tempTG)
## [1] 296  11
tempTG$Price = tempTG$Price/1000 # to avoid printing long numbers

showrows = c(12,42,50,51,52,59,72,94,98,135,150,164,176,
             180,195,196,198,209,210,215,219,234,259,277) # these 24 cars will be shown

# Make the ggplot2 objects for the residual maps by the function cellMap:
ggpICPCA = cellMap(ICPCAtransTG$stdResid, showcellvalues="D", 
                   D=tempTG, mTitle="ICPCA residual map", 
                   showrows=showrows, sizecellvalues = 0.7)

plot(ggpICPCA)

ggpMacroPCA = cellMap(MacroPCAtransTG$stdResid, showcellvalues="D",
                      D=tempTG, mTitle="MacroPCA residual map",
                       showrows=showrows, sizecellvalues = 0.7)
plot(ggpMacroPCA)

# Creating the combined pdf:
# pdf(file="TopGear_IPCA_MacroPCA_residualMap.pdf", width=12, height=10)
# gridExtra::grid.arrange(ggpICPCA, ggpMacroPCA, ncol=2) # arranges two plots on a page
# dev.copy(pdf, file="TopGear_IPCA_MacroPCA_residualMap.pdf", width=20, height=16)
# dev.off()
### Creating the outlier maps

outlierMap(MacroPCAtransTG,title="MacroPCA outlier map",
           labelOut=FALSE)
rowlabels = rownames(tempTG)
plotLabs = rep("",nrow(tempTG))
plotLabs[42]  = rowlabels[42] # BMW i3
plotLabs[50]  = "Bugatti" 
plotLabs[52]  = rowlabels[52] # Caterham Super 7
plotLabs[59]  = rowlabels[59] # Chevrolet Volt
plotLabs[180] = rowlabels[180] # Mitsubishi i-MiEV
plotLabs[195] = rowlabels[195] # Noble M600
plotLabs[196] = rowlabels[196] # Pagani Huayra
plotLabs[219] = rowlabels[219] # Renault Twizy
plotLabs[234] = rowlabels[234] # Ssangyong Rodius
plotLabs[259] = rowlabels[259] # Vauxhall Ampera
textPos = cbind(MacroPCAtransTG$SD,MacroPCAtransTG$OD)
textPos[42,1]  = textPos[42,1] -0.5 # BMW i3
textPos[42,2]  = textPos[42,2] +0.8 # BMW i3
textPos[50,1]  = textPos[50,1] -0.03 # Bugatti Veyron
textPos[50,2]  = textPos[50,2] +0.05 # Bugatti Veyron
textPos[52,1]  = textPos[52,1] +0.3  # Caterham Super 7
textPos[52,2]  = textPos[52,2] -0.5  # Caterham Super 7
textPos[59,1]  = textPos[59,1] -0.05 # Chevrolet Volt
textPos[59,2]  = textPos[59,2] -0.1  # Chevrolet Volt
textPos[180,1] = textPos[180,1] -1.2 # Mitsubishi i-MiEV
textPos[180,2] = textPos[180,2] +0.6 # Mitsubishi i-MiEV
textPos[195,1] = textPos[195,1] -0.05 # Noble M600
textPos[195,2] = textPos[195,2] +0.35 # Noble M600
textPos[196,1] = textPos[196,1] -0.6 # Pagani Huayra
textPos[196,2] = textPos[196,2] +0.7 # Pagani Huayra
textPos[219,1] = textPos[219,1] -0.03 # Renault Twizy
textPos[234,1] = textPos[234,1] +0.1 # Ssangyong Rodius
textPos[234,2] = textPos[234,2] +0.6 # Ssangyong Rodius
textPos[259,1] = textPos[259,1] -1.35 # Vauxhall Ampera
textPos[259,2] = textPos[259,2] +0.65 # Vauxhall Ampera
text(textPos,plotLabs,cex=0.8,pos=4)

# dev.copy(pdf,"TopGear_MacroPCA_outlierMap.pdf",width=6,height=6)
# dev.off()

outlierMap(ICPCAtransTG,title="ICPCA outlier map",labelOut=FALSE)
plotLabs = rep("",nrow(tempTG))
plotLabs[42]  = rowlabels[42]  # BMW i3
plotLabs[50]  = rowlabels[50]  # Bugatti Veyron
plotLabs[52]  = rowlabels[52]  # Caterham Super 7
plotLabs[59]  = rowlabels[59]  # Chevrolet Volt
plotLabs[180] = rowlabels[180] # Mitsubishi i-MiEV
plotLabs[195] = rowlabels[195] # Noble M600
plotLabs[196] = rowlabels[196] # Pagani Huayra
plotLabs[219] = rowlabels[219] # Renault Twizy
plotLabs[234] = rowlabels[234] # Ssangyong Rodius
plotLabs[259] = rowlabels[259] # Vauxhall Ampera
textPos = cbind(ICPCAtransTG$SD,ICPCAtransTG$OD)
textPos[50,1]  = textPos[50,1] -0.05 # Bugatti Veyron
textPos[50,2]  = textPos[50,2] +0.05 # Bugatti Veyron
textPos[52,1]  = textPos[52,1] -0.07 # Caterham Super 7
textPos[52,2]  = textPos[52,2] -0.37 # Caterham Super 7
textPos[59,1]  = textPos[59,1] -0.05 # Chevrolet Volt
textPos[59,2]  = textPos[59,2] -0.3  # Chevrolet Volt
textPos[180,1] = textPos[180,1] -1.2 # Mitsubishi i-MiEV
textPos[180,2] = textPos[180,2] +0.23 # Mitsubishi i-MiEV
textPos[195,1] = textPos[195,1] +0.25 # Noble M600
textPos[195,2] = textPos[195,2] +0.2 # Noble M600
textPos[196,1] = textPos[196,1] -0.05 # Pagani Huayra
textPos[196,2] = textPos[196,2] +0.32 # Pagani Huayra
textPos[219,1] = textPos[219,1] -0.8 # Renault Twizy
textPos[219,2] = textPos[219,2] +0.4 # Renault Twizy
textPos[234,1] = textPos[234,1] -0.05 # Ssangyong Rodius
textPos[234,2] = textPos[234,2] +0.2 # Ssangyong Rodius
textPos[259,1] = textPos[259,1] -0.9 # Vauxhall Ampera
textPos[259,2] = textPos[259,2] +0.4 # Vauxhall Ampera
text(textPos,plotLabs, cex=0.8, pos=4)

# dev.copy(pdf,"TopGear_ICPCA_outlierMap.pdf",width=6,height=6)
# dev.off()

TopGear dataset: prediction of new data

# For comparison, remake the residual map of the entire dataset, but now 
# showing the values of the residuals instead of the data values:

ggpMacroPCAres = cellMap(MacroPCAtransTG$stdResid, 
                         showcellvalues="R", sizecellvalues = 0.7,
                         mTitle="MacroPCA residual map",
                         showrows=showrows) 
plot(ggpMacroPCAres)

# Define the "initial" dataset as the rows not in these 24:
initX = remTG[-showrows,]
dim(initX) 
## [1] 272  11
# Fit initX:
MacroPCAinitX = MacroPCA(initX,k=2,MacroPCApars=MacroPCApars) 
##  
##  The final data set we will analyze has 271 rows and 11 columns.
## 
# Define the "new" data to predict:
newX = remTG[showrows,]
dim(newX) 
## [1] 24 11
# Make predictions by MacroPCApredict. 
# Its inputs are:
#
# Xnew            : the new data (test data), which must be a 
#                   matrix or a data frame. 
#                   It must always be provided.
# InitialMacroPCA : the output of the MacroPCA function on the 
#                   initial (training) dataset. Must be provided.
# MacroPCApars    : the input options to be used for the prediction.
#                   By default the options of InitialMacroPCA
#                   are used. For the complete list of options
#                   see the function MacroPCA.

predictMacroPCA = MacroPCApredict(newX,MacroPCAinitX)
# We did not need to specify the third argument because it is taken
# from the initial fit MacroPCAinitX .

names(predictMacroPCA)
##  [1] "MacroPCApars" "DDC"          "scaleX"       "k"            "loadings"    
##  [6] "eigenvalues"  "center"       "It"           "diff"         "Xnew.NAimp"  
## [11] "scores"       "OD"           "cutoffOD"     "SD"           "cutoffSD"    
## [16] "highOD"       "highSD"       "residScale"   "stdResid"     "indcells"    
## [21] "NAimp"        "Cellimp"      "Fullimp"
# The outputs are similar to those of of MacroPCA.

# Make the residual map:
ggpMacroPCApredict = cellMap(predictMacroPCA$stdResid, 
                             showcellvalues="R", sizecellvalues = 0.7,
                             mTitle="MacroPCApredict residual map")
plot(ggpMacroPCApredict) # is very similar to that based on all the data!

# Creating the combined pdf:
# pdf(file="TopGear_MacroPCApredict_residualMap.pdf",width=12,height=10)
# gridExtra::grid.arrange(ggpMacroPCAres,ggpMacroPCApredict,ncol=2)
# dev.off()

Glass dataset

The glass data consists of spectra with 750 wavelengths of 180 archaeological glass samples.

data(data_glass)
library(rrcov) # for robust PCA:
dim(data_glass) 
## [1] 180 750
# Do not scale the spectra in the glass data:
MacroPCApars$scale = FALSE

# Check data
checkData = checkDataSet(data_glass, silent=TRUE) 
##  
##  The final data set we will analyze has 180 rows and 737 columns.
## 
# With checkData = checkDataSet(glass, silent=FALSE) we obtain more information:
#
#  The input data has 180 rows and 750 columns.
#
#  The data contained 11 discrete columns with 3 or fewer values.
#  Their column names are:
#
#  [1] V1  V2  V3  V4  V5  V6  V7  V8  V9  V10 V11
#
#  These columns will be ignored in the analysis.
#  We continue with the remaining 739 columns:
#  
#   [1] V12  V13  V14  V15  V16  V17  V18  V19  V20  V21  V22  V23  V24  V25
#    ......
# [729] V740 V741 V742 V743 V744 V745 V746 V747 V748 V749 V750
#
#  The data contained 2 columns with zero or tiny median absolute deviation.
#  Their column names are:
#
# [1] V12 V13
#
#  These columns will be ignored in the analysis.
#  We continue with the remaining 737 columns:
#
#   [1] V14  V15  V16  V17  V18  V19  V20  V21  V22  V23  V24  V25  V26  V27
#    ......
# [729] V742 V743 V744 V745 V746 V747 V748 V749 V750
#
#  The final data set we will analyze has 180 rows and 737 columns.

remglass = checkData$remX
n <- nrow(remglass); n 
## [1] 180
d = ncol(remglass); d # the first 13 variables had scale 0:
## [1] 737
# Compare ICPCA and MacroPCA:

ICPCAglass <- ICPCA(remglass,k=4,scale=F,tolProb=0.99)

MacroPCAglass = MacroPCA(remglass,k=4,MacroPCApars=MacroPCApars) # takes 8 seconds

nrowsinblock = 5
rowlabels = rep("",floor(n/nrowsinblock));
rowlabels[1] = "1"
rowlabels[floor(n/nrowsinblock)] = "n";

ncolumnsinblock = 5
columnlabels = rep("",floor(d/ncolumnsinblock));
columnlabels[1] = "1";
columnlabels[floor(d/ncolumnsinblock)] = "d"

ggpICPCA <- cellMap(ICPCAglass$stdResid,
                    rowblocklabels=rowlabels,    
                    columnblocklabels=columnlabels,
                    mTitle="ICPCA residual map",
                    rowtitle="glass samples",
                    columntitle="wavelengths",
                    nrowsinblock=5,
                    ncolumnsinblock=5)

ggpMacroPCA <- cellMap(MacroPCAglass$stdResid,
                       rowblocklabels=rowlabels,
                       columnblocklabels=columnlabels,
                       mTitle="MacroPCA residual map",
                       rowtitle="glass samples",
                       columntitle="wavelengths",
                       nrowsinblock=5,
                       ncolumnsinblock=5)

grid.arrange(ggpMacroPCA,ggpICPCA,nrow=2) 

# pdfName = "Glass_MacroPCA_ICPCA_residualMap.pdf"
# dev.copy(pdf, pdfName, width=12, height=8)
# dev.off()

We now compare MacroPCA with ROBPCA:

library(rrcov) # only needed for PcaHubert()
ROBPCAglass = PcaHubert(remglass,k=4,alpha=0.5)
# Calculate ROBPCA residuals and standardize them:
Xhat = sweep(ROBPCAglass@scores %*% t(ROBPCAglass@loadings),
             2,ROBPCAglass@center,"+")
Xresid = remglass - Xhat
scaleRes = estLocScale(Xresid,type="1stepM",center=F)$scale
stdResidROBPCA = sweep(Xresid,2,scaleRes,"/")

ggpROBPCA <- cellMap(stdResidROBPCA,
                     rowblocklabels=rowlabels,
                     columnblocklabels=columnlabels,
                     mTitle="ROBPCA residual map",
                     rowtitle="glass samples",
                     columntitle="wavelengths",
                     nrowsinblock=5,
                     ncolumnsinblock=5)

grid.arrange(ggpMacroPCA, ggpROBPCA, nrow=2) 

# pdfName = "Glass_MacroPCA_ROBPCA_residualMap.pdf"
# dev.copy(pdf, pdfName, width=12, height=8)
# dev.off()

We now compare fastDDC=FALSE with fastDDC=TRUE in MacroPCA:

fastDDCpars = list(fastDDC=TRUE, silent=TRUE)
fastMacroPCApars = list(DDCpars=fastDDCpars, scale=FALSE, silent=TRUE)

fastMacroPCAglass = MacroPCA(data_glass,k=4,MacroPCApars=fastMacroPCApars) # 2 seconds
##  
##  The final data set we will analyze has 180 rows and 737 columns.
## 
ggpfastMacroPCA <- cellMap(fastMacroPCAglass$stdResid,
                           columnblocklabels=columnlabels,
                           rowblocklabels=rowlabels,
                           mTitle="MacroPCA with fastDDC=T",
                           columntitle="wavelengths",
                           rowtitle="glass samples",
                           ncolumnsinblock=5,
                           nrowsinblock=5)

grid.arrange(ggpMacroPCA, ggpfastMacroPCA, nrow=2) # The results are similar:

# pdfName = "Glass_MacroPCA_residualMaps.pdf"
# dev.copy(pdf, pdfName, width=12, height=8)
# dev.off()

DPOSS dataset

Finally we analyze the DPOSS data. This is a random subset of 20’000 stars from the Digitized Palomar Sky Survey described by Odewahn et al (1998).

data(data_dposs)
colnames(data_dposs); dim(data_dposs) 
##  [1] "MAperF" "MTotF"  "MCoreF" "AreaF"  "IR2F"   "csfF"   "EllipF" "MAperJ"
##  [9] "MTotJ"  "MCoreJ" "AreaJ"  "IR2J"   "csfJ"   "EllipJ" "MAperN" "MTotN" 
## [17] "MCoreN" "AreaN"  "IR2N"   "csfN"   "EllipN"
## [1] 20000    21
n = nrow(data_dposs); n
## [1] 20000
# Count missing cells
missmat = is.na(data_dposs)
sizemat = nrow(missmat)*ncol(missmat); sizemat 
## [1] 420000
100*sum(as.vector(missmat))/sizemat # 50.2% of the values are missing:
## [1] 50.20952
# Count rows with missings
missrow = length(which(rowSums(missmat) > 0))
100*missrow/nrow(missmat) # 84.6% of the rows contain missing values:
## [1] 84.61
# PERFORM ICPCA AND MACROPCA

ICPCAdposs = ICPCA(data_dposs,k=4,scale=TRUE)
names(ICPCAdposs)
##  [1] "scaleX"      "k"           "loadings"    "eigenvalues" "center"     
##  [6] "covmatrix"   "It"          "diff"        "X.NAimp"     "scores"     
## [11] "OD"          "cutoffOD"    "SD"          "cutoffSD"    "highOD"     
## [16] "highSD"      "residScale"  "stdResid"    "indcells"
# MacroPCA options with fracNA allowing for many NA's:
DDCPars = list(fastDDC=F,fracNA=1.0)
MacroPCAPars = list(DDCpars=DDCPars,scale=TRUE,silent=T)
MacroPCAdposs = MacroPCA(data_dposs,k=4,MacroPCApars=MacroPCAPars) # takes 6 seconds

## SCREE PLOTS

barplot(ICPCAdposs$eigenvalues,
        main="ICPCA scree plot", ylab="eigenvalues",
        names.arg=1:length(ICPCAdposs$eigenvalues))

# dev.copy(pdf,"DPOSS_ICPCA_screeplot.pdf",width=6,height=6)
# dev.off()

barplot(MacroPCAdposs$eigenvalues,
        main="MacroPCA scree plot", ylab="eigenvalues",
        names.arg=1:length(MacroPCAdposs$eigenvalues))

# Not as concentrated in the first eigenvalue.
# dev.copy(pdf,"DPOSS_MacroPCA_screeplot.pdf",width=6,height=6)
# dev.off()
## LOADINGS
ICPCAdposs$loadings[,2] = -ICPCAdposs$loadings[,2]

matplot(ICPCAdposs$loadings[,1:2],main="ICPCA loadings",
        xlab="variables",ylab="Loadings",col=c("black","blue"),
        ylim=c(-0.4,0.6),type="l",lty=c(1,2),lwd=2)
abline(v=7.5,col="red")
abline(v=14.5,col="red")

# dev.copy(pdf,"DPOSS_ICPCA_loadings.pdf",width=5,height=5)
# dev.off()

matplot(MacroPCAdposs$loadings[,1:2],main="MacroPCA loadings",
        xlab="variables",ylab="Loadings",col=c("black","blue"),
        ylim=c(-0.3,0.5),type="l",lty=c(1,2),lwd=2)
abline(v=7.5,col="red")
abline(v=14.5,col="red")

# dev.copy(pdf,"DPOSS_MacroPCA_loadings.pdf",width=5,height=5)
# dev.off()
# Select the 150 rows with highest OD from MacroPCA
dpossOD = MacroPCAdposs$OD
summary(dpossOD) 
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##   0.04344   0.45833   0.99144   2.01535   2.31592 145.53136
n = length(dpossOD); n
## [1] 20000
sortOD = order(dpossOD,1:n)
summary(sortOD); sortOD[1:5]
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1    5001   10000   10000   15000   20000
## [1] 10370  6891  1320 17270  7993
indHighOD = sortOD[n-150+1:n]
indHighOD = na.omit(indHighOD)
summary(indHighOD); length(indHighOD) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     157    5009    8222    9549   15101   19979
## [1] 150
indLowOD = sortOD[1:(n-150)]
indLowOD = na.omit(indLowOD)
summary(indLowOD); length(indLowOD)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1    5001   10014   10004   15000   20000
## [1] 19850
## OUTLIER MAPS

dpossColList = list(class1=list(col="black",index=indLowOD),
                    class2=list(col="red",index=indHighOD))

outlierMap(ICPCAdposs,title="ICPCA outlier map", 
           col=dpossColList,labelOut=FALSE)

# dev.copy(pdf,"DPOSS_ICPCA_outlierMap.pdf",width=8,height=8)
# dev.off()

outlierMap(MacroPCAdposs,title="MacroPCA outlier map", 
           col=dpossColList,labelOut=FALSE)

# dev.copy(pdf,"DPOSS_MacroPCA_outlierMap.pdf",width=8,height=8)
# dev.off()
## ICPCA SCORE PLOTS
ICPCAdposs$scores[,2] = -ICPCAdposs$scores[,2]

plot(ICPCAdposs$scores[,1:2],main="ICPCA scores",xlab="PC1",ylab="PC2")
points(ICPCAdposs$scores[indHighOD,1:2],pch=16,col="red")

# dev.copy(pdf,"DPOSS_ICPCA_Scores12.pdf",width=5,height=5)
# dev.off()

plot(ICPCAdposs$scores[,c(1,3)],main="ICPCA scores",xlab="PC1",ylab="PC3")
points(ICPCAdposs$scores[indHighOD,c(1,3)],pch=16,col="red")

plot(ICPCAdposs$scores[,2:3],main="ICPCA scores",xlab="PC2",ylab="PC3")
points(ICPCAdposs$scores[indHighOD,2:3],pch=16,col="red")

# MacroPCA SCORE PLOTS
MacroPCAdposs$scores[,2] = -MacroPCAdposs$scores[,2]
MacroPCAdposs$scores[,3] = -MacroPCAdposs$scores[,3]

plot(MacroPCAdposs$scores[,1:2],main="MacroPCA scores",xlab="PC1",ylab="PC2")
points(MacroPCAdposs$scores[indHighOD,1:2],pch=16,col="red")

# dev.copy(pdf,"DPOSS_MacroPCA_Scores12.pdf",width=5,height=5)
# dev.off()

plot(MacroPCAdposs$scores[,c(1,3)],main="MacroPCA scores",xlab="PC1",ylab="PC3")
points(MacroPCAdposs$scores[indHighOD,c(1,3)],pch=16,col="red")

plot(MacroPCAdposs$scores[,2:3],main="MacroPCA scores",xlab="PC2",ylab="PC3")
points(MacroPCAdposs$scores[indHighOD,2:3],pch=16,col="red")

# RESIDUAL MAPS

dpossH = na.omit(indHighOD)
summary(dpossH); length(as.vector(dpossH)) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     157    5009    8222    9549   15101   19979
## [1] 150
dpossL = na.omit(indLowOD)
summary(dpossL); length(as.vector(dpossL)) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1    5001   10014   10004   15000   20000
## [1] 19850
set.seed(0)
dpossH = sample(dpossH,150)
dpossL = sample(dpossL,300)
showrowsdposs = c(dpossH,dpossL)
summary(showrowsdposs)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     150    5110    9818   10115   15554   19991
length(showrowsdposs) 
## [1] 450
rowlabels = c("OS1","OS2","OS3","OS4","OS5","OS6",
              "S1","S2","S3","S4","S5","S6","S7","S8",
              "S9","S10","S11","S12")


ggpICPCAdposs = cellMap(ICPCAdposs$stdResid,
                        rowblocklabels=rowlabels,
                        mTitle="ICPCA residual map",
                        rowtitle="",
                        showrows=showrowsdposs,
                        nrowsinblock=25,
                        ncolumnsinblock=1,
                        sizetitles=1.5)
plot(ggpICPCAdposs) # not much to see:

# dev.copy(pdf,"DPOSS_ICPCA_residualMap.pdf",width=8,height=6)
# dev.off()


ggpMacroPCAdposs = cellMap(MacroPCAdposs$stdResid, 
                           rowblocklabels=rowlabels,
                           mTitle="MacroPCA residual map",
                           rowtitle="",
                           showrows=showrowsdposs,
                           nrowsinblock=25,
                           ncolumnsinblock=1,
                           sizetitles=1.5)
plot(ggpMacroPCAdposs) # interesting structure:

# dev.copy(pdf,"DPOSS_MacroPCA_residualMap.pdf",width=8,height=6)
# dev.off()