--- title: "dicepro - Real Data Workflow (BlueCode + CellMixtures)" author: "dicepro Team" date: "`r Sys.Date()`" output: rmarkdown::html_vignette: toc: true toc_depth: 3 number_sections: true vignette: > %\VignetteIndexEntry{dicepro - Real Data Workflow (BlueCode + CellMixtures)} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5, warning = FALSE, message = FALSE, eval = FALSE ) ``` > **Note:** All code chunks have `eval = FALSE` and are shown for > illustration only. To run them interactively: > ```r > library(dicepro) > # copy-paste the chunks below into your R session > ``` # Overview This vignette demonstrates a complete dicepro deconvolution workflow on **real RNA-seq data** using: - **BlueCode** -- the 34-cell-type reference signature matrix bundled with the package, spanning five major tissue compartments (Immune, Stromal, Endothelial, Epithelial, and Muscle). - **CellMixtures** -- a bulk RNA-seq data-set of experimentally mixed cell populations (12 samples, ~31 000 genes). --- # Data Loading ## BlueCode Reference Signature Matrix ```{r load-bluecode, eval=FALSE} library(dicepro) data(BlueCode) cat("BlueCode dimensions :", dim(BlueCode), "\n") cat("Number of cell types :", ncol(BlueCode), "\n") cat("Number of genes :", nrow(BlueCode), "\n") print(head(colnames(BlueCode), 5L)) ``` ```{r bluecode-compartments, eval=FALSE} compartments <- list( Immune = colnames(BlueCode)[1:9], Stromal = colnames(BlueCode)[10:17], Endothelial = colnames(BlueCode)[18:20], Epithelial = colnames(BlueCode)[21:25], Muscle = colnames(BlueCode)[26:34] ) for (comp in names(compartments)) { cat(sprintf(" %s (%d): %s\n", comp, length(compartments[[comp]]), paste(compartments[[comp]], collapse = ", "))) } ``` ## CellMixtures Bulk Dataset ```{r load-cellmixtures, eval=FALSE} data(CellMixtures) cat("CellMixtures dimensions :", dim(CellMixtures), "\n") cat("Sample names :", colnames(CellMixtures), "\n") cat("First 5 gene names :", head(rownames(CellMixtures), 5L), "\n") ``` --- # Data Inspection ## Gene Overlap ```{r gene-overlap, eval=FALSE} n_ref <- nrow(BlueCode) n_bulk <- nrow(CellMixtures) n_common <- length(intersect(rownames(BlueCode), rownames(CellMixtures))) cat(sprintf( "Reference genes : %d\nBulk genes : %d\nCommon genes : %d (%.1f%% of reference)\n", n_ref, n_bulk, n_common, 100 * n_common / n_ref )) ``` ## Expression Distribution ```{r expr-dist, eval=FALSE} log2_bulk <- log2(as.matrix(CellMixtures) + 1) boxplot( log2_bulk, las = 2, col = "#2c7bb680", ylab = expression(log[2](counts + 1)), main = "CellMixtures: expression distribution per sample" ) ``` --- # Deconvolution with `dicepro()` ```{r run-dicepro, eval=FALSE} out <- dicepro( reference = as.matrix(BlueCode), bulk = as.matrix(CellMixtures), methodDeconv = "FARDEEP", W_prime = 0, bulkName = "CellMixtures", refName = "BlueCode", hp_max_evals = 150L, algo_select = "random", output_path = tempdir(), hspaceTechniqueChoose = "all" ) ``` --- # Results ## Optimal Hyperparameters ```{r best-hp, eval=FALSE} cat("Best lambda :", out$hyperparameters$lambda, "\n") cat("Best gamma :", out$hyperparameters$gamma, "\n") cat("Loss :", out$metrics$loss, "\n") cat("Constraint :", out$metrics$constraint, "\n") ``` ## Hyperparameter Optimization Report ```{r plot-hyperopt, eval=FALSE, fig.height=9} out$plot_hyperopt ``` ## Pareto Frontier ```{r plot-pareto, eval=FALSE} out$plot ``` ## Estimated Cell-Type Proportions ```{r proportion-heatmap, eval=FALSE, fig.height=6} prop_mat <- as.matrix(out$H) prop_sorted <- prop_mat[, order(colMeans(prop_mat), decreasing = TRUE)] heatmap( prop_sorted, Rowv = NA, Colv = NA, col = colorRampPalette(c("white", "#2c7bb6", "#d7191c"))(50L), scale = "none", margins = c(12, 6), main = "Estimated cell-type proportions -- CellMixtures", xlab = "Cell type", ylab = "Sample" ) ``` ## Top Contributing Cell Types ```{r top-ct, eval=FALSE, fig.height=4} mean_props <- sort(colMeans(prop_mat), decreasing = TRUE) par(mar = c(10, 4, 3, 1)) barplot( mean_props, las = 2, col = "#2c7bb6", ylab = "Mean proportion", main = "Mean estimated proportions across samples", cex.names = 0.65 ) ``` ## Per-Sample Composition ```{r stacked-bar, eval=FALSE, fig.height=5} top10 <- names(sort(colMeans(prop_mat), decreasing = TRUE))[seq_len(10L)] prop_top10 <- prop_mat[, top10, drop = FALSE] cols <- colorRampPalette( c("#2c7bb6", "#abd9e9", "#ffffbf", "#fdae61", "#d7191c", "#1a9641", "#a6d96a", "#762a83", "#c2a5cf", "#e7d4e8") )(10L) barplot( t(prop_top10), col = cols, legend = colnames(prop_top10), args.legend = list(x = "topright", cex = 0.55, ncol = 2L), las = 1, ylab = "Proportion", xlab = "Sample", main = "Per-sample cell composition (top 10 cell types)", border = NA ) ``` ## Compartment-Level Summary ```{r compartment-summary, eval=FALSE, fig.height=4} ct_to_comp <- c( setNames(rep("Immune", 9L), compartments$Immune), setNames(rep("Stromal", 8L), compartments$Stromal), setNames(rep("Endothelial", 3L), compartments$Endothelial), setNames(rep("Epithelial", 5L), compartments$Epithelial), setNames(rep("Muscle", 9L), compartments$Muscle) ) shared_ct <- intersect(colnames(prop_mat), names(ct_to_comp)) comp_props <- vapply( unique(ct_to_comp), function(comp) { cts <- names(ct_to_comp)[ct_to_comp == comp & names(ct_to_comp) %in% shared_ct] if (length(cts) == 0L) return(NA_real_) mean(rowSums(prop_mat[, cts, drop = FALSE])) }, numeric(1L) ) comp_cols <- c( Immune = "#2c7bb6", Stromal = "#fdae61", Endothelial = "#1a9641", Epithelial = "#d7191c", Muscle = "#762a83" ) barplot( sort(comp_props, decreasing = TRUE), col = comp_cols[names(sort(comp_props, decreasing = TRUE))], ylab = "Mean proportion", las = 1, main = "Mean estimated proportion by tissue compartment", border = NA ) ``` --- # Session Info ```{r session-info, eval=FALSE} sessionInfo() ```