Installing SuperCell package from gitHub
if (!requireNamespace("remotes")) install.packages("remotes")
remotes::install_github("GfellerLab/SuperCell")
Data available at authors’ GitHub under file name sincell_with_class_5cl.Rdata.
(i.e., 20
times less metacells (called ‘supercells’ in
the package functions) than single cells) by first building a kNN (k = 5) network using top n.var.genes = 1000
most variable genes for dimentionality reduction. Function
SCimplify()
computes the partition into metacells, this
information is available with the field membership
.
gamma <- 20 # graining level
k.knn <- 5
# Building metacells from gene expressio (GE)
SC <- SCimplify(GE, # gene expression matrix
k.knn = k.knn, # number of nearest neighbors to build kNN network
gamma = gamma, # graining level
n.var.genes = 1000 # number of the top variable genes to use for dimentionality reduction
)
# Alternative, metacells can be build from low-dimensional embedding.
# For this, first compute low-dim embedding and pass it into `SCimplify_from_embedding()`
if(0){
SC_alt <- SCimplify_from_embedding(
X = stats::prcomp(Matrix::t(GE[SC$genes.use,]), rank. = 10)$x, # PCA embedding
k.knn = k.knn, # number of nearest neighbors to build kNN network
gamma = gamma # graining level)
)
}
# plot network of metacells
supercell_plot(SC$graph.supercells, # network
color.use = "gray", # color of the nodes
main = paste("Metacell network, gamma =", gamma),
seed = 1)
# plot single-cell network
supercell_plot(SC$graph.singlecell, # network
group = cell.meta, # colored by cell line assignment
do.frames = F, # not drawing frames around each node
main = paste("Single-cell network, N =", dim(GE)[2]),
lay.method = "components") # method to compute the network 2D embedding
To get a gene expression of metacells, we need to average gene
expressions within each metacell with function
supercell_GE()
We now assign each metcell to a particular cell line based on the
cell line data, for this, we use function
supercell_assign()
. By default, this function assign each
metacell to a cluster with the largest Jaccard coefficient to avoid
biases towards very rare or very abundant clusters. Alternatively,
assigmnent can be performed using relative (may cause biase towards very
small populations) or absolute (may cause bias towards large
populations) abundance with method = "relative"
or
method = "absolute"
, respectively.
SC$cell_line <- supercell_assign(clusters = cell.meta, # single-cell assigment to cell lines (clusters)
supercell_membership = SC$membership, # single-cell assignment to metacells
method = "jaccard")
seed <- 1 # seed for network plotting
# plot network of metacells colored by cell line assignment
supercell_plot(SC$graph.supercells,
group = SC$cell_line,
seed = seed,
main = "Metacells colored by cell line assignment")
The quality of assignment can be evaluated with metacell purity
(function supercell_purity()
) that returns the proportion
of the most abundant cell type (in this case, cell line) in each
metacell.
# compute purity of metacells in terms of cell line composition
purity <- supercell_purity(clusters = cell.meta,
supercell_membership = SC$membership, method = 'max_proportion')
hist(purity, main = "Purity of metacells \nin terms of cell line composition")
Some options to plot networks of metacells
## rotate network to be more consistent with the single-cell one
supercell_plot(SC$graph.supercells,
group = SC$cell_line,
seed = seed,
alpha = -pi/2,
main = "Metacells colored by cell line assignment (rotated)")
## alternatively, any layout can be provided as 2xN numerical matrix, where N is number of nodes (cells)
## Let's plot metacell network using the layout of the single-cell network:
## 1) get single-cell network layout
my.lay.sc <- igraph::layout_components(SC$graph.singlecell)
## 2) compute metacell network layout averaging coordinates withing metacells
my.lay.SC <- Matrix::t(supercell_GE(ge = t(my.lay.sc), groups = SC$membership))
## 3) provide layout with the parameter $lay$
supercell_plot(SC$graph.supercells,
group = SC$cell_line,
lay = my.lay.SC,
main = "Metacells colored by cell line assignment (averaged coordinates)")
#dimensionality reduction
SC.PCA <- supercell_prcomp(Matrix::t(SC.GE), # metacell gene expression matrix
genes.use = SC$genes.use, # genes used for the coarse-graining, but any set can be provided
supercell_size = SC$supercell_size, # sample-weighted pca
k = 20)
## compute distance
D <- dist(SC.PCA$x)
## cluster metacells
SC.clusters <- supercell_cluster(D = D, k = 5, supercell_size = SC$supercell_size)
SC$clustering <- SC.clusters$clustering
## mapping metacell cluster to cell line
map.cluster.to.cell.line <- supercell_assign(supercell_membership = SC$clustering, clusters = SC$cell_line)
## clustering as cell line
SC$clustering_reordered <- map.cluster.to.cell.line[SC$clustering]
supercell_plot(SC$graph.supercells,
group = SC$clustering_reordered,
seed = seed,
alpha = -pi/2,
main = "Metacells colored by cluster")
markers.all.positive <- supercell_FindAllMarkers(ge = SC.GE, # metacell gene expression matrix
supercell_size = SC$supercell_size, # size of metacell for sample-weighted method
clusters = SC$clustering_reordered, # clustering
logfc.threshold = 1, # mininum log fold-change
only.pos = T) # keep only upregulated genes
markers.all.positive$H2228[1:20,]
#> p.value adj.p.value pct.1 pct.2 logFC w.mean.1 w.mean.2
#> S100A9 0 0 1 0.9821429 4.900415 3.407261 0.50021541
#> CD74 0 0 1 0.9515306 4.706122 5.160220 0.33755798
#> SAA1 0 0 1 0.9623724 4.645834 5.692237 0.70177437
#> CXCL1 0 0 1 0.9815051 4.597609 5.086265 0.34463066
#> CXCL6 0 0 1 0.9081633 3.909404 3.755125 0.15881505
#> LCN2 0 0 1 1.0000000 3.870886 6.867711 1.49507670
#> HLA-B 0 0 1 1.0000000 3.565492 5.059566 1.22965559
#> HLA-C 0 0 1 0.9751276 3.384311 4.180295 0.73496480
#> CXCL8 0 0 1 0.8641582 3.156071 3.184266 0.14175113
#> HLA-DRA 0 0 1 0.8131378 3.072904 2.809587 0.09010310
#> IGFBP3 0 0 1 1.0000000 3.024813 4.496049 1.31677725
#> BIRC3 0 0 1 0.8463010 2.994637 3.182946 0.21117832
#> HLA-DRB1 0 0 1 0.7295918 2.885790 2.644877 0.07274146
#> CCL20 0 0 1 0.8469388 2.814461 2.752270 0.08244648
#> HLA-A 0 0 1 0.9929847 2.656370 4.532330 1.91215341
#> MMP7 0 0 1 0.9317602 2.645386 3.085071 0.53916665
#> SAA2 0 0 1 0.7646684 2.597852 2.576263 0.15414246
#> WFDC2 0 0 1 0.6721939 2.376167 2.171900 0.10202708
#> HLA-DRB5 0 0 1 0.5943878 2.358504 2.256209 0.04683749
#> HLA-DPA1 0 0 1 0.6543367 2.316558 2.109882 0.05200388
genes.to.plot <- c("DHRS2", "MT1P1", "TFF1", "G6PD", "CD74", "CXCL8")
supercell_VlnPlot(ge = SC.GE,
supercell_size = SC$supercell_size,
clusters = SC$clustering_reordered,
features = genes.to.plot,
idents = c("H1975", "H2228", "A549"),
ncol = 3)
supercell_GeneGenePlot(ge = SC.GE,
gene_x = genes.to.plot[1:3],
gene_y = genes.to.plot[4:6],
supercell_size = SC$supercell_size,
clusters = SC$clustering_reordered,)
#> $p
#>
#> $w.cor
#> $w.cor$TFF1_CXCL8
#> [1] -0.2530462
#>
#> $w.cor$DHRS2_G6PD
#> [1] -0.2143222
#>
#> $w.cor$MT1P1_CD74
#> [1] 0.08085401
#>
#>
#> $w.pval
#> $w.pval$TFF1_CXCL8
#> [1] 5.350818e-30
#>
#> $w.pval$DHRS2_G6PD
#> [1] 8.66973e-22
#>
#> $w.pval$MT1P1_CD74
#> [1] 0.0003406761
supercell_rescale()
functionSC10 <- supercell_rescale(SC, gamma = 10)
SC10$cell_line <- supercell_assign(clusters = cell.meta, # single-cell assigment to cell lines (clusters)
supercell_membership = SC10$membership, # single-cell assignment to metacells
method = "jaccard")
supercell_plot(SC10$graph.supercells,
group = SC10$cell_line,
seed = 1,
main = "Metacells at gamma = 10 colored by cell line assignment")
In case you want to perform other analyses available with Seurat
package, we can convert SuperCell to Seurat object with
function supercell_2_Seurat()
or to SingleCellExperiment
object with function ‘supercell_2_sce()’. Let consider a Seurat
example.
#install.packages("Seurat")
library(Seurat)
#> Loading required package: SeuratObject
#> Loading required package: sp
#> 'SeuratObject' was built under R 4.4.0 but the current version is
#> 4.4.1; it is recomended that you reinstall 'SeuratObject' as the ABI
#> for R may have changed
#> 'SeuratObject' was built with package 'Matrix' 1.7.0 but the current
#> version is 1.7.1; it is recomended that you reinstall 'SeuratObject' as
#> the ABI for 'Matrix' may have changed
#>
#> Attaching package: 'SeuratObject'
#> The following objects are masked from 'package:base':
#>
#> intersect, t
m.seurat <- supercell_2_Seurat(SC.GE = as.matrix(SC.GE), SC = SC, fields = c("cell_line", "clustering","clustering_reordered"))
#> Warning: Data is of class matrix. Coercing to dgCMatrix.
#> Normalizing layer: counts
#> Done: NormalizeData
#> Doing: data to normalized data
#> Doing: weighted scaling
#> Done: weighted scaling
#> Warning in irlba(A = t(x = object), nv = npcs, ...): You're computing too large
#> a percentage of total singular values, use a standard svd instead.
#> Computing nearest neighbor graph
#> Computing SNN
Note: since metacells have different size (consist of different
number of cells), we apply sample-weighted algorithms at most af the
steps of the downstream analyses. Thus, when coercing SuperCell to
Seurat, we replaced PCA, saling and kNN graph of Seurat object with
those obtained applying sample-weighted version of PCA, scaling or
SuperCell graph (i.e., metacell network), respectively. If you then
again apply RunPCA
, ScaleData
, or
FindNeighbors
, the result will be rewritten, but you will
be able to access them with
Embeddings(m.seurat, reduction = "pca_weigted")
,
m.seurat@assays$RNA@misc[["scale.data.weighted"]]
, or
m.seurat@graphs$RNA_super_cells
, respectively.
### cluster SuperCell network (unweighted clustering)
m.seurat <- FindClusters(m.seurat, graph.name = "RNA_nn") # now RNA_nn is metacell network
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 98
#> Number of edges: 255
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.8185
#> Number of communities: 7
#> Elapsed time: 0 seconds
#> 1 singletons identified. 6 final clusters.
m.seurat <- FindNeighbors(m.seurat, verbose = FALSE) # RNA_nn has been replaced with kNN graph of metacell (unweigted)
m.seurat <- FindClusters(m.seurat, graph.name = "RNA_nn")
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 98
#> Number of edges: 960
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.7410
#> Number of communities: 4
#> Elapsed time: 0 seconds