---
title: "Importing From Other Sources"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Importing-From-Other-Sources}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE
)
```

MOCHA is intended for use after clustering and cell labeling, since all functions of MOCHA operate on per-sample basis within each cell population. MOCHA can be used with an ArchR Project as input, as well as with a generic input. This tutorial has two sections: demonstrating MOCHA with generic inputs - [Importing from Signac](#signac) and [Importing from SnapATAC](#snapatac).

## Importing from Signac {#signac}

This vignette will follow the [Signac tutorial](https://stuartlab.org/signac/articles/pbmc_multiomic.html) to generate an Signac object and label cell types within it. If you already have a Signac object with cell types labelled, skip to section [Extract Fragments from Signac](#extract-frags-signac)

```{r}
library(Signac)
library(Seurat)
library(SeuratDisk)
library(EnsDb.Hsapiens.v86)
library(BSgenome.Hsapiens.UCSC.hg38)
set.seed(1234)
```

### Signac Tutorial Through Clustering

```{r}
# Download files
system('wget https://cf.10xgenomics.com/samples/cell-arc/1.0.0/pbmc_granulocyte_sorted_10k/pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5')
system('wget https://cf.10xgenomics.com/samples/cell-arc/1.0.0/pbmc_granulocyte_sorted_10k/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz')
system('wget https://cf.10xgenomics.com/samples/cell-arc/1.0.0/pbmc_granulocyte_sorted_10k/pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz.tbi')

# Load in ATAC and RNA data
counts <- Read10X_h5("pbmc_granulocyte_sorted_10k_filtered_feature_bc_matrix.h5")
fragpath <- "pbmc_granulocyte_sorted_10k_atac_fragments.tsv.gz"

```

```{r}
# create a Seurat object containing the RNA adata
pbmc <- CreateSeuratObject(
  counts = counts$`Gene Expression`,
  assay = "RNA"
)

# create ATAC assay and add it to the object
pbmc[["ATAC"]] <- CreateChromatinAssay(
  counts = counts$Peaks,
  sep = c(":", "-"),
  fragments = fragpath,
  annotation = annotation
)

DefaultAssay(pbmc) <- "ATAC"
pbmc <- NucleosomeSignal(pbmc)
pbmc <- TSSEnrichment(pbmc)

pbmc <- subset(
  x = pbmc,
  subset = nCount_ATAC < 100000 &
    nCount_RNA < 25000 &
    nCount_ATAC > 1000 &
    nCount_RNA > 1000 &
    nucleosome_signal < 2 &
    TSS.enrichment > 1
)

# Transform RNA 
DefaultAssay(pbmc) <- "RNA"
pbmc <- SCTransform(pbmc)
pbmc <- RunPCA(pbmc)
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
```

Label cell types:

```{r}
# load PBMC reference
system('wget https://atlas.fredhutch.org/data/nygc/multimodal/pbmc_multimodal.h5seurat')
reference <- LoadH5Seurat("pbmc_multimodal.h5seurat")
```

```{r}
DefaultAssay(pbmc) <- "SCT"

# transfer cell type labels from reference to query
transfer_anchors <- FindTransferAnchors(
  reference = reference,
  query = pbmc,
  normalization.method = "SCT",
  reference.reduction = "spca",
  recompute.residuals = FALSE,
  dims = 1:50
)

predictions <- TransferData(
  anchorset = transfer_anchors, 
  refdata = reference$celltype.l2,
  weight.reduction = pbmc[['pca']],
  dims = 1:50
)

pbmc <- AddMetaData(
  object = pbmc,
  metadata = predictions
)
```

```{r}
# set the cell identities to the cell type predictions
Idents(pbmc) <- "predicted.id"

# set a reasonable order for cell types to be displayed when plotting
levels(pbmc) <- c("CD4 Naive", "CD4 TCM", "CD4 CTL", "CD4 TEM", "CD4 Proliferating",
                  "CD8 Naive", "dnT",
                 "CD8 TEM", "CD8 TCM", "CD8 Proliferating", "MAIT", "NK", "NK_CD56bright",
                 "NK Proliferating", "gdT",
                 "Treg", "B naive", "B intermediate", "B memory", "Plasmablast",
                 "CD14 Mono", "CD16 Mono",
                 "cDC1", "cDC2", "pDC", "HSPC", "Eryth", "ASDC", "ILC", "Platelet")

saveRDS(pbmc, 'pbmc_Signac_tutorial.rds')
```

### Extract Fragments from Signac Object {#extract-frags-signac}

```{r}
pbmc <- readRDS('pbmc_Signac_tutorial.rds')

DefaultAssay(pbmc) <- 'ATAC'
fragObj <- Fragments(pbmc)
```

Signac's training dataset only has one sample, so to simulate multiple samples, we will duplicate the data here. ***This should not be necessary with a large dataset.***

```{r}
full_pbmc <- merge(
  pbmc, 
  y = c(pbmc, pbmc), 
  add.cell.ids = c('Sample1', 'Sample2', 'Sample3'),
  project = 'DuplicateData'
) 
```

For a larger dataset, you would interact over the fragObj list and extract each fragments.tsv.gz file.

Instead, we're just duplicating data for the sake of this tutorial.

More importantly, you do need to modify the barcode in the fragment file so it matches the barcodes in Seurat's metadata.

```{r}
fragList <- parallel::mclapply(1:3, function(x){
    frags <- read.table(GetFragmentData(fragObj[[1]]))
    names(frags) <- c('chr', 'start', 'end', 'barcode', 'val')
    frags$barcode <- paste("Sample",x,"_", frags$barcode, sep ='')
    frags
}, mc.cores = 3)
names(fragList) <- c('Sample1', 'Sample2', 'Sample3')
```

### Format metadata and fragments for MOCHA

Generate your sample/cell type list by finding all combinations of Samples and Cell Populations

Here we must also rename the GRangesList to have names in the format `CellPopulation#Sample`.

```{r}
celltype_sample_list <- apply(expand.grid(unique(pbmc@meta.data$predicted.id), names(fragList)), 1, paste, collapse = "#")
# Extract metadata, add the Sample column, as well as CellBarcode. 
fullMeta <- full_pbmc@meta.data
fullMeta$Sample = gsub("_.*","", rownames(full_pbmc@meta.data))
fullMeta$CellBarcode = gsub(".*_","", rownames(full_pbmc@meta.data))

# Change the CellPopulation_Sample format to CellPopulation#Sample for MOCHA
rownames(fullMeta) <- gsub("_","#", rownames(fullMeta))
CellType_GRanges <- pbapply::pblapply(cl = 30, celltype_sample_list, function(x){
  celltype <- gsub("#.*","", x)
  sample <- gsub(".*#","", x)
  sortedMeta <- dplyr::filter(fullMeta, predicted.id == celltype, Sample == sample)
  sortedFrags <- dplyr::filter(fragList[[sample]], barcode %in% rownames(sortedMeta))
  makeGRangesFromDataFrame(sortedFrags, keep.extra.columns = TRUE)
})

names(CellType_GRanges) <- celltype_sample_list
```

Calculate the study signal (the median number of fragments per cell)

```{r}
avg_reads <- lapply(fragList, function(x){
                filtFrag <- dplyr::filter(as.data.frame(x), barcode  %in% rownames(fullMeta))
                as.vector(table(filtFrag$barcode))
    })
studySignal <- median(unlist(avg_reads))
```

### Call Open Tiles with MOCHA

```{r}
# Our blacklist comes included with Signac
blackList <- blacklist_hg38_unified
# Call Open Tiles
tileResults <- MOCHA::callOpenTiles(ATACFragments = CellType_GRanges,
                             cellColData = fullMeta, 
                             blackList = blackList, 
                             genome = 'BSgenome.Hsapiens.UCSC.hg38',
                             cellPopLabel = 'predicted.id',
                             cellPopulations = fullMeta$predicted.id,
                             studySignal = studySignal,
                             cellCol = 'barcode',
                             TxDb = "TxDb.Hsapiens.UCSC.hg38.refGene",
                             Org = "org.Hs.eg.db", 
                            outDir = paste(getwd(),'/MOCHA_Out', sep = ''), numCores= 35)

TSAM <- MOCHA::getSampleTileMatrix(tileResults, threshold = 0.2, numCores = 3, verbose = TRUE)
```

## Importing from SnapATAC {#snapatac}

In this example, we follow the [SnapATAC 10X PBMC tutorial](https://github.com/r3fang/SnapATAC/tree/master/examples/10X_PBMC_15K#data_download) through the clustering step before extracting the fragments and cell metadata necessary for MOCHA.

If you have a fully-formed Snap file for your analysis with clustering results and sample information added to the metadata, skip to section [Formatting Snap File Metadata](#format-metadata). If following the vignette, we STRONGLY recommending installing this patched version of SnapATAC from [this repository](https://github.com/imran-aifi/SnapATAC) with `devtools::install_github("imran-aifi/SnapATAC")`.

### SnapATAC Tutorial through Clustering

Download the all the SnapATAC Tutorial Data (linked above) to your working directory, and load it:

``` bash
# CLI tool for installing from Google Drive share links
pip install gdown
# https://drive.google.com/file/d/1YiYd_Ydes3tqsJGpNuqQquUOoVj2EEjE/view?usp=share_link
gdown https://drive.google.com/uc?id=1YiYd_Ydes3tqsJGpNuqQquUOoVj2EEjE
# https://drive.google.com/file/d/1NvGn4M2_HD06PL5Nj2if5xO-uVA0y8Q5/view?usp=share_link
gdown https://drive.google.com/uc?id=1NvGn4M2_HD06PL5Nj2if5xO-uVA0y8Q5
# https://drive.google.com/file/d/1LUOqsXoQN6lVx-4RNlgH90e5oXQ0y9Bd/view?usp=share_link
gdown https://drive.google.com/uc?id=1LUOqsXoQN6lVx-4RNlgH90e5oXQ0y9Bd
# https://drive.google.com/file/d/1oMJ6wFsfS-q-sY_yaLEtnYebM7RrBix6/view?usp=share_link
gdown https://drive.google.com/uc?id=1oMJ6wFsfS-q-sY_yaLEtnYebM7RrBix6
# https://drive.google.com/file/d/1SEFZ5CJgcmoAkmo4_1kCMYS60YOFP379/view?usp=share_link
gdown https://drive.google.com/uc?id=1SEFZ5CJgcmoAkmo4_1kCMYS60YOFP379
# https://drive.google.com/file/d/1RlBvTCqz6mhaTAkfYiCdeojD-U2mN2wp/view?usp=share_link
gdown https://drive.google.com/uc?id=1RlBvTCqz6mhaTAkfYiCdeojD-U2mN2wp
```

```{r}
library(SnapATAC);
snap.files = c(
  "atac_pbmc_5k_nextgem.snap", 
  "atac_pbmc_10k_nextgem.snap"
);
sample.names = c(
  "PBMC 5K",
  "PBMC 10K"
);
barcode.files = c(
  "atac_pbmc_5k_nextgem_singlecell.csv",
  "atac_pbmc_10k_nextgem_singlecell.csv"
);
x.sp.ls = lapply(seq(snap.files), function(i){
  createSnap(
      file=snap.files[i],
      sample=sample.names[i]
  );
})
names(x.sp.ls) = sample.names;
barcode.ls = lapply(seq(snap.files), function(i){
  barcodes = read.csv(
      barcode.files[i], 
      head=TRUE
  );
  barcodes = barcodes[2:nrow(barcodes),];
  barcodes$logUMI = log10(barcodes$passed_filters + 1);
  barcodes$promoter_ratio = (barcodes$promoter_region_fragments+1) / (barcodes$passed_filters + 1);
  barcodes
})
x.sp.ls
```

```{r}
# for both datasets, we identify usable barcodes using [3.5-5] for log10(UMI) and [0.4-0.8] for promoter ratio as cutoff.
cutoff.logUMI.low = c(3.5, 3.5);
cutoff.logUMI.high = c(5, 5);
cutoff.FRIP.low = c(0.4, 0.4);
cutoff.FRIP.high = c(0.8, 0.8);
barcode.ls = lapply(seq(snap.files), function(i){
  barcodes = barcode.ls[[i]];
  idx = which(
      barcodes$logUMI >= cutoff.logUMI.low[i] & 
      barcodes$logUMI <= cutoff.logUMI.high[i] & 
      barcodes$promoter_ratio >= cutoff.FRIP.low[i] &
      barcodes$promoter_ratio <= cutoff.FRIP.high[i]
  );
  barcodes[idx,]
});
x.sp.ls = lapply(seq(snap.files), function(i){
  barcodes = barcode.ls[[i]];
  x.sp = x.sp.ls[[i]];
  barcode.shared = intersect(x.sp@barcode, barcodes$barcode);
  x.sp = x.sp[match(barcode.shared, x.sp@barcode),];
  barcodes = barcodes[match(barcode.shared, barcodes$barcode),];
  x.sp@metaData = barcodes;
  x.sp
})
names(x.sp.ls) = sample.names;
x.sp.ls
```

```{r}
# combine two snap object
x.sp = Reduce(snapRbind, x.sp.ls);
x.sp@metaData["Sample"] = x.sp@sample;
print(table(x.sp@sample))
x.sp
```

```{r}
# Step 2. Add cell-by-bin matrix
x.sp = addBmatToSnap(x.sp, bin.size=5000);
# Step 3. Matrix binarization
x.sp = makeBinary(x.sp, mat="bmat");
# Step 4. Bin filtering
library(GenomicRanges);
black_list = read.table("hg19.blacklist.bed.gz");
black_list.gr = GRanges(
  black_list[,1], 
  IRanges(black_list[,2], black_list[,3])
);
idy = queryHits(
  findOverlaps(x.sp@feature, black_list.gr)
);
if(length(idy) > 0){
  x.sp = x.sp[,-idy, mat="bmat"];
};
x.sp

# Remove unwanted chromosomes
chr.exclude = seqlevels(x.sp@feature)[grep("random|chrM", seqlevels(x.sp@feature))];
idy = grep(paste(chr.exclude, collapse="|"), x.sp@feature);
if(length(idy) > 0){
  x.sp = x.sp[,-idy, mat="bmat"]
};
x.sp
```

```{r}
# The coverage of bins roughly obeys a log normal distribution. We remove the top 5% bins that overlap with invariant features such as the house keeping gene promoters.
bin.cov = log10(Matrix::colSums(x.sp@bmat)+1);
hist(
  bin.cov[bin.cov > 0], 
  xlab="log10(bin cov)", 
  main="log10(Bin Cov)", 
  col="lightblue", 
  xlim=c(0, 5)
);
bin.cutoff = quantile(bin.cov[bin.cov > 0], 0.95);
idy = which(bin.cov <= bin.cutoff & bin.cov > 0);
x.sp = x.sp[, idy, mat="bmat"];
x.sp

# We will further remove any cells of bin coverage less than 1,000. The rational behind this is that some cells may have high number of unique fragments but end up with low bin coverage after filtering. This step is optional but highly recommended.
idx = which(Matrix::rowSums(x.sp@bmat) > 1000);
x.sp = x.sp[idx,];
x.sp
```

```{r}
# Step 5. Dimensionality reduction
row.covs.dens <- density(
  x = x.sp@metaData[,"logUMI"], 
  bw = 'nrd', adjust = 1
);
sampling_prob <- 1 / (approx(x = row.covs.dens$x, y = row.covs.dens$y, xout = x.sp@metaData[,"logUMI"])$y + .Machine$double.eps);
set.seed(1);
idx.landmark.ds <- base::sort(sample(x = seq(nrow(x.sp)), size = 10000, prob = sampling_prob));
x.landmark.sp = x.sp[idx.landmark.ds,];
x.query.sp = x.sp[-idx.landmark.ds,];
x.landmark.sp = runDiffusionMaps(
  obj= x.landmark.sp,
  input.mat="bmat", 
  num.eigs=50
);
x.landmark.sp@metaData$landmark = 1;
x.query.sp = runDiffusionMapsExtension(
  obj1=x.landmark.sp, 
  obj2=x.query.sp,
  input.mat="bmat"
);
x.query.sp@metaData$landmark = 0;
x.sp = snapRbind(x.landmark.sp, x.query.sp);
x.sp = x.sp[order(x.sp@metaData["sample"])];
x.sp = runKNN(
  obj=x.sp,
  eigs.dims=1:20,
  k=15
);
x.sp=runCluster(
  obj=x.sp,
  tmp.folder=tempdir(),
  louvain.lib="R-igraph", #"leiden" preferred, but may cause issues. Requires 'library(leiden)'.
  seed.use=10,
  resolution=0.7
)

```

### Format Snap File Metadata {#format-metadata}

Add the computed clusters to the Snap object metadata.

The Snap object contains two samples, "PBMC 5K" and "PBMC 10K". Let's add a "Sample" column to the metadata. Let's also add a column "files" pointing to the original .snap files from which each cell came.

```{r}
# Add clusters (from SnapATAC::runCluster) to metadata
x.sp@metaData$cluster = x.sp@cluster

# Add Sample name to metadata (if not done previously)
x.sp@metaData$Sample = x.sp@sample

# Add files to metadata, indicating the original snap file each cell belongs to.
snap.files <- c(
  "atac_pbmc_5k_nextgem.snap", 
  "atac_pbmc_10k_nextgem.snap"
)
fileList <- unlist(lapply(x.sp@metaData$Sample, function(x){
  ifelse(x == "PBMC 5K", snap.files[[1]],snap.files[[2]])
}))
x.sp@metaData$files <- fileList

# SAVE this metadata to disk
write.csv(x.sp@metaData, "./snapMetadataforMOCHA.csv")
```

### Extract Fragments from Snap File {#extract-fragments}

Now we have a Snap object with metadata containing barcodes, unique cell ids (column cell_id), sample names, and cell populations (cluster). We also have our HG19 blackList, `black_list.gr`.

> Note: Following the tutorial from SnapATAC can often result in a segfault when extracting fragments with `SnapATAC::extractReads`. We recommend running on a machine with large RAM and avoiding parallelization.

Next we extract reads by sample and cell population, ensuring our final GRanges list is named following the pattern `CellPopulation#Sample`.

```{r}
snapMetadata <- read.csv("./snapMetadataforMOCHA.csv")
cellCol <- "barcode"
cellPopLabel <- "cluster"
cellPopulations <- unique(snapMetadata$cluster)
allSamples <- unique(snapMetadata$Sample)

fragmentsGRangesList <- unlist(lapply(allSamples, function(sample){
  barcodesList <- lapply(cellPopulations, function(cellPop) {
    snapMetadata[snapMetadata$Sample == sample,]
    # Extract barcodes for a single cell population
    cellPopIdx <- snapMetadata$cluster == cellPop
    cellPopBarcodes <- snapMetadata[cellPopIdx,]$barcode
    
    # Build the file list for the selected cell barcode
    files <- snapMetadata[cellPopIdx,]$files
    
    # Extract fragments
    cellPopFrags <- SnapATAC::extractReads(cellPopBarcodes, files, do.par = FALSE)
  })
  names(barcodesList) <- paste(cellPopulations, sample, sep="#")
  barcodesList
}))
```

Calculate the study signal (the median number of fragments per cell)

```{r}
avg_reads <- lapply(fragmentsGRangesList, function(x){
  filtFrag <- dplyr::filter(as.data.frame(x), barcode  %in% snapMetadata$barcode)
  as.vector(table(filtFrag$barcode))
})
studySignal <- median(unlist(avg_reads))
```

### Call Open Tiles with MOCHA {#call-tiles}

```{r}
# Call Open Tiles
tileResults <- MOCHA::callOpenTiles(
  ATACFragments = fragmentsGRangesList,
  cellColData = snapMetadata, 
  blackList = black_list.gr, 
  genome = "BSgenome.Hsapiens.UCSC.hg38",
  cellPopLabel = cellPopLabel,
  cellPopulations = cellPopulations,
  studySignal = studySignal,
  cellCol = cellCol,
  TxDb = "TxDb.Hsapiens.UCSC.hg38.refGene",
  Org = "org.Hs.eg.db", 
  outDir = paste(getwd(),'/MOCHA_Out', sep = ''), 
  numCores = 5
)

TSAM <- MOCHA::getSampleTileMatrix(tileResults, threshold = 0.2, numCores = 3, verbose = TRUE)
```