---
title: "Fast MaxLFQ"
author: "Thang V Pham"
date: "`r Sys.Date()`" 
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Fast MaxLFQ}
  %VignetteEncoding{UTF-8}
  %\VignetteEngine{knitr::rmarkdown}
editor_options: 
  chunk_output_type: console
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

```{r eval=TRUE, include=FALSE, echo=FALSE}
require("knitr")
local_file_exist <- file.exists("DIA-report-long-format.txt")
```

We have implemented a highly optimized version of the **iq** pipeline (Pham et al., Bioinformatics 2020). To run the following examples, download [DIA-report-long-format.zip](https://github.com/tvpham/iq/releases/download/v1.1/DIA-report-long-format.zip) and unzip the file to a local working directory.

The unzipped file `DIA-report-long-format.txt` is a tab-deliminated text file exported from a Spectronaut search using this export schema [iq.rs](https://github.com/tvpham/iq/releases/download/v1.1/iq.rs). The user might want to import this schema to their Spectronaut installation for the ease of using the **iq** pipeline.


## The standard pipeline

First, we apply the standard pipeline implemented in pure R. Read and filter the data

```{r, eval = local_file_exist, include = TRUE}
library("iq") # if not already installed, run install.packages("iq") 

raw <- read.delim("DIA-Report-long-format.txt")

selected <- raw$F.ExcludedFromQuantification == "False" & 
            !is.na(raw$PG.Qvalue) & (raw$PG.Qvalue < 0.01) &
            !is.na(raw$EG.Qvalue) & (raw$EG.Qvalue < 0.01)

raw <- raw[selected,]
```

Normalize the data, create a protein list, and perform the MaxLFQ algorithm

```{r eval = local_file_exist, echo=TRUE, message = TRUE, include = TRUE}
sample_id  <- "R.FileName" 

secondary_id <- c("EG.Library", "FG.Id", "FG.Charge", "F.FrgIon", "F.Charge", "F.FrgLossType")

norm_data <- iq::preprocess(raw, 
                            sample_id  = sample_id, 
                            secondary_id = secondary_id)

protein_list <- iq::create_protein_list(norm_data)

result <- iq::create_protein_table(protein_list)
```

Extract annotation columns and write the result to an output file

```{r eval = local_file_exist}
annotation_columns <- c("PG.Genes", "PG.ProteinNames")

extra_names <- iq::extract_annotation(rownames(result$estimate), 
                                      raw, 
                                      annotation_columns = annotation_columns)

write.table(cbind(Protein = rownames(result$estimate),
                  extra_names[, annotation_columns],
                  MaxLFQ_annotation = result$annotation,
                  result$estimate),
            "iq-MaxLFQ.txt", sep = "\t", row.names = FALSE)
```

The resulting file `iq-MaxLFQ.txt` is the protein level quantification report in a tab-deliminated text format.

## A faster MaxLFQ implementation

The function `iq::fast_MaxLFQ` implemented in C++ combines the functionalities of `iq::create_protein_list` and `iq::create_protein_table`.

```{r eval = local_file_exist}
#--------------------- Replacing ---------------------
# protein_list <- iq::create_protein_list(norm_data) #
# result <- iq::create_protein_table(protein_list)   #
#-----------------------------------------------------

result_faster <- iq::fast_MaxLFQ(norm_data)

```

The results of the R implementation `result` and C++ implementation `result_faster` should be equal up to the floating-point precision of the underlying numerical libraries.

```{r eval = local_file_exist}
cat("Max difference =", max(abs(result_faster$estimate - result$estimate), na.rm = TRUE), "\n")

cat("Identical NAs =", identical(is.na(result_faster$estimate), is.na(result$estimate)), "\n")

cat("Equal annotation =", identical(result_faster$annotation, result$annotation), "\n")
```

### Benchmarking execution time

We can check the improvement in execution time. The following result is obtained on a computer with 12 CPU cores.

```{r eval=local_file_exist}
system.time({
    protein_list <- iq::create_protein_list(norm_data)
    result <- iq::create_protein_table(protein_list)
})

system.time({
    result_faster <- iq::fast_MaxLFQ(norm_data)
})
```

## An efficient data structure and data loading

We have implemented a fast data loading algorithm and an efficient data structure. The memory usage is highly optimized to enable the processing of very large datasets.

```{r eval=local_file_exist}
sample_id  <- "R.FileName" 

secondary_id <- c("EG.Library", "FG.Id", "FG.Charge", "F.FrgIon", "F.Charge", "F.FrgLossType")

annotation_columns <- c("PG.Genes", "PG.ProteinNames")

iq_dat <- iq::fast_read("DIA-report-long-format.txt",
                        sample_id  = sample_id, 
                        secondary_id = secondary_id,
                        filter_string_equal = c("F.ExcludedFromQuantification" = "False"),
                        annotation_col = annotation_columns)

iq_norm_data <- iq::fast_preprocess(iq_dat$quant_table)

result_fastest <- iq::fast_MaxLFQ(iq_norm_data, 
                                  row_names = iq_dat$protein[, 1], 
                                  col_names = iq_dat$sample)
```

The result of the optimized pipeline `result_fastest` should be the same as that of the standard pipeline `result`.

```{r eval = local_file_exist}
cat("Max difference =", max(abs(result_fastest$estimate - result$estimate), na.rm = TRUE), "\n")

cat("Identical NAs =", identical(is.na(result_fastest$estimate), is.na(result$estimate)), "\n")

cat("Equal annotation =", identical(result_fastest$annotation, result$annotation), "\n")
```

The annotation columns are stored in the `protein` component of the input data structure `iq_dat`. We can extract the annotation columns and write the result to an output text file.

```{r eval = local_file_exist}
iq_extra_names <- iq::extract_annotation(rownames(result_fastest$estimate), 
                                         iq_dat$protein, 
                                         annotation_columns = annotation_columns)

write.table(cbind(Protein = rownames(result_fastest$estimate),
                  iq_extra_names[, annotation_columns],
                  MaxLFQ_annotation = result_fastest$annotation,
                  result_fastest$estimate), 
            "iq-MaxLFQ-fast.txt", sep = "\t", row.names = FALSE)
```

### Benchmarking execution time

```{r eval = local_file_exist}
sample_id  <- "R.FileName" 

secondary_id <- c("EG.Library", "FG.Id", "FG.Charge", "F.FrgIon", "F.Charge", "F.FrgLossType")

annotation_columns <- c("PG.Genes", "PG.ProteinNames")

system.time({
    
    # reading data
    raw <- read.delim("DIA-report-long-format.txt")

    # filtering
    selected <- raw$F.ExcludedFromQuantification == "False" & 
                !is.na(raw$PG.Qvalue) & raw$PG.Qvalue < 0.01 &
                !is.na(raw$EG.Qvalue) & raw$EG.Qvalue < 0.01

    raw <- raw[selected,]

    ## process

    norm_data <- iq::preprocess(raw, 
                                sample_id  = sample_id, 
                                secondary_id = secondary_id)

    protein_list <- iq::create_protein_list(norm_data)
    
    result <- iq::create_protein_table(protein_list)
    
})

system.time({
    iq_dat <- iq::fast_read("DIA-report-long-format.txt",
                            sample_id  = sample_id, 
                            secondary_id = secondary_id,
                            filter_string_equal = c("F.ExcludedFromQuantification" = "False"),
                            annotation_col = annotation_columns)

    iq_norm_data <- iq::fast_preprocess(iq_dat$quant_table)

    result_fastest <- iq::fast_MaxLFQ(iq_norm_data, 
                                      row_names = iq_dat$protein[, 1], 
                                      col_names = iq_dat$sample)
})
```


## References

1. Pham TV, Henneman AA, Jimenez CR (2020) iq: an R package to estimate relative protein abundances from ion quantification in DIA-MS-based proteomics. _Bioinformatics_ 36(8):2611-2613.