---
title: "Parallel Computing Examples Using Rcurvep"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Parallel Computing Examples Using Rcurvep}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

# Set up the packages

```{r setup, warning=FALSE, message=FALSE}
library(future)
library(dplyr)
library(purrr)
#library(microbenchmark) # it is needed if recalculating the comparison
library(Rcurvep)
```

# datasets from Rcurvep package

```{r}
data("zfishdev_all") # two endpoints, each endpoint includes 32 chemicals/curves
data("zfishdev_act") # simulated curves based on zfishdev_all, each chemical has 100 simulated curves
data("zfishdev") # four endpoints, each endpoint includes 3 chemicals/curves
```

# When no preferred BMR and would like to do a exhaustive search

This is a computationally intensive procedure. 
`n_sample = 100` is preferred but for demonstration, `n_sample = 10` is used.
Expressions are used to delay the execution of commands. 

In `combi_run_rcurvep` function, functions from **furrr** package are embedded. 
Use `future::plan()` to control the types of calculation.  

```{r}

# sequential run
seq_run_multi <- expression(
  
  # set up the plan
  future::plan(sequential),
  
  # calculation
  combi_run_rcurvep(
    zfishdev_all, 
    n_sample = 10,
    TRSH = seq(5, 95, by = 5),
    RNGE = 1000000,
    keep_sets = "act_set",
    seed = 300
  )
)


# parallel run
par_run_multi <- expression(
  
  # set up the plan
  future::plan(multisession, workers = 10),
  
  # calculation
  combi_run_rcurvep(
    zfishdev_all, 
    n_sample = 10,
    TRSH = seq(5, 95, by = 5),
    RNGE = 1000000,
    keep_sets = "act_set",
    seed = 300
  ),
  
  # re-set the plan back
  future::plan(sequential)
)

```

# Calculate 10 times and compare the results

Due to the long time, the results are pasted below. 
Parallel calculation is faster. 

```{r, eval=FALSE}

run_speed_multi_rcurvep <- microbenchmark(
  eval(seq_run_multi),
  eval(par_run_multi),
  times = 10
)
```


```{r}
#> run_speed_multi_rcurvep
#Unit: seconds
#                expr      min       lq     mean   median       uq      max neval
# eval(seq_run_multi) 61.06341 61.12504 61.79744 61.43494 62.17374 63.99470    10
# eval(par_run_multi) 18.87097 19.36093 19.74137 19.50655 20.42096 20.97378    10
```



# When preferred BMRs are available for endpoints


## Get the BMRs for each endpoint 

```{r}
bmr_out <- estimate_dataset_bmr(zfishdev_act, plot = FALSE)
bmrd <- bmr_out$outcome
```

## Join the BMRs to the concentration-response data

```{r}
inp_tb <- bmrd |>
  nest_join(
    zfishdev_all, by = c("endpoint"), keep = TRUE, name = "data"
  ) |>
  select(RNGE, endpoint, bmr_exp, data)
# input_data for combi_run_rcurvep
rmarkdown::paged_table(inp_tb)
```

## Set up the expressions

`n_sample = 1000` is preferred but for demonstration, `n_sample = 100` is used.

For sequential run, `purrr::pmap` is used. 
For parallel run, `furrr::future_pmap` is used

```{r}
# sequential run
seq_run_bmr <- expression(
  future::plan(sequential),
  pmap(inp_tb, ~ combi_run_rcurvep(..4, TRSH = ..3, RNGE = ..1, n_samples = 100, seed = 300, keep_sets = c("act_set")))
)

# parallel run
par_run_bmr <- expression(
  
  future::plan(multisession, workers = 10),
  
  # calculation
  # there is no need to use future_pmap here 
  pmap(inp_tb, ~ combi_run_rcurvep(..4, TRSH = ..3, RNGE = ..1, n_samples = 100, seed = 300, keep_sets = c("act_set"))),

  future::plan(sequential)
)


```

# Calculate 10 times and compare the results

Due to the long time, the results are pasted below. 
Parallel calculation is faster. 

```{r, eval=FALSE}
run_speed_bmr_rcurvep <- microbenchmark(
  eval(seq_run_bmr),
  eval(par_run_bmr),
  times = 10
)
```


```{r}
#> run_speed_bmr_rcurvep
#Unit: seconds
#              expr      min       lq     mean   median       uq      max neval
# eval(seq_run_bmr) 35.51327 35.59489 35.79890 35.81629 35.88001 36.28173    10
# eval(par_run_bmr) 14.78751 15.52596 16.19503 16.14672 16.50997 17.82284    10
```


# Fitting based on simulated curves using run_fit

In `run_fit` function, functions from **furrr** package are embedded. 
Use `future::plan()` to control the types of calculation. 

## Set up the expressions

`n_sample = 1000` is preferred but for demonstration, `n_sample = 100` is used.
Also, `create_dataset` function is used to convert the incidence data into response data.

```{r}

# sequential run
seq_fit_hill_boot <- expression(
  
  future::plan(sequential),
  run_fit(create_dataset(zfishdev), hill_pdir = 1, n_samples = 100, modls = "hill")

)

# parallel run
seq_fit_hill_boot <- expression(
  
  future::plan(multisession, workers = 10),
  
   # calculation
  run_fit(create_dataset(zfishdev), hill_pdir = 1, n_samples = 100, modls = "hill"),
  future::plan(sequential)
)
```


# Calculate 10 times and compare the results

Due to the long time, the results are pasted below. 
Parallel calculation is faster. 

```{r, eval=FALSE}
run_speed_fit_hill_boot <- microbenchmark(
  eval(seq_fit_hill_boot),
  eval(par_fit_hill_boot),
  times = 10
)
```


```{r}
#> run_speed_fit_hill_boot
#Unit: seconds
#               expr      min       lq     mean   median       uq      max neval
# eval(seq_fit_hill_boot) 60.75111 63.42611 64.02762 63.97692 65.46656 66.83198    10
# eval(par_fit_hill_boot) 34.81743 36.88777 37.43076 37.41199 38.88405 39.40711    10
```


# Fitting based on simulated curves using run_fit and pmap

We can also use similar syntax as `combi_run_rcurvep` to run multiple datasets.


```{r}

# make the incidence data as response data
inp_tb_resp <- inp_tb |> mutate(data = map(data, create_dataset))

# sequential run
seq_fit_hill_multi <- expression(
  future::plan(sequential),
  
  pmap(inp_tb_resp, ~ run_fit(..4, modls = "hill", hill_pdir = ifelse(..3 < 0, -1, 1), n_samples = 100, keep_sets = c("fit_set")), .options = furrr_options(seed = 2023))
)
  
# parallel run
para_fit_hill_multi <- expression(
  future::plan(multisession, workers = 10), 
  
  # calculation, no need to use future_pmap
  pmap(inp_tb_resp, ~ run_fit(..4, modls = "hill", hill_pdir = ifelse(..3 < 0, -1, 1), n_samples = 100, keep_sets = c("fit_set")), .options = furrr_options(seed = 2023)),
  future::plan(sequential)
)

```


# Calculate 10 times and compare the results

Due to the long time, the results are pasted below. 
Parallel calculation is faster. 


```{r, eval=FALSE}
run_speed_fit_hill_multi <- microbenchmark(
  eval(seq_fit_hill_multi),
  eval(para_fit_hill_multi),
  times = 10
)
```


```{r}
#> run_speed_fit_hill_multi
#Unit: seconds
#                      expr       min        lq      mean   median       uq      max neval
#  eval(seq_fit_hill_multi) 219.04359 220.59658 222.32948 222.2282 224.4493 225.2394    10
# eval(para_fit_hill_multi)  97.04507  98.85834  99.67153 100.1014 100.9218 101.0854    10
```


 
# Fitting based on original data


In `run_fit` function, functions from **furrr** package are embedded. 
Use `future::plan()` to control the types of calculation. 

Two parameters are tested: _hill_ and _cc2_.
_hill_ is 3-parameter Hill equation implemented using R. 
_cc2_ is 4-parameter Hill equation implemented using Java. 

The dataset - respd_1 - includes 3000 curves, is not available in the package. 
If the dataset is too small, it might not worthwhile to start the parallel computing. 

## Use `modls = hill` parameter

```{r, eval=FALSE}

# sequential run
seq_fit_hill_ori <- expression(
  
  future::plan(sequential),
  run_fit(respd_1,  modls = "hill")

)

# parallel run
par_fit_hill_ori <- expression(
  
  future::plan(multisession, workers = 5),
  run_fit(respd_1,  modls = "hill"),
  future::plan(sequential)
)
```


# Calculate 5 times and compare the results

Due to the long time, the results are pasted below. 
Parallel calculation is faster. 

```{r, eval=FALSE}
run_speed_hit_hill_ori <- microbenchmark(
  eval(seq_fit_hill_ori),
  eval(par_fit_hill_ori),
  times = 5
)
```


```{r}
#> run_speed_hit_hill_ori
#Unit: seconds
#                   expr       min        lq      mean    median       uq       max neval
# eval(seq_fit_hill_ori) 112.12160 112.30511 112.35622 112.40213 112.4308 112.52144     5
# eval(par_fit_hill_ori)  62.92156  63.04108  63.51158  63.60182  63.9130  64.08043     5
```


## Use `modls = cc2` parameter


```{r, eval=FALSE}

# sequential run
seq_fit_cc2 <- expression(
  
  future::plan(sequential),
  run_fit(respd_1,  modls = "cc2")

)

# parallel run
par_fit_cc2 <- expression(
  
  future::plan(multisession, workers = 5),
  run_fit(respd_1,  modls = "cc2"),
  future::plan(sequential)
)
```


# Calculate 5 times and compare the results

Due to the long time, the results are pasted below. 
Parallel calculation does not improve much.
It could be because the **cc2** is implemented using Java. 

```{r, eval=FALSE}
run_speed_fit_cc2 <- microbenchmark(
  eval(seq_fit_cc2),
  eval(par_fit_cc2),
  times = 5
)
```


```{r}
#> run_speed_fit_cc2
#Unit: seconds
#              expr      min       lq     mean   median       uq      max neval
# eval(seq_fit_cc2) 68.37777 68.39599 68.46936 68.45011 68.45746 68.66547     5
# eval(par_fit_cc2) 58.07689 58.31388 59.17766 59.01783 59.07421 61.40546     5
```



```{r, include=FALSE, eval=FALSE}
res <- ls() |> str_match("run_speed.*") |> na.omit()
l1 <- map(res, get) |> set_names(res)
saveRDS(l1, here("data-raw", "future_rcurvep_output.rds"))
```