---
title: "Conditional Nelson--Aalen and Aalen--Johansen Estimation"
author: "Martin Bladt & Christian Furrer"
date: "28th of February, 2023"
package: "AalenJohansen"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{my-vignette}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

This vignette illustrates, through four examples, the potential uses of the R-package $\texttt{AalenJohansen}$, which is an implementation of the conditional Nelson--Aalen and Aalen--Johansen estimators introduced in Bladt \& Furrer (2023).

## 1. Markov model with independent censoring

We start out with a simple time-inhomogeneous Markov model:
\begin{align*}
\frac{\mathrm{d}\Lambda(t)}{\mathrm{d}t}=\lambda(t)=\frac{1}{1+\frac{1}{2}t}
\begin{pmatrix}
-2 & 1& 1 \\
2 & -3 & 1 \\
0 & 0 & 0
\end{pmatrix}\!.
\end{align*}

```{r}
library(AalenJohansen)

set.seed(2)

jump_rate <- function(i, t, u){
  if(i == 1){
    2 / (1+1/2*t)
  } else if(i == 2){
    3 / (1+1/2*t)
  } else{
    0
  }
}

mark_dist <- function(i, s, v){
  if(i == 1){
    c(0, 1/2, 1/2)
  } else if(i == 2){
    c(2/3, 0, 1/3)
  } else{
    0
  } 
}

lambda <- function(t){
  A <- matrix(c(2/(1+1/2*t)*mark_dist(1, t, 0), 3/(1+1/2*t)*mark_dist(2, t, 0), rep(0, 3)),
              nrow = 3, ncol = 3, byrow = TRUE)
  diag(A) <- -rowSums(A)
  A
}
```

We simulate $1,000$ independent and identically distributed realizations subject to independent right-censoring. Right-censoring follows the distribution $\text{Unif}(0,5)$.

```{r}
n <- 1000

c <- runif(n, 0, 5)

sim <- list()
for(i in 1:n){
  sim[[i]] <- sim_path(sample(1:2, 1), rates = jump_rate, dists = mark_dist,
                       tn = c[i], bs = c(2*c[i], 3*c[i], 0))
}
```

The degree of censoring is

```{r}
sum(c == unlist(lapply(sim, FUN = function(z){tail(z$times, 1)}))) / n
```

We fit the classic Nelson--Aalen and Aalen-Johansen estimators.

```{r}
fit <- aalen_johansen(sim)
```

For illustrative purposes, we plot $\Lambda_{21}$ and the state occupation probability $p_2$ for both the true model (full) and using the classic estimators (dashed).

```{r, fig.align = 'center', fig.height = 3, fig.width = 6}
v1 <- unlist(lapply(fit$Lambda, FUN = function(L) L[2,1]))
v0 <- fit$t
p <- unlist(lapply(fit$p, FUN = function(L) L[2]))
P <- unlist(lapply(prodint(0, 5, 0.01, lambda), FUN = function(L) (c(1/2, 1/2, 0) %*% L)[2]))

par(mfrow = c(1, 2))
par(mar = c(2.5, 2.5, 1.5, 1.5))

plot(v0, v1, type = "l", lty = 2, xlab = "", ylab = "", main = "Hazard")
lines(v0, 4*log(1+1/2*v0))
plot(v0, p, type = "l", lty = 2, xlab = "", ylab = "", main = "Probability")
lines(seq(0, 5, 0.01), P)
```

## 2. Markov model with independent censoring and covariates

We now consider a simple extension with covariates:
\begin{align*}
\frac{\mathrm{d}\Lambda(t|x)}{\mathrm{d}t}=\lambda(t|x)=\frac{1}{1+x\cdot t}
\begin{pmatrix}
-2 & 1& 1 \\
2 & -3 & 1 \\
0 & 0 & 0
\end{pmatrix}\!.
\end{align*}

We simulate $10,000$ independent realizations subject to independent right-censoring. Right-censoring follows the distribution $\text{Unif}(0,5)$, while $X\sim\text{Unif}(0,1)$.

```{r}
jump_rate <- function(i, t, u){
  if(i == 1){
    2
  } else if(i == 2){
    3
  } else{
    0
  }
}

mark_dist <- function(i, s, v){
  if(i == 1){
    c(0, 1/2, 1/2)
  } else if(i == 2){
    c(2/3, 0, 1/3)
  } else{
    0
  }
}

lambda <- function(t, x){
  A <- matrix(c(2/(1+x*t)*mark_dist(1, t, 0), 3/(1+x*t)*mark_dist(2, t, 0), rep(0, 3)),
              nrow = 3, ncol = 3, byrow = TRUE)
  diag(A) <- -rowSums(A)
  A
}

n <- 10000

X <- runif(n)
c <- runif(n, 0, 5)

sim <- list()
for(i in 1:n){
  rates <- function(j, y, z){jump_rate(j, y, z)/(1+X[i]*y)}
  sim[[i]] <- sim_path(sample(1:2, 1), rates = rates, dists = mark_dist,
                       tn = c[i], bs = c(2*c[i], 3*c[i], 0))
  sim[[i]]$X <- X[i]
}
```

The degree of censoring is

```{r}
sum(c == unlist(lapply(sim, FUN = function(z){tail(z$times, 1)}))) / n
```

We fit the conditional Nelson--Aalen and Aalen--Johansen estimators for $x=0.2, 0.8$.

```{r}
x1 <- 0.2
x2 <- 0.8

fit1 <- aalen_johansen(sim, x = x1)
fit2 <- aalen_johansen(sim, x = x2)
```

For illustrative purposes, we plot $\Lambda_{21}$ and the conditional state occupation probability $p_2$ for both the true model (full) and using the conditional estimators (dashed). This is done for both $x=0.2$ (in red) and $x=0.8$ (in blue).

```{r, fig.align = 'center', fig.height = 3, fig.width = 6}
v11 <- unlist(lapply(fit1$Lambda, FUN = function(L) L[2,1]))
v10 <- fit1$t
v21 <- unlist(lapply(fit2$Lambda, FUN = function(L) L[2,1]))
v20 <- fit2$t
p1 <- unlist(lapply(fit1$p, FUN = function(L) L[2]))
P1 <- unlist(lapply(prodint(0, 5, 0.01, function(t){lambda(t, x = x1)}),
FUN = function(L) (c(1/2, 1/2, 0) %*% L)[2]))
p2 <- unlist(lapply(fit2$p, FUN = function(L) L[2]))
P2 <- unlist(lapply(prodint(0, 5, 0.01, function(t){lambda(t, x = x2)}),
FUN = function(L) (c(1/2, 1/2, 0) %*% L)[2]))

par(mfrow = c(1, 2))
par(mar = c(2.5, 2.5, 1.5, 1.5))

plot(v10, v11, type = "l", lty = 2, xlab = "", ylab = "", main = "Hazard", col = "red")
lines(v10, 2/x1*log(1+x1*v10), col = "red")
lines(v20, v21, lty = 2, col = "blue")
lines(v20, 2/x2*log(1+x2*v20), col = "blue")

plot(v10, p1, type = "l", lty = 2, xlab = "", ylab = "", main = "Probability", col = "red")
lines(seq(0, 5, 0.01), P1, col = "red")
lines(v20, p2, lty = 2, col = "blue")
lines(seq(0, 5, 0.01), P2, col = "blue")
```

## 3. Markov model with dependent censoring and covariates

We consider the same model as before, but now introduce dependent right-censoring. To be precise, we assume that right-censoring occurs at rate $\frac{1}{2}\frac{1}{1+\frac{1}{2}t}$ in the first state, while it occurs at twice this rate in the second state. Finally, all remaining individuals are right-censored at time $t$.

We again simulate $10,000$ independent realizations.

```{r}
jump_rate_enlarged <- function(i, t, u){
  if(i == 1){
    2.5
  } else if(i == 2){
    4
  } else{
    0
  }
}

mark_dist_enlarged <- function(i, s, v){
  if(i == 1){
    c(0, 2/5, 2/5, 1/5)
  } else if(i == 2){
    c(2/4, 0, 1/4, 1/4)
  } else{
    0
  }
}

n <- 10000

X <- runif(n)

tn <- 5
sim <- list()
for(i in 1:n){
  rates <- function(j, y, z){jump_rate_enlarged(j, y, z)/(1+X[i]*y)}
  sim[[i]] <- sim_path(sample(1:2, 1), rates = rates, dists = mark_dist_enlarged,
                       tn = tn, abs = c(FALSE, FALSE, TRUE, TRUE),
                       bs = c(2.5*tn, 4*tn, 0, 0))
  sim[[i]]$X <- X[i]
}

```

The degree of censoring is

```{r}
sum(tn == unlist(lapply(sim, FUN = function(z){tail(z$times, 1)}))
    | 4 == unlist(lapply(sim, FUN = function(z){tail(z$states, 1)}))) / n
```

We fit the conditional Nelson--Aalen and Aalen--Johansen estimators for $x=0.2, 0.8$.

```{r}
fit1 <- aalen_johansen(sim, x = x1, collapse = TRUE)
fit2 <- aalen_johansen(sim, x = x2, collapse = TRUE)
```

For illustrative purposes, we plot $\Lambda_{21}$ and the conditional state occupation probability $p_2$ for both the true model (full) and using the conditional estimators (dashed). This is done for both $x=0.2$ (in red) and $x=0.8$ (in blue).

```{r, fig.align = 'center', fig.height = 3, fig.width = 6}
v11 <- unlist(lapply(fit1$Lambda, FUN = function(L) L[2,1]))
v10 <- fit1$t
v21 <- unlist(lapply(fit2$Lambda, FUN = function(L) L[2,1]))
v20 <- fit2$t
p1 <- unlist(lapply(fit1$p, FUN = function(L) L[2]))
p2 <- unlist(lapply(fit2$p, FUN = function(L) L[2]))

par(mfrow = c(1, 2))
par(mar = c(2.5, 2.5, 1.5, 1.5))

plot(v10, v11, type = "l", lty = 2, xlab = "", ylab = "", main = "Hazard", col = "red")
lines(v10, 2/x1*log(1+x1*v10), col = "red")
lines(v20, v21, lty = 2, col = "blue")
lines(v20, 2/x2*log(1+x2*v20), col = "blue")

plot(v10, p1, type = "l", lty = 2, xlab = "", ylab = "", main = "Probability", col = "red")
lines(seq(0, 5, 0.01), P1, col = "red")
lines(v20, p2, lty = 2, col = "blue")
lines(seq(0, 5, 0.01), P2, col = "blue")
```

## 4. Semi-Markov model with independent censoring

Last but not least, we consider a time-inhomogeneous semi-Markov model with non-zero transition rates given by
\begin{align*}
\lambda_{12}(t, u)
&=
0.09 + 0.0018t, \\
\lambda_{13}(t, u)
&=
0.01 + 0.0002t, \\
\lambda_{23}(t, u)
&=
0.09 + 1(u < 4)0.20 + 0.001t.
\end{align*}

```{r}
jump_rate <- function(i, t, u){
  if(i == 1){
    0.1 + 0.002*t
  } else if(i == 2){
    ifelse(u < 4, 0.29, 0.09) + 0.001*t
  } else{
    0
  }
}

mark_dist <- function(i, s, v){
  if(i == 1){
    c(0, 0.9, 0.1)
  } else if(i == 2){
    c(0, 0, 1)
  } else{
    0
  }
}
```

We simulate $5,000$ independent and identically distributed realizations subject to independent right-censoring. Right-censoring follows the distribution $\text{Unif}(10,40)$.

```{r}
n <- 5000

c <- runif(n, 10, 40)

sim <- list()
for(i in 1:n){
  sim[[i]] <- sim_path(1, rates = jump_rate, dists = mark_dist, tn = c[i],
                       bs = c(0.1+0.002*c[i], 0.29+0.001*c[i], 0))
}
```

The degree of censoring is

```{r}
sum(c == unlist(lapply(sim, FUN = function(z){tail(z$times, 1)}))) / n
```

We fit the Aalen--Johansen estimator.

```{r}
fit <- aalen_johansen(sim)
```

For illustrative purposes, we plot the estimate of the state occupation probability $p_2$ (dashed). The true values (full) are obtained via numerical integration, utilizing that this specific model has a hierarchical structure.

```{r, fig.align = 'center', fig.height = 4.5, fig.width = 4.5}
v0 <- fit$t
p <- unlist(lapply(fit$p, FUN = function(L) L[2]))
integrand <- function(t, s){
  exp(-0.1*s-0.001*s^2)*(0.09 + 0.0018*s)*exp(-0.20*pmin(t-s, 4)-0.09*(t-s)-0.0005*(t^2-s^2))
}
P <- Vectorize(function(t){
  integrate(f = integrand, lower = 0, upper = t, t = t)$value
}, vectorize.args = "t")
plot(v0, p, type = "l", lty = 2, xlab = "", ylab = "", main = "Probability")
lines(seq(0, 40, 0.1), P(seq(0, 40, 0.1)))
```

We now want to estimate the conditional state occupation probability $p_2$, given sojourn in the second state at time $10$ with duration $u=1,5$. For this, we first need to sub-sample the data (landmarking).

```{r}
landmark <- sim[unlist(lapply(sim, FUN = function(z){any(z$times <= 10
                                                         & c(z$times[-1], Inf) > 10
                                                         & z$states == 2)}))]
landmark <- lapply(landmark, FUN = function(z){list(times = z$times, states = z$states,
                                                    X = 10 - z$times[z$times <= 10
                                                                     & c(z$times[-1], Inf) > 10
                                                                     & z$states == 2])})
```

The degree of sub-sampling is

```{r}
length(landmark) / n
```

Next, we fit the conditional Aalen--Johansen estimator for $u=1,5$. We also fit the usual landmark Aalen--Johansen estimator.

```{r}
u1 <- 1
u2 <- 5

fit1 <- aalen_johansen(landmark, x = u1)
fit2 <- aalen_johansen(landmark, x = u2)
fit3 <- aalen_johansen(landmark)
```

For illustrative purposes, we plot the conditional state occupation probability $p_2$ using the conditional estimator (dashed), the usual landmark estimator (dotted), and the true model (full). This is done for both $u=1$ (in red) and $u=5$ (in blue).

```{r, fig.align = 'center', fig.height = 4.5, fig.width = 4.5}
v10 <- fit1$t
v20 <- fit2$t
v30 <- fit3$t
p1 <- unlist(lapply(fit1$p, FUN = function(L) L[2]))
p2 <- unlist(lapply(fit2$p, FUN = function(L) L[2]))
p3 <- unlist(lapply(fit3$p, FUN = function(L) L[2]))
P <- function(t, u){
  exp(-(t-10)*0.09-(t^2-100)*0.0005-pmax(0, pmin(t, 4-(u-10))-10)*0.20)  
} 

plot(v10, p1, type = "l", lty = 2, xlab = "", ylab = "", main = "Probability",
     col = "red", xlim = c(10, 40))
lines(seq(10, 40, 0.1), P(seq(10, 40, 0.1), u1), col = "red")
lines(v20, p2, lty = 2, col = "blue")
lines(seq(10, 40, 0.1), P(seq(10, 40, 0.1), u2), col = "blue")
lines(v30, p3, lty = 3)
```