cohortBuilder
provides seven built-in filter types suitable for most common filtering
tasks:
If none of the above filters meets your need, you can create a custom one. Below we describe how custom filters work and walk through an example: creating a logical filter.
Filters in cohortBuilder are S7 objects that inherit
from the CbFilter base class. Each filter type is an S7
class with properties describing its configuration (dataset, variable,
value, etc.).
Filtering logic is implemented via S7 multi-dispatch
generics that dispatch on both the filter class and the source
class. This means you can define how a filter type behaves for different
source backends (e.g. tblist, database, etc.).
The key generics are:
| Generic | Purpose |
|---|---|
cb_filter_data(filter, source, data_object) |
Apply filter to data |
cb_get_filter_stats(filter, source, data_object) |
Compute filter statistics |
cb_plot_filter_data(filter, source, data_object) |
Plot filter data summary |
cb_get_filter_data(filter, source, data_object) |
Get filter-related data subset |
cb_get_filter_defaults(filter, source, data_object, cache_object) |
Get default parameter values |
cb_filter_to_expr(filter, source) |
Generate reproducible code expression |
Let’s look at a built-in discrete filter to understand the structure:
spec_filter <- filter("discrete", value = "setosa", dataset = "iris", variable = "Species")
spec_filter
#> <cohortBuilder::CbFilterDiscrete>
#> @ type : chr "discrete"
#> @ id : chr "iris-Species"
#> @ name : chr "Species"
#> @ active : logi TRUE
#> @ description: NULL
#> @ domain : NULL
#> @ step_id : NULL
#> @ extra : list()
#> @ private :List of 1
#> .. $ input_param: chr "value"
#> @ dataset : chr "iris"
#> @ variable : chr "Species"
#> @ value : chr "setosa"
#> @ keep_na : logi TRUEThe filter is an S7 object with properties accessible via
@:
spec_filter@type
#> [1] "discrete"
spec_filter@variable
#> [1] "Species"
spec_filter@value
#> [1] "setosa"
spec_filter@dataset
#> [1] "iris"All user-facing parameters can be retrieved as a list:
str(get_filter_params(spec_filter))
#> List of 10
#> $ type : chr "discrete"
#> $ id : chr "iris-Species"
#> $ name : chr "Species"
#> $ active : logi TRUE
#> $ description: NULL
#> $ domain : NULL
#> $ dataset : chr "iris"
#> $ variable : chr "Species"
#> $ value : chr "setosa"
#> $ keep_na : logi TRUETo create a custom filter you need to:
CbFilter.filter("logical", ...) works.CbFilterLogical <- S7::new_class("CbFilterLogical",
parent = CbFilter,
properties = list(
dataset = S7::class_character,
variable = S7::class_character,
value = S7::class_any,
keep_na = S7::class_logical
),
constructor = function(id = NULL, name = NULL, variable, value = NA,
dataset, keep_na = TRUE, description = NULL,
active = getOption("cb_active_filter", default = TRUE), ...) {
if (is.null(id)) id <- paste(dataset, variable, sep = "-")
if (is.null(name)) name <- variable
S7::new_object(S7::S7_object(),
type = "logical", id = id, name = name, input_param = "value",
variable = variable, value = value, dataset = dataset,
keep_na = keep_na, active = active, description = description,
extra = list(...)
)
}
)Key points:
parent = CbFilter makes it part of the filter
hierarchy.input_param = "value" tells the system which property
holds the filtering value.extra = list(...) captures any additional parameters
(e.g. gui_input for shinyCohortBuilder).value = NA means no filtering is applied by
default.Now filter("logical", ...) works:
Each method dispatches on (filter_class, source_class).
We use tblist_class (provided by cohortBuilder) as the
source class for the built-in list-of-tables backend.
S7::method(cb_filter_data, list(CbFilterLogical, tblist_class)) <- function(filter, source, data_object, ...) {
dataset <- filter@dataset
variable <- filter@variable
value <- filter@value
keep_na <- filter@keep_na
if (keep_na && !identical(value, NA)) {
data_object[[dataset]] <- data_object[[dataset]] |>
dplyr::filter(!!sym(variable) == !!value | is.na(!!sym(variable)))
}
if (!keep_na && identical(value, NA)) {
data_object[[dataset]] <- data_object[[dataset]] |>
dplyr::filter(!is.na(!!sym(variable)))
}
if (!keep_na && !identical(value, NA)) {
data_object[[dataset]] <- data_object[[dataset]] |>
dplyr::filter(!!sym(variable) == !!value)
}
attr(data_object[[dataset]], "filtered") <- TRUE
data_object
}Note:
value = NA, no filtering is applied (unless
keep_na = FALSE, which removes NA rows).filtered attribute signals to binding logic that
this dataset was touched.S7::method(cb_get_filter_stats, list(CbFilterLogical, tblist_class)) <- function(
filter, source, data_object, name, ...
) {
dataset <- filter@dataset
variable <- filter@variable
if (missing(name)) {
name <- c("n_data", "choices", "n_missing")
}
column <- data_object[[dataset]][[variable]]
stats <- list(
choices = if ("choices" %in% name) as.list(table(stats::na.omit(column))),
n_data = if ("n_data" %in% name) length(stats::na.omit(column)),
n_missing = if ("n_missing" %in% name) sum(is.na(column))
)
if (length(name) == 1L) stats[[name]] else stats[name]
}Tip: evaluating only the requested stat (via the name
parameter) improves performance with large data.
S7::method(cb_plot_filter_data, list(CbFilterLogical, tblist_class)) <- function(filter, source, data_object, ...) {
dataset <- filter@dataset
variable <- filter@variable
if (nrow(data_object[[dataset]])) {
data_object[[dataset]][[variable]] |> table() |> prop.table() |> graphics::barplot(...)
} else {
graphics::barplot(0.0, ylim = c(0.0, 0.1), main = "No data")
}
}S7::method(cb_filter_to_expr, list(CbFilterLogical, tblist_class)) <- function(filter, source, ...) {
dataset <- filter@dataset
variable <- filter@variable
value <- filter@value
keep_na <- filter@keep_na
if (keep_na && !identical(value, NA)) {
rlang::expr({
data_object[[!!dataset]] <- data_object[[!!dataset]] |>
dplyr::filter(!!sym(variable) == !!value | is.na(!!sym(variable)))
})
} else if (!keep_na && identical(value, NA)) {
rlang::expr({
data_object[[!!dataset]] <- data_object[[!!dataset]] |>
dplyr::filter(!is.na(!!sym(variable)))
})
} else if (!keep_na && !identical(value, NA)) {
rlang::expr({
data_object[[!!dataset]] <- data_object[[!!dataset]] |>
dplyr::filter(!!sym(variable) == !!value)
})
} else {
NULL
}
}Now we can use our filter for building a cohort. For the example
we’ll use an extended iris table:
iris2 <- dplyr::mutate(iris, is_setosa = Species == "setosa")
coh <- set_source(tblist(iris = iris2)) |>
cohort(
filter("logical", dataset = "iris", variable = "is_setosa", value = TRUE)
) |>
run()Let’s verify the filtering worked:
We can also check statistics and reproducible code:
Filter constructors accept ... which is stored in
filter@extra. This is useful for passing additional
configuration consumed by other packages (e.g. gui_input
for shinyCohortBuilder):
my_filter <- filter("logical",
dataset = "iris", variable = "is_setosa", value = TRUE,
gui_input = "checkbox"
)
my_filter@extra$gui_input
#> [1] "checkbox"
get_filter_params(my_filter)$gui_input
#> [1] "checkbox"Extra parameters are included in get_filter_params()
output, persisted in get_state(), and can be updated via
update_filter().
When creating custom filters in a separate package:
register_filter_type() in your package’s
.onLoad().S7 (>= 0.2.0) and cohortBuilder to
Imports in DESCRIPTION.S7::methods_register() in your
.onLoad() to register S7 methods.