Title: | Core Functions for Data Quality Assessment |
---|---|
Description: | Perform data quality assessment ('DQA') of electronic health records ('EHR'). Publication: Kapsner et al. (2021) <doi:10.1055/s-0041-1733847>. |
Authors: | Lorenz A. Kapsner [cre, aut] , Jonathan M. Mang [aut] , Helene Köster [ctb], MIRACUM - Medical Informatics in Research and Care in University Medicine [fnd], Universitätsklinikum Erlangen [cph] |
Maintainer: | Lorenz A. Kapsner <[email protected]> |
License: | GPL-3 |
Version: | 0.3.6 |
Built: | 2024-12-27 06:44:30 UTC |
Source: | CRAN |
Internal function to filter the input data (or SQL) depending on provided time information. Sensitive to SQL dialects.
apply_time_restriciton( data, key, lower_limit, upper_limit, system_name = NULL, system_type, mdr, logfile_dir = NULL, db_con = NULL, sql_create_view_all = list(), verify_on_db = TRUE )
apply_time_restriciton( data, key, lower_limit, upper_limit, system_name = NULL, system_type, mdr, logfile_dir = NULL, db_con = NULL, sql_create_view_all = list(), verify_on_db = TRUE )
data |
If system_type is a database, the sql-string goes here. If system_type is 'csv', the data.table of this csv goes here. Sensitive to SQL dialects. |
key |
The key from the mdr. |
lower_limit |
The posixct timestamp of the lower filtering boundary. |
upper_limit |
The posixct timestamp of the upper filtering boundary. |
system_name |
(Optional for non-database-changes) 'i2b2'/'p21csv'/'omop'/... |
system_type |
'postgres'/'oracle'/'csv' |
mdr |
(Optional for non-database-changes) The internal MDR
(get it from rv$mdr). |
logfile_dir |
(Optional) The directory to store the logfile in. Defaults to NULL. |
db_con |
(Optional for non-database-changes) The connection to the database. Used to create the views we need later to apply the SQLs to. |
sql_create_view_all |
(Optional, list). A list containing the SQLs to create all Views for the time-filtering. This is needed for the printing-friendly SQL including these view-creating SQLs and the actual data-extracting SQL query. |
verify_on_db |
A boolean. Whether the view should be verified on the
database (default: TRUE). |
If system_type is a database, a list with the new sql-string containing the temporal filtering will be returned under $sql ('order by' parts will be removed) and a printable sql containing the commands to create the view needed to run the sql under $sql_extended. If system_type is 'csv', the filtered data.table will be returned.
Internal function to generate the results of the 'Atemporal Plausibility' checks.
atemp_plausi_results(rv, atemp_vars, mdr, headless = FALSE)
atemp_plausi_results(rv, atemp_vars, mdr, headless = FALSE)
rv |
A list object. Internal list simulating Shiny's 'reactive values'. |
atemp_vars |
These are the atemporal variables |
mdr |
A data.table object containing the MDR. |
headless |
A boolean (default: FALSE). Indicating, if the function is run only in the console (headless = TRUE) or on a GUI frontend (headless = FALSE). |
A list with one entry for each atemporal plausibility check containing the results. Each entry contains the following (nested) list items:
A nested list with the description of the plausibility check for the source data system and the target data system.
A nested list with the frequency count results for the source data system and the target data system.
A nested list with the plausibility check results for the source data system and the target data system.
# runtime ~ 5 sec. utils_path <- system.file( "demo_data/utilities/", package = "DQAstats" ) mdr_filename <- "mdr_example_data.csv" rv <- list() rv$mdr <- read_mdr( utils_path = utils_path, mdr_filename <- mdr_filename ) source_system_name <- "exampleCSV_source" target_system_name <- "exampleCSV_target" rv <- c(rv, create_helper_vars( mdr = rv$mdr, source_db = source_system_name, target_db = target_system_name )) # save source/target vars rv$source$system_name <- source_system_name rv$target$system_name <- target_system_name rv$source$system_type <- "csv" rv$target$system_type <- "csv" rv$log$logfile_dir <- tempdir() # set headless (without GUI, progressbars, etc.) rv$headless <- TRUE # set configs demo_files <- system.file("demo_data", package = "DQAstats") Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files) Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files) # get configs rv$source$settings <- DIZutils::get_config_env( system_name = rv$source$system_name, logfile_dir = rv$log$logfile_dir, headless = rv$headless ) rv$target$settings <- DIZutils::get_config_env( system_name = tolower(rv$target$system_name), logfile_dir = rv$log$logfile_dir, headless = rv$headless ) # set start_time (e.g. 
when clicking the 'Load Data'-button in shiny rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET") # define restricting date rv$restricting_date$use_it <- FALSE # load source data tempdat <- data_loading( rv = rv, system = rv$source, keys_to_test = rv$keys_source ) rv$data_source <- tempdat$outdata # load target data tempdat <- data_loading( rv = rv, system = rv$target, keys_to_test = rv$keys_target ) rv$data_target <- tempdat$outdata rv$data_plausibility$atemporal <- get_atemp_plausis( rv = rv, atemp_vars = rv$pl$atemp_vars, mdr = rv$mdr, headless = rv$headless ) # add the plausibility raw data to data_target and data_source for (i in names(rv$data_plausibility$atemporal)) { for (k in c("source_data", "target_data")) { w <- gsub("_data", "", k) raw_data <- paste0("data_", w) rv[[raw_data]][[i]] <- rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL } gc() } # calculate descriptive results rv$results_descriptive <- descriptive_results( rv = rv, headless = rv$headless ) # calculate atemporal plausibilites atemp_plausi_results( rv = rv, atemp_vars = rv$data_plausibility$atemporal, mdr = rv$mdr, headless = rv$headless )
# runtime ~ 5 sec. utils_path <- system.file( "demo_data/utilities/", package = "DQAstats" ) mdr_filename <- "mdr_example_data.csv" rv <- list() rv$mdr <- read_mdr( utils_path = utils_path, mdr_filename <- mdr_filename ) source_system_name <- "exampleCSV_source" target_system_name <- "exampleCSV_target" rv <- c(rv, create_helper_vars( mdr = rv$mdr, source_db = source_system_name, target_db = target_system_name )) # save source/target vars rv$source$system_name <- source_system_name rv$target$system_name <- target_system_name rv$source$system_type <- "csv" rv$target$system_type <- "csv" rv$log$logfile_dir <- tempdir() # set headless (without GUI, progressbars, etc.) rv$headless <- TRUE # set configs demo_files <- system.file("demo_data", package = "DQAstats") Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files) Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files) # get configs rv$source$settings <- DIZutils::get_config_env( system_name = rv$source$system_name, logfile_dir = rv$log$logfile_dir, headless = rv$headless ) rv$target$settings <- DIZutils::get_config_env( system_name = tolower(rv$target$system_name), logfile_dir = rv$log$logfile_dir, headless = rv$headless ) # set start_time (e.g. 
when clicking the 'Load Data'-button in shiny rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET") # define restricting date rv$restricting_date$use_it <- FALSE # load source data tempdat <- data_loading( rv = rv, system = rv$source, keys_to_test = rv$keys_source ) rv$data_source <- tempdat$outdata # load target data tempdat <- data_loading( rv = rv, system = rv$target, keys_to_test = rv$keys_target ) rv$data_target <- tempdat$outdata rv$data_plausibility$atemporal <- get_atemp_plausis( rv = rv, atemp_vars = rv$pl$atemp_vars, mdr = rv$mdr, headless = rv$headless ) # add the plausibility raw data to data_target and data_source for (i in names(rv$data_plausibility$atemporal)) { for (k in c("source_data", "target_data")) { w <- gsub("_data", "", k) raw_data <- paste0("data_", w) rv[[raw_data]][[i]] <- rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL } gc() } # calculate descriptive results rv$results_descriptive <- descriptive_results( rv = rv, headless = rv$headless ) # calculate atemporal plausibilites atemp_plausi_results( rv = rv, atemp_vars = rv$data_plausibility$atemporal, mdr = rv$mdr, headless = rv$headless )
Internal function to check if for every input table there is
one single (or empty) column where to apply the time restriction to.
If the input is valid, it will just print a success-message; if the
data is invalid, the function will call stop().
check_date_restriction_requirements( mdr, system_names, logfile_dir, headless = TRUE, enable_stop = TRUE )
check_date_restriction_requirements( mdr, system_names, logfile_dir, headless = TRUE, enable_stop = TRUE )
mdr |
The mdr as data.table |
system_names |
(String) The name of the systems (source and target) to check for possible date restriction in the mdr. |
logfile_dir |
The absolute path to the folder where the logfile
will be stored (e.g. tempdir()). |
headless |
(Boolean) Is this a console application? Otherwise
(if FALSE) the function is run on a GUI frontend. |
enable_stop |
(Boolean, default = TRUE) If true (default) this function
will call stop() if the date restriction requirements are not met. |
TRUE/FALSE: TRUE if the check was successful and the given systems can be time filtered, FALSE if something went wrong and no time filtering is possible.
A boolean to indicate if the date restriction requirements are met (TRUE) or not (FALSE).
utils_path <- system.file( "demo_data/utilities/", package = "DQAstats" ) mdr_filename <- "mdr_example_data.csv" mdr <- read_mdr( utils_path = utils_path, mdr_filename = mdr_filename ) source_system_name <- "exampleCSV_source" target_system_name <- "exampleCSV_target" DIZtools::cleanup_old_logfile(logfile_dir = tempdir()) check_date_restriction_requirements( mdr = mdr, system_names = c(source_system_name, target_system_name), logfile_dir = tempdir(), headless = TRUE, enable_stop = TRUE )
utils_path <- system.file( "demo_data/utilities/", package = "DQAstats" ) mdr_filename <- "mdr_example_data.csv" mdr <- read_mdr( utils_path = utils_path, mdr_filename = mdr_filename ) source_system_name <- "exampleCSV_source" target_system_name <- "exampleCSV_target" DIZtools::cleanup_old_logfile(logfile_dir = tempdir()) check_date_restriction_requirements( mdr = mdr, system_names = c(source_system_name, target_system_name), logfile_dir = tempdir(), headless = TRUE, enable_stop = TRUE )
Internal function to perform missing analysis.
completeness(results, headless = FALSE, logfile_dir)
completeness(results, headless = FALSE, logfile_dir)
results |
A list object. The list should contain the results of either 'rv$results_descriptive' or 'rv$results_plausibility_atemporal'. |
headless |
A boolean (default: FALSE). Indicating, if the function is run only in the console (headless = TRUE) or on a GUI frontend (headless = FALSE). |
logfile_dir |
The absolute path to the folder where the logfile
will be stored (e.g. tempdir()). |
A data.table with the absolute and relative counts of missing values (results of the completeness checks) for each dataelement for the source data system and the target data system.
# runtime ~ 5 sec. utils_path <- system.file( "demo_data/utilities/", package = "DQAstats" ) mdr_filename <- "mdr_example_data.csv" rv <- list() rv$mdr <- read_mdr( utils_path = utils_path, mdr_filename <- mdr_filename ) source_system_name <- "exampleCSV_source" target_system_name <- "exampleCSV_target" rv <- c(rv, create_helper_vars( mdr = rv$mdr, source_db = source_system_name, target_db = target_system_name )) # save source/target vars rv$source$system_name <- source_system_name rv$target$system_name <- target_system_name rv$source$system_type <- "csv" rv$target$system_type <- "csv" rv$log$logfile_dir <- tempdir() # set headless (without GUI, progressbars, etc.) rv$headless <- TRUE # set configs demo_files <- system.file("demo_data", package = "DQAstats") Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files) Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files) # get configs rv$source$settings <- DIZutils::get_config_env( system_name = rv$source$system_name, logfile_dir = rv$log$logfile_dir, headless = rv$headless ) rv$target$settings <- DIZutils::get_config_env( system_name = tolower(rv$target$system_name), logfile_dir = rv$log$logfile_dir, headless = rv$headless ) # set start_time (e.g. 
when clicking the 'Load Data'-button in shiny rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET") # define restricting date rv$restricting_date$use_it <- FALSE # load source data tempdat <- data_loading( rv = rv, system = rv$source, keys_to_test = rv$keys_source ) rv$data_source <- tempdat$outdata # load target data tempdat <- data_loading( rv = rv, system = rv$target, keys_to_test = rv$keys_target ) rv$data_target <- tempdat$outdata rv$data_plausibility$atemporal <- get_atemp_plausis( rv = rv, atemp_vars = rv$pl$atemp_vars, mdr = rv$mdr, headless = rv$headless ) # add the plausibility raw data to data_target and data_source for (i in names(rv$data_plausibility$atemporal)) { for (k in c("source_data", "target_data")) { w <- gsub("_data", "", k) raw_data <- paste0("data_", w) rv[[raw_data]][[i]] <- rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL } gc() } # calculate descriptive results rv$results_descriptive <- descriptive_results( rv = rv, headless = rv$headless ) completeness( results = rv$results_descriptive, headless = rv$headless, logfile_dir = rv$log$logfile_dir )
# runtime ~ 5 sec. utils_path <- system.file( "demo_data/utilities/", package = "DQAstats" ) mdr_filename <- "mdr_example_data.csv" rv <- list() rv$mdr <- read_mdr( utils_path = utils_path, mdr_filename <- mdr_filename ) source_system_name <- "exampleCSV_source" target_system_name <- "exampleCSV_target" rv <- c(rv, create_helper_vars( mdr = rv$mdr, source_db = source_system_name, target_db = target_system_name )) # save source/target vars rv$source$system_name <- source_system_name rv$target$system_name <- target_system_name rv$source$system_type <- "csv" rv$target$system_type <- "csv" rv$log$logfile_dir <- tempdir() # set headless (without GUI, progressbars, etc.) rv$headless <- TRUE # set configs demo_files <- system.file("demo_data", package = "DQAstats") Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files) Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files) # get configs rv$source$settings <- DIZutils::get_config_env( system_name = rv$source$system_name, logfile_dir = rv$log$logfile_dir, headless = rv$headless ) rv$target$settings <- DIZutils::get_config_env( system_name = tolower(rv$target$system_name), logfile_dir = rv$log$logfile_dir, headless = rv$headless ) # set start_time (e.g. 
when clicking the 'Load Data'-button in shiny rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET") # define restricting date rv$restricting_date$use_it <- FALSE # load source data tempdat <- data_loading( rv = rv, system = rv$source, keys_to_test = rv$keys_source ) rv$data_source <- tempdat$outdata # load target data tempdat <- data_loading( rv = rv, system = rv$target, keys_to_test = rv$keys_target ) rv$data_target <- tempdat$outdata rv$data_plausibility$atemporal <- get_atemp_plausis( rv = rv, atemp_vars = rv$pl$atemp_vars, mdr = rv$mdr, headless = rv$headless ) # add the plausibility raw data to data_target and data_source for (i in names(rv$data_plausibility$atemporal)) { for (k in c("source_data", "target_data")) { w <- gsub("_data", "", k) raw_data <- paste0("data_", w) rv[[raw_data]][[i]] <- rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL } gc() } # calculate descriptive results rv$results_descriptive <- descriptive_results( rv = rv, headless = rv$headless ) completeness( results = rv$results_descriptive, headless = rv$headless, logfile_dir = rv$log$logfile_dir )
Internal function to create necessary variables from the meta data repository (MDR).
create_helper_vars(mdr, source_db, target_db)
create_helper_vars(mdr, source_db, target_db)
mdr |
A data.table object containing the MDR. |
source_db |
A character string. The name of the source database. This string must conform to the corresponding config section in the config.yml-file. |
target_db |
A character string. The name of the target database. This string must conform to the corresponding config section in the config.yml-file. |
A list with results from the analysis of the metadata repository (MDR) with the following items:
A character vector with the different values of the 'key' field from the MDR for the source data system.
A character vector with the different values of the 'key' field from the MDR for the target data system.
A data.table with a subset of the MDR for the dataelement entries with the field 'dqa_assessment' = 1.
A mapping list from MDR variable names (MDR field 'designation') to DQA tool internal variable names (MDR field 'variable_name').
A nested list with items regarding the plausibility checks
A data.table with a subset of the MDR with dataelements that are associated with atemporal plausibility checks.
A data.table with a subset of the MDR with dataelements that are associated with uniqueness plausibility checks.
A character vector with further dataelements that are required to perform the atemporal plausibility checks.
A boolean to indicate if all dataelements required to perform the atemporal plausibility checks are available in the dataset.
A character vector with further dataelements that are required to perform the uniqueness plausibility checks.
A boolean to indicate if all dataelements required to perform the uniqueness plausibility checks are available in the dataset.
utils_path <- system.file( "demo_data/utilities/", package = "DQAstats" ) mdr_filename <- "mdr_example_data.csv" mdr <- read_mdr( utils_path = utils_path, mdr_filename = mdr_filename ) source_system_name <- "exampleCSV_source" target_system_name <- "exampleCSV_target" create_helper_vars( mdr = mdr, source_db = source_system_name, target_db = target_system_name )
utils_path <- system.file( "demo_data/utilities/", package = "DQAstats" ) mdr_filename <- "mdr_example_data.csv" mdr <- read_mdr( utils_path = utils_path, mdr_filename = mdr_filename ) source_system_name <- "exampleCSV_source" target_system_name <- "exampleCSV_target" create_helper_vars( mdr = mdr, source_db = source_system_name, target_db = target_system_name )
Internal function to generate the final PDF report.
create_pdf_report(rv = rv, utils_path, outdir = tempdir(), headless = FALSE)
create_pdf_report(rv = rv, utils_path, outdir = tempdir(), headless = FALSE)
rv |
A list object. Internal list simulating Shiny's 'reactive values'. |
utils_path |
A character string. The path to the utils-folder, containing the required app utilities like the MDR and the settings folder. |
outdir |
A character string. The directory to store the resulting
PDF document. Default: tempdir(). |
headless |
A boolean (default: FALSE). Indicating, if the function is run only in the console (headless = TRUE) or on a GUI frontend (headless = FALSE). |
No return value. This function renders the PDF markdown report with
the data quality assessment results and saves it to outdir.
# runtime > 5 sec. utils_path <- system.file( "demo_data/utilities/", package = "DQAstats" ) mdr_filename <- "mdr_example_data.csv" rv <- list() rv$mdr <- read_mdr( utils_path = utils_path, mdr_filename <- mdr_filename ) source_system_name <- "exampleCSV_source" target_system_name <- "exampleCSV_target" rv <- c(rv, create_helper_vars( mdr = rv$mdr, source_db = source_system_name, target_db = target_system_name )) # save source/target vars rv$source$system_name <- source_system_name rv$target$system_name <- target_system_name rv$source$system_type <- "csv" rv$target$system_type <- "csv" rv$log$logfile_dir <- tempdir() # set headless (without GUI, progressbars, etc.) rv$headless <- TRUE # set configs demo_files <- system.file("demo_data", package = "DQAstats") Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files) Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files) # get configs rv$source$settings <- DIZutils::get_config_env( system_name = rv$source$system_name, logfile_dir = rv$log$logfile_dir, headless = rv$headless ) rv$target$settings <- DIZutils::get_config_env( system_name = tolower(rv$target$system_name), logfile_dir = rv$log$logfile_dir, headless = rv$headless ) # set start_time (e.g. 
when clicking the 'Load Data'-button in shiny rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET") # define restricting date rv$restricting_date$use_it <- FALSE # load source data tempdat <- data_loading( rv = rv, system = rv$source, keys_to_test = rv$keys_source ) rv$data_source <- tempdat$outdata # load target data tempdat <- data_loading( rv = rv, system = rv$target, keys_to_test = rv$keys_target ) rv$data_target <- tempdat$outdata rv$data_plausibility$atemporal <- get_atemp_plausis( rv = rv, atemp_vars = rv$pl$atemp_vars, mdr = rv$mdr, headless = rv$headless ) # add the plausibility raw data to data_target and data_source for (i in names(rv$data_plausibility$atemporal)) { for (k in c("source_data", "target_data")) { w <- gsub("_data", "", k) raw_data <- paste0("data_", w) rv[[raw_data]][[i]] <- rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL } gc() } # calculate descriptive results rv$results_descriptive <- descriptive_results( rv = rv, headless = rv$headless ) # calculate unique plausibilites rv$results_plausibility_unique <- uniq_plausi_results( rv = rv, uniq_vars = rv$pl$uniq_vars, mdr = rv$mdr, headless = rv$headless ) create_pdf_report( rv = rv, utils_path = rv$utilspath, outdir = output_dir, headless = rv$headless )
# runtime > 5 sec. utils_path <- system.file( "demo_data/utilities/", package = "DQAstats" ) mdr_filename <- "mdr_example_data.csv" rv <- list() rv$mdr <- read_mdr( utils_path = utils_path, mdr_filename <- mdr_filename ) source_system_name <- "exampleCSV_source" target_system_name <- "exampleCSV_target" rv <- c(rv, create_helper_vars( mdr = rv$mdr, source_db = source_system_name, target_db = target_system_name )) # save source/target vars rv$source$system_name <- source_system_name rv$target$system_name <- target_system_name rv$source$system_type <- "csv" rv$target$system_type <- "csv" rv$log$logfile_dir <- tempdir() # set headless (without GUI, progressbars, etc.) rv$headless <- TRUE # set configs demo_files <- system.file("demo_data", package = "DQAstats") Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files) Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files) # get configs rv$source$settings <- DIZutils::get_config_env( system_name = rv$source$system_name, logfile_dir = rv$log$logfile_dir, headless = rv$headless ) rv$target$settings <- DIZutils::get_config_env( system_name = tolower(rv$target$system_name), logfile_dir = rv$log$logfile_dir, headless = rv$headless ) # set start_time (e.g. 
when clicking the 'Load Data'-button in shiny rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET") # define restricting date rv$restricting_date$use_it <- FALSE # load source data tempdat <- data_loading( rv = rv, system = rv$source, keys_to_test = rv$keys_source ) rv$data_source <- tempdat$outdata # load target data tempdat <- data_loading( rv = rv, system = rv$target, keys_to_test = rv$keys_target ) rv$data_target <- tempdat$outdata rv$data_plausibility$atemporal <- get_atemp_plausis( rv = rv, atemp_vars = rv$pl$atemp_vars, mdr = rv$mdr, headless = rv$headless ) # add the plausibility raw data to data_target and data_source for (i in names(rv$data_plausibility$atemporal)) { for (k in c("source_data", "target_data")) { w <- gsub("_data", "", k) raw_data <- paste0("data_", w) rv[[raw_data]][[i]] <- rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL } gc() } # calculate descriptive results rv$results_descriptive <- descriptive_results( rv = rv, headless = rv$headless ) # calculate unique plausibilites rv$results_plausibility_unique <- uniq_plausi_results( rv = rv, uniq_vars = rv$pl$uniq_vars, mdr = rv$mdr, headless = rv$headless ) create_pdf_report( rv = rv, utils_path = rv$utilspath, outdir = output_dir, headless = rv$headless )
Internal function to load the source and target data
data_loading(rv, system, keys_to_test)
data_loading(rv, system, keys_to_test)
rv |
The complete reactive-value dataset |
system |
The part of the rv-list which should be loaded (e.g. rv$source or rv$target) |
keys_to_test |
A vector containing the names (keys) of the variables to test. |
A list with the fields '$outdata' and if testing an SQL-based database also '$sql_statements'.
utils_path <- system.file( "demo_data/utilities/", package = "DQAstats" ) mdr_filename <- "mdr_example_data.csv" rv <- list() rv$mdr <- read_mdr( utils_path = utils_path, mdr_filename = mdr_filename ) source_system_name <- "exampleCSV_source" target_system_name <- "exampleCSV_target" rv <- c(rv, create_helper_vars( mdr = rv$mdr, source_db = source_system_name, target_db = target_system_name )) # save source/target vars rv$source$system_name <- source_system_name rv$target$system_name <- target_system_name rv$source$system_type <- "csv" rv$target$system_type <- "csv" rv$log$logfile_dir <- tempdir() # set headless (without GUI, progressbars, etc.) rv$headless <- TRUE # set configs demo_files <- system.file("demo_data", package = "DQAstats") Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files) Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files) # get configs rv$source$settings <- DIZutils::get_config_env( system_name = rv$source$system_name, logfile_dir = rv$log$logfile_dir, headless = rv$headless ) rv$target$settings <- DIZutils::get_config_env( system_name = tolower(rv$target$system_name), logfile_dir = rv$log$logfile_dir, headless = rv$headless ) # set start_time (e.g. when clicking the 'Load Data'-button in shiny rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET") # define restricting date rv$restricting_date$use_it <- FALSE data_loading( rv = rv, system = rv$source, keys_to_test = rv$keys_source )
utils_path <- system.file( "demo_data/utilities/", package = "DQAstats" ) mdr_filename <- "mdr_example_data.csv" rv <- list() rv$mdr <- read_mdr( utils_path = utils_path, mdr_filename = mdr_filename ) source_system_name <- "exampleCSV_source" target_system_name <- "exampleCSV_target" rv <- c(rv, create_helper_vars( mdr = rv$mdr, source_db = source_system_name, target_db = target_system_name )) # save source/target vars rv$source$system_name <- source_system_name rv$target$system_name <- target_system_name rv$source$system_type <- "csv" rv$target$system_type <- "csv" rv$log$logfile_dir <- tempdir() # set headless (without GUI, progressbars, etc.) rv$headless <- TRUE # set configs demo_files <- system.file("demo_data", package = "DQAstats") Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files) Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files) # get configs rv$source$settings <- DIZutils::get_config_env( system_name = rv$source$system_name, logfile_dir = rv$log$logfile_dir, headless = rv$headless ) rv$target$settings <- DIZutils::get_config_env( system_name = tolower(rv$target$system_name), logfile_dir = rv$log$logfile_dir, headless = rv$headless ) # set start_time (e.g. when clicking the 'Load Data'-button in shiny rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET") # define restricting date rv$restricting_date$use_it <- FALSE data_loading( rv = rv, system = rv$source, keys_to_test = rv$keys_source )
Internal function to generate the descriptive results.
descriptive_results(rv, headless = FALSE)
descriptive_results(rv, headless = FALSE)
rv |
A list object. Internal list simulating Shiny's 'reactive values'. |
headless |
A boolean (default: FALSE). Indicating, if the function is run only in the console (headless = TRUE) or on a GUI frontend (headless = FALSE). |
A list with one entry for each dataelement containing the results of the descriptive results. Each entry contains the following (nested) list items:
A nested list with the description of the dataelement for the source data system and the target data system.
A nested list with the frequency count results for the source data system and the target data system.
A nested list with the descriptive results for the source data system and the target data system stored as data.table objects.
# runtime ~ 5 sec. utils_path <- system.file( "demo_data/utilities/", package = "DQAstats" ) mdr_filename <- "mdr_example_data.csv" rv <- list() rv$mdr <- read_mdr( utils_path = utils_path, mdr_filename <- mdr_filename ) source_system_name <- "exampleCSV_source" target_system_name <- "exampleCSV_target" rv <- c(rv, create_helper_vars( mdr = rv$mdr, source_db = source_system_name, target_db = target_system_name )) # save source/target vars rv$source$system_name <- source_system_name rv$target$system_name <- target_system_name rv$source$system_type <- "csv" rv$target$system_type <- "csv" rv$log$logfile_dir <- tempdir() # set headless (without GUI, progressbars, etc.) rv$headless <- TRUE # set configs demo_files <- system.file("demo_data", package = "DQAstats") Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files) Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files) # get configs rv$source$settings <- DIZutils::get_config_env( system_name = rv$source$system_name, logfile_dir = rv$log$logfile_dir, headless = rv$headless ) rv$target$settings <- DIZutils::get_config_env( system_name = tolower(rv$target$system_name), logfile_dir = rv$log$logfile_dir, headless = rv$headless ) # set start_time (e.g. 
when clicking the 'Load Data'-button in shiny rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET") # define restricting date rv$restricting_date$use_it <- FALSE # load source data tempdat <- data_loading( rv = rv, system = rv$source, keys_to_test = rv$keys_source ) rv$data_source <- tempdat$outdata # load target data tempdat <- data_loading( rv = rv, system = rv$target, keys_to_test = rv$keys_target ) rv$data_target <- tempdat$outdata rv$data_plausibility$atemporal <- get_atemp_plausis( rv = rv, atemp_vars = rv$pl$atemp_vars, mdr = rv$mdr, headless = rv$headless ) # add the plausibility raw data to data_target and data_source for (i in names(rv$data_plausibility$atemporal)) { for (k in c("source_data", "target_data")) { w <- gsub("_data", "", k) raw_data <- paste0("data_", w) rv[[raw_data]][[i]] <- rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL } gc() } # calculate descriptive results descriptive_results( rv = rv, headless = rv$headless )
# runtime ~ 5 sec. utils_path <- system.file( "demo_data/utilities/", package = "DQAstats" ) mdr_filename <- "mdr_example_data.csv" rv <- list() rv$mdr <- read_mdr( utils_path = utils_path, mdr_filename <- mdr_filename ) source_system_name <- "exampleCSV_source" target_system_name <- "exampleCSV_target" rv <- c(rv, create_helper_vars( mdr = rv$mdr, source_db = source_system_name, target_db = target_system_name )) # save source/target vars rv$source$system_name <- source_system_name rv$target$system_name <- target_system_name rv$source$system_type <- "csv" rv$target$system_type <- "csv" rv$log$logfile_dir <- tempdir() # set headless (without GUI, progressbars, etc.) rv$headless <- TRUE # set configs demo_files <- system.file("demo_data", package = "DQAstats") Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files) Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files) # get configs rv$source$settings <- DIZutils::get_config_env( system_name = rv$source$system_name, logfile_dir = rv$log$logfile_dir, headless = rv$headless ) rv$target$settings <- DIZutils::get_config_env( system_name = tolower(rv$target$system_name), logfile_dir = rv$log$logfile_dir, headless = rv$headless ) # set start_time (e.g. 
when clicking the 'Load Data'-button in shiny rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET") # define restricting date rv$restricting_date$use_it <- FALSE # load source data tempdat <- data_loading( rv = rv, system = rv$source, keys_to_test = rv$keys_source ) rv$data_source <- tempdat$outdata # load target data tempdat <- data_loading( rv = rv, system = rv$target, keys_to_test = rv$keys_target ) rv$data_target <- tempdat$outdata rv$data_plausibility$atemporal <- get_atemp_plausis( rv = rv, atemp_vars = rv$pl$atemp_vars, mdr = rv$mdr, headless = rv$headless ) # add the plausibility raw data to data_target and data_source for (i in names(rv$data_plausibility$atemporal)) { for (k in c("source_data", "target_data")) { w <- gsub("_data", "", k) raw_data <- paste0("data_", w) rv[[raw_data]][[i]] <- rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL } gc() } # calculate descriptive results descriptive_results( rv = rv, headless = rv$headless )
Internal function to calculate differences
difference_checks(results)
difference_checks(results)
results |
A list object. The list should contain the results 'rv$results_descriptive'. |
A list with two data.tables with the differences in total, distinct, valid and missing values of source and target database. In table one, called text, the results are represented as a string containing the absolute difference as well as the percentage. Table two, called percent, contains the percentage as a numeric value.
# Example: set up the demo CSV source/target systems, calculate the
# descriptive results and pass them to difference_checks().
# runtime ~ 5 sec.
utils_path <- system.file("demo_data/utilities/", package = "DQAstats")
mdr_filename <- "mdr_example_data.csv"

rv <- list()
# pass the file name as a named argument (`=`), not as an assignment (`<-`)
rv$mdr <- read_mdr(utils_path = utils_path, mdr_filename = mdr_filename)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

rv <- c(rv, create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
))

# save source/target vars
rv$source$system_name <- source_system_name
rv$target$system_name <- target_system_name
rv$source$system_type <- "csv"
rv$target$system_type <- "csv"

rv$log$logfile_dir <- tempdir()

# set headless (without GUI, progressbars, etc.)
rv$headless <- TRUE

# set configs
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)

# get configs
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# set start_time (e.g. when clicking the 'Load Data'-button in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# define restricting date
rv$restricting_date$use_it <- FALSE

# load source data
tempdat <- data_loading(
  rv = rv, system = rv$source, keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# load target data
tempdat <- data_loading(
  rv = rv, system = rv$target, keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

rv$data_plausibility$atemporal <- get_atemp_plausis(
  rv = rv,
  atemp_vars = rv$pl$atemp_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

# add the plausibility raw data to data_target and data_source
for (i in names(rv$data_plausibility$atemporal)) {
  for (k in c("source_data", "target_data")) {
    w <- gsub("_data", "", k)
    raw_data <- paste0("data_", w)
    rv[[raw_data]][[i]] <-
      rv$data_plausibility$atemporal[[i]][[k]][[raw_data]]
    # drop the copy that was just moved into rv$data_source / rv$data_target
    rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL
  }
  gc()
}

# calculate descriptive results
rv$results_descriptive <- descriptive_results(
  rv = rv, headless = rv$headless
)

difference_checks(results = rv$results_descriptive)
# Example: set up the demo CSV source/target systems, calculate the
# descriptive results and pass them to difference_checks().
# runtime ~ 5 sec.
utils_path <- system.file("demo_data/utilities/", package = "DQAstats")
mdr_filename <- "mdr_example_data.csv"

rv <- list()
# pass the file name as a named argument (`=`), not as an assignment (`<-`)
rv$mdr <- read_mdr(utils_path = utils_path, mdr_filename = mdr_filename)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

rv <- c(rv, create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
))

# save source/target vars
rv$source$system_name <- source_system_name
rv$target$system_name <- target_system_name
rv$source$system_type <- "csv"
rv$target$system_type <- "csv"

rv$log$logfile_dir <- tempdir()

# set headless (without GUI, progressbars, etc.)
rv$headless <- TRUE

# set configs
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)

# get configs
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# set start_time (e.g. when clicking the 'Load Data'-button in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# define restricting date
rv$restricting_date$use_it <- FALSE

# load source data
tempdat <- data_loading(
  rv = rv, system = rv$source, keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# load target data
tempdat <- data_loading(
  rv = rv, system = rv$target, keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

rv$data_plausibility$atemporal <- get_atemp_plausis(
  rv = rv,
  atemp_vars = rv$pl$atemp_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

# add the plausibility raw data to data_target and data_source
for (i in names(rv$data_plausibility$atemporal)) {
  for (k in c("source_data", "target_data")) {
    w <- gsub("_data", "", k)
    raw_data <- paste0("data_", w)
    rv[[raw_data]][[i]] <-
      rv$data_plausibility$atemporal[[i]][[k]][[raw_data]]
    # drop the copy that was just moved into rv$data_source / rv$data_target
    rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL
  }
  gc()
}

# calculate descriptive results
rv$results_descriptive <- descriptive_results(
  rv = rv, headless = rv$headless
)

difference_checks(results = rv$results_descriptive)
This function performs a data quality assessment (DQA) of electronic health records (EHR).
dqa( source_system_name, target_system_name, utils_path, mdr_filename = "mdr.csv", output_dir = paste0(tempdir(), "/output/"), logfile_dir = tempdir(), parallel = FALSE, ncores = 2, restricting_date_start = NULL, restricting_date_end = NULL, restricting_date_format = NULL )
dqa( source_system_name, target_system_name, utils_path, mdr_filename = "mdr.csv", output_dir = paste0(tempdir(), "/output/"), logfile_dir = tempdir(), parallel = FALSE, ncores = 2, restricting_date_start = NULL, restricting_date_end = NULL, restricting_date_format = NULL )
source_system_name |
A character string. The name of the source-system, e.g. "P21" or "i2b2". This name must be identical and unique to one entry in the settings-yml file. |
target_system_name |
Optional. A character string or null. The name of the target-system, e.g. "P21" or "i2b2". This name must be identical and unique to one entry in the config-yml file or null. If the argument is empty, the source will be processed as standalone on its own. |
utils_path |
A character string. The path to the utils-folder, containing the required app utilities like the MDR and the settings folder. |
mdr_filename |
A character string. The filename of the MDR e.g. "mdr_example_data.csv". |
output_dir |
The path to the output folder where all the results will
be stored (default: `paste0(tempdir(), "/output/")`). |
logfile_dir |
The absolute path to the folder where the logfile
will be stored (default: `tempdir()`). |
parallel |
A boolean. If TRUE, initializing a |
ncores |
An integer. The number of cores to use. Caution: you would probably like to choose a low number when operating on large datasets. Default: 2. |
restricting_date_start |
The date as the lower limit against which
the data to be analyzed will be filtered. Your input must be able to be
recognized as a date by |
restricting_date_end |
The date as the upper limit against which
the data to be analyzed will be filtered. Your input must be able to be
recognized as a date by |
restricting_date_format |
The format in which the input data is stored.
See |
This function is a wrapper around all helper functions in DQAstats
to perform the data quality assessment. The results are summarized in a
PDF report which is saved to `output_dir`. The return value of this function is
a nested list that contains all results as R objects.
# Example: run the complete data quality assessment on the demo CSV data
# that ships with the package.
# runtime > 5 sec.

# Point both demo systems at the packaged demo data:
Sys.setenv(
  "EXAMPLECSV_SOURCE_PATH" = system.file("demo_data", package = "DQAstats"),
  "EXAMPLECSV_TARGET_PATH" = system.file("demo_data", package = "DQAstats")
)

# Set path to utilities folder where to find the mdr and template files:
utils_path <- system.file("demo_data/utilities", package = "DQAstats")

# Execute the DQA and generate a PDF report:
results <- DQAstats::dqa(
  source_system_name = "exampleCSV_source",
  target_system_name = "exampleCSV_target",
  utils_path = utils_path,
  mdr_filename = "mdr_example_data.csv",
  output_dir = paste0(tempdir(), "/output/"),
  parallel = FALSE
)
# Example: run the complete data quality assessment on the demo CSV data
# that ships with the package.
# runtime > 5 sec.

# Point both demo systems at the packaged demo data:
Sys.setenv(
  "EXAMPLECSV_SOURCE_PATH" = system.file("demo_data", package = "DQAstats"),
  "EXAMPLECSV_TARGET_PATH" = system.file("demo_data", package = "DQAstats")
)

# Set path to utilities folder where to find the mdr and template files:
utils_path <- system.file("demo_data/utilities", package = "DQAstats")

# Execute the DQA and generate a PDF report:
results <- DQAstats::dqa(
  source_system_name = "exampleCSV_source",
  target_system_name = "exampleCSV_target",
  utils_path = utils_path,
  mdr_filename = "mdr_example_data.csv",
  output_dir = paste0(tempdir(), "/output/"),
  parallel = FALSE
)
Internal function to perform etl conformance checks.
etl_checks(results)
etl_checks(results)
results |
A list object. The list should contain the results 'rv$results_descriptive'. |
A data.table with the automated comparison of the counts of valid, missing, and distinct values between the source data system and the target data system.
# Example: set up the demo CSV source/target systems, calculate the
# descriptive results and pass them to etl_checks().
# runtime ~ 5 sec.
utils_path <- system.file("demo_data/utilities/", package = "DQAstats")
mdr_filename <- "mdr_example_data.csv"

rv <- list()
# pass the file name as a named argument (`=`), not as an assignment (`<-`)
rv$mdr <- read_mdr(utils_path = utils_path, mdr_filename = mdr_filename)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

rv <- c(rv, create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
))

# save source/target vars
rv$source$system_name <- source_system_name
rv$target$system_name <- target_system_name
rv$source$system_type <- "csv"
rv$target$system_type <- "csv"

rv$log$logfile_dir <- tempdir()

# set headless (without GUI, progressbars, etc.)
rv$headless <- TRUE

# set configs
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)

# get configs
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# set start_time (e.g. when clicking the 'Load Data'-button in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# define restricting date
rv$restricting_date$use_it <- FALSE

# load source data
tempdat <- data_loading(
  rv = rv, system = rv$source, keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# load target data
tempdat <- data_loading(
  rv = rv, system = rv$target, keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

rv$data_plausibility$atemporal <- get_atemp_plausis(
  rv = rv,
  atemp_vars = rv$pl$atemp_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

# add the plausibility raw data to data_target and data_source
for (i in names(rv$data_plausibility$atemporal)) {
  for (k in c("source_data", "target_data")) {
    w <- gsub("_data", "", k)
    raw_data <- paste0("data_", w)
    rv[[raw_data]][[i]] <-
      rv$data_plausibility$atemporal[[i]][[k]][[raw_data]]
    # drop the copy that was just moved into rv$data_source / rv$data_target
    rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL
  }
  gc()
}

# calculate descriptive results
rv$results_descriptive <- descriptive_results(
  rv = rv, headless = rv$headless
)

etl_checks(results = rv$results_descriptive)
# Example: set up the demo CSV source/target systems, calculate the
# descriptive results and pass them to etl_checks().
# runtime ~ 5 sec.
utils_path <- system.file("demo_data/utilities/", package = "DQAstats")
mdr_filename <- "mdr_example_data.csv"

rv <- list()
# pass the file name as a named argument (`=`), not as an assignment (`<-`)
rv$mdr <- read_mdr(utils_path = utils_path, mdr_filename = mdr_filename)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

rv <- c(rv, create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
))

# save source/target vars
rv$source$system_name <- source_system_name
rv$target$system_name <- target_system_name
rv$source$system_type <- "csv"
rv$target$system_type <- "csv"

rv$log$logfile_dir <- tempdir()

# set headless (without GUI, progressbars, etc.)
rv$headless <- TRUE

# set configs
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)

# get configs
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# set start_time (e.g. when clicking the 'Load Data'-button in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# define restricting date
rv$restricting_date$use_it <- FALSE

# load source data
tempdat <- data_loading(
  rv = rv, system = rv$source, keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# load target data
tempdat <- data_loading(
  rv = rv, system = rv$target, keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

rv$data_plausibility$atemporal <- get_atemp_plausis(
  rv = rv,
  atemp_vars = rv$pl$atemp_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

# add the plausibility raw data to data_target and data_source
for (i in names(rv$data_plausibility$atemporal)) {
  for (k in c("source_data", "target_data")) {
    w <- gsub("_data", "", k)
    raw_data <- paste0("data_", w)
    rv[[raw_data]][[i]] <-
      rv$data_plausibility$atemporal[[i]][[k]][[raw_data]]
    # drop the copy that was just moved into rv$data_source / rv$data_target
    rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL
  }
  gc()
}

# calculate descriptive results
rv$results_descriptive <- descriptive_results(
  rv = rv, headless = rv$headless
)

etl_checks(results = rv$results_descriptive)
This function exports aggregated results in csv files that are added to a zip archive.
export_aggregated(output_dir, rv)
export_aggregated(output_dir, rv)
output_dir |
The path to the output folder where all the results will
be stored (default: |
rv |
A list object. Internal list simulating Shiny's 'reactive values'. |
No return value. This function writes the aggregated results, namely
the conformance results overview table, the plausibility check results
overview, the completeness results overview and a combined version (named
'all_results.csv') to csv files. These files are saved in
{output_dir}/export
.
# Example: set up the demo CSV source/target systems, compute the results
# needed for the aggregated export and write them with export_aggregated().
# runtime > 5 sec.
utils_path <- system.file("demo_data/utilities/", package = "DQAstats")
mdr_filename <- "mdr_example_data.csv"

rv <- list()
# pass the file name as a named argument (`=`), not as an assignment (`<-`)
rv$mdr <- read_mdr(utils_path = utils_path, mdr_filename = mdr_filename)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

rv <- c(rv, create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
))

# save source/target vars
rv$source$system_name <- source_system_name
rv$target$system_name <- target_system_name
rv$source$system_type <- "csv"
rv$target$system_type <- "csv"

rv$log$logfile_dir <- tempdir()

# set headless (without GUI, progressbars, etc.)
rv$headless <- TRUE

# set configs
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)

# get configs
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# set start_time (e.g. when clicking the 'Load Data'-button in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# define restricting date
rv$restricting_date$use_it <- FALSE

# load source data
tempdat <- data_loading(
  rv = rv, system = rv$source, keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# load target data
tempdat <- data_loading(
  rv = rv, system = rv$target, keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

rv$data_plausibility$atemporal <- get_atemp_plausis(
  rv = rv,
  atemp_vars = rv$pl$atemp_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

# add the plausibility raw data to data_target and data_source
for (i in names(rv$data_plausibility$atemporal)) {
  for (k in c("source_data", "target_data")) {
    w <- gsub("_data", "", k)
    raw_data <- paste0("data_", w)
    rv[[raw_data]][[i]] <-
      rv$data_plausibility$atemporal[[i]][[k]][[raw_data]]
    # drop the copy that was just moved into rv$data_source / rv$data_target
    rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL
  }
  gc()
}

# calculate descriptive results
rv$results_descriptive <- descriptive_results(
  rv = rv, headless = rv$headless
)

# completeness
rv$completeness <- completeness(
  results = rv$results_descriptive,
  headless = rv$headless,
  logfile_dir = rv$log$logfile_dir
)

rv$datamap <- generate_datamap(
  results = rv$results_descriptive,
  db = rv$target$system_name,
  mdr = rv$mdr,
  rv = rv,
  headless = rv$headless
)

# checks$value_conformance
# NOTE(review): rv$conformance$value_conformance is not assigned anywhere in
# this script -- confirm that value_conformance_checks() accepts this input.
rv$checks$value_conformance <- value_conformance_checks(
  results = rv$conformance$value_conformance
)

# checks$etl
rv$checks$etl <- etl_checks(results = rv$results_descriptive)

output_dir <- tempdir()
export_aggregated(output_dir = output_dir, rv = rv)
# Example: set up the demo CSV source/target systems, compute the results
# needed for the aggregated export and write them with export_aggregated().
# runtime > 5 sec.
utils_path <- system.file("demo_data/utilities/", package = "DQAstats")
mdr_filename <- "mdr_example_data.csv"

rv <- list()
# pass the file name as a named argument (`=`), not as an assignment (`<-`)
rv$mdr <- read_mdr(utils_path = utils_path, mdr_filename = mdr_filename)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

rv <- c(rv, create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
))

# save source/target vars
rv$source$system_name <- source_system_name
rv$target$system_name <- target_system_name
rv$source$system_type <- "csv"
rv$target$system_type <- "csv"

rv$log$logfile_dir <- tempdir()

# set headless (without GUI, progressbars, etc.)
rv$headless <- TRUE

# set configs
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)

# get configs
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# set start_time (e.g. when clicking the 'Load Data'-button in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# define restricting date
rv$restricting_date$use_it <- FALSE

# load source data
tempdat <- data_loading(
  rv = rv, system = rv$source, keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# load target data
tempdat <- data_loading(
  rv = rv, system = rv$target, keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

rv$data_plausibility$atemporal <- get_atemp_plausis(
  rv = rv,
  atemp_vars = rv$pl$atemp_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

# add the plausibility raw data to data_target and data_source
for (i in names(rv$data_plausibility$atemporal)) {
  for (k in c("source_data", "target_data")) {
    w <- gsub("_data", "", k)
    raw_data <- paste0("data_", w)
    rv[[raw_data]][[i]] <-
      rv$data_plausibility$atemporal[[i]][[k]][[raw_data]]
    # drop the copy that was just moved into rv$data_source / rv$data_target
    rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL
  }
  gc()
}

# calculate descriptive results
rv$results_descriptive <- descriptive_results(
  rv = rv, headless = rv$headless
)

# completeness
rv$completeness <- completeness(
  results = rv$results_descriptive,
  headless = rv$headless,
  logfile_dir = rv$log$logfile_dir
)

rv$datamap <- generate_datamap(
  results = rv$results_descriptive,
  db = rv$target$system_name,
  mdr = rv$mdr,
  rv = rv,
  headless = rv$headless
)

# checks$value_conformance
# NOTE(review): rv$conformance$value_conformance is not assigned anywhere in
# this script -- confirm that value_conformance_checks() accepts this input.
rv$checks$value_conformance <- value_conformance_checks(
  results = rv$conformance$value_conformance
)

# checks$etl
rv$checks$etl <- etl_checks(results = rv$results_descriptive)

output_dir <- tempdir()
export_aggregated(output_dir = output_dir, rv = rv)
Internal function to format the value conformance results
format_value_conformance_results(results, desc_out, source)
format_value_conformance_results(results, desc_out, source)
results |
A list containing the value conformance results for one data element. |
desc_out |
A list containing the descriptive results for the same data element. |
source |
A character: either "source_data" or "target_data". |
The function returns a list with the formatted value conformance results for one data element.
# Example: set up the demo CSV source/target systems, calculate the value
# conformance and format it per data element with
# format_value_conformance_results().
# runtime ~ 5 sec.
utils_path <- system.file("demo_data/utilities/", package = "DQAstats")
mdr_filename <- "mdr_example_data.csv"

rv <- list()
# pass the file name as a named argument (`=`), not as an assignment (`<-`)
rv$mdr <- read_mdr(utils_path = utils_path, mdr_filename = mdr_filename)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

rv <- c(rv, create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
))

# save source/target vars
rv$source$system_name <- source_system_name
rv$target$system_name <- target_system_name
rv$source$system_type <- "csv"
rv$target$system_type <- "csv"

rv$log$logfile_dir <- tempdir()

# set headless (without GUI, progressbars, etc.)
rv$headless <- TRUE

# set configs
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)

# get configs
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# set start_time (e.g. when clicking the 'Load Data'-button in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# define restricting date
rv$restricting_date$use_it <- FALSE

# load source data
tempdat <- data_loading(
  rv = rv, system = rv$source, keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# load target data
tempdat <- data_loading(
  rv = rv, system = rv$target, keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

rv$data_plausibility$atemporal <- get_atemp_plausis(
  rv = rv,
  atemp_vars = rv$pl$atemp_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

# add the plausibility raw data to data_target and data_source
for (i in names(rv$data_plausibility$atemporal)) {
  for (k in c("source_data", "target_data")) {
    w <- gsub("_data", "", k)
    raw_data <- paste0("data_", w)
    rv[[raw_data]][[i]] <-
      rv$data_plausibility$atemporal[[i]][[k]][[raw_data]]
    # drop the copy that was just moved into rv$data_source / rv$data_target
    rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL
  }
  gc()
}

# calculate descriptive results
rv$results_descriptive <- descriptive_results(
  rv = rv, headless = rv$headless
)

# calculate atemporal plausibilities
rv$results_plausibility_atemporal <- atemp_plausi_results(
  rv = rv,
  atemp_vars = rv$data_plausibility$atemporal,
  mdr = rv$mdr,
  headless = rv$headless
)

# calculate unique plausibilities
rv$results_plausibility_unique <- uniq_plausi_results(
  rv = rv,
  uniq_vars = rv$pl$uniq_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

rv$conformance$value_conformance <- value_conformance(
  rv = rv,
  scope = "descriptive",
  results = rv$results_descriptive,
  headless = rv$headless,
  logfile_dir = rv$log$logfile_dir
)

# format the results (wrap functioncall into `sapply` to get results for all
# available data elements):
value_conformance_formatted <- sapply(
  X = names(rv$results_descriptive),
  FUN = function(i) {
    desc_out <- rv$results_descriptive[[i]]$description
    # only elements that have value conformance results are formatted
    if (!is.null(rv$conformance$value_conformance[[i]])) {
      format_value_conformance_results(
        results = rv$conformance$value_conformance[[i]],
        desc_out = desc_out,
        source = "source_data"
      )
    }
  }
)
# Example: set up the demo CSV source/target systems, calculate the value
# conformance and format it per data element with
# format_value_conformance_results().
# runtime ~ 5 sec.
utils_path <- system.file("demo_data/utilities/", package = "DQAstats")
mdr_filename <- "mdr_example_data.csv"

rv <- list()
# pass the file name as a named argument (`=`), not as an assignment (`<-`)
rv$mdr <- read_mdr(utils_path = utils_path, mdr_filename = mdr_filename)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

rv <- c(rv, create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
))

# save source/target vars
rv$source$system_name <- source_system_name
rv$target$system_name <- target_system_name
rv$source$system_type <- "csv"
rv$target$system_type <- "csv"

rv$log$logfile_dir <- tempdir()

# set headless (without GUI, progressbars, etc.)
rv$headless <- TRUE

# set configs
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)

# get configs
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# set start_time (e.g. when clicking the 'Load Data'-button in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# define restricting date
rv$restricting_date$use_it <- FALSE

# load source data
tempdat <- data_loading(
  rv = rv, system = rv$source, keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# load target data
tempdat <- data_loading(
  rv = rv, system = rv$target, keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

rv$data_plausibility$atemporal <- get_atemp_plausis(
  rv = rv,
  atemp_vars = rv$pl$atemp_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

# add the plausibility raw data to data_target and data_source
for (i in names(rv$data_plausibility$atemporal)) {
  for (k in c("source_data", "target_data")) {
    w <- gsub("_data", "", k)
    raw_data <- paste0("data_", w)
    rv[[raw_data]][[i]] <-
      rv$data_plausibility$atemporal[[i]][[k]][[raw_data]]
    # drop the copy that was just moved into rv$data_source / rv$data_target
    rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL
  }
  gc()
}

# calculate descriptive results
rv$results_descriptive <- descriptive_results(
  rv = rv, headless = rv$headless
)

# calculate atemporal plausibilities
rv$results_plausibility_atemporal <- atemp_plausi_results(
  rv = rv,
  atemp_vars = rv$data_plausibility$atemporal,
  mdr = rv$mdr,
  headless = rv$headless
)

# calculate unique plausibilities
rv$results_plausibility_unique <- uniq_plausi_results(
  rv = rv,
  uniq_vars = rv$pl$uniq_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

rv$conformance$value_conformance <- value_conformance(
  rv = rv,
  scope = "descriptive",
  results = rv$results_descriptive,
  headless = rv$headless,
  logfile_dir = rv$log$logfile_dir
)

# format the results (wrap functioncall into `sapply` to get results for all
# available data elements):
value_conformance_formatted <- sapply(
  X = names(rv$results_descriptive),
  FUN = function(i) {
    desc_out <- rv$results_descriptive[[i]]$description
    # only elements that have value conformance results are formatted
    if (!is.null(rv$conformance$value_conformance[[i]])) {
      format_value_conformance_results(
        results = rv$conformance$value_conformance[[i]],
        desc_out = desc_out,
        source = "source_data"
      )
    }
  }
)
Internal function to generate the dashboard data maps
generate_datamap(results, mdr, db, rv, headless = FALSE)
generate_datamap(results, mdr, db, rv, headless = FALSE)
results |
A list object. The list should contain the results 'rv$results_descriptive'. |
mdr |
A data.table object containing the MDR. |
db |
A character string. The name of the corresponding database. |
rv |
A list object. Internal list simulating Shiny's 'reactive values'. |
headless |
A boolean (default: FALSE). Indicating, if the function is run only in the console (headless = TRUE) or on a GUI frontend (headless = FALSE). |
A data.table with the results of the datamap.
# Example: set up the demo CSV source/target systems, calculate the
# descriptive results and generate the data map for the target system.
# runtime > 5 sec.
utils_path <- system.file("demo_data/utilities/", package = "DQAstats")
mdr_filename <- "mdr_example_data.csv"

rv <- list()
# pass the file name as a named argument (`=`), not as an assignment (`<-`)
rv$mdr <- read_mdr(utils_path = utils_path, mdr_filename = mdr_filename)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

rv <- c(rv, create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
))

# save source/target vars
rv$source$system_name <- source_system_name
rv$target$system_name <- target_system_name
rv$source$system_type <- "csv"
rv$target$system_type <- "csv"

rv$log$logfile_dir <- tempdir()

# set headless (without GUI, progressbars, etc.)
rv$headless <- TRUE

# set configs
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)

# get configs
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# set start_time (e.g. when clicking the 'Load Data'-button in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# define restricting date
rv$restricting_date$use_it <- FALSE

# load source data
tempdat <- data_loading(
  rv = rv, system = rv$source, keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# load target data
tempdat <- data_loading(
  rv = rv, system = rv$target, keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

rv$data_plausibility$atemporal <- get_atemp_plausis(
  rv = rv,
  atemp_vars = rv$pl$atemp_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

# add the plausibility raw data to data_target and data_source
for (i in names(rv$data_plausibility$atemporal)) {
  for (k in c("source_data", "target_data")) {
    w <- gsub("_data", "", k)
    raw_data <- paste0("data_", w)
    rv[[raw_data]][[i]] <-
      rv$data_plausibility$atemporal[[i]][[k]][[raw_data]]
    # drop the copy that was just moved into rv$data_source / rv$data_target
    rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL
  }
  gc()
}

# calculate descriptive results
rv$results_descriptive <- descriptive_results(
  rv = rv, headless = rv$headless
)

generate_datamap(
  results = rv$results_descriptive,
  db = rv$target$system_name,
  mdr = rv$mdr,
  rv = rv,
  headless = rv$headless
)
# Example: set up the demo CSV source/target systems, calculate the
# descriptive results and generate the data map for the target system.
# runtime > 5 sec.
utils_path <- system.file("demo_data/utilities/", package = "DQAstats")
mdr_filename <- "mdr_example_data.csv"

rv <- list()
# pass the file name as a named argument (`=`), not as an assignment (`<-`)
rv$mdr <- read_mdr(utils_path = utils_path, mdr_filename = mdr_filename)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

rv <- c(rv, create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
))

# save source/target vars
rv$source$system_name <- source_system_name
rv$target$system_name <- target_system_name
rv$source$system_type <- "csv"
rv$target$system_type <- "csv"

rv$log$logfile_dir <- tempdir()

# set headless (without GUI, progressbars, etc.)
rv$headless <- TRUE

# set configs
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)

# get configs
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# set start_time (e.g. when clicking the 'Load Data'-button in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# define restricting date
rv$restricting_date$use_it <- FALSE

# load source data
tempdat <- data_loading(
  rv = rv, system = rv$source, keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# load target data
tempdat <- data_loading(
  rv = rv, system = rv$target, keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

rv$data_plausibility$atemporal <- get_atemp_plausis(
  rv = rv,
  atemp_vars = rv$pl$atemp_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

# add the plausibility raw data to data_target and data_source
for (i in names(rv$data_plausibility$atemporal)) {
  for (k in c("source_data", "target_data")) {
    w <- gsub("_data", "", k)
    raw_data <- paste0("data_", w)
    rv[[raw_data]][[i]] <-
      rv$data_plausibility$atemporal[[i]][[k]][[raw_data]]
    # drop the copy that was just moved into rv$data_source / rv$data_target
    rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL
  }
  gc()
}

# calculate descriptive results
rv$results_descriptive <- descriptive_results(
  rv = rv, headless = rv$headless
)

generate_datamap(
  results = rv$results_descriptive,
  db = rv$target$system_name,
  mdr = rv$mdr,
  rv = rv,
  headless = rv$headless
)
Internal function to generate raw data for the 'Atemporal Plausibility' checks.
get_atemp_plausis(rv, atemp_vars, mdr, headless = FALSE)
get_atemp_plausis(rv, atemp_vars, mdr, headless = FALSE)
rv |
A list object. Internal list simulating Shiny's 'reactive values'. |
atemp_vars |
A data.table object. The object is created
by `create_helper_vars` (available as `rv$pl$atemp_vars`). |
mdr |
A data.table object containing the MDR. |
headless |
A boolean (default: FALSE). Indicating, if the function is run only in the console (headless = TRUE) or on a GUI frontend (headless = FALSE). |
A list with one entry for each atemporal plausibility check containing the raw results. Each entry contains the following (nested) list items:
`source_data`: A nested list with the raw plausibility check results for the source data system.
`target_data`: A nested list with the raw plausibility check results for the target data system.
# Example: prepare the demo CSV systems, load their data and generate the raw
# data for the atemporal plausibility checks.

# read the MDR from the demo utilities shipped with the package
mdr_filename <- "mdr_example_data.csv"
utils_path <- system.file("demo_data/utilities/", package = "DQAstats")
rv <- list()
rv$mdr <- read_mdr(utils_path = utils_path, mdr_filename = mdr_filename)

# names of the two demo systems
source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

# append the helper variables to rv
helper_vars <- create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
)
rv <- c(rv, helper_vars)

# store name and type of the source and target system
rv$source$system_name <- source_system_name
rv$source$system_type <- "csv"
rv$target$system_name <- target_system_name
rv$target$system_type <- "csv"

# logfile directory
rv$log$logfile_dir <- tempdir()

# headless mode (no GUI, no progress bars)
rv$headless <- TRUE

# set both path environment variables to the demo data folder
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv(
  "EXAMPLECSV_SOURCE_PATH" = demo_files,
  "EXAMPLECSV_TARGET_PATH" = demo_files
)

# get the configs of both systems
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# start time (e.g. the moment the 'Load Data'-button is clicked in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# no restricting date is used
rv$restricting_date$use_it <- FALSE

# source data
tempdat <- data_loading(
  rv = rv,
  system = rv$source,
  keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# target data
tempdat <- data_loading(
  rv = rv,
  system = rv$target,
  keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

# raw data for the atemporal plausibility checks
get_atemp_plausis(
  rv = rv,
  atemp_vars = rv$pl$atemp_vars,
  mdr = rv$mdr,
  headless = rv$headless
)
# Example: prepare the demo CSV systems, load their data and generate the raw
# data for the atemporal plausibility checks.

# read the MDR from the demo utilities shipped with the package
mdr_filename <- "mdr_example_data.csv"
utils_path <- system.file("demo_data/utilities/", package = "DQAstats")
rv <- list()
rv$mdr <- read_mdr(utils_path = utils_path, mdr_filename = mdr_filename)

# names of the two demo systems
source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

# append the helper variables to rv
helper_vars <- create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
)
rv <- c(rv, helper_vars)

# store name and type of the source and target system
rv$source$system_name <- source_system_name
rv$source$system_type <- "csv"
rv$target$system_name <- target_system_name
rv$target$system_type <- "csv"

# logfile directory
rv$log$logfile_dir <- tempdir()

# headless mode (no GUI, no progress bars)
rv$headless <- TRUE

# set both path environment variables to the demo data folder
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv(
  "EXAMPLECSV_SOURCE_PATH" = demo_files,
  "EXAMPLECSV_TARGET_PATH" = demo_files
)

# get the configs of both systems
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# start time (e.g. the moment the 'Load Data'-button is clicked in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# no restricting date is used
rv$restricting_date$use_it <- FALSE

# source data
tempdat <- data_loading(
  rv = rv,
  system = rv$source,
  keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# target data
tempdat <- data_loading(
  rv = rv,
  system = rv$target,
  keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

# raw data for the atemporal plausibility checks
get_atemp_plausis(
  rv = rv,
  atemp_vars = rv$pl$atemp_vars,
  mdr = rv$mdr,
  headless = rv$headless
)
See title.
get_restricting_date_info( restricting_date, lang = "en", date = TRUE, time = TRUE )
get_restricting_date_info( restricting_date, lang = "en", date = TRUE, time = TRUE )
restricting_date |
The list applied from rv$restricting_date |
lang |
Language of the result. "de"/"en" (en = default). If language is not yet implemented, "en" is used. |
date |
Should the date be included in the result string? |
time |
Should the time be included in the result string? |
String containing start and end date obtained from the list of `restricting_date`.
Internal function to determine if a LaTeX installation is available. Used before creating/knitr-ing the PDF report.
is_latex_installed(logfile_dir = NULL, headless = TRUE)
is_latex_installed(logfile_dir = NULL, headless = TRUE)
logfile_dir |
The absolute path to the folder where the logfile will be stored (default: `NULL`). |
headless |
A boolean (default: TRUE). Indicates whether the function is run only in the console (headless = TRUE) or on a GUI frontend (headless = FALSE). |
TRUE if there is a LaTeX installation, FALSE if not.
# Call with the default arguments (logfile_dir = NULL, headless = TRUE).
is_latex_installed()
# Call with the default arguments (logfile_dir = NULL, headless = TRUE).
is_latex_installed()
Internal function to load the data from CSV files.
load_csv(rv, keys_to_test, headless = FALSE, system)
load_csv(rv, keys_to_test, headless = FALSE, system)
rv |
A list object. Internal list simulating Shiny's 'reactive values'. |
keys_to_test |
A vector containing the names (keys) of the variables to test. |
headless |
A boolean (default: FALSE). Indicating, if the function is run only in the console (headless = TRUE) or on a GUI frontend (headless = FALSE). |
system |
The system object rv$system |
A list with data.tables for each unique CSV file as defined in the 'source_system_table' field of the MDR.
Internal function to load the data from SQL databases.
load_database( rv, sql_statements, db_con, keys_to_test, db_name, headless = FALSE, db_type )
load_database( rv, sql_statements, db_con, keys_to_test, db_name, headless = FALSE, db_type )
rv |
A list object. Internal list simulating Shiny's 'reactive values'. |
sql_statements |
The SQL-Statement-object |
db_con |
The connection-socket |
keys_to_test |
A vector containing the names (keys) of the variables to test. |
db_name |
The database name |
headless |
A boolean (default: FALSE). Indicating, if the function is run only in the console (headless = TRUE) or on a GUI frontend (headless = FALSE). |
db_type |
The database type (postgres/oracle) |
A list with a data.table for each data element as defined in the MDR.
Internal function to load the SQL statements.
load_sqls(utils_path, db)
load_sqls(utils_path, db)
utils_path |
A character string. The path to the utils-folder, containing the required app utilities like the MDR and the settings folder. |
db |
A character string. The name of the corresponding database. |
Internal function to initialize the parallel backend.
parallel(parallel, logfile_dir, ncores)
parallel(parallel, logfile_dir, ncores)
parallel |
A boolean. If TRUE, a parallel backend is initialized. |
logfile_dir |
The absolute path to the folder where the logfile will be stored. |
ncores |
An integer. The number of cores to use. Caution: you would probably like to choose a low number when operating on large datasets. Default: 2. |
No return value. Depending on the specified arguments, this function enables a parallel backend for faster computations.
# Call with parallel = FALSE and a single core; the logfile directory is the
# session's temporary directory.
parallel(parallel = FALSE, logfile_dir = tempdir(), ncores = 1)
# Call with parallel = FALSE and a single core; the logfile directory is the
# session's temporary directory.
parallel(parallel = FALSE, logfile_dir = tempdir(), ncores = 1)
Internal function to read the meta data repository (MDR).
read_mdr(utils_path = NULL, mdr_filename = "mdr.csv")
read_mdr(utils_path = NULL, mdr_filename = "mdr.csv")
utils_path |
A character string. The path to the utils-folder, containing the required app utilities like the MDR and the settings folder. |
mdr_filename |
A character string. The filename of your meta data repository (default: 'mdr.csv'). |
A data.table containing the metadata repository which is imported
from the CSV file provided with `{utils_path}/MDR/{mdr_filename}`.
# Example: read the example MDR from the demo utilities folder of the package.
mdr_filename <- "mdr_example_data.csv"
utils_path <- system.file("demo_data/utilities/", package = "DQAstats")

mdr <- read_mdr(utils_path = utils_path, mdr_filename = mdr_filename)
# Example: read the example MDR from the demo utilities folder of the package.
mdr_filename <- "mdr_example_data.csv"
utils_path <- system.file("demo_data/utilities/", package = "DQAstats")

mdr <- read_mdr(utils_path = utils_path, mdr_filename = mdr_filename)
Internal function to reduce categorical variables to a maximum of values to be displayed.
reduce_cat(data, levellimit = 25)
reduce_cat(data, levellimit = 25)
data |
A list object. The object |
levellimit |
An integer value. The number of maximum values to be displayed (default: 25). |
A data.table with the data quality assessment results for categorical
data elements that are reduced to the maximum number of levels specified
with `levellimit`.
# runtime ~ 5 sec.
# Example workflow: load the demo CSV data, compute the descriptive results
# and pass them to reduce_cat().
utils_path <- system.file(
  "demo_data/utilities/",
  package = "DQAstats"
)
mdr_filename <- "mdr_example_data.csv"
rv <- list()
# pass the filename as a named argument (`=`), not via an assignment (`<-`)
rv$mdr <- read_mdr(
  utils_path = utils_path,
  mdr_filename = mdr_filename
)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

rv <- c(rv, create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
))
# save source/target vars
rv$source$system_name <- source_system_name
rv$target$system_name <- target_system_name
rv$source$system_type <- "csv"
rv$target$system_type <- "csv"

rv$log$logfile_dir <- tempdir()

# set headless (without GUI, progressbars, etc.)
rv$headless <- TRUE

# set configs
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)

# get configs
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# set start_time (e.g. when clicking the 'Load Data'-button in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# define restricting date
rv$restricting_date$use_it <- FALSE

# load source data
tempdat <- data_loading(
  rv = rv,
  system = rv$source,
  keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# load target data
tempdat <- data_loading(
  rv = rv,
  system = rv$target,
  keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

rv$data_plausibility$atemporal <- get_atemp_plausis(
  rv = rv,
  atemp_vars = rv$pl$atemp_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

# add the plausibility raw data to data_target and data_source
for (i in names(rv$data_plausibility$atemporal)) {
  for (k in c("source_data", "target_data")) {
    w <- gsub("_data", "", k)
    raw_data <- paste0("data_", w)
    rv[[raw_data]][[i]] <-
      rv$data_plausibility$atemporal[[i]][[k]][[raw_data]]
    # drop the moved raw data from the plausibility list
    rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL
  }
  gc()
}

# calculate descriptive results
rv$results_descriptive <- descriptive_results(
  rv = rv,
  headless = rv$headless
)

reduce_cat(
  data = rv$results_descriptive,
  levellimit = 25
)
# runtime ~ 5 sec.
# Example workflow: load the demo CSV data, compute the descriptive results
# and pass them to reduce_cat().
utils_path <- system.file(
  "demo_data/utilities/",
  package = "DQAstats"
)
mdr_filename <- "mdr_example_data.csv"
rv <- list()
# pass the filename as a named argument (`=`), not via an assignment (`<-`)
rv$mdr <- read_mdr(
  utils_path = utils_path,
  mdr_filename = mdr_filename
)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

rv <- c(rv, create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
))
# save source/target vars
rv$source$system_name <- source_system_name
rv$target$system_name <- target_system_name
rv$source$system_type <- "csv"
rv$target$system_type <- "csv"

rv$log$logfile_dir <- tempdir()

# set headless (without GUI, progressbars, etc.)
rv$headless <- TRUE

# set configs
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)

# get configs
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# set start_time (e.g. when clicking the 'Load Data'-button in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# define restricting date
rv$restricting_date$use_it <- FALSE

# load source data
tempdat <- data_loading(
  rv = rv,
  system = rv$source,
  keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# load target data
tempdat <- data_loading(
  rv = rv,
  system = rv$target,
  keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

rv$data_plausibility$atemporal <- get_atemp_plausis(
  rv = rv,
  atemp_vars = rv$pl$atemp_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

# add the plausibility raw data to data_target and data_source
for (i in names(rv$data_plausibility$atemporal)) {
  for (k in c("source_data", "target_data")) {
    w <- gsub("_data", "", k)
    raw_data <- paste0("data_", w)
    rv[[raw_data]][[i]] <-
      rv$data_plausibility$atemporal[[i]][[k]][[raw_data]]
    # drop the moved raw data from the plausibility list
    rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL
  }
  gc()
}

# calculate descriptive results
rv$results_descriptive <- descriptive_results(
  rv = rv,
  headless = rv$headless
)

reduce_cat(
  data = rv$results_descriptive,
  levellimit = 25
)
Internal function to test whether the CSV files of the source system that are specified in the metadata repository can be found.
test_csv(settings, source_db, mdr, headless = FALSE, logfile_dir)
test_csv(settings, source_db, mdr, headless = FALSE, logfile_dir)
settings |
A list object containing the database settings. |
source_db |
A character string. The name of the source database. This string must be conform with the corresponding config section in the config.yml-file. |
mdr |
A data.table object containing the MDR. |
headless |
A boolean (default: FALSE). Indicating, if the function is run only in the console (headless = TRUE) or on a GUI frontend (headless = FALSE). |
logfile_dir |
The absolute path to the folder where the logfile will be stored. |
A boolean indicating if the CSV files specified in the metadata repository are found in the specified locations.
# Example: read the example MDR, get the settings of the source system and
# run test_csv() on them.
mdr_filename <- "mdr_example_data.csv"
utils_path <- system.file("demo_data/utilities/", package = "DQAstats")
mdr <- read_mdr(utils_path = utils_path, mdr_filename = mdr_filename)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

# clean up an old logfile in the temporary directory
DIZtools::cleanup_old_logfile(logfile_dir = tempdir())

# settings of the source system
settings <- DIZutils::get_config_env(
  system_name = source_system_name,
  logfile_dir = tempdir(),
  headless = TRUE
)

test_csv(
  settings = settings,
  source_db = source_system_name,
  mdr = mdr,
  headless = TRUE,
  logfile_dir = tempdir()
)
# Example: read the example MDR, get the settings of the source system and
# run test_csv() on them.
mdr_filename <- "mdr_example_data.csv"
utils_path <- system.file("demo_data/utilities/", package = "DQAstats")
mdr <- read_mdr(utils_path = utils_path, mdr_filename = mdr_filename)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

# clean up an old logfile in the temporary directory
DIZtools::cleanup_old_logfile(logfile_dir = tempdir())

# settings of the source system
settings <- DIZutils::get_config_env(
  system_name = source_system_name,
  logfile_dir = tempdir(),
  headless = TRUE
)

test_csv(
  settings = settings,
  source_db = source_system_name,
  mdr = mdr,
  headless = TRUE,
  logfile_dir = tempdir()
)
Internal function to calculate differences between source and target based on a timestamp comparison. It can help to identify potential missing resources.
time_compare(rv, logfile_dir, headless = FALSE)
time_compare(rv, logfile_dir, headless = FALSE)
rv |
A list object. Internal list simulating Shiny's 'reactive values'. |
logfile_dir |
The absolute path to the folder where the logfile will be stored. |
headless |
A boolean (default: FALSE). Indicating, if the function is run only in the console (headless = TRUE) or on a GUI frontend (headless = FALSE). |
A list of time-compare results for each analyzed element. For every element, three data frames are available. The first data frame (`result_table`) is an overview table that displays the counts for each timestamp. The other two data frames (`suspect_data_source` and `suspect_data_target`) contain all the data associated with the identified timestamps found in the source or target data.
# runtime ~ 5 sec.
# Example workflow: load the demo CSV data of both systems and run
# time_compare() on them.
utils_path <- system.file(
  "demo_data/utilities/",
  package = "DQAstats"
)
mdr_filename <- "mdr_example_data.csv"
rv <- list()
# pass the filename as a named argument (`=`), not via an assignment (`<-`)
rv$mdr <- read_mdr(
  utils_path = utils_path,
  mdr_filename = mdr_filename
)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

rv <- c(rv, create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
))
# save source/target vars
rv$source$system_name <- source_system_name
rv$target$system_name <- target_system_name
rv$source$system_type <- "csv"
rv$target$system_type <- "csv"

rv$log$logfile_dir <- tempdir()

# set headless (without GUI, progressbars, etc.)
rv$headless <- TRUE

# set configs
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)

# get configs
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# set start_time (e.g. when clicking the 'Load Data'-button in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# define restricting date
rv$restricting_date$use_it <- FALSE

# load source data
tempdat <- data_loading(
  rv = rv,
  system = rv$source,
  keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# load target data
tempdat <- data_loading(
  rv = rv,
  system = rv$target,
  keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

# time_compare
time_compare_results <- time_compare(
  rv = rv,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
# runtime ~ 5 sec.
# Example workflow: load the demo CSV data of both systems and run
# time_compare() on them.
utils_path <- system.file(
  "demo_data/utilities/",
  package = "DQAstats"
)
mdr_filename <- "mdr_example_data.csv"
rv <- list()
# pass the filename as a named argument (`=`), not via an assignment (`<-`)
rv$mdr <- read_mdr(
  utils_path = utils_path,
  mdr_filename = mdr_filename
)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

rv <- c(rv, create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
))
# save source/target vars
rv$source$system_name <- source_system_name
rv$target$system_name <- target_system_name
rv$source$system_type <- "csv"
rv$target$system_type <- "csv"

rv$log$logfile_dir <- tempdir()

# set headless (without GUI, progressbars, etc.)
rv$headless <- TRUE

# set configs
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)

# get configs
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# set start_time (e.g. when clicking the 'Load Data'-button in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# define restricting date
rv$restricting_date$use_it <- FALSE

# load source data
tempdat <- data_loading(
  rv = rv,
  system = rv$source,
  keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# load target data
tempdat <- data_loading(
  rv = rv,
  system = rv$target,
  keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

# time_compare
time_compare_results <- time_compare(
  rv = rv,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
Internal function to generate the results of the 'Uniqueness Plausibility' checks.
uniq_plausi_results(rv, uniq_vars, mdr, headless = FALSE)
uniq_plausi_results(rv, uniq_vars, mdr, headless = FALSE)
rv |
A list object. Internal list simulating Shiny's 'reactive values'. |
uniq_vars |
A data.table object. The object is created
by `create_helper_vars` (available as `rv$pl$uniq_vars`). |
mdr |
A data.table object containing the MDR. |
headless |
A boolean (default: FALSE). Indicating, if the function is run only in the console (headless = TRUE) or on a GUI frontend (headless = FALSE). |
A list with one entry for each uniqueness plausibility check containing the results. Each entry contains the following (nested) list items:
A character with the description of the plausibility check.
A nested list with the uniqueness plausibility check results for the source data system with the values 'message' and 'error'.
A nested list with the uniqueness plausibility check results for the target data system with the values 'message' and 'error'.
# runtime > 5 sec.
# Example workflow: load the demo CSV data, compute the descriptive results
# and run the uniqueness plausibility checks with uniq_plausi_results().
utils_path <- system.file(
  "demo_data/utilities/",
  package = "DQAstats"
)
mdr_filename <- "mdr_example_data.csv"
rv <- list()
# pass the filename as a named argument (`=`), not via an assignment (`<-`)
rv$mdr <- read_mdr(
  utils_path = utils_path,
  mdr_filename = mdr_filename
)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

rv <- c(rv, create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
))
# save source/target vars
rv$source$system_name <- source_system_name
rv$target$system_name <- target_system_name
rv$source$system_type <- "csv"
rv$target$system_type <- "csv"

rv$log$logfile_dir <- tempdir()

# set headless (without GUI, progressbars, etc.)
rv$headless <- TRUE

# set configs
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)

# get configs
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# set start_time (e.g. when clicking the 'Load Data'-button in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# define restricting date
rv$restricting_date$use_it <- FALSE

# load source data
tempdat <- data_loading(
  rv = rv,
  system = rv$source,
  keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# load target data
tempdat <- data_loading(
  rv = rv,
  system = rv$target,
  keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

rv$data_plausibility$atemporal <- get_atemp_plausis(
  rv = rv,
  atemp_vars = rv$pl$atemp_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

# add the plausibility raw data to data_target and data_source
for (i in names(rv$data_plausibility$atemporal)) {
  for (k in c("source_data", "target_data")) {
    w <- gsub("_data", "", k)
    raw_data <- paste0("data_", w)
    rv[[raw_data]][[i]] <-
      rv$data_plausibility$atemporal[[i]][[k]][[raw_data]]
    # drop the moved raw data from the plausibility list
    rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL
  }
  gc()
}

# calculate descriptive results
rv$results_descriptive <- descriptive_results(
  rv = rv,
  headless = rv$headless
)

# calculate unique plausibilities
uniq_plausi_results(
  rv = rv,
  uniq_vars = rv$pl$uniq_vars,
  mdr = rv$mdr,
  headless = rv$headless
)
# runtime > 5 sec.
# Example workflow: load the demo CSV data, compute the descriptive results
# and run the uniqueness plausibility checks with uniq_plausi_results().
utils_path <- system.file(
  "demo_data/utilities/",
  package = "DQAstats"
)
mdr_filename <- "mdr_example_data.csv"
rv <- list()
# pass the filename as a named argument (`=`), not via an assignment (`<-`)
rv$mdr <- read_mdr(
  utils_path = utils_path,
  mdr_filename = mdr_filename
)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

rv <- c(rv, create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
))
# save source/target vars
rv$source$system_name <- source_system_name
rv$target$system_name <- target_system_name
rv$source$system_type <- "csv"
rv$target$system_type <- "csv"

rv$log$logfile_dir <- tempdir()

# set headless (without GUI, progressbars, etc.)
rv$headless <- TRUE

# set configs
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)

# get configs
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# set start_time (e.g. when clicking the 'Load Data'-button in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# define restricting date
rv$restricting_date$use_it <- FALSE

# load source data
tempdat <- data_loading(
  rv = rv,
  system = rv$source,
  keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# load target data
tempdat <- data_loading(
  rv = rv,
  system = rv$target,
  keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

rv$data_plausibility$atemporal <- get_atemp_plausis(
  rv = rv,
  atemp_vars = rv$pl$atemp_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

# add the plausibility raw data to data_target and data_source
for (i in names(rv$data_plausibility$atemporal)) {
  for (k in c("source_data", "target_data")) {
    w <- gsub("_data", "", k)
    raw_data <- paste0("data_", w)
    rv[[raw_data]][[i]] <-
      rv$data_plausibility$atemporal[[i]][[k]][[raw_data]]
    # drop the moved raw data from the plausibility list
    rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL
  }
  gc()
}

# calculate descriptive results
rv$results_descriptive <- descriptive_results(
  rv = rv,
  headless = rv$headless
)

# calculate unique plausibilities
uniq_plausi_results(
  rv = rv,
  uniq_vars = rv$pl$uniq_vars,
  mdr = rv$mdr,
  headless = rv$headless
)
Internal function to perform value conformance checks.
value_conformance(rv, results, scope, headless = FALSE, logfile_dir)
value_conformance(rv, results, scope, headless = FALSE, logfile_dir)
rv |
A list object. Internal list simulating Shiny's 'reactive values'. |
results |
A list object. The list should contain the results of either 'rv$results_descriptive' or 'rv$results_plausibility_atemporal'. |
scope |
A character. Either "plausibility" or "descriptive". |
headless |
A boolean (default: FALSE). Indicating, if the function is run only in the console (headless = TRUE) or on a GUI frontend (headless = FALSE). |
logfile_dir |
The absolute path to the folder where the logfile will be stored. |
A list with one entry for each dataelement containing the raw results of the value conformance checks. Each entry contains the following (nested) list items:
A nested list with the raw value conformance check results for the source data system.
A nested list with the raw value conformance check results for the target data system.
# runtime ~ 5 sec.
# Example workflow: load the demo CSV data, compute the descriptive and
# plausibility results and run value_conformance() on the descriptive results.
utils_path <- system.file(
  "demo_data/utilities/",
  package = "DQAstats"
)
mdr_filename <- "mdr_example_data.csv"
rv <- list()
# pass the filename as a named argument (`=`), not via an assignment (`<-`)
rv$mdr <- read_mdr(
  utils_path = utils_path,
  mdr_filename = mdr_filename
)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

rv <- c(rv, create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
))
# save source/target vars
rv$source$system_name <- source_system_name
rv$target$system_name <- target_system_name
rv$source$system_type <- "csv"
rv$target$system_type <- "csv"

rv$log$logfile_dir <- tempdir()

# set headless (without GUI, progressbars, etc.)
rv$headless <- TRUE

# set configs
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)

# get configs
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# set start_time (e.g. when clicking the 'Load Data'-button in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# define restricting date
rv$restricting_date$use_it <- FALSE

# load source data
tempdat <- data_loading(
  rv = rv,
  system = rv$source,
  keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# load target data
tempdat <- data_loading(
  rv = rv,
  system = rv$target,
  keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

rv$data_plausibility$atemporal <- get_atemp_plausis(
  rv = rv,
  atemp_vars = rv$pl$atemp_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

# add the plausibility raw data to data_target and data_source
for (i in names(rv$data_plausibility$atemporal)) {
  for (k in c("source_data", "target_data")) {
    w <- gsub("_data", "", k)
    raw_data <- paste0("data_", w)
    rv[[raw_data]][[i]] <-
      rv$data_plausibility$atemporal[[i]][[k]][[raw_data]]
    # drop the moved raw data from the plausibility list
    rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL
  }
  gc()
}

# calculate descriptive results
rv$results_descriptive <- descriptive_results(
  rv = rv,
  headless = rv$headless
)

# calculate atemporal plausibilities
rv$results_plausibility_atemporal <- atemp_plausi_results(
  rv = rv,
  atemp_vars = rv$data_plausibility$atemporal,
  mdr = rv$mdr,
  headless = rv$headless
)

# calculate unique plausibilities
rv$results_plausibility_unique <- uniq_plausi_results(
  rv = rv,
  uniq_vars = rv$pl$uniq_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

value_conformance(
  rv = rv,
  scope = "descriptive",
  results = rv$results_descriptive,
  headless = rv$headless,
  logfile_dir = rv$log$logfile_dir
)
# runtime ~ 5 sec.
# Example workflow: load the demo CSV data, compute the descriptive and
# plausibility results and run value_conformance() on the descriptive results.
utils_path <- system.file(
  "demo_data/utilities/",
  package = "DQAstats"
)
mdr_filename <- "mdr_example_data.csv"
rv <- list()
# pass the filename as a named argument (`=`), not via an assignment (`<-`)
rv$mdr <- read_mdr(
  utils_path = utils_path,
  mdr_filename = mdr_filename
)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

rv <- c(rv, create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
))
# save source/target vars
rv$source$system_name <- source_system_name
rv$target$system_name <- target_system_name
rv$source$system_type <- "csv"
rv$target$system_type <- "csv"

rv$log$logfile_dir <- tempdir()

# set headless (without GUI, progressbars, etc.)
rv$headless <- TRUE

# set configs
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)

# get configs
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# set start_time (e.g. when clicking the 'Load Data'-button in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# define restricting date
rv$restricting_date$use_it <- FALSE

# load source data
tempdat <- data_loading(
  rv = rv,
  system = rv$source,
  keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# load target data
tempdat <- data_loading(
  rv = rv,
  system = rv$target,
  keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

rv$data_plausibility$atemporal <- get_atemp_plausis(
  rv = rv,
  atemp_vars = rv$pl$atemp_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

# add the plausibility raw data to data_target and data_source
for (i in names(rv$data_plausibility$atemporal)) {
  for (k in c("source_data", "target_data")) {
    w <- gsub("_data", "", k)
    raw_data <- paste0("data_", w)
    rv[[raw_data]][[i]] <-
      rv$data_plausibility$atemporal[[i]][[k]][[raw_data]]
    # drop the moved raw data from the plausibility list
    rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL
  }
  gc()
}

# calculate descriptive results
rv$results_descriptive <- descriptive_results(
  rv = rv,
  headless = rv$headless
)

# calculate atemporal plausibilities
rv$results_plausibility_atemporal <- atemp_plausi_results(
  rv = rv,
  atemp_vars = rv$data_plausibility$atemporal,
  mdr = rv$mdr,
  headless = rv$headless
)

# calculate unique plausibilities
rv$results_plausibility_unique <- uniq_plausi_results(
  rv = rv,
  uniq_vars = rv$pl$uniq_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

value_conformance(
  rv = rv,
  scope = "descriptive",
  results = rv$results_descriptive,
  headless = rv$headless,
  logfile_dir = rv$log$logfile_dir
)
Internal function to perform value conformance checks.
value_conformance_checks(results)
value_conformance_checks(results)
results |
A list object. The list should contain the results of
the function `value_conformance()`. |
A data.table with the results of the automated comparison of the value conformance check results between the source data system and the target data system.
# Example: compute the value conformance results on the demo CSV data that
# ships with the DQAstats package and pass them to value_conformance_checks().
# runtime ~ 5 sec.
utils_path <- system.file(
  "demo_data/utilities/",
  package = "DQAstats"
)
mdr_filename <- "mdr_example_data.csv"
rv <- list()
# pass the file name as a named argument (not as an assignment expression)
rv$mdr <- read_mdr(
  utils_path = utils_path,
  mdr_filename = mdr_filename
)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

rv <- c(rv, create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
))

# save source/target vars
rv$source$system_name <- source_system_name
rv$target$system_name <- target_system_name
rv$source$system_type <- "csv"
rv$target$system_type <- "csv"

rv$log$logfile_dir <- tempdir()

# set headless (without GUI, progressbars, etc.)
rv$headless <- TRUE

# set configs
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)

# get configs
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# set start_time (e.g. when clicking the 'Load Data'-button in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# define restricting date
rv$restricting_date$use_it <- FALSE

# load source data
tempdat <- data_loading(
  rv = rv,
  system = rv$source,
  keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# load target data
tempdat <- data_loading(
  rv = rv,
  system = rv$target,
  keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

rv$data_plausibility$atemporal <- get_atemp_plausis(
  rv = rv,
  atemp_vars = rv$pl$atemp_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

# add the plausibility raw data to data_target and data_source
for (i in names(rv$data_plausibility$atemporal)) {
  for (k in c("source_data", "target_data")) {
    w <- gsub("_data", "", k)
    raw_data <- paste0("data_", w)
    rv[[raw_data]][[i]] <-
      rv$data_plausibility$atemporal[[i]][[k]][[raw_data]]
    # drop the raw data from the plausibility list once it has been copied
    rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL
  }
  gc()
}

# calculate descriptive results
rv$results_descriptive <- descriptive_results(
  rv = rv,
  headless = rv$headless
)

# calculate atemporal plausibilities
rv$results_plausibility_atemporal <- atemp_plausi_results(
  rv = rv,
  atemp_vars = rv$data_plausibility$atemporal,
  mdr = rv$mdr,
  headless = rv$headless
)

# calculate unique plausibilities
rv$results_plausibility_unique <- uniq_plausi_results(
  rv = rv,
  uniq_vars = rv$pl$uniq_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

# run the value conformance checks on the descriptive results
rv$conformance$value_conformance <- value_conformance(
  rv = rv,
  scope = "descriptive",
  results = rv$results_descriptive,
  headless = rv$headless,
  logfile_dir = rv$log$logfile_dir
)

# compare the value conformance results of the source and target system
value_conformance_checks(results = rv$conformance$value_conformance)
# Example: compute the value conformance results on the demo CSV data that
# ships with the DQAstats package and pass them to value_conformance_checks().
# runtime ~ 5 sec.
utils_path <- system.file(
  "demo_data/utilities/",
  package = "DQAstats"
)
mdr_filename <- "mdr_example_data.csv"
rv <- list()
# pass the file name as a named argument (not as an assignment expression)
rv$mdr <- read_mdr(
  utils_path = utils_path,
  mdr_filename = mdr_filename
)

source_system_name <- "exampleCSV_source"
target_system_name <- "exampleCSV_target"

rv <- c(rv, create_helper_vars(
  mdr = rv$mdr,
  source_db = source_system_name,
  target_db = target_system_name
))

# save source/target vars
rv$source$system_name <- source_system_name
rv$target$system_name <- target_system_name
rv$source$system_type <- "csv"
rv$target$system_type <- "csv"

rv$log$logfile_dir <- tempdir()

# set headless (without GUI, progressbars, etc.)
rv$headless <- TRUE

# set configs
demo_files <- system.file("demo_data", package = "DQAstats")
Sys.setenv("EXAMPLECSV_SOURCE_PATH" = demo_files)
Sys.setenv("EXAMPLECSV_TARGET_PATH" = demo_files)

# get configs
rv$source$settings <- DIZutils::get_config_env(
  system_name = rv$source$system_name,
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)
rv$target$settings <- DIZutils::get_config_env(
  system_name = tolower(rv$target$system_name),
  logfile_dir = rv$log$logfile_dir,
  headless = rv$headless
)

# set start_time (e.g. when clicking the 'Load Data'-button in shiny)
rv$start_time <- format(Sys.time(), usetz = TRUE, tz = "CET")

# define restricting date
rv$restricting_date$use_it <- FALSE

# load source data
tempdat <- data_loading(
  rv = rv,
  system = rv$source,
  keys_to_test = rv$keys_source
)
rv$data_source <- tempdat$outdata

# load target data
tempdat <- data_loading(
  rv = rv,
  system = rv$target,
  keys_to_test = rv$keys_target
)
rv$data_target <- tempdat$outdata

rv$data_plausibility$atemporal <- get_atemp_plausis(
  rv = rv,
  atemp_vars = rv$pl$atemp_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

# add the plausibility raw data to data_target and data_source
for (i in names(rv$data_plausibility$atemporal)) {
  for (k in c("source_data", "target_data")) {
    w <- gsub("_data", "", k)
    raw_data <- paste0("data_", w)
    rv[[raw_data]][[i]] <-
      rv$data_plausibility$atemporal[[i]][[k]][[raw_data]]
    # drop the raw data from the plausibility list once it has been copied
    rv$data_plausibility$atemporal[[i]][[k]][[raw_data]] <- NULL
  }
  gc()
}

# calculate descriptive results
rv$results_descriptive <- descriptive_results(
  rv = rv,
  headless = rv$headless
)

# calculate atemporal plausibilities
rv$results_plausibility_atemporal <- atemp_plausi_results(
  rv = rv,
  atemp_vars = rv$data_plausibility$atemporal,
  mdr = rv$mdr,
  headless = rv$headless
)

# calculate unique plausibilities
rv$results_plausibility_unique <- uniq_plausi_results(
  rv = rv,
  uniq_vars = rv$pl$uniq_vars,
  mdr = rv$mdr,
  headless = rv$headless
)

# run the value conformance checks on the descriptive results
rv$conformance$value_conformance <- value_conformance(
  rv = rv,
  scope = "descriptive",
  results = rv$results_descriptive,
  headless = rv$headless,
  logfile_dir = rv$log$logfile_dir
)

# compare the value conformance results of the source and target system
value_conformance_checks(results = rv$conformance$value_conformance)