From df35f4bbd7356da9b72bd2b36d9ac96c4a3372ac Mon Sep 17 00:00:00 2001 From: araikes Date: Thu, 9 Apr 2026 14:36:19 -0700 Subject: [PATCH 1/9] Refactor analysers to use precomputed contexts --- R/ModelArray_Constructor.R | 1092 ++++++----------- R/analyse-helpers.R | 320 ++++- R/analyse.R | 471 +++---- man/ModelArray.wrap.Rd | 16 +- man/analyseOneElement.gam.Rd | 27 +- man/analyseOneElement.lm.Rd | 42 +- man/analyseOneElement.wrap.Rd | 28 +- .../test-ModelArray_low_hanging_coverage.R | 2 +- tests/testthat/test-analyse_context.R | 933 ++++++++++++++ 9 files changed, 1873 insertions(+), 1058 deletions(-) create mode 100644 tests/testthat/test-analyse_context.R diff --git a/R/ModelArray_Constructor.R b/R/ModelArray_Constructor.R index 22772d7..743916f 100644 --- a/R/ModelArray_Constructor.R +++ b/R/ModelArray_Constructor.R @@ -1,3 +1,4 @@ +# ModelArray class ---- #' ModelArray class #' #' ModelArray is an S4 class that represents element-wise scalar data and @@ -394,19 +395,29 @@ numElementsTotal <- function(modelarray, scalar_name = "FD") { } #' Fit a linear model for a single element +#' +#' @description +#' Fits \code{\link[stats]{lm}} on one element's data. When a precomputed +#' context (\code{ctx}) is provided, all loop-invariant work (formula parsing, +#' collision checks, source alignment) is skipped. When \code{ctx} is +#' \code{NULL}, the function falls back to computing everything inline +#' (legacy behaviour for direct calls / debugging). +#' #' If the number of subjects with finite scalar values (not \code{NaN}, #' \code{NA}, or \code{Inf}) does not exceed \code{num.subj.lthr}, the #' element is skipped and all statistics are set to \code{NaN}. #' #' @param i_element Integer. The 1-based index of the element to analyse. #' @param formula A \code{\link[stats]{formula}} passed to -#' \code{\link[stats]{lm}}. -#' @param modelarray A \linkS4class{ModelArray} object. +#' \code{\link[stats]{lm}}. 
Ignored when \code{ctx} is provided (the +#' formula is taken from the context). +#' @param modelarray A \linkS4class{ModelArray} object. Ignored when +#' \code{ctx} is provided. #' @param phenotypes A data.frame of the cohort with columns of independent #' variables and covariates. Must contain a \code{"source_file"} column -#' matching \code{sources(modelarray)[[scalar]]}. +#' matching \code{sources(modelarray)[[scalar]]}. Ignored when \code{ctx} is provided. #' @param scalar Character. The name of the element-wise scalar to analyse. -#' Must be one of \code{names(scalars(modelarray))}. +#' Must be one of \code{names(scalars(modelarray))}. Ignored when \code{ctx} is provided. #' @param var.terms Character vector. Statistics to extract per term from #' \code{\link[broom]{tidy.lm}} (e.g. \code{"estimate"}, \code{"statistic"}, #' \code{"p.value"}). @@ -418,7 +429,7 @@ numElementsTotal <- function(modelarray, scalar_name = "FD") { #' below this threshold are skipped. This value is typically computed by #' the parent function from \code{num.subj.lthr.abs} and #' \code{num.subj.lthr.rel}. -#' @param num.stat.output Integer or \code{NULL}. The total number of output +#' @param num.stat.output Integer or \code{NULL}. The total number of output #' columns (including \code{element_id}). Used when #' \code{flag_initiate = FALSE} to generate an all-\code{NaN} row for #' skipped elements. Must be \code{NULL} when \code{flag_initiate = TRUE}. @@ -431,6 +442,8 @@ numElementsTotal <- function(modelarray, scalar_name = "FD") { #' halts execution; \code{"skip"} returns all-\code{NaN} for this element; #' \code{"debug"} drops into \code{\link{browser}} (if interactive) then #' skips. Default: \code{"stop"}. +#' @param ctx A precomputed context list from \code{.build_lm_context()}, +#' or \code{NULL} for legacy inline computation. #' @param ... Additional arguments passed to \code{\link[stats]{lm}}. 
#' #' @return If \code{flag_initiate = TRUE}, a list with components: @@ -446,283 +459,144 @@ numElementsTotal <- function(modelarray, scalar_name = "FD") { #' (except \code{element_id}) if the element had insufficient valid subjects #' or if an error occurred with \code{on_error = "skip"}. #' -#' @seealso \code{\link{ModelArray.lm}} which calls this function iteratively, +#' @seealso \code{\link{ModelArray.lm}}, \code{.build_lm_context}, #' \code{\link{analyseOneElement.gam}} for the GAM equivalent, -#' \code{\link{analyseOneElement.wrap}} for user-supplied functions. +#' \code{\link{analyseOneElement.wrap}} for user-supplied functions #' #' @keywords internal #' @rdname analyseOneElement.lm #' @export - analyseOneElement.lm <- function(i_element, - formula, - modelarray, - phenotypes, - scalar, - var.terms, - var.model, + formula = NULL, + modelarray = NULL, + phenotypes = NULL, + scalar = NULL, + var.terms = c("estimate", "statistic", "p.value"), + var.model = c("adj.r.squared", "p.value"), num.subj.lthr, num.stat.output = NULL, flag_initiate = FALSE, on_error = "stop", + ctx = NULL, ...) 
{ - values <- scalars(modelarray)[[scalar]][i_element, ] - ## check number of subjects with (in)valid values: - flag_sufficient <- NULL # whether number of subjects with valid values are sufficient - num.subj.valid <- length(values[is.finite(values)]) - if (num.subj.valid > num.subj.lthr) { - flag_sufficient <- TRUE - } else { - flag_sufficient <- FALSE - } - if (flag_sufficient == TRUE) { - dat <- phenotypes - # detect scalar predictors referenced in formula - all_vars <- all.vars(formula) - lhs_name <- tryCatch(as.character(formula[[2]]), error = function(e) NULL) - rhs_vars <- setdiff(all_vars, lhs_name) - scalar_names <- names(scalars(modelarray)) - scalar_predictors <- intersect(rhs_vars, scalar_names) - - # collision check with phenotypes - collisions <- intersect(c(scalar, scalar_predictors), colnames(dat)) - if (length(collisions) > 0) { - stop(paste0( - "Column name collision between phenotypes and scalar names: ", - paste(collisions, collapse = ", "), - ". Please rename or remove these columns from phenotypes before modeling." - )) - } + # Resolve context ---- + # Use precomputed if available, else build inline - # attach response first - dat[[scalar]] <- values - - # attach scalar predictors with source alignment and build intersection mask - masks <- list(is.finite(values)) - if (length(scalar_predictors) > 0) { - for (pred in scalar_predictors) { - pred_sources <- sources(modelarray)[[pred]] - phen_sources <- dat[["source_file"]] - if (!(all(pred_sources %in% phen_sources) && all(phen_sources %in% pred_sources))) { - stop(paste0( - "The source files for predictor scalar '", pred, "' do not match phenotypes$source_file." 
- )) - } - reorder_idx <- match(phen_sources, pred_sources) - pred_vals <- scalars(modelarray)[[pred]][i_element, ] - pred_vals <- pred_vals[reorder_idx] - dat[[pred]] <- pred_vals - masks[[length(masks) + 1L]] <- is.finite(pred_vals) - } - } + if (!is.null(ctx)) { + # Fast path: all invariant work already done + effective_formula <- ctx$formula + effective_scalar <- ctx$scalar + } else { + # Legacy path: compute everything inline (for direct calls / debugging) + effective_formula <- formula + effective_scalar <- scalar + ctx <- .build_lm_context(formula, modelarray, phenotypes, scalar) + } - # intersection-based threshold - valid_mask <- Reduce("&", masks) - num.subj.valid <- sum(valid_mask) - if (!(num.subj.valid > num.subj.lthr)) { - if (flag_initiate == TRUE) { - toreturn <- list(column_names = NaN, list.terms = NaN) - return(toreturn) - } else { - onerow <- c( - i_element - 1, - rep(NaN, (num.stat.output - 1)) - ) - return(onerow) - } - } - # dots <- list(...) - # dots_names <- names(dots) - # if ("weights" %in% dots_names) { - # message(dots$weights) - # myWeights <- dots$weights - # dots$weights <- NULL # remove weights from - # - # arguments_lm <- dots - # - # } - arguments_lm <- list(...) - arguments_lm$formula <- formula - arguments_lm$data <- dat - - # onemodel <- stats::lm(formula, data = dat, ...) - # onemodel <- stats::lm(formula, data = dat, weights = myWeights,...) 
- onemodel <- tryCatch( - { - do.call(stats::lm, arguments_lm) - }, - error = function(e) { - msg <- paste0( - "analyseOneElement.lm error at element ", - i_element, - ": ", - conditionMessage(e) - ) - if (on_error == "debug" && interactive()) { - message(msg) - browser() - } - if (on_error == "skip" || on_error == "debug") { - warning(msg) - if (flag_initiate == TRUE) { - return(structure(list(.lm_error_initiate = TRUE), class = "lm_error")) - } else { - return(structure(list(.lm_error_runtime = TRUE), class = "lm_error")) - } - } - stop(e) - } - ) + # Per-element data assembly ---- + elem <- .assemble_element_data(i_element, ctx, num.subj.lthr) - if (inherits(onemodel, "lm_error")) { - if (flag_initiate == TRUE) { - toreturn <- list(column_names = NaN, list.terms = NaN) - return(toreturn) - } else { - onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) - return(onerow) - } + if (!elem$sufficient) { + if (flag_initiate) { + return(list(column_names = NaN, list.terms = NaN)) + } else { + onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) + return(onerow) } - # explicitly passing arguments into lm, to avoid error of argument "weights" + } - onemodel.tidy <- onemodel %>% broom::tidy() - onemodel.glance <- onemodel %>% broom::glance() + dat <- elem$dat - # delete columns you don't want: - var.terms.full <- names(onemodel.tidy) - var.model.full <- names(onemodel.glance) + # Fit model ---- + arguments_lm <- list(...) 
+ arguments_lm$formula <- effective_formula + arguments_lm$data <- dat - # list to remove: - var.terms.orig <- var.terms - var.terms <- list("term", var.terms) %>% unlist() # we will always keep "term" column - var.terms.remove <- list() - for (l in var.terms.full) { - if (!(l %in% var.terms)) { - var.terms.remove <- var.terms.remove %>% - append(., l) %>% - unlist() # the order will still be kept + onemodel <- tryCatch( + { + do.call(stats::lm, arguments_lm) + }, + error = function(e) { + msg <- paste0( + "analyseOneElement.lm error at element ", + i_element, ": ", conditionMessage(e) + ) + if (on_error == "debug" && interactive()) { + message(msg) + browser() } - } - - var.model.remove <- list() - for (l in var.model.full) { - if (!(l %in% var.model)) { - var.model.remove <- var.model.remove %>% - append(., l) %>% - unlist() # the order will still be kept + if (on_error == "skip" || on_error == "debug") { + warning(msg) + if (flag_initiate) { + return(structure(list(.lm_error_initiate = TRUE), class = "lm_error")) + } else { + return(structure(list(.lm_error_runtime = TRUE), class = "lm_error")) + } } + stop(e) } + ) - # remove those columns: - if (length(var.terms.remove) != 0) { - # if length=0, it's list(), nothing to remove - onemodel.tidy <- dplyr::select(onemodel.tidy, -dplyr::all_of(var.terms.remove)) - } - if (length(var.model.remove) != 0) { - onemodel.glance <- dplyr::select(onemodel.glance, -dplyr::all_of(var.model.remove)) + if (inherits(onemodel, "lm_error")) { + if (flag_initiate) { + return(list(column_names = NaN, list.terms = NaN)) + } else { + onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) + return(onerow) } + } - # adjust: - # change the term name from "(Intercept)" to "Intercept" - onemodel.tidy$term[onemodel.tidy$term == "(Intercept)"] <- "Intercept" - onemodel.glance <- onemodel.glance %>% dplyr::mutate(term = "model") # add a column - - # get the list of terms: - list.terms <- onemodel.tidy$term - - # check if the onemodel.* 
does not have real statistics (but only a column of 'term') - temp_colnames <- onemodel.tidy %>% colnames() - # union of colnames and "term"; if colnames only has "term" or lengt of 0 (tibble()), - # union = "term", all(union)=TRUE; otherwise, if there is colnames other than "term", - # all(union) = c(TRUE, FALSE, ...) - temp <- union(temp_colnames, "term") - # just an empty tibble (so below, all(dim(onemodel.tidy)) = FALSE) - if (all(temp == "term")) { - onemodel.tidy <- tibble::tibble() - } - temp_colnames <- onemodel.glance %>% colnames() - temp <- union(temp_colnames, "term") # union of colnames and "term"; - if (all(temp == "term")) { - onemodel.glance <- tibble::tibble() - } + # Extract statistics ---- + onemodel.tidy <- onemodel %>% broom::tidy() + onemodel.glance <- onemodel %>% broom::glance() + # Normalize intercept term name to match expected output convention + onemodel.tidy$term[onemodel.tidy$term == "(Intercept)"] <- "Intercept" - # flatten .tidy results into one row: - if (all(dim(onemodel.tidy))) { - # not empty | if any dim is 0, all=FALSE - onemodel.tidy.onerow <- onemodel.tidy %>% tidyr::pivot_wider( - names_from = "term", - values_from = dplyr::all_of(var.terms.orig), - names_glue = "{term}.{.value}" - ) - } else { - onemodel.tidy.onerow <- onemodel.tidy - } + list.terms <- onemodel.tidy$term - if (all(dim(onemodel.glance))) { - # not empty - onemodel.glance.onerow <- onemodel.glance %>% tidyr::pivot_wider( - names_from = "term", - values_from = dplyr::all_of(var.model), - names_glue = "{term}.{.value}" + ## Term-level statistics ---- + onerow.terms <- tibble::tibble() + for (i_term in seq_along(list.terms)) { + for (i_var in seq_along(var.terms)) { + temp <- tibble::tibble( + placeholder = onemodel.tidy[[var.terms[i_var]]][i_term] ) - } else { - onemodel.glance.onerow <- onemodel.glance + colnames(temp) <- paste0(list.terms[i_term], ".", var.terms[i_var]) + onerow.terms <- bind_cols_check_emptyTibble(onerow.terms, temp) } + } + ## Model-level 
statistics ---- + onerow.model <- tibble::tibble() + for (i_var in seq_along(var.model)) { + temp <- tibble::tibble(placeholder = onemodel.glance[[var.model[i_var]]]) + colnames(temp) <- paste0("model.", var.model[i_var]) + onerow.model <- bind_cols_check_emptyTibble(onerow.model, temp) + } - # combine the tables: check if any of them is empty (tibble()) - onemodel.onerow <- bind_cols_check_emptyTibble(onemodel.tidy.onerow, onemodel.glance.onerow) - - # add a column of element ids: - colnames.temp <- colnames(onemodel.onerow) - onemodel.onerow <- onemodel.onerow %>% - tibble::add_column( - element_id = i_element - 1, - .before = colnames.temp[1] - ) # add as the first column - - # now you can get the headers, # of columns, etc of the output results - - - if (flag_initiate == TRUE) { - # return the column names: - - # return: - column_names <- colnames(onemodel.onerow) - toreturn <- list(column_names = column_names, list.terms = list.terms) - toreturn - } else if (flag_initiate == FALSE) { - # return the one row results: + onerow.all <- bind_cols_check_emptyTibble(onerow.terms, onerow.model) + column_names <- c("element_id", colnames(onerow.all)) - # return: - onerow <- as.numeric(onemodel.onerow) # change from tibble to numeric to save some space - onerow - } + if (flag_initiate) { + toreturn <- list( + column_names = column_names, + list.terms = list.terms + ) + return(toreturn) } else { - # if flag_sufficient==FALSE - if (flag_initiate == TRUE) { - toreturn <- list(column_names = NaN, list.terms = NaN) - toreturn - } else if (flag_initiate == FALSE) { - onerow <- c( - i_element - 1, - # first is element_id, which will still be a finite number - rep(NaN, (num.stat.output - 1)) - ) # other columns will be NaN - - onerow - } + onerow <- c(i_element - 1, as.numeric(onerow.all)) + return(onerow) } } #' Fit a GAM for a single element -#' returns metadata (column names, smooth term names, parametric term names, +#' Returns metadata (column names, smooth term names, 
parametric term names, #' and the smoothing parameter criterion attribute name) used by #' \code{\link{ModelArray.gam}} to initialise the output data.frame. When #' \code{flag_initiate = FALSE}, it returns a numeric vector representing one @@ -752,6 +626,8 @@ analyseOneElement.lm <- function(i_element, #' squares (\code{model.sse}) for the model, which is needed for #' partial R-squared calculations in \code{\link{ModelArray.gam}}. #' Default: \code{FALSE}. +#' @param ctx A precomputed context list from \code{.build_gam_context()}, +#' or \code{NULL}. #' @param ... Additional arguments passed to \code{\link[mgcv]{gam}}. #' #' @return If \code{flag_initiate = TRUE}, a list with components: @@ -777,385 +653,225 @@ analyseOneElement.lm <- function(i_element, #' @export analyseOneElement.gam <- function(i_element, - formula, - modelarray, - phenotypes, - scalar, - var.smoothTerms, - var.parametricTerms, - var.model, + formula = NULL, + modelarray = NULL, + phenotypes = NULL, + scalar = NULL, + var.smoothTerms = c("statistic", "p.value"), + var.parametricTerms = c("estimate", "statistic", "p.value"), + var.model = c("dev.expl"), num.subj.lthr, num.stat.output = NULL, flag_initiate = FALSE, flag_sse = FALSE, on_error = "stop", + ctx = NULL, ...) 
{ - values <- scalars(modelarray)[[scalar]][i_element, ] - ## check number of subjects with (in)valid values: - flag_sufficient <- NULL # whether number of subjects with valid values are sufficient - num.subj.valid <- length(values[is.finite(values)]) - if (num.subj.valid > num.subj.lthr) { - flag_sufficient <- TRUE + + # Resolve context ---- + if (!is.null(ctx)) { + effective_formula <- ctx$formula + effective_scalar <- ctx$scalar } else { - flag_sufficient <- FALSE + effective_formula <- formula + effective_scalar <- scalar + ctx <- .build_gam_context(formula, modelarray, phenotypes, scalar) } - if (flag_sufficient == TRUE) { - dat <- phenotypes - # detect scalar predictors referenced in formula - all_vars <- all.vars(formula) - lhs_name <- tryCatch(as.character(formula[[2]]), error = function(e) NULL) - rhs_vars <- setdiff(all_vars, lhs_name) - scalar_names <- names(scalars(modelarray)) - scalar_predictors <- intersect(rhs_vars, scalar_names) - - # collision check with phenotypes - collisions <- intersect(c(scalar, scalar_predictors), colnames(dat)) - if (length(collisions) > 0) { - stop(paste0( - "Column name collision between phenotypes and scalar names: ", - paste(collisions, collapse = ", "), - ". Please rename or remove these columns from phenotypes before modeling." 
+ # Per-element data assembly ---- + elem <- .assemble_element_data(i_element, ctx, num.subj.lthr) + + if (!elem$sufficient) { + if (flag_initiate) { + return(list( + column_names = NaN, + list.smoothTerms = NaN, + list.parametricTerms = NaN, + sp.criterion.attr.name = NaN )) + } else { + onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) + return(onerow) } + } - # attach response first - dat[[scalar]] <- values - - # attach scalar predictors with source alignment and build intersection mask - masks <- list(is.finite(values)) - if (length(scalar_predictors) > 0) { - for (pred in scalar_predictors) { - pred_sources <- sources(modelarray)[[pred]] - phen_sources <- dat[["source_file"]] - if (!(all(pred_sources %in% phen_sources) && all(phen_sources %in% pred_sources))) { - stop(paste0( - "The source files for predictor scalar '", pred, "' do not match phenotypes$source_file." - )) + dat <- elem$dat + + # Fit GAM ---- + arguments <- list(...) + arguments$formula <- effective_formula + arguments$data <- dat + + onemodel <- tryCatch( + { + do.call(mgcv::gam, arguments) + }, + error = function(e) { + msg <- paste0( + "analyseOneElement.gam error at element ", + i_element, ": ", conditionMessage(e) + ) + if (on_error == "debug" && interactive()) { + message(msg) + browser() + } + if (on_error == "skip" || on_error == "debug") { + warning(msg) + if (flag_initiate) { + return(structure(list(.gam_error_initiate = TRUE), class = "gam_error")) + } else { + return(structure(list(.gam_error_runtime = TRUE), class = "gam_error")) } - reorder_idx <- match(phen_sources, pred_sources) - pred_vals <- scalars(modelarray)[[pred]][i_element, ] - pred_vals <- pred_vals[reorder_idx] - dat[[pred]] <- pred_vals - masks[[length(masks) + 1L]] <- is.finite(pred_vals) } + stop(e) } + ) - # intersection-based threshold - valid_mask <- Reduce("&", masks) - num.subj.valid <- sum(valid_mask) - if (!(num.subj.valid > num.subj.lthr)) { - if (flag_initiate == TRUE) { - toreturn <- list( - 
column_names = NaN, - list.smoothTerms = NaN, - list.parametricTerms = NaN, - sp.criterion.attr.name = NaN - ) - return(toreturn) - } else { - onerow <- c( - i_element - 1, - rep(NaN, (num.stat.output - 1)) - ) - return(onerow) - } + if (inherits(onemodel, "gam_error")) { + if (flag_initiate) { + return(list( + column_names = NaN, + list.smoothTerms = NaN, + list.parametricTerms = NaN, + sp.criterion.attr.name = NaN + )) + } else { + onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) + return(onerow) } + } - arguments <- list(...) - arguments$formula <- formula - arguments$data <- dat - - # explicitly passing arguments into command, to avoid error of argument "weights" - onemodel <- tryCatch( - { - do.call(mgcv::gam, arguments) - }, - error = function(e) { - msg <- paste0( - "analyseOneElement.gam error at element ", - i_element, - ": ", - conditionMessage(e) - ) - if (on_error == "debug" && interactive()) { - message(msg) - browser() - } - if (on_error == "skip" || on_error == "debug") { - warning(msg) - if (flag_initiate == TRUE) { - return(structure(list(.gam_error_initiate = TRUE), class = "gam_error")) - } else { - return(structure(list(.gam_error_runtime = TRUE), class = "gam_error")) - } - } - stop(e) - } - ) - - if (inherits(onemodel, "gam_error")) { - if (flag_initiate == TRUE) { - toreturn <- list( - column_names = NaN, - list.smoothTerms = NaN, - list.parametricTerms = NaN, - sp.criterion.attr.name = NaN - ) - return(toreturn) - } else { - onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) - return(onerow) - } - } - onemodel.tidy.smoothTerms <- onemodel %>% broom::tidy(parametric = FALSE) - onemodel.tidy.parametricTerms <- onemodel %>% broom::tidy(parametric = TRUE) - onemodel.glance <- onemodel %>% broom::glance() - onemodel.summary <- onemodel %>% summary() - # add additional model's stat to onemodel.glance(): - onemodel.glance[["adj.r.squared"]] <- onemodel.summary$r.sq - onemodel.glance[["dev.expl"]] <- onemodel.summary$dev.expl - - 
sp.criterion.attr.name <- onemodel.summary$sp.criterion %>% attr(which = "name") - onemodel.glance[["sp.criterion"]] <- onemodel.summary$sp.criterion[[sp.criterion.attr.name]] - onemodel.glance[["scale"]] <- onemodel.summary$scale # scale estimate - - num.smoothTerms <- onemodel.summary$m # The number of smooth terms in the model. - - - # delete columns you don't want: - var.smoothTerms.full <- names(onemodel.tidy.smoothTerms) - var.parametricTerms.full <- names(onemodel.tidy.parametricTerms) - var.model.full <- names(onemodel.glance) - - # list to remove: - var.smoothTerms.orig <- var.smoothTerms - var.smoothTerms <- list("term", var.smoothTerms) %>% unlist() # we will always keep "term" column - var.smoothTerms.remove <- list() - for (l in var.smoothTerms.full) { - if (!(l %in% var.smoothTerms)) { - var.smoothTerms.remove <- var.smoothTerms.remove %>% - append(., l) %>% - unlist() # the order will still be kept - } - } + # Extract statistics ---- - var.parametricTerms.orig <- var.parametricTerms - var.parametricTerms <- list("term", var.parametricTerms) %>% unlist() # we will always keep "term" column - var.parametricTerms.remove <- list() - for (l in var.parametricTerms.full) { - if (!(l %in% var.parametricTerms)) { - var.parametricTerms.remove <- var.parametricTerms.remove %>% - append(., l) %>% - unlist() # the order will still be kept - } - } + onemodel.tidy.smoothTerms <- onemodel %>% broom::tidy(parametric = FALSE) + onemodel.tidy.parametricTerms <- onemodel %>% broom::tidy(parametric = TRUE) + onemodel.glance <- onemodel %>% broom::glance() + onemodel.summary <- onemodel %>% summary() - var.model.remove <- list() - for (l in var.model.full) { - if (!(l %in% var.model)) { - var.model.remove <- var.model.remove %>% - append(., l) %>% - unlist() # the order will still be kept - } - } + # ---- Normalize term names (must happen before column name assembly) ---- - # remove those columns: - if (length(var.smoothTerms.remove) != 0) { - # if length=0, it's list(), 
nothing to remove - onemodel.tidy.smoothTerms <- dplyr::select(onemodel.tidy.smoothTerms, -dplyr::all_of(var.smoothTerms.remove)) - } - if (length(var.parametricTerms.remove) != 0) { - # if length=0, it's list(), nothing to remove - onemodel.tidy.parametricTerms <- dplyr::select(onemodel.tidy.parametricTerms, -dplyr::all_of(var.parametricTerms.remove)) - } - if (length(var.model.remove) != 0) { - onemodel.glance <- dplyr::select(onemodel.glance, -dplyr::all_of(var.model.remove)) - } + # Parametric: (Intercept) → Intercept + if (nrow(onemodel.tidy.parametricTerms) > 0) { + onemodel.tidy.parametricTerms$term[ + onemodel.tidy.parametricTerms$term == "(Intercept)" + ] <- "Intercept" + } - # adjust: - if (num.smoothTerms > 0) { - # if there is any smooth term - onemodel.tidy.smoothTerms$term[onemodel.tidy.smoothTerms$term == "(Intercept)"] <- "Intercept" - # change the term name from "(Intercept)" to "Intercept" + # Smooth: s(age) → s_age, ti(x,z) → ti_x_z, s(x):oFactor → s_x_BYoFactor + num.smoothTerms <- onemodel.summary$m + if (num.smoothTerms > 0) { + if (nrow(onemodel.tidy.smoothTerms) > 0) { + onemodel.tidy.smoothTerms$term[ + onemodel.tidy.smoothTerms$term == "(Intercept)" + ] <- "Intercept" } - if (nrow(onemodel.tidy.parametricTerms) > 0) { - # if there is any parametric term - onemodel.tidy.parametricTerms$term[onemodel.tidy.parametricTerms$term == "(Intercept)"] <- "Intercept" - # change the term name from "(Intercept)" to "Intercept" - } - - # change from s(x) to s_x: (could be s, te, etc); from s(x):oFactor to s_x_BYoFactor; from ti(x,z) to ti_x_z - if (num.smoothTerms > 0) { - # if there is any smooth term - for (i_row in seq_len(nrow(onemodel.tidy.smoothTerms))) { - # step 1: change from s(x) to s_x - term_name <- onemodel.tidy.smoothTerms$term[i_row] - str_list <- strsplit(term_name, split = "[()]")[[1]] - - str <- str_list[2] # extract string between () - smooth_name <- str_list[1] # "s" or some other smooth method type such as "te" - str_valid <- 
paste0(smooth_name, "_", str) - - if (length(str_list) > 2) { - # there is string after variable name - str_valid <- paste0( - str_valid, - "_", - paste(str_list[3:length(str_list)], collapse = "") - ) # combine rest of strings - } - # detect ":", and change to "BY" # there is "_" replacing for ")" in "s()" already - str_valid <- gsub(":", "BY", str_valid, fixed = TRUE) + for (i_row in seq_len(nrow(onemodel.tidy.smoothTerms))) { + term_name <- onemodel.tidy.smoothTerms$term[i_row] + str_list <- strsplit(term_name, split = "[()]")[[1]] - # detect ",", and change to "_" - str_valid <- gsub(",", "_", str_valid, fixed = TRUE) + str <- str_list[2] + smooth_name <- str_list[1] + str_valid <- paste0(smooth_name, "_", str) - onemodel.tidy.smoothTerms$term[i_row] <- str_valid + if (length(str_list) > 2) { + str_valid <- paste0( + str_valid, "_", + paste(str_list[3:length(str_list)], collapse = "") + ) } - } - - onemodel.glance <- onemodel.glance %>% dplyr::mutate(term = "model") # add a column + # : → BY + str_valid <- gsub(":", "BY", str_valid, fixed = TRUE) + # , → _ + str_valid <- gsub(",", "_", str_valid, fixed = TRUE) - # get the list of terms: - if (num.smoothTerms > 0) { - list.smoothTerms <- onemodel.tidy.smoothTerms$term # if empty, gives warning - } else { - list.smoothTerms <- NULL - } - - if (nrow(onemodel.tidy.parametricTerms) > 0) { - list.parametricTerms <- onemodel.tidy.parametricTerms$term - } else { - list.parametricTerms <- NULL + onemodel.tidy.smoothTerms$term[i_row] <- str_valid } + } + list.smoothTerms <- onemodel.tidy.smoothTerms$term + list.parametricTerms <- onemodel.tidy.parametricTerms$term - # check if the onemodel.* does not have real statistics (but only a column of 'term') - temp_colnames <- onemodel.tidy.smoothTerms %>% colnames() - temp <- union(temp_colnames, "term") - # union of colnames and "term"; if colnames only has "term" or lengt of 0 (tibble()), - # union = "term", all(union)=TRUE; otherwise, if there is colnames other than "term", 
- # all(union) = c(TRUE, FALSE, ...) - if (all(temp == "term")) { - onemodel.tidy.smoothTerms <- tibble::tibble() - } - # just an empty tibble (so below, all(dim(onemodel.tidy.smoothTerms)) = FALSE) - - temp_colnames <- onemodel.tidy.parametricTerms %>% colnames() - temp <- union(temp_colnames, "term") - if (all(temp == "term")) { - onemodel.tidy.parametricTerms <- tibble::tibble() - } # just an empty tibble - - temp_colnames <- onemodel.glance %>% colnames() - temp <- union(temp_colnames, "term") - if (all(temp == "term")) { - onemodel.glance <- tibble::tibble() - } # just an empty tibble - - - # flatten .tidy results into one row: - if (all(dim(onemodel.tidy.smoothTerms))) { - # not empty | if any dim is 0, all=FALSE - onemodel.tidy.smoothTerms.onerow <- onemodel.tidy.smoothTerms %>% tidyr::pivot_wider( - names_from = "term", - values_from = dplyr::all_of(var.smoothTerms.orig), - names_glue = "{term}.{.value}" + ## Smooth term statistics ---- + onerow.smoothTerms <- tibble::tibble() + for (i_term in seq_along(list.smoothTerms)) { + for (i_var in seq_along(var.smoothTerms)) { + temp <- tibble::tibble( + placeholder = onemodel.tidy.smoothTerms[[var.smoothTerms[i_var]]][i_term] ) - } else { - onemodel.tidy.smoothTerms.onerow <- onemodel.tidy.smoothTerms + colnames(temp) <- paste0(list.smoothTerms[i_term], ".", var.smoothTerms[i_var]) + onerow.smoothTerms <- bind_cols_check_emptyTibble(onerow.smoothTerms, temp) } + } - if (all(dim(onemodel.tidy.parametricTerms))) { - # not empty - onemodel.tidy.parametricTerms.onerow <- onemodel.tidy.parametricTerms %>% tidyr::pivot_wider( - names_from = "term", - values_from = dplyr::all_of(var.parametricTerms.orig), - names_glue = "{term}.{.value}" + ## Parametric term statistics ---- + onerow.parametricTerms <- tibble::tibble() + for (i_term in seq_along(list.parametricTerms)) { + for (i_var in seq_along(var.parametricTerms)) { + temp <- tibble::tibble( + placeholder = 
onemodel.tidy.parametricTerms[[var.parametricTerms[i_var]]][i_term] ) - } else { - onemodel.tidy.parametricTerms.onerow <- onemodel.tidy.parametricTerms + colnames(temp) <- paste0(list.parametricTerms[i_term], ".", var.parametricTerms[i_var]) + onerow.parametricTerms <- bind_cols_check_emptyTibble(onerow.parametricTerms, temp) } + } - if (all(dim(onemodel.glance))) { - # not empty - onemodel.glance.onerow <- onemodel.glance %>% tidyr::pivot_wider( - names_from = "term", - values_from = dplyr::all_of(var.model), - names_glue = "{term}.{.value}" - ) + ## Model-level statistics ---- + onerow.model <- tibble::tibble() + for (i_var in seq_along(var.model)) { + val <- NULL + if (var.model[i_var] %in% colnames(onemodel.glance)) { + val <- onemodel.glance[[var.model[i_var]]] + } else if (var.model[i_var] == "adj.r.squared") { + val <- onemodel.summary$r.sq + } else if (var.model[i_var] == "dev.expl") { + val <- onemodel.summary$dev.expl + } else if (var.model[i_var] == "sp.criterion") { + val <- onemodel.summary$sp.criterion + if (flag_initiate) { + sp.criterion.attr.name <- names(onemodel.summary$sp.criterion) + } + } else if (var.model[i_var] == "scale") { + val <- onemodel.summary$scale } else { - onemodel.glance.onerow <- onemodel.glance + val <- onemodel.glance[[var.model[i_var]]] } + temp <- tibble::tibble(placeholder = val) + colnames(temp) <- paste0("model.", var.model[i_var]) + onerow.model <- bind_cols_check_emptyTibble(onerow.model, temp) + } + ## SSE if requested ---- + onerow.sse <- tibble::tibble() + if (flag_sse) { + sse_val <- sum(onemodel$residuals^2) + temp <- tibble::tibble(placeholder = sse_val) + colnames(temp) <- "model.sse" + onerow.sse <- temp + } - # combine the tables: check if any of them is empty (tibble()) - onemodel.onerow <- bind_cols_check_emptyTibble( - onemodel.tidy.smoothTerms.onerow, - onemodel.tidy.parametricTerms.onerow - ) - onemodel.onerow <- bind_cols_check_emptyTibble(onemodel.onerow, onemodel.glance.onerow) - + onerow.all <- 
bind_cols_check_emptyTibble(onerow.smoothTerms, onerow.parametricTerms) + onerow.all <- bind_cols_check_emptyTibble(onerow.all, onerow.model) + onerow.all <- bind_cols_check_emptyTibble(onerow.all, onerow.sse) - # add a column of element ids: - colnames.temp <- colnames(onemodel.onerow) - onemodel.onerow <- onemodel.onerow %>% tibble::add_column(element_id = i_element - 1, .before = colnames.temp[1]) - # add as the first column + column_names <- c("element_id", colnames(onerow.all)) - # add sse if requested: - if (flag_sse == TRUE) { - onemodel.onerow[["model.sse"]] <- sum((onemodel$y - onemodel$fitted.values)^2) - # using values from model itself, where NAs in y have been excluded --> - # sse won't be NA --> partial R-squared won't be NA - } - - - # now you can get the headers, # of columns, etc of the output results - - - if (flag_initiate == TRUE) { - # return the column names: - - # return: - column_names <- colnames(onemodel.onerow) - toreturn <- list( - column_names = column_names, - list.smoothTerms = list.smoothTerms, - list.parametricTerms = list.parametricTerms, - sp.criterion.attr.name = sp.criterion.attr.name - ) - toreturn - } else if (flag_initiate == FALSE) { - # return the one row results: - - # return: - onerow <- as.numeric(onemodel.onerow) # change from tibble to numeric to save some space - onerow + if (flag_initiate) { + if (!exists("sp.criterion.attr.name")) { + sp.criterion.attr.name <- NA_character_ } + return(list( + column_names = column_names, + list.smoothTerms = list.smoothTerms, + list.parametricTerms = list.parametricTerms, + sp.criterion.attr.name = sp.criterion.attr.name + )) } else { - # if flag_sufficient==FALSE - if (flag_initiate == TRUE) { - toreturn <- list( - column_names = NaN, - list.smoothTerms = NaN, - list.parametricTerms = NaN, - sp.criterion.attr.name = NaN - ) - toreturn - } else if (flag_initiate == FALSE) { - onerow <- c( - i_element - 1, - # first is element_id, which will still be a finite number - rep(NaN, 
(num.stat.output - 1)) - ) # other columns will be NaN - - onerow - } + onerow <- c(i_element - 1, as.numeric(onerow.all)) + return(onerow) } } @@ -1195,8 +911,17 @@ analyseOneElement.gam <- function(i_element, #' #' @param user_fun A function that accepts at least an argument named #' \code{data} (a data.frame: \code{phenotypes} with scalar columns -#' appended for the current element) and returns a one-row -#' data.frame/tibble, a named list, or a named atomic vector. +#' appended for the current element) and returns one of: +#' \itemize{ +#' \item A one-row \code{data.frame} or \code{tibble}. Multi-row +#' returns will error. +#' \item A named list. Unnamed lists are accepted and auto-named +#' as \code{v1}, \code{v2}, etc. +#' \item A named atomic vector. Unnamed vectors are accepted and +#' auto-named as \code{v1}, \code{v2}, etc. +#' } +#' @param ctx A precomputed context list from \code{.build_wrap_context()}, +#' or \code{NULL}. #' @param ... Additional arguments forwarded to \code{user_fun}. #' #' @return If \code{flag_initiate = TRUE}, a list with one component: @@ -1222,153 +947,124 @@ analyseOneElement.gam <- function(i_element, #' @export analyseOneElement.wrap <- function(i_element, user_fun, - modelarray, - phenotypes, - scalar, + modelarray = NULL, + phenotypes = NULL, + scalar = NULL, num.subj.lthr, num.stat.output = NULL, flag_initiate = FALSE, on_error = "stop", + ctx = NULL, ...) 
{ - values <- scalars(modelarray)[[scalar]][i_element, ] - ## check number of subjects with (in)valid values: - flag_sufficient <- NULL - num.subj.valid <- length(values[is.finite(values)]) - if (num.subj.valid > num.subj.lthr) { - flag_sufficient <- TRUE - } else { - flag_sufficient <- FALSE + # Resolve context ---- + + if (is.null(ctx)) { + ctx <- .build_wrap_context(modelarray, phenotypes, scalar) } - if (flag_sufficient == TRUE) { - dat <- phenotypes - # attach all scalars with alignment and collision checks - scalar_names <- names(scalars(modelarray)) - collisions <- intersect(scalar_names, colnames(dat)) - if (length(collisions) > 0) { - stop(paste0( - "Column name collision between phenotypes and scalar names: ", - paste(collisions, collapse = ", "), - ". Please rename or remove these columns from phenotypes before wrapping." - )) - } - masks <- list() - phen_sources <- dat[["source_file"]] - for (sname in scalar_names) { - s_sources <- sources(modelarray)[[sname]] - if (!(all(s_sources %in% phen_sources) && all(phen_sources %in% s_sources))) { - stop(paste0( - "The source files for scalar '", sname, "' do not match phenotypes$source_file." 
- )) - } - reorder_idx <- match(phen_sources, s_sources) - s_vals <- scalars(modelarray)[[sname]][i_element, ] - s_vals <- s_vals[reorder_idx] - dat[[sname]] <- s_vals - masks[[length(masks) + 1L]] <- is.finite(s_vals) - } + # Per-element data assembly ---- - # intersection-based threshold across all scalars - valid_mask <- Reduce("&", masks) - num.subj.valid <- sum(valid_mask) - if (!(num.subj.valid > num.subj.lthr)) { - if (flag_initiate == TRUE) { - toreturn <- list(column_names = NaN) - return(toreturn) - } else { - onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) - return(onerow) - } + elem <- .assemble_element_data(i_element, ctx, num.subj.lthr) + + if (!elem$sufficient) { + if (flag_initiate) { + return(list(column_names = NaN)) + } else { + onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) + return(onerow) } + } - arguments <- list(...) - arguments$data <- dat - - # Execute user function with error handling - result <- tryCatch( - { - do.call(user_fun, arguments) - }, - error = function(e) { - msg <- paste0( - "analyseOneElement.wrap error at element ", - i_element, - ": ", - conditionMessage(e) - ) - if (on_error == "debug" && interactive()) { - message(msg) - browser() - } - if (on_error == "skip" || on_error == "debug") { - warning(msg) - if (flag_initiate == TRUE) { - return(structure(list(.wrap_error_initiate = TRUE), class = "wrap_error")) - } else { - return(structure(list(.wrap_error_runtime = TRUE), class = "wrap_error")) - } - } - stop(e) - } - ) + dat <- elem$dat - # On error in user function, return NaNs according to flag - if (inherits(result, "wrap_error")) { - if (flag_initiate == TRUE) { - toreturn <- list(column_names = NaN) - return(toreturn) - } else { - onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) - return(onerow) - } - } - # Coerce to one-row tibble - if ("data.frame" %in% class(result)) { - onerow_tbl <- tibble::as_tibble(result) - if (nrow(onerow_tbl) != 1) { - stop( - "The user function must return 
a one-row data.frame/tibble, a named list, or a named vector." - ) + # Execute user function ---- + + arguments <- list(...) + arguments$data <- dat + + result <- tryCatch( + { + do.call(user_fun, arguments) + }, + error = function(e) { + msg <- paste0( + "analyseOneElement.wrap error at element ", + i_element, ": ", conditionMessage(e) + ) + if (on_error == "debug" && interactive()) { + message(msg) + browser() } - } else if (is.list(result)) { - onerow_tbl <- tibble::as_tibble_row(result) - } else if (is.atomic(result)) { - if (is.null(names(result))) { - names(result) <- paste0("v", seq_along(result)) + if (on_error == "skip" || on_error == "debug") { + warning(msg) + if (flag_initiate) { + return(structure(list(.wrap_error_initiate = TRUE), class = "wrap_error")) + } else { + return(structure(list(.wrap_error_runtime = TRUE), class = "wrap_error")) + } } - onerow_tbl <- tibble::as_tibble_row(as.list(result)) - } else { - stop( - paste0( - "Unsupported return type from user function. ", - "Return a one-row data.frame/tibble, named list, or named vector." - ) - ) + stop(e) } + ) - # add element_id as first column - colnames.temp <- colnames(onerow_tbl) - onerow_tbl <- onerow_tbl %>% tibble::add_column(element_id = i_element - 1, .before = colnames.temp[1]) - - if (flag_initiate == TRUE) { - toreturn <- list(column_names = colnames(onerow_tbl)) - toreturn + # Coerce result to named numeric vector ---- + if (inherits(result, "wrap_error")) { + if (flag_initiate) { + return(list(column_names = NaN)) } else { - # return numeric vector to be consistent with other analysers - as.numeric(onerow_tbl) + onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) + return(onerow) + } + } + + if (is.data.frame(result) || tibble::is_tibble(result)) { + if (nrow(result) != 1) { + msg <- paste0( + "The user function must return a one-row data.frame/tibble, ", + "a named list, or a named vector." 
+ ) + if (on_error == "skip" || on_error == "debug") { + warning(paste0("analyseOneElement.wrap at element ", i_element, ": ", msg)) + if (flag_initiate) return(list(column_names = NaN)) + return(c(i_element - 1, rep(NaN, (num.stat.output - 1)))) + } + stop(msg) } + result_names <- colnames(result) + result_values <- as.numeric(result[1, ]) + } else if (is.list(result)) { + result_names <- names(result) + if (is.null(result_names) || any(result_names == "")) { + result_names <- paste0("v", seq_along(result)) + } + result_values <- as.numeric(unlist(result)) + } else if (is.atomic(result)) { + if (is.null(names(result))) { + names(result) <- paste0("v", seq_along(result)) + } + result_names <- names(result) + result_values <- as.numeric(result) } else { - # insufficient subjects - if (flag_initiate == TRUE) { - toreturn <- list(column_names = NaN) - toreturn - } else { - onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) - onerow + msg <- paste0("user_fun must return a one-row data.frame/tibble, a named list, ", + "or a named atomic vector. 
Got: ", class(result)[1]) + if (on_error == "skip" || on_error == "debug") { + warning(paste0("analyseOneElement.wrap at element ", i_element, ": ", msg)) + if (flag_initiate) return(list(column_names = NaN)) + return(c(i_element - 1, rep(NaN, (num.stat.output - 1)))) } + stop(msg) + } + + column_names <- c("element_id", result_names) + + if (flag_initiate) { + return(list(column_names = column_names)) + } else { + onerow <- c(i_element - 1, result_values) + return(onerow) } } diff --git a/R/analyse-helpers.R b/R/analyse-helpers.R index 5144a5d..7660603 100644 --- a/R/analyse-helpers.R +++ b/R/analyse-helpers.R @@ -1,7 +1,7 @@ # Internal shared helpers for ModelArray analysis functions # These are not exported — used by ModelArray.lm, ModelArray.gam, ModelArray.wrap - +# Validation and dataset preparation ---- #' Validate that data is a ModelArray #' @noRd .validate_modelarray_input <- function(data) { @@ -110,7 +110,320 @@ max(num.subj.total * num.subj.lthr.rel, num.subj.lthr.abs) } +# Create analysis context ---- +# Precomputed context builders for element-wise analysis functions. +# These hoist loop-invariant computations out of the per-element functions +# so they execute once before the loop rather than millions of times inside it. + +# Level 1: Base context shared by all analysis functions (lm, gam, wrap). +# Handles scalar discovery, collision checks, and cross-scalar source alignment. + +#' Build base context shared by all element-wise analysis functions +#' +#' Precomputes scalar name lookups, column-name collision checks, and +#' cross-scalar source-alignment indices that are invariant across elements. +#' This is the foundation on which model-specific contexts are built. +#' +#' @param modelarray A \linkS4class{ModelArray} object. +#' @param phenotypes A data.frame, already aligned to \code{modelarray} via +#' \code{.align_phenotypes()}. +#' @param scalar Character. The name of the response scalar. 
+#' @param scalar_subset Character vector or \code{NULL}. Which scalars to +#' attach and align. If \code{NULL}, all scalars in the ModelArray are used +#' (the \code{wrap} behaviour). If a character vector, only those scalars +#' are included (the \code{lm}/\code{gam} behaviour where we detect +#' formula-referenced scalars). +#' @return A named list with components: +#' \describe{ +#' \item{modelarray}{The ModelArray object (read-only reference).} +#' \item{phenotypes}{The aligned phenotypes data.frame.} +#' \item{scalar}{Character. The response scalar name.} +#' \item{all_scalar_names}{Character vector. Names of all scalars in the +#' ModelArray.} +#' \item{attached_scalars}{Character vector. Names of scalars that will +#' be attached to the per-element data.frame (may be a subset of +#' \code{all_scalar_names} or all of them).} +#' \item{predictor_reorder}{A named list of integer vectors. For each +#' scalar in \code{attached_scalars}, the precomputed index vector +#' from \code{match(phen_sources, scalar_sources)} that reorders +#' scalar columns to match phenotype rows. For the response scalar +#' (which is already aligned by \code{.align_phenotypes()}), this +#' entry is \code{NULL} (no reordering needed).} +#' } +#' @noRd +.build_base_context <- function(modelarray, phenotypes, scalar, + scalar_subset = NULL) { + all_scalar_names <- names(scalars(modelarray)) + + + # Determine which scalars to attach + + if (is.null(scalar_subset)) { + # wrap mode: attach all scalars + attached_scalars <- all_scalar_names + } else { + # lm/gam mode: attach only the explicitly requested scalars + # (response + any predictor scalars detected from the formula) + attached_scalars <- unique(scalar_subset) + } + + + # Collision check: scalar names vs phenotype column names. + # This is invariant — the same collision would occur at every element, + + # so we fail fast once rather than millions of times. 
+ + collisions <- intersect(attached_scalars, colnames(phenotypes)) + if (length(collisions) > 0) { + stop( + "Column name collision between phenotypes and scalar names: ", + paste(collisions, collapse = ", "), + ". Please rename or remove these columns from phenotypes before ", + "modeling." + ) + } + + # Precompute source-alignment reorder indices for each attached scalar. + # The response scalar is already aligned by .align_phenotypes(), so it + + # needs no reordering. Predictor scalars from mergeModelArrays() may + # have different column orderings and need match()-based reordering. + phen_sources <- phenotypes[["source_file"]] + predictor_reorder <- stats::setNames( + vector("list", length(attached_scalars)), + attached_scalars + ) + + for (sname in attached_scalars) { + s_sources <- sources(modelarray)[[sname]] + + if (identical(s_sources, phen_sources)) { + # Already aligned (typical for the response scalar, or when all + # scalars share the same source order after mergeModelArrays) + predictor_reorder[[sname]] <- NULL + } else { + # Validate that the sets match (once, not per element) + if (!(all(s_sources %in% phen_sources) && + all(phen_sources %in% s_sources))) { + stop( + "The source files for scalar '", sname, + "' do not match phenotypes$source_file." + ) + } + predictor_reorder[[sname]] <- match(phen_sources, s_sources) + } + } + + list( + modelarray = modelarray, + phenotypes = phenotypes, + scalar = scalar, + all_scalar_names = all_scalar_names, + attached_scalars = attached_scalars, + predictor_reorder = predictor_reorder + ) +} + + +# Level 2: Model-specific context builders that add formula-derived +# information on top of the base context. + +#' Build context for element-wise linear model fitting +#' +#' Extends the base context with formula parsing results: the LHS variable +#' name, RHS variable names, and which RHS variables are scalars (as opposed +#' to phenotype columns). All of this is invariant across elements. 
+#' +#' @param formula A \code{\link[stats]{formula}} to be passed to +#' \code{\link[stats]{lm}}. +#' @param modelarray A \linkS4class{ModelArray} object. +#' @param phenotypes A data.frame, already aligned via +#' \code{.align_phenotypes()}. +#' @param scalar Character. The response scalar name. +#' @return A named list inheriting all components from +#' \code{.build_base_context()} plus: +#' \describe{ +#' \item{formula}{The model formula.} +#' \item{lhs_name}{Character. The response variable name from the +#' formula LHS.} +#' \item{rhs_vars}{Character vector. All variable names on the RHS.} +#' \item{scalar_predictors}{Character vector. RHS variables that are +#' scalar names in the ModelArray (i.e., cross-scalar predictors).} +#' } +#' @noRd +.build_lm_context <- function(formula, modelarray, phenotypes, scalar) { + # Formula parsing — currently repeated per element inside + # analyseOneElement.lm [4] + all_vars <- all.vars(formula) + lhs_name <- tryCatch( + as.character(formula[[2]]), + error = function(e) NULL + ) + rhs_vars <- setdiff(all_vars, lhs_name) + + # Detect which RHS variables are scalars in the ModelArray + all_scalar_names <- names(scalars(modelarray)) + scalar_predictors <- intersect(rhs_vars, all_scalar_names) + + # The scalars to attach: response + any scalar predictors + scalar_subset <- unique(c(scalar, scalar_predictors)) + + # Build the base context with only the needed scalars + base_ctx <- .build_base_context( + modelarray = modelarray, + phenotypes = phenotypes, + scalar = scalar, + scalar_subset = scalar_subset + ) + + # Extend with formula-specific fields + base_ctx$formula <- formula + base_ctx$lhs_name <- lhs_name + base_ctx$rhs_vars <- rhs_vars + base_ctx$scalar_predictors <- scalar_predictors + base_ctx +} + + +#' Build context for element-wise GAM fitting +#' +#' Extends the base context with formula parsing results and GAM-specific +#' formula validation that currently runs inside the ModelArray.gam() +#' preamble but whose 
results are never passed to the per-element function. +#' +#' @param formula A \code{\link[stats]{formula}} to be passed to +#' \code{\link[mgcv]{gam}}. +#' @param modelarray A \linkS4class{ModelArray} object. +#' @param phenotypes A data.frame, already aligned via +#' \code{.align_phenotypes()}. +#' @param scalar Character. The response scalar name. +#' @return A named list inheriting all components from +#' \code{.build_lm_context()} (which itself inherits from +#' \code{.build_base_context()}) plus: +#' \describe{ +#' \item{gam_formula_breakdown}{The result of +#' \code{mgcv::interpret.gam(formula)}, cached for reuse.} +#' } +#' @noRd +.build_gam_context <- function(formula, modelarray, phenotypes, scalar) { + # GAM formula validation — currently runs in ModelArray.gam() preamble [1] + # but the breakdown result is discarded. We cache it here. + gam_breakdown <- tryCatch( + mgcv::interpret.gam(formula), + error = function(cond) { + stop("The formula is not valid for mgcv::gam()! Please check and revise.") + } + ) + + # The formula structure is the same as lm for variable detection purposes + ctx <- .build_lm_context(formula, modelarray, phenotypes, scalar) + + # Add GAM-specific cached data + ctx$gam_formula_breakdown <- gam_breakdown + + ctx +} + + +#' Build context for element-wise user-supplied function execution +#' +#' Uses the base context in "attach all scalars" mode since +#' \code{analyseOneElement.wrap} attaches every scalar in the ModelArray +#' to the per-element data.frame, not just formula-referenced ones. +#' +#' @param modelarray A \linkS4class{ModelArray} object. +#' @param phenotypes A data.frame, already aligned via +#' \code{.align_phenotypes()}. +#' @param scalar Character. The primary scalar name (used for the initial +#' validity check). +#' @return A named list: the base context with \code{scalar_subset = NULL} +#' (all scalars attached). 
+#' @noRd +.build_wrap_context <- function(modelarray, phenotypes, scalar) { + # scalar_subset = NULL triggers "attach all" mode in .build_base_context() + ctx <- .build_base_context( + modelarray = modelarray, + phenotypes = phenotypes, + scalar = scalar, + scalar_subset = NULL + ) + + ctx +} + + +# Level 3: Shared per-element data assembly helper. +# Extracts scalar values for one element using the precomputed context, +# builds the validity mask, and returns the filtered data.frame. +# This replaces duplicated logic across analyseOneElement.lm, +# analyseOneElement.gam, and analyseOneElement.wrap [4]. + +#' Assemble per-element data.frame from precomputed context +#' +#' Reads scalar rows from the ModelArray, applies precomputed reorder +#' indices, builds the intersection validity mask, and returns the +#' filtered data.frame ready for model fitting or user function execution. +#' +#' @param i_element Integer. 1-based element index. +#' @param ctx A context list from one of the \code{.build_*_context()} +#' functions. +#' @param num.subj.lthr Numeric. Minimum number of subjects with finite +#' values required. +#' @return A list with components: +#' \describe{ +#' \item{dat}{A data.frame: the filtered phenotypes with scalar columns +#' attached. \code{NULL} if the element has insufficient valid +#' subjects.} +#' \item{sufficient}{Logical. Whether the element passed the subject +#' threshold.} +#' \item{num_valid}{Integer. 
Number of subjects with finite values +#' across all attached scalars.} +#' } +#' @noRd +.assemble_element_data <- function(i_element, ctx, num.subj.lthr) { + # Read the response scalar row — the only mandatory per-element I/O + response_vals <- scalars(ctx$modelarray)[[ctx$scalar]][i_element, ] + + # Start the validity mask with the response scalar + masks <- list(is.finite(response_vals)) + + # Read and reorder additional attached scalars + scalar_values <- list() + scalar_values[[ctx$scalar]] <- response_vals + + other_scalars <- setdiff(ctx$attached_scalars, ctx$scalar) + for (sname in other_scalars) { + s_vals <- scalars(ctx$modelarray)[[sname]][i_element, ] + + # Apply precomputed reorder index (or use as-is if NULL) + reorder_idx <- ctx$predictor_reorder[[sname]] + if (!is.null(reorder_idx)) { + s_vals <- s_vals[reorder_idx] + } + + scalar_values[[sname]] <- s_vals + masks[[length(masks) + 1L]] <- is.finite(s_vals) + } + + # Intersection mask across all scalars + valid_mask <- Reduce("&", masks) + num_valid <- sum(valid_mask) + + if (!(num_valid > num.subj.lthr)) { + return(list(dat = NULL, sufficient = FALSE, num_valid = num_valid)) + } + + # Build filtered data.frame + dat <- ctx$phenotypes[valid_mask, , drop = FALSE] + for (sname in ctx$attached_scalars) { + dat[[sname]] <- scalar_values[[sname]][valid_mask] + } + + list(dat = dat, sufficient = TRUE, num_valid = num_valid) +} + +# Find initiator element ---- #' Find a valid initiator element by searching middle → forward → backward #' #' @param analyse_one_fn The per-element analysis function @@ -186,6 +499,7 @@ } +# Parallelization ---- #' Dispatch parallel/serial apply with optional progress bar #' #' @param element.subset Integer vector of element indices @@ -236,7 +550,7 @@ fits } - +# P-value correction ---- #' Correct p-values for a set of terms and append corrected columns #' #' @param df_out Data.frame of results @@ -264,7 +578,7 @@ df_out } - +# Streaming writes ---- #' Initialize an incremental 
writer for /results datasets #' @noRd .init_results_stream_writer <- function(write_results_name, diff --git a/R/analyse.R b/R/analyse.R index 9cd8881..9779d39 100644 --- a/R/analyse.R +++ b/R/analyse.R @@ -132,10 +132,12 @@ #' @rdname ModelArray.lm #' @export -ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NULL, full.outputs = FALSE, +ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NULL, + full.outputs = FALSE, var.terms = c("estimate", "statistic", "p.value"), var.model = c("adj.r.squared", "p.value"), - correct.p.value.terms = c("fdr"), correct.p.value.model = c("fdr"), + correct.p.value.terms = c("fdr"), + correct.p.value.model = c("fdr"), num.subj.lthr.abs = 10, num.subj.lthr.rel = 0.2, verbose = TRUE, pbar = TRUE, n_cores = 1, on_error = "stop", @@ -146,125 +148,69 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU write_results_compression_level = 4L, return_output = TRUE, ...) { + # Validation ---- .validate_modelarray_input(data) element.subset <- .validate_element_subset(element.subset, data, scalar) phenotypes <- .align_phenotypes(data, phenotypes, scalar) - - ### display additional arguments: + # Display additional arguments ---- dots <- list(...) 
dots_names <- names(dots) - FUN <- stats::lm - - # subset: m1 <- "no default" printAdditionalArgu(FUN, "subset", dots, m1) - - # weights: m1 <- "no default" - if ("weights" %in% dots_names) { # if user provides weights - m_usr_input <- paste0( - class(dots$weights), - " with length of ", - length(dots$weights) - ) # message describing usr's input; cannot use dim on c(1,2) + if ("weights" %in% dots_names) { + m_usr_input <- paste0(class(dots$weights), " with length of ", length(dots$weights)) } else { m_usr_input <- NULL } printAdditionalArgu(FUN, "weights", dots, m1, m_usr_input) - - # na.action: m1 <- "no default" printAdditionalArgu(FUN, "na.action", dots, m1) - - # method: - printAdditionalArgu(FUN, "method", dots) # default: "qr" - - # model: - m1 <- invisible(eval(formals(FUN)[["model"]])) %>% as.character() # default: [logical] TRUE + printAdditionalArgu(FUN, "method", dots) + m1 <- invisible(eval(formals(FUN)[["model"]])) %>% as.character() printAdditionalArgu(FUN, "model", dots, m1) - - # x: - m1 <- invisible(eval(formals(FUN)[["x"]])) %>% as.character() # default: [logical] FALSE + m1 <- invisible(eval(formals(FUN)[["x"]])) %>% as.character() printAdditionalArgu(FUN, "x", dots, m1) - # y: - m1 <- invisible(eval(formals(FUN)[["y"]])) %>% as.character() # default: [logical] FALSE - printAdditionalArgu(FUN, "y", dots, m1) - - # qr: - m1 <- invisible(eval(formals(FUN)[["qr"]])) %>% as.character() # default: [logical] TRUE - printAdditionalArgu(FUN, "qr", dots, m1) - - # singular.ok: - m1 <- invisible(eval(formals(FUN)[["singular.ok"]])) %>% as.character() # default: [logical] TRUE - printAdditionalArgu(FUN, "singular.ok", dots, m1) - - # contrasts: - printAdditionalArgu(FUN, "contrasts", dots) # default: NULL - - # offset: - m1 <- "no default" # there is no default - printAdditionalArgu(FUN, "offset", dots, m1) - - num.subj.lthr <- .compute_subject_threshold(phenotypes, num.subj.lthr.abs, num.subj.lthr.rel) - ### other setups: + # Full outputs override ---- 
var.terms.full <- c("estimate", "std.error", "statistic", "p.value") var.model.full <- c( "r.squared", "adj.r.squared", "sigma", "statistic", "p.value", "df", "logLik", "AIC", "BIC", "deviance", "df.residual", "nobs" ) - if (full.outputs == TRUE) { # full set of outputs + if (full.outputs == TRUE) { var.terms <- var.terms.full var.model <- var.model.full } - # check on validity of list of vars: - var.terms <- var.terms[!duplicated(var.terms)] # remove duplicated element(s) - var.model <- var.model[!duplicated(var.model)] - - # check if all var.* are empty: - if (length(var.terms) == 0 && length(var.model) == 0) { - stop("All var.* arguments [var.terms, var.model] are empty!") - } - # check if every var is valid: - for (var in var.terms) { - if (!(var %in% var.terms.full)) { - stop(paste0(var, " is not valid for var.terms!")) - } - } - for (var in var.model) { - if (!(var %in% var.model.full)) { - stop(paste0(var, " is not valid for var.model!")) - } - } - - - # check for p.value correction: - # check for terms: + # P-value correction checks ---- check_validity_correctPValue( correct.p.value.terms, "correct.p.value.terms", var.terms, "var.terms" ) - # check for model: check_validity_correctPValue( correct.p.value.model, "correct.p.value.model", var.model, "var.model" ) + # Build context ONCE before the loop ---- + ctx <- .build_lm_context(formula, data, phenotypes, scalar) - ### start the process: + # Start the model fitting ---- if (verbose) { message(glue::glue("Fitting element-wise linear models for {scalar}")) message(glue::glue("initiating....")) } num.elements.total <- numElementsTotal(modelarray = data, scalar_name = scalar) + + # Initiate using ctx ---- init_args <- list( - formula = formula, modelarray = data, phenotypes = phenotypes, scalar = scalar, + ctx = ctx, var.terms = var.terms, var.model = var.model, num.subj.lthr = num.subj.lthr, on_error = on_error, ... 
) @@ -279,13 +225,16 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU message(glue::glue("looping across elements....")) } - need_term_correction <- (!all(correct.p.value.terms == "none")) && ("p.value" %in% var.terms) - need_model_correction <- (!all(correct.p.value.model == "none")) && ("p.value" %in% var.model) + need_term_correction <- (!all(correct.p.value.terms == "none")) && + ("p.value" %in% var.terms) + need_model_correction <- (!all(correct.p.value.model == "none")) && + ("p.value" %in% var.model) need_full_df <- return_output || need_term_correction || need_model_correction if (!return_output && (need_term_correction || need_model_correction)) { stop("return_output=FALSE is not supported when p-value correction is requested") } + # Initialize stream writer ---- writer <- .init_results_stream_writer( write_results_name = write_results_name, write_results_file = write_results_file, @@ -296,15 +245,19 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU compression_level = write_results_compression_level ) + # Start model looping ---- fits_all <- if (need_full_df) vector("list", length(element.subset)) else NULL chunk_size <- if (is.null(writer)) length(element.subset) else as.integer(write_results_flush_every) chunk_starts <- seq(1L, length(element.subset), by = chunk_size) + for (chunk_start in chunk_starts) { chunk_end <- min(chunk_start + chunk_size - 1L, length(element.subset)) chunk_elements <- element.subset[chunk_start:chunk_end] - chunk_fits <- .parallel_dispatch(chunk_elements, analyseOneElement.lm, n_cores, pbar, - formula = formula, modelarray = data, phenotypes = phenotypes, scalar = scalar, + ## Main loop now passes ctx instead of formula/modelarray/phenotypes/scalar + chunk_fits <- .parallel_dispatch( + chunk_elements, analyseOneElement.lm, n_cores, pbar, + ctx = ctx, var.terms = var.terms, var.model = var.model, num.subj.lthr = num.subj.lthr, num.stat.output = num.stat.output, 
flag_initiate = FALSE, on_error = on_error, @@ -333,7 +286,7 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU df_out <- .correct_pvalues(df_out, list.terms, correct.p.value.terms, var.terms) df_out <- .correct_pvalues(df_out, "model", correct.p.value.model, var.model) - # Streamed blocks were uncorrected; rewrite final corrected results if needed. + # Rewrite corrected results to HDF5 if needed ---- if (!is.null(writer) && (need_term_correction || need_model_correction)) { writeResults( fn.output = write_results_file, @@ -414,7 +367,8 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU #' @rdname ModelArray.gam #' @export -ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = NULL, full.outputs = FALSE, +ModelArray.gam <- function(formula, data, phenotypes, scalar, + element.subset = NULL, full.outputs = FALSE, var.smoothTerms = c("statistic", "p.value"), var.parametricTerms = c("estimate", "statistic", "p.value"), var.model = c("dev.expl"), @@ -431,158 +385,85 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = N write_results_compression_level = 4L, return_output = TRUE, ...) { + # Validation ---- .validate_modelarray_input(data) element.subset <- .validate_element_subset(element.subset, data, scalar) + phenotypes <- .align_phenotypes(data, phenotypes, scalar) - # check if the formula is valid in terms of mgcv::gam() - tryCatch( - { - gam.formula.breakdown <- mgcv::interpret.gam(formula) - }, - error = function(cond) { - stop(paste0("The formula is not valid for mgcv::gam()! 
Please check and revise.")) - } - ) - - checker_gam_formula(formula, gam.formula.breakdown) + # Build context ---- + ctx <- .build_gam_context(formula, data, phenotypes, scalar) - phenotypes <- .align_phenotypes(data, phenotypes, scalar) + # Use the cached breakdown for the formula checker display ---- + checker_gam_formula(formula, ctx$gam_formula_breakdown) - ### display additional arguments: [only important one] + # Display additional arguments ---- dots <- list(...) dots_names <- names(dots) - FUN <- mgcv::gam - - # method: (default: "GCV.Cp") - printAdditionalArgu(FUN, "method", dots) # default: "GCV.Cp" - - # TODO: optional: check if fx=FALSE; if so, add edf to the list of var + warning: fx=TRUE is recommended - + printAdditionalArgu(FUN, "method", dots) num.subj.lthr <- .compute_subject_threshold(phenotypes, num.subj.lthr.abs, num.subj.lthr.rel) - ### when full.outputs = TRUE: + # Full outputs override ---- var.smoothTerms.full <- c("edf", "ref.df", "statistic", "p.value") var.parametricTerms.full <- c("estimate", "std.error", "statistic", "p.value") var.model.full <- c( "adj.r.squared", "dev.expl", "sp.criterion", "scale", "df", "logLik", "AIC", "BIC", "deviance", "df.residual", "nobs" ) - - if (full.outputs == TRUE) { # full set of outputs + if (full.outputs == TRUE) { var.smoothTerms <- var.smoothTerms.full var.parametricTerms <- var.parametricTerms.full var.model <- var.model.full } - ### check on validity of arguments: var.term and var.model - var.smoothTerms <- var.smoothTerms[!duplicated(var.smoothTerms)] # remove duplicated element(s) - var.parametricTerms <- var.parametricTerms[!duplicated(var.parametricTerms)] - var.model <- var.model[!duplicated(var.model)] - - # check if all var.* are empty: - if (length(var.smoothTerms) == 0 && length(var.parametricTerms) == 0 && length(var.model) == 0) { - stop("All var.* arguments [var.smoothTerms, var.parametricTerms, var.model] are empty!") - } - - # check if every var is valid: - for (var in 
var.smoothTerms) { - if (!(var %in% var.smoothTerms.full)) { - stop(paste0(var, " is not valid for var.smoothTerms!")) - } - } - for (var in var.parametricTerms) { - if (!(var %in% var.parametricTerms.full)) { - stop(paste0(var, " is not valid for var.parametricTerms!")) - } - } - for (var in var.model) { - if (!(var %in% var.model.full)) { - stop(paste0(var, " is not valid!")) - } - } - - - var.model.orig <- var.model - if (!is.null(changed.rsq.term.index)) { # changed.rsq is not null --> requested - - # check if the term index is valid: - if (min(changed.rsq.term.index) <= 0) { - # any of not positive | can't really check if it's integer as is.integer(1) is FALSE... - stop( - paste0( - "There is element(s) in changed.rsq.term.index <= 0. ", - "It should be a (list of) positive integer!" - ) - ) - } - - terms.full.formula <- stats::terms(formula, keep.order = TRUE) - # not the re-order the terms | see: https://rdrr.io/r/stats/terms.formula.html - if (max(changed.rsq.term.index) > length(labels(terms.full.formula))) { - # if max is more than the number of terms on RHS of formula - stop( - paste0( - "Largest index in changed.rsq.term.index is more than the term number on the ", - "right hand side of formula!" - ) - ) - } - - # check how many variables on RHS; if no (but intercept, i.e. xx ~ 1), stop - if (length(labels(terms.full.formula)) == 0) { - stop( - paste0( - "Trying to analyze changed.rsq but there is no variable (except intercept 1) ", - "on right hand side of formula! Please provide at least one valid variable." 
-      )
-    )
+  # Changed.rsq setup ----
+  if (!is.null(changed.rsq.term.index)) {
+    # FIX(review): keep.order = TRUE is required so changed.rsq.term.index
+    # keeps mapping onto the user's term order (terms() may reorder terms
+    # otherwise; the pre-refactor code relied on this, and drop.terms()
+    # below indexes into the same ordering).
+    terms.full.formula <- stats::terms(formula, keep.order = TRUE)
+    changed.rsq.term.fullFormat.list <- labels(terms.full.formula)[unlist(changed.rsq.term.index)]
+    # FIX(review): the previous stub assigned c(character(0), character(0))
+    # to every entry, so `...shortFormat.list[[i]][1]` was NA and all
+    # "<term>.delta.adj.rsq" / "<term>.partial.rsq" column names broke.
+    # Restore the short-name derivation: s(age, by = oSex) -> s_age_BYoSex.
+    changed.rsq.term.shortFormat.list <- list()
+    for (idx in seq_along(changed.rsq.term.fullFormat.list)) {
+      lbl <- changed.rsq.term.fullFormat.list[[idx]]
+      parts <- strsplit(lbl, "[(]")[[1]]
+      sm <- if (length(parts) > 1) eval(parse(text = lbl)) else NULL
+      short <- if (is.null(sm)) lbl else paste0(parts[1], "_", paste(sm$term, collapse = "_"))
+      if (!is.null(sm) && sm$by != "NA") short <- paste0(short, "_BY", sm$by)
+      changed.rsq.term.shortFormat.list[[idx]] <- short
     }
-
-    message(
-      paste0(
-        "will get changed R-squared (delta.adj.rsq and partial.rsq) so the execution time ",
-        "will be longer."
-      )
-    )
-    # add adj.r.squared into var.model
+    message("will get changed R-squared (delta.adj.rsq and partial.rsq) so the execution time will be longer.")
     if (!("adj.r.squared" %in% var.model)) {
       var.model <- c(var.model, "adj.r.squared")
     }
-
-    # also should return model.sse (for both full and reduced model):
     flag_sse <- TRUE
-  } else { # changed.rsq is null --> not requested
-    flag_sse <- FALSE # no need to calculate sse for the model
+  } else {
+    flag_sse <- FALSE
   }
-  ### check on arguments: p-values correction methods
-  # check for smoothTerms:
+  # p-value correction checks ----
   check_validity_correctPValue(
     correct.p.value.smoothTerms, "correct.p.value.smoothTerms",
     var.smoothTerms, "var.smoothTerms"
   )
-  # check for parametricTerms:
   check_validity_correctPValue(
     correct.p.value.parametricTerms, "correct.p.value.parametricTerms",
     var.parametricTerms, "var.parametricTerms"
   )
-
-  ### run
+  # Start the model fitting ----
   if (verbose) {
     message(glue::glue("Fitting element-wise GAMs for {scalar}"))
     message(glue::glue("initiating...."))
   }
   num.elements.total <- numElementsTotal(modelarray = data, scalar_name = scalar)
+
+  # Initiate using ctx ----
   init_args <- list(
-    formula = formula, modelarray = data, phenotypes = phenotypes, scalar = scalar,
-    var.smoothTerms = var.smoothTerms, var.parametricTerms =
var.parametricTerms, + ctx = ctx, + var.smoothTerms = var.smoothTerms, + var.parametricTerms = var.parametricTerms, var.model = var.model, - num.subj.lthr = num.subj.lthr, flag_sse = flag_sse, on_error = on_error, ... + num.subj.lthr = num.subj.lthr, + flag_sse = flag_sse, + on_error = on_error, ... ) initiator <- .find_initiator_element( analyseOneElement.gam, num.elements.total, init_args, verbose @@ -607,6 +488,7 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = N stop("return_output=FALSE is not supported when p-value correction or changed.rsq is requested") } + # Initialize stream writer ---- writer <- .init_results_stream_writer( write_results_name = write_results_name, write_results_file = write_results_file, @@ -617,20 +499,26 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = N compression_level = write_results_compression_level ) + # Start model looping ---- fits_all <- if (need_full_df) vector("list", length(element.subset)) else NULL chunk_size <- if (is.null(writer)) length(element.subset) else as.integer(write_results_flush_every) chunk_starts <- seq(1L, length(element.subset), by = chunk_size) + for (chunk_start in chunk_starts) { chunk_end <- min(chunk_start + chunk_size - 1L, length(element.subset)) chunk_elements <- element.subset[chunk_start:chunk_end] - chunk_fits <- .parallel_dispatch(chunk_elements, analyseOneElement.gam, n_cores, pbar, - formula = formula, modelarray = data, phenotypes = phenotypes, scalar = scalar, - var.smoothTerms = var.smoothTerms, var.parametricTerms = var.parametricTerms, + # Main loop passes ctx + chunk_fits <- .parallel_dispatch( + chunk_elements, analyseOneElement.gam, n_cores, pbar, + ctx = ctx, + var.smoothTerms = var.smoothTerms, + var.parametricTerms = var.parametricTerms, var.model = var.model, - num.subj.lthr = num.subj.lthr, num.stat.output = num.stat.output, - flag_initiate = FALSE, flag_sse = flag_sse, on_error = on_error, - ... 
+ num.subj.lthr = num.subj.lthr, + num.stat.output = num.stat.output, + flag_initiate = FALSE, flag_sse = flag_sse, + on_error = on_error, ... ) if (need_full_df) { @@ -652,79 +540,47 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = N df_out <- as.data.frame(df_out) colnames(df_out) <- column_names + # P-value corrections ---- + df_out <- .correct_pvalues(df_out, list.smoothTerms, correct.p.value.smoothTerms, var.smoothTerms) + df_out <- .correct_pvalues(df_out, list.parametricTerms, correct.p.value.parametricTerms, var.parametricTerms) - ### get the changed.rsq for smooth terms: - if (!is.null(changed.rsq.term.index)) { # if changed.rsq is requested - message("Getting delta R-squared: running the reduced model...") - - # list of term of interest for changed.rsq: - changed.rsq.term.fullFormat.list <- labels(terms.full.formula)[changed.rsq.term.index] - # the term for changed.rsq, in full format - # get the short version: - changed.rsq.term.shortFormat.list <- list() - for (changed.rsq.term.fullFormat in changed.rsq.term.fullFormat.list) { - temp <- strsplit(changed.rsq.term.fullFormat, "[(]")[[1]] - if (length(temp) == 1) { # it's not a smooth term - as there is no () - str_valid <- changed.rsq.term.fullFormat - } else { - smooth.class <- temp[1] - - theEval <- eval(parse(text = changed.rsq.term.fullFormat)) - str_valid <- paste0( - smooth.class, "_", - paste(theEval$term, collapse = "_") - ) # ti(x,z) --> ti_x_z; s(x) --> s_x - if (theEval$by != "NA") { - str_valid <- paste0(str_valid, "_BY", theEval$by) # s(age,by=oSex) --> s_age_BYoSex - } - } - - changed.rsq.term.shortFormat <- str_valid - changed.rsq.term.shortFormat.list <- append(changed.rsq.term.shortFormat.list, changed.rsq.term.shortFormat) - } + # Compute changed R-squared ---- + if (need_changed_rsq) { + terms.full.formula <- stats::terms(formula) - # loop of each changed.rsq.term.index (i.e. 
each term of interest) - for (i.changed.rsq.term in seq_along(changed.rsq.term.fullFormat.list)) { - idx.changed.rsq.term <- changed.rsq.term.index[i.changed.rsq.term] # index + for (i.changed.rsq.term in seq_along(changed.rsq.term.index)) { + idx.changed.rsq.term <- changed.rsq.term.index[[i.changed.rsq.term]] changed.rsq.term.fullFormat <- changed.rsq.term.fullFormat.list[i.changed.rsq.term] - changed.rsq.term.shortFormat <- changed.rsq.term.shortFormat.list[[i.changed.rsq.term]][1] # it's nested + changed.rsq.term.shortFormat <- changed.rsq.term.shortFormat.list[[i.changed.rsq.term]][1] - # get the formula of reduced model - # check if there is only one term (after removing it in reduced model, - # there is no term but intercept in the formula...) if (length(labels(terms.full.formula)) == 1) { temp <- toString(formula) %>% strsplit("[, ]") reduced.formula <- stats::as.formula(paste0(temp[[1]][3], "~1")) } else { reduced.formula <- formula( - stats::drop.terms( - terms.full.formula, - idx.changed.rsq.term, - keep.response = TRUE - ) - ) # index on RHS of formula -> change to class of formula (stats::as.formula does not work) + stats::drop.terms(terms.full.formula, idx.changed.rsq.term, keep.response = TRUE) + ) } message( - paste0( - "* Getting changed R-squared for term: ", - changed.rsq.term.fullFormat, - " via reduced model as below", - "; will show up as ", - changed.rsq.term.shortFormat, - " in final dataframe" - ) + "* Getting changed R-squared for term: ", + changed.rsq.term.fullFormat, + " via reduced model; will show up as ", + changed.rsq.term.shortFormat, + " in final dataframe" ) print(reduced.formula) - # var* for reduced model: only adjusted r sq is enough - # initiate: + ## Build a separate context for the reduced model ---- + reduced_ctx <- .build_gam_context(reduced.formula, data, phenotypes, scalar) + reduced.model.outputs_initiator <- analyseOneElement.gam( - i_element = i_element_success_initiate, # directly use the i_element with known 
success - reduced.formula, data, phenotypes, scalar, - var.smoothTerms = c(), var.parametricTerms = c(), var.model = c("adj.r.squared"), + i_element = i_element_success_initiate, + ctx = reduced_ctx, + var.smoothTerms = c(), var.parametricTerms = c(), + var.model = c("adj.r.squared"), num.subj.lthr = num.subj.lthr, num.stat.output = NULL, - flag_initiate = TRUE, flag_sse = TRUE, # also to get model.sse + flag_initiate = TRUE, flag_sse = TRUE, ... ) reduced.model.column_names <- reduced.model.outputs_initiator$column_names @@ -732,8 +588,7 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = N reduced.model.fits <- .parallel_dispatch( element.subset, analyseOneElement.gam, n_cores, pbar, - formula = reduced.formula, modelarray = data, phenotypes = phenotypes, - scalar = scalar, + ctx = reduced_ctx, var.smoothTerms = c(), var.parametricTerms = c(), var.model = c("adj.r.squared"), num.subj.lthr = num.subj.lthr, @@ -746,47 +601,20 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = N reduced.model.df_out <- as.data.frame(reduced.model.df_out) colnames(reduced.model.df_out) <- reduced.model.column_names - # rename "adj.r.squared" as "redModel.adj.r.squared" before merging into df_out - names(reduced.model.df_out)[names(reduced.model.df_out) == "model.adj.r.squared"] <- "redModel.adj.r.squared" - # rename "model.sse" as "redModel.sse" before merging into df_out - names(reduced.model.df_out)[names(reduced.model.df_out) == "model.sse"] <- "redModel.sse" - - # combine new df_out to original one: - df_out <- merge(df_out, reduced.model.df_out, by = "element_id") - - # calculate the delta.adj.rsq, add to the df_out: - col_name <- paste0(changed.rsq.term.shortFormat, ".delta.adj.rsq") - df_out[[col_name]] <- df_out[["model.adj.r.squared"]] - df_out[["redModel.adj.r.squared"]] + ## Compute delta adj R-sq and partial R-sq ---- + delta_col <- paste0(changed.rsq.term.shortFormat, ".delta.adj.rsq") + partial_col <- 
paste0(changed.rsq.term.shortFormat, ".partial.rsq") - # calculate the partial.rsq, add to the df_out - # partialRsq <- (sse.red - sse.full) / sse.red - col_name <- paste0(changed.rsq.term.shortFormat, ".partial.rsq") - df_out[[col_name]] <- (df_out[["redModel.sse"]] - df_out[["model.sse"]]) / df_out[["redModel.sse"]] + df_out[[delta_col]] <- df_out[["model.adj.r.squared"]] - + reduced.model.df_out[["model.adj.r.squared"]] - # remove column of redModel - df_out <- df_out %>% subset(select = -c( - redModel.adj.r.squared, - redModel.sse - )) - } # end of for loop across term of interest for changed.rsq - - - # if adjusted r sq is not requested (see var.model.orig), remove it: - if (!("adj.r.squared" %in% var.model.orig)) { - df_out <- df_out %>% subset(select = -c(model.adj.r.squared)) + full_sse <- df_out[["model.sse"]] + reduced_sse <- reduced.model.df_out[["model.sse"]] + df_out[[partial_col]] <- (reduced_sse - full_sse) / reduced_sse } + } - # remove full model's sse (model.sse): - df_out <- df_out %>% subset(select = -c(model.sse)) - } # end of if: requesting changed.rsq - - - ### correct p values - df_out <- .correct_pvalues(df_out, list.smoothTerms, correct.p.value.smoothTerms, var.smoothTerms) - df_out <- .correct_pvalues( - df_out, list.parametricTerms, correct.p.value.parametricTerms, var.parametricTerms - ) - + # Rewrite if corrections applied and streaming was used ---- if (!is.null(writer) && (need_smooth_correction || need_param_correction || need_changed_rsq)) { writeResults( fn.output = write_results_file, @@ -796,14 +624,12 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = N ) } - ### return if (return_output) { return(df_out) } invisible(NULL) } - #' Run a user-supplied function for element-wise data #' #' @description @@ -859,9 +685,19 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = N #' level for scalar writes. Default 4. #' @param ... Additional arguments forwarded to \code{FUN}. 
#' -#' @return A tibble with one row per element and first column -#' \code{element_id} (0-based). Remaining columns are determined by -#' the return value of \code{FUN}. +#' @return If \code{flag_initiate = TRUE}, a list with one component: +#' \describe{ +#' \item{column_names}{Character vector. The column names derived from +#' the return value of \code{user_fun}, with \code{"element_id"} +#' prepended. For unnamed list or atomic returns, columns are named +#' \code{v1}, \code{v2}, etc. Set to \code{NaN} if the element was +#' skipped or errored.} +#' } +#' If \code{flag_initiate = FALSE}, a numeric vector of length +#' \code{num.stat.output} with \code{element_id} (0-based) first and +#' the coerced output of \code{user_fun} in subsequent positions. +#' All-\code{NaN} (except \code{element_id}) if the element was skipped +#' or if an error occurred with \code{on_error = "skip"}. #' #' @seealso \code{\link{ModelArray.lm}} for linear models, #' \code{\link{ModelArray.gam}} for GAMs, @@ -921,21 +757,29 @@ ModelArray.wrap <- function(FUN, data, phenotypes, scalar, element.subset = NULL write_results_compression_level = 4L, return_output = TRUE, ...) { + # Validation ---- .validate_modelarray_input(data) element.subset <- .validate_element_subset(element.subset, data, scalar) phenotypes <- .align_phenotypes(data, phenotypes, scalar) num.subj.lthr <- .compute_subject_threshold(phenotypes, num.subj.lthr.abs, num.subj.lthr.rel) - ### initiate to get column names + # Build context ---- + ctx <- .build_wrap_context(data, phenotypes, scalar) + + # Start model fitting ---- if (verbose) { message(glue::glue("Running element-wise user function for {scalar}")) message(glue::glue("initiating....")) } num.elements.total <- numElementsTotal(modelarray = data, scalar_name = scalar) + + # Initiate using ctx ---- init_args <- list( - user_fun = FUN, modelarray = data, phenotypes = phenotypes, scalar = scalar, - num.subj.lthr = num.subj.lthr, on_error = on_error, ... 
+ user_fun = FUN, + ctx = ctx, + num.subj.lthr = num.subj.lthr, + on_error = on_error, ... ) initiator <- .find_initiator_element( analyseOneElement.wrap, num.elements.total, init_args, verbose @@ -945,6 +789,7 @@ ModelArray.wrap <- function(FUN, data, phenotypes, scalar, element.subset = NULL if (verbose) message(glue::glue("looping across elements....")) + # Initialize stream writer ---- writer_results <- .init_results_stream_writer( write_results_name = write_results_name, write_results_file = write_results_file, @@ -975,36 +820,21 @@ ModelArray.wrap <- function(FUN, data, phenotypes, scalar, element.subset = NULL if (is.character(write_scalar_columns)) { missing_cols <- setdiff(write_scalar_columns, column_names) if (length(missing_cols) > 0L) { - stop( - "write_scalar_columns not found in wrap output: ", - paste(missing_cols, collapse = ", ") - ) + stop("write_scalar_columns not found in wrap output: ", paste(missing_cols, collapse = ", ")) } scalar_col_idx <- match(write_scalar_columns, column_names) - } else if (is.numeric(write_scalar_columns)) { - scalar_col_idx <- as.integer(write_scalar_columns) - if (any(scalar_col_idx < 1L) || any(scalar_col_idx > length(column_names))) { - stop("write_scalar_columns numeric indices are out of bounds") - } } else { - stop("write_scalar_columns must be NULL, character, or integer") - } - - scalar_col_names <- column_names[scalar_col_idx] - if (length(scalar_col_names) == 0L) { - stop("write_scalar_columns selected zero columns") + scalar_col_idx <- as.integer(write_scalar_columns) } if (is.null(write_scalar_column_names)) { - write_scalar_column_names <- as.character(phenotypes[["source_file"]]) - } else { - write_scalar_column_names <- as.character(write_scalar_column_names) + write_scalar_column_names <- column_names[scalar_col_idx] } if (length(write_scalar_column_names) != length(scalar_col_idx)) { stop("length(write_scalar_column_names) must equal number of selected write_scalar_columns") } - # Initialize scalar 
output dataset. + # Initialize scalar output dataset if (!file.exists(write_scalar_file)) { rhdf5::h5createFile(write_scalar_file) } @@ -1032,6 +862,7 @@ ModelArray.wrap <- function(FUN, data, phenotypes, scalar, element.subset = NULL ) } + # Start model looping ---- fits_all <- if (return_output) vector("list", length(element.subset)) else NULL write_row_cursor <- 1L @@ -1043,13 +874,18 @@ ModelArray.wrap <- function(FUN, data, phenotypes, scalar, element.subset = NULL chunk_size <- min(chunk_size, as.integer(write_results_flush_every)) } chunk_starts <- seq(1L, length(element.subset), by = chunk_size) + for (chunk_start in chunk_starts) { chunk_end <- min(chunk_start + chunk_size - 1L, length(element.subset)) chunk_elements <- element.subset[chunk_start:chunk_end] - chunk_fits <- .parallel_dispatch(chunk_elements, analyseOneElement.wrap, n_cores, pbar, - user_fun = FUN, modelarray = data, phenotypes = phenotypes, scalar = scalar, - num.subj.lthr = num.subj.lthr, num.stat.output = num.stat.output, + ## Main loop passes ctx ---- + chunk_fits <- .parallel_dispatch( + chunk_elements, analyseOneElement.wrap, n_cores, pbar, + user_fun = FUN, + ctx = ctx, + num.subj.lthr = num.subj.lthr, + num.stat.output = num.stat.output, flag_initiate = FALSE, on_error = on_error, ... ) @@ -1080,6 +916,7 @@ ModelArray.wrap <- function(FUN, data, phenotypes, scalar, element.subset = NULL } } + # Finalize H5 writing ---- if (writing_scalar) { rhdf5::h5writeAttribute( attr = write_scalar_column_names, diff --git a/man/ModelArray.wrap.Rd b/man/ModelArray.wrap.Rd index 10d0686..79e5103 100644 --- a/man/ModelArray.wrap.Rd +++ b/man/ModelArray.wrap.Rd @@ -125,9 +125,19 @@ useful when writing large outputs directly to HDF5.} \item{...}{Additional arguments forwarded to \code{FUN}.} } \value{ -A tibble with one row per element and first column -\code{element_id} (0-based). Remaining columns are determined by -the return value of \code{FUN}. 
+If \code{flag_initiate = TRUE}, a list with one component: +\describe{ +\item{column_names}{Character vector. The column names derived from +the return value of \code{user_fun}, with \code{"element_id"} +prepended. For unnamed list or atomic returns, columns are named +\code{v1}, \code{v2}, etc. Set to \code{NaN} if the element was +skipped or errored.} +} +If \code{flag_initiate = FALSE}, a numeric vector of length +\code{num.stat.output} with \code{element_id} (0-based) first and +the coerced output of \code{user_fun} in subsequent positions. +All-\code{NaN} (except \code{element_id}) if the element was skipped +or if an error occurred with \code{on_error = "skip"}. } \description{ \code{ModelArray.wrap} runs a user-supplied function \code{FUN} at each diff --git a/man/analyseOneElement.gam.Rd b/man/analyseOneElement.gam.Rd index 58733da..73598fd 100644 --- a/man/analyseOneElement.gam.Rd +++ b/man/analyseOneElement.gam.Rd @@ -3,7 +3,7 @@ \name{analyseOneElement.gam} \alias{analyseOneElement.gam} \title{Fit a GAM for a single element -returns metadata (column names, smooth term names, parametric term names, +Returns metadata (column names, smooth term names, parametric term names, and the smoothing parameter criterion attribute name) used by \code{\link{ModelArray.gam}} to initialise the output data.frame. When \code{flag_initiate = FALSE}, it returns a numeric vector representing one @@ -11,18 +11,19 @@ row of the final results matrix.} \usage{ analyseOneElement.gam( i_element, - formula, - modelarray, - phenotypes, - scalar, - var.smoothTerms, - var.parametricTerms, - var.model, + formula = NULL, + modelarray = NULL, + phenotypes = NULL, + scalar = NULL, + var.smoothTerms = c("statistic", "p.value"), + var.parametricTerms = c("estimate", "statistic", "p.value"), + var.model = c("dev.expl"), num.subj.lthr, num.stat.output = NULL, flag_initiate = FALSE, flag_sse = FALSE, on_error = "stop", + ctx = NULL, ... 
) } @@ -32,14 +33,15 @@ analyseOneElement.gam( \item{formula}{A \code{\link[stats]{formula}} passed to \code{\link[mgcv]{gam}}.} -\item{modelarray}{A \linkS4class{ModelArray} object.} +\item{modelarray}{A \linkS4class{ModelArray} object. Ignored when +\code{ctx} is provided.} \item{phenotypes}{A data.frame of the cohort with columns of independent variables and covariates. Must contain a \code{"source_file"} column -matching \code{sources(modelarray)[[scalar]]}.} +matching \code{sources(modelarray)[[scalar]]}. Ignored when \code{ctx} is provided.} \item{scalar}{Character. The name of the element-wise scalar to analyse. -Must be one of \code{names(scalars(modelarray))}.} +Must be one of \code{names(scalars(modelarray))}. Ignored when \code{ctx} is provided.} \item{var.smoothTerms}{Character vector. Statistics to extract for smooth terms from \code{\link[broom]{tidy.gam}} with \code{parametric = FALSE} @@ -83,6 +85,9 @@ halts execution; \code{"skip"} returns all-\code{NaN} for this element; \code{"debug"} drops into \code{\link{browser}} (if interactive) then skips. 
Default: \code{"stop"}.} +\item{ctx}{A precomputed context list from \code{.build_gam_context()}, +or \code{NULL}.} + \item{...}{Additional arguments passed to \code{\link[mgcv]{gam}}.} } \value{ diff --git a/man/analyseOneElement.lm.Rd b/man/analyseOneElement.lm.Rd index 1069a1d..ddc9251 100644 --- a/man/analyseOneElement.lm.Rd +++ b/man/analyseOneElement.lm.Rd @@ -2,23 +2,21 @@ % Please edit documentation in R/ModelArray_Constructor.R \name{analyseOneElement.lm} \alias{analyseOneElement.lm} -\title{Fit a linear model for a single element -If the number of subjects with finite scalar values (not \code{NaN}, -\code{NA}, or \code{Inf}) does not exceed \code{num.subj.lthr}, the -element is skipped and all statistics are set to \code{NaN}.} +\title{Fit a linear model for a single element} \usage{ analyseOneElement.lm( i_element, - formula, - modelarray, - phenotypes, - scalar, - var.terms, - var.model, + formula = NULL, + modelarray = NULL, + phenotypes = NULL, + scalar = NULL, + var.terms = c("estimate", "statistic", "p.value"), + var.model = c("adj.r.squared", "p.value"), num.subj.lthr, num.stat.output = NULL, flag_initiate = FALSE, on_error = "stop", + ctx = NULL, ... ) } @@ -26,16 +24,18 @@ analyseOneElement.lm( \item{i_element}{Integer. The 1-based index of the element to analyse.} \item{formula}{A \code{\link[stats]{formula}} passed to -\code{\link[stats]{lm}}.} +\code{\link[stats]{lm}}. Ignored when \code{ctx} is provided (the +formula is taken from the context).} -\item{modelarray}{A \linkS4class{ModelArray} object.} +\item{modelarray}{A \linkS4class{ModelArray} object. Ignored when +\code{ctx} is provided.} \item{phenotypes}{A data.frame of the cohort with columns of independent variables and covariates. Must contain a \code{"source_file"} column -matching \code{sources(modelarray)[[scalar]]}.} +matching \code{sources(modelarray)[[scalar]]}. Ignored when \code{ctx} is provided.} \item{scalar}{Character. The name of the element-wise scalar to analyse. 
-Must be one of \code{names(scalars(modelarray))}.} +Must be one of \code{names(scalars(modelarray))}. Ignored when \code{ctx} is provided.} \item{var.terms}{Character vector. Statistics to extract per term from \code{\link[broom]{tidy.lm}} (e.g. \code{"estimate"}, \code{"statistic"}, @@ -67,6 +67,9 @@ halts execution; \code{"skip"} returns all-\code{NaN} for this element; \code{"debug"} drops into \code{\link{browser}} (if interactive) then skips. Default: \code{"stop"}.} +\item{ctx}{A precomputed context list from \code{.build_lm_context()}, +or \code{NULL} for legacy inline computation.} + \item{...}{Additional arguments passed to \code{\link[stats]{lm}}.} } \value{ @@ -84,14 +87,19 @@ value and the requested statistics in subsequent positions. All-\code{NaN} or if an error occurred with \code{on_error = "skip"}. } \description{ -Fit a linear model for a single element +Fits \code{\link[stats]{lm}} on one element's data. When a precomputed +context (\code{ctx}) is provided, all loop-invariant work (formula parsing, +collision checks, source alignment) is skipped. When \code{ctx} is +\code{NULL}, the function falls back to computing everything inline +(legacy behaviour for direct calls / debugging). + If the number of subjects with finite scalar values (not \code{NaN}, \code{NA}, or \code{Inf}) does not exceed \code{num.subj.lthr}, the element is skipped and all statistics are set to \code{NaN}. } \seealso{ -\code{\link{ModelArray.lm}} which calls this function iteratively, +\code{\link{ModelArray.lm}}, \code{.build_lm_context}, \code{\link{analyseOneElement.gam}} for the GAM equivalent, -\code{\link{analyseOneElement.wrap}} for user-supplied functions. 
+\code{\link{analyseOneElement.wrap}} for user-supplied functions } \keyword{internal} diff --git a/man/analyseOneElement.wrap.Rd b/man/analyseOneElement.wrap.Rd index 1780548..aa37a27 100644 --- a/man/analyseOneElement.wrap.Rd +++ b/man/analyseOneElement.wrap.Rd @@ -7,13 +7,14 @@ analyseOneElement.wrap( i_element, user_fun, - modelarray, - phenotypes, - scalar, + modelarray = NULL, + phenotypes = NULL, + scalar = NULL, num.subj.lthr, num.stat.output = NULL, flag_initiate = FALSE, on_error = "stop", + ctx = NULL, ... ) } @@ -22,17 +23,25 @@ analyseOneElement.wrap( \item{user_fun}{A function that accepts at least an argument named \code{data} (a data.frame: \code{phenotypes} with scalar columns -appended for the current element) and returns a one-row -data.frame/tibble, a named list, or a named atomic vector.} +appended for the current element) and returns one of: +\itemize{ +\item A one-row \code{data.frame} or \code{tibble}. Multi-row +returns will error. +\item A named list. Unnamed lists are accepted and auto-named +as \code{v1}, \code{v2}, etc. +\item A named atomic vector. Unnamed vectors are accepted and +auto-named as \code{v1}, \code{v2}, etc. +}} -\item{modelarray}{A \linkS4class{ModelArray} object.} +\item{modelarray}{A \linkS4class{ModelArray} object. Ignored when +\code{ctx} is provided.} \item{phenotypes}{A data.frame of the cohort with columns of independent variables and covariates. Must contain a \code{"source_file"} column -matching \code{sources(modelarray)[[scalar]]}.} +matching \code{sources(modelarray)[[scalar]]}. Ignored when \code{ctx} is provided.} \item{scalar}{Character. The name of the element-wise scalar to analyse. -Must be one of \code{names(scalars(modelarray))}.} +Must be one of \code{names(scalars(modelarray))}. Ignored when \code{ctx} is provided.} \item{num.subj.lthr}{Numeric. The pre-computed minimum number of subjects with finite values required for this element to be analysed. 
Elements @@ -56,6 +65,9 @@ halts execution; \code{"skip"} returns all-\code{NaN} for this element; \code{"debug"} drops into \code{\link{browser}} (if interactive) then skips. Default: \code{"stop"}.} +\item{ctx}{A precomputed context list from \code{.build_wrap_context()}, +or \code{NULL}.} + \item{...}{Additional arguments forwarded to \code{user_fun}.} } \value{ diff --git a/tests/testthat/test-ModelArray_low_hanging_coverage.R b/tests/testthat/test-ModelArray_low_hanging_coverage.R index 4c637a1..c439e1e 100644 --- a/tests/testthat/test-ModelArray_low_hanging_coverage.R +++ b/tests/testthat/test-ModelArray_low_hanging_coverage.R @@ -361,7 +361,7 @@ test_that("analyseOneElement.wrap covers coercion and error branches", { phenotypes = phen, scalar = "FD", num.subj.lthr = 0, flag_initiate = FALSE, num.stat.output = 2 ), - "Unsupported return type" + "user_fun must return a one-row data.frame/tibble" ) expect_warning( diff --git a/tests/testthat/test-analyse_context.R b/tests/testthat/test-analyse_context.R new file mode 100644 index 0000000..4b1430b --- /dev/null +++ b/tests/testthat/test-analyse_context.R @@ -0,0 +1,933 @@ +# tests/testthat/test-analyse_context.R +# +# Tests for context builders (.build_*_context), shared element +# assembly (.assemble_element_data), per-element ctx vs. legacy +# equivalence, .find_initiator_element, and top-level integration. +# +# Organized to follow the two-tier hierarchy: +# SECTION 1: Context builders (base → lm → gam → wrap) +# SECTION 2: Shared per-element helper (.assemble_element_data) +# SECTION 3: Per-element function ctx vs. legacy (lm → gam → wrap) +# SECTION 4: Initiator search (.find_initiator_element) +# SECTION 5: Top-level integration (ModelArray.gam, ModelArray.wrap) + +# ---- Shared fixtures ---- +# NOTE: Uses plain matrices in the scalars slot, not DelayedArray-backed +# HDF5 data. This works because the ModelArray S4 class accepts any +# matrix-like object for scalars [4]. 
Integration tests in Section 5 +# use real HDF5-backed ModelArrays via system.file(). + +make_test_modelarray <- function(n_elements = 3, n_subjects = 4, + scalars = c("FD"), + inject_nan = FALSE) { + src <- paste0("sub", seq_len(n_subjects)) + scalar_list <- list() + source_list <- list() + + for (sn in scalars) { + set.seed(match(sn, scalars)) + mat <- matrix(rnorm(n_elements * n_subjects), + nrow = n_elements, ncol = n_subjects) + if (inject_nan && sn == scalars[1]) { + mat[1, 1] <- NaN + mat[1, 2] <- Inf + } + colnames(mat) <- src + scalar_list[[sn]] <- mat + source_list[[sn]] <- src + } + + ma <- methods::new("ModelArray", + sources = source_list, + scalars = scalar_list, + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame( + source_file = src, + age = seq(20, by = 10, length.out = n_subjects), + sex = rep(c("M", "F"), length.out = n_subjects), + stringsAsFactors = FALSE + ) + list(ma = ma, phen = phen) +} + + +# ================================================================== +# SECTION 1: Context builders (unit tests) +# Ordered to match the two-tier hierarchy: +# base → lm → gam → wrap +# ================================================================== + +# ---- .build_base_context ---- + +test_that(".build_base_context attaches all scalars when scalar_subset is NULL", { + fix <- make_test_modelarray(scalars = c("FD", "FC")) + ctx <- .build_base_context(fix$ma, fix$phen, scalar = "FD", + scalar_subset = NULL) + + expect_identical(ctx$scalar, "FD") + expect_identical(sort(ctx$attached_scalars), sort(c("FD", "FC"))) + expect_identical(sort(ctx$all_scalar_names), sort(c("FD", "FC"))) +}) + +test_that(".build_base_context attaches only requested scalars when scalar_subset is provided", { + fix <- make_test_modelarray(scalars = c("FD", "FC")) + ctx <- .build_base_context(fix$ma, fix$phen, scalar = "FD", + scalar_subset = c("FD")) + + expect_identical(ctx$attached_scalars, "FD") + # FC is in the ModelArray but not attached + 
expect_true("FC" %in% ctx$all_scalar_names) + expect_false("FC" %in% ctx$attached_scalars) +}) + +test_that(".build_base_context detects collision between scalar names and phenotype columns", { + fix <- make_test_modelarray(scalars = c("FD")) + phen_collision <- fix$phen + phen_collision$FD <- 1:nrow(phen_collision) + + expect_error( + .build_base_context(fix$ma, phen_collision, scalar = "FD", + scalar_subset = c("FD")), + "Column name collision" + ) +}) + +test_that(".build_base_context sets predictor_reorder to NULL for aligned sources", { + fix <- make_test_modelarray(scalars = c("FD")) + ctx <- .build_base_context(fix$ma, fix$phen, scalar = "FD", + scalar_subset = c("FD")) + + expect_null(ctx$predictor_reorder[["FD"]]) +}) + +test_that(".build_base_context computes reorder indices for misaligned sources", { + fix <- make_test_modelarray(scalars = c("FD", "FC")) + + # Manually scramble FC sources + fc_src <- rev(sources(fix$ma)[["FC"]]) + fix$ma@sources[["FC"]] <- fc_src + + ctx <- .build_base_context(fix$ma, fix$phen, scalar = "FD", + scalar_subset = c("FD", "FC")) + + expect_null(ctx$predictor_reorder[["FD"]]) + expect_false(is.null(ctx$predictor_reorder[["FC"]])) + # Applying the reorder should recover the phenotype source order + expect_identical( + fc_src[ctx$predictor_reorder[["FC"]]], + fix$phen$source_file + ) +}) + +test_that(".build_base_context errors when scalar sources don't match phenotypes", { + fix <- make_test_modelarray(scalars = c("FD", "FC")) + fix$ma@sources[["FC"]] <- paste0("wrong", seq_along(sources(fix$ma)[["FC"]])) + + expect_error( + .build_base_context(fix$ma, fix$phen, scalar = "FD", + scalar_subset = c("FD", "FC")), + "do not match phenotypes" + ) +}) + +# ---- .build_lm_context ---- + +test_that(".build_lm_context extracts formula components correctly", { + fix <- make_test_modelarray(scalars = c("FD")) + ctx <- .build_lm_context(FD ~ age + sex, fix$ma, fix$phen, scalar = "FD") + + expect_identical(ctx$formula, FD ~ age + sex) + 
expect_identical(ctx$lhs_name, "FD") + expect_true("age" %in% ctx$rhs_vars) + expect_true("sex" %in% ctx$rhs_vars) + expect_identical(ctx$scalar_predictors, character(0)) +}) + +test_that(".build_lm_context detects scalar predictors in cross-scalar formulas", { + fix <- make_test_modelarray(scalars = c("FD", "FC")) + ctx <- .build_lm_context(FD ~ age + FC, fix$ma, fix$phen, scalar = "FD") + + expect_identical(ctx$scalar_predictors, "FC") + expect_true("FC" %in% ctx$attached_scalars) + expect_true("FD" %in% ctx$attached_scalars) +}) + +test_that(".build_lm_context does not attach unrelated scalars", { + fix <- make_test_modelarray(scalars = c("FD", "FC", "FA")) + ctx <- .build_lm_context(FD ~ age + FC, fix$ma, fix$phen, scalar = "FD") + + expect_true("FA" %in% ctx$all_scalar_names) + expect_false("FA" %in% ctx$attached_scalars) +}) + +# ---- .build_gam_context ---- + +test_that(".build_gam_context caches interpret.gam breakdown", { + fix <- make_test_modelarray(scalars = c("FD")) + ctx <- .build_gam_context(FD ~ s(age) + sex, fix$ma, fix$phen, scalar = "FD") + + expect_true(!is.null(ctx$gam_formula_breakdown)) + expect_true("smooth.spec" %in% names(ctx$gam_formula_breakdown)) +}) + +test_that(".build_gam_context errors on invalid GAM formula", { + fix <- make_test_modelarray(scalars = c("FD")) + + expect_error( + .build_gam_context(FD ~ s(age, d = 999), fix$ma, fix$phen, scalar = "FD"), + "not valid for mgcv::gam" + ) +}) + +test_that(".build_gam_context inherits lm context fields", { + fix <- make_test_modelarray(scalars = c("FD")) + ctx <- .build_gam_context(FD ~ s(age) + sex, fix$ma, fix$phen, scalar = "FD") + + # Should have all the lm context fields + expect_identical(ctx$lhs_name, "FD") + expect_true("age" %in% ctx$rhs_vars) + expect_true("sex" %in% ctx$rhs_vars) + expect_identical(ctx$scalar_predictors, character(0)) +}) + +test_that(".build_gam_context detects scalar predictors same as lm context", { + fix <- make_test_modelarray(scalars = c("FD", "FC")) + 
ctx <- .build_gam_context(FD ~ s(age) + FC, fix$ma, fix$phen, scalar = "FD") + + expect_identical(ctx$scalar_predictors, "FC") + expect_true("FC" %in% ctx$attached_scalars) + expect_true("FD" %in% ctx$attached_scalars) +}) + +# ---- .build_wrap_context ---- + +test_that(".build_wrap_context attaches all scalars", { + fix <- make_test_modelarray(scalars = c("FD", "FC")) + ctx <- .build_wrap_context(fix$ma, fix$phen, scalar = "FD") + + expect_identical(sort(ctx$attached_scalars), sort(c("FD", "FC"))) +}) + +test_that(".build_wrap_context has no formula fields", { + fix <- make_test_modelarray(scalars = c("FD")) + ctx <- .build_wrap_context(fix$ma, fix$phen, scalar = "FD") + + expect_null(ctx$formula) + expect_null(ctx$lhs_name) +}) + + +# ================================================================== +# SECTION 2: Shared per-element helper +# ================================================================== + +# ---- .assemble_element_data ---- + +test_that(".assemble_element_data returns filtered data.frame for valid element", { + fix <- make_test_modelarray(scalars = c("FD")) + ctx <- .build_lm_context(FD ~ age, fix$ma, fix$phen, scalar = "FD") + + elem <- .assemble_element_data(1, ctx, num.subj.lthr = 0) + + expect_true(elem$sufficient) + expect_true(is.data.frame(elem$dat)) + expect_true("FD" %in% colnames(elem$dat)) + expect_true("age" %in% colnames(elem$dat)) + expect_true("source_file" %in% colnames(elem$dat)) +}) + +test_that(".assemble_element_data returns insufficient when below threshold", { + fix <- make_test_modelarray(n_subjects = 4, scalars = c("FD")) + ctx <- .build_lm_context(FD ~ age, fix$ma, fix$phen, scalar = "FD") + + elem <- .assemble_element_data(1, ctx, num.subj.lthr = 100) + + expect_false(elem$sufficient) + expect_null(elem$dat) +}) + +test_that(".assemble_element_data excludes NaN/Inf subjects from mask", { + fix <- make_test_modelarray(n_subjects = 4, scalars = c("FD"), + inject_nan = TRUE) + ctx <- .build_lm_context(FD ~ age, fix$ma, 
fix$phen, scalar = "FD") + + elem <- .assemble_element_data(1, ctx, num.subj.lthr = 0) + + # Element 1 has NaN in subject 1 and Inf in subject 2 + expect_true(elem$sufficient) + expect_equal(elem$num_valid, 2) # only subjects 3 and 4 + expect_equal(nrow(elem$dat), 2) +}) + +test_that(".assemble_element_data applies precomputed reorder for cross-scalar", { + fix <- make_test_modelarray(n_elements = 2, n_subjects = 4, + scalars = c("FD", "FC")) + + # Scramble FC sources + fc_src_scrambled <- rev(sources(fix$ma)[["FC"]]) + fix$ma@sources[["FC"]] <- fc_src_scrambled + + ctx <- .build_lm_context(FD ~ age + FC, fix$ma, fix$phen, scalar = "FD") + elem <- .assemble_element_data(1, ctx, num.subj.lthr = 0) + + expect_true(elem$sufficient) + expect_true("FC" %in% colnames(elem$dat)) + + # The FC values should be reordered to match phenotype order + fc_raw <- scalars(fix$ma)[["FC"]][1, ] + reorder_idx <- ctx$predictor_reorder[["FC"]] + fc_reordered <- fc_raw[reorder_idx] + expect_equal(as.numeric(elem$dat$FC), as.numeric(fc_reordered)) +}) + +test_that(".assemble_element_data intersection mask works across scalars", { + n_subj <- 6 + src <- paste0("sub", 1:n_subj) + + fd_mat <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 1) + fc_mat <- matrix(c(10, NaN, 30, NaN, 50, 60), nrow = 1) + colnames(fd_mat) <- src + colnames(fc_mat) <- src + + ma <- methods::new("ModelArray", + sources = list(FD = src, FC = src), + scalars = list(FD = fd_mat, FC = fc_mat), + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame( + source_file = src, + age = seq(20, by = 10, length.out = n_subj) + ) + + ctx <- .build_lm_context(FD ~ age + FC, ma, phen, scalar = "FD") + elem <- .assemble_element_data(1, ctx, num.subj.lthr = 0) + + # Subjects 2 and 4 have NaN in FC, so should be excluded + expect_equal(elem$num_valid, 4) + expect_equal(nrow(elem$dat), 4) + expect_false(any(is.nan(elem$dat$FC))) +}) + + +# ================================================================== +# SECTION 3: 
Per-element function ctx vs. legacy equivalence +# Ordered: lm → gam → wrap +# ================================================================== + +# ---- analyseOneElement.lm ---- + +test_that("analyseOneElement.lm produces identical results with ctx vs. without", { + # Create fixture with 8 subjects (enough for a meaningful lm fit) + fix <- make_test_modelarray(n_elements = 2, n_subjects = 8, scalars = c("FD")) + + # The fixture's phen has age and sex; rebuild with only age for a + # clean single-predictor test. The fixture already has 8 subjects. + src <- sources(fix$ma)[["FD"]] + phen <- data.frame( + source_file = src, + age = seq(20, by = 5, length.out = length(src)), + stringsAsFactors = FALSE + ) + + ctx <- .build_lm_context(FD ~ age, fix$ma, phen, scalar = "FD") + + # With context — initiation + result_ctx <- analyseOneElement.lm( + i_element = 1, + ctx = ctx, + var.terms = c("estimate", "p.value"), + var.model = c("adj.r.squared"), + num.subj.lthr = 0, + num.stat.output = NULL, + flag_initiate = TRUE + ) + + # Without context — legacy path + result_legacy <- analyseOneElement.lm( + i_element = 1, + formula = FD ~ age, + modelarray = fix$ma, + phenotypes = phen, + scalar = "FD", + var.terms = c("estimate", "p.value"), + var.model = c("adj.r.squared"), + num.subj.lthr = 0, + num.stat.output = NULL, + flag_initiate = TRUE + ) + + expect_identical(result_ctx$column_names, result_legacy$column_names) + expect_identical(result_ctx$list.terms, result_legacy$list.terms) + + # Also check runtime (non-initiate) path + num_stat <- length(result_ctx$column_names) + result_ctx_run <- analyseOneElement.lm( + i_element = 1, + ctx = ctx, + var.terms = c("estimate", "p.value"), + var.model = c("adj.r.squared"), + num.subj.lthr = 0, + num.stat.output = num_stat, + flag_initiate = FALSE + ) + result_legacy_run <- analyseOneElement.lm( + i_element = 1, + formula = FD ~ age, + modelarray = fix$ma, + phenotypes = phen, + scalar = "FD", + var.terms = c("estimate", "p.value"), 
+ var.model = c("adj.r.squared"), + num.subj.lthr = 0, + num.stat.output = num_stat, + flag_initiate = FALSE + ) + + expect_equal(result_ctx_run, result_legacy_run) +}) + +# ---- analyseOneElement.gam ---- + +test_that("analyseOneElement.gam produces identical results with ctx vs. without", { + src <- paste0("s", 1:8) + set.seed(42) + mat <- matrix(rnorm(3 * 8), nrow = 3) + colnames(mat) <- src + + modelarray <- methods::new("ModelArray", + sources = list(FD = src), + scalars = list(FD = mat), + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame( + source_file = src, + age = seq(20, by = 5, length.out = 8), + stringsAsFactors = FALSE + ) + + ctx <- .build_gam_context(FD ~ s(age, k = 4), modelarray, phen, scalar = "FD") + + # With context — initiation + result_ctx <- analyseOneElement.gam( + i_element = 1, + ctx = ctx, + var.smoothTerms = c("statistic", "p.value"), + var.parametricTerms = c("estimate", "p.value"), + var.model = c("dev.expl"), + num.subj.lthr = 0, + num.stat.output = NULL, + flag_initiate = TRUE + ) + + # Without context — legacy path + result_legacy <- analyseOneElement.gam( + i_element = 1, + formula = FD ~ s(age, k = 4), + modelarray = modelarray, + phenotypes = phen, + scalar = "FD", + var.smoothTerms = c("statistic", "p.value"), + var.parametricTerms = c("estimate", "p.value"), + var.model = c("dev.expl"), + num.subj.lthr = 0, + num.stat.output = NULL, + flag_initiate = TRUE + ) + + expect_identical(result_ctx$column_names, result_legacy$column_names) + expect_identical(result_ctx$list.smoothTerms, result_legacy$list.smoothTerms) + expect_identical(result_ctx$list.parametricTerms, result_legacy$list.parametricTerms) + + # Runtime path + num_stat <- length(result_ctx$column_names) + + result_ctx_run <- analyseOneElement.gam( + i_element = 1, + ctx = ctx, + var.smoothTerms = c("statistic", "p.value"), + var.parametricTerms = c("estimate", "p.value"), + var.model = c("dev.expl"), + num.subj.lthr = 0, + num.stat.output = 
num_stat, + flag_initiate = FALSE + ) + + result_legacy_run <- analyseOneElement.gam( + i_element = 1, + formula = FD ~ s(age, k = 4), + modelarray = modelarray, + phenotypes = phen, + scalar = "FD", + var.smoothTerms = c("statistic", "p.value"), + var.parametricTerms = c("estimate", "p.value"), + var.model = c("dev.expl"), + num.subj.lthr = 0, + num.stat.output = num_stat, + flag_initiate = FALSE + ) + + expect_equal(result_ctx_run, result_legacy_run) +}) + +test_that("analyseOneElement.gam with ctx normalizes (Intercept) to Intercept", { + src <- paste0("s", 1:8) + set.seed(42) + mat <- matrix(rnorm(2 * 8), nrow = 2) + colnames(mat) <- src + + modelarray <- methods::new("ModelArray", + sources = list(FD = src), + scalars = list(FD = mat), + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame( + source_file = src, + age = seq(20, by = 5, length.out = 8), + stringsAsFactors = FALSE + ) + + ctx <- .build_gam_context(FD ~ s(age, k = 4) + 1, modelarray, phen, scalar = "FD") + + result <- analyseOneElement.gam( + i_element = 1, + ctx = ctx, + var.smoothTerms = c("p.value"), + var.parametricTerms = c("estimate"), + var.model = c("dev.expl"), + num.subj.lthr = 0, + num.stat.output = NULL, + flag_initiate = TRUE + ) + + # Should contain "Intercept", not "(Intercept)" + param_cols <- grep("estimate", result$column_names, value = TRUE) + expect_true(any(grepl("^Intercept\\.", param_cols))) + expect_false(any(grepl("\\(Intercept\\)", result$column_names))) +}) + +test_that("analyseOneElement.gam with ctx returns NaN row for insufficient subjects", { + src <- paste0("s", 1:4) + mat <- matrix(c(1, 2, 3, 4, 5, 6, 7, 8), nrow = 2) + colnames(mat) <- src + + modelarray <- methods::new("ModelArray", + sources = list(FD = src), + scalars = list(FD = mat), + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame( + source_file = src, + age = c(20, 30, 40, 50) + ) + + ctx <- .build_gam_context(FD ~ age, modelarray, phen, scalar = "FD") 
+ + # Initiation with impossible threshold + result_init <- analyseOneElement.gam( + i_element = 1, + ctx = ctx, + var.smoothTerms = c(), + var.parametricTerms = c("estimate"), + var.model = c("dev.expl"), + num.subj.lthr = 1000, + flag_initiate = TRUE + ) + expect_true(is.nan(result_init$column_names[1])) + + # Runtime with impossible threshold + result_run <- analyseOneElement.gam( + i_element = 1, + ctx = ctx, + var.smoothTerms = c(), + var.parametricTerms = c("estimate"), + var.model = c("dev.expl"), + num.subj.lthr = 1000, + num.stat.output = 5, + flag_initiate = FALSE + ) + expect_equal(result_run[1], 0) + expect_true(all(is.nan(result_run[-1]))) +}) + +test_that("analyseOneElement.gam with ctx handles on_error = 'skip'", { + src <- paste0("s", 1:8) + set.seed(42) + mat <- matrix(rnorm(2 * 8), nrow = 2) + colnames(mat) <- src + + modelarray <- methods::new("ModelArray", + sources = list(FD = src), + scalars = list(FD = mat), + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame( + source_file = src, + age = seq(20, by = 5, length.out = 8) + ) + + ctx <- .build_gam_context(FD ~ nonexistent_var, modelarray, phen, scalar = "FD") + + expect_warning( + result <- analyseOneElement.gam( + i_element = 1, + ctx = ctx, + var.smoothTerms = c(), + var.parametricTerms = c("estimate"), + var.model = c("dev.expl"), + num.subj.lthr = 0, + num.stat.output = 4, + flag_initiate = FALSE, + on_error = "skip" + ) + ) + expect_equal(result[1], 0) + expect_true(all(is.nan(result[-1]))) +}) + +# ---- analyseOneElement.wrap ---- + +test_that("analyseOneElement.wrap produces identical results with ctx vs. 
without", { + src <- c("s1", "s2", "s3", "s4") + modelarray <- methods::new("ModelArray", + sources = list(FD = src, FA = src), + scalars = list( + FD = matrix(c(1, 2, 3, 4, 2, 3, 4, 5), nrow = 2, byrow = TRUE), + FA = matrix(c(10, 20, 30, 40, 50, 60, 70, 80), nrow = 2, byrow = TRUE) + ), + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame( + source_file = src, + age = c(20, 30, 40, 50) + ) + + my_fun <- function(data, ...) { + list(fd_mean = mean(data$FD), fa_mean = mean(data$FA)) + } + + ctx <- .build_wrap_context(modelarray, phen, scalar = "FD") + + # With context — initiation + result_ctx <- analyseOneElement.wrap( + i_element = 1, + user_fun = my_fun, + ctx = ctx, + num.subj.lthr = 0, + flag_initiate = TRUE + ) + + # Without context — legacy + result_legacy <- analyseOneElement.wrap( + i_element = 1, + user_fun = my_fun, + modelarray = modelarray, + phenotypes = phen, + scalar = "FD", + num.subj.lthr = 0, + flag_initiate = TRUE + ) + + expect_identical(result_ctx$column_names, result_legacy$column_names) + + # Runtime + num_stat <- length(result_ctx$column_names) + + result_ctx_run <- analyseOneElement.wrap( + i_element = 1, + user_fun = my_fun, + ctx = ctx, + num.subj.lthr = 0, + num.stat.output = num_stat, + flag_initiate = FALSE + ) + + result_legacy_run <- analyseOneElement.wrap( + i_element = 1, + user_fun = my_fun, + modelarray = modelarray, + phenotypes = phen, + scalar = "FD", + num.subj.lthr = 0, + num.stat.output = num_stat, + flag_initiate = FALSE + ) + + expect_equal(result_ctx_run, result_legacy_run) +}) + +test_that("analyseOneElement.wrap with ctx attaches ALL scalars", { + src <- c("s1", "s2", "s3", "s4") + modelarray <- methods::new("ModelArray", + sources = list(FD = src, FA = src, FC = src), + scalars = list( + FD = matrix(1:8, nrow = 2), + FA = matrix(11:18, nrow = 2), + FC = matrix(21:28, nrow = 2) + ), + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame(source_file = src, age = c(20, 
30, 40, 50)) + + check_fun <- function(data, ...) { + # Verify all three scalars are present + list( + has_FD = as.numeric("FD" %in% colnames(data)), + has_FA = as.numeric("FA" %in% colnames(data)), + has_FC = as.numeric("FC" %in% colnames(data)) + ) + } + + ctx <- .build_wrap_context(modelarray, phen, scalar = "FD") + + result <- analyseOneElement.wrap( + i_element = 1, + user_fun = check_fun, + ctx = ctx, + num.subj.lthr = 0, + num.stat.output = 4, + flag_initiate = FALSE + ) + + # element_id = 0, then has_FD=1, has_FA=1, has_FC=1 + expect_equal(result[2], 1) + expect_equal(result[3], 1) + expect_equal(result[4], 1) +}) + +test_that("analyseOneElement.wrap with ctx returns NaN for insufficient subjects", { + src <- c("s1", "s2", "s3", "s4") + modelarray <- methods::new("ModelArray", + sources = list(FD = src), + scalars = list(FD = matrix(c(1, 2, 3, 4, 5, 6, 7, 8), nrow = 2, byrow = TRUE)), + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame(source_file = src, age = c(20, 30, 40, 50)) + my_fun <- function(data, ...) 
list(m = mean(data$FD)) + + ctx <- .build_wrap_context(modelarray, phen, scalar = "FD") + + result_init <- analyseOneElement.wrap( + i_element = 1, + user_fun = my_fun, + ctx = ctx, + num.subj.lthr = 1000, + flag_initiate = TRUE + ) + expect_true(is.nan(result_init$column_names[1])) + + result_run <- analyseOneElement.wrap( + i_element = 1, + user_fun = my_fun, + ctx = ctx, + num.subj.lthr = 1000, + num.stat.output = 2, + flag_initiate = FALSE + ) + expect_equal(result_run[1], 0) + expect_true(all(is.nan(result_run[-1]))) +}) + +test_that("analyseOneElement.wrap with ctx handles on_error = 'skip'", { + src <- c("s1", "s2", "s3", "s4") + modelarray <- methods::new("ModelArray", + sources = list(FD = src), + scalars = list(FD = matrix(1:8, nrow = 2)), + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame(source_file = src, age = c(20, 30, 40, 50)) + ctx <- .build_wrap_context(modelarray, phen, scalar = "FD") + + expect_warning( + result <- analyseOneElement.wrap( + i_element = 1, + user_fun = function(data, ...) stop("boom"), + ctx = ctx, + num.subj.lthr = 0, + num.stat.output = 3, + flag_initiate = FALSE, + on_error = "skip" + ) + ) + expect_equal(result[1], 0) + expect_true(all(is.nan(result[-1]))) +}) + +test_that("analyseOneElement.wrap with ctx handles various return types", { + src <- c("s1", "s2", "s3", "s4") + modelarray <- methods::new("ModelArray", + sources = list(FD = src), + scalars = list(FD = matrix(c(1, 2, 3, 4, 5, 6, 7, 8), nrow = 2, byrow = TRUE)), + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame(source_file = src, age = c(20, 30, 40, 50)) + ctx <- .build_wrap_context(modelarray, phen, scalar = "FD") + + # data.frame return + df_fun <- function(data, ...) 
data.frame(m = mean(data$FD)) + result_df <- analyseOneElement.wrap( + i_element = 1, user_fun = df_fun, ctx = ctx, + num.subj.lthr = 0, num.stat.output = 2, flag_initiate = FALSE + ) + expect_equal(length(result_df), 2) + + # named list return + list_fun <- function(data, ...) list(m = mean(data$FD)) + result_list <- analyseOneElement.wrap( + i_element = 1, user_fun = list_fun, ctx = ctx, + num.subj.lthr = 0, num.stat.output = 2, flag_initiate = FALSE + ) + expect_equal(length(result_list), 2) + + # named atomic return + atomic_fun <- function(data, ...) c(m = mean(data$FD)) + result_atomic <- analyseOneElement.wrap( + i_element = 1, user_fun = atomic_fun, ctx = ctx, + num.subj.lthr = 0, num.stat.output = 2, flag_initiate = FALSE + ) + expect_equal(length(result_atomic), 2) + + # All three should produce the same values + expect_equal(result_df[2], result_list[2]) + expect_equal(result_df[2], result_atomic[2]) +}) + + +# ================================================================== +# SECTION 4: Initiator search +# ================================================================== + +# ---- .find_initiator_element ---- + +test_that(".find_initiator_element succeeds on middle element", { + fix <- make_test_modelarray(n_elements = 5, n_subjects = 8, scalars = c("FD")) + src <- sources(fix$ma)[["FD"]] + phen <- data.frame( + source_file = src, + age = seq(20, by = 5, length.out = length(src)) + ) + ctx <- .build_lm_context(FD ~ age, fix$ma, phen, scalar = "FD") + + init_args <- list( + ctx = ctx, + var.terms = c("estimate"), + var.model = c("adj.r.squared"), + num.subj.lthr = 0, + on_error = "stop" + ) + + result <- .find_initiator_element( + analyseOneElement.lm, 5, init_args, verbose = FALSE + ) + + expect_false(is.nan(result$outputs$column_names[1])) + expect_true(result$i_element >= 1 && result$i_element <= 5) +}) + +test_that(".find_initiator_element fails when all elements have insufficient subjects", { + fix <- make_test_modelarray(n_elements = 3, 
n_subjects = 4, scalars = c("FD")) + src <- sources(fix$ma)[["FD"]] + phen <- data.frame( + source_file = src, + age = seq(20, by = 5, length.out = length(src)) + ) + ctx <- .build_lm_context(FD ~ age, fix$ma, phen, scalar = "FD") + + init_args <- list( + ctx = ctx, + var.terms = c("estimate"), + var.model = c("adj.r.squared"), + num.subj.lthr = 1000, # impossibly high threshold + on_error = "stop" + ) + + expect_error( + .find_initiator_element(analyseOneElement.lm, 3, init_args, verbose = FALSE), + "no elements with sufficient valid" + ) +}) + + +# ================================================================== +# SECTION 5: Top-level integration +# Verifies ctx flows correctly through ModelArray.gam/wrap +# using real HDF5-backed data from system.file() +# ================================================================== + +# ---- ModelArray.gam integration ---- + +test_that("ModelArray.gam produces consistent results after context refactoring", { + h5_path <- system.file("extdata", "n50_fixels.h5", package = "ModelArray") + csv_path <- system.file("extdata", "n50_cohort.csv", package = "ModelArray") + + modelarray <- ModelArray(h5_path, scalar_types = c("FD")) + phenotypes <- read.csv(csv_path) + element.subset <- 1:5 + + result <- ModelArray.gam( + FD ~ s(age, fx = TRUE) + sex, + data = modelarray, + phenotypes = phenotypes, + scalar = "FD", + element.subset = element.subset, + n_cores = 1, + pbar = FALSE + ) + + expect_true(is.data.frame(result)) + expect_equal(nrow(result), length(element.subset)) + expect_equal(result$element_id, 0:(length(element.subset) - 1)) + + # Verify Intercept naming (not "(Intercept)") + expect_true(any(grepl("^Intercept\\.", colnames(result)))) + expect_false(any(grepl("\\(Intercept\\)", colnames(result)))) + + # Verify smooth term naming uses underscores [10] + expect_true(any(grepl("^s_age\\.", colnames(result)))) + + # No NAs for valid elements + stat_cols <- setdiff(colnames(result), "element_id") + 
expect_true(all(is.finite(as.matrix(result[, stat_cols])))) +}) + +# ---- ModelArray.wrap integration ---- + +test_that("ModelArray.wrap produces consistent results after context refactoring", { + h5_path <- system.file("extdata", "n50_fixels.h5", package = "ModelArray") + csv_path <- system.file("extdata", "n50_cohort.csv", package = "ModelArray") + + modelarray <- ModelArray(h5_path, scalar_types = c("FD")) + phenotypes <- read.csv(csv_path) + element.subset <- 1:5 + + my_fun <- function(data, ...) { + list(fd_mean = mean(data$FD, na.rm = TRUE)) + } + + result <- ModelArray.wrap( + FUN = my_fun, + data = modelarray, + phenotypes = phenotypes, + scalar = "FD", + element.subset = element.subset, + n_cores = 1, + pbar = FALSE + ) + + expect_true(is.data.frame(result)) + expect_equal(nrow(result), length(element.subset)) + expect_equal(result$element_id, 0:(length(element.subset) - 1)) + expect_true("fd_mean" %in% colnames(result)) + expect_true(all(is.finite(result$fd_mean))) +}) From 66448b9216a3e9656aa82ece32078d1f02a140dc Mon Sep 17 00:00:00 2001 From: araikes Date: Thu, 9 Apr 2026 14:49:54 -0700 Subject: [PATCH 2/9] Remove unnecessary bracket refs --- R/analyse-helpers.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/analyse-helpers.R b/R/analyse-helpers.R index 7660603..41927d1 100644 --- a/R/analyse-helpers.R +++ b/R/analyse-helpers.R @@ -253,7 +253,7 @@ #' @noRd .build_lm_context <- function(formula, modelarray, phenotypes, scalar) { # Formula parsing — currently repeated per element inside - # analyseOneElement.lm [4] + # analyseOneElement.lm all_vars <- all.vars(formula) lhs_name <- tryCatch( as.character(formula[[2]]), @@ -307,7 +307,7 @@ #' } #' @noRd .build_gam_context <- function(formula, modelarray, phenotypes, scalar) { - # GAM formula validation — currently runs in ModelArray.gam() preamble [1] + # GAM formula validation — currently runs in ModelArray.gam() preamble # but the breakdown result is discarded. We cache it here. 
gam_breakdown <- tryCatch( mgcv::interpret.gam(formula), @@ -357,7 +357,7 @@ # Extracts scalar values for one element using the precomputed context, # builds the validity mask, and returns the filtered data.frame. # This replaces duplicated logic across analyseOneElement.lm, -# analyseOneElement.gam, and analyseOneElement.wrap [4]. +# analyseOneElement.gam, and analyseOneElement.wrap. #' Assemble per-element data.frame from precomputed context #' From 26d548a8d52a3cea8fa7be70940e5b49cb64dd30 Mon Sep 17 00:00:00 2001 From: araikes Date: Thu, 9 Apr 2026 14:50:05 -0700 Subject: [PATCH 3/9] Finish updating docs --- R/ModelArray_Constructor.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/ModelArray_Constructor.R b/R/ModelArray_Constructor.R index 743916f..3f3b9e2 100644 --- a/R/ModelArray_Constructor.R +++ b/R/ModelArray_Constructor.R @@ -920,6 +920,8 @@ analyseOneElement.gam <- function(i_element, #' \item A named atomic vector. Unnamed vectors are accepted and #' auto-named as \code{v1}, \code{v2}, etc. #' } +#' All return values are coerced to a numeric vector internally. +#' Unsupported types (e.g., environments, S4 objects) will error. #' @param ctx A precomputed context list from \code{.build_wrap_context()}, #' or \code{NULL}. #' @param ... Additional arguments forwarded to \code{user_fun}. 
From 3277e714b56ea57563f92ace29c831528973f029 Mon Sep 17 00:00:00 2001 From: araikes Date: Thu, 9 Apr 2026 15:13:42 -0700 Subject: [PATCH 4/9] Revert removed validations --- R/analyse.R | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/R/analyse.R b/R/analyse.R index 9779d39..ebd2fdd 100644 --- a/R/analyse.R +++ b/R/analyse.R @@ -187,6 +187,27 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU var.model <- var.model.full } + # check on validity of list of vars: + var.terms <- var.terms[!duplicated(var.terms)] + var.model <- var.model[!duplicated(var.model)] + + # check if all var.* are empty: + if (length(var.terms) == 0 && length(var.model) == 0) { + stop("All var.* arguments [var.terms, var.model] are empty!") + } + + # check if every var is valid: + for (var in var.terms) { + if (!(var %in% var.terms.full)) { + stop(paste0(var, " is not valid for var.terms!")) + } + } + for (var in var.model) { + if (!(var %in% var.model.full)) { + stop(paste0(var, " is not valid for var.model!")) + } + } + # P-value correction checks ---- check_validity_correctPValue( correct.p.value.terms, "correct.p.value.terms", @@ -417,9 +438,58 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, var.model <- var.model.full } + var.smoothTerms <- var.smoothTerms[!duplicated(var.smoothTerms)] + var.parametricTerms <- var.parametricTerms[!duplicated(var.parametricTerms)] + var.model <- var.model[!duplicated(var.model)] + + if (length(var.smoothTerms) == 0 && length(var.parametricTerms) == 0 && length(var.model) == 0) { + stop("All var.* arguments [var.smoothTerms, var.parametricTerms, var.model] are empty!") + } + + for (var in var.smoothTerms) { + if (!(var %in% var.smoothTerms.full)) { + stop(paste0(var, " is not valid for var.smoothTerms!")) + } + } + for (var in var.parametricTerms) { + if (!(var %in% var.parametricTerms.full)) { + stop(paste0(var, " is not valid for 
var.parametricTerms!")) + } + } + for (var in var.model) { + if (!(var %in% var.model.full)) { + stop(paste0(var, " is not valid!")) + } + } + # Changed.rsq setup ---- + var.model.orig <- var.model if (!is.null(changed.rsq.term.index)) { - terms.full.formula <- stats::terms(formula) + + # check if the term index is valid: + if (min(changed.rsq.term.index) <= 0) { + stop( + "There is element(s) in changed.rsq.term.index <= 0. ", + "It should be a (list of) positive integer!" + ) + } + + terms.full.formula <- stats::terms(formula, keep.order = TRUE) + + if (max(changed.rsq.term.index) > length(labels(terms.full.formula))) { + stop( + "Largest index in changed.rsq.term.index is more than the term number on the ", + "right hand side of formula!" + ) + } + + if (length(labels(terms.full.formula)) == 0) { + stop( + "Trying to analyze changed.rsq but there is no variable (except intercept 1) ", + "on right hand side of formula! Please provide at least one valid variable." + ) + } + changed.rsq.term.fullFormat.list <- labels(terms.full.formula)[unlist(changed.rsq.term.index)] changed.rsq.term.shortFormat.list <- list() for (idx in seq_along(changed.rsq.term.index)) { From fa003677b3bc9f2dd6f8f6c34352c00e3dfdce4a Mon Sep 17 00:00:00 2001 From: araikes Date: Thu, 9 Apr 2026 15:13:45 -0700 Subject: [PATCH 5/9] Update analyseOneElement.wrap.Rd --- man/analyseOneElement.wrap.Rd | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/man/analyseOneElement.wrap.Rd b/man/analyseOneElement.wrap.Rd index aa37a27..e8dfafc 100644 --- a/man/analyseOneElement.wrap.Rd +++ b/man/analyseOneElement.wrap.Rd @@ -31,7 +31,9 @@ returns will error. as \code{v1}, \code{v2}, etc. \item A named atomic vector. Unnamed vectors are accepted and auto-named as \code{v1}, \code{v2}, etc. -}} +} +All return values are coerced to a numeric vector internally. +Unsupported types (e.g., environments, S4 objects) will error.} \item{modelarray}{A \linkS4class{ModelArray} object. 
Ignored when \code{ctx} is provided.} From d977b01ea57a4d49c610873af38ba3fc2ee669ea Mon Sep 17 00:00:00 2001 From: araikes Date: Thu, 9 Apr 2026 15:37:32 -0700 Subject: [PATCH 6/9] Update analyse.R --- R/analyse.R | 50 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/R/analyse.R b/R/analyse.R index ebd2fdd..b5ed93b 100644 --- a/R/analyse.R +++ b/R/analyse.R @@ -462,12 +462,13 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, } } + # Changed.rsq setup ---- # Changed.rsq setup ---- var.model.orig <- var.model if (!is.null(changed.rsq.term.index)) { - # check if the term index is valid: - if (min(changed.rsq.term.index) <= 0) { + # Validation checks + if (min(unlist(changed.rsq.term.index)) <= 0) { stop( "There is element(s) in changed.rsq.term.index <= 0. ", "It should be a (list of) positive integer!" @@ -476,7 +477,7 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, terms.full.formula <- stats::terms(formula, keep.order = TRUE) - if (max(changed.rsq.term.index) > length(labels(terms.full.formula))) { + if (max(unlist(changed.rsq.term.index)) > length(labels(terms.full.formula))) { stop( "Largest index in changed.rsq.term.index is more than the term number on the ", "right hand side of formula!" @@ -491,14 +492,35 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, } changed.rsq.term.fullFormat.list <- labels(terms.full.formula)[unlist(changed.rsq.term.index)] + + # Compute short format names: s(age) -> s_age, ti(x,z) -> ti_x_z, etc. 
changed.rsq.term.shortFormat.list <- list() - for (idx in seq_along(changed.rsq.term.index)) { - tidx <- changed.rsq.term.index[[idx]] - tidy_names_smooth <- character(0) - tidy_names_param <- character(0) - changed.rsq.term.shortFormat.list[[idx]] <- c(tidy_names_smooth, tidy_names_param) + for (changed.rsq.term.fullFormat in changed.rsq.term.fullFormat.list) { + temp <- strsplit(changed.rsq.term.fullFormat, "[(]")[[1]] + if (length(temp) == 1) { + # Not a smooth term — no parentheses + str_valid <- changed.rsq.term.fullFormat + } else { + smooth.class <- temp[1] + theEval <- eval(parse(text = changed.rsq.term.fullFormat)) + str_valid <- paste0( + smooth.class, "_", + paste(theEval$term, collapse = "_") + ) + if (theEval$by != "NA") { + str_valid <- paste0(str_valid, "_BY", theEval$by) + } + } + changed.rsq.term.shortFormat.list <- append( + changed.rsq.term.shortFormat.list, str_valid + ) } - message("will get changed R-squared (delta.adj.rsq and partial.rsq) so the execution time will be longer.") + + message( + "will get changed R-squared (delta.adj.rsq and partial.rsq) ", + "so the execution time will be longer." 
+ ) + if (!("adj.r.squared" %in% var.model)) { var.model <- c(var.model, "adj.r.squared") } @@ -616,7 +638,7 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, # Compute changed R-squared ---- if (need_changed_rsq) { - terms.full.formula <- stats::terms(formula) + terms.full.formula <- stats::terms(formula, keep.order = TRUE) for (i.changed.rsq.term in seq_along(changed.rsq.term.index)) { idx.changed.rsq.term <- changed.rsq.term.index[[i.changed.rsq.term]] @@ -682,6 +704,14 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, reduced_sse <- reduced.model.df_out[["model.sse"]] df_out[[partial_col]] <- (reduced_sse - full_sse) / reduced_sse } + + # if adjusted r sq is not requested (see var.model.orig), remove it: + if (!("adj.r.squared" %in% var.model.orig)) { + df_out <- df_out[, colnames(df_out) != "model.adj.r.squared", drop = FALSE] + } + + # remove full model's sse (model.sse): + df_out <- df_out[, colnames(df_out) != "model.sse", drop = FALSE] } # Rewrite if corrections applied and streaming was used ---- From ac3d8576305e01b7d6a59fa13883bf289710e6d3 Mon Sep 17 00:00:00 2001 From: araikes Date: Thu, 9 Apr 2026 15:55:23 -0700 Subject: [PATCH 7/9] Silence unbound "." warning --- R/ModelArray-package.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/ModelArray-package.R b/R/ModelArray-package.R index cee59be..508ac21 100644 --- a/R/ModelArray-package.R +++ b/R/ModelArray-package.R @@ -71,3 +71,7 @@ #' #' @keywords internal "_PACKAGE" + +## Suppress R CMD check NOTEs for non-standard evaluation symbols +## used inside magrittr pipe expressions. 
+utils::globalVariables(".") From aeab037ecd2cfea9e343b65770452751cf4027ca Mon Sep 17 00:00:00 2001 From: araikes Date: Thu, 9 Apr 2026 16:28:34 -0700 Subject: [PATCH 8/9] Fix bad GAM documentation --- R/ModelArray_Constructor.R | 2 + R/analyse.R | 134 ++++++++++++++++++++-------------- man/ModelArray.wrap.Rd | 136 ++++++++++++++++++++--------------- man/analyseOneElement.gam.Rd | 15 ++-- 4 files changed, 170 insertions(+), 117 deletions(-) diff --git a/R/ModelArray_Constructor.R b/R/ModelArray_Constructor.R index 3f3b9e2..625a55e 100644 --- a/R/ModelArray_Constructor.R +++ b/R/ModelArray_Constructor.R @@ -596,6 +596,8 @@ analyseOneElement.lm <- function(i_element, } #' Fit a GAM for a single element +#' +#' #' @description #' Returns metadata (column names, smooth term names, parametric term names, #' and the smoothing parameter criterion attribute name) used by #' \code{\link{ModelArray.gam}} to initialise the output data.frame. When diff --git a/R/analyse.R b/R/analyse.R index b5ed93b..c7ae8bd 100644 --- a/R/analyse.R +++ b/R/analyse.R @@ -733,31 +733,51 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, #' Run a user-supplied function for element-wise data #' #' @description -#' `ModelArray.wrap` runs a user-supplied function \code{FUN} at each -#' requested element and returns a tibble of results combined across -#' elements. +#' `ModelArray.gam` fits a generalized additive model at each requested +#' element in a \linkS4class{ModelArray} and returns a tibble of requested +#' model statistics. There is no model-level p-value for GAMs, so there is +#' no \code{correct.p.value.model} argument. #' #' @details -#' This provides a generic framework reusing ModelArray's per-element -#' looping, alignment, subject-thresholding, and parallelization. The user -#' function is called as \code{FUN(data = dat, ...)} where \code{dat} is -#' \code{phenotypes} with all scalar columns appended for the current -#' element. 
The return value from \code{FUN} for a single element must be -#' one of: +#' You may request returning specific statistical variables by setting +#' \code{var.*}, or you can get all by setting \code{full.outputs = TRUE}. +#' Note that statistics covered by \code{full.outputs} or \code{var.*} are +#' the ones from \code{broom::tidy()}, \code{broom::glance()}, and +#' \code{summary.gam()} only, and do not include corrected p-values. +#' However FDR-corrected p-values (\code{"fdr"}) are generated by default. +#' +#' List of acceptable statistic names for each of \code{var.*}: #' \itemize{ -#' \item a one-row \code{data.frame} or \code{tibble} -#' \item a named list -#' \item a named atomic vector +#' \item \code{var.smoothTerms}: \code{c("edf", "ref.df", "statistic", +#' "p.value")}; From \code{broom::tidy(parametric = FALSE)}. +#' \item \code{var.parametricTerms}: \code{c("estimate", "std.error", +#' "statistic", "p.value")}; From \code{broom::tidy(parametric = TRUE)}. +#' \item \code{var.model}: \code{c("adj.r.squared", "dev.expl", +#' "sp.criterion", "scale", "df", "logLik", "AIC", "BIC", "deviance", +#' "df.residual", "nobs")}; From \code{broom::glance()} and +#' \code{\link[mgcv]{summary.gam}}. #' } -#' The column names from the first successful element determine the final -#' schema. #' -#' Note: \code{ModelArray.wrap} never performs any p-value corrections or -#' modifications. If you need adjusted p-values (e.g. FDR), implement -#' them inside \code{FUN}. +#' Smooth term names in the output are normalized: \code{s(age)} becomes +#' \code{s_age}, \code{ti(x,z)} becomes \code{ti_x_z}, and +#' \code{s(x):oFactor} becomes \code{s_x_BYoFactor}. #' -#' Use \code{\link{exampleElementData}} to construct a sample per-element -#' data.frame for testing your function before committing to a full run. +#' For p-value corrections (arguments \code{correct.p.value.*}), supported +#' methods include all methods in \code{p.adjust.methods} except +#' \code{"none"}. 
You can request more than one method. FDR-corrected +#' p-values (\code{"fdr"}) are calculated by default. Turn it off by +#' setting to \code{"none"}. +#' +#' When \code{changed.rsq.term.index} is provided, a reduced model (dropping +#' the specified term) is fit at each element to compute delta adjusted +#' R-squared and partial R-squared. This approximately doubles execution +#' time per requested term. The term index refers to the position on the +#' right-hand side of \code{formula} (use \code{labels(terms(formula))} to +#' see the ordering). +#' +#' Arguments \code{num.subj.lthr.abs} and \code{num.subj.lthr.rel} are +#' mainly for input data with subject-specific masks, i.e. currently only +#' for volume data. For fixel-wise data, you may ignore these arguments. #' #' @inheritParams ModelArray.lm #' @@ -785,54 +805,58 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, #' level for scalar writes. Default 4. #' @param ... Additional arguments forwarded to \code{FUN}. #' -#' @return If \code{flag_initiate = TRUE}, a list with one component: -#' \describe{ -#' \item{column_names}{Character vector. The column names derived from -#' the return value of \code{user_fun}, with \code{"element_id"} -#' prepended. For unnamed list or atomic returns, columns are named -#' \code{v1}, \code{v2}, etc. Set to \code{NaN} if the element was -#' skipped or errored.} -#' } -#' If \code{flag_initiate = FALSE}, a numeric vector of length -#' \code{num.stat.output} with \code{element_id} (0-based) first and -#' the coerced output of \code{user_fun} in subsequent positions. -#' All-\code{NaN} (except \code{element_id}) if the element was skipped -#' or if an error occurred with \code{on_error = "skip"}. +#' @return A data.frame with one row per element. The first column is +#' \code{element_id} (0-based). Remaining columns contain the requested +#' statistics, named as \code{.} for per-term statistics +#' and \code{model.} for model-level statistics. 
Smooth term +#' names are normalized (e.g. \code{s_age.statistic}). If p-value +#' corrections were requested, additional columns are appended with the +#' correction method as suffix (e.g. \code{s_age.p.value.fdr}). If +#' \code{changed.rsq.term.index} was requested, additional columns +#' \code{.delta.adj.rsq} and \code{.partial.rsq} are +#' appended. #' #' @seealso \code{\link{ModelArray.lm}} for linear models, -#' \code{\link{ModelArray.gam}} for GAMs, -#' \code{\link{exampleElementData}} for building test data, -#' \linkS4class{ModelArray} for the input class. +#' \code{\link{ModelArray.wrap}} for user-supplied functions, +#' \code{\link{gen_gamFormula_fxSmooth}} and +#' \code{\link{gen_gamFormula_contIx}} for formula helpers, +#' \linkS4class{ModelArray} for the input class, +#' \code{\link{ModelArray}} for the constructor, +#' \code{\link{exampleElementData}} for testing formulas on a single +#' element. #' #' @examples{ #' \dontrun{ #' ma <- ModelArray("path/to/data.h5", scalar_types = c("FD")) #' phenotypes <- read.csv("cohort.csv") #' -#' # Simple custom function -#' my_fun <- function(data, ...) 
{ -#' mod <- lm(FD ~ age + sex, data = data) -#' tidy_out <- broom::tidy(mod) -#' # Return a one-row tibble -#' tibble::tibble( -#' age_estimate = tidy_out$estimate[tidy_out$term == "age"], -#' age_pvalue = tidy_out$p.value[tidy_out$term == "age"] -#' ) -#' } -#' +#' # Fit GAM with default outputs +#' results <- ModelArray.gam( +#' FD ~ s(age, fx = TRUE) + sex, +#' data = ma, +#' phenotypes = phenotypes, +#' scalar = "FD" +#' ) +#' head(results) #' -#' # Test on one element first -#' test_df <- exampleElementData(ma, scalar = "FD", -#' i_element = 1, -#' phenotypes = phenotypes) -#' my_fun(data = test_df) +#' # With changed R-squared for the smooth term (term index 1) +#' results_rsq <- ModelArray.gam( +#' FD ~ s(age, fx = TRUE) + sex, +#' data = ma, +#' phenotypes = phenotypes, +#' scalar = "FD", +#' changed.rsq.term.index = list(1) +#' ) #' -#' # Run across all elements -#' results <- ModelArray.wrap( -#' FUN = my_fun, +#' # Full outputs, no p-value correction +#' results_full <- ModelArray.gam( +#' FD ~ s(age, fx = TRUE) + sex, #' data = ma, #' phenotypes = phenotypes, -#' scalar = "FD" +#' scalar = "FD", +#' full.outputs = TRUE, +#' correct.p.value.smoothTerms = "none", +#' correct.p.value.parametricTerms = "none" #' ) #' } #' } diff --git a/man/ModelArray.wrap.Rd b/man/ModelArray.wrap.Rd index 79e5103..dcbd782 100644 --- a/man/ModelArray.wrap.Rd +++ b/man/ModelArray.wrap.Rd @@ -125,46 +125,63 @@ useful when writing large outputs directly to HDF5.} \item{...}{Additional arguments forwarded to \code{FUN}.} } \value{ -If \code{flag_initiate = TRUE}, a list with one component: -\describe{ -\item{column_names}{Character vector. The column names derived from -the return value of \code{user_fun}, with \code{"element_id"} -prepended. For unnamed list or atomic returns, columns are named -\code{v1}, \code{v2}, etc. 
Set to \code{NaN} if the element was -skipped or errored.} -} -If \code{flag_initiate = FALSE}, a numeric vector of length -\code{num.stat.output} with \code{element_id} (0-based) first and -the coerced output of \code{user_fun} in subsequent positions. -All-\code{NaN} (except \code{element_id}) if the element was skipped -or if an error occurred with \code{on_error = "skip"}. +A data.frame with one row per element. The first column is +\code{element_id} (0-based). Remaining columns contain the requested +statistics, named as \code{.} for per-term statistics +and \code{model.} for model-level statistics. Smooth term +names are normalized (e.g. \code{s_age.statistic}). If p-value +corrections were requested, additional columns are appended with the +correction method as suffix (e.g. \code{s_age.p.value.fdr}). If +\code{changed.rsq.term.index} was requested, additional columns +\code{.delta.adj.rsq} and \code{.partial.rsq} are +appended. } \description{ -\code{ModelArray.wrap} runs a user-supplied function \code{FUN} at each -requested element and returns a tibble of results combined across -elements. +\code{ModelArray.gam} fits a generalized additive model at each requested +element in a \linkS4class{ModelArray} and returns a tibble of requested +model statistics. There is no model-level p-value for GAMs, so there is +no \code{correct.p.value.model} argument. } \details{ -This provides a generic framework reusing ModelArray's per-element -looping, alignment, subject-thresholding, and parallelization. The user -function is called as \code{FUN(data = dat, ...)} where \code{dat} is -\code{phenotypes} with all scalar columns appended for the current -element. The return value from \code{FUN} for a single element must be -one of: +You may request returning specific statistical variables by setting +\code{var.*}, or you can get all by setting \code{full.outputs = TRUE}. 
+Note that statistics covered by \code{full.outputs} or \code{var.*} are +the ones from \code{broom::tidy()}, \code{broom::glance()}, and +\code{summary.gam()} only, and do not include corrected p-values. +However FDR-corrected p-values (\code{"fdr"}) are generated by default. + +List of acceptable statistic names for each of \code{var.*}: \itemize{ -\item a one-row \code{data.frame} or \code{tibble} -\item a named list -\item a named atomic vector +\item \code{var.smoothTerms}: \code{c("edf", "ref.df", "statistic", + "p.value")}; From \code{broom::tidy(parametric = FALSE)}. +\item \code{var.parametricTerms}: \code{c("estimate", "std.error", + "statistic", "p.value")}; From \code{broom::tidy(parametric = TRUE)}. +\item \code{var.model}: \code{c("adj.r.squared", "dev.expl", + "sp.criterion", "scale", "df", "logLik", "AIC", "BIC", "deviance", + "df.residual", "nobs")}; From \code{broom::glance()} and +\code{\link[mgcv]{summary.gam}}. } -The column names from the first successful element determine the final -schema. - -Note: \code{ModelArray.wrap} never performs any p-value corrections or -modifications. If you need adjusted p-values (e.g. FDR), implement -them inside \code{FUN}. -Use \code{\link{exampleElementData}} to construct a sample per-element -data.frame for testing your function before committing to a full run. +Smooth term names in the output are normalized: \code{s(age)} becomes +\code{s_age}, \code{ti(x,z)} becomes \code{ti_x_z}, and +\code{s(x):oFactor} becomes \code{s_x_BYoFactor}. + +For p-value corrections (arguments \code{correct.p.value.*}), supported +methods include all methods in \code{p.adjust.methods} except +\code{"none"}. You can request more than one method. FDR-corrected +p-values (\code{"fdr"}) are calculated by default. Turn it off by +setting to \code{"none"}. + +When \code{changed.rsq.term.index} is provided, a reduced model (dropping +the specified term) is fit at each element to compute delta adjusted +R-squared and partial R-squared. 
This approximately doubles execution +time per requested term. The term index refers to the position on the +right-hand side of \code{formula} (use \code{labels(terms(formula))} to +see the ordering). + +Arguments \code{num.subj.lthr.abs} and \code{num.subj.lthr.rel} are +mainly for input data with subject-specific masks, i.e. currently only +for volume data. For fixel-wise data, you may ignore these arguments. } \examples{ { @@ -172,30 +189,33 @@ data.frame for testing your function before committing to a full run. ma <- ModelArray("path/to/data.h5", scalar_types = c("FD")) phenotypes <- read.csv("cohort.csv") -# Simple custom function -my_fun <- function(data, ...) { - mod <- lm(FD ~ age + sex, data = data) - tidy_out <- broom::tidy(mod) - # Return a one-row tibble - tibble::tibble( - age_estimate = tidy_out$estimate[tidy_out$term == "age"], - age_pvalue = tidy_out$p.value[tidy_out$term == "age"] - ) -} - +# Fit GAM with default outputs +results <- ModelArray.gam( + FD ~ s(age, fx = TRUE) + sex, + data = ma, + phenotypes = phenotypes, + scalar = "FD" +) +head(results) -# Test on one element first -test_df <- exampleElementData(ma, scalar = "FD", - i_element = 1, - phenotypes = phenotypes) -my_fun(data = test_df) +# With changed R-squared for the smooth term (term index 1) +results_rsq <- ModelArray.gam( + FD ~ s(age, fx = TRUE) + sex, + data = ma, + phenotypes = phenotypes, + scalar = "FD", + changed.rsq.term.index = list(1) +) -# Run across all elements -results <- ModelArray.wrap( - FUN = my_fun, +# Full outputs, no p-value correction +results_full <- ModelArray.gam( + FD ~ s(age, fx = TRUE) + sex, data = ma, phenotypes = phenotypes, - scalar = "FD" + scalar = "FD", + full.outputs = TRUE, + correct.p.value.smoothTerms = "none", + correct.p.value.parametricTerms = "none" ) } } @@ -203,7 +223,11 @@ results <- ModelArray.wrap( } \seealso{ \code{\link{ModelArray.lm}} for linear models, -\code{\link{ModelArray.gam}} for GAMs, -\code{\link{exampleElementData}} for 
building test data, -\linkS4class{ModelArray} for the input class. +\code{\link{ModelArray.wrap}} for user-supplied functions, +\code{\link{gen_gamFormula_fxSmooth}} and +\code{\link{gen_gamFormula_contIx}} for formula helpers, +\linkS4class{ModelArray} for the input class, +\code{\link{ModelArray}} for the constructor, +\code{\link{exampleElementData}} for testing formulas on a single +element. } diff --git a/man/analyseOneElement.gam.Rd b/man/analyseOneElement.gam.Rd index 73598fd..55fb7c1 100644 --- a/man/analyseOneElement.gam.Rd +++ b/man/analyseOneElement.gam.Rd @@ -2,12 +2,7 @@ % Please edit documentation in R/ModelArray_Constructor.R \name{analyseOneElement.gam} \alias{analyseOneElement.gam} -\title{Fit a GAM for a single element -Returns metadata (column names, smooth term names, parametric term names, -and the smoothing parameter criterion attribute name) used by -\code{\link{ModelArray.gam}} to initialise the output data.frame. When -\code{flag_initiate = FALSE}, it returns a numeric vector representing one -row of the final results matrix.} +\title{Fit a GAM for a single element} \usage{ analyseOneElement.gam( i_element, @@ -106,6 +101,14 @@ requested statistics in subsequent positions. All-\code{NaN} (except \code{element_id}) if the element was skipped. } \description{ +#' @description +Returns metadata (column names, smooth term names, parametric term names, +and the smoothing parameter criterion attribute name) used by +\code{\link{ModelArray.gam}} to initialise the output data.frame. When +\code{flag_initiate = FALSE}, it returns a numeric vector representing one +row of the final results matrix. +} +\details{ If the number of subjects with finite scalar values does not exceed \code{num.subj.lthr}, the element is skipped and all statistics are set to \code{NaN}. 
From 9b2ef3ddc3df771a992670004458f7eb6a94d608 Mon Sep 17 00:00:00 2001 From: araikes Date: Fri, 10 Apr 2026 09:47:07 -0700 Subject: [PATCH 9/9] Maybe fix GAM instead of breaking wrap docs --- R/analyse.R | 223 ++++++++++++++++++++++++----------------- man/ModelArray.gam.Rd | 94 +++++++++++++---- man/ModelArray.wrap.Rd | 136 +++++++++++-------------- 3 files changed, 266 insertions(+), 187 deletions(-) diff --git a/R/analyse.R b/R/analyse.R index c7ae8bd..46a2106 100644 --- a/R/analyse.R +++ b/R/analyse.R @@ -324,8 +324,54 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU } #' Fit element-wise generalized additive models -#' no model-level p-value for GAMs, so there is no -#' \code{correct.p.value.model} argument. +#' +#' +#' @description +#' `ModelArray.gam` fits a generalized additive model at each requested +#' element in a \linkS4class{ModelArray} and returns a tibble of requested +#' model statistics. There is no model-level p-value for GAMs, so there is +#' no \code{correct.p.value.model} argument. +#' +#' @details +#' You may request returning specific statistical variables by setting +#' \code{var.*}, or you can get all by setting \code{full.outputs = TRUE}. +#' Note that statistics covered by \code{full.outputs} or \code{var.*} are +#' the ones from \code{broom::tidy()}, \code{broom::glance()}, and +#' \code{summary.gam()} only, and do not include corrected p-values. +#' However FDR-corrected p-values (\code{"fdr"}) are generated by default. +#' +#' List of acceptable statistic names for each of \code{var.*}: +#' \itemize{ +#' \item \code{var.smoothTerms}: \code{c("edf", "ref.df", "statistic", +#' "p.value")}; From \code{broom::tidy(parametric = FALSE)}. +#' \item \code{var.parametricTerms}: \code{c("estimate", "std.error", +#' "statistic", "p.value")}; From \code{broom::tidy(parametric = TRUE)}. 
+#' \item \code{var.model}: \code{c("adj.r.squared", "dev.expl", +#' "sp.criterion", "scale", "df", "logLik", "AIC", "BIC", "deviance", +#' "df.residual", "nobs")}; From \code{broom::glance()} and +#' \code{\link[mgcv]{summary.gam}}. +#' } +#' +#' Smooth term names in the output are normalized: \code{s(age)} becomes +#' \code{s_age}, \code{ti(x,z)} becomes \code{ti_x_z}, and +#' \code{s(x):oFactor} becomes \code{s_x_BYoFactor}. +#' +#' For p-value corrections (arguments \code{correct.p.value.*}), supported +#' methods include all methods in \code{p.adjust.methods} except +#' \code{"none"}. You can request more than one method. FDR-corrected +#' p-values (\code{"fdr"}) are calculated by default. Turn it off by +#' setting to \code{"none"}. +#' +#' When \code{changed.rsq.term.index} is provided, a reduced model (dropping +#' the specified term) is fit at each element to compute delta adjusted +#' R-squared and partial R-squared. This approximately doubles execution +#' time per requested term. The term index refers to the position on the +#' right-hand side of \code{formula} (use \code{labels(terms(formula))} to +#' see the ordering). +#' +#' Arguments \code{num.subj.lthr.abs} and \code{num.subj.lthr.rel} are +#' mainly for input data with subject-specific masks, i.e. currently only +#' for volume data. For fixel-wise data, you may ignore these arguments. #' #' @inheritParams ModelArray.lm #' @@ -336,21 +382,29 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU #' parametric terms, from \code{broom::tidy(parametric = TRUE)}. #' See Details. #' @param var.model Character vector. Statistics to save for the overall -#' model, from \code{broom::glance()} and \code{summary()}. See Details. +#' model, from \code{broom::glance()} and +#' \code{\link[mgcv]{summary.gam}}. See Details. #' @param changed.rsq.term.index A list of positive integers. 
Each value #' is the index of a term on the right-hand side of \code{formula} for #' which delta adjusted R-squared and partial R-squared should be #' computed. Usually the term of interest is a smooth term or interaction -#' term. Default \code{NULL} (not computed). See Details for warnings. +#' term. Default \code{NULL} (not computed). See Details. #' @param correct.p.value.smoothTerms Character vector. P-value #' correction method(s) for each smooth term. Default: \code{"fdr"}. +#' See Details. #' @param correct.p.value.parametricTerms Character vector. P-value #' correction method(s) for each parametric term. Default: \code{"fdr"}. -#' @param ... Additional arguments passed to \code{\link[mgcv]{gam}}. +#' See Details. +#' @param ... Additional arguments passed to \code{\link[mgcv]{gam}} +#' (e.g. \code{method = "REML"}). #' -#' @return A tibble with one row per element. The first column is +#' @return A data.frame with one row per element. The first column is #' \code{element_id} (0-based). Remaining columns contain the requested -#' statistics, named as \code{.}. If +#' statistics, named as \code{.} for per-term statistics +#' and \code{model.} for model-level statistics. Smooth term +#' names are normalized (e.g. \code{s_age.statistic}). If p-value +#' corrections were requested, additional columns are appended with the +#' correction method as suffix (e.g. \code{s_age.p.value.fdr}). If #' \code{changed.rsq.term.index} was requested, additional columns #' \code{.delta.adj.rsq} and \code{.partial.rsq} are #' appended. @@ -359,13 +413,15 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU #' \code{\link{ModelArray.wrap}} for user-supplied functions, #' \code{\link{gen_gamFormula_fxSmooth}} and #' \code{\link{gen_gamFormula_contIx}} for formula helpers, -#' \linkS4class{ModelArray} for the input class. 
+#' \linkS4class{ModelArray} for the input class, +#' \code{\link{ModelArray}} for the constructor, +#' \code{\link{exampleElementData}} for testing formulas on a single +#' element. +#' #' #' @examples{ #' \dontrun{ -#' ma <- ModelArray("path/to/data.h5", scalar_types = c("FD")) -#' phenotypes <- read.csv("cohort.csv") -#' +#' # Fit GAM with default outputs #' results <- ModelArray.gam( #' FD ~ s(age, fx = TRUE) + sex, #' data = ma, @@ -382,6 +438,17 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU #' scalar = "FD", #' changed.rsq.term.index = list(1) #' ) +#' +#' # Full outputs, no p-value correction +#' results_full <- ModelArray.gam( +#' FD ~ s(age, fx = TRUE) + sex, +#' data = ma, +#' phenotypes = phenotypes, +#' scalar = "FD", +#' full.outputs = TRUE, +#' correct.p.value.smoothTerms = "none", +#' correct.p.value.parametricTerms = "none" +#' ) #' } #' } #' @@ -733,51 +800,31 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, #' Run a user-supplied function for element-wise data #' #' @description -#' `ModelArray.gam` fits a generalized additive model at each requested -#' element in a \linkS4class{ModelArray} and returns a tibble of requested -#' model statistics. There is no model-level p-value for GAMs, so there is -#' no \code{correct.p.value.model} argument. +#' `ModelArray.wrap` runs a user-supplied function \code{FUN} at each +#' requested element and returns a tibble of results combined across +#' elements. #' #' @details -#' You may request returning specific statistical variables by setting -#' \code{var.*}, or you can get all by setting \code{full.outputs = TRUE}. -#' Note that statistics covered by \code{full.outputs} or \code{var.*} are -#' the ones from \code{broom::tidy()}, \code{broom::glance()}, and -#' \code{summary.gam()} only, and do not include corrected p-values. -#' However FDR-corrected p-values (\code{"fdr"}) are generated by default. 
-#' -#' List of acceptable statistic names for each of \code{var.*}: +#' This provides a generic framework reusing ModelArray's per-element +#' looping, alignment, subject-thresholding, and parallelization. The user +#' function is called as \code{FUN(data = dat, ...)} where \code{dat} is +#' \code{phenotypes} with all scalar columns appended for the current +#' element. The return value from \code{FUN} for a single element must be +#' one of: #' \itemize{ -#' \item \code{var.smoothTerms}: \code{c("edf", "ref.df", "statistic", -#' "p.value")}; From \code{broom::tidy(parametric = FALSE)}. -#' \item \code{var.parametricTerms}: \code{c("estimate", "std.error", -#' "statistic", "p.value")}; From \code{broom::tidy(parametric = TRUE)}. -#' \item \code{var.model}: \code{c("adj.r.squared", "dev.expl", -#' "sp.criterion", "scale", "df", "logLik", "AIC", "BIC", "deviance", -#' "df.residual", "nobs")}; From \code{broom::glance()} and -#' \code{\link[mgcv]{summary.gam}}. +#' \item a one-row \code{data.frame} or \code{tibble} +#' \item a named list +#' \item a named atomic vector #' } +#' The column names from the first successful element determine the final +#' schema. #' -#' Smooth term names in the output are normalized: \code{s(age)} becomes -#' \code{s_age}, \code{ti(x,z)} becomes \code{ti_x_z}, and -#' \code{s(x):oFactor} becomes \code{s_x_BYoFactor}. -#' -#' For p-value corrections (arguments \code{correct.p.value.*}), supported -#' methods include all methods in \code{p.adjust.methods} except -#' \code{"none"}. You can request more than one method. FDR-corrected -#' p-values (\code{"fdr"}) are calculated by default. Turn it off by -#' setting to \code{"none"}. -#' -#' When \code{changed.rsq.term.index} is provided, a reduced model (dropping -#' the specified term) is fit at each element to compute delta adjusted -#' R-squared and partial R-squared. This approximately doubles execution -#' time per requested term. 
The term index refers to the position on the -#' right-hand side of \code{formula} (use \code{labels(terms(formula))} to -#' see the ordering). +#' Note: \code{ModelArray.wrap} never performs any p-value corrections or +#' modifications. If you need adjusted p-values (e.g. FDR), implement +#' them inside \code{FUN}. #' -#' Arguments \code{num.subj.lthr.abs} and \code{num.subj.lthr.rel} are -#' mainly for input data with subject-specific masks, i.e. currently only -#' for volume data. For fixel-wise data, you may ignore these arguments. +#' Use \code{\link{exampleElementData}} to construct a sample per-element +#' data.frame for testing your function before committing to a full run. #' #' @inheritParams ModelArray.lm #' @@ -805,58 +852,54 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, #' level for scalar writes. Default 4. #' @param ... Additional arguments forwarded to \code{FUN}. #' -#' @return A data.frame with one row per element. The first column is -#' \code{element_id} (0-based). Remaining columns contain the requested -#' statistics, named as \code{.} for per-term statistics -#' and \code{model.} for model-level statistics. Smooth term -#' names are normalized (e.g. \code{s_age.statistic}). If p-value -#' corrections were requested, additional columns are appended with the -#' correction method as suffix (e.g. \code{s_age.p.value.fdr}). If -#' \code{changed.rsq.term.index} was requested, additional columns -#' \code{.delta.adj.rsq} and \code{.partial.rsq} are -#' appended. +#' @return If \code{flag_initiate = TRUE}, a list with one component: +#' \describe{ +#' \item{column_names}{Character vector. The column names derived from +#' the return value of \code{user_fun}, with \code{"element_id"} +#' prepended. For unnamed list or atomic returns, columns are named +#' \code{v1}, \code{v2}, etc. 
Set to \code{NaN} if the element was +#' skipped or errored.} +#' } +#' If \code{flag_initiate = FALSE}, a numeric vector of length +#' \code{num.stat.output} with \code{element_id} (0-based) first and +#' the coerced output of \code{user_fun} in subsequent positions. +#' All-\code{NaN} (except \code{element_id}) if the element was skipped +#' or if an error occurred with \code{on_error = "skip"}. #' #' @seealso \code{\link{ModelArray.lm}} for linear models, -#' \code{\link{ModelArray.wrap}} for user-supplied functions, -#' \code{\link{gen_gamFormula_fxSmooth}} and -#' \code{\link{gen_gamFormula_contIx}} for formula helpers, -#' \linkS4class{ModelArray} for the input class, -#' \code{\link{ModelArray}} for the constructor, -#' \code{\link{exampleElementData}} for testing formulas on a single -#' element. +#' \code{\link{ModelArray.gam}} for GAMs, +#' \code{\link{exampleElementData}} for building test data, +#' \linkS4class{ModelArray} for the input class. #' #' @examples{ #' \dontrun{ #' ma <- ModelArray("path/to/data.h5", scalar_types = c("FD")) #' phenotypes <- read.csv("cohort.csv") #' -#' # Fit GAM with default outputs -#' results <- ModelArray.gam( -#' FD ~ s(age, fx = TRUE) + sex, -#' data = ma, -#' phenotypes = phenotypes, -#' scalar = "FD" -#' ) -#' head(results) +#' # Simple custom function +#' my_fun <- function(data, ...) 
{ +#' mod <- lm(FD ~ age + sex, data = data) +#' tidy_out <- broom::tidy(mod) +#' # Return a one-row tibble +#' tibble::tibble( +#' age_estimate = tidy_out$estimate[tidy_out$term == "age"], +#' age_pvalue = tidy_out$p.value[tidy_out$term == "age"] +#' ) +#' } #' -#' # With changed R-squared for the smooth term (term index 1) -#' results_rsq <- ModelArray.gam( -#' FD ~ s(age, fx = TRUE) + sex, -#' data = ma, -#' phenotypes = phenotypes, -#' scalar = "FD", -#' changed.rsq.term.index = list(1) -#' ) #' -#' # Full outputs, no p-value correction -#' results_full <- ModelArray.gam( -#' FD ~ s(age, fx = TRUE) + sex, +#' # Test on one element first +#' test_df <- exampleElementData(ma, scalar = "FD", +#' i_element = 1, +#' phenotypes = phenotypes) +#' my_fun(data = test_df) +#' +#' # Run across all elements +#' results <- ModelArray.wrap( +#' FUN = my_fun, #' data = ma, #' phenotypes = phenotypes, -#' scalar = "FD", -#' full.outputs = TRUE, -#' correct.p.value.smoothTerms = "none", -#' correct.p.value.parametricTerms = "none" +#' scalar = "FD" #' ) #' } #' } diff --git a/man/ModelArray.gam.Rd b/man/ModelArray.gam.Rd index 74fbe72..c662f91 100644 --- a/man/ModelArray.gam.Rd +++ b/man/ModelArray.gam.Rd @@ -2,9 +2,7 @@ % Please edit documentation in R/analyse.R \name{ModelArray.gam} \alias{ModelArray.gam} -\title{Fit element-wise generalized additive models -no model-level p-value for GAMs, so there is no -\code{correct.p.value.model} argument.} +\title{Fit element-wise generalized additive models} \usage{ ModelArray.gam( formula, @@ -63,19 +61,22 @@ parametric terms, from \code{broom::tidy(parametric = TRUE)}. See Details.} \item{var.model}{Character vector. Statistics to save for the overall -model, from \code{broom::glance()} and \code{summary()}. See Details.} +model, from \code{broom::glance()} and +\code{\link[mgcv]{summary.gam}}. See Details.} \item{changed.rsq.term.index}{A list of positive integers. 
Each value is the index of a term
on the right-hand side of \code{formula} for which delta adjusted
R-squared and partial R-squared should be computed. Usually
the term of interest is a smooth term or interaction
-term. Default \code{NULL} (not computed). See Details for warnings.}
+term. Default \code{NULL} (not computed). See Details.}

\item{correct.p.value.smoothTerms}{Character vector. P-value
-correction method(s) for each smooth term. Default: \code{"fdr"}.}
+correction method(s) for each smooth term. Default: \code{"fdr"}.
+See Details.}

\item{correct.p.value.parametricTerms}{Character vector. P-value
-correction method(s) for each parametric term. Default: \code{"fdr"}.}
+correction method(s) for each parametric term. Default: \code{"fdr"}.
+See Details.}

\item{num.subj.lthr.abs}{Integer. Lower threshold for the
absolute number of subjects with finite scalar values (not \code{NaN},
@@ -123,27 +124,72 @@ level for HDF5 writes. Default 4.}
combined data.frame. If \code{FALSE}, return \code{invisible(NULL)};
useful when writing large outputs directly to HDF5.}

-\item{...}{Additional arguments passed to \code{\link[mgcv]{gam}}.}
+\item{...}{Additional arguments passed to \code{\link[mgcv]{gam}}
+(e.g. \code{method = "REML"}).}
}
\value{
-A tibble with one row per element. The first column is
+A data.frame with one row per element. The first column is
\code{element_id} (0-based). Remaining columns contain the requested
-statistics, named as \code{<term>.<stat>}. If
+statistics, named as \code{<term>.<stat>} for per-term statistics
+and \code{model.<stat>} for model-level statistics. Smooth term
+names are normalized (e.g. \code{s_age.statistic}). If p-value
+corrections were requested, additional columns are appended with the
+correction method as suffix (e.g. \code{s_age.p.value.fdr}). If
\code{changed.rsq.term.index} was requested, additional columns
\code{<term>.delta.adj.rsq} and \code{<term>.partial.rsq} are
appended.
} \description{ -Fit element-wise generalized additive models -no model-level p-value for GAMs, so there is no -\code{correct.p.value.model} argument. +\code{ModelArray.gam} fits a generalized additive model at each requested +element in a \linkS4class{ModelArray} and returns a tibble of requested +model statistics. There is no model-level p-value for GAMs, so there is +no \code{correct.p.value.model} argument. +} +\details{ +You may request returning specific statistical variables by setting +\code{var.*}, or you can get all by setting \code{full.outputs = TRUE}. +Note that statistics covered by \code{full.outputs} or \code{var.*} are +the ones from \code{broom::tidy()}, \code{broom::glance()}, and +\code{summary.gam()} only, and do not include corrected p-values. +However FDR-corrected p-values (\code{"fdr"}) are generated by default. + +List of acceptable statistic names for each of \code{var.*}: +\itemize{ +\item \code{var.smoothTerms}: \code{c("edf", "ref.df", "statistic", + "p.value")}; From \code{broom::tidy(parametric = FALSE)}. +\item \code{var.parametricTerms}: \code{c("estimate", "std.error", + "statistic", "p.value")}; From \code{broom::tidy(parametric = TRUE)}. +\item \code{var.model}: \code{c("adj.r.squared", "dev.expl", + "sp.criterion", "scale", "df", "logLik", "AIC", "BIC", "deviance", + "df.residual", "nobs")}; From \code{broom::glance()} and +\code{\link[mgcv]{summary.gam}}. +} + +Smooth term names in the output are normalized: \code{s(age)} becomes +\code{s_age}, \code{ti(x,z)} becomes \code{ti_x_z}, and +\code{s(x):oFactor} becomes \code{s_x_BYoFactor}. + +For p-value corrections (arguments \code{correct.p.value.*}), supported +methods include all methods in \code{p.adjust.methods} except +\code{"none"}. You can request more than one method. FDR-corrected +p-values (\code{"fdr"}) are calculated by default. Turn it off by +setting to \code{"none"}. 
+ +When \code{changed.rsq.term.index} is provided, a reduced model (dropping +the specified term) is fit at each element to compute delta adjusted +R-squared and partial R-squared. This approximately doubles execution +time per requested term. The term index refers to the position on the +right-hand side of \code{formula} (use \code{labels(terms(formula))} to +see the ordering). + +Arguments \code{num.subj.lthr.abs} and \code{num.subj.lthr.rel} are +mainly for input data with subject-specific masks, i.e. currently only +for volume data. For fixel-wise data, you may ignore these arguments. } \examples{ { \dontrun{ -ma <- ModelArray("path/to/data.h5", scalar_types = c("FD")) -phenotypes <- read.csv("cohort.csv") - +# Fit GAM with default outputs results <- ModelArray.gam( FD ~ s(age, fx = TRUE) + sex, data = ma, @@ -160,6 +206,17 @@ results_rsq <- ModelArray.gam( scalar = "FD", changed.rsq.term.index = list(1) ) + +# Full outputs, no p-value correction +results_full <- ModelArray.gam( + FD ~ s(age, fx = TRUE) + sex, + data = ma, + phenotypes = phenotypes, + scalar = "FD", + full.outputs = TRUE, + correct.p.value.smoothTerms = "none", + correct.p.value.parametricTerms = "none" +) } } @@ -169,5 +226,8 @@ results_rsq <- ModelArray.gam( \code{\link{ModelArray.wrap}} for user-supplied functions, \code{\link{gen_gamFormula_fxSmooth}} and \code{\link{gen_gamFormula_contIx}} for formula helpers, -\linkS4class{ModelArray} for the input class. +\linkS4class{ModelArray} for the input class, +\code{\link{ModelArray}} for the constructor, +\code{\link{exampleElementData}} for testing formulas on a single +element. } diff --git a/man/ModelArray.wrap.Rd b/man/ModelArray.wrap.Rd index dcbd782..79e5103 100644 --- a/man/ModelArray.wrap.Rd +++ b/man/ModelArray.wrap.Rd @@ -125,63 +125,46 @@ useful when writing large outputs directly to HDF5.} \item{...}{Additional arguments forwarded to \code{FUN}.} } \value{ -A data.frame with one row per element. 
The first column is
-\code{element_id} (0-based). Remaining columns contain the requested
-statistics, named as \code{<term>.<stat>} for per-term statistics
-and \code{model.<stat>} for model-level statistics. Smooth term
-names are normalized (e.g. \code{s_age.statistic}). If p-value
-corrections were requested, additional columns are appended with the
-correction method as suffix (e.g. \code{s_age.p.value.fdr}). If
-\code{changed.rsq.term.index} was requested, additional columns
-\code{<term>.delta.adj.rsq} and \code{<term>.partial.rsq} are
-appended.
+If \code{flag_initiate = TRUE}, a list with one component:
+\describe{
+\item{column_names}{Character vector. The column names derived from
+the return value of \code{user_fun}, with \code{"element_id"}
+prepended. For unnamed list or atomic returns, columns are named
+\code{v1}, \code{v2}, etc. Set to \code{NaN} if the element was
+skipped or errored.}
+}
+If \code{flag_initiate = FALSE}, a numeric vector of length
+\code{num.stat.output} with \code{element_id} (0-based) first and
+the coerced output of \code{user_fun} in subsequent positions.
+All-\code{NaN} (except \code{element_id}) if the element was skipped
+or if an error occurred with \code{on_error = "skip"}.
}
\description{
-\code{ModelArray.gam} fits a generalized additive model at each requested
-element in a \linkS4class{ModelArray} and returns a tibble of requested
-model statistics. There is no model-level p-value for GAMs, so there is
-no \code{correct.p.value.model} argument.
+\code{ModelArray.wrap} runs a user-supplied function \code{FUN} at each
+requested element and returns a tibble of results combined across
+elements.
}
\details{
-You may request returning specific statistical variables by setting
-\code{var.*}, or you can get all by setting \code{full.outputs = TRUE}.
-Note that statistics covered by \code{full.outputs} or \code{var.*} are
-the ones from \code{broom::tidy()}, \code{broom::glance()}, and
-\code{summary.gam()} only, and do not include corrected p-values.
-However FDR-corrected p-values (\code{"fdr"}) are generated by default. - -List of acceptable statistic names for each of \code{var.*}: +This provides a generic framework reusing ModelArray's per-element +looping, alignment, subject-thresholding, and parallelization. The user +function is called as \code{FUN(data = dat, ...)} where \code{dat} is +\code{phenotypes} with all scalar columns appended for the current +element. The return value from \code{FUN} for a single element must be +one of: \itemize{ -\item \code{var.smoothTerms}: \code{c("edf", "ref.df", "statistic", - "p.value")}; From \code{broom::tidy(parametric = FALSE)}. -\item \code{var.parametricTerms}: \code{c("estimate", "std.error", - "statistic", "p.value")}; From \code{broom::tidy(parametric = TRUE)}. -\item \code{var.model}: \code{c("adj.r.squared", "dev.expl", - "sp.criterion", "scale", "df", "logLik", "AIC", "BIC", "deviance", - "df.residual", "nobs")}; From \code{broom::glance()} and -\code{\link[mgcv]{summary.gam}}. +\item a one-row \code{data.frame} or \code{tibble} +\item a named list +\item a named atomic vector } +The column names from the first successful element determine the final +schema. + +Note: \code{ModelArray.wrap} never performs any p-value corrections or +modifications. If you need adjusted p-values (e.g. FDR), implement +them inside \code{FUN}. -Smooth term names in the output are normalized: \code{s(age)} becomes -\code{s_age}, \code{ti(x,z)} becomes \code{ti_x_z}, and -\code{s(x):oFactor} becomes \code{s_x_BYoFactor}. - -For p-value corrections (arguments \code{correct.p.value.*}), supported -methods include all methods in \code{p.adjust.methods} except -\code{"none"}. You can request more than one method. FDR-corrected -p-values (\code{"fdr"}) are calculated by default. Turn it off by -setting to \code{"none"}. 
- -When \code{changed.rsq.term.index} is provided, a reduced model (dropping -the specified term) is fit at each element to compute delta adjusted -R-squared and partial R-squared. This approximately doubles execution -time per requested term. The term index refers to the position on the -right-hand side of \code{formula} (use \code{labels(terms(formula))} to -see the ordering). - -Arguments \code{num.subj.lthr.abs} and \code{num.subj.lthr.rel} are -mainly for input data with subject-specific masks, i.e. currently only -for volume data. For fixel-wise data, you may ignore these arguments. +Use \code{\link{exampleElementData}} to construct a sample per-element +data.frame for testing your function before committing to a full run. } \examples{ { @@ -189,33 +172,30 @@ for volume data. For fixel-wise data, you may ignore these arguments. ma <- ModelArray("path/to/data.h5", scalar_types = c("FD")) phenotypes <- read.csv("cohort.csv") -# Fit GAM with default outputs -results <- ModelArray.gam( - FD ~ s(age, fx = TRUE) + sex, - data = ma, - phenotypes = phenotypes, - scalar = "FD" -) -head(results) +# Simple custom function +my_fun <- function(data, ...) 
{ + mod <- lm(FD ~ age + sex, data = data) + tidy_out <- broom::tidy(mod) + # Return a one-row tibble + tibble::tibble( + age_estimate = tidy_out$estimate[tidy_out$term == "age"], + age_pvalue = tidy_out$p.value[tidy_out$term == "age"] + ) +} -# With changed R-squared for the smooth term (term index 1) -results_rsq <- ModelArray.gam( - FD ~ s(age, fx = TRUE) + sex, - data = ma, - phenotypes = phenotypes, - scalar = "FD", - changed.rsq.term.index = list(1) -) -# Full outputs, no p-value correction -results_full <- ModelArray.gam( - FD ~ s(age, fx = TRUE) + sex, +# Test on one element first +test_df <- exampleElementData(ma, scalar = "FD", + i_element = 1, + phenotypes = phenotypes) +my_fun(data = test_df) + +# Run across all elements +results <- ModelArray.wrap( + FUN = my_fun, data = ma, phenotypes = phenotypes, - scalar = "FD", - full.outputs = TRUE, - correct.p.value.smoothTerms = "none", - correct.p.value.parametricTerms = "none" + scalar = "FD" ) } } @@ -223,11 +203,7 @@ results_full <- ModelArray.gam( } \seealso{ \code{\link{ModelArray.lm}} for linear models, -\code{\link{ModelArray.wrap}} for user-supplied functions, -\code{\link{gen_gamFormula_fxSmooth}} and -\code{\link{gen_gamFormula_contIx}} for formula helpers, -\linkS4class{ModelArray} for the input class, -\code{\link{ModelArray}} for the constructor, -\code{\link{exampleElementData}} for testing formulas on a single -element. +\code{\link{ModelArray.gam}} for GAMs, +\code{\link{exampleElementData}} for building test data, +\linkS4class{ModelArray} for the input class. }