diff --git a/R/ModelArray-package.R b/R/ModelArray-package.R index cee59be..508ac21 100644 --- a/R/ModelArray-package.R +++ b/R/ModelArray-package.R @@ -71,3 +71,7 @@ #' #' @keywords internal "_PACKAGE" + +## Suppress R CMD check NOTEs for non-standard evaluation symbols +## used inside magrittr pipe expressions. +utils::globalVariables(".") diff --git a/R/ModelArray_Constructor.R b/R/ModelArray_Constructor.R index 22772d7..625a55e 100644 --- a/R/ModelArray_Constructor.R +++ b/R/ModelArray_Constructor.R @@ -1,3 +1,4 @@ +# ModelArray class ---- #' ModelArray class #' #' ModelArray is an S4 class that represents element-wise scalar data and @@ -394,19 +395,29 @@ numElementsTotal <- function(modelarray, scalar_name = "FD") { } #' Fit a linear model for a single element +#' +#' @description +#' Fits \code{\link[stats]{lm}} on one element's data. When a precomputed +#' context (\code{ctx}) is provided, all loop-invariant work (formula parsing, +#' collision checks, source alignment) is skipped. When \code{ctx} is +#' \code{NULL}, the function falls back to computing everything inline +#' (legacy behaviour for direct calls / debugging). +#' #' If the number of subjects with finite scalar values (not \code{NaN}, #' \code{NA}, or \code{Inf}) does not exceed \code{num.subj.lthr}, the #' element is skipped and all statistics are set to \code{NaN}. #' #' @param i_element Integer. The 1-based index of the element to analyse. #' @param formula A \code{\link[stats]{formula}} passed to -#' \code{\link[stats]{lm}}. -#' @param modelarray A \linkS4class{ModelArray} object. +#' \code{\link[stats]{lm}}. Ignored when \code{ctx} is provided (the +#' formula is taken from the context). +#' @param modelarray A \linkS4class{ModelArray} object. Ignored when +#' \code{ctx} is provided. #' @param phenotypes A data.frame of the cohort with columns of independent #' variables and covariates. Must contain a \code{"source_file"} column -#' matching \code{sources(modelarray)[[scalar]]}. +#' matching \code{sources(modelarray)[[scalar]]}. Ignored when \code{ctx} is provided. #' @param scalar Character. The name of the element-wise scalar to analyse. -#' Must be one of \code{names(scalars(modelarray))}. +#' Must be one of \code{names(scalars(modelarray))}. Ignored when \code{ctx} is provided. #' @param var.terms Character vector. Statistics to extract per term from #' \code{\link[broom]{tidy.lm}} (e.g. \code{"estimate"}, \code{"statistic"}, #' \code{"p.value"}). @@ -418,7 +429,7 @@ numElementsTotal <- function(modelarray, scalar_name = "FD") { #' below this threshold are skipped. This value is typically computed by #' the parent function from \code{num.subj.lthr.abs} and #' \code{num.subj.lthr.rel}. -#' @param num.stat.output Integer or \code{NULL}. The total number of output +#' @param num.stat.output Integer or \code{NULL}. The total number of output #' columns (including \code{element_id}). Used when #' \code{flag_initiate = FALSE} to generate an all-\code{NaN} row for #' skipped elements. Must be \code{NULL} when \code{flag_initiate = TRUE}. @@ -431,6 +442,8 @@ numElementsTotal <- function(modelarray, scalar_name = "FD") { #' halts execution; \code{"skip"} returns all-\code{NaN} for this element; #' \code{"debug"} drops into \code{\link{browser}} (if interactive) then #' skips. Default: \code{"stop"}. +#' @param ctx A precomputed context list from \code{.build_lm_context()}, +#' or \code{NULL} for legacy inline computation. #' @param ... Additional arguments passed to \code{\link[stats]{lm}}. #' #' @return If \code{flag_initiate = TRUE}, a list with components: @@ -446,283 +459,146 @@ numElementsTotal <- function(modelarray, scalar_name = "FD") { #' (except \code{element_id}) if the element had insufficient valid subjects #' or if an error occurred with \code{on_error = "skip"}. #' -#' @seealso \code{\link{ModelArray.lm}} which calls this function iteratively, +#' @seealso \code{\link{ModelArray.lm}}, \code{.build_lm_context}, #' \code{\link{analyseOneElement.gam}} for the GAM equivalent, -#' \code{\link{analyseOneElement.wrap}} for user-supplied functions. +#' \code{\link{analyseOneElement.wrap}} for user-supplied functions #' #' @keywords internal #' @rdname analyseOneElement.lm #' @export - analyseOneElement.lm <- function(i_element, - formula, - modelarray, - phenotypes, - scalar, - var.terms, - var.model, + formula = NULL, + modelarray = NULL, + phenotypes = NULL, + scalar = NULL, + var.terms = c("estimate", "statistic", "p.value"), + var.model = c("adj.r.squared", "p.value"), num.subj.lthr, num.stat.output = NULL, flag_initiate = FALSE, on_error = "stop", + ctx = NULL, ...) { - values <- scalars(modelarray)[[scalar]][i_element, ] - ## check number of subjects with (in)valid values: - flag_sufficient <- NULL # whether number of subjects with valid values are sufficient - num.subj.valid <- length(values[is.finite(values)]) - if (num.subj.valid > num.subj.lthr) { - flag_sufficient <- TRUE - } else { - flag_sufficient <- FALSE - } - if (flag_sufficient == TRUE) { - dat <- phenotypes - # detect scalar predictors referenced in formula - all_vars <- all.vars(formula) - lhs_name <- tryCatch(as.character(formula[[2]]), error = function(e) NULL) - rhs_vars <- setdiff(all_vars, lhs_name) - scalar_names <- names(scalars(modelarray)) - scalar_predictors <- intersect(rhs_vars, scalar_names) - - # collision check with phenotypes - collisions <- intersect(c(scalar, scalar_predictors), colnames(dat)) - if (length(collisions) > 0) { - stop(paste0( - "Column name collision between phenotypes and scalar names: ", - paste(collisions, collapse = ", "), - ". Please rename or remove these columns from phenotypes before modeling." - )) - } + # Resolve context ---- + # Use precomputed if available, else build inline - # attach response first - dat[[scalar]] <- values - - # attach scalar predictors with source alignment and build intersection mask - masks <- list(is.finite(values)) - if (length(scalar_predictors) > 0) { - for (pred in scalar_predictors) { - pred_sources <- sources(modelarray)[[pred]] - phen_sources <- dat[["source_file"]] - if (!(all(pred_sources %in% phen_sources) && all(phen_sources %in% pred_sources))) { - stop(paste0( - "The source files for predictor scalar '", pred, "' do not match phenotypes$source_file." - )) - } - reorder_idx <- match(phen_sources, pred_sources) - pred_vals <- scalars(modelarray)[[pred]][i_element, ] - pred_vals <- pred_vals[reorder_idx] - dat[[pred]] <- pred_vals - masks[[length(masks) + 1L]] <- is.finite(pred_vals) - } - } + if (!is.null(ctx)) { + # Fast path: all invariant work already done + effective_formula <- ctx$formula + effective_scalar <- ctx$scalar + } else { + # Legacy path: compute everything inline (for direct calls / debugging) + effective_formula <- formula + effective_scalar <- scalar + ctx <- .build_lm_context(formula, modelarray, phenotypes, scalar) + } - # intersection-based threshold - valid_mask <- Reduce("&", masks) - num.subj.valid <- sum(valid_mask) - if (!(num.subj.valid > num.subj.lthr)) { - if (flag_initiate == TRUE) { - toreturn <- list(column_names = NaN, list.terms = NaN) - return(toreturn) - } else { - onerow <- c( - i_element - 1, - rep(NaN, (num.stat.output - 1)) - ) - return(onerow) - } - } - # dots <- list(...) - # dots_names <- names(dots) - # if ("weights" %in% dots_names) { - # message(dots$weights) - # myWeights <- dots$weights - # dots$weights <- NULL # remove weights from - # - # arguments_lm <- dots - # - # } - arguments_lm <- list(...) - arguments_lm$formula <- formula - arguments_lm$data <- dat - - # onemodel <- stats::lm(formula, data = dat, ...) - # onemodel <- stats::lm(formula, data = dat, weights = myWeights,...) - onemodel <- tryCatch( - { - do.call(stats::lm, arguments_lm) - }, - error = function(e) { - msg <- paste0( - "analyseOneElement.lm error at element ", - i_element, - ": ", - conditionMessage(e) - ) - if (on_error == "debug" && interactive()) { - message(msg) - browser() - } - if (on_error == "skip" || on_error == "debug") { - warning(msg) - if (flag_initiate == TRUE) { - return(structure(list(.lm_error_initiate = TRUE), class = "lm_error")) - } else { - return(structure(list(.lm_error_runtime = TRUE), class = "lm_error")) - } - } - stop(e) - } - ) + # Per-element data assembly ---- + elem <- .assemble_element_data(i_element, ctx, num.subj.lthr) - if (inherits(onemodel, "lm_error")) { - if (flag_initiate == TRUE) { - toreturn <- list(column_names = NaN, list.terms = NaN) - return(toreturn) - } else { - onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) - return(onerow) - } + if (!elem$sufficient) { + if (flag_initiate) { + return(list(column_names = NaN, list.terms = NaN)) + } else { + onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) + return(onerow) } - # explicitly passing arguments into lm, to avoid error of argument "weights" + } - onemodel.tidy <- onemodel %>% broom::tidy() - onemodel.glance <- onemodel %>% broom::glance() + dat <- elem$dat - # delete columns you don't want: - var.terms.full <- names(onemodel.tidy) - var.model.full <- names(onemodel.glance) + # Fit model ---- + arguments_lm <- list(...) + arguments_lm$formula <- effective_formula + arguments_lm$data <- dat - # list to remove: - var.terms.orig <- var.terms - var.terms <- list("term", var.terms) %>% unlist() # we will always keep "term" column - var.terms.remove <- list() - for (l in var.terms.full) { - if (!(l %in% var.terms)) { - var.terms.remove <- var.terms.remove %>% - append(., l) %>% - unlist() # the order will still be kept + onemodel <- tryCatch( + { + do.call(stats::lm, arguments_lm) + }, + error = function(e) { + msg <- paste0( + "analyseOneElement.lm error at element ", + i_element, ": ", conditionMessage(e) + ) + if (on_error == "debug" && interactive()) { + message(msg) + browser() } - } - - var.model.remove <- list() - for (l in var.model.full) { - if (!(l %in% var.model)) { - var.model.remove <- var.model.remove %>% - append(., l) %>% - unlist() # the order will still be kept + if (on_error == "skip" || on_error == "debug") { + warning(msg) + if (flag_initiate) { + return(structure(list(.lm_error_initiate = TRUE), class = "lm_error")) + } else { + return(structure(list(.lm_error_runtime = TRUE), class = "lm_error")) + } } + stop(e) } + ) - # remove those columns: - if (length(var.terms.remove) != 0) { - # if length=0, it's list(), nothing to remove - onemodel.tidy <- dplyr::select(onemodel.tidy, -dplyr::all_of(var.terms.remove)) - } - if (length(var.model.remove) != 0) { - onemodel.glance <- dplyr::select(onemodel.glance, -dplyr::all_of(var.model.remove)) + if (inherits(onemodel, "lm_error")) { + if (flag_initiate) { + return(list(column_names = NaN, list.terms = NaN)) + } else { + onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) + return(onerow) } + } - # adjust: - # change the term name from "(Intercept)" to "Intercept" - onemodel.tidy$term[onemodel.tidy$term == "(Intercept)"] <- "Intercept" - onemodel.glance <- onemodel.glance %>% dplyr::mutate(term = "model") # add a column - - # get the list of terms: - list.terms <- onemodel.tidy$term - - # check if the onemodel.* does not have real statistics (but only a column of 'term') - temp_colnames <- onemodel.tidy %>% colnames() - # union of colnames and "term"; if colnames only has "term" or lengt of 0 (tibble()), - # union = "term", all(union)=TRUE; otherwise, if there is colnames other than "term", - # all(union) = c(TRUE, FALSE, ...) - temp <- union(temp_colnames, "term") - # just an empty tibble (so below, all(dim(onemodel.tidy)) = FALSE) - if (all(temp == "term")) { - onemodel.tidy <- tibble::tibble() - } - temp_colnames <- onemodel.glance %>% colnames() - temp <- union(temp_colnames, "term") # union of colnames and "term"; - if (all(temp == "term")) { - onemodel.glance <- tibble::tibble() - } + # Extract statistics ---- + onemodel.tidy <- onemodel %>% broom::tidy() + onemodel.glance <- onemodel %>% broom::glance() + # Normalize intercept term name to match expected output convention + onemodel.tidy$term[onemodel.tidy$term == "(Intercept)"] <- "Intercept" - # flatten .tidy results into one row: - if (all(dim(onemodel.tidy))) { - # not empty | if any dim is 0, all=FALSE - onemodel.tidy.onerow <- onemodel.tidy %>% tidyr::pivot_wider( - names_from = "term", - values_from = dplyr::all_of(var.terms.orig), - names_glue = "{term}.{.value}" - ) - } else { - onemodel.tidy.onerow <- onemodel.tidy - } + list.terms <- onemodel.tidy$term - if (all(dim(onemodel.glance))) { - # not empty - onemodel.glance.onerow <- onemodel.glance %>% tidyr::pivot_wider( - names_from = "term", - values_from = dplyr::all_of(var.model), - names_glue = "{term}.{.value}" + ## Term-level statistics ---- + onerow.terms <- tibble::tibble() + for (i_term in seq_along(list.terms)) { + for (i_var in seq_along(var.terms)) { + temp <- tibble::tibble( + placeholder = onemodel.tidy[[var.terms[i_var]]][i_term] ) - } else { - onemodel.glance.onerow <- onemodel.glance + colnames(temp) <- paste0(list.terms[i_term], ".", var.terms[i_var]) + onerow.terms <- bind_cols_check_emptyTibble(onerow.terms, temp) } + } + ## Model-level statistics ---- + onerow.model <- tibble::tibble() + for (i_var in seq_along(var.model)) { + temp <- tibble::tibble(placeholder = onemodel.glance[[var.model[i_var]]]) + colnames(temp) <- paste0("model.", var.model[i_var]) + onerow.model <- bind_cols_check_emptyTibble(onerow.model, temp) + } - # combine the tables: check if any of them is empty (tibble()) - onemodel.onerow <- bind_cols_check_emptyTibble(onemodel.tidy.onerow, onemodel.glance.onerow) - - # add a column of element ids: - colnames.temp <- colnames(onemodel.onerow) - onemodel.onerow <- onemodel.onerow %>% - tibble::add_column( - element_id = i_element - 1, - .before = colnames.temp[1] - ) # add as the first column - - # now you can get the headers, # of columns, etc of the output results - - - if (flag_initiate == TRUE) { - # return the column names: - - # return: - column_names <- colnames(onemodel.onerow) - toreturn <- list(column_names = column_names, list.terms = list.terms) - toreturn - } else if (flag_initiate == FALSE) { - # return the one row results: + onerow.all <- bind_cols_check_emptyTibble(onerow.terms, onerow.model) + column_names <- c("element_id", colnames(onerow.all)) - # return: - onerow <- as.numeric(onemodel.onerow) # change from tibble to numeric to save some space - onerow - } + if (flag_initiate) { + toreturn <- list( + column_names = column_names, + list.terms = list.terms + ) + return(toreturn) } else { - # if flag_sufficient==FALSE - if (flag_initiate == TRUE) { - toreturn <- list(column_names = NaN, list.terms = NaN) - toreturn - } else if (flag_initiate == FALSE) { - onerow <- c( - i_element - 1, - # first is element_id, which will still be a finite number - rep(NaN, (num.stat.output - 1)) - ) # other columns will be NaN - - onerow - } + onerow <- c(i_element - 1, as.numeric(onerow.all)) + return(onerow) } } #' Fit a GAM for a single element -#' returns metadata (column names, smooth term names, parametric term names, +#' +#' #' @description +#' Returns metadata (column names, smooth term names, parametric term names, #' and the smoothing parameter criterion attribute name) used by #' \code{\link{ModelArray.gam}} to initialise the output data.frame. When #' \code{flag_initiate = FALSE}, it returns a numeric vector representing one @@ -752,6 +628,8 @@ analyseOneElement.lm <- function(i_element, #' squares (\code{model.sse}) for the model, which is needed for #' partial R-squared calculations in \code{\link{ModelArray.gam}}. #' Default: \code{FALSE}. +#' @param ctx A precomputed context list from \code{.build_gam_context()}, +#' or \code{NULL}. #' @param ... Additional arguments passed to \code{\link[mgcv]{gam}}. #' #' @return If \code{flag_initiate = TRUE}, a list with components: @@ -777,385 +655,225 @@ analyseOneElement.lm <- function(i_element, #' @export analyseOneElement.gam <- function(i_element, - formula, - modelarray, - phenotypes, - scalar, - var.smoothTerms, - var.parametricTerms, - var.model, + formula = NULL, + modelarray = NULL, + phenotypes = NULL, + scalar = NULL, + var.smoothTerms = c("statistic", "p.value"), + var.parametricTerms = c("estimate", "statistic", "p.value"), + var.model = c("dev.expl"), num.subj.lthr, num.stat.output = NULL, flag_initiate = FALSE, flag_sse = FALSE, on_error = "stop", + ctx = NULL, ...) { - values <- scalars(modelarray)[[scalar]][i_element, ] - ## check number of subjects with (in)valid values: - flag_sufficient <- NULL # whether number of subjects with valid values are sufficient - num.subj.valid <- length(values[is.finite(values)]) - if (num.subj.valid > num.subj.lthr) { - flag_sufficient <- TRUE + + # Resolve context ---- + if (!is.null(ctx)) { + effective_formula <- ctx$formula + effective_scalar <- ctx$scalar } else { - flag_sufficient <- FALSE + effective_formula <- formula + effective_scalar <- scalar + ctx <- .build_gam_context(formula, modelarray, phenotypes, scalar) } - if (flag_sufficient == TRUE) { - dat <- phenotypes - # detect scalar predictors referenced in formula - all_vars <- all.vars(formula) - lhs_name <- tryCatch(as.character(formula[[2]]), error = function(e) NULL) - rhs_vars <- setdiff(all_vars, lhs_name) - scalar_names <- names(scalars(modelarray)) - scalar_predictors <- intersect(rhs_vars, scalar_names) - - # collision check with phenotypes - collisions <- intersect(c(scalar, scalar_predictors), colnames(dat)) - if (length(collisions) > 0) { - stop(paste0( - "Column name collision between phenotypes and scalar names: ", - paste(collisions, collapse = ", "), - ". Please rename or remove these columns from phenotypes before modeling." + # Per-element data assembly ---- + elem <- .assemble_element_data(i_element, ctx, num.subj.lthr) + + if (!elem$sufficient) { + if (flag_initiate) { + return(list( + column_names = NaN, + list.smoothTerms = NaN, + list.parametricTerms = NaN, + sp.criterion.attr.name = NaN )) + } else { + onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) + return(onerow) } + } - # attach response first - dat[[scalar]] <- values - - # attach scalar predictors with source alignment and build intersection mask - masks <- list(is.finite(values)) - if (length(scalar_predictors) > 0) { - for (pred in scalar_predictors) { - pred_sources <- sources(modelarray)[[pred]] - phen_sources <- dat[["source_file"]] - if (!(all(pred_sources %in% phen_sources) && all(phen_sources %in% pred_sources))) { - stop(paste0( - "The source files for predictor scalar '", pred, "' do not match phenotypes$source_file." - )) + dat <- elem$dat + + # Fit GAM ---- + arguments <- list(...) + arguments$formula <- effective_formula + arguments$data <- dat + + onemodel <- tryCatch( + { + do.call(mgcv::gam, arguments) + }, + error = function(e) { + msg <- paste0( + "analyseOneElement.gam error at element ", + i_element, ": ", conditionMessage(e) + ) + if (on_error == "debug" && interactive()) { + message(msg) + browser() + } + if (on_error == "skip" || on_error == "debug") { + warning(msg) + if (flag_initiate) { + return(structure(list(.gam_error_initiate = TRUE), class = "gam_error")) + } else { + return(structure(list(.gam_error_runtime = TRUE), class = "gam_error")) } - reorder_idx <- match(phen_sources, pred_sources) - pred_vals <- scalars(modelarray)[[pred]][i_element, ] - pred_vals <- pred_vals[reorder_idx] - dat[[pred]] <- pred_vals - masks[[length(masks) + 1L]] <- is.finite(pred_vals) } + stop(e) } + ) - # intersection-based threshold - valid_mask <- Reduce("&", masks) - num.subj.valid <- sum(valid_mask) - if (!(num.subj.valid > num.subj.lthr)) { - if (flag_initiate == TRUE) { - toreturn <- list( - column_names = NaN, - list.smoothTerms = NaN, - list.parametricTerms = NaN, - sp.criterion.attr.name = NaN - ) - return(toreturn) - } else { - onerow <- c( - i_element - 1, - rep(NaN, (num.stat.output - 1)) - ) - return(onerow) - } + if (inherits(onemodel, "gam_error")) { + if (flag_initiate) { + return(list( + column_names = NaN, + list.smoothTerms = NaN, + list.parametricTerms = NaN, + sp.criterion.attr.name = NaN + )) + } else { + onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) + return(onerow) } + } - arguments <- list(...) - arguments$formula <- formula - arguments$data <- dat - - # explicitly passing arguments into command, to avoid error of argument "weights" - onemodel <- tryCatch( - { - do.call(mgcv::gam, arguments) - }, - error = function(e) { - msg <- paste0( - "analyseOneElement.gam error at element ", - i_element, - ": ", - conditionMessage(e) - ) - if (on_error == "debug" && interactive()) { - message(msg) - browser() - } - if (on_error == "skip" || on_error == "debug") { - warning(msg) - if (flag_initiate == TRUE) { - return(structure(list(.gam_error_initiate = TRUE), class = "gam_error")) - } else { - return(structure(list(.gam_error_runtime = TRUE), class = "gam_error")) - } - } - stop(e) - } - ) - if (inherits(onemodel, "gam_error")) { - if (flag_initiate == TRUE) { - toreturn <- list( - column_names = NaN, - list.smoothTerms = NaN, - list.parametricTerms = NaN, - sp.criterion.attr.name = NaN - ) - return(toreturn) - } else { - onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) - return(onerow) - } - } + # Extract statistics ---- - onemodel.tidy.smoothTerms <- onemodel %>% broom::tidy(parametric = FALSE) - onemodel.tidy.parametricTerms <- onemodel %>% broom::tidy(parametric = TRUE) - onemodel.glance <- onemodel %>% broom::glance() - onemodel.summary <- onemodel %>% summary() - # add additional model's stat to onemodel.glance(): - onemodel.glance[["adj.r.squared"]] <- onemodel.summary$r.sq - onemodel.glance[["dev.expl"]] <- onemodel.summary$dev.expl - - sp.criterion.attr.name <- onemodel.summary$sp.criterion %>% attr(which = "name") - onemodel.glance[["sp.criterion"]] <- onemodel.summary$sp.criterion[[sp.criterion.attr.name]] - onemodel.glance[["scale"]] <- onemodel.summary$scale # scale estimate - - num.smoothTerms <- onemodel.summary$m # The number of smooth terms in the model. - - - # delete columns you don't want: - var.smoothTerms.full <- names(onemodel.tidy.smoothTerms) - var.parametricTerms.full <- names(onemodel.tidy.parametricTerms) - var.model.full <- names(onemodel.glance) - - # list to remove: - var.smoothTerms.orig <- var.smoothTerms - var.smoothTerms <- list("term", var.smoothTerms) %>% unlist() # we will always keep "term" column - var.smoothTerms.remove <- list() - for (l in var.smoothTerms.full) { - if (!(l %in% var.smoothTerms)) { - var.smoothTerms.remove <- var.smoothTerms.remove %>% - append(., l) %>% - unlist() # the order will still be kept - } - } + onemodel.tidy.smoothTerms <- onemodel %>% broom::tidy(parametric = FALSE) + onemodel.tidy.parametricTerms <- onemodel %>% broom::tidy(parametric = TRUE) + onemodel.glance <- onemodel %>% broom::glance() + onemodel.summary <- onemodel %>% summary() - var.parametricTerms.orig <- var.parametricTerms - var.parametricTerms <- list("term", var.parametricTerms) %>% unlist() # we will always keep "term" column - var.parametricTerms.remove <- list() - for (l in var.parametricTerms.full) { - if (!(l %in% var.parametricTerms)) { - var.parametricTerms.remove <- var.parametricTerms.remove %>% - append(., l) %>% - unlist() # the order will still be kept - } - } + # ---- Normalize term names (must happen before column name assembly) ---- - var.model.remove <- list() - for (l in var.model.full) { - if (!(l %in% var.model)) { - var.model.remove <- var.model.remove %>% - append(., l) %>% - unlist() # the order will still be kept - } - } - - # remove those columns: - if (length(var.smoothTerms.remove) != 0) { - # if length=0, it's list(), nothing to remove - onemodel.tidy.smoothTerms <- dplyr::select(onemodel.tidy.smoothTerms, -dplyr::all_of(var.smoothTerms.remove)) - } - if (length(var.parametricTerms.remove) != 0) { - # if length=0, it's list(), nothing to remove - onemodel.tidy.parametricTerms <- dplyr::select(onemodel.tidy.parametricTerms, -dplyr::all_of(var.parametricTerms.remove)) - } - if (length(var.model.remove) != 0) { - onemodel.glance <- dplyr::select(onemodel.glance, -dplyr::all_of(var.model.remove)) - } + # Parametric: (Intercept) → Intercept + if (nrow(onemodel.tidy.parametricTerms) > 0) { + onemodel.tidy.parametricTerms$term[ + onemodel.tidy.parametricTerms$term == "(Intercept)" + ] <- "Intercept" + } - # adjust: - if (num.smoothTerms > 0) { - # if there is any smooth term - onemodel.tidy.smoothTerms$term[onemodel.tidy.smoothTerms$term == "(Intercept)"] <- "Intercept" - # change the term name from "(Intercept)" to "Intercept" + # Smooth: s(age) → s_age, ti(x,z) → ti_x_z, s(x):oFactor → s_x_BYoFactor + num.smoothTerms <- onemodel.summary$m + if (num.smoothTerms > 0) { + if (nrow(onemodel.tidy.smoothTerms) > 0) { + onemodel.tidy.smoothTerms$term[ + onemodel.tidy.smoothTerms$term == "(Intercept)" + ] <- "Intercept" } - if (nrow(onemodel.tidy.parametricTerms) > 0) { - # if there is any parametric term - onemodel.tidy.parametricTerms$term[onemodel.tidy.parametricTerms$term == "(Intercept)"] <- "Intercept" - # change the term name from "(Intercept)" to "Intercept" - } - - # change from s(x) to s_x: (could be s, te, etc); from s(x):oFactor to s_x_BYoFactor; from ti(x,z) to ti_x_z - if (num.smoothTerms > 0) { - # if there is any smooth term - for (i_row in seq_len(nrow(onemodel.tidy.smoothTerms))) { - # step 1: change from s(x) to s_x - term_name <- onemodel.tidy.smoothTerms$term[i_row] - str_list <- strsplit(term_name, split = "[()]")[[1]] - - str <- str_list[2] # extract string between () - smooth_name <- str_list[1] # "s" or some other smooth method type such as "te" - str_valid <- paste0(smooth_name, "_", str) - - if (length(str_list) > 2) { - # there is string after variable name - str_valid <- paste0( - str_valid, - "_", - paste(str_list[3:length(str_list)], collapse = "") - ) # combine rest of strings - } - # detect ":", and change to "BY" # there is "_" replacing for ")" in "s()" already - str_valid <- gsub(":", "BY", str_valid, fixed = TRUE) + for (i_row in seq_len(nrow(onemodel.tidy.smoothTerms))) { + term_name <- onemodel.tidy.smoothTerms$term[i_row] + str_list <- strsplit(term_name, split = "[()]")[[1]] - # detect ",", and change to "_" - str_valid <- gsub(",", "_", str_valid, fixed = TRUE) + str <- str_list[2] + smooth_name <- str_list[1] + str_valid <- paste0(smooth_name, "_", str) - onemodel.tidy.smoothTerms$term[i_row] <- str_valid + if (length(str_list) > 2) { + str_valid <- paste0( + str_valid, "_", + paste(str_list[3:length(str_list)], collapse = "") + ) } - } - - onemodel.glance <- onemodel.glance %>% dplyr::mutate(term = "model") # add a column + # : → BY + str_valid <- gsub(":", "BY", str_valid, fixed = TRUE) + # , → _ + str_valid <- gsub(",", "_", str_valid, fixed = TRUE) - # get the list of terms: - if (num.smoothTerms > 0) { - list.smoothTerms <- onemodel.tidy.smoothTerms$term # if empty, gives warning - } else { - list.smoothTerms <- NULL - } - - if (nrow(onemodel.tidy.parametricTerms) > 0) { - list.parametricTerms <- onemodel.tidy.parametricTerms$term - } else { - list.parametricTerms <- NULL + onemodel.tidy.smoothTerms$term[i_row] <- str_valid } + } + list.smoothTerms <- onemodel.tidy.smoothTerms$term + list.parametricTerms <- onemodel.tidy.parametricTerms$term - # check if the onemodel.* does not have real statistics (but only a column of 'term') - temp_colnames <- onemodel.tidy.smoothTerms %>% colnames() - temp <- union(temp_colnames, "term") - # union of colnames and "term"; if colnames only has "term" or lengt of 0 (tibble()), - # union = "term", all(union)=TRUE; otherwise, if there is colnames other than "term", - # all(union) = c(TRUE, FALSE, ...) - if (all(temp == "term")) { - onemodel.tidy.smoothTerms <- tibble::tibble() - } - # just an empty tibble (so below, all(dim(onemodel.tidy.smoothTerms)) = FALSE) - - temp_colnames <- onemodel.tidy.parametricTerms %>% colnames() - temp <- union(temp_colnames, "term") - if (all(temp == "term")) { - onemodel.tidy.parametricTerms <- tibble::tibble() - } # just an empty tibble - - temp_colnames <- onemodel.glance %>% colnames() - temp <- union(temp_colnames, "term") - if (all(temp == "term")) { - onemodel.glance <- tibble::tibble() - } # just an empty tibble - - - # flatten .tidy results into one row: - if (all(dim(onemodel.tidy.smoothTerms))) { - # not empty | if any dim is 0, all=FALSE - onemodel.tidy.smoothTerms.onerow <- onemodel.tidy.smoothTerms %>% tidyr::pivot_wider( - names_from = "term", - values_from = dplyr::all_of(var.smoothTerms.orig), - names_glue = "{term}.{.value}" + ## Smooth term statistics ---- + onerow.smoothTerms <- tibble::tibble() + for (i_term in seq_along(list.smoothTerms)) { + for (i_var in seq_along(var.smoothTerms)) { + temp <- tibble::tibble( + placeholder = onemodel.tidy.smoothTerms[[var.smoothTerms[i_var]]][i_term] ) - } else { - onemodel.tidy.smoothTerms.onerow <- onemodel.tidy.smoothTerms + colnames(temp) <- paste0(list.smoothTerms[i_term], ".", var.smoothTerms[i_var]) + onerow.smoothTerms <- bind_cols_check_emptyTibble(onerow.smoothTerms, temp) } + } - if (all(dim(onemodel.tidy.parametricTerms))) { - # not empty - onemodel.tidy.parametricTerms.onerow <- onemodel.tidy.parametricTerms %>% tidyr::pivot_wider( - names_from = "term", - values_from = dplyr::all_of(var.parametricTerms.orig), - names_glue = "{term}.{.value}" + ## Parametric term statistics ---- + onerow.parametricTerms <- tibble::tibble() + for (i_term in seq_along(list.parametricTerms)) { + for (i_var in seq_along(var.parametricTerms)) { + temp <- tibble::tibble( + placeholder = onemodel.tidy.parametricTerms[[var.parametricTerms[i_var]]][i_term] ) - } else { - onemodel.tidy.parametricTerms.onerow <- onemodel.tidy.parametricTerms + colnames(temp) <- paste0(list.parametricTerms[i_term], ".", var.parametricTerms[i_var]) + onerow.parametricTerms <- bind_cols_check_emptyTibble(onerow.parametricTerms, temp) } + } - if (all(dim(onemodel.glance))) { - # not empty - onemodel.glance.onerow <- onemodel.glance %>% tidyr::pivot_wider( - names_from = "term", - values_from = dplyr::all_of(var.model), - names_glue = "{term}.{.value}" - ) + ## Model-level statistics ---- + onerow.model <- tibble::tibble() + for (i_var in seq_along(var.model)) { + val <- NULL + if (var.model[i_var] %in% colnames(onemodel.glance)) { + val <- onemodel.glance[[var.model[i_var]]] + } else if (var.model[i_var] == "adj.r.squared") { + val <- onemodel.summary$r.sq + } else if (var.model[i_var] == "dev.expl") { + val <- onemodel.summary$dev.expl + } else if (var.model[i_var] == "sp.criterion") { + val <- onemodel.summary$sp.criterion + if (flag_initiate) { + sp.criterion.attr.name <- names(onemodel.summary$sp.criterion) + } + } else if (var.model[i_var] == "scale") { + val <- onemodel.summary$scale } else { - onemodel.glance.onerow <- onemodel.glance + val <- onemodel.glance[[var.model[i_var]]] } + temp <- tibble::tibble(placeholder = val) + colnames(temp) <- paste0("model.", var.model[i_var]) + onerow.model <- bind_cols_check_emptyTibble(onerow.model, temp) + } + ## SSE if requested ---- + onerow.sse <- tibble::tibble() + if (flag_sse) { + sse_val <- sum(onemodel$residuals^2) + temp <- tibble::tibble(placeholder = sse_val) + colnames(temp) <- "model.sse" + onerow.sse <- temp + } - # combine the tables: check if any of them is empty (tibble()) - onemodel.onerow <- bind_cols_check_emptyTibble( - onemodel.tidy.smoothTerms.onerow, - onemodel.tidy.parametricTerms.onerow - ) - onemodel.onerow <- bind_cols_check_emptyTibble(onemodel.onerow, onemodel.glance.onerow) - + onerow.all <- bind_cols_check_emptyTibble(onerow.smoothTerms, onerow.parametricTerms) + onerow.all <- bind_cols_check_emptyTibble(onerow.all, onerow.model) + onerow.all <- bind_cols_check_emptyTibble(onerow.all, onerow.sse) - # add a column of element ids: - colnames.temp <- colnames(onemodel.onerow) - onemodel.onerow <- onemodel.onerow %>% tibble::add_column(element_id = i_element - 1, .before = colnames.temp[1]) - # add as the first column + column_names <- c("element_id", colnames(onerow.all)) - # add sse if requested: - if (flag_sse == TRUE) { - onemodel.onerow[["model.sse"]] <- sum((onemodel$y - onemodel$fitted.values)^2) - # using values from model itself, where NAs in y have been excluded --> - # sse won't be NA --> partial R-squared won't be NA - } - - - # now you can get the headers, # of columns, etc of the output results - - - if (flag_initiate == TRUE) { - # return the column names: - - # return: - column_names <- colnames(onemodel.onerow) - toreturn <- list( - column_names = column_names, - list.smoothTerms = list.smoothTerms, - list.parametricTerms = list.parametricTerms, - sp.criterion.attr.name = sp.criterion.attr.name - ) - toreturn - } else if (flag_initiate == FALSE) { - # return the one row results: - - # return: - onerow <- as.numeric(onemodel.onerow) # change from tibble to numeric to save some space - onerow + if (flag_initiate) { + if (!exists("sp.criterion.attr.name")) { + sp.criterion.attr.name <- NA_character_ } + return(list( + column_names = column_names, + list.smoothTerms = list.smoothTerms, + list.parametricTerms = list.parametricTerms, + sp.criterion.attr.name = sp.criterion.attr.name + )) } else { - # if flag_sufficient==FALSE - if (flag_initiate == TRUE) { - toreturn <- list( - column_names = NaN, - list.smoothTerms = NaN, - list.parametricTerms = NaN, - sp.criterion.attr.name = NaN - ) - toreturn - } else if (flag_initiate == FALSE) { - onerow <- c( - i_element - 1, - # first is element_id, which will still be a finite number - rep(NaN, (num.stat.output - 1)) - ) # other columns will be NaN - - onerow - } + onerow <- c(i_element - 1, as.numeric(onerow.all)) + return(onerow) } } @@ -1195,8 +913,19 @@ analyseOneElement.gam <- function(i_element, #' #' @param user_fun A function that accepts at least an argument named #' \code{data} (a data.frame: \code{phenotypes} with scalar columns -#' appended for the current element) and returns a one-row -#' data.frame/tibble, a named list, or a named atomic vector. +#' appended for the current element) and returns one of: +#' \itemize{ +#' \item A one-row \code{data.frame} or \code{tibble}. Multi-row +#' returns will error. +#' \item A named list. Unnamed lists are accepted and auto-named +#' as \code{v1}, \code{v2}, etc. +#' \item A named atomic vector. Unnamed vectors are accepted and +#' auto-named as \code{v1}, \code{v2}, etc. +#' } +#' All return values are coerced to a numeric vector internally. +#' Unsupported types (e.g., environments, S4 objects) will error. +#' @param ctx A precomputed context list from \code{.build_wrap_context()}, +#' or \code{NULL}. #' @param ... Additional arguments forwarded to \code{user_fun}. #' #' @return If \code{flag_initiate = TRUE}, a list with one component: @@ -1222,153 +951,124 @@ analyseOneElement.gam <- function(i_element, #' @export analyseOneElement.wrap <- function(i_element, user_fun, - modelarray, - phenotypes, - scalar, + modelarray = NULL, + phenotypes = NULL, + scalar = NULL, num.subj.lthr, num.stat.output = NULL, flag_initiate = FALSE, on_error = "stop", + ctx = NULL, ...) { - values <- scalars(modelarray)[[scalar]][i_element, ] - ## check number of subjects with (in)valid values: - flag_sufficient <- NULL - num.subj.valid <- length(values[is.finite(values)]) - if (num.subj.valid > num.subj.lthr) { - flag_sufficient <- TRUE - } else { - flag_sufficient <- FALSE + # Resolve context ---- + + if (is.null(ctx)) { + ctx <- .build_wrap_context(modelarray, phenotypes, scalar) } - if (flag_sufficient == TRUE) { - dat <- phenotypes - # attach all scalars with alignment and collision checks - scalar_names <- names(scalars(modelarray)) - collisions <- intersect(scalar_names, colnames(dat)) - if (length(collisions) > 0) { - stop(paste0( - "Column name collision between phenotypes and scalar names: ", - paste(collisions, collapse = ", "), - ". Please rename or remove these columns from phenotypes before wrapping." - )) - } - masks <- list() - phen_sources <- dat[["source_file"]] - for (sname in scalar_names) { - s_sources <- sources(modelarray)[[sname]] - if (!(all(s_sources %in% phen_sources) && all(phen_sources %in% s_sources))) { - stop(paste0( - "The source files for scalar '", sname, "' do not match phenotypes$source_file." - )) - } - reorder_idx <- match(phen_sources, s_sources) - s_vals <- scalars(modelarray)[[sname]][i_element, ] - s_vals <- s_vals[reorder_idx] - dat[[sname]] <- s_vals - masks[[length(masks) + 1L]] <- is.finite(s_vals) - } + # Per-element data assembly ---- - # intersection-based threshold across all scalars - valid_mask <- Reduce("&", masks) - num.subj.valid <- sum(valid_mask) - if (!(num.subj.valid > num.subj.lthr)) { - if (flag_initiate == TRUE) { - toreturn <- list(column_names = NaN) - return(toreturn) - } else { - onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) - return(onerow) - } + elem <- .assemble_element_data(i_element, ctx, num.subj.lthr) + + if (!elem$sufficient) { + if (flag_initiate) { + return(list(column_names = NaN)) + } else { + onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) + return(onerow) } + } - arguments <- list(...) - arguments$data <- dat - - # Execute user function with error handling - result <- tryCatch( - { - do.call(user_fun, arguments) - }, - error = function(e) { - msg <- paste0( - "analyseOneElement.wrap error at element ", - i_element, - ": ", - conditionMessage(e) - ) - if (on_error == "debug" && interactive()) { - message(msg) - browser() - } - if (on_error == "skip" || on_error == "debug") { - warning(msg) - if (flag_initiate == TRUE) { - return(structure(list(.wrap_error_initiate = TRUE), class = "wrap_error")) - } else { - return(structure(list(.wrap_error_runtime = TRUE), class = "wrap_error")) - } - } - stop(e) - } - ) + dat <- elem$dat - # On error in user function, return NaNs according to flag - if (inherits(result, "wrap_error")) { - if (flag_initiate == TRUE) { - toreturn <- list(column_names = NaN) - return(toreturn) - } else { - onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) - return(onerow) - } - } - # Coerce to one-row tibble - if ("data.frame" %in% class(result)) { - onerow_tbl <- tibble::as_tibble(result) - if (nrow(onerow_tbl) != 1) { - stop( - "The user function must return a one-row data.frame/tibble, a named list, or a named vector." - ) + # Execute user function ---- + + arguments <- list(...) + arguments$data <- dat + + result <- tryCatch( + { + do.call(user_fun, arguments) + }, + error = function(e) { + msg <- paste0( + "analyseOneElement.wrap error at element ", + i_element, ": ", conditionMessage(e) + ) + if (on_error == "debug" && interactive()) { + message(msg) + browser() } - } else if (is.list(result)) { - onerow_tbl <- tibble::as_tibble_row(result) - } else if (is.atomic(result)) { - if (is.null(names(result))) { - names(result) <- paste0("v", seq_along(result)) + if (on_error == "skip" || on_error == "debug") { + warning(msg) + if (flag_initiate) { + return(structure(list(.wrap_error_initiate = TRUE), class = "wrap_error")) + } else { + return(structure(list(.wrap_error_runtime = TRUE), class = "wrap_error")) + } } - onerow_tbl <- tibble::as_tibble_row(as.list(result)) - } else { - stop( - paste0( - "Unsupported return type from user function. ", - "Return a one-row data.frame/tibble, named list, or named vector." - ) - ) + stop(e) } + ) - # add element_id as first column - colnames.temp <- colnames(onerow_tbl) - onerow_tbl <- onerow_tbl %>% tibble::add_column(element_id = i_element - 1, .before = colnames.temp[1]) - - if (flag_initiate == TRUE) { - toreturn <- list(column_names = colnames(onerow_tbl)) - toreturn + # Coerce result to named numeric vector ---- + if (inherits(result, "wrap_error")) { + if (flag_initiate) { + return(list(column_names = NaN)) } else { - # return numeric vector to be consistent with other analysers - as.numeric(onerow_tbl) + onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) + return(onerow) + } + } + + if (is.data.frame(result) || tibble::is_tibble(result)) { + if (nrow(result) != 1) { + msg <- paste0( + "The user function must return a one-row data.frame/tibble, ", + "a named list, or a named vector." + ) + if (on_error == "skip" || on_error == "debug") { + warning(paste0("analyseOneElement.wrap at element ", i_element, ": ", msg)) + if (flag_initiate) return(list(column_names = NaN)) + return(c(i_element - 1, rep(NaN, (num.stat.output - 1)))) + } + stop(msg) } + result_names <- colnames(result) + result_values <- as.numeric(result[1, ]) + } else if (is.list(result)) { + result_names <- names(result) + if (is.null(result_names) || any(result_names == "")) { + result_names <- paste0("v", seq_along(result)) + } + result_values <- as.numeric(unlist(result)) + } else if (is.atomic(result)) { + if (is.null(names(result))) { + names(result) <- paste0("v", seq_along(result)) + } + result_names <- names(result) + result_values <- as.numeric(result) } else { - # insufficient subjects - if (flag_initiate == TRUE) { - toreturn <- list(column_names = NaN) - toreturn - } else { - onerow <- c(i_element - 1, rep(NaN, (num.stat.output - 1))) - onerow + msg <- paste0("user_fun must return a one-row data.frame/tibble, a named list, ", + "or a named atomic vector. Got: ", class(result)[1]) + if (on_error == "skip" || on_error == "debug") { + warning(paste0("analyseOneElement.wrap at element ", i_element, ": ", msg)) + if (flag_initiate) return(list(column_names = NaN)) + return(c(i_element - 1, rep(NaN, (num.stat.output - 1)))) } + stop(msg) + } + + column_names <- c("element_id", result_names) + + if (flag_initiate) { + return(list(column_names = column_names)) + } else { + onerow <- c(i_element - 1, result_values) + return(onerow) } } diff --git a/R/analyse-helpers.R b/R/analyse-helpers.R index 5144a5d..41927d1 100644 --- a/R/analyse-helpers.R +++ b/R/analyse-helpers.R @@ -1,7 +1,7 @@ # Internal shared helpers for ModelArray analysis functions # These are not exported — used by ModelArray.lm, ModelArray.gam, ModelArray.wrap - +# Validation and dataset preparation ---- #' Validate that data is a ModelArray #' @noRd .validate_modelarray_input <- function(data) { @@ -110,7 +110,320 @@ max(num.subj.total * num.subj.lthr.rel, num.subj.lthr.abs) } +# Create analysis context ---- +# Precomputed context builders for element-wise analysis functions. +# These hoist loop-invariant computations out of the per-element functions +# so they execute once before the loop rather than millions of times inside it. + +# Level 1: Base context shared by all analysis functions (lm, gam, wrap). +# Handles scalar discovery, collision checks, and cross-scalar source alignment. + +#' Build base context shared by all element-wise analysis functions +#' +#' Precomputes scalar name lookups, column-name collision checks, and +#' cross-scalar source-alignment indices that are invariant across elements. +#' This is the foundation on which model-specific contexts are built. +#' +#' @param modelarray A \linkS4class{ModelArray} object. +#' @param phenotypes A data.frame, already aligned to \code{modelarray} via +#' \code{.align_phenotypes()}. +#' @param scalar Character. The name of the response scalar. +#' @param scalar_subset Character vector or \code{NULL}. Which scalars to +#' attach and align. If \code{NULL}, all scalars in the ModelArray are used +#' (the \code{wrap} behaviour). If a character vector, only those scalars +#' are included (the \code{lm}/\code{gam} behaviour where we detect +#' formula-referenced scalars). +#' @return A named list with components: +#' \describe{ +#' \item{modelarray}{The ModelArray object (read-only reference).} +#' \item{phenotypes}{The aligned phenotypes data.frame.} +#' \item{scalar}{Character. The response scalar name.} +#' \item{all_scalar_names}{Character vector. Names of all scalars in the +#' ModelArray.} +#' \item{attached_scalars}{Character vector. Names of scalars that will +#' be attached to the per-element data.frame (may be a subset of +#' \code{all_scalar_names} or all of them).} +#' \item{predictor_reorder}{A named list of integer vectors. For each +#' scalar in \code{attached_scalars}, the precomputed index vector +#' from \code{match(phen_sources, scalar_sources)} that reorders +#' scalar columns to match phenotype rows. For the response scalar +#' (which is already aligned by \code{.align_phenotypes()}), this +#' entry is \code{NULL} (no reordering needed).} +#' } +#' @noRd +.build_base_context <- function(modelarray, phenotypes, scalar, + scalar_subset = NULL) { + all_scalar_names <- names(scalars(modelarray)) + + + # Determine which scalars to attach + + if (is.null(scalar_subset)) { + # wrap mode: attach all scalars + attached_scalars <- all_scalar_names + } else { + # lm/gam mode: attach only the explicitly requested scalars + # (response + any predictor scalars detected from the formula) + attached_scalars <- unique(scalar_subset) + } + + + # Collision check: scalar names vs phenotype column names. + # This is invariant — the same collision would occur at every element, + + # so we fail fast once rather than millions of times. + + collisions <- intersect(attached_scalars, colnames(phenotypes)) + if (length(collisions) > 0) { + stop( + "Column name collision between phenotypes and scalar names: ", + paste(collisions, collapse = ", "), + ". Please rename or remove these columns from phenotypes before ", + "modeling." + ) + } + + # Precompute source-alignment reorder indices for each attached scalar. + # The response scalar is already aligned by .align_phenotypes(), so it + + # needs no reordering. Predictor scalars from mergeModelArrays() may + # have different column orderings and need match()-based reordering. + phen_sources <- phenotypes[["source_file"]] + predictor_reorder <- stats::setNames( + vector("list", length(attached_scalars)), + attached_scalars + ) + + for (sname in attached_scalars) { + s_sources <- sources(modelarray)[[sname]] + + if (identical(s_sources, phen_sources)) { + # Already aligned (typical for the response scalar, or when all + # scalars share the same source order after mergeModelArrays) + predictor_reorder[[sname]] <- NULL + } else { + # Validate that the sets match (once, not per element) + if (!(all(s_sources %in% phen_sources) && + all(phen_sources %in% s_sources))) { + stop( + "The source files for scalar '", sname, + "' do not match phenotypes$source_file." + ) + } + predictor_reorder[[sname]] <- match(phen_sources, s_sources) + } + } + + list( + modelarray = modelarray, + phenotypes = phenotypes, + scalar = scalar, + all_scalar_names = all_scalar_names, + attached_scalars = attached_scalars, + predictor_reorder = predictor_reorder + ) +} + + +# Level 2: Model-specific context builders that add formula-derived +# information on top of the base context. + +#' Build context for element-wise linear model fitting +#' +#' Extends the base context with formula parsing results: the LHS variable +#' name, RHS variable names, and which RHS variables are scalars (as opposed +#' to phenotype columns). All of this is invariant across elements. +#' +#' @param formula A \code{\link[stats]{formula}} to be passed to +#' \code{\link[stats]{lm}}. +#' @param modelarray A \linkS4class{ModelArray} object. +#' @param phenotypes A data.frame, already aligned via +#' \code{.align_phenotypes()}. +#' @param scalar Character. The response scalar name. +#' @return A named list inheriting all components from +#' \code{.build_base_context()} plus: +#' \describe{ +#' \item{formula}{The model formula.} +#' \item{lhs_name}{Character. The response variable name from the +#' formula LHS.} +#' \item{rhs_vars}{Character vector. All variable names on the RHS.} +#' \item{scalar_predictors}{Character vector. RHS variables that are +#' scalar names in the ModelArray (i.e., cross-scalar predictors).} +#' } +#' @noRd +.build_lm_context <- function(formula, modelarray, phenotypes, scalar) { + # Formula parsing — currently repeated per element inside + # analyseOneElement.lm + all_vars <- all.vars(formula) + lhs_name <- tryCatch( + as.character(formula[[2]]), + error = function(e) NULL + ) + rhs_vars <- setdiff(all_vars, lhs_name) + + # Detect which RHS variables are scalars in the ModelArray + all_scalar_names <- names(scalars(modelarray)) + scalar_predictors <- intersect(rhs_vars, all_scalar_names) + + # The scalars to attach: response + any scalar predictors + scalar_subset <- unique(c(scalar, scalar_predictors)) + + # Build the base context with only the needed scalars + base_ctx <- .build_base_context( + modelarray = modelarray, + phenotypes = phenotypes, + scalar = scalar, + scalar_subset = scalar_subset + ) + + # Extend with formula-specific fields + base_ctx$formula <- formula + base_ctx$lhs_name <- lhs_name + base_ctx$rhs_vars <- rhs_vars + base_ctx$scalar_predictors <- scalar_predictors + base_ctx +} + + +#' Build context for element-wise GAM fitting +#' +#' Extends the base context with formula parsing results and GAM-specific +#' formula validation that currently runs inside the ModelArray.gam() +#' preamble but whose results are never passed to the per-element function. +#' +#' @param formula A \code{\link[stats]{formula}} to be passed to +#' \code{\link[mgcv]{gam}}. +#' @param modelarray A \linkS4class{ModelArray} object. +#' @param phenotypes A data.frame, already aligned via +#' \code{.align_phenotypes()}. +#' @param scalar Character. The response scalar name. +#' @return A named list inheriting all components from +#' \code{.build_lm_context()} (which itself inherits from +#' \code{.build_base_context()}) plus: +#' \describe{ +#' \item{gam_formula_breakdown}{The result of +#' \code{mgcv::interpret.gam(formula)}, cached for reuse.} +#' } +#' @noRd +.build_gam_context <- function(formula, modelarray, phenotypes, scalar) { + # GAM formula validation — currently runs in ModelArray.gam() preamble + # but the breakdown result is discarded. We cache it here. + gam_breakdown <- tryCatch( + mgcv::interpret.gam(formula), + error = function(cond) { + stop("The formula is not valid for mgcv::gam()! Please check and revise.") + } + ) + + # The formula structure is the same as lm for variable detection purposes + ctx <- .build_lm_context(formula, modelarray, phenotypes, scalar) + + # Add GAM-specific cached data + ctx$gam_formula_breakdown <- gam_breakdown + + ctx +} + + +#' Build context for element-wise user-supplied function execution +#' +#' Uses the base context in "attach all scalars" mode since +#' \code{analyseOneElement.wrap} attaches every scalar in the ModelArray +#' to the per-element data.frame, not just formula-referenced ones. +#' +#' @param modelarray A \linkS4class{ModelArray} object. +#' @param phenotypes A data.frame, already aligned via +#' \code{.align_phenotypes()}. +#' @param scalar Character. The primary scalar name (used for the initial +#' validity check). +#' @return A named list: the base context with \code{scalar_subset = NULL} +#' (all scalars attached). +#' @noRd +.build_wrap_context <- function(modelarray, phenotypes, scalar) { + # scalar_subset = NULL triggers "attach all" mode in .build_base_context() + ctx <- .build_base_context( + modelarray = modelarray, + phenotypes = phenotypes, + scalar = scalar, + scalar_subset = NULL + ) + + ctx +} + + +# Level 3: Shared per-element data assembly helper. +# Extracts scalar values for one element using the precomputed context, +# builds the validity mask, and returns the filtered data.frame. +# This replaces duplicated logic across analyseOneElement.lm, +# analyseOneElement.gam, and analyseOneElement.wrap. + +#' Assemble per-element data.frame from precomputed context +#' +#' Reads scalar rows from the ModelArray, applies precomputed reorder +#' indices, builds the intersection validity mask, and returns the +#' filtered data.frame ready for model fitting or user function execution. +#' +#' @param i_element Integer. 1-based element index. +#' @param ctx A context list from one of the \code{.build_*_context()} +#' functions. +#' @param num.subj.lthr Numeric. Minimum number of subjects with finite +#' values required. +#' @return A list with components: +#' \describe{ +#' \item{dat}{A data.frame: the filtered phenotypes with scalar columns +#' attached. \code{NULL} if the element has insufficient valid +#' subjects.} +#' \item{sufficient}{Logical. Whether the element passed the subject +#' threshold.} +#' \item{num_valid}{Integer. Number of subjects with finite values +#' across all attached scalars.} +#' } +#' @noRd +.assemble_element_data <- function(i_element, ctx, num.subj.lthr) { + # Read the response scalar row — the only mandatory per-element I/O + response_vals <- scalars(ctx$modelarray)[[ctx$scalar]][i_element, ] + + # Start the validity mask with the response scalar + masks <- list(is.finite(response_vals)) + + # Read and reorder additional attached scalars + scalar_values <- list() + scalar_values[[ctx$scalar]] <- response_vals + + other_scalars <- setdiff(ctx$attached_scalars, ctx$scalar) + for (sname in other_scalars) { + s_vals <- scalars(ctx$modelarray)[[sname]][i_element, ] + + # Apply precomputed reorder index (or use as-is if NULL) + reorder_idx <- ctx$predictor_reorder[[sname]] + if (!is.null(reorder_idx)) { + s_vals <- s_vals[reorder_idx] + } + + scalar_values[[sname]] <- s_vals + masks[[length(masks) + 1L]] <- is.finite(s_vals) + } + + # Intersection mask across all scalars + valid_mask <- Reduce("&", masks) + num_valid <- sum(valid_mask) + + if (!(num_valid > num.subj.lthr)) { + return(list(dat = NULL, sufficient = FALSE, num_valid = num_valid)) + } + + # Build filtered data.frame + dat <- ctx$phenotypes[valid_mask, , drop = FALSE] + for (sname in ctx$attached_scalars) { + dat[[sname]] <- scalar_values[[sname]][valid_mask] + } + + list(dat = dat, sufficient = TRUE, num_valid = num_valid) +} + +# Find initiator element ---- #' Find a valid initiator element by searching middle → forward → backward #' #' @param analyse_one_fn The per-element analysis function @@ -186,6 +499,7 @@ } +# Parallelization ---- #' Dispatch parallel/serial apply with optional progress bar #' #' @param element.subset Integer vector of element indices @@ -236,7 +550,7 @@ fits } - +# P-value correction ---- #' Correct p-values for a set of terms and append corrected columns #' #' @param df_out Data.frame of results @@ -264,7 +578,7 @@ df_out } - +# Streaming writes ---- #' Initialize an incremental writer for /results datasets #' @noRd .init_results_stream_writer <- function(write_results_name, diff --git a/R/analyse.R b/R/analyse.R index 9cd8881..46a2106 100644 --- a/R/analyse.R +++ b/R/analyse.R @@ -132,10 +132,12 @@ #' @rdname ModelArray.lm #' @export -ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NULL, full.outputs = FALSE, +ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NULL, + full.outputs = FALSE, var.terms = c("estimate", "statistic", "p.value"), var.model = c("adj.r.squared", "p.value"), - correct.p.value.terms = c("fdr"), correct.p.value.model = c("fdr"), + correct.p.value.terms = c("fdr"), + correct.p.value.model = c("fdr"), num.subj.lthr.abs = 10, num.subj.lthr.rel = 0.2, verbose = TRUE, pbar = TRUE, n_cores = 1, on_error = "stop", @@ -146,83 +148,47 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU write_results_compression_level = 4L, return_output = TRUE, ...) { + # Validation ---- .validate_modelarray_input(data) element.subset <- .validate_element_subset(element.subset, data, scalar) phenotypes <- .align_phenotypes(data, phenotypes, scalar) - - ### display additional arguments: + # Display additional arguments ---- dots <- list(...) dots_names <- names(dots) - FUN <- stats::lm - - # subset: m1 <- "no default" printAdditionalArgu(FUN, "subset", dots, m1) - - # weights: m1 <- "no default" - if ("weights" %in% dots_names) { # if user provides weights - m_usr_input <- paste0( - class(dots$weights), - " with length of ", - length(dots$weights) - ) # message describing usr's input; cannot use dim on c(1,2) + if ("weights" %in% dots_names) { + m_usr_input <- paste0(class(dots$weights), " with length of ", length(dots$weights)) } else { m_usr_input <- NULL } printAdditionalArgu(FUN, "weights", dots, m1, m_usr_input) - - # na.action: m1 <- "no default" printAdditionalArgu(FUN, "na.action", dots, m1) - - # method: - printAdditionalArgu(FUN, "method", dots) # default: "qr" - - # model: - m1 <- invisible(eval(formals(FUN)[["model"]])) %>% as.character() # default: [logical] TRUE + printAdditionalArgu(FUN, "method", dots) + m1 <- invisible(eval(formals(FUN)[["model"]])) %>% as.character() printAdditionalArgu(FUN, "model", dots, m1) - - # x: - m1 <- invisible(eval(formals(FUN)[["x"]])) %>% as.character() # default: [logical] FALSE + m1 <- invisible(eval(formals(FUN)[["x"]])) %>% as.character() printAdditionalArgu(FUN, "x", dots, m1) - # y: - m1 <- invisible(eval(formals(FUN)[["y"]])) %>% as.character() # default: [logical] FALSE - printAdditionalArgu(FUN, "y", dots, m1) - - # qr: - m1 <- invisible(eval(formals(FUN)[["qr"]])) %>% as.character() # default: [logical] TRUE - printAdditionalArgu(FUN, "qr", dots, m1) - - # singular.ok: - m1 <- invisible(eval(formals(FUN)[["singular.ok"]])) %>% as.character() # default: [logical] TRUE - printAdditionalArgu(FUN, "singular.ok", dots, m1) - - # contrasts: - printAdditionalArgu(FUN, "contrasts", dots) # default: NULL - - # offset: - m1 <- "no default" # there is no default - printAdditionalArgu(FUN, "offset", dots, m1) - - num.subj.lthr <- .compute_subject_threshold(phenotypes, num.subj.lthr.abs, num.subj.lthr.rel) - ### other setups: + # Full outputs override ---- var.terms.full <- c("estimate", "std.error", "statistic", "p.value") var.model.full <- c( "r.squared", "adj.r.squared", "sigma", "statistic", "p.value", "df", "logLik", "AIC", "BIC", "deviance", "df.residual", "nobs" ) - if (full.outputs == TRUE) { # full set of outputs + if (full.outputs == TRUE) { var.terms <- var.terms.full var.model <- var.model.full } + # check on validity of list of vars: - var.terms <- var.terms[!duplicated(var.terms)] # remove duplicated element(s) + var.terms <- var.terms[!duplicated(var.terms)] var.model <- var.model[!duplicated(var.model)] # check if all var.* are empty: @@ -242,29 +208,30 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU } } - - # check for p.value correction: - # check for terms: + # P-value correction checks ---- check_validity_correctPValue( correct.p.value.terms, "correct.p.value.terms", var.terms, "var.terms" ) - # check for model: check_validity_correctPValue( correct.p.value.model, "correct.p.value.model", var.model, "var.model" ) + # Build context ONCE before the loop ---- + ctx <- .build_lm_context(formula, data, phenotypes, scalar) - ### start the process: + # Start the model fitting ---- if (verbose) { message(glue::glue("Fitting element-wise linear models for {scalar}")) message(glue::glue("initiating....")) } num.elements.total <- numElementsTotal(modelarray = data, scalar_name = scalar) + + # Initiate using ctx ---- init_args <- list( - formula = formula, modelarray = data, phenotypes = phenotypes, scalar = scalar, + ctx = ctx, var.terms = var.terms, var.model = var.model, num.subj.lthr = num.subj.lthr, on_error = on_error, ... ) @@ -279,13 +246,16 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU message(glue::glue("looping across elements....")) } - need_term_correction <- (!all(correct.p.value.terms == "none")) && ("p.value" %in% var.terms) - need_model_correction <- (!all(correct.p.value.model == "none")) && ("p.value" %in% var.model) + need_term_correction <- (!all(correct.p.value.terms == "none")) && + ("p.value" %in% var.terms) + need_model_correction <- (!all(correct.p.value.model == "none")) && + ("p.value" %in% var.model) need_full_df <- return_output || need_term_correction || need_model_correction if (!return_output && (need_term_correction || need_model_correction)) { stop("return_output=FALSE is not supported when p-value correction is requested") } + # Initialize stream writer ---- writer <- .init_results_stream_writer( write_results_name = write_results_name, write_results_file = write_results_file, @@ -296,15 +266,19 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU compression_level = write_results_compression_level ) + # Start model looping ---- fits_all <- if (need_full_df) vector("list", length(element.subset)) else NULL chunk_size <- if (is.null(writer)) length(element.subset) else as.integer(write_results_flush_every) chunk_starts <- seq(1L, length(element.subset), by = chunk_size) + for (chunk_start in chunk_starts) { chunk_end <- min(chunk_start + chunk_size - 1L, length(element.subset)) chunk_elements <- element.subset[chunk_start:chunk_end] - chunk_fits <- .parallel_dispatch(chunk_elements, analyseOneElement.lm, n_cores, pbar, - formula = formula, modelarray = data, phenotypes = phenotypes, scalar = scalar, + ## Main loop now passes ctx instead of formula/modelarray/phenotypes/scalar + chunk_fits <- .parallel_dispatch( + chunk_elements, analyseOneElement.lm, n_cores, pbar, + ctx = ctx, var.terms = var.terms, var.model = var.model, num.subj.lthr = num.subj.lthr, num.stat.output = num.stat.output, flag_initiate = FALSE, on_error = on_error, @@ -333,7 +307,7 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU df_out <- .correct_pvalues(df_out, list.terms, correct.p.value.terms, var.terms) df_out <- .correct_pvalues(df_out, "model", correct.p.value.model, var.model) - # Streamed blocks were uncorrected; rewrite final corrected results if needed. + # Rewrite corrected results to HDF5 if needed ---- if (!is.null(writer) && (need_term_correction || need_model_correction)) { writeResults( fn.output = write_results_file, @@ -350,8 +324,54 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU } #' Fit element-wise generalized additive models -#' no model-level p-value for GAMs, so there is no -#' \code{correct.p.value.model} argument. +#' +#' +#' @description +#' `ModelArray.gam` fits a generalized additive model at each requested +#' element in a \linkS4class{ModelArray} and returns a tibble of requested +#' model statistics. There is no model-level p-value for GAMs, so there is +#' no \code{correct.p.value.model} argument. +#' +#' @details +#' You may request returning specific statistical variables by setting +#' \code{var.*}, or you can get all by setting \code{full.outputs = TRUE}. +#' Note that statistics covered by \code{full.outputs} or \code{var.*} are +#' the ones from \code{broom::tidy()}, \code{broom::glance()}, and +#' \code{summary.gam()} only, and do not include corrected p-values. +#' However FDR-corrected p-values (\code{"fdr"}) are generated by default. +#' +#' List of acceptable statistic names for each of \code{var.*}: +#' \itemize{ +#' \item \code{var.smoothTerms}: \code{c("edf", "ref.df", "statistic", +#' "p.value")}; From \code{broom::tidy(parametric = FALSE)}. +#' \item \code{var.parametricTerms}: \code{c("estimate", "std.error", +#' "statistic", "p.value")}; From \code{broom::tidy(parametric = TRUE)}. +#' \item \code{var.model}: \code{c("adj.r.squared", "dev.expl", +#' "sp.criterion", "scale", "df", "logLik", "AIC", "BIC", "deviance", +#' "df.residual", "nobs")}; From \code{broom::glance()} and +#' \code{\link[mgcv]{summary.gam}}. +#' } +#' +#' Smooth term names in the output are normalized: \code{s(age)} becomes +#' \code{s_age}, \code{ti(x,z)} becomes \code{ti_x_z}, and +#' \code{s(x):oFactor} becomes \code{s_x_BYoFactor}. +#' +#' For p-value corrections (arguments \code{correct.p.value.*}), supported +#' methods include all methods in \code{p.adjust.methods} except +#' \code{"none"}. You can request more than one method. FDR-corrected +#' p-values (\code{"fdr"}) are calculated by default. Turn it off by +#' setting to \code{"none"}. +#' +#' When \code{changed.rsq.term.index} is provided, a reduced model (dropping +#' the specified term) is fit at each element to compute delta adjusted +#' R-squared and partial R-squared. This approximately doubles execution +#' time per requested term. The term index refers to the position on the +#' right-hand side of \code{formula} (use \code{labels(terms(formula))} to +#' see the ordering). +#' +#' Arguments \code{num.subj.lthr.abs} and \code{num.subj.lthr.rel} are +#' mainly for input data with subject-specific masks, i.e. currently only +#' for volume data. For fixel-wise data, you may ignore these arguments. #' #' @inheritParams ModelArray.lm #' @@ -362,21 +382,29 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU #' parametric terms, from \code{broom::tidy(parametric = TRUE)}. #' See Details. #' @param var.model Character vector. Statistics to save for the overall -#' model, from \code{broom::glance()} and \code{summary()}. See Details. +#' model, from \code{broom::glance()} and +#' \code{\link[mgcv]{summary.gam}}. See Details. #' @param changed.rsq.term.index A list of positive integers. Each value #' is the index of a term on the right-hand side of \code{formula} for #' which delta adjusted R-squared and partial R-squared should be #' computed. Usually the term of interest is a smooth term or interaction -#' term. Default \code{NULL} (not computed). See Details for warnings. +#' term. Default \code{NULL} (not computed). See Details. #' @param correct.p.value.smoothTerms Character vector. P-value #' correction method(s) for each smooth term. Default: \code{"fdr"}. +#' See Details. #' @param correct.p.value.parametricTerms Character vector. P-value #' correction method(s) for each parametric term. Default: \code{"fdr"}. -#' @param ... Additional arguments passed to \code{\link[mgcv]{gam}}. +#' See Details. +#' @param ... Additional arguments passed to \code{\link[mgcv]{gam}} +#' (e.g. \code{method = "REML"}). #' -#' @return A tibble with one row per element. The first column is +#' @return A data.frame with one row per element. The first column is #' \code{element_id} (0-based). Remaining columns contain the requested -#' statistics, named as \code{.}. If +#' statistics, named as \code{.} for per-term statistics +#' and \code{model.} for model-level statistics. Smooth term +#' names are normalized (e.g. \code{s_age.statistic}). If p-value +#' corrections were requested, additional columns are appended with the +#' correction method as suffix (e.g. \code{s_age.p.value.fdr}). If #' \code{changed.rsq.term.index} was requested, additional columns #' \code{.delta.adj.rsq} and \code{.partial.rsq} are #' appended. @@ -385,13 +413,15 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU #' \code{\link{ModelArray.wrap}} for user-supplied functions, #' \code{\link{gen_gamFormula_fxSmooth}} and #' \code{\link{gen_gamFormula_contIx}} for formula helpers, -#' \linkS4class{ModelArray} for the input class. +#' \linkS4class{ModelArray} for the input class, +#' \code{\link{ModelArray}} for the constructor, +#' \code{\link{exampleElementData}} for testing formulas on a single +#' element. +#' #' #' @examples{ #' \dontrun{ -#' ma <- ModelArray("path/to/data.h5", scalar_types = c("FD")) -#' phenotypes <- read.csv("cohort.csv") -#' +#' # Fit GAM with default outputs #' results <- ModelArray.gam( #' FD ~ s(age, fx = TRUE) + sex, #' data = ma, @@ -408,13 +438,25 @@ ModelArray.lm <- function(formula, data, phenotypes, scalar, element.subset = NU #' scalar = "FD", #' changed.rsq.term.index = list(1) #' ) +#' +#' # Full outputs, no p-value correction +#' results_full <- ModelArray.gam( +#' FD ~ s(age, fx = TRUE) + sex, +#' data = ma, +#' phenotypes = phenotypes, +#' scalar = "FD", +#' full.outputs = TRUE, +#' correct.p.value.smoothTerms = "none", +#' correct.p.value.parametricTerms = "none" +#' ) #' } #' } #' #' @rdname ModelArray.gam #' @export -ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = NULL, full.outputs = FALSE, +ModelArray.gam <- function(formula, data, phenotypes, scalar, + element.subset = NULL, full.outputs = FALSE, var.smoothTerms = c("statistic", "p.value"), var.parametricTerms = c("estimate", "statistic", "p.value"), var.model = c("dev.expl"), @@ -431,62 +473,46 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = N write_results_compression_level = 4L, return_output = TRUE, ...) { + # Validation ---- .validate_modelarray_input(data) element.subset <- .validate_element_subset(element.subset, data, scalar) + phenotypes <- .align_phenotypes(data, phenotypes, scalar) - # check if the formula is valid in terms of mgcv::gam() - tryCatch( - { - gam.formula.breakdown <- mgcv::interpret.gam(formula) - }, - error = function(cond) { - stop(paste0("The formula is not valid for mgcv::gam()! Please check and revise.")) - } - ) - - checker_gam_formula(formula, gam.formula.breakdown) + # Build context ---- + ctx <- .build_gam_context(formula, data, phenotypes, scalar) - phenotypes <- .align_phenotypes(data, phenotypes, scalar) + # Use the cached breakdown for the formula checker display ---- + checker_gam_formula(formula, ctx$gam_formula_breakdown) - ### display additional arguments: [only important one] + # Display additional arguments ---- dots <- list(...) dots_names <- names(dots) - FUN <- mgcv::gam - - # method: (default: "GCV.Cp") - printAdditionalArgu(FUN, "method", dots) # default: "GCV.Cp" - - # TODO: optional: check if fx=FALSE; if so, add edf to the list of var + warning: fx=TRUE is recommended - + printAdditionalArgu(FUN, "method", dots) num.subj.lthr <- .compute_subject_threshold(phenotypes, num.subj.lthr.abs, num.subj.lthr.rel) - ### when full.outputs = TRUE: + # Full outputs override ---- var.smoothTerms.full <- c("edf", "ref.df", "statistic", "p.value") var.parametricTerms.full <- c("estimate", "std.error", "statistic", "p.value") var.model.full <- c( "adj.r.squared", "dev.expl", "sp.criterion", "scale", "df", "logLik", "AIC", "BIC", "deviance", "df.residual", "nobs" ) - - if (full.outputs == TRUE) { # full set of outputs + if (full.outputs == TRUE) { var.smoothTerms <- var.smoothTerms.full var.parametricTerms <- var.parametricTerms.full var.model <- var.model.full } - ### check on validity of arguments: var.term and var.model - var.smoothTerms <- var.smoothTerms[!duplicated(var.smoothTerms)] # remove duplicated element(s) + var.smoothTerms <- var.smoothTerms[!duplicated(var.smoothTerms)] var.parametricTerms <- var.parametricTerms[!duplicated(var.parametricTerms)] var.model <- var.model[!duplicated(var.model)] - # check if all var.* are empty: if (length(var.smoothTerms) == 0 && length(var.parametricTerms) == 0 && length(var.model) == 0) { stop("All var.* arguments [var.smoothTerms, var.parametricTerms, var.model] are empty!") } - # check if every var is valid: for (var in var.smoothTerms) { if (!(var %in% var.smoothTerms.full)) { stop(paste0(var, " is not valid for var.smoothTerms!")) @@ -503,86 +529,100 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = N } } - + # Changed.rsq setup ---- + # Changed.rsq setup ---- var.model.orig <- var.model - if (!is.null(changed.rsq.term.index)) { # changed.rsq is not null --> requested + if (!is.null(changed.rsq.term.index)) { - # check if the term index is valid: - if (min(changed.rsq.term.index) <= 0) { - # any of not positive | can't really check if it's integer as is.integer(1) is FALSE... + # Validation checks + if (min(unlist(changed.rsq.term.index)) <= 0) { stop( - paste0( - "There is element(s) in changed.rsq.term.index <= 0. ", - "It should be a (list of) positive integer!" - ) + "There is element(s) in changed.rsq.term.index <= 0. ", + "It should be a (list of) positive integer!" ) } terms.full.formula <- stats::terms(formula, keep.order = TRUE) - # not the re-order the terms | see: https://rdrr.io/r/stats/terms.formula.html - if (max(changed.rsq.term.index) > length(labels(terms.full.formula))) { - # if max is more than the number of terms on RHS of formula + + if (max(unlist(changed.rsq.term.index)) > length(labels(terms.full.formula))) { stop( - paste0( - "Largest index in changed.rsq.term.index is more than the term number on the ", - "right hand side of formula!" - ) + "Largest index in changed.rsq.term.index is more than the term number on the ", + "right hand side of formula!" ) } - # check how many variables on RHS; if no (but intercept, i.e. xx ~ 1), stop if (length(labels(terms.full.formula)) == 0) { stop( - paste0( - "Trying to analyze changed.rsq but there is no variable (except intercept 1) ", - "on right hand side of formula! Please provide at least one valid variable." + "Trying to analyze changed.rsq but there is no variable (except intercept 1) ", + "on right hand side of formula! Please provide at least one valid variable." + ) + } + + changed.rsq.term.fullFormat.list <- labels(terms.full.formula)[unlist(changed.rsq.term.index)] + + # Compute short format names: s(age) -> s_age, ti(x,z) -> ti_x_z, etc. + changed.rsq.term.shortFormat.list <- list() + for (changed.rsq.term.fullFormat in changed.rsq.term.fullFormat.list) { + temp <- strsplit(changed.rsq.term.fullFormat, "[(]")[[1]] + if (length(temp) == 1) { + # Not a smooth term — no parentheses + str_valid <- changed.rsq.term.fullFormat + } else { + smooth.class <- temp[1] + theEval <- eval(parse(text = changed.rsq.term.fullFormat)) + str_valid <- paste0( + smooth.class, "_", + paste(theEval$term, collapse = "_") ) + if (theEval$by != "NA") { + str_valid <- paste0(str_valid, "_BY", theEval$by) + } + } + changed.rsq.term.shortFormat.list <- append( + changed.rsq.term.shortFormat.list, str_valid ) } - # print warning: message( - paste0( - "will get changed R-squared (delta.adj.rsq and partial.rsq) so the execution time ", - "will be longer." - ) + "will get changed R-squared (delta.adj.rsq and partial.rsq) ", + "so the execution time will be longer." ) - # add adj.r.squared into var.model + if (!("adj.r.squared" %in% var.model)) { var.model <- c(var.model, "adj.r.squared") } - - # also should return model.sse (for both full and reduced model): flag_sse <- TRUE - } else { # changed.rsq is null --> not requested - flag_sse <- FALSE # no need to calculate sse for the model + } else { + flag_sse <- FALSE } - ### check on arguments: p-values correction methods - # check for smoothTerms: + # p-value correction checks ---- check_validity_correctPValue( correct.p.value.smoothTerms, "correct.p.value.smoothTerms", var.smoothTerms, "var.smoothTerms" ) - # check for parametricTerms: check_validity_correctPValue( correct.p.value.parametricTerms, "correct.p.value.parametricTerms", var.parametricTerms, "var.parametricTerms" ) - - ### run + # Start the model fitting ---- if (verbose) { message(glue::glue("Fitting element-wise GAMs for {scalar}")) message(glue::glue("initiating....")) } num.elements.total <- numElementsTotal(modelarray = data, scalar_name = scalar) + + # Initiate using ctx ---- init_args <- list( - formula = formula, modelarray = data, phenotypes = phenotypes, scalar = scalar, - var.smoothTerms = var.smoothTerms, var.parametricTerms = var.parametricTerms, + ctx = ctx, + var.smoothTerms = var.smoothTerms, + var.parametricTerms = var.parametricTerms, var.model = var.model, - num.subj.lthr = num.subj.lthr, flag_sse = flag_sse, on_error = on_error, ... + num.subj.lthr = num.subj.lthr, + flag_sse = flag_sse, + on_error = on_error, ... ) initiator <- .find_initiator_element( analyseOneElement.gam, num.elements.total, init_args, verbose @@ -607,6 +647,7 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = N stop("return_output=FALSE is not supported when p-value correction or changed.rsq is requested") } + # Initialize stream writer ---- writer <- .init_results_stream_writer( write_results_name = write_results_name, write_results_file = write_results_file, @@ -617,20 +658,26 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = N compression_level = write_results_compression_level ) + # Start model looping ---- fits_all <- if (need_full_df) vector("list", length(element.subset)) else NULL chunk_size <- if (is.null(writer)) length(element.subset) else as.integer(write_results_flush_every) chunk_starts <- seq(1L, length(element.subset), by = chunk_size) + for (chunk_start in chunk_starts) { chunk_end <- min(chunk_start + chunk_size - 1L, length(element.subset)) chunk_elements <- element.subset[chunk_start:chunk_end] - chunk_fits <- .parallel_dispatch(chunk_elements, analyseOneElement.gam, n_cores, pbar, - formula = formula, modelarray = data, phenotypes = phenotypes, scalar = scalar, - var.smoothTerms = var.smoothTerms, var.parametricTerms = var.parametricTerms, + # Main loop passes ctx + chunk_fits <- .parallel_dispatch( + chunk_elements, analyseOneElement.gam, n_cores, pbar, + ctx = ctx, + var.smoothTerms = var.smoothTerms, + var.parametricTerms = var.parametricTerms, var.model = var.model, - num.subj.lthr = num.subj.lthr, num.stat.output = num.stat.output, - flag_initiate = FALSE, flag_sse = flag_sse, on_error = on_error, - ... + num.subj.lthr = num.subj.lthr, + num.stat.output = num.stat.output, + flag_initiate = FALSE, flag_sse = flag_sse, + on_error = on_error, ... ) if (need_full_df) { @@ -652,79 +699,47 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = N df_out <- as.data.frame(df_out) colnames(df_out) <- column_names + # P-value corrections ---- + df_out <- .correct_pvalues(df_out, list.smoothTerms, correct.p.value.smoothTerms, var.smoothTerms) + df_out <- .correct_pvalues(df_out, list.parametricTerms, correct.p.value.parametricTerms, var.parametricTerms) - ### get the changed.rsq for smooth terms: - if (!is.null(changed.rsq.term.index)) { # if changed.rsq is requested - message("Getting delta R-squared: running the reduced model...") - - # list of term of interest for changed.rsq: - changed.rsq.term.fullFormat.list <- labels(terms.full.formula)[changed.rsq.term.index] - # the term for changed.rsq, in full format - # get the short version: - changed.rsq.term.shortFormat.list <- list() - for (changed.rsq.term.fullFormat in changed.rsq.term.fullFormat.list) { - temp <- strsplit(changed.rsq.term.fullFormat, "[(]")[[1]] - if (length(temp) == 1) { # it's not a smooth term - as there is no () - str_valid <- changed.rsq.term.fullFormat - } else { - smooth.class <- temp[1] - - theEval <- eval(parse(text = changed.rsq.term.fullFormat)) - str_valid <- paste0( - smooth.class, "_", - paste(theEval$term, collapse = "_") - ) # ti(x,z) --> ti_x_z; s(x) --> s_x - if (theEval$by != "NA") { - str_valid <- paste0(str_valid, "_BY", theEval$by) # s(age,by=oSex) --> s_age_BYoSex - } - } - - changed.rsq.term.shortFormat <- str_valid - changed.rsq.term.shortFormat.list <- append(changed.rsq.term.shortFormat.list, changed.rsq.term.shortFormat) - } + # Compute changed R-squared ---- + if (need_changed_rsq) { + terms.full.formula <- stats::terms(formula, keep.order = TRUE) - # loop of each changed.rsq.term.index (i.e. each term of interest) - for (i.changed.rsq.term in seq_along(changed.rsq.term.fullFormat.list)) { - idx.changed.rsq.term <- changed.rsq.term.index[i.changed.rsq.term] # index + for (i.changed.rsq.term in seq_along(changed.rsq.term.index)) { + idx.changed.rsq.term <- changed.rsq.term.index[[i.changed.rsq.term]] changed.rsq.term.fullFormat <- changed.rsq.term.fullFormat.list[i.changed.rsq.term] - changed.rsq.term.shortFormat <- changed.rsq.term.shortFormat.list[[i.changed.rsq.term]][1] # it's nested + changed.rsq.term.shortFormat <- changed.rsq.term.shortFormat.list[[i.changed.rsq.term]][1] - # get the formula of reduced model - # check if there is only one term (after removing it in reduced model, - # there is no term but intercept in the formula...) if (length(labels(terms.full.formula)) == 1) { temp <- toString(formula) %>% strsplit("[, ]") reduced.formula <- stats::as.formula(paste0(temp[[1]][3], "~1")) } else { reduced.formula <- formula( - stats::drop.terms( - terms.full.formula, - idx.changed.rsq.term, - keep.response = TRUE - ) - ) # index on RHS of formula -> change to class of formula (stats::as.formula does not work) + stats::drop.terms(terms.full.formula, idx.changed.rsq.term, keep.response = TRUE) + ) } message( - paste0( - "* Getting changed R-squared for term: ", - changed.rsq.term.fullFormat, - " via reduced model as below", - "; will show up as ", - changed.rsq.term.shortFormat, - " in final dataframe" - ) + "* Getting changed R-squared for term: ", + changed.rsq.term.fullFormat, + " via reduced model; will show up as ", + changed.rsq.term.shortFormat, + " in final dataframe" ) print(reduced.formula) - # var* for reduced model: only adjusted r sq is enough - # initiate: + ## Build a separate context for the reduced model ---- + reduced_ctx <- .build_gam_context(reduced.formula, data, phenotypes, scalar) + reduced.model.outputs_initiator <- analyseOneElement.gam( - i_element = i_element_success_initiate, # directly use the i_element with known success - reduced.formula, data, phenotypes, scalar, - var.smoothTerms = c(), var.parametricTerms = c(), var.model = c("adj.r.squared"), + i_element = i_element_success_initiate, + ctx = reduced_ctx, + var.smoothTerms = c(), var.parametricTerms = c(), + var.model = c("adj.r.squared"), num.subj.lthr = num.subj.lthr, num.stat.output = NULL, - flag_initiate = TRUE, flag_sse = TRUE, # also to get model.sse + flag_initiate = TRUE, flag_sse = TRUE, ... ) reduced.model.column_names <- reduced.model.outputs_initiator$column_names @@ -732,8 +747,7 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = N reduced.model.fits <- .parallel_dispatch( element.subset, analyseOneElement.gam, n_cores, pbar, - formula = reduced.formula, modelarray = data, phenotypes = phenotypes, - scalar = scalar, + ctx = reduced_ctx, var.smoothTerms = c(), var.parametricTerms = c(), var.model = c("adj.r.squared"), num.subj.lthr = num.subj.lthr, @@ -746,47 +760,28 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = N reduced.model.df_out <- as.data.frame(reduced.model.df_out) colnames(reduced.model.df_out) <- reduced.model.column_names - # rename "adj.r.squared" as "redModel.adj.r.squared" before merging into df_out - names(reduced.model.df_out)[names(reduced.model.df_out) == "model.adj.r.squared"] <- "redModel.adj.r.squared" - # rename "model.sse" as "redModel.sse" before merging into df_out - names(reduced.model.df_out)[names(reduced.model.df_out) == "model.sse"] <- "redModel.sse" - - # combine new df_out to original one: - df_out <- merge(df_out, reduced.model.df_out, by = "element_id") - - # calculate the delta.adj.rsq, add to the df_out: - col_name <- paste0(changed.rsq.term.shortFormat, ".delta.adj.rsq") - df_out[[col_name]] <- df_out[["model.adj.r.squared"]] - df_out[["redModel.adj.r.squared"]] - - # calculate the partial.rsq, add to the df_out - # partialRsq <- (sse.red - sse.full) / sse.red - col_name <- paste0(changed.rsq.term.shortFormat, ".partial.rsq") - df_out[[col_name]] <- (df_out[["redModel.sse"]] - df_out[["model.sse"]]) / df_out[["redModel.sse"]] + ## Compute delta adj R-sq and partial R-sq ---- + delta_col <- paste0(changed.rsq.term.shortFormat, ".delta.adj.rsq") + partial_col <- paste0(changed.rsq.term.shortFormat, ".partial.rsq") - # remove column of redModel - df_out <- df_out %>% subset(select = -c( - redModel.adj.r.squared, - redModel.sse - )) - } # end of for loop across term of interest for changed.rsq + df_out[[delta_col]] <- df_out[["model.adj.r.squared"]] - + reduced.model.df_out[["model.adj.r.squared"]] + full_sse <- df_out[["model.sse"]] + reduced_sse <- reduced.model.df_out[["model.sse"]] + df_out[[partial_col]] <- (reduced_sse - full_sse) / reduced_sse + } # if adjusted r sq is not requested (see var.model.orig), remove it: if (!("adj.r.squared" %in% var.model.orig)) { - df_out <- df_out %>% subset(select = -c(model.adj.r.squared)) + df_out <- df_out[, colnames(df_out) != "model.adj.r.squared", drop = FALSE] } # remove full model's sse (model.sse): - df_out <- df_out %>% subset(select = -c(model.sse)) - } # end of if: requesting changed.rsq - - - ### correct p values - df_out <- .correct_pvalues(df_out, list.smoothTerms, correct.p.value.smoothTerms, var.smoothTerms) - df_out <- .correct_pvalues( - df_out, list.parametricTerms, correct.p.value.parametricTerms, var.parametricTerms - ) + df_out <- df_out[, colnames(df_out) != "model.sse", drop = FALSE] + } + # Rewrite if corrections applied and streaming was used ---- if (!is.null(writer) && (need_smooth_correction || need_param_correction || need_changed_rsq)) { writeResults( fn.output = write_results_file, @@ -796,14 +791,12 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = N ) } - ### return if (return_output) { return(df_out) } invisible(NULL) } - #' Run a user-supplied function for element-wise data #' #' @description @@ -859,9 +852,19 @@ ModelArray.gam <- function(formula, data, phenotypes, scalar, element.subset = N #' level for scalar writes. Default 4. #' @param ... Additional arguments forwarded to \code{FUN}. #' -#' @return A tibble with one row per element and first column -#' \code{element_id} (0-based). Remaining columns are determined by -#' the return value of \code{FUN}. +#' @return If \code{flag_initiate = TRUE}, a list with one component: +#' \describe{ +#' \item{column_names}{Character vector. The column names derived from +#' the return value of \code{user_fun}, with \code{"element_id"} +#' prepended. For unnamed list or atomic returns, columns are named +#' \code{v1}, \code{v2}, etc. Set to \code{NaN} if the element was +#' skipped or errored.} +#' } +#' If \code{flag_initiate = FALSE}, a numeric vector of length +#' \code{num.stat.output} with \code{element_id} (0-based) first and +#' the coerced output of \code{user_fun} in subsequent positions. +#' All-\code{NaN} (except \code{element_id}) if the element was skipped +#' or if an error occurred with \code{on_error = "skip"}. #' #' @seealso \code{\link{ModelArray.lm}} for linear models, #' \code{\link{ModelArray.gam}} for GAMs, @@ -921,21 +924,29 @@ ModelArray.wrap <- function(FUN, data, phenotypes, scalar, element.subset = NULL write_results_compression_level = 4L, return_output = TRUE, ...) { + # Validation ---- .validate_modelarray_input(data) element.subset <- .validate_element_subset(element.subset, data, scalar) phenotypes <- .align_phenotypes(data, phenotypes, scalar) num.subj.lthr <- .compute_subject_threshold(phenotypes, num.subj.lthr.abs, num.subj.lthr.rel) - ### initiate to get column names + # Build context ---- + ctx <- .build_wrap_context(data, phenotypes, scalar) + + # Start model fitting ---- if (verbose) { message(glue::glue("Running element-wise user function for {scalar}")) message(glue::glue("initiating....")) } num.elements.total <- numElementsTotal(modelarray = data, scalar_name = scalar) + + # Initiate using ctx ---- init_args <- list( - user_fun = FUN, modelarray = data, phenotypes = phenotypes, scalar = scalar, - num.subj.lthr = num.subj.lthr, on_error = on_error, ... + user_fun = FUN, + ctx = ctx, + num.subj.lthr = num.subj.lthr, + on_error = on_error, ... ) initiator <- .find_initiator_element( analyseOneElement.wrap, num.elements.total, init_args, verbose @@ -945,6 +956,7 @@ ModelArray.wrap <- function(FUN, data, phenotypes, scalar, element.subset = NULL if (verbose) message(glue::glue("looping across elements....")) + # Initialize stream writer ---- writer_results <- .init_results_stream_writer( write_results_name = write_results_name, write_results_file = write_results_file, @@ -975,36 +987,21 @@ ModelArray.wrap <- function(FUN, data, phenotypes, scalar, element.subset = NULL if (is.character(write_scalar_columns)) { missing_cols <- setdiff(write_scalar_columns, column_names) if (length(missing_cols) > 0L) { - stop( - "write_scalar_columns not found in wrap output: ", - paste(missing_cols, collapse = ", ") - ) + stop("write_scalar_columns not found in wrap output: ", paste(missing_cols, collapse = ", ")) } scalar_col_idx <- match(write_scalar_columns, column_names) - } else if (is.numeric(write_scalar_columns)) { - scalar_col_idx <- as.integer(write_scalar_columns) - if (any(scalar_col_idx < 1L) || any(scalar_col_idx > length(column_names))) { - stop("write_scalar_columns numeric indices are out of bounds") - } } else { - stop("write_scalar_columns must be NULL, character, or integer") - } - - scalar_col_names <- column_names[scalar_col_idx] - if (length(scalar_col_names) == 0L) { - stop("write_scalar_columns selected zero columns") + scalar_col_idx <- as.integer(write_scalar_columns) } if (is.null(write_scalar_column_names)) { - write_scalar_column_names <- as.character(phenotypes[["source_file"]]) - } else { - write_scalar_column_names <- as.character(write_scalar_column_names) + write_scalar_column_names <- column_names[scalar_col_idx] } if (length(write_scalar_column_names) != length(scalar_col_idx)) { stop("length(write_scalar_column_names) must equal number of selected write_scalar_columns") } - # Initialize scalar output dataset. + # Initialize scalar output dataset if (!file.exists(write_scalar_file)) { rhdf5::h5createFile(write_scalar_file) } @@ -1032,6 +1029,7 @@ ModelArray.wrap <- function(FUN, data, phenotypes, scalar, element.subset = NULL ) } + # Start model looping ---- fits_all <- if (return_output) vector("list", length(element.subset)) else NULL write_row_cursor <- 1L @@ -1043,13 +1041,18 @@ ModelArray.wrap <- function(FUN, data, phenotypes, scalar, element.subset = NULL chunk_size <- min(chunk_size, as.integer(write_results_flush_every)) } chunk_starts <- seq(1L, length(element.subset), by = chunk_size) + for (chunk_start in chunk_starts) { chunk_end <- min(chunk_start + chunk_size - 1L, length(element.subset)) chunk_elements <- element.subset[chunk_start:chunk_end] - chunk_fits <- .parallel_dispatch(chunk_elements, analyseOneElement.wrap, n_cores, pbar, - user_fun = FUN, modelarray = data, phenotypes = phenotypes, scalar = scalar, - num.subj.lthr = num.subj.lthr, num.stat.output = num.stat.output, + ## Main loop passes ctx ---- + chunk_fits <- .parallel_dispatch( + chunk_elements, analyseOneElement.wrap, n_cores, pbar, + user_fun = FUN, + ctx = ctx, + num.subj.lthr = num.subj.lthr, + num.stat.output = num.stat.output, flag_initiate = FALSE, on_error = on_error, ... ) @@ -1080,6 +1083,7 @@ ModelArray.wrap <- function(FUN, data, phenotypes, scalar, element.subset = NULL } } + # Finalize H5 writing ---- if (writing_scalar) { rhdf5::h5writeAttribute( attr = write_scalar_column_names, diff --git a/man/ModelArray.gam.Rd b/man/ModelArray.gam.Rd index 74fbe72..c662f91 100644 --- a/man/ModelArray.gam.Rd +++ b/man/ModelArray.gam.Rd @@ -2,9 +2,7 @@ % Please edit documentation in R/analyse.R \name{ModelArray.gam} \alias{ModelArray.gam} -\title{Fit element-wise generalized additive models -no model-level p-value for GAMs, so there is no -\code{correct.p.value.model} argument.} +\title{Fit element-wise generalized additive models} \usage{ ModelArray.gam( formula, @@ -63,19 +61,22 @@ parametric terms, from \code{broom::tidy(parametric = TRUE)}. See Details.} \item{var.model}{Character vector. Statistics to save for the overall -model, from \code{broom::glance()} and \code{summary()}. See Details.} +model, from \code{broom::glance()} and +\code{\link[mgcv]{summary.gam}}. See Details.} \item{changed.rsq.term.index}{A list of positive integers. Each value is the index of a term on the right-hand side of \code{formula} for which delta adjusted R-squared and partial R-squared should be computed. Usually the term of interest is a smooth term or interaction -term. Default \code{NULL} (not computed). See Details for warnings.} +term. Default \code{NULL} (not computed). See Details.} \item{correct.p.value.smoothTerms}{Character vector. P-value -correction method(s) for each smooth term. Default: \code{"fdr"}.} +correction method(s) for each smooth term. Default: \code{"fdr"}. +See Details.} \item{correct.p.value.parametricTerms}{Character vector. P-value -correction method(s) for each parametric term. Default: \code{"fdr"}.} +correction method(s) for each parametric term. Default: \code{"fdr"}. +See Details.} \item{num.subj.lthr.abs}{Integer. Lower threshold for the absolute number of subjects with finite scalar values (not \code{NaN}, @@ -123,27 +124,72 @@ level for HDF5 writes. Default 4.} combined data.frame. If \code{FALSE}, return \code{invisible(NULL)}; useful when writing large outputs directly to HDF5.} -\item{...}{Additional arguments passed to \code{\link[mgcv]{gam}}.} +\item{...}{Additional arguments passed to \code{\link[mgcv]{gam}} +(e.g. \code{method = "REML"}).} } \value{ -A tibble with one row per element. The first column is +A data.frame with one row per element. The first column is \code{element_id} (0-based). Remaining columns contain the requested -statistics, named as \code{.}. If +statistics, named as \code{.} for per-term statistics +and \code{model.} for model-level statistics. Smooth term +names are normalized (e.g. \code{s_age.statistic}). If p-value +corrections were requested, additional columns are appended with the +correction method as suffix (e.g. \code{s_age.p.value.fdr}). If \code{changed.rsq.term.index} was requested, additional columns \code{.delta.adj.rsq} and \code{.partial.rsq} are appended. } \description{ -Fit element-wise generalized additive models -no model-level p-value for GAMs, so there is no -\code{correct.p.value.model} argument. +\code{ModelArray.gam} fits a generalized additive model at each requested +element in a \linkS4class{ModelArray} and returns a tibble of requested +model statistics. There is no model-level p-value for GAMs, so there is +no \code{correct.p.value.model} argument. +} +\details{ +You may request returning specific statistical variables by setting +\code{var.*}, or you can get all by setting \code{full.outputs = TRUE}. +Note that statistics covered by \code{full.outputs} or \code{var.*} are +the ones from \code{broom::tidy()}, \code{broom::glance()}, and +\code{summary.gam()} only, and do not include corrected p-values. +However FDR-corrected p-values (\code{"fdr"}) are generated by default. + +List of acceptable statistic names for each of \code{var.*}: +\itemize{ +\item \code{var.smoothTerms}: \code{c("edf", "ref.df", "statistic", + "p.value")}; From \code{broom::tidy(parametric = FALSE)}. +\item \code{var.parametricTerms}: \code{c("estimate", "std.error", + "statistic", "p.value")}; From \code{broom::tidy(parametric = TRUE)}. +\item \code{var.model}: \code{c("adj.r.squared", "dev.expl", + "sp.criterion", "scale", "df", "logLik", "AIC", "BIC", "deviance", + "df.residual", "nobs")}; From \code{broom::glance()} and +\code{\link[mgcv]{summary.gam}}. +} + +Smooth term names in the output are normalized: \code{s(age)} becomes +\code{s_age}, \code{ti(x,z)} becomes \code{ti_x_z}, and +\code{s(x):oFactor} becomes \code{s_x_BYoFactor}. + +For p-value corrections (arguments \code{correct.p.value.*}), supported +methods include all methods in \code{p.adjust.methods} except +\code{"none"}. You can request more than one method. FDR-corrected +p-values (\code{"fdr"}) are calculated by default. Turn it off by +setting to \code{"none"}. + +When \code{changed.rsq.term.index} is provided, a reduced model (dropping +the specified term) is fit at each element to compute delta adjusted +R-squared and partial R-squared. This approximately doubles execution +time per requested term. The term index refers to the position on the +right-hand side of \code{formula} (use \code{labels(terms(formula))} to +see the ordering). + +Arguments \code{num.subj.lthr.abs} and \code{num.subj.lthr.rel} are +mainly for input data with subject-specific masks, i.e. currently only +for volume data. For fixel-wise data, you may ignore these arguments. } \examples{ { \dontrun{ -ma <- ModelArray("path/to/data.h5", scalar_types = c("FD")) -phenotypes <- read.csv("cohort.csv") - +# Fit GAM with default outputs results <- ModelArray.gam( FD ~ s(age, fx = TRUE) + sex, data = ma, @@ -160,6 +206,17 @@ results_rsq <- ModelArray.gam( scalar = "FD", changed.rsq.term.index = list(1) ) + +# Full outputs, no p-value correction +results_full <- ModelArray.gam( + FD ~ s(age, fx = TRUE) + sex, + data = ma, + phenotypes = phenotypes, + scalar = "FD", + full.outputs = TRUE, + correct.p.value.smoothTerms = "none", + correct.p.value.parametricTerms = "none" +) } } @@ -169,5 +226,8 @@ results_rsq <- ModelArray.gam( \code{\link{ModelArray.wrap}} for user-supplied functions, \code{\link{gen_gamFormula_fxSmooth}} and \code{\link{gen_gamFormula_contIx}} for formula helpers, -\linkS4class{ModelArray} for the input class. +\linkS4class{ModelArray} for the input class, +\code{\link{ModelArray}} for the constructor, +\code{\link{exampleElementData}} for testing formulas on a single +element. } diff --git a/man/ModelArray.wrap.Rd b/man/ModelArray.wrap.Rd index 10d0686..79e5103 100644 --- a/man/ModelArray.wrap.Rd +++ b/man/ModelArray.wrap.Rd @@ -125,9 +125,19 @@ useful when writing large outputs directly to HDF5.} \item{...}{Additional arguments forwarded to \code{FUN}.} } \value{ -A tibble with one row per element and first column -\code{element_id} (0-based). Remaining columns are determined by -the return value of \code{FUN}. +If \code{flag_initiate = TRUE}, a list with one component: +\describe{ +\item{column_names}{Character vector. The column names derived from +the return value of \code{user_fun}, with \code{"element_id"} +prepended. For unnamed list or atomic returns, columns are named +\code{v1}, \code{v2}, etc. Set to \code{NaN} if the element was +skipped or errored.} +} +If \code{flag_initiate = FALSE}, a numeric vector of length +\code{num.stat.output} with \code{element_id} (0-based) first and +the coerced output of \code{user_fun} in subsequent positions. +All-\code{NaN} (except \code{element_id}) if the element was skipped +or if an error occurred with \code{on_error = "skip"}. } \description{ \code{ModelArray.wrap} runs a user-supplied function \code{FUN} at each diff --git a/man/analyseOneElement.gam.Rd b/man/analyseOneElement.gam.Rd index 58733da..55fb7c1 100644 --- a/man/analyseOneElement.gam.Rd +++ b/man/analyseOneElement.gam.Rd @@ -2,27 +2,23 @@ % Please edit documentation in R/ModelArray_Constructor.R \name{analyseOneElement.gam} \alias{analyseOneElement.gam} -\title{Fit a GAM for a single element -returns metadata (column names, smooth term names, parametric term names, -and the smoothing parameter criterion attribute name) used by -\code{\link{ModelArray.gam}} to initialise the output data.frame. When -\code{flag_initiate = FALSE}, it returns a numeric vector representing one -row of the final results matrix.} +\title{Fit a GAM for a single element} \usage{ analyseOneElement.gam( i_element, - formula, - modelarray, - phenotypes, - scalar, - var.smoothTerms, - var.parametricTerms, - var.model, + formula = NULL, + modelarray = NULL, + phenotypes = NULL, + scalar = NULL, + var.smoothTerms = c("statistic", "p.value"), + var.parametricTerms = c("estimate", "statistic", "p.value"), + var.model = c("dev.expl"), num.subj.lthr, num.stat.output = NULL, flag_initiate = FALSE, flag_sse = FALSE, on_error = "stop", + ctx = NULL, ... ) } @@ -32,14 +28,15 @@ analyseOneElement.gam( \item{formula}{A \code{\link[stats]{formula}} passed to \code{\link[mgcv]{gam}}.} -\item{modelarray}{A \linkS4class{ModelArray} object.} +\item{modelarray}{A \linkS4class{ModelArray} object. Ignored when +\code{ctx} is provided.} \item{phenotypes}{A data.frame of the cohort with columns of independent variables and covariates. Must contain a \code{"source_file"} column -matching \code{sources(modelarray)[[scalar]]}.} +matching \code{sources(modelarray)[[scalar]]}. Ignored when \code{ctx} is provided.} \item{scalar}{Character. The name of the element-wise scalar to analyse. -Must be one of \code{names(scalars(modelarray))}.} +Must be one of \code{names(scalars(modelarray))}. Ignored when \code{ctx} is provided.} \item{var.smoothTerms}{Character vector. Statistics to extract for smooth terms from \code{\link[broom]{tidy.gam}} with \code{parametric = FALSE} @@ -83,6 +80,9 @@ halts execution; \code{"skip"} returns all-\code{NaN} for this element; \code{"debug"} drops into \code{\link{browser}} (if interactive) then skips. Default: \code{"stop"}.} +\item{ctx}{A precomputed context list from \code{.build_gam_context()}, +or \code{NULL}.} + \item{...}{Additional arguments passed to \code{\link[mgcv]{gam}}.} } \value{ @@ -101,6 +101,14 @@ requested statistics in subsequent positions. All-\code{NaN} (except \code{element_id}) if the element was skipped. } \description{ +#' @description +Returns metadata (column names, smooth term names, parametric term names, +and the smoothing parameter criterion attribute name) used by +\code{\link{ModelArray.gam}} to initialise the output data.frame. When +\code{flag_initiate = FALSE}, it returns a numeric vector representing one +row of the final results matrix. +} +\details{ If the number of subjects with finite scalar values does not exceed \code{num.subj.lthr}, the element is skipped and all statistics are set to \code{NaN}. diff --git a/man/analyseOneElement.lm.Rd b/man/analyseOneElement.lm.Rd index 1069a1d..ddc9251 100644 --- a/man/analyseOneElement.lm.Rd +++ b/man/analyseOneElement.lm.Rd @@ -2,23 +2,21 @@ % Please edit documentation in R/ModelArray_Constructor.R \name{analyseOneElement.lm} \alias{analyseOneElement.lm} -\title{Fit a linear model for a single element -If the number of subjects with finite scalar values (not \code{NaN}, -\code{NA}, or \code{Inf}) does not exceed \code{num.subj.lthr}, the -element is skipped and all statistics are set to \code{NaN}.} +\title{Fit a linear model for a single element} \usage{ analyseOneElement.lm( i_element, - formula, - modelarray, - phenotypes, - scalar, - var.terms, - var.model, + formula = NULL, + modelarray = NULL, + phenotypes = NULL, + scalar = NULL, + var.terms = c("estimate", "statistic", "p.value"), + var.model = c("adj.r.squared", "p.value"), num.subj.lthr, num.stat.output = NULL, flag_initiate = FALSE, on_error = "stop", + ctx = NULL, ... ) } @@ -26,16 +24,18 @@ analyseOneElement.lm( \item{i_element}{Integer. The 1-based index of the element to analyse.} \item{formula}{A \code{\link[stats]{formula}} passed to -\code{\link[stats]{lm}}.} +\code{\link[stats]{lm}}. Ignored when \code{ctx} is provided (the +formula is taken from the context).} -\item{modelarray}{A \linkS4class{ModelArray} object.} +\item{modelarray}{A \linkS4class{ModelArray} object. Ignored when +\code{ctx} is provided.} \item{phenotypes}{A data.frame of the cohort with columns of independent variables and covariates. Must contain a \code{"source_file"} column -matching \code{sources(modelarray)[[scalar]]}.} +matching \code{sources(modelarray)[[scalar]]}. Ignored when \code{ctx} is provided.} \item{scalar}{Character. The name of the element-wise scalar to analyse. -Must be one of \code{names(scalars(modelarray))}.} +Must be one of \code{names(scalars(modelarray))}. Ignored when \code{ctx} is provided.} \item{var.terms}{Character vector. Statistics to extract per term from \code{\link[broom]{tidy.lm}} (e.g. \code{"estimate"}, \code{"statistic"}, @@ -67,6 +67,9 @@ halts execution; \code{"skip"} returns all-\code{NaN} for this element; \code{"debug"} drops into \code{\link{browser}} (if interactive) then skips. Default: \code{"stop"}.} +\item{ctx}{A precomputed context list from \code{.build_lm_context()}, +or \code{NULL} for legacy inline computation.} + \item{...}{Additional arguments passed to \code{\link[stats]{lm}}.} } \value{ @@ -84,14 +87,19 @@ value and the requested statistics in subsequent positions. All-\code{NaN} or if an error occurred with \code{on_error = "skip"}. } \description{ -Fit a linear model for a single element +Fits \code{\link[stats]{lm}} on one element's data. When a precomputed +context (\code{ctx}) is provided, all loop-invariant work (formula parsing, +collision checks, source alignment) is skipped. When \code{ctx} is +\code{NULL}, the function falls back to computing everything inline +(legacy behaviour for direct calls / debugging). + If the number of subjects with finite scalar values (not \code{NaN}, \code{NA}, or \code{Inf}) does not exceed \code{num.subj.lthr}, the element is skipped and all statistics are set to \code{NaN}. } \seealso{ -\code{\link{ModelArray.lm}} which calls this function iteratively, +\code{\link{ModelArray.lm}}, \code{.build_lm_context}, \code{\link{analyseOneElement.gam}} for the GAM equivalent, -\code{\link{analyseOneElement.wrap}} for user-supplied functions. +\code{\link{analyseOneElement.wrap}} for user-supplied functions } \keyword{internal} diff --git a/man/analyseOneElement.wrap.Rd b/man/analyseOneElement.wrap.Rd index 1780548..e8dfafc 100644 --- a/man/analyseOneElement.wrap.Rd +++ b/man/analyseOneElement.wrap.Rd @@ -7,13 +7,14 @@ analyseOneElement.wrap( i_element, user_fun, - modelarray, - phenotypes, - scalar, + modelarray = NULL, + phenotypes = NULL, + scalar = NULL, num.subj.lthr, num.stat.output = NULL, flag_initiate = FALSE, on_error = "stop", + ctx = NULL, ... ) } @@ -22,17 +23,27 @@ analyseOneElement.wrap( \item{user_fun}{A function that accepts at least an argument named \code{data} (a data.frame: \code{phenotypes} with scalar columns -appended for the current element) and returns a one-row -data.frame/tibble, a named list, or a named atomic vector.} +appended for the current element) and returns one of: +\itemize{ +\item A one-row \code{data.frame} or \code{tibble}. Multi-row +returns will error. +\item A named list. Unnamed lists are accepted and auto-named +as \code{v1}, \code{v2}, etc. +\item A named atomic vector. Unnamed vectors are accepted and +auto-named as \code{v1}, \code{v2}, etc. +} +All return values are coerced to a numeric vector internally. +Unsupported types (e.g., environments, S4 objects) will error.} -\item{modelarray}{A \linkS4class{ModelArray} object.} +\item{modelarray}{A \linkS4class{ModelArray} object. Ignored when +\code{ctx} is provided.} \item{phenotypes}{A data.frame of the cohort with columns of independent variables and covariates. Must contain a \code{"source_file"} column -matching \code{sources(modelarray)[[scalar]]}.} +matching \code{sources(modelarray)[[scalar]]}. Ignored when \code{ctx} is provided.} \item{scalar}{Character. The name of the element-wise scalar to analyse. -Must be one of \code{names(scalars(modelarray))}.} +Must be one of \code{names(scalars(modelarray))}. Ignored when \code{ctx} is provided.} \item{num.subj.lthr}{Numeric. The pre-computed minimum number of subjects with finite values required for this element to be analysed. Elements @@ -56,6 +67,9 @@ halts execution; \code{"skip"} returns all-\code{NaN} for this element; \code{"debug"} drops into \code{\link{browser}} (if interactive) then skips. Default: \code{"stop"}.} +\item{ctx}{A precomputed context list from \code{.build_wrap_context()}, +or \code{NULL}.} + \item{...}{Additional arguments forwarded to \code{user_fun}.} } \value{ diff --git a/tests/testthat/test-ModelArray_low_hanging_coverage.R b/tests/testthat/test-ModelArray_low_hanging_coverage.R index 4c637a1..c439e1e 100644 --- a/tests/testthat/test-ModelArray_low_hanging_coverage.R +++ b/tests/testthat/test-ModelArray_low_hanging_coverage.R @@ -361,7 +361,7 @@ test_that("analyseOneElement.wrap covers coercion and error branches", { phenotypes = phen, scalar = "FD", num.subj.lthr = 0, flag_initiate = FALSE, num.stat.output = 2 ), - "Unsupported return type" + "user_fun must return a one-row data.frame/tibble" ) expect_warning( diff --git a/tests/testthat/test-analyse_context.R b/tests/testthat/test-analyse_context.R new file mode 100644 index 0000000..4b1430b --- /dev/null +++ b/tests/testthat/test-analyse_context.R @@ -0,0 +1,933 @@ +# tests/testthat/test-analyse_context.R +# +# Tests for context builders (.build_*_context), shared element +# assembly (.assemble_element_data), per-element ctx vs. legacy +# equivalence, .find_initiator_element, and top-level integration. +# +# Organized to follow the two-tier hierarchy: +# SECTION 1: Context builders (base → lm → gam → wrap) +# SECTION 2: Shared per-element helper (.assemble_element_data) +# SECTION 3: Per-element function ctx vs. legacy (lm → gam → wrap) +# SECTION 4: Initiator search (.find_initiator_element) +# SECTION 5: Top-level integration (ModelArray.gam, ModelArray.wrap) + +# ---- Shared fixtures ---- +# NOTE: Uses plain matrices in the scalars slot, not DelayedArray-backed +# HDF5 data. This works because the ModelArray S4 class accepts any +# matrix-like object for scalars [4]. Integration tests in Section 5 +# use real HDF5-backed ModelArrays via system.file(). + +make_test_modelarray <- function(n_elements = 3, n_subjects = 4, + scalars = c("FD"), + inject_nan = FALSE) { + src <- paste0("sub", seq_len(n_subjects)) + scalar_list <- list() + source_list <- list() + + for (sn in scalars) { + set.seed(match(sn, scalars)) + mat <- matrix(rnorm(n_elements * n_subjects), + nrow = n_elements, ncol = n_subjects) + if (inject_nan && sn == scalars[1]) { + mat[1, 1] <- NaN + mat[1, 2] <- Inf + } + colnames(mat) <- src + scalar_list[[sn]] <- mat + source_list[[sn]] <- src + } + + ma <- methods::new("ModelArray", + sources = source_list, + scalars = scalar_list, + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame( + source_file = src, + age = seq(20, by = 10, length.out = n_subjects), + sex = rep(c("M", "F"), length.out = n_subjects), + stringsAsFactors = FALSE + ) + list(ma = ma, phen = phen) +} + + +# ================================================================== +# SECTION 1: Context builders (unit tests) +# Ordered to match the two-tier hierarchy: +# base → lm → gam → wrap +# ================================================================== + +# ---- .build_base_context ---- + +test_that(".build_base_context attaches all scalars when scalar_subset is NULL", { + fix <- make_test_modelarray(scalars = c("FD", "FC")) + ctx <- .build_base_context(fix$ma, fix$phen, scalar = "FD", + scalar_subset = NULL) + + expect_identical(ctx$scalar, "FD") + expect_identical(sort(ctx$attached_scalars), sort(c("FD", "FC"))) + expect_identical(sort(ctx$all_scalar_names), sort(c("FD", "FC"))) +}) + +test_that(".build_base_context attaches only requested scalars when scalar_subset is provided", { + fix <- make_test_modelarray(scalars = c("FD", "FC")) + ctx <- .build_base_context(fix$ma, fix$phen, scalar = "FD", + scalar_subset = c("FD")) + + expect_identical(ctx$attached_scalars, "FD") + # FC is in the ModelArray but not attached + expect_true("FC" %in% ctx$all_scalar_names) + expect_false("FC" %in% ctx$attached_scalars) +}) + +test_that(".build_base_context detects collision between scalar names and phenotype columns", { + fix <- make_test_modelarray(scalars = c("FD")) + phen_collision <- fix$phen + phen_collision$FD <- 1:nrow(phen_collision) + + expect_error( + .build_base_context(fix$ma, phen_collision, scalar = "FD", + scalar_subset = c("FD")), + "Column name collision" + ) +}) + +test_that(".build_base_context sets predictor_reorder to NULL for aligned sources", { + fix <- make_test_modelarray(scalars = c("FD")) + ctx <- .build_base_context(fix$ma, fix$phen, scalar = "FD", + scalar_subset = c("FD")) + + expect_null(ctx$predictor_reorder[["FD"]]) +}) + +test_that(".build_base_context computes reorder indices for misaligned sources", { + fix <- make_test_modelarray(scalars = c("FD", "FC")) + + # Manually scramble FC sources + fc_src <- rev(sources(fix$ma)[["FC"]]) + fix$ma@sources[["FC"]] <- fc_src + + ctx <- .build_base_context(fix$ma, fix$phen, scalar = "FD", + scalar_subset = c("FD", "FC")) + + expect_null(ctx$predictor_reorder[["FD"]]) + expect_false(is.null(ctx$predictor_reorder[["FC"]])) + # Applying the reorder should recover the phenotype source order + expect_identical( + fc_src[ctx$predictor_reorder[["FC"]]], + fix$phen$source_file + ) +}) + +test_that(".build_base_context errors when scalar sources don't match phenotypes", { + fix <- make_test_modelarray(scalars = c("FD", "FC")) + fix$ma@sources[["FC"]] <- paste0("wrong", seq_along(sources(fix$ma)[["FC"]])) + + expect_error( + .build_base_context(fix$ma, fix$phen, scalar = "FD", + scalar_subset = c("FD", "FC")), + "do not match phenotypes" + ) +}) + +# ---- .build_lm_context ---- + +test_that(".build_lm_context extracts formula components correctly", { + fix <- make_test_modelarray(scalars = c("FD")) + ctx <- .build_lm_context(FD ~ age + sex, fix$ma, fix$phen, scalar = "FD") + + expect_identical(ctx$formula, FD ~ age + sex) + expect_identical(ctx$lhs_name, "FD") + expect_true("age" %in% ctx$rhs_vars) + expect_true("sex" %in% ctx$rhs_vars) + expect_identical(ctx$scalar_predictors, character(0)) +}) + +test_that(".build_lm_context detects scalar predictors in cross-scalar formulas", { + fix <- make_test_modelarray(scalars = c("FD", "FC")) + ctx <- .build_lm_context(FD ~ age + FC, fix$ma, fix$phen, scalar = "FD") + + expect_identical(ctx$scalar_predictors, "FC") + expect_true("FC" %in% ctx$attached_scalars) + expect_true("FD" %in% ctx$attached_scalars) +}) + +test_that(".build_lm_context does not attach unrelated scalars", { + fix <- make_test_modelarray(scalars = c("FD", "FC", "FA")) + ctx <- .build_lm_context(FD ~ age + FC, fix$ma, fix$phen, scalar = "FD") + + expect_true("FA" %in% ctx$all_scalar_names) + expect_false("FA" %in% ctx$attached_scalars) +}) + +# ---- .build_gam_context ---- + +test_that(".build_gam_context caches interpret.gam breakdown", { + fix <- make_test_modelarray(scalars = c("FD")) + ctx <- .build_gam_context(FD ~ s(age) + sex, fix$ma, fix$phen, scalar = "FD") + + expect_true(!is.null(ctx$gam_formula_breakdown)) + expect_true("smooth.spec" %in% names(ctx$gam_formula_breakdown)) +}) + +test_that(".build_gam_context errors on invalid GAM formula", { + fix <- make_test_modelarray(scalars = c("FD")) + + expect_error( + .build_gam_context(FD ~ s(age, d = 999), fix$ma, fix$phen, scalar = "FD"), + "not valid for mgcv::gam" + ) +}) + +test_that(".build_gam_context inherits lm context fields", { + fix <- make_test_modelarray(scalars = c("FD")) + ctx <- .build_gam_context(FD ~ s(age) + sex, fix$ma, fix$phen, scalar = "FD") + + # Should have all the lm context fields + expect_identical(ctx$lhs_name, "FD") + expect_true("age" %in% ctx$rhs_vars) + expect_true("sex" %in% ctx$rhs_vars) + expect_identical(ctx$scalar_predictors, character(0)) +}) + +test_that(".build_gam_context detects scalar predictors same as lm context", { + fix <- make_test_modelarray(scalars = c("FD", "FC")) + ctx <- .build_gam_context(FD ~ s(age) + FC, fix$ma, fix$phen, scalar = "FD") + + expect_identical(ctx$scalar_predictors, "FC") + expect_true("FC" %in% ctx$attached_scalars) + expect_true("FD" %in% ctx$attached_scalars) +}) + +# ---- .build_wrap_context ---- + +test_that(".build_wrap_context attaches all scalars", { + fix <- make_test_modelarray(scalars = c("FD", "FC")) + ctx <- .build_wrap_context(fix$ma, fix$phen, scalar = "FD") + + expect_identical(sort(ctx$attached_scalars), sort(c("FD", "FC"))) +}) + +test_that(".build_wrap_context has no formula fields", { + fix <- make_test_modelarray(scalars = c("FD")) + ctx <- .build_wrap_context(fix$ma, fix$phen, scalar = "FD") + + expect_null(ctx$formula) + expect_null(ctx$lhs_name) +}) + + +# ================================================================== +# SECTION 2: Shared per-element helper +# ================================================================== + +# ---- .assemble_element_data ---- + +test_that(".assemble_element_data returns filtered data.frame for valid element", { + fix <- make_test_modelarray(scalars = c("FD")) + ctx <- .build_lm_context(FD ~ age, fix$ma, fix$phen, scalar = "FD") + + elem <- .assemble_element_data(1, ctx, num.subj.lthr = 0) + + expect_true(elem$sufficient) + expect_true(is.data.frame(elem$dat)) + expect_true("FD" %in% colnames(elem$dat)) + expect_true("age" %in% colnames(elem$dat)) + expect_true("source_file" %in% colnames(elem$dat)) +}) + +test_that(".assemble_element_data returns insufficient when below threshold", { + fix <- make_test_modelarray(n_subjects = 4, scalars = c("FD")) + ctx <- .build_lm_context(FD ~ age, fix$ma, fix$phen, scalar = "FD") + + elem <- .assemble_element_data(1, ctx, num.subj.lthr = 100) + + expect_false(elem$sufficient) + expect_null(elem$dat) +}) + +test_that(".assemble_element_data excludes NaN/Inf subjects from mask", { + fix <- make_test_modelarray(n_subjects = 4, scalars = c("FD"), + inject_nan = TRUE) + ctx <- .build_lm_context(FD ~ age, fix$ma, fix$phen, scalar = "FD") + + elem <- .assemble_element_data(1, ctx, num.subj.lthr = 0) + + # Element 1 has NaN in subject 1 and Inf in subject 2 + expect_true(elem$sufficient) + expect_equal(elem$num_valid, 2) # only subjects 3 and 4 + expect_equal(nrow(elem$dat), 2) +}) + +test_that(".assemble_element_data applies precomputed reorder for cross-scalar", { + fix <- make_test_modelarray(n_elements = 2, n_subjects = 4, + scalars = c("FD", "FC")) + + # Scramble FC sources + fc_src_scrambled <- rev(sources(fix$ma)[["FC"]]) + fix$ma@sources[["FC"]] <- fc_src_scrambled + + ctx <- .build_lm_context(FD ~ age + FC, fix$ma, fix$phen, scalar = "FD") + elem <- .assemble_element_data(1, ctx, num.subj.lthr = 0) + + expect_true(elem$sufficient) + expect_true("FC" %in% colnames(elem$dat)) + + # The FC values should be reordered to match phenotype order + fc_raw <- scalars(fix$ma)[["FC"]][1, ] + reorder_idx <- ctx$predictor_reorder[["FC"]] + fc_reordered <- fc_raw[reorder_idx] + expect_equal(as.numeric(elem$dat$FC), as.numeric(fc_reordered)) +}) + +test_that(".assemble_element_data intersection mask works across scalars", { + n_subj <- 6 + src <- paste0("sub", 1:n_subj) + + fd_mat <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 1) + fc_mat <- matrix(c(10, NaN, 30, NaN, 50, 60), nrow = 1) + colnames(fd_mat) <- src + colnames(fc_mat) <- src + + ma <- methods::new("ModelArray", + sources = list(FD = src, FC = src), + scalars = list(FD = fd_mat, FC = fc_mat), + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame( + source_file = src, + age = seq(20, by = 10, length.out = n_subj) + ) + + ctx <- .build_lm_context(FD ~ age + FC, ma, phen, scalar = "FD") + elem <- .assemble_element_data(1, ctx, num.subj.lthr = 0) + + # Subjects 2 and 4 have NaN in FC, so should be excluded + expect_equal(elem$num_valid, 4) + expect_equal(nrow(elem$dat), 4) + expect_false(any(is.nan(elem$dat$FC))) +}) + + +# ================================================================== +# SECTION 3: Per-element function ctx vs. legacy equivalence +# Ordered: lm → gam → wrap +# ================================================================== + +# ---- analyseOneElement.lm ---- + +test_that("analyseOneElement.lm produces identical results with ctx vs. without", { + # Create fixture with 8 subjects (enough for a meaningful lm fit) + fix <- make_test_modelarray(n_elements = 2, n_subjects = 8, scalars = c("FD")) + + # The fixture's phen has age and sex; rebuild with only age for a + # clean single-predictor test. The fixture already has 8 subjects. + src <- sources(fix$ma)[["FD"]] + phen <- data.frame( + source_file = src, + age = seq(20, by = 5, length.out = length(src)), + stringsAsFactors = FALSE + ) + + ctx <- .build_lm_context(FD ~ age, fix$ma, phen, scalar = "FD") + + # With context — initiation + result_ctx <- analyseOneElement.lm( + i_element = 1, + ctx = ctx, + var.terms = c("estimate", "p.value"), + var.model = c("adj.r.squared"), + num.subj.lthr = 0, + num.stat.output = NULL, + flag_initiate = TRUE + ) + + # Without context — legacy path + result_legacy <- analyseOneElement.lm( + i_element = 1, + formula = FD ~ age, + modelarray = fix$ma, + phenotypes = phen, + scalar = "FD", + var.terms = c("estimate", "p.value"), + var.model = c("adj.r.squared"), + num.subj.lthr = 0, + num.stat.output = NULL, + flag_initiate = TRUE + ) + + expect_identical(result_ctx$column_names, result_legacy$column_names) + expect_identical(result_ctx$list.terms, result_legacy$list.terms) + + # Also check runtime (non-initiate) path + num_stat <- length(result_ctx$column_names) + result_ctx_run <- analyseOneElement.lm( + i_element = 1, + ctx = ctx, + var.terms = c("estimate", "p.value"), + var.model = c("adj.r.squared"), + num.subj.lthr = 0, + num.stat.output = num_stat, + flag_initiate = FALSE + ) + result_legacy_run <- analyseOneElement.lm( + i_element = 1, + formula = FD ~ age, + modelarray = fix$ma, + phenotypes = phen, + scalar = "FD", + var.terms = c("estimate", "p.value"), + var.model = c("adj.r.squared"), + num.subj.lthr = 0, + num.stat.output = num_stat, + flag_initiate = FALSE + ) + + expect_equal(result_ctx_run, result_legacy_run) +}) + +# ---- analyseOneElement.gam ---- + +test_that("analyseOneElement.gam produces identical results with ctx vs. without", { + src <- paste0("s", 1:8) + set.seed(42) + mat <- matrix(rnorm(3 * 8), nrow = 3) + colnames(mat) <- src + + modelarray <- methods::new("ModelArray", + sources = list(FD = src), + scalars = list(FD = mat), + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame( + source_file = src, + age = seq(20, by = 5, length.out = 8), + stringsAsFactors = FALSE + ) + + ctx <- .build_gam_context(FD ~ s(age, k = 4), modelarray, phen, scalar = "FD") + + # With context — initiation + result_ctx <- analyseOneElement.gam( + i_element = 1, + ctx = ctx, + var.smoothTerms = c("statistic", "p.value"), + var.parametricTerms = c("estimate", "p.value"), + var.model = c("dev.expl"), + num.subj.lthr = 0, + num.stat.output = NULL, + flag_initiate = TRUE + ) + + # Without context — legacy path + result_legacy <- analyseOneElement.gam( + i_element = 1, + formula = FD ~ s(age, k = 4), + modelarray = modelarray, + phenotypes = phen, + scalar = "FD", + var.smoothTerms = c("statistic", "p.value"), + var.parametricTerms = c("estimate", "p.value"), + var.model = c("dev.expl"), + num.subj.lthr = 0, + num.stat.output = NULL, + flag_initiate = TRUE + ) + + expect_identical(result_ctx$column_names, result_legacy$column_names) + expect_identical(result_ctx$list.smoothTerms, result_legacy$list.smoothTerms) + expect_identical(result_ctx$list.parametricTerms, result_legacy$list.parametricTerms) + + # Runtime path + num_stat <- length(result_ctx$column_names) + + result_ctx_run <- analyseOneElement.gam( + i_element = 1, + ctx = ctx, + var.smoothTerms = c("statistic", "p.value"), + var.parametricTerms = c("estimate", "p.value"), + var.model = c("dev.expl"), + num.subj.lthr = 0, + num.stat.output = num_stat, + flag_initiate = FALSE + ) + + result_legacy_run <- analyseOneElement.gam( + i_element = 1, + formula = FD ~ s(age, k = 4), + modelarray = modelarray, + phenotypes = phen, + scalar = "FD", + var.smoothTerms = c("statistic", "p.value"), + var.parametricTerms = c("estimate", "p.value"), + var.model = c("dev.expl"), + num.subj.lthr = 0, + num.stat.output = num_stat, + flag_initiate = FALSE + ) + + expect_equal(result_ctx_run, result_legacy_run) +}) + +test_that("analyseOneElement.gam with ctx normalizes (Intercept) to Intercept", { + src <- paste0("s", 1:8) + set.seed(42) + mat <- matrix(rnorm(2 * 8), nrow = 2) + colnames(mat) <- src + + modelarray <- methods::new("ModelArray", + sources = list(FD = src), + scalars = list(FD = mat), + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame( + source_file = src, + age = seq(20, by = 5, length.out = 8), + stringsAsFactors = FALSE + ) + + ctx <- .build_gam_context(FD ~ s(age, k = 4) + 1, modelarray, phen, scalar = "FD") + + result <- analyseOneElement.gam( + i_element = 1, + ctx = ctx, + var.smoothTerms = c("p.value"), + var.parametricTerms = c("estimate"), + var.model = c("dev.expl"), + num.subj.lthr = 0, + num.stat.output = NULL, + flag_initiate = TRUE + ) + + # Should contain "Intercept", not "(Intercept)" + param_cols <- grep("estimate", result$column_names, value = TRUE) + expect_true(any(grepl("^Intercept\\.", param_cols))) + expect_false(any(grepl("\\(Intercept\\)", result$column_names))) +}) + +test_that("analyseOneElement.gam with ctx returns NaN row for insufficient subjects", { + src <- paste0("s", 1:4) + mat <- matrix(c(1, 2, 3, 4, 5, 6, 7, 8), nrow = 2) + colnames(mat) <- src + + modelarray <- methods::new("ModelArray", + sources = list(FD = src), + scalars = list(FD = mat), + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame( + source_file = src, + age = c(20, 30, 40, 50) + ) + + ctx <- .build_gam_context(FD ~ age, modelarray, phen, scalar = "FD") + + # Initiation with impossible threshold + result_init <- analyseOneElement.gam( + i_element = 1, + ctx = ctx, + var.smoothTerms = c(), + var.parametricTerms = c("estimate"), + var.model = c("dev.expl"), + num.subj.lthr = 1000, + flag_initiate = TRUE + ) + expect_true(is.nan(result_init$column_names[1])) + + # Runtime with impossible threshold + result_run <- analyseOneElement.gam( + i_element = 1, + ctx = ctx, + var.smoothTerms = c(), + var.parametricTerms = c("estimate"), + var.model = c("dev.expl"), + num.subj.lthr = 1000, + num.stat.output = 5, + flag_initiate = FALSE + ) + expect_equal(result_run[1], 0) + expect_true(all(is.nan(result_run[-1]))) +}) + +test_that("analyseOneElement.gam with ctx handles on_error = 'skip'", { + src <- paste0("s", 1:8) + set.seed(42) + mat <- matrix(rnorm(2 * 8), nrow = 2) + colnames(mat) <- src + + modelarray <- methods::new("ModelArray", + sources = list(FD = src), + scalars = list(FD = mat), + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame( + source_file = src, + age = seq(20, by = 5, length.out = 8) + ) + + ctx <- .build_gam_context(FD ~ nonexistent_var, modelarray, phen, scalar = "FD") + + expect_warning( + result <- analyseOneElement.gam( + i_element = 1, + ctx = ctx, + var.smoothTerms = c(), + var.parametricTerms = c("estimate"), + var.model = c("dev.expl"), + num.subj.lthr = 0, + num.stat.output = 4, + flag_initiate = FALSE, + on_error = "skip" + ) + ) + expect_equal(result[1], 0) + expect_true(all(is.nan(result[-1]))) +}) + +# ---- analyseOneElement.wrap ---- + +test_that("analyseOneElement.wrap produces identical results with ctx vs. without", { + src <- c("s1", "s2", "s3", "s4") + modelarray <- methods::new("ModelArray", + sources = list(FD = src, FA = src), + scalars = list( + FD = matrix(c(1, 2, 3, 4, 2, 3, 4, 5), nrow = 2, byrow = TRUE), + FA = matrix(c(10, 20, 30, 40, 50, 60, 70, 80), nrow = 2, byrow = TRUE) + ), + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame( + source_file = src, + age = c(20, 30, 40, 50) + ) + + my_fun <- function(data, ...) { + list(fd_mean = mean(data$FD), fa_mean = mean(data$FA)) + } + + ctx <- .build_wrap_context(modelarray, phen, scalar = "FD") + + # With context — initiation + result_ctx <- analyseOneElement.wrap( + i_element = 1, + user_fun = my_fun, + ctx = ctx, + num.subj.lthr = 0, + flag_initiate = TRUE + ) + + # Without context — legacy + result_legacy <- analyseOneElement.wrap( + i_element = 1, + user_fun = my_fun, + modelarray = modelarray, + phenotypes = phen, + scalar = "FD", + num.subj.lthr = 0, + flag_initiate = TRUE + ) + + expect_identical(result_ctx$column_names, result_legacy$column_names) + + # Runtime + num_stat <- length(result_ctx$column_names) + + result_ctx_run <- analyseOneElement.wrap( + i_element = 1, + user_fun = my_fun, + ctx = ctx, + num.subj.lthr = 0, + num.stat.output = num_stat, + flag_initiate = FALSE + ) + + result_legacy_run <- analyseOneElement.wrap( + i_element = 1, + user_fun = my_fun, + modelarray = modelarray, + phenotypes = phen, + scalar = "FD", + num.subj.lthr = 0, + num.stat.output = num_stat, + flag_initiate = FALSE + ) + + expect_equal(result_ctx_run, result_legacy_run) +}) + +test_that("analyseOneElement.wrap with ctx attaches ALL scalars", { + src <- c("s1", "s2", "s3", "s4") + modelarray <- methods::new("ModelArray", + sources = list(FD = src, FA = src, FC = src), + scalars = list( + FD = matrix(1:8, nrow = 2), + FA = matrix(11:18, nrow = 2), + FC = matrix(21:28, nrow = 2) + ), + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame(source_file = src, age = c(20, 30, 40, 50)) + + check_fun <- function(data, ...) { + # Verify all three scalars are present + list( + has_FD = as.numeric("FD" %in% colnames(data)), + has_FA = as.numeric("FA" %in% colnames(data)), + has_FC = as.numeric("FC" %in% colnames(data)) + ) + } + + ctx <- .build_wrap_context(modelarray, phen, scalar = "FD") + + result <- analyseOneElement.wrap( + i_element = 1, + user_fun = check_fun, + ctx = ctx, + num.subj.lthr = 0, + num.stat.output = 4, + flag_initiate = FALSE + ) + + # element_id = 0, then has_FD=1, has_FA=1, has_FC=1 + expect_equal(result[2], 1) + expect_equal(result[3], 1) + expect_equal(result[4], 1) +}) + +test_that("analyseOneElement.wrap with ctx returns NaN for insufficient subjects", { + src <- c("s1", "s2", "s3", "s4") + modelarray <- methods::new("ModelArray", + sources = list(FD = src), + scalars = list(FD = matrix(c(1, 2, 3, 4, 5, 6, 7, 8), nrow = 2, byrow = TRUE)), + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame(source_file = src, age = c(20, 30, 40, 50)) + my_fun <- function(data, ...) list(m = mean(data$FD)) + + ctx <- .build_wrap_context(modelarray, phen, scalar = "FD") + + result_init <- analyseOneElement.wrap( + i_element = 1, + user_fun = my_fun, + ctx = ctx, + num.subj.lthr = 1000, + flag_initiate = TRUE + ) + expect_true(is.nan(result_init$column_names[1])) + + result_run <- analyseOneElement.wrap( + i_element = 1, + user_fun = my_fun, + ctx = ctx, + num.subj.lthr = 1000, + num.stat.output = 2, + flag_initiate = FALSE + ) + expect_equal(result_run[1], 0) + expect_true(all(is.nan(result_run[-1]))) +}) + +test_that("analyseOneElement.wrap with ctx handles on_error = 'skip'", { + src <- c("s1", "s2", "s3", "s4") + modelarray <- methods::new("ModelArray", + sources = list(FD = src), + scalars = list(FD = matrix(1:8, nrow = 2)), + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame(source_file = src, age = c(20, 30, 40, 50)) + ctx <- .build_wrap_context(modelarray, phen, scalar = "FD") + + expect_warning( + result <- analyseOneElement.wrap( + i_element = 1, + user_fun = function(data, ...) stop("boom"), + ctx = ctx, + num.subj.lthr = 0, + num.stat.output = 3, + flag_initiate = FALSE, + on_error = "skip" + ) + ) + expect_equal(result[1], 0) + expect_true(all(is.nan(result[-1]))) +}) + +test_that("analyseOneElement.wrap with ctx handles various return types", { + src <- c("s1", "s2", "s3", "s4") + modelarray <- methods::new("ModelArray", + sources = list(FD = src), + scalars = list(FD = matrix(c(1, 2, 3, 4, 5, 6, 7, 8), nrow = 2, byrow = TRUE)), + results = list(), + path = tempfile(fileext = ".h5") + ) + phen <- data.frame(source_file = src, age = c(20, 30, 40, 50)) + ctx <- .build_wrap_context(modelarray, phen, scalar = "FD") + + # data.frame return + df_fun <- function(data, ...) data.frame(m = mean(data$FD)) + result_df <- analyseOneElement.wrap( + i_element = 1, user_fun = df_fun, ctx = ctx, + num.subj.lthr = 0, num.stat.output = 2, flag_initiate = FALSE + ) + expect_equal(length(result_df), 2) + + # named list return + list_fun <- function(data, ...) list(m = mean(data$FD)) + result_list <- analyseOneElement.wrap( + i_element = 1, user_fun = list_fun, ctx = ctx, + num.subj.lthr = 0, num.stat.output = 2, flag_initiate = FALSE + ) + expect_equal(length(result_list), 2) + + # named atomic return + atomic_fun <- function(data, ...) c(m = mean(data$FD)) + result_atomic <- analyseOneElement.wrap( + i_element = 1, user_fun = atomic_fun, ctx = ctx, + num.subj.lthr = 0, num.stat.output = 2, flag_initiate = FALSE + ) + expect_equal(length(result_atomic), 2) + + # All three should produce the same values + expect_equal(result_df[2], result_list[2]) + expect_equal(result_df[2], result_atomic[2]) +}) + + +# ================================================================== +# SECTION 4: Initiator search +# ================================================================== + +# ---- .find_initiator_element ---- + +test_that(".find_initiator_element succeeds on middle element", { + fix <- make_test_modelarray(n_elements = 5, n_subjects = 8, scalars = c("FD")) + src <- sources(fix$ma)[["FD"]] + phen <- data.frame( + source_file = src, + age = seq(20, by = 5, length.out = length(src)) + ) + ctx <- .build_lm_context(FD ~ age, fix$ma, phen, scalar = "FD") + + init_args <- list( + ctx = ctx, + var.terms = c("estimate"), + var.model = c("adj.r.squared"), + num.subj.lthr = 0, + on_error = "stop" + ) + + result <- .find_initiator_element( + analyseOneElement.lm, 5, init_args, verbose = FALSE + ) + + expect_false(is.nan(result$outputs$column_names[1])) + expect_true(result$i_element >= 1 && result$i_element <= 5) +}) + +test_that(".find_initiator_element fails when all elements have insufficient subjects", { + fix <- make_test_modelarray(n_elements = 3, n_subjects = 4, scalars = c("FD")) + src <- sources(fix$ma)[["FD"]] + phen <- data.frame( + source_file = src, + age = seq(20, by = 5, length.out = length(src)) + ) + ctx <- .build_lm_context(FD ~ age, fix$ma, phen, scalar = "FD") + + init_args <- list( + ctx = ctx, + var.terms = c("estimate"), + var.model = c("adj.r.squared"), + num.subj.lthr = 1000, # impossibly high threshold + on_error = "stop" + ) + + expect_error( + .find_initiator_element(analyseOneElement.lm, 3, init_args, verbose = FALSE), + "no elements with sufficient valid" + ) +}) + + +# ================================================================== +# SECTION 5: Top-level integration +# Verifies ctx flows correctly through ModelArray.gam/wrap +# using real HDF5-backed data from system.file() +# ================================================================== + +# ---- ModelArray.gam integration ---- + +test_that("ModelArray.gam produces consistent results after context refactoring", { + h5_path <- system.file("extdata", "n50_fixels.h5", package = "ModelArray") + csv_path <- system.file("extdata", "n50_cohort.csv", package = "ModelArray") + + modelarray <- ModelArray(h5_path, scalar_types = c("FD")) + phenotypes <- read.csv(csv_path) + element.subset <- 1:5 + + result <- ModelArray.gam( + FD ~ s(age, fx = TRUE) + sex, + data = modelarray, + phenotypes = phenotypes, + scalar = "FD", + element.subset = element.subset, + n_cores = 1, + pbar = FALSE + ) + + expect_true(is.data.frame(result)) + expect_equal(nrow(result), length(element.subset)) + expect_equal(result$element_id, 0:(length(element.subset) - 1)) + + # Verify Intercept naming (not "(Intercept)") + expect_true(any(grepl("^Intercept\\.", colnames(result)))) + expect_false(any(grepl("\\(Intercept\\)", colnames(result)))) + + # Verify smooth term naming uses underscores [10] + expect_true(any(grepl("^s_age\\.", colnames(result)))) + + # No NAs for valid elements + stat_cols <- setdiff(colnames(result), "element_id") + expect_true(all(is.finite(as.matrix(result[, stat_cols])))) +}) + +# ---- ModelArray.wrap integration ---- + +test_that("ModelArray.wrap produces consistent results after context refactoring", { + h5_path <- system.file("extdata", "n50_fixels.h5", package = "ModelArray") + csv_path <- system.file("extdata", "n50_cohort.csv", package = "ModelArray") + + modelarray <- ModelArray(h5_path, scalar_types = c("FD")) + phenotypes <- read.csv(csv_path) + element.subset <- 1:5 + + my_fun <- function(data, ...) { + list(fd_mean = mean(data$FD, na.rm = TRUE)) + } + + result <- ModelArray.wrap( + FUN = my_fun, + data = modelarray, + phenotypes = phenotypes, + scalar = "FD", + element.subset = element.subset, + n_cores = 1, + pbar = FALSE + ) + + expect_true(is.data.frame(result)) + expect_equal(nrow(result), length(element.subset)) + expect_equal(result$element_id, 0:(length(element.subset) - 1)) + expect_true("fd_mean" %in% colnames(result)) + expect_true(all(is.finite(result$fd_mean))) +})