|
| 1 | +#' Unormalizing transformation |
| 2 | +#' |
| 3 | +#' Will undo a step_epi_YeoJohnson transformation. |
| 4 | +#' |
| 5 | +#' @param frosting a `frosting` postprocessor. The layer will be added to the |
| 6 | +#' sequence of operations for this frosting. |
| 7 | +#' @param ... One or more selector functions to scale variables |
| 8 | +#' for this step. See [recipes::selections()] for more details. |
| 9 | +#' @param df a data frame that contains the population data to be used for |
| 10 | +#' inverting the existing scaling. |
| 11 | +#' @param by A (possibly named) character vector of variables to join by. |
| 12 | +#' @param id a random id string |
| 13 | +#' |
| 14 | +#' @return an updated `frosting` postprocessor |
| 15 | +#' @export |
| 16 | +#' @examples |
| 17 | +#' library(dplyr) |
| 18 | +#' jhu <- epidatasets::cases_deaths_subset %>% |
| 19 | +#' filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% |
| 20 | +#' select(geo_value, time_value, cases) |
| 21 | +#' |
| 22 | +#' pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000)) |
| 23 | +#' |
| 24 | +#' r <- epi_recipe(jhu) %>% |
| 25 | +#' step_epi_YeoJohnson( |
| 26 | +#' df = pop_data, |
| 27 | +#' df_pop_col = "value", |
| 28 | +#' by = c("geo_value" = "states"), |
| 29 | +#' cases, suffix = "_scaled" |
| 30 | +#' ) %>% |
| 31 | +#' step_epi_lag(cases_scaled, lag = c(0, 7, 14)) %>% |
| 32 | +#' step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") %>% |
| 33 | +#' step_epi_naomit() |
| 34 | +#' |
| 35 | +#' f <- frosting() %>% |
| 36 | +#' layer_predict() %>% |
| 37 | +#' layer_threshold(.pred) %>% |
| 38 | +#' layer_naomit(.pred) %>% |
| 39 | +#' layer_epi_YeoJohnson(.pred, |
| 40 | +#' df = pop_data, |
| 41 | +#' by = c("geo_value" = "states"), |
| 42 | +#' df_pop_col = "value" |
| 43 | +#' ) |
| 44 | +#' |
| 45 | +#' wf <- epi_workflow(r, linear_reg()) %>% |
| 46 | +#' fit(jhu) %>% |
| 47 | +#' add_frosting(f) |
| 48 | +#' |
| 49 | +#' forecast(wf) |
| 50 | +layer_epi_YeoJohnson <- function(frosting, ..., lambdas = NULL, by = NULL, id = rand_id("epi_YeoJohnson")) { |
| 51 | + checkmate::assert_tibble(lambdas, min.rows = 1, null.ok = TRUE) |
| 52 | + |
| 53 | + add_layer( |
| 54 | + frosting, |
| 55 | + layer_epi_YeoJohnson_new( |
| 56 | + lambdas = lambdas, |
| 57 | + by = by, |
| 58 | + terms = dplyr::enquos(...), |
| 59 | + id = id |
| 60 | + ) |
| 61 | + ) |
| 62 | +} |
| 63 | + |
| 64 | +layer_epi_YeoJohnson_new <- function(lambdas, by, terms, id) { |
| 65 | + layer("epi_YeoJohnson", lambdas = lambdas, by = by, terms = terms, id = id) |
| 66 | +} |
| 67 | + |
| 68 | +#' @export |
| 69 | +#' @importFrom workflows extract_preprocessor |
| 70 | +slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, ...) { |
| 71 | + rlang::check_dots_empty() |
| 72 | + |
| 73 | + # Get the lambdas from the layer or from the workflow. |
| 74 | + lambdas <- object$lambdas %||% get_lambdas_in_layer(workflow) |
| 75 | + |
| 76 | + # If the by is not specified, try to infer it from the lambdas. |
| 77 | + if (is.null(object$by)) { |
| 78 | + # Assume `layer_predict` has calculated the prediction keys and other |
| 79 | + # layers don't change the prediction key colnames: |
| 80 | + prediction_key_colnames <- names(components$keys) |
| 81 | + lhs_potential_keys <- prediction_key_colnames |
| 82 | + rhs_potential_keys <- colnames(select(lambdas, -starts_with("lambda_"))) |
| 83 | + object$by <- intersect(lhs_potential_keys, rhs_potential_keys) |
| 84 | + suggested_min_keys <- setdiff(lhs_potential_keys, "time_value") |
| 85 | + if (!all(suggested_min_keys %in% object$by)) { |
| 86 | + cli_warn( |
| 87 | + c( |
| 88 | + "{setdiff(suggested_min_keys, object$by)} {?was an/were} epikey column{?s} in the predictions, |
| 89 | + but {?wasn't/weren't} found in the population `df`.", |
| 90 | + "i" = "Defaulting to join by {object$by}", |
| 91 | + ">" = "Double-check whether column names on the population `df` match those expected in your predictions", |
| 92 | + ">" = "Consider using population data with breakdowns by {suggested_min_keys}", |
| 93 | + ">" = "Manually specify `by =` to silence" |
| 94 | + ), |
| 95 | + class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys" |
| 96 | + ) |
| 97 | + } |
| 98 | + } |
| 99 | + |
| 100 | + # Establish the join columns. |
| 101 | + object$by <- object$by %||% |
| 102 | + intersect( |
| 103 | + epipredict:::epi_keys_only(components$predictions), |
| 104 | + colnames(select(lambdas, -starts_with("lambda_"))) |
| 105 | + ) |
| 106 | + joinby <- list(x = names(object$by) %||% object$by, y = object$by) |
| 107 | + hardhat::validate_column_names(components$predictions, joinby$x) |
| 108 | + hardhat::validate_column_names(lambdas, joinby$y) |
| 109 | + |
| 110 | + # Join the lambdas. |
| 111 | + components$predictions <- inner_join( |
| 112 | + components$predictions, |
| 113 | + lambdas, |
| 114 | + by = object$by, |
| 115 | + relationship = "many-to-one", |
| 116 | + unmatched = c("error", "drop") |
| 117 | + ) |
| 118 | + |
| 119 | + # TODO: There are many possibilities here: |
| 120 | + # - (a) the terms can be empty, where we should probably default to |
| 121 | + # all_outcomes(). |
| 122 | + # - (b) explicitly giving all_outcomes(), we end here with terms being empty, |
| 123 | + # which doesn't seem right; need to make sure we pull in all the outcome |
| 124 | + # columns here. The question is what form should they have? |
| 125 | + # - (c) if the user just specifies .pred, then we have to infer the outcome |
| 126 | + # from the mold, which is simple enough and the main case I have working. |
| 127 | + # - (d) the user might specify outcomes of the form .pred_ahead_1_cases, |
| 128 | + # .pred_ahead_7_cases, etc. Is that the right format? Trying those out now |
| 129 | + # and getting errors downstream from forecast(). |
| 130 | + # Get the columns to transform. |
| 131 | + exprs <- rlang::expr(c(!!!object$terms)) |
| 132 | + pos <- tidyselect::eval_select(exprs, components$predictions) |
| 133 | + col_names <- names(pos) |
| 134 | + |
| 135 | + # For every column, we need to use the appropriate lambda column, which differs per row. |
| 136 | + # Note that yj_inverse() is vectorized. |
| 137 | + if (identical(col_names, ".pred")) { |
| 138 | + # In this case, we don't get a hint for the outcome column name, so we need to |
| 139 | + # infer it from the mold. `outcomes` is a vector of objects like |
| 140 | + # ahead_1_cases, ahead_7_cases, etc. We want to extract the cases part. |
| 141 | + outcome_cols <- names(components$mold$outcomes) %>% |
| 142 | + stringr::str_match("ahead_\\d+_(.*)") %>% |
| 143 | + extract(, 2) |
| 144 | + |
| 145 | + components$predictions <- components$predictions %>% |
| 146 | + rowwise() %>% |
| 147 | + mutate(.pred := yj_inverse(.pred, !!sym(paste0("lambda_", outcome_cols)))) |
| 148 | + } else if (identical(col_names, character(0))) { |
| 149 | + # In this case, we should assume the user wants to transform all outcomes. |
| 150 | + cli::cli_abort("Not specifying columns to layer Yeo-Johnson is not implemented yet.", call = rlang::caller_env()) |
| 151 | + } else { |
| 152 | + # In this case, we assume that the user has specified the columns they want |
| 153 | + # transformed here. We then need to determine the lambda columns for each of |
| 154 | + # these columns. That is, we need to convert a vector of column names like |
| 155 | + # c(".pred_ahead_1_case_rate", ".pred_ahead_7_case_rate") to |
| 156 | + # c("lambda_ahead_1_case_rate", "lambda_ahead_7_case_rate"). |
| 157 | + original_outcome_cols <- str_match(col_names, ".pred_ahead_\\d+_(.*)")[, 2] |
| 158 | + if (all(original_outcome_cols %nin% names(components$mold$outcomes))) { |
| 159 | + cli_abort("All columns specified in `...` must be outcome columns.", call = rlang::caller_env()) |
| 160 | + } |
| 161 | + |
| 162 | + for (i in seq_along(col_names)) { |
| 163 | + col <- col_names[i] |
| 164 | + lambda_col <- paste0("lambda_", original_outcome_cols[i]) |
| 165 | + components$predictions <- components$predictions %>% |
| 166 | + rowwise() %>% |
| 167 | + mutate(!!sym(col) := yj_inverse(!!sym(col), !!sym(lambda_col))) |
| 168 | + } |
| 169 | + } |
| 170 | + |
| 171 | + # Remove the lambda columns. |
| 172 | + components$predictions <- components$predictions %>% |
| 173 | + select(-any_of(starts_with("lambda_"))) %>% |
| 174 | + ungroup() |
| 175 | + components |
| 176 | +} |
| 177 | + |
| 178 | +#' @export |
| 179 | +print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30), ...) { |
| 180 | + title <- "Yeo-Johnson transformation (see `lambdas` object for values) on " |
| 181 | + epipredict:::print_layer(x$terms, title = title, width = width) |
| 182 | +} |
| 183 | + |
| 184 | +#' Inverse Yeo-Johnson transformation |
| 185 | +#' |
| 186 | +#' Inverse of `yj_transform` in step_yeo_johnson.R. |
| 187 | +#' |
| 188 | +#' @keywords internal |
| 189 | +yj_inverse <- function(x, lambda, eps = 0.001) { |
| 190 | + if (is.na(lambda)) { |
| 191 | + return(x) |
| 192 | + } |
| 193 | + if (!inherits(x, "tbl_df") || is.data.frame(x)) { |
| 194 | + x <- unlist(x, use.names = FALSE) |
| 195 | + } else { |
| 196 | + if (!is.vector(x)) { |
| 197 | + x <- as.vector(x) |
| 198 | + } |
| 199 | + } |
| 200 | + |
| 201 | + dat_neg <- x < 0 |
| 202 | + ind_neg <- list(is = which(dat_neg), not = which(!dat_neg)) |
| 203 | + not_neg <- ind_neg[["not"]] |
| 204 | + is_neg <- ind_neg[["is"]] |
| 205 | + |
| 206 | + nn_inv_trans <- function(x, lambda) { |
| 207 | + if (abs(lambda) < eps) { |
| 208 | + # log(x + 1) |
| 209 | + exp(x) - 1 |
| 210 | + } else { |
| 211 | + # ((x + 1)^lambda - 1) / lambda |
| 212 | + (lambda * x + 1)^(1 / lambda) - 1 |
| 213 | + } |
| 214 | + } |
| 215 | + |
| 216 | + ng_inv_trans <- function(x, lambda) { |
| 217 | + if (abs(lambda - 2) < eps) { |
| 218 | + # -log(-x + 1) |
| 219 | + -(exp(-x) - 1) |
| 220 | + } else { |
| 221 | + # -((-x + 1)^(2 - lambda) - 1) / (2 - lambda) |
| 222 | + -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1) |
| 223 | + } |
| 224 | + } |
| 225 | + |
| 226 | + if (length(not_neg) > 0) { |
| 227 | + x[not_neg] <- nn_inv_trans(x[not_neg], lambda) |
| 228 | + } |
| 229 | + |
| 230 | + if (length(is_neg) > 0) { |
| 231 | + x[is_neg] <- ng_inv_trans(x[is_neg], lambda) |
| 232 | + } |
| 233 | + x |
| 234 | +} |
| 235 | + |
| 236 | +get_lambdas_in_layer <- function(workflow) { |
| 237 | + this_recipe <- hardhat::extract_recipe(workflow) |
| 238 | + if (!(this_recipe %>% recipes::detect_step("epi_YeoJohnson"))) { |
| 239 | + cli_abort("`layer_epi_YeoJohnson` requires `step_epi_YeoJohnson` in the recipe.", call = rlang::caller_env()) |
| 240 | + } |
| 241 | + for (step in this_recipe$steps) { |
| 242 | + if (inherits(step, "step_epi_YeoJohnson")) { |
| 243 | + lambdas <- step$lambdas |
| 244 | + break |
| 245 | + } |
| 246 | + } |
| 247 | + lambdas |
| 248 | +} |
0 commit comments