Skip to content

Commit 959ef4b

Browse files
committed
feat: add yeo-johnson
* step and layer work with a single outcome and layer_yj(.pred) * need to work on multiple outcomes case
1 parent 7cd135f commit 959ef4b

File tree

5 files changed

+850
-1
lines changed

5 files changed

+850
-1
lines changed
Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
#' Unormalizing transformation
2+
#'
3+
#' Will undo a step_epi_YeoJohnson transformation.
4+
#'
5+
#' @param frosting a `frosting` postprocessor. The layer will be added to the
6+
#' sequence of operations for this frosting.
7+
#' @param ... One or more selector functions to scale variables
8+
#' for this step. See [recipes::selections()] for more details.
9+
#' @param df a data frame that contains the population data to be used for
10+
#' inverting the existing scaling.
11+
#' @param by A (possibly named) character vector of variables to join by.
12+
#' @param id a random id string
13+
#'
14+
#' @return an updated `frosting` postprocessor
15+
#' @export
16+
#' @examples
17+
#' library(dplyr)
18+
#' jhu <- epidatasets::cases_deaths_subset %>%
19+
#' filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>%
20+
#' select(geo_value, time_value, cases)
21+
#'
22+
#' pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000))
23+
#'
24+
#' r <- epi_recipe(jhu) %>%
25+
#' step_epi_YeoJohnson(
26+
#' df = pop_data,
27+
#' df_pop_col = "value",
28+
#' by = c("geo_value" = "states"),
29+
#' cases, suffix = "_scaled"
30+
#' ) %>%
31+
#' step_epi_lag(cases_scaled, lag = c(0, 7, 14)) %>%
32+
#' step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") %>%
33+
#' step_epi_naomit()
34+
#'
35+
#' f <- frosting() %>%
36+
#' layer_predict() %>%
37+
#' layer_threshold(.pred) %>%
38+
#' layer_naomit(.pred) %>%
39+
#' layer_epi_YeoJohnson(.pred,
40+
#' df = pop_data,
41+
#' by = c("geo_value" = "states"),
42+
#' df_pop_col = "value"
43+
#' )
44+
#'
45+
#' wf <- epi_workflow(r, linear_reg()) %>%
46+
#' fit(jhu) %>%
47+
#' add_frosting(f)
48+
#'
49+
#' forecast(wf)
50+
layer_epi_YeoJohnson <- function(frosting, ..., lambdas = NULL, by = NULL, id = rand_id("epi_YeoJohnson")) {
51+
checkmate::assert_tibble(lambdas, min.rows = 1, null.ok = TRUE)
52+
53+
add_layer(
54+
frosting,
55+
layer_epi_YeoJohnson_new(
56+
lambdas = lambdas,
57+
by = by,
58+
terms = dplyr::enquos(...),
59+
id = id
60+
)
61+
)
62+
}
63+
64+
layer_epi_YeoJohnson_new <- function(lambdas, by, terms, id) {
65+
layer("epi_YeoJohnson", lambdas = lambdas, by = by, terms = terms, id = id)
66+
}
67+
68+
#' @export
69+
#' @importFrom workflows extract_preprocessor
70+
slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, ...) {
71+
rlang::check_dots_empty()
72+
73+
# Get the lambdas from the layer or from the workflow.
74+
lambdas <- object$lambdas %||% get_lambdas_in_layer(workflow)
75+
76+
# If the by is not specified, try to infer it from the lambdas.
77+
if (is.null(object$by)) {
78+
# Assume `layer_predict` has calculated the prediction keys and other
79+
# layers don't change the prediction key colnames:
80+
prediction_key_colnames <- names(components$keys)
81+
lhs_potential_keys <- prediction_key_colnames
82+
rhs_potential_keys <- colnames(select(lambdas, -starts_with("lambda_")))
83+
object$by <- intersect(lhs_potential_keys, rhs_potential_keys)
84+
suggested_min_keys <- setdiff(lhs_potential_keys, "time_value")
85+
if (!all(suggested_min_keys %in% object$by)) {
86+
cli_warn(
87+
c(
88+
"{setdiff(suggested_min_keys, object$by)} {?was an/were} epikey column{?s} in the predictions,
89+
but {?wasn't/weren't} found in the population `df`.",
90+
"i" = "Defaulting to join by {object$by}",
91+
">" = "Double-check whether column names on the population `df` match those expected in your predictions",
92+
">" = "Consider using population data with breakdowns by {suggested_min_keys}",
93+
">" = "Manually specify `by =` to silence"
94+
),
95+
class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys"
96+
)
97+
}
98+
}
99+
100+
# Establish the join columns.
101+
object$by <- object$by %||%
102+
intersect(
103+
epipredict:::epi_keys_only(components$predictions),
104+
colnames(select(lambdas, -starts_with("lambda_")))
105+
)
106+
joinby <- list(x = names(object$by) %||% object$by, y = object$by)
107+
hardhat::validate_column_names(components$predictions, joinby$x)
108+
hardhat::validate_column_names(lambdas, joinby$y)
109+
110+
# Join the lambdas.
111+
components$predictions <- inner_join(
112+
components$predictions,
113+
lambdas,
114+
by = object$by,
115+
relationship = "many-to-one",
116+
unmatched = c("error", "drop")
117+
)
118+
119+
# TODO: There are many possibilities here:
120+
# - (a) the terms can be empty, where we should probably default to
121+
# all_outcomes().
122+
# - (b) explicitly giving all_outcomes(), we end here with terms being empty,
123+
# which doesn't seem right; need to make sure we pull in all the outcome
124+
# columns here. The question is what form should they have?
125+
# - (c) if the user just specifies .pred, then we have to infer the outcome
126+
# from the mold, which is simple enough and the main case I have working.
127+
# - (d) the user might specify outcomes of the form .pred_ahead_1_cases,
128+
# .pred_ahead_7_cases, etc. Is that the right format? Trying those out now
129+
# and getting errors downstream from forecast().
130+
# Get the columns to transform.
131+
exprs <- rlang::expr(c(!!!object$terms))
132+
pos <- tidyselect::eval_select(exprs, components$predictions)
133+
col_names <- names(pos)
134+
135+
# For every column, we need to use the appropriate lambda column, which differs per row.
136+
# Note that yj_inverse() is vectorized.
137+
if (identical(col_names, ".pred")) {
138+
# In this case, we don't get a hint for the outcome column name, so we need to
139+
# infer it from the mold. `outcomes` is a vector of objects like
140+
# ahead_1_cases, ahead_7_cases, etc. We want to extract the cases part.
141+
outcome_cols <- names(components$mold$outcomes) %>%
142+
stringr::str_match("ahead_\\d+_(.*)") %>%
143+
extract(, 2)
144+
145+
components$predictions <- components$predictions %>%
146+
rowwise() %>%
147+
mutate(.pred := yj_inverse(.pred, !!sym(paste0("lambda_", outcome_cols))))
148+
} else if (identical(col_names, character(0))) {
149+
# In this case, we should assume the user wants to transform all outcomes.
150+
cli::cli_abort("Not specifying columns to layer Yeo-Johnson is not implemented yet.", call = rlang::caller_env())
151+
} else {
152+
# In this case, we assume that the user has specified the columns they want
153+
# transformed here. We then need to determine the lambda columns for each of
154+
# these columns. That is, we need to convert a vector of column names like
155+
# c(".pred_ahead_1_case_rate", ".pred_ahead_7_case_rate") to
156+
# c("lambda_ahead_1_case_rate", "lambda_ahead_7_case_rate").
157+
original_outcome_cols <- str_match(col_names, ".pred_ahead_\\d+_(.*)")[, 2]
158+
if (all(original_outcome_cols %nin% names(components$mold$outcomes))) {
159+
cli_abort("All columns specified in `...` must be outcome columns.", call = rlang::caller_env())
160+
}
161+
162+
for (i in seq_along(col_names)) {
163+
col <- col_names[i]
164+
lambda_col <- paste0("lambda_", original_outcome_cols[i])
165+
components$predictions <- components$predictions %>%
166+
rowwise() %>%
167+
mutate(!!sym(col) := yj_inverse(!!sym(col), !!sym(lambda_col)))
168+
}
169+
}
170+
171+
# Remove the lambda columns.
172+
components$predictions <- components$predictions %>%
173+
select(-any_of(starts_with("lambda_"))) %>%
174+
ungroup()
175+
components
176+
}
177+
178+
#' @export
179+
print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30), ...) {
180+
title <- "Yeo-Johnson transformation (see `lambdas` object for values) on "
181+
epipredict:::print_layer(x$terms, title = title, width = width)
182+
}
183+
184+
#' Inverse Yeo-Johnson transformation
185+
#'
186+
#' Inverse of `yj_transform` in step_yeo_johnson.R.
187+
#'
188+
#' @keywords internal
189+
yj_inverse <- function(x, lambda, eps = 0.001) {
190+
if (is.na(lambda)) {
191+
return(x)
192+
}
193+
if (!inherits(x, "tbl_df") || is.data.frame(x)) {
194+
x <- unlist(x, use.names = FALSE)
195+
} else {
196+
if (!is.vector(x)) {
197+
x <- as.vector(x)
198+
}
199+
}
200+
201+
dat_neg <- x < 0
202+
ind_neg <- list(is = which(dat_neg), not = which(!dat_neg))
203+
not_neg <- ind_neg[["not"]]
204+
is_neg <- ind_neg[["is"]]
205+
206+
nn_inv_trans <- function(x, lambda) {
207+
if (abs(lambda) < eps) {
208+
# log(x + 1)
209+
exp(x) - 1
210+
} else {
211+
# ((x + 1)^lambda - 1) / lambda
212+
(lambda * x + 1)^(1 / lambda) - 1
213+
}
214+
}
215+
216+
ng_inv_trans <- function(x, lambda) {
217+
if (abs(lambda - 2) < eps) {
218+
# -log(-x + 1)
219+
-(exp(-x) - 1)
220+
} else {
221+
# -((-x + 1)^(2 - lambda) - 1) / (2 - lambda)
222+
-(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1)
223+
}
224+
}
225+
226+
if (length(not_neg) > 0) {
227+
x[not_neg] <- nn_inv_trans(x[not_neg], lambda)
228+
}
229+
230+
if (length(is_neg) > 0) {
231+
x[is_neg] <- ng_inv_trans(x[is_neg], lambda)
232+
}
233+
x
234+
}
235+
236+
get_lambdas_in_layer <- function(workflow) {
237+
this_recipe <- hardhat::extract_recipe(workflow)
238+
if (!(this_recipe %>% recipes::detect_step("epi_YeoJohnson"))) {
239+
cli_abort("`layer_epi_YeoJohnson` requires `step_epi_YeoJohnson` in the recipe.", call = rlang::caller_env())
240+
}
241+
for (step in this_recipe$steps) {
242+
if (inherits(step, "step_epi_YeoJohnson")) {
243+
lambdas <- step$lambdas
244+
break
245+
}
246+
}
247+
lambdas
248+
}

0 commit comments

Comments
 (0)