diff --git a/NEWS.md b/NEWS.md index 50544dcce6..8a31cd598a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,6 @@ # ggplot2 (development version) -* A function can be provided to `labs(alt = <...>)` that takes the plot as input - and returns text as output (@teunbrand, #4795). +* `stat_ecdf()` now has an optional `weight` aesthetic (@teunbrand, #5058). * Position scales combined with `coord_sf()` can now use functions in the `breaks` argument. In addition, `n.breaks` works as intended and `breaks = NULL` removes grid lines and axes (@teunbrand, #4622). diff --git a/R/stat-ecdf.R b/R/stat-ecdf.R index 5f7e5fdd30..fc55dc2e04 100644 --- a/R/stat-ecdf.R +++ b/R/stat-ecdf.R @@ -12,6 +12,10 @@ #' and one of them must be unused. The ECDF will be calculated on the given aesthetic #' and will be output on the unused one. #' +#' If the `weight` aesthetic is provided, a weighted ECDF will be computed. In +#' this case, the ECDF is incremented by `weight / sum(weight)` instead of +#' `1 / length(x)` for each observation. +#' #' @inheritParams layer #' @inheritParams geom_point #' @param na.rm If `FALSE` (the default), removes missing values with @@ -20,10 +24,16 @@ #' of points to interpolate with. #' @param pad If `TRUE`, pad the ecdf with additional points (-Inf, 0) #' and (Inf, 1) +#' @eval rd_aesthetics("stat", "ecdf") #' @eval rd_computed_vars( #' ecdf = "Cumulative density corresponding to `x`.", #' y = "`r lifecycle::badge('superseded')` For backward compatibility." #' ) +#' @section Dropped variables: +#' \describe{ +#' \item{weight}{After calculation, weights of individual observations (if +#' supplied), are no longer available.} +#' } #' @export #' @examples #' set.seed(1) @@ -41,6 +51,17 @@ #' # Multiple ECDFs #' ggplot(df, aes(x, colour = g)) + #' stat_ecdf() +#' +#' # Using weighted eCDF +#' weighted <- data.frame(x = 1:10, weights = c(1:5, 5:1)) +#' plain <- data.frame(x = rep(weighted$x, weighted$weights)) +#' +#' ggplot(plain, aes(x)) + +#' stat_ecdf(linewidth = 1) + +#' stat_ecdf( +#' aes(weight = weights), +#' data = weighted, colour = "green" +#' ) stat_ecdf <- function(mapping = NULL, data = NULL, geom = "step", position = "identity", ..., @@ -74,7 +95,7 @@ stat_ecdf <- function(mapping = NULL, data = NULL, StatEcdf <- ggproto("StatEcdf", Stat, required_aes = c("x|y"), - default_aes = aes(x = after_stat(ecdf), y = after_stat(ecdf)), + default_aes = aes(x = after_stat(ecdf), y = after_stat(ecdf), weight = NULL), setup_params = function(self, data, params) { params$flipped_aes <- has_flipped_aes(data, params, main_is_orthogonal = FALSE, main_is_continuous = TRUE) @@ -100,7 +121,7 @@ StatEcdf <- ggproto("StatEcdf", Stat, if (pad) { x <- c(-Inf, x, Inf) } - data_ecdf <- stats::ecdf(data$x)(x) + data_ecdf <- wecdf(data$x, data$weight)(x) df_ecdf <- data_frame0( x = x, @@ -110,6 +131,63 @@ StatEcdf <- ggproto("StatEcdf", Stat, ) df_ecdf$flipped_aes <- flipped_aes flip_data(df_ecdf, flipped_aes) - } + }, + + dropped_aes = "weight" ) +# Weighted eCDF function +wecdf <- function(x, weights = NULL) { + + weights <- weights %||% 1 + weights <- vec_recycle(weights, length(x)) + + # Sort vectors + ord <- order(x, na.last = NA) + x <- x[ord] + weights <- weights[ord] + + if (any(!is.finite(weights))) { + cli::cli_warn(c(paste0( + "The {.field weight} aesthetic does not support non-finite or ", + "{.code NA} values." + ), "i" = "These weights were replaced by {.val 0}.")) + weights[!is.finite(weights)] <- 0 + } + + # `total` replaces `length(x)` + total <- sum(weights) + + if (abs(total) < 1000 * .Machine$double.eps) { + if (total == 0) { + cli::cli_abort(paste0( + "Cannot compute eCDF when the {.field weight} aesthetic sums up to ", + "{.val 0}." + )) + } + cli::cli_warn(c( + "The sum of the {.field weight} aesthetic is close to {.val 0}.", + "i" = "Computed eCDF might be unstable." + )) + } + + # Link each observation to unique value + vals <- unique0(x) + matched <- match(x, vals) + + # Instead of tabulating `matched`, as we would for unweighted `ecdf(x)`, + # we sum weights per unique value of `x` + agg_weights <- vapply( + split(weights, matched), + sum, numeric(1) + ) + + # Like `ecdf(x)`, we return an approx function + approxfun( + vals, + cumsum(agg_weights) / total, + method = "constant", + yleft = 0, yright = 1, + f = 0, ties = "ordered" + ) +} diff --git a/man/stat_ecdf.Rd b/man/stat_ecdf.Rd index 2a8e6c80e7..d0941b2b56 100644 --- a/man/stat_ecdf.Rd +++ b/man/stat_ecdf.Rd @@ -125,7 +125,22 @@ The statistic relies on the aesthetics assignment to guess which variable to use as the input and which to use as the output. Either x or y must be provided and one of them must be unused. The ECDF will be calculated on the given aesthetic and will be output on the unused one. + +If the \code{weight} aesthetic is provided, a weighted ECDF will be computed. In +this case, the ECDF is incremented by \code{weight / sum(weight)} instead of +\code{1 / length(x)} for each observation. +} +\section{Aesthetics}{ + +\code{stat_ecdf()} understands the following aesthetics (required aesthetics are in bold): +\itemize{ +\item \strong{\code{\link[=aes_position]{x}} \emph{or} \code{\link[=aes_position]{y}}} +\item \code{\link[=aes_group_order]{group}} +\item \code{weight} +} +Learn more about setting these aesthetics in \code{vignette("ggplot2-specs")}. } + \section{Computed variables}{ These are calculated by the 'stat' part of layers and can be accessed with \link[=aes_eval]{delayed evaluation}. @@ -135,6 +150,14 @@ These are calculated by the 'stat' part of layers and can be accessed with \link } } +\section{Dropped variables}{ + +\describe{ +\item{weight}{After calculation, weights of individual observations (if +supplied), are no longer available.} +} +} + \examples{ set.seed(1) df <- data.frame( @@ -151,4 +174,15 @@ ggplot(df, aes(x)) + # Multiple ECDFs ggplot(df, aes(x, colour = g)) + stat_ecdf() + +# Using weighted eCDF +weighted <- data.frame(x = 1:10, weights = c(1:5, 5:1)) +plain <- data.frame(x = rep(weighted$x, weighted$weights)) + +ggplot(plain, aes(x)) + + stat_ecdf(linewidth = 1) + + stat_ecdf( + aes(weight = weights), + data = weighted, colour = "green" + ) } diff --git a/tests/testthat/test-stat-ecdf.R b/tests/testthat/test-stat-ecdf.R index 6fd8297e18..399dfbebb0 100644 --- a/tests/testthat/test-stat-ecdf.R +++ b/tests/testthat/test-stat-ecdf.R @@ -15,6 +15,54 @@ test_that("stat_ecdf works in both directions", { expect_snapshot_error(ggplot_build(p)) }) +test_that("weighted ecdf computes sensible results", { + + set.seed(42) + x <- rpois(100, 5) + ux <- sort(unique0(x)) + + # Absent weights should be the same as the original + expect_equal( + ecdf(x)(ux), + wecdf(x, NULL)(ux) + ) + + # Uniform weights should be the same as the original + expect_equal( + ecdf(x)(ux), + wecdf(x, pi)(ux) + ) + + # Tabulated weights should be the same as the original + tab <- as.data.frame(table(x), stringsAsFactors = FALSE) + tab$x <- as.numeric(tab$x) + expect_equal( + ecdf(x)(ux), + wecdf(tab$x, tab$Freq)(ux) + ) +}) + +test_that("weighted ecdf warns about weird weights", { + + # Should warn when provided with illegal weights + expect_warning( + wecdf(1:10, c(NA, rep(1, 9))), + "does not support non-finite" + ) + + # Should warn when provided with near-0 weights + expect_warning( + wecdf(1:10, .Machine$double.eps), + "might be unstable" + ) + + # Should error when weights sum to 0 + expect_error( + wecdf(1:10, rep(c(-1, 1), 5)), + "Cannot compute eCDF" + ) +}) + # See #5113 and #5112 test_that("stat_ecdf responds to axis transformations", { n <- 4