From 2ee42bdf8744f25205838a2a517a4efbbb559829 Mon Sep 17 00:00:00 2001 From: Teun van den Brand <49372158+teunbrand@users.noreply.github.com> Date: Sat, 24 Dec 2022 16:12:40 +0100 Subject: [PATCH 1/9] Add weighted ecdf --- R/stat-ecdf.r | 67 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 3 deletions(-) diff --git a/R/stat-ecdf.r b/R/stat-ecdf.r index 4a9daccc45..2539f926e8 100644 --- a/R/stat-ecdf.r +++ b/R/stat-ecdf.r @@ -74,7 +74,7 @@ stat_ecdf <- function(mapping = NULL, data = NULL, StatEcdf <- ggproto("StatEcdf", Stat, required_aes = c("x|y"), - default_aes = aes(y = after_stat(y)), + default_aes = aes(y = after_stat(y), weight = NULL), setup_params = function(self, data, params) { params$flipped_aes <- has_flipped_aes(data, params, main_is_orthogonal = FALSE, main_is_continuous = TRUE) @@ -100,7 +100,11 @@ StatEcdf <- ggproto("StatEcdf", Stat, if (pad) { x <- c(-Inf, x, Inf) } - data_ecdf <- ecdf(data$x)(x) + if (is.null(data$weight)) { + data_ecdf <- ecdf(data$x)(x) + } else { + data_ecdf <- wecdf(data$x, data$weight)(x) + } df_ecdf <- data_frame0( x = x, @@ -109,6 +113,63 @@ StatEcdf <- ggproto("StatEcdf", Stat, ) df_ecdf$flipped_aes <- flipped_aes flip_data(df_ecdf, flipped_aes) - } + }, + + dropped_aes = "weight" ) +# Weighted eCDF function +wecdf <- function(x, weights = NULL) { + if (is.null(weights)) { + return(ecdf(x)) + } + weights <- vec_recycle(weights, length(x)) + + # Sort vectors + ord <- order(x, na.last = NA) + x <- x[ord] + weights <- weights[ord] + + if (any(!is.finite(weights))) { + cli::cli_warn(c(paste0( + "The {.field weight} aesthetic does not support non-finite or ", + "{.code NA} values." + ), "i" = "These weights were replaced by {.val 0}.")) + weights[!is.finite(weights)] <- 0 + } + + # `total` replaces `length(x)` + total <- sum(weights) + + if (abs(total) < 1000 * .Machine$double.eps) { + if (total == 0) { + cli::cli_abort(paste0( + "Cannot compute eCDF when the {.field weight} aesthetic sums up to ", + "{.val 0}." + )) + } + cli::cli_warn(c( + "The sum of the {.field weight} aesthetic is close to {.val 0}.", + "i" = "Computed eCDF might be unstable." + )) + } + + # Link each observation to unique value + vals <- unique0(x) + matched <- match(x, vals) + + # Instead of tabulating `matched`, as we would for unweighted `ecdf(x)`, + # we sum weights per unique value of `x` + agg_weights <- vapply( + split(weights, matched), + sum, numeric(1) + ) + + # Like `ecdf(x)`, we return an approx function + approxfun( + vals, + cumsum(agg_weights) / total, + method = "constant", + yleft = 0, yright = 1, + f = 0, ties = "ordered" + ) From 83b6bc5620c9a75403814af5e84793dec32d071a Mon Sep 17 00:00:00 2001 From: Teun van den Brand <49372158+teunbrand@users.noreply.github.com> Date: Sat, 24 Dec 2022 16:12:59 +0100 Subject: [PATCH 2/9] Update docs --- R/stat-ecdf.r | 18 ++++++++++++++++++ man/stat_ecdf.Rd | 30 ++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/R/stat-ecdf.r b/R/stat-ecdf.r index 2539f926e8..aee9279f13 100644 --- a/R/stat-ecdf.r +++ b/R/stat-ecdf.r @@ -20,10 +20,16 @@ #' of points to interpolate with. #' @param pad If `TRUE`, pad the ecdf with additional points (-Inf, 0) #' and (Inf, 1) +#' @eval rd_aesthetics("stat", "ecdf") #' @section Computed variables: #' \describe{ #' \item{y}{cumulative density corresponding x} #' } +#' @section Dropped variables: +#' \describe{ +#' \item{weight}{After calculation, weights of individual observations (if +#' supplied), are no longer available.} +#' } #' @export #' @examples #' set.seed(1) @@ -41,6 +47,17 @@ #' # Multiple ECDFs #' ggplot(df, aes(x, colour = g)) + #' stat_ecdf() +#' +#' # Using weighted eCDF +#' weighted <- data.frame(x = 1:10, weights = c(1:5, 5:1)) +#' plain <- data.frame(x = rep(weighted$x, weighted$weights)) +#' +#' ggplot(plain, aes(x)) + +#' stat_ecdf(linewidth = 1) + +#' stat_ecdf( +#' aes(weight = weights), +#' data = weighted, colour = "green" +#' ) stat_ecdf <- function(mapping = NULL, data = NULL, geom = "step", position = "identity", ..., @@ -173,3 +190,4 @@ wecdf <- function(x, weights = NULL) { yleft = 0, yright = 1, f = 0, ties = "ordered" ) +} diff --git a/man/stat_ecdf.Rd b/man/stat_ecdf.Rd index f7d78f8f33..7038079bd3 100644 --- a/man/stat_ecdf.Rd +++ b/man/stat_ecdf.Rd @@ -86,6 +86,17 @@ use as the input and which to use as the output. Either x or y must be provided and one of them must be unused. The ECDF will be calculated on the given aesthetic and will be output on the unused one. } +\section{Aesthetics}{ + +\code{stat_ecdf()} understands the following aesthetics (required aesthetics are in bold): +\itemize{ +\item \strong{\code{x} \emph{or} \code{y}} +\item \code{group} +\item \code{weight} +} +Learn more about setting these aesthetics in \code{vignette("ggplot2-specs")}. +} + \section{Computed variables}{ \describe{ @@ -93,6 +104,14 @@ and will be output on the unused one. } } +\section{Dropped variables}{ + +\describe{ +\item{weight}{After calculation, weights of individiual observations (if +supplied), are no longer available.} +} +} + \examples{ set.seed(1) df <- data.frame( @@ -109,4 +128,15 @@ ggplot(df, aes(x)) + # Multiple ECDFs ggplot(df, aes(x, colour = g)) + stat_ecdf() + +# Using weighted eCDF +weighted <- data.frame(x = 1:10, weights = c(1:5, 5:1)) +plain <- data.frame(x = rep(weighted$x, weighted$weights)) + +ggplot(plain, aes(x)) + + stat_ecdf(linewidth = 1) + + stat_ecdf( + aes(weight = weights), + data = weighted, colour = "green" + ) } From 836ab78df790723a8dc899ca0555ff5f3f39a05a Mon Sep 17 00:00:00 2001 From: Teun van den Brand <49372158+teunbrand@users.noreply.github.com> Date: Sat, 24 Dec 2022 16:13:23 +0100 Subject: [PATCH 3/9] Add tests for weighted ecdf --- tests/testthat/test-stat-ecdf.R | 41 +++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/testthat/test-stat-ecdf.R b/tests/testthat/test-stat-ecdf.R index adb7a35290..d167ab689d 100644 --- a/tests/testthat/test-stat-ecdf.R +++ b/tests/testthat/test-stat-ecdf.R @@ -15,3 +15,44 @@ test_that("stat_ecdf works in both directions", { expect_snapshot_error(ggplot_build(p)) }) +test_that("weighted ecdf computes sensible results", { + + set.seed(42) + x <- rpois(100, 5) + ux <- sort(unique0(x)) + + # Uniform weights should be the same as the original + expect_equal( + ecdf(x)(ux), + wecdf(x, 10)(ux) + ) + + # Tabulated weights should be the same as the original + tab <- as.data.frame(table(x), stringsAsFactors = FALSE) + tab$x <- as.numeric(tab$x) + expect_equal( + ecdf(x)(ux), + wecdf(tab$x, tab$Freq)(ux) + ) +}) + +test_that("weighted ecdf wanrss about weird weights", { + + # Should warn when provided with illegal weights + expect_warning( + wecdf(1:10, c(NA, rep(1, 9))), + "does not support non-finite" + ) + + # Should warn when provided with near-0 weights + expect_warning( + wecdf(1:10, .Machine$double.eps), + "might be unstable" + ) + + # Should error when weights sum to 0 + expect_error( + wecdf(1:10, rep(c(-1, 1), 5)), + "Cannot compute eCDF" + ) +}) From d22a88bc14e4a8479a199d37a15b8778aca5467f Mon Sep 17 00:00:00 2001 From: Teun van den Brand <49372158+teunbrand@users.noreply.github.com> Date: Sat, 24 Dec 2022 16:14:13 +0100 Subject: [PATCH 4/9] Add NEWS.md bullet --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index 7cdab7224c..ab2753c4b7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,6 @@ # ggplot2 (development version) +* `stat_ecdf()` now has an optional `weight` aesthetic (@teunbrand, #5058). * Fixed a regression in `geom_hex()` where aesthetics were replicated across bins (@thomasp85, #5037 and #5044) * Fixed spurious warning when `weight` aesthetic was used in `stat_smooth()` From 76742c2712cc01411af12c65e09036c5f2a830a8 Mon Sep 17 00:00:00 2001 From: Teun van den Brand <49372158+teunbrand@users.noreply.github.com> Date: Sat, 24 Dec 2022 16:16:17 +0100 Subject: [PATCH 5/9] Fix typo --- tests/testthat/test-stat-ecdf.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-stat-ecdf.R b/tests/testthat/test-stat-ecdf.R index d167ab689d..af4492a13b 100644 --- a/tests/testthat/test-stat-ecdf.R +++ b/tests/testthat/test-stat-ecdf.R @@ -36,7 +36,7 @@ test_that("weighted ecdf computes sensible results", { ) }) -test_that("weighted ecdf wanrss about weird weights", { +test_that("weighted ecdf warns about weird weights", { # Should warn when provided with illegal weights expect_warning( From 8d3e86754893755d702128824cfa0779e77b999f Mon Sep 17 00:00:00 2001 From: Teun van den Brand <49372158+teunbrand@users.noreply.github.com> Date: Sun, 26 Mar 2023 13:44:17 +0200 Subject: [PATCH 6/9] Don't fallback to `ecdf()` --- R/stat-ecdf.r | 5 ++--- tests/testthat/test-stat-ecdf.R | 8 +++++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/R/stat-ecdf.r b/R/stat-ecdf.r index aee9279f13..3f7532bda2 100644 --- a/R/stat-ecdf.r +++ b/R/stat-ecdf.r @@ -137,9 +137,8 @@ StatEcdf <- ggproto("StatEcdf", Stat, # Weighted eCDF function wecdf <- function(x, weights = NULL) { - if (is.null(weights)) { - return(ecdf(x)) - } + + weights <- weights %||% 1 weights <- vec_recycle(weights, length(x)) # Sort vectors diff --git a/tests/testthat/test-stat-ecdf.R b/tests/testthat/test-stat-ecdf.R index af4492a13b..2f2030fd07 100644 --- a/tests/testthat/test-stat-ecdf.R +++ b/tests/testthat/test-stat-ecdf.R @@ -21,10 +21,16 @@ test_that("weighted ecdf computes sensible results", { x <- rpois(100, 5) ux <- sort(unique0(x)) + # Absent weights should be the same as the original + expect_equal( + ecdf(x)(ux), + wecdf(x, NULL)(ux) + ) + # Uniform weights should be the same as the original expect_equal( ecdf(x)(ux), - wecdf(x, 10)(ux) + wecdf(x, pi)(ux) ) # Tabulated weights should be the same as the original From 179849d13a7eb521d82eff6d4a94eb0913a0b5f4 Mon Sep 17 00:00:00 2001 From: Teun van den Brand <49372158+teunbrand@users.noreply.github.com> Date: Sun, 26 Mar 2023 14:04:24 +0200 Subject: [PATCH 7/9] Add clarification in docs --- R/stat-ecdf.R | 4 ++++ man/stat_ecdf.Rd | 10 +++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/R/stat-ecdf.R b/R/stat-ecdf.R index 708ba7adbc..9338a73386 100644 --- a/R/stat-ecdf.R +++ b/R/stat-ecdf.R @@ -12,6 +12,10 @@ #' and one of them must be unused. The ECDF will be calculated on the given aesthetic #' and will be output on the unused one. #' +#' If the `weight` aesthetic is provided, a weighted ECDF will be computed. In +#' this case, the ECDF is incremented by `weight / sum(weight)` instead of +#' `1 / length(x)` for each observation. +#' #' @inheritParams layer #' @inheritParams geom_point #' @param na.rm If `FALSE` (the default), removes missing values with diff --git a/man/stat_ecdf.Rd b/man/stat_ecdf.Rd index c20d7d8e03..f068d1d24b 100644 --- a/man/stat_ecdf.Rd +++ b/man/stat_ecdf.Rd @@ -85,13 +85,17 @@ The statistic relies on the aesthetics assignment to guess which variable to use as the input and which to use as the output. Either x or y must be provided and one of them must be unused. The ECDF will be calculated on the given aesthetic and will be output on the unused one. + +If the \code{weight} aesthetic is provided, a weighted ECDF will be computed. In +this case, the ECDF is incremented by \code{weight / sum(weight)} instead of +\code{1 / length(x)} for each observation. } \section{Aesthetics}{ \code{stat_ecdf()} understands the following aesthetics (required aesthetics are in bold): \itemize{ -\item \strong{\code{x} \emph{or} \code{y}} -\item \code{group} +\item \strong{\code{\link[=aes_position]{x}} \emph{or} \code{\link[=aes_position]{y}}} +\item \code{\link[=aes_group_order]{group}} \item \code{weight} } Learn more about setting these aesthetics in \code{vignette("ggplot2-specs")}. @@ -109,7 +113,7 @@ These are calculated by the 'stat' part of layers and can be accessed with \link \section{Dropped variables}{ \describe{ -\item{weight}{After calculation, weights of individiual observations (if +\item{weight}{After calculation, weights of individual observations (if supplied), are no longer available.} } } From 4c9ba927b61efa3be63904970d009f1cc9ae35de Mon Sep 17 00:00:00 2001 From: Teun van den Brand Date: Mon, 20 May 2024 10:43:48 +0200 Subject: [PATCH 8/9] use wecdf --- R/stat-ecdf.R | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/R/stat-ecdf.R b/R/stat-ecdf.R index 9338a73386..fc55dc2e04 100644 --- a/R/stat-ecdf.R +++ b/R/stat-ecdf.R @@ -121,11 +121,7 @@ StatEcdf <- ggproto("StatEcdf", Stat, if (pad) { x <- c(-Inf, x, Inf) } - if (is.null(data$weight)) { - data_ecdf <- ecdf(data$x)(x) - } else { - data_ecdf <- wecdf(data$x, data$weight)(x) - } + data_ecdf <- wecdf(data$x, data$weight)(x) df_ecdf <- data_frame0( x = x, From c06506bb3c10ce40f08599ce3388800d0e394d24 Mon Sep 17 00:00:00 2001 From: Teun van den Brand Date: Mon, 20 May 2024 10:51:50 +0200 Subject: [PATCH 9/9] cleanup news --- NEWS.md | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/NEWS.md b/NEWS.md index 391565a34a..8a31cd598a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,19 +1,6 @@ # ggplot2 (development version) * `stat_ecdf()` now has an optional `weight` aesthetic (@teunbrand, #5058). -* To improve `width` calculation in bar plots with empty factor levels, - `resolution()` considers `mapped_discrete` values as having resolution 1 - (@teunbrand, #5211) -* When `geom_path()` has aesthetics varying within groups, the `arrow()` is - applied to groups instead of individual segments (@teunbrand, #4935). -* The default width of `geom_bar()` is now based on panel-wise resolution of - the data, rather than global resolution (@teunbrand, #4336). -* To apply dodging more consistently in violin plots, `stat_ydensity()` now - has a `drop` argument to keep or discard groups with 1 observation. -* Aesthetics listed in `geom_*()` and `stat_*()` layers now point to relevant - documentation (@teunbrand, #5123). -* A function can be provided to `labs(alt = <...>)` that takes the plot as input - and returns text as output (@teunbrand, #4795). * Position scales combined with `coord_sf()` can now use functions in the `breaks` argument. In addition, `n.breaks` works as intended and `breaks = NULL` removes grid lines and axes (@teunbrand, #4622).