diff --git a/NEWS.md b/NEWS.md index df68e55fd5..793a9dd82f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # ggplot2 (development version) +* `geom_density()` and `stat_density()` now support `bounds` argument + to estimate density with boundary correction (@echasnovski, #4013). + * ggplot now checks during statistical transformations whether any data columns were dropped and warns about this. If stats intend to drop data columns they can declare them in the new field `dropped_aes`. diff --git a/R/geom-density.r b/R/geom-density.r index d8e3f72263..bcffcf18c3 100644 --- a/R/geom-density.r +++ b/R/geom-density.r @@ -35,6 +35,12 @@ #' geom_density(alpha = 0.1) + #' xlim(55, 70) #' +#' # Use `bounds` to adjust computation for known data limits +#' big_diamonds <- diamonds[diamonds$carat >= 1, ] +#' ggplot(big_diamonds, aes(carat)) + +#' geom_density(color = 'red') + +#' geom_density(bounds = c(1, Inf), color = 'blue') +#' #' \donttest{ #' # Stacked density plots: if you want to create a stacked density plot, you #' # probably want to 'count' (density * n) variable instead of the default diff --git a/R/stat-density.r b/R/stat-density.r index c712d921fb..89e488041b 100644 --- a/R/stat-density.r +++ b/R/stat-density.r @@ -15,6 +15,11 @@ #' not line-up, and hence you won't be able to stack density values. #' This parameter only matters if you are displaying multiple densities in #' one plot or if you are manually adjusting the scale limits. +#' @param bounds Known lower and upper bounds for estimated data. Default +#' `c(-Inf, Inf)` means that there are no (finite) bounds. If any bound is +#' finite, boundary effect of default density estimation will be corrected by +#' reflecting tails outside `bounds` around their closest edge. Data points +#' outside of bounds are removed with a warning. #' @section Computed variables: #' \describe{ #' \item{density}{density estimate} @@ -36,6 +41,7 @@ stat_density <- function(mapping = NULL, data = NULL, n = 512, trim = FALSE, na.rm = FALSE, + bounds = c(-Inf, Inf), orientation = NA, show.legend = NA, inherit.aes = TRUE) { @@ -55,6 +61,7 @@ stat_density <- function(mapping = NULL, data = NULL, n = n, trim = trim, na.rm = na.rm, + bounds = bounds, orientation = orientation, ... ) @@ -70,6 +77,8 @@ StatDensity <- ggproto("StatDensity", Stat, default_aes = aes(x = after_stat(density), y = after_stat(density), fill = NA, weight = NULL), + dropped_aes = "weight", + setup_params = function(self, data, params) { params$flipped_aes <- has_flipped_aes(data, params, main_is_orthogonal = FALSE, main_is_continuous = TRUE) @@ -85,7 +94,8 @@ StatDensity <- ggproto("StatDensity", Stat, extra_params = c("na.rm", "orientation"), compute_group = function(data, scales, bw = "nrd0", adjust = 1, kernel = "gaussian", - n = 512, trim = FALSE, na.rm = FALSE, flipped_aes = FALSE) { + n = 512, trim = FALSE, na.rm = FALSE, bounds = c(-Inf, Inf), + flipped_aes = FALSE) { data <- flip_data(data, flipped_aes) if (trim) { range <- range(data$x, na.rm = TRUE) @@ -94,7 +104,8 @@ StatDensity <- ggproto("StatDensity", Stat, } density <- compute_density(data$x, data$weight, from = range[1], - to = range[2], bw = bw, adjust = adjust, kernel = kernel, n = n) + to = range[2], bw = bw, adjust = adjust, kernel = kernel, n = n, + bounds = bounds) density$flipped_aes <- flipped_aes flip_data(density, flipped_aes) } @@ -102,7 +113,8 @@ StatDensity <- ggproto("StatDensity", Stat, ) compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1, - kernel = "gaussian", n = 512) { + kernel = "gaussian", n = 512, + bounds = c(-Inf, Inf)) { nx <- length(x) if (is.null(w)) { w <- rep(1 / nx, nx) @@ -110,6 +122,12 @@ compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1, w <- w / sum(w) } + # Adjust data points and weights to all fit inside bounds + sample_data <- fit_data_to_bounds(bounds, x, w) + x <- sample_data$x + w <- sample_data$w + nx <- length(x) + # if less than 2 points return data frame of NAs and a warning if (nx < 2) { cli::cli_warn("Groups with fewer than two data points have been dropped.") @@ -124,8 +142,16 @@ compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1, )) } - dens <- stats::density(x, weights = w, bw = bw, adjust = adjust, - kernel = kernel, n = n, from = from, to = to) + # Decide whether to use boundary correction + if (any(is.finite(bounds))) { + dens <- stats::density(x, weights = w, bw = bw, adjust = adjust, + kernel = kernel, n = n) + + dens <- reflect_density(dens = dens, bounds = bounds, from = from, to = to) + } else { + dens <- stats::density(x, weights = w, bw = bw, adjust = adjust, + kernel = kernel, n = n, from = from, to = to) + } data_frame0( x = dens$x, @@ -137,3 +163,57 @@ compute_density <- function(x, w, from, to, bw = "nrd0", adjust = 1, .size = length(dens$x) ) } + +# Check if all data points are inside bounds. If not, warn and remove them. +fit_data_to_bounds <- function(bounds, x, w) { + is_inside_bounds <- (bounds[1] <= x) & (x <= bounds[2]) + + if (any(!is_inside_bounds)) { + cli::cli_warn("Some data points are outside of `bounds`. Removing them.") + x <- x[is_inside_bounds] + w <- w[is_inside_bounds] + w_sum <- sum(w) + if (w_sum > 0) { + w <- w / w_sum + } + } + + return(list(x = x, w = w)) +} + +# Update density estimation to mitigate boundary effect at known `bounds`: +# - All x values will lie inside `bounds`. +# - All y-values will be updated to have total probability of `bounds` be +# closer to 1. This is done by reflecting tails outside of `bounds` around +# their closest edge. This leads to those tails lie inside of `bounds` +# (completely, if they are not wider than `bounds` itself, which is a common +# situation) and correct boundary effect of default density estimation. +# +# `dens` - output of `stats::density`. +# `bounds` - two-element vector with left and right known (user supplied) +# bounds of x values. +# `from`, `to` - numbers used as corresponding arguments of `stats::density()` +# in case of no boundary correction. +reflect_density <- function(dens, bounds, from, to) { + # No adjustment is needed if no finite bounds are supplied + if (all(is.infinite(bounds))) { + return(dens) + } + + # Estimate linearly with zero tails (crucial to account for infinite bound) + f_dens <- stats::approxfun( + x = dens$x, y = dens$y, method = "linear", yleft = 0, yright = 0 + ) + + # Create a uniform x-grid inside `bounds` + left <- max(from, bounds[1]) + right <- min(to, bounds[2]) + out_x <- seq(from = left, to = right, length.out = length(dens$x)) + + # Update density estimation by adding reflected tails from outside `bounds` + left_reflection <- f_dens(bounds[1] + (bounds[1] - out_x)) + right_reflection <- f_dens(bounds[2] + (bounds[2] - out_x)) + out_y <- f_dens(out_x) + left_reflection + right_reflection + + list(x = out_x, y = out_y) +} diff --git a/man/geom_density.Rd b/man/geom_density.Rd index 02fdc43627..91209356f5 100644 --- a/man/geom_density.Rd +++ b/man/geom_density.Rd @@ -30,6 +30,7 @@ stat_density( n = 512, trim = FALSE, na.rm = FALSE, + bounds = c(-Inf, Inf), orientation = NA, show.legend = NA, inherit.aes = TRUE @@ -113,6 +114,12 @@ range of that group: this typically means the estimated x values will not line-up, and hence you won't be able to stack density values. This parameter only matters if you are displaying multiple densities in one plot or if you are manually adjusting the scale limits.} + +\item{bounds}{Known lower and upper bounds for estimated data. Default +\code{c(-Inf, Inf)} means that there are no (finite) bounds. If any bound is +finite, boundary effect of default density estimation will be corrected by +reflecting tails outside \code{bounds} around their closest edge. Data points +outside of bounds are removed with a warning.} } \description{ Computes and draws kernel density estimate, which is a smoothed version of @@ -173,6 +180,12 @@ ggplot(diamonds, aes(depth, fill = cut, colour = cut)) + geom_density(alpha = 0.1) + xlim(55, 70) +# Use `bounds` to adjust computation for known data limits +big_diamonds <- diamonds[diamonds$carat >= 1, ] +ggplot(big_diamonds, aes(carat)) + + geom_density(color = 'red') + + geom_density(bounds = c(1, Inf), color = 'blue') + \donttest{ # Stacked density plots: if you want to create a stacked density plot, you # probably want to 'count' (density * n) variable instead of the default diff --git a/tests/testthat/test-stat-density.R b/tests/testthat/test-stat-density.R index a2cde72e5d..ca494e0788 100644 --- a/tests/testthat/test-stat-density.R +++ b/tests/testthat/test-stat-density.R @@ -1,3 +1,94 @@ +test_that("stat_density actually computes density", { + # Compare functon approximations because outputs from `ggplot()` and + # `density()` give grids spanning different ranges + dens <- stats::density(mtcars$mpg) + expected_density_fun <- stats::approxfun(data.frame(x = dens$x, y = dens$y)) + + plot <- ggplot(mtcars, aes(mpg)) + stat_density() + actual_density_fun <- stats::approxfun(layer_data(plot)[, c("x", "y")]) + + test_sample <- unique(mtcars$mpg) + expect_equal( + expected_density_fun(test_sample), + actual_density_fun(test_sample), + tolerance = 1e-3 + ) +}) + +test_that("stat_density can make weighted density estimation", { + df <- mtcars + df$weight <- mtcars$cyl / sum(mtcars$cyl) + + dens <- stats::density(df$mpg, weights = df$weight) + expected_density_fun <- stats::approxfun(data.frame(x = dens$x, y = dens$y)) + + plot <- ggplot(df, aes(mpg, weight = weight)) + stat_density() + actual_density_fun <- stats::approxfun(layer_data(plot)[, c("x", "y")]) + + test_sample <- unique(df$mpg) + expect_equal( + expected_density_fun(test_sample), + actual_density_fun(test_sample), + tolerance = 1e-3 + ) +}) + +test_that("stat_density uses `bounds`", { + mpg_min <- min(mtcars$mpg) + mpg_max <- max(mtcars$mpg) + + expect_bounds <- function(bounds) { + dens <- stats::density(mtcars$mpg) + orig_density <- stats::approxfun( + data.frame(x = dens$x, y = dens$y), + yleft = 0, + yright = 0 + ) + + bounded_plot <- ggplot(mtcars, aes(mpg)) + stat_density(bounds = bounds) + bounded_data <- layer_data(bounded_plot)[, c("x", "y")] + plot_density <- stats::approxfun(bounded_data, yleft = 0, yright = 0) + + test_sample <- seq(mpg_min, mpg_max, by = 0.1) + left_reflection <- orig_density(bounds[1] + (bounds[1] - test_sample)) + right_reflection <- orig_density(bounds[2] + (bounds[2] - test_sample)) + + # Plot density should be an original plus added reflection at both `bounds` + # (reflection around infinity is zero) + expect_equal( + orig_density(test_sample) + left_reflection + right_reflection, + plot_density(test_sample), + tolerance = 1e-4 + ) + } + + expect_bounds(c(-Inf, Inf)) + expect_bounds(c(mpg_min, Inf)) + expect_bounds(c(-Inf, mpg_max)) + expect_bounds(c(mpg_min, mpg_max)) +}) + +test_that("stat_density handles data outside of `bounds`", { + cutoff <- mtcars$mpg[1] + + # Both `x` and `weight` should be filtered out for out of `bounds` points + expect_warning( + data_actual <- layer_data( + ggplot(mtcars, aes(mpg, weight = cyl)) + + stat_density(bounds = c(cutoff, Inf)) + ), + "outside of `bounds`" + ) + + mtcars_filtered <- mtcars[mtcars$mpg >= cutoff, ] + data_expected <- layer_data( + ggplot(mtcars_filtered, aes(mpg, weight = cyl)) + + stat_density(bounds = c(cutoff, Inf)) + ) + + expect_equal(data_actual, data_expected) +}) + test_that("compute_density succeeds when variance is zero", { dens <- compute_density(rep(0, 10), NULL, from = 0.5, to = 0.5) expect_equal(dens$n, rep(10, 512))