From 88a4a36f32da611ea95d24626619e9270df50231 Mon Sep 17 00:00:00 2001 From: Hadley Wickham Date: Tue, 18 Jan 2022 14:31:23 -0600 Subject: [PATCH] Implement str_width() And flip str_pad() argument from use_length to use_width so that the default can be positive. Fixes #380 --- NAMESPACE | 1 + NEWS.md | 4 ++- R/length.r | 48 ++++++++++++++++++++++-------------- R/pad.r | 12 ++++----- man/str_length.Rd | 43 ++++++++++++++++++-------------- man/str_pad.Rd | 6 ++--- tests/testthat/test-length.r | 5 ++++ tests/testthat/test-pad.r | 2 +- 8 files changed, 74 insertions(+), 47 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 4c69c5c3..a35b3043 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -60,6 +60,7 @@ export(str_unique) export(str_view) export(str_view_all) export(str_which) +export(str_width) export(str_wrap) export(word) import(rlang) diff --git a/NEWS.md b/NEWS.md index 57828fa1..4f3eb099 100644 --- a/NEWS.md +++ b/NEWS.md @@ -49,6 +49,8 @@ * New `str_unique()` is a wrapper around `stri_unique()` and returns unique string values in a character vector (#249, @seasmith). +* New `str_width()` returns the display width of a string (#380). + * stringr is now licensed as MIT (#351). ## Minor improvements and bug fixes @@ -60,7 +62,7 @@ * `str_flatten()` gains a `last` argument that optionally override the final separator (#377). -* `str_pad()` gains `use_length` argument to control whether to use the total code +* `str_pad()` gains `use_width` argument to control whether to use the total code point width or the number of code points as "width" of a string (#190). * `str_replace()` and `str_replace_all()` can use standard tidyverse formula diff --git a/R/length.r b/R/length.r index fa583e84..c92783b6 100644 --- a/R/length.r +++ b/R/length.r @@ -1,10 +1,12 @@ -#' The length of a string +#' The length/width of a string #' -#' Compute the number of characters in a string. Technically, this returns the -#' number of "code points", in a string. One code point usually corresponds to -#' one character, but not always. For example, an u with a umlaut might be -#' represented as a single code point or as "u" code point and an umlaut -#' code point. +#' @description +#' `str_length()` returns the number of codepoints in a string. These are +#' the individual elements (which are often, but not always letters) that +#' can be extracted with [str_sub()]. +#' +#' `str_width()` returns how much space the string will occupy when printed +#' in a fixed width font (i.e. when printed in the console). #' #' @inheritParams str_detect #' @return A numeric vector giving number of characters (code points) in each @@ -17,18 +19,28 @@ #' str_length(factor("abc")) #' str_length(c("i", "like", "programming", NA)) #' -#' # Two ways of representing a u with an umlaut -#' u1 <- "\u00fc" -#' u2 <- stringi::stri_trans_nfd(u1) -#' # The print the same: -#' u1 -#' u2 -#' # But have a different length -#' str_length(u1) -#' str_length(u2) -#' # Even though they have the same number of characters -#' str_count(u1) -#' str_count(u2) +#' # Some characters, like emoji and Chinese characters (hanzi), are square +#' # which means they take up the width of two Latin characters +#' x <- c("\u6c49\u5b57", "\U0001f60a") +#' str_view(x) +#' str_width(x) +#' str_length(x) +#' +#' # There are two ways of representing a u with an umlaut +#' u <- c("\u00fc", "u\u0308") +#' # They have the same width +#' str_width(u) +#' # But a different length +#' str_length(u) +#' # Because the second element is made up of a u + an accent +#' str_sub(u, 1, 1) str_length <- function(string) { stri_length(string) } + +#' @export +#' @rdname str_length +str_width <- function(string) { + stri_width(string) +} + diff --git a/R/pad.r b/R/pad.r index ceeae046..9db5ed0b 100644 --- a/R/pad.r +++ b/R/pad.r @@ -6,8 +6,8 @@ #' @param width Minimum width of padded strings. #' @param side Side on which padding character is added (left, right or both). #' @param pad Single padding character (default is a space). -#' @param use_length If `TRUE`, use the number of characters instead of the -#' total of character widths (see [stringi::stri_width]). +#' @param use_width If `FALSE`, use the length of the string instead of the +#' width; see [str_width()]/[str_length()] for the difference. #' @return A character vector. #' @seealso [str_trim()] to remove whitespace; #' [str_trunc()] to decrease the maximum width of a string. @@ -26,13 +26,13 @@ #' #' # Longer strings are returned unchanged #' str_pad("hadley", 3) -str_pad <- function(string, width, side = c("left", "right", "both"), pad = " ", use_length = FALSE) { +str_pad <- function(string, width, side = c("left", "right", "both"), pad = " ", use_width = TRUE) { vctrs::vec_size_common(string = string, width = width, pad = pad) side <- arg_match(side) switch(side, - left = stri_pad_left(string, width, pad = pad, use_length = use_length), - right = stri_pad_right(string, width, pad = pad, use_length = use_length), - both = stri_pad_both(string, width, pad = pad, use_length = use_length) + left = stri_pad_left(string, width, pad = pad, use_length = !use_width), + right = stri_pad_right(string, width, pad = pad, use_length = !use_width), + both = stri_pad_both(string, width, pad = pad, use_length = !use_width) ) } diff --git a/man/str_length.Rd b/man/str_length.Rd index 7f1da68c..34ed4e2f 100644 --- a/man/str_length.Rd +++ b/man/str_length.Rd @@ -2,9 +2,12 @@ % Please edit documentation in R/length.r \name{str_length} \alias{str_length} -\title{The length of a string} +\alias{str_width} +\title{The length/width of a string} \usage{ str_length(string) + +str_width(string) } \arguments{ \item{string}{Input vector. Either a character vector, or something @@ -15,11 +18,12 @@ A numeric vector giving number of characters (code points) in each element of the character vector. Missing string have missing length. } \description{ -Compute the number of characters in a string. Technically, this returns the -number of "code points", in a string. One code point usually corresponds to -one character, but not always. For example, an u with a umlaut might be -represented as a single code point or as "u" code point and an umlaut -code point. +\code{str_length()} returns the number of codepoints in a string. These are +the individual elements (which are often, but not always letters) that +can be extracted with \code{\link[=str_sub]{str_sub()}}. + +\code{str_width()} returns how much space the string will occupy when printed +in a fixed width font (i.e. when printed in the console). } \examples{ str_length(letters) @@ -27,18 +31,21 @@ str_length(NA) str_length(factor("abc")) str_length(c("i", "like", "programming", NA)) -# Two ways of representing a u with an umlaut -u1 <- "\u00fc" -u2 <- stringi::stri_trans_nfd(u1) -# The print the same: -u1 -u2 -# But have a different length -str_length(u1) -str_length(u2) -# Even though they have the same number of characters -str_count(u1) -str_count(u2) +# Some characters, like emoji and Chinese characters (hanzi), are square +# which means they take up the width of two Latin characters +x <- c("\u6c49\u5b57", "\U0001f60a") +str_view(x) +str_width(x) +str_length(x) + +# There are two ways of representing a u with an umlaut +u <- c("\u00fc", "u\u0308") +# They have the same width +str_width(u) +# But a different length +str_length(u) +# Because the second element is made up of a u + an accent +str_sub(u, 1, 1) } \seealso{ \code{\link[stringi:stri_length]{stringi::stri_length()}} which this function wraps. diff --git a/man/str_pad.Rd b/man/str_pad.Rd index 19cbcbdb..72d7dcbf 100644 --- a/man/str_pad.Rd +++ b/man/str_pad.Rd @@ -9,7 +9,7 @@ str_pad( width, side = c("left", "right", "both"), pad = " ", - use_length = FALSE + use_width = TRUE ) } \arguments{ @@ -21,8 +21,8 @@ str_pad( \item{pad}{Single padding character (default is a space).} -\item{use_length}{If \code{TRUE}, use the number of characters instead of the -total of character widths (see \link[stringi:stri_width]{stringi::stri_width}).} +\item{use_width}{If \code{FALSE}, use the length of the string instead of the +width; see \code{\link[=str_width]{str_width()}}/\code{\link[=str_length]{str_length()}} for the difference.} } \value{ A character vector. diff --git a/tests/testthat/test-length.r b/tests/testthat/test-length.r index 2a932a3f..0da42b13 100644 --- a/tests/testthat/test-length.r +++ b/tests/testthat/test-length.r @@ -15,3 +15,8 @@ test_that("str_length of factor is length of level", { expect_equal(str_length(factor("ab")), 2) expect_equal(str_length(factor("abc")), 3) }) + +test_that("str_width returns display width", { + x <- c("\u0308", "x", "\U0001f60a") + expect_equal(str_width(x), c(0, 1, 2)) +}) diff --git a/tests/testthat/test-pad.r b/tests/testthat/test-pad.r index e0af5b09..098845e3 100644 --- a/tests/testthat/test-pad.r +++ b/tests/testthat/test-pad.r @@ -22,7 +22,7 @@ test_that("padding based of length works", { pad <- function(...) str_pad("\u4e2d", ..., side = "both") expect_equal(pad(width = 6), " \u4e2d ") - expect_equal(pad(width = 5, use_length = TRUE), " \u4e2d ") + expect_equal(pad(width = 5, use_width = FALSE), " \u4e2d ") }) test_that("uses tidyverse recycling rules", {