tidyverse · hadley · Jan 20, 2022 · Jan 18, 2022
diff --git a/NAMESPACE b/NAMESPACE
@@ -60,6 +60,7 @@ export(str_unique)
 export(str_view)
 export(str_view_all)
 export(str_which)
+export(str_width)
 export(str_wrap)
 export(word)
 import(rlang)

diff --git a/NEWS.md b/NEWS.md
@@ -49,6 +49,8 @@
 * New `str_unique()` is a wrapper around `stri_unique()` and returns unique 
   string values in a character vector (#249, @seasmith).
 
+* New `str_width()` returns the display width of a string (#380).
+
 * stringr is now licensed as MIT (#351).
 
 ## Minor improvements and bug fixes
@@ -60,7 +62,7 @@
 * `str_flatten()` gains a `last` argument that optionally override the
   final separator (#377).
 
-* `str_pad()` gains `use_length` argument to control whether to use the total code
+* `str_pad()` gains a `use_width` argument to control whether to use the total code
   point width or the number of code points as "width" of a string (#190).
 
 * `str_replace()` and `str_replace_all()` can use standard tidyverse formula

diff --git a/R/length.r b/R/length.r
@@ -1,10 +1,12 @@
-#' The length of a string
+#' The length/width of a string
 #'
-#' Compute the number of characters in a string. Technically, this returns the
-#' number of "code points", in a string. One code point usually corresponds to
-#' one character, but not always. For example, an u with a umlaut might be
-#' represented as a single code point or as "u" code point and an umlaut
-#' code point.
+#' @description
+#' `str_length()` returns the number of codepoints in a string. These are
+#' the individual elements (which are often, but not always, letters) that
+#' can be extracted with [str_sub()].
+#'
+#' `str_width()` returns how much space the string will occupy when printed
+#' in a fixed-width font (i.e. when printed in the console).
 #'
 #' @inheritParams str_detect
 #' @return A numeric vector giving number of characters (code points) in each
@@ -17,18 +19,28 @@
 #' str_length(factor("abc"))
 #' str_length(c("i", "like", "programming", NA))
 #'
-#' # Two ways of representing a u with an umlaut
-#' u1 <- "\u00fc"
-#' u2 <- stringi::stri_trans_nfd(u1)
-#' # The print the same:
-#' u1
-#' u2
-#' # But have a different length
-#' str_length(u1)
-#' str_length(u2)
-#' # Even though they have the same number of characters
-#' str_count(u1)
-#' str_count(u2)
+#' # Some characters, like emoji and Chinese characters (hanzi), are square
+#' # which means they take up the width of two Latin characters
+#' x <- c("\u6c49\u5b57", "\U0001f60a")
+#' str_view(x)
+#' str_width(x)
+#' str_length(x)
+#'
+#' # There are two ways of representing a u with an umlaut
+#' u <- c("\u00fc", "u\u0308")
+#' # They have the same width
+#' str_width(u)
+#' # But a different length
+#' str_length(u)
+#' # Because the second element is made up of a u + an accent
+#' str_sub(u, 1, 1)
 str_length <- function(string) {
   stri_length(string)
 }
+
+#' @export
+#' @rdname str_length
+str_width <- function(string) {
+  stri_width(string)  # thin wrapper: the width computation is delegated to stringi
+}
+
diff --git a/R/pad.r b/R/pad.r
@@ -6,8 +6,8 @@
 #' @param width Minimum width of padded strings.
 #' @param side Side on which padding character is added (left, right or both).
 #' @param pad Single padding character (default is a space).
-#' @param use_length If `TRUE`, use the number of characters instead of the
-#'   total of character widths (see [stringi::stri_width]).
+#' @param use_width If `FALSE`, use the length of the string instead of the
+#'   width; see [str_width()]/[str_length()] for the difference.
 #' @return A character vector.
 #' @seealso [str_trim()] to remove whitespace;
 #'   [str_trunc()] to decrease the maximum width of a string.
@@ -26,13 +26,13 @@
 #'
 #' # Longer strings are returned unchanged
 #' str_pad("hadley", 3)
-str_pad <- function(string, width, side = c("left", "right", "both"), pad = " ", use_length = FALSE) {
+str_pad <- function(string, width, side = c("left", "right", "both"), pad = " ", use_width = TRUE) {  # use_width = TRUE matches the old use_length = FALSE default
   vctrs::vec_size_common(string = string, width = width, pad = pad)
   side <- arg_match(side)
 
   switch(side,
-    left = stri_pad_left(string, width, pad = pad, use_length = use_length),
-    right = stri_pad_right(string, width, pad = pad, use_length = use_length),
-    both = stri_pad_both(string, width, pad = pad, use_length = use_length)
+    left = stri_pad_left(string, width, pad = pad, use_length = !use_width),  # stringi's use_length flag is the inverse of use_width
+    right = stri_pad_right(string, width, pad = pad, use_length = !use_width),
+    both = stri_pad_both(string, width, pad = pad, use_length = !use_width)
   )
 }
diff --git a/man/str_length.Rd b/man/str_length.Rd
diff --git a/man/str_pad.Rd b/man/str_pad.Rd
diff --git a/tests/testthat/test-length.r b/tests/testthat/test-length.r
@@ -15,3 +15,8 @@ test_that("str_length of factor is length of level", {
   expect_equal(str_length(factor("ab")), 2)
   expect_equal(str_length(factor("abc")), 3)
 })
+
+test_that("str_width returns display width", {
+  x <- c("\u0308", "x", "\U0001f60a")  # combining diaeresis, ASCII letter, emoji
+  expect_equal(str_width(x), c(0, 1, 2))  # zero-width, single-width, double-width
+})
diff --git a/tests/testthat/test-pad.r b/tests/testthat/test-pad.r
@@ -22,7 +22,7 @@ test_that("padding based of length works", {
   pad <- function(...) str_pad("\u4e2d", ..., side = "both")
 
   expect_equal(pad(width = 6),                    "  \u4e2d  ")
-  expect_equal(pad(width = 5, use_length = TRUE), "  \u4e2d  ")
+  expect_equal(pad(width = 5, use_width = FALSE), "  \u4e2d  ")
 })
 
 test_that("uses tidyverse recycling rules", {