From 88a4a36f32da611ea95d24626619e9270df50231 Mon Sep 17 00:00:00 2001
From: Hadley Wickham <h.wickham@gmail.com>
Date: Tue, 18 Jan 2022 14:31:23 -0600
Subject: [PATCH] Implement str_width()

And flip str_pad() argument from use_length to use_width so that the default can be positive.

Fixes #380
---
 NAMESPACE                    |  1 +
 NEWS.md                      |  4 ++-
 R/length.r                   | 48 ++++++++++++++++++++++--------------
 R/pad.r                      | 12 ++++-----
 man/str_length.Rd            | 43 ++++++++++++++++++--------------
 man/str_pad.Rd               |  6 ++---
 tests/testthat/test-length.r |  5 ++++
 tests/testthat/test-pad.r    |  2 +-
 8 files changed, 74 insertions(+), 47 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 4c69c5c3..a35b3043 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -60,6 +60,7 @@ export(str_unique)
 export(str_view)
 export(str_view_all)
 export(str_which)
+export(str_width)
 export(str_wrap)
 export(word)
 import(rlang)
diff --git a/NEWS.md b/NEWS.md
index 57828fa1..4f3eb099 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -49,6 +49,8 @@
 * New `str_unique()` is a wrapper around `stri_unique()` and returns unique 
   string values in a character vector (#249, @seasmith).
 
+* New `str_width()` returns the display width of a string (#380).
+
 * stringr is now licensed as MIT (#351).
 
 ## Minor improvements and bug fixes
@@ -60,7 +62,7 @@
 * `str_flatten()` gains a `last` argument that optionally override the
   final separator (#377).
 
-* `str_pad()` gains `use_length` argument to control whether to use the total code
+* `str_pad()` gains a `use_width` argument to control whether to use the total code
   point width or the number of code points as "width" of a string (#190).
 
 * `str_replace()` and `str_replace_all()` can use standard tidyverse formula
diff --git a/R/length.r b/R/length.r
index fa583e84..c92783b6 100644
--- a/R/length.r
+++ b/R/length.r
@@ -1,10 +1,12 @@
-#' The length of a string
+#' The length/width of a string
 #'
-#' Compute the number of characters in a string. Technically, this returns the
-#' number of "code points", in a string. One code point usually corresponds to
-#' one character, but not always. For example, an u with a umlaut might be
-#' represented as a single code point or as "u" code point and an umlaut
-#' code point.
+#' @description
+#' `str_length()` returns the number of codepoints in a string. These are
+#' the individual elements (which are often, but not always, letters) that
+#' can be extracted with [str_sub()].
+#'
+#' `str_width()` returns how much space the string will occupy when printed
+#' in a fixed-width font (i.e. when printed in the console).
 #'
 #' @inheritParams str_detect
 #' @return A numeric vector giving number of characters (code points) in each
@@ -17,18 +19,28 @@
 #' str_length(factor("abc"))
 #' str_length(c("i", "like", "programming", NA))
 #'
-#' # Two ways of representing a u with an umlaut
-#' u1 <- "\u00fc"
-#' u2 <- stringi::stri_trans_nfd(u1)
-#' # The print the same:
-#' u1
-#' u2
-#' # But have a different length
-#' str_length(u1)
-#' str_length(u2)
-#' # Even though they have the same number of characters
-#' str_count(u1)
-#' str_count(u2)
+#' # Some characters, like emoji and Chinese characters (hanzi), are square
+#' # which means they take up the width of two Latin characters
+#' x <- c("\u6c49\u5b57", "\U0001f60a")
+#' str_view(x)
+#' str_width(x)
+#' str_length(x)
+#'
+#' # There are two ways of representing a u with an umlaut
+#' u <- c("\u00fc", "u\u0308")
+#' # They have the same width
+#' str_width(u)
+#' # But a different length
+#' str_length(u)
+#' # Because the second element is made up of a u + an accent
+#' str_sub(u, 1, 1)
 str_length <- function(string) {
   stri_length(string)
 }
+
+#' @export
+#' @rdname str_length
+str_width <- function(string) {
+  stri_width(string)
+}
+
diff --git a/R/pad.r b/R/pad.r
index ceeae046..9db5ed0b 100644
--- a/R/pad.r
+++ b/R/pad.r
@@ -6,8 +6,8 @@
 #' @param width Minimum width of padded strings.
 #' @param side Side on which padding character is added (left, right or both).
 #' @param pad Single padding character (default is a space).
-#' @param use_length If `TRUE`, use the number of characters instead of the
-#'   total of character widths (see [stringi::stri_width]).
+#' @param use_width If `FALSE`, use the length of the string instead of the
+#'   width; see [str_width()]/[str_length()] for the difference.
 #' @return A character vector.
 #' @seealso [str_trim()] to remove whitespace;
 #'   [str_trunc()] to decrease the maximum width of a string.
@@ -26,13 +26,13 @@
 #'
 #' # Longer strings are returned unchanged
 #' str_pad("hadley", 3)
-str_pad <- function(string, width, side = c("left", "right", "both"), pad = " ", use_length = FALSE) {
+str_pad <- function(string, width, side = c("left", "right", "both"), pad = " ", use_width = TRUE) {
   vctrs::vec_size_common(string = string, width = width, pad = pad)
   side <- arg_match(side)
 
   switch(side,
-    left = stri_pad_left(string, width, pad = pad, use_length = use_length),
-    right = stri_pad_right(string, width, pad = pad, use_length = use_length),
-    both = stri_pad_both(string, width, pad = pad, use_length = use_length)
+    left = stri_pad_left(string, width, pad = pad, use_length = !use_width),
+    right = stri_pad_right(string, width, pad = pad, use_length = !use_width),
+    both = stri_pad_both(string, width, pad = pad, use_length = !use_width)
   )
 }
diff --git a/man/str_length.Rd b/man/str_length.Rd
index 7f1da68c..34ed4e2f 100644
--- a/man/str_length.Rd
+++ b/man/str_length.Rd
@@ -2,9 +2,12 @@
 % Please edit documentation in R/length.r
 \name{str_length}
 \alias{str_length}
-\title{The length of a string}
+\alias{str_width}
+\title{The length/width of a string}
 \usage{
 str_length(string)
+
+str_width(string)
 }
 \arguments{
 \item{string}{Input vector. Either a character vector, or something
@@ -15,11 +18,12 @@ A numeric vector giving number of characters (code points) in each
 element of the character vector. Missing string have missing length.
 }
 \description{
-Compute the number of characters in a string. Technically, this returns the
-number of "code points", in a string. One code point usually corresponds to
-one character, but not always. For example, an u with a umlaut might be
-represented as a single code point or as "u" code point and an umlaut
-code point.
+\code{str_length()} returns the number of codepoints in a string. These are
+the individual elements (which are often, but not always, letters) that
+can be extracted with \code{\link[=str_sub]{str_sub()}}.
+
+\code{str_width()} returns how much space the string will occupy when printed
+in a fixed-width font (i.e. when printed in the console).
 }
 \examples{
 str_length(letters)
@@ -27,18 +31,21 @@ str_length(NA)
 str_length(factor("abc"))
 str_length(c("i", "like", "programming", NA))
 
-# Two ways of representing a u with an umlaut
-u1 <- "\u00fc"
-u2 <- stringi::stri_trans_nfd(u1)
-# The print the same:
-u1
-u2
-# But have a different length
-str_length(u1)
-str_length(u2)
-# Even though they have the same number of characters
-str_count(u1)
-str_count(u2)
+# Some characters, like emoji and Chinese characters (hanzi), are square
+# which means they take up the width of two Latin characters
+x <- c("\u6c49\u5b57", "\U0001f60a")
+str_view(x)
+str_width(x)
+str_length(x)
+
+# There are two ways of representing a u with an umlaut
+u <- c("\u00fc", "u\u0308")
+# They have the same width
+str_width(u)
+# But a different length
+str_length(u)
+# Because the second element is made up of a u + an accent
+str_sub(u, 1, 1)
 }
 \seealso{
 \code{\link[stringi:stri_length]{stringi::stri_length()}} which this function wraps.
diff --git a/man/str_pad.Rd b/man/str_pad.Rd
index 19cbcbdb..72d7dcbf 100644
--- a/man/str_pad.Rd
+++ b/man/str_pad.Rd
@@ -9,7 +9,7 @@ str_pad(
   width,
   side = c("left", "right", "both"),
   pad = " ",
-  use_length = FALSE
+  use_width = TRUE
 )
 }
 \arguments{
@@ -21,8 +21,8 @@ str_pad(
 
 \item{pad}{Single padding character (default is a space).}
 
-\item{use_length}{If \code{TRUE}, use the number of characters instead of the
-total of character widths (see \link[stringi:stri_width]{stringi::stri_width}).}
+\item{use_width}{If \code{FALSE}, use the length of the string instead of the
+width; see \code{\link[=str_width]{str_width()}}/\code{\link[=str_length]{str_length()}} for the difference.}
 }
 \value{
 A character vector.
diff --git a/tests/testthat/test-length.r b/tests/testthat/test-length.r
index 2a932a3f..0da42b13 100644
--- a/tests/testthat/test-length.r
+++ b/tests/testthat/test-length.r
@@ -15,3 +15,8 @@ test_that("str_length of factor is length of level", {
   expect_equal(str_length(factor("ab")), 2)
   expect_equal(str_length(factor("abc")), 3)
 })
+
+test_that("str_width returns display width", {
+  x <- c("\u0308", "x", "\U0001f60a")
+  expect_equal(str_width(x), c(0, 1, 2))
+})
diff --git a/tests/testthat/test-pad.r b/tests/testthat/test-pad.r
index e0af5b09..098845e3 100644
--- a/tests/testthat/test-pad.r
+++ b/tests/testthat/test-pad.r
@@ -22,7 +22,7 @@ test_that("padding based of length works", {
   pad <- function(...) str_pad("\u4e2d", ..., side = "both")
 
   expect_equal(pad(width = 6),                    "  \u4e2d  ")
-  expect_equal(pad(width = 5, use_length = TRUE), "  \u4e2d  ")
+  expect_equal(pad(width = 5, use_width = FALSE), "  \u4e2d  ")
 })
 
 test_that("uses tidyverse recycling rules", {