diff --git a/NEWS.md b/NEWS.md index d0c1cf00..6da65950 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,6 +14,9 @@ * `str_replace()` and `str_replace_all()` now behave correctly when a replacement string contains `$`s, `\\\\1`, etc. (#83, @gagolews). +* `boundary()` now defaults to `skip_word_none = NA`, so punctuation is only + skipped for `type = "word"` and sentence splitting works (#58, @lmullen). + # stringr 1.0.0 * stringr is now powered by [stringi](https://github.com/Rexamine/stringi) diff --git a/R/modifiers.r b/R/modifiers.r index 5e2ee492..ce7648a4 100644 --- a/R/modifiers.r +++ b/R/modifiers.r @@ -115,12 +115,18 @@ regex <- function(pattern, ignore_case = FALSE, multiline = FALSE, #' @param type Boundary type to detect. #' @param skip_word_none Ignore "words" that don't contain any characters -#' or numbers - i.e. punctuation. +#' or numbers - i.e. punctuation. Default \code{NA} will skip such "words" +#' only when splitting on \code{word} boundaries. #' @export #' @rdname modifiers boundary <- function(type = c("character", "line_break", "sentence", "word"), - skip_word_none = TRUE, ...) { + skip_word_none = NA, ...) { type <- match.arg(type) + + if (identical(skip_word_none, NA)) { + skip_word_none <- type == "word" + } + options <- stri_opts_brkiter( type = type, skip_word_none = skip_word_none, diff --git a/tests/testthat/test-split.r b/tests/testthat/test-split.r index 942bf9e0..09f5f017 100644 --- a/tests/testthat/test-split.r +++ b/tests/testthat/test-split.r @@ -67,3 +67,16 @@ test_that("n sets exact number of splits in str_split_fixed", { equals(c("Subject", "Roger: his drinking problems"))) }) + +test_that("str_split can split sentences correctly", { + test <- "This is a sentence. Is this a sentence? Why, yes it is." + + expect_that( + length(str_split(test, boundary("sentence"))[[1]]), + equals(3)) + expect_that( + str_split(test, boundary("sentence")), + equals(list(c("This is a sentence. ", "Is this a sentence? ", + "Why, yes it is.")))) + +})