diff --git a/NEWS.md b/NEWS.md index d0c1cf00..7a71f99e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -13,6 +13,9 @@ * `str_replace()` and `str_replace_all()` now behave correctly when a replacement string contains `$`s, `\\\\1`, etc. (#83, @gagolews). + +* `boundary()` now defaults to `skip_word_none = NA`, which skips punctuation + only for `type = "word"`, so sentence splitting works (#58, @lmullen). # stringr 1.0.0 diff --git a/R/modifiers.r b/R/modifiers.r index 5e2ee492..ce7648a4 100644 --- a/R/modifiers.r +++ b/R/modifiers.r @@ -115,12 +115,18 @@ regex <- function(pattern, ignore_case = FALSE, multiline = FALSE, #' @param type Boundary type to detect. #' @param skip_word_none Ignore "words" that don't contain any characters -#' or numbers - i.e. punctuation. +#' or numbers - i.e. punctuation. Default \code{NA} will skip such "words" +#' only when splitting on \code{word} boundaries. #' @export #' @rdname modifiers boundary <- function(type = c("character", "line_break", "sentence", "word"), - skip_word_none = TRUE, ...) { + skip_word_none = NA, ...) { type <- match.arg(type) + + if (identical(skip_word_none, NA)) { + skip_word_none <- type == "word" + } + options <- stri_opts_brkiter( type = type, skip_word_none = skip_word_none, diff --git a/man/modifiers.Rd b/man/modifiers.Rd index 1b27dd1f..732165fa 100644 --- a/man/modifiers.Rd +++ b/man/modifiers.Rd @@ -16,7 +16,7 @@ regex(pattern, ignore_case = FALSE, multiline = FALSE, comments = FALSE, dotall = FALSE, ...) boundary(type = c("character", "line_break", "sentence", "word"), - skip_word_none = TRUE, ...) + skip_word_none = NA, ...) } \arguments{ \item{pattern}{Pattern to modify behaviour.} @@ -43,7 +43,8 @@ default, only match the start and end of the input.} \item{type}{Boundary type to detect.} \item{skip_word_none}{Ignore "words" that don't contain any characters -or numbers - i.e. punctuation.} +or numbers - i.e. punctuation. 
Default \code{NA} will skip such "words" +only when splitting on \code{word} boundaries.} } \description{ \describe{ diff --git a/tests/testthat/test-split.r b/tests/testthat/test-split.r index 942bf9e0..33c070ea 100644 --- a/tests/testthat/test-split.r +++ b/tests/testthat/test-split.r @@ -67,3 +67,12 @@ test_that("n sets exact number of splits in str_split_fixed", { equals(c("Subject", "Roger: his drinking problems"))) }) + +test_that("str_split can split sentences correctly", { + test <- "This is a sentence. Is this a sentence? Why, yes it is." + expect_that(length(str_split(test, boundary("sentence"))[[1]]), + equals(3)) + expect_that(str_split(test, boundary("sentence")), + equals(list(c("This is a sentence. ", "Is this a sentence? ", + "Why, yes it is.")))) +})