tidymodels · topepo · Jun 25, 2021 · Mar 20, 2021 · Mar 22, 2021 · Mar 22, 2021
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -19,3 +19,4 @@ derby.log
 ^README\.html$
 ^codecov\.yml$
 ^LICENSE\.md$
+^man-roxygen$
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -65,6 +65,8 @@ jobs:
         run: |
           pak::local_system_requirements(execute = TRUE)
           pak::pkg_system_requirements("rcmdcheck", execute = TRUE)
+          pak::pkg_system_requirements("textshaping", execute = TRUE)
+          pak::pkg_system_requirements("gert", execute = TRUE)
         shell: Rscript {0}
 
       - name: Install dependencies

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -55,6 +55,8 @@ Suggests:
     modeldata,
     LiblineaR,
     Matrix,
-    mgcv
-Remotes:
+    mgcv,
+    dials (>= 0.0.9.9000)
+Remotes:  
+    tidymodels/dials,
     topepo/C5.0
diff --git a/NAMESPACE b/NAMESPACE
@@ -138,6 +138,7 @@ export(control_parsnip)
 export(convert_stan_interval)
 export(decision_tree)
 export(eval_args)
+export(find_engine_files)
 export(fit)
 export(fit.model_spec)
 export(fit_control)
@@ -158,6 +159,8 @@ export(linear_reg)
 export(logistic_reg)
 export(make_call)
 export(make_classes)
+export(make_engine_list)
+export(make_seealso_list)
 export(mars)
 export(maybe_data_frame)
 export(maybe_matrix)
@@ -230,6 +233,7 @@ export(update_main_parameters)
 export(varying)
 export(varying_args)
 export(xgb_train)
+importFrom(dplyr,"%>%")
 importFrom(dplyr,arrange)
 importFrom(dplyr,as_tibble)
 importFrom(dplyr,bind_cols)

diff --git a/NEWS.md b/NEWS.md
@@ -33,7 +33,11 @@
 
 * `set_mode()` now checks if `mode` is compatible with the model class, similar to `new_model_spec()` (@jtlandis, #467). Both `set_mode()` and `set_engine()` now error for `NULL` or missing arguments (#503).
 
-* Re-organized model documentation for `update` methods (#479).
+* Re-organized model documentation:
+
+   * `update` methods were moved out of the model help files (#479).
+   * Each model/engine combination has its own help page. 
+   * The model help page has a dynamic bulleted list of the engines with links to the individual help pages. 
 
 * `generics::required_pkgs()` was extended for `parsnip` objects. 
 

diff --git a/R/aaa_models.R b/R/aaa_models.R
@@ -65,7 +65,7 @@ get_model_env <- function() {
 #' @export
 get_from_env <- function(items) {
   mod_env <- get_model_env()
-  rlang::env_get(mod_env, items)
+  rlang::env_get(mod_env, items, default = NULL) # NULL, not an error, when `items` is unbound
 }
 
 #' @rdname get_model_env
@@ -497,6 +497,7 @@ set_model_mode <- function(model, mode) {
 
 #' @rdname set_new_model
 #' @keywords internal
+#' @importFrom dplyr %>%
 #' @export
 set_model_engine <- function(model, mode, eng) {
   check_model_exists(model)
@@ -951,3 +952,151 @@ get_encoding <- function(model) {
   }
   res
 }
+
+#' Tools for dynamically documenting packages
+#'
+#' @description
+#' These are functions used to create dynamic documentation in Rd files
+#' based on which parsnip-related packages are loaded by the user.
+#'
+#' These functions can be used to make dynamic lists of documentation help
+#'  files. \pkg{parsnip} uses these, along with files contained in `man/rmd`
+#'  containing expanded documentation, for specific model/engine combinations.
+#'  [find_engine_files()] looks for files that have the pattern
+#'  `details_{model}_{engine}.Rd` to link to. These files are generated by files
+#'  named `man/rmd/{model}_{engine}.Rmd`. `make_engine_list()` creates a
+#'  list seen at the top of the model Rd files while `make_seealso_list()`
+#'  populates the list seen in "See Also" below. See the details section.
+#'
+#' @param mod A character string for the model file (e.g. "linear_reg")
+#' @return
+#' `make_engine_list()` returns a character string that creates a
+#' bulleted list of links to more specific help files.
+#'
+#' `make_seealso_list()` returns a formatted character string of links.
+#'
+#' `find_engine_files()` returns a tibble.
+#' @details
+#' The \pkg{parsnip} documentation is generated _dynamically_. Part of the Rd
+#'  file populates a list of engines that depends on what packages are loaded
+#'  *at the time that the man file is loaded*. For example, if
+#'  another package has a new engine for `linear_reg()`, the
+#'  `parsnip::linear_reg()` help can show a link to a detailed help page in the
+#'  other package.
+#'
+#' To enable this, the process for a package developer is to:
+#'
+#'   1. Create an engine-specific R file in the `R` directory with the name
+#'  `{model}_{engine}.R` (e.g. `boost_tree_C5.0.R`). This has a small amount of
+#'  documentation, as well as the directive
+#'  "`@includeRmd man/rmd/{model}_{engine}.Rmd details`".
+#'
+#'   1. Copy the file in \pkg{parsnip} that is in `man/rmd/setup.Rmd` and put
+#'  it in the same place in your package.
+#'
+#'   1. Write your own `man/rmd/{model}_{engine}.Rmd` file. This can include
+#'  packages that are not listed in the DESCRIPTION file. Those are only
+#'  required when the documentation file is created locally (probably using
+#'  [devtools::document()]).
+#'
+#'   1. Run [devtools::document()] so that the Rmd content is included in the
+#'  Rd file.
+#'
+#' The examples in \pkg{parsnip} can provide guidance for how to organize
+#' technical information about the models.
+#' @name doc-tools
+#' @keywords internal
+#' @export
+#' @examples
+#' find_engine_files("linear_reg")
+#' cat(make_engine_list("linear_reg"))
+find_engine_files <- function(mod) {
+
+  # Get available topics
+  topic_names <- search_for_engine_docs(mod)
+  if (length(topic_names) == 0) {
+    # Honor the documented return type (a tibble) even when nothing is found
+    # so that callers can use `$engine`, `$topic`, and `$default` safely.
+    return(
+      tibble::tibble(
+        engine = character(0),
+        topic = character(0),
+        .order = integer(0),
+        default = character(0)
+      )
+    )
+  }
+
+  # Subset for our model function. The engine is whatever follows the
+  # `details_{model}_` prefix, which keeps engine names that contain
+  # underscores intact. Topics that do not start with the prefix fall back to
+  # the last underscore-separated token.
+  prefix <- paste0("details_", mod, "_")
+  last_token <- purrr::map_chr(strsplit(topic_names, "_"), ~ .x[length(.x)])
+  eng <- ifelse(
+    startsWith(topic_names, prefix),
+    substring(topic_names, nchar(prefix) + 1),
+    last_token
+  )
+  eng <- tibble::tibble(engine = eng, topic = topic_names)
+
+  # Combine them to keep the order in which they were registered. Engines with
+  # a help topic but no registration get `.order = NA` and are sorted last.
+  all_eng <- get_from_env(mod) %>% dplyr::distinct(engine)
+  all_eng$.order <- seq_len(nrow(all_eng))
+  eng <- dplyr::left_join(eng, all_eng, by = "engine")
+  eng <- eng[order(eng$.order), ]
+
+  # Determine and label default engine. `%in%` always yields one value per
+  # row, even if no default engine can be determined.
+  default <- get_default_engine(mod)
+  eng$default <- ifelse(eng$engine %in% default, " (default)", "")
+
+  eng
+}
+
+#' @export
+#' @rdname doc-tools
+make_engine_list <- function(mod) {
+  eng <- find_engine_files(mod)
+
+  # Nothing to list when no engine help topics were found; `NROW()` is zero
+  # for both an empty vector and a zero-row data frame.
+  if (NROW(eng) == 0) {
+    return("No engines were found for this model.\n\n")
+  }
+
+  # One Rd `\item` per engine, linking to its `details_{model}_{engine}` topic.
+  # `|` is used as the glue delimiter because the Rd markup is full of braces.
+  res <-
+    glue::glue("  \\item \\code{\\link[=|eng$topic|]{|eng$engine|} |eng$default| }",
+               .open = "|", .close = "|")
+
+  res <- paste0("\\itemize{\n", paste0(res, collapse = "\n"), "\n}")
+  res
+}
+
+# Evaluate a no-argument call to the parsnip model function named by `mod`
+# and report the `engine` element of the result.
+get_default_engine <- function(mod) {
+  spec <- rlang::eval_tidy(rlang::call2(mod, .ns = "parsnip"))
+  spec$engine
+}
+
+#' @export
+#' @rdname doc-tools
+make_seealso_list <- function(mod) {
+  eng <- find_engine_files(mod)
+
+  # Links that are always shown, regardless of which engines are documented.
+  main <- c("\\code{\\link[=fit.model_spec]{fit.model_spec()}}",
+            "\\code{\\link[=set_engine]{set_engine()}}",
+            "\\code{\\link[=update]{update()}}"
+            )
+
+  # Without any engine help topics only the general links are returned;
+  # `NROW()` is zero for both an empty vector and a zero-row data frame.
+  if (NROW(eng) == 0) {
+    return(paste0(main, collapse = ", "))
+  }
+
+  # `|` is used as the glue delimiter because the Rd markup is full of braces.
+  res <-
+    glue::glue("\\code{\\link[=|eng$topic|]{|eng$engine| engine details}}",
+               .open = "|", .close = "|")
+
+  paste0(c(main, res), collapse = ", ")
+}
+
+# Packages left out of the search for engine help topics: they are not
+# expected to ever ship `details_*` documentation, so reading their help
+# index would be wasted work.
+excl_pkgs <- c(
+  "C50", "Cubist", "earth", "flexsurv", "forecast",
+  "glmnet", "keras", "kernlab", "kknn", "klaR",
+  "LiblineaR", "liquidSVM", "magrittr", "MASS", "mda",
+  "mixOmics", "naivebayes", "nnet", "prophet", "pscl",
+  "randomForest", "ranger", "rpart", "rstanarm", "sparklyr",
+  "stats", "survival", "xgboost", "xrf"
+)
+
+# Collect the unique `details_{mod}*` help topic names found in parsnip and in
+# every package registered for `mod`, skipping the packages in `excl_pkgs`.
+search_for_engine_docs <- function(mod) {
+  pkg_info <- get_from_env(paste0(mod, "_pkgs"))
+
+  # parsnip itself is always searched, ahead of any registered dependencies.
+  candidates <- unique(c("parsnip", unlist(pkg_info$pkg)))
+  candidates <- setdiff(candidates, excl_pkgs)
+
+  topics <- purrr::map(candidates, find_details_topics, mod = mod)
+  unique(unlist(topics))
+}
+
+# Return the help topic names in package `pkg` that match `details_{mod}`,
+# read from the package's installed Rd index (`Meta/Rd.rds`).
+find_details_topics <- function(pkg, mod) {
+  rd_index <- system.file("Meta/Rd.rds", package = pkg)
+
+  # `system.file()` yields "" when the file (or the package) is not available.
+  rd_index <- rd_index[nzchar(rd_index)]
+  if (length(rd_index) == 0) {
+    return(character(0))
+  }
+
+  grep(paste0("details_", mod), readRDS(rd_index)$Name, value = TRUE)
+}
diff --git a/R/augment.R b/R/augment.R
@@ -3,13 +3,14 @@
 #' `augment()` will add column(s) for predictions to the given data.
 #'
 #' For regression models, a `.pred` column is added. If `x` was created using
-#' [fit()] and `new_data` contains the outcome column, a `.resid` column is
+#' [fit.model_spec()] and `new_data` contains the outcome column, a `.resid` column is
 #' also added.
 #'
 #' For classification models, the results can include a column called
 #'  `.pred_class` as well as class probability columns named `.pred_{level}`.
 #'  This depends on what type of prediction types are available for the model.
-#' @param x A `model_fit` object produced by [fit()] or [fit_xy()].
+#' @param x A `model_fit` object produced by [fit.model_spec()] or
+#' [fit_xy.model_spec()].
 #' @param new_data A data frame or matrix.
 #' @param ... Not currently used.
 #' @rdname augment

diff --git a/R/boost_tree.R b/R/boost_tree.R
@@ -1,98 +1,58 @@
 # Prototype parsnip code for boosted trees
 
-#' General Interface for Boosted Trees
+#' Boosted trees
 #'
-#' `boost_tree()` is a way to generate a _specification_ of a model
-#'  before fitting and allows the model to be created using
-#'  different packages in R or via Spark. The main arguments for the
-#'  model are:
-#' \itemize{
-#'   \item \code{mtry}: The number of predictors that will be
-#'   randomly sampled at each split when creating the tree models.
-#'   \item \code{trees}: The number of trees contained in the ensemble.
-#'   \item \code{min_n}: The minimum number of data points in a node
-#'   that is required for the node to be split further.
-#'   \item \code{tree_depth}: The maximum depth of the tree (i.e. number of
-#'  splits).
-#'   \item \code{learn_rate}: The rate at which the boosting algorithm adapts
-#'   from iteration-to-iteration.
-#'   \item \code{loss_reduction}: The reduction in the loss function required
-#'   to split further.
-#'   \item \code{sample_size}: The amount of data exposed to the fitting routine.
-#'   \item \code{stop_iter}: The number of iterations without improvement before
-#'   stopping.
-#' }
-#' These arguments are converted to their specific names at the
-#'  time that the model is fit. Other options and arguments can be
-#'  set using the `set_engine()` function. If left to their defaults
-#'  here (`NULL`), the values are taken from the underlying model
-#'  functions. If parameters need to be modified, `update()` can be used
-#'  in lieu of recreating the object from scratch.
+#' @description
+#'
+#' `boost_tree()` defines a model that creates a series of decision trees
+#' forming an ensemble. Each tree depends on the results of previous trees.
+#' All trees in the ensemble are combined to produce a final prediction.
+#'
+#' There are different ways to fit this model. See the engine-specific pages
+#' for more details:
+#'
+#' \Sexpr[stage=render,results=rd]{parsnip:::make_engine_list("boost_tree")}
+#'
+#' More information on how \pkg{parsnip} is used for modeling is at
+#' \url{https://www.tidymodels.org/}.
 #'
 #' @param mode A single character string for the prediction outcome mode.
 #'  Possible values for this model are "unknown", "regression", or
 #'  "classification".
 #' @param engine A single character string specifying what computational engine
-#'  to use for fitting. Possible engines are listed below. The default for this
-#'  model is `"xgboost"`.
+#'  to use for fitting.
 #' @param mtry A number for the number (or proportion) of predictors that will
-#'  be randomly sampled at each split when creating the tree models (`xgboost`
-#'  only).
+#'  be randomly sampled at each split when creating the tree models
+#'  (specific engines only).
 #' @param trees An integer for the number of trees contained in
 #'  the ensemble.
 #' @param min_n An integer for the minimum number of data points
 #'  in a node that is required for the node to be split further.
 #' @param tree_depth An integer for the maximum depth of the tree (i.e. number
-#'  of splits) (`xgboost` only).
+#'  of splits) (specific engines only).
 #' @param learn_rate A number for the rate at which the boosting algorithm adapts
-#'   from iteration-to-iteration (`xgboost` only).
+#'   from iteration-to-iteration (specific engines only).
 #' @param loss_reduction A number for the reduction in the loss function required
-#'   to split further (`xgboost` only).
+#'   to split further (specific engines only).
 #' @param sample_size A number for the number (or proportion) of data that is
 #'  exposed to the fitting routine. For `xgboost`, the sampling is done at
 #'  each iteration while `C5.0` samples once during training.
 #' @param stop_iter The number of iterations without improvement before
-#'   stopping (`xgboost` only).
-#' @details
-#' The data given to the function are not saved and are only used
-#'  to determine the _mode_ of the model. For `boost_tree()`, the
-#'  possible modes are "regression" and "classification".
+#'   stopping (specific engines only).
 #'
-#' The model can be created using the `fit()` function using the
-#'  following _engines_:
-#' \itemize{
-#' \item \pkg{R}: `"xgboost"` (the default), `"C5.0"`
-#' \item \pkg{Spark}: `"spark"`
-#' }
+#' @template spec-details
 #'
-#' For this model, other packages may add additional engines. Use
-#' [show_engines()] to see the current set of engines.
+#' @template spec-references
 #'
-#' @includeRmd man/rmd/boost-tree.Rmd details
+#' @seealso \Sexpr[stage=render,results=rd]{parsnip:::make_seealso_list("boost_tree")},
+#' [xgb_train()], [C5.0_train()]
 #'
-#' @note For models created using the spark engine, there are
-#'  several differences to consider. First, only the formula
-#'  interface to via `fit()` is available; using `fit_xy()` will
-#'  generate an error. Second, the predictions will always be in a
-#'  spark table format. The names will be the same as documented but
-#'  without the dots. Third, there is no equivalent to factor
-#'  columns in spark tables so class predictions are returned as
-#'  character columns. Fourth, to retain the model object for a new
-#'  R session (via `save()`), the `model$fit` element of the `parsnip`
-#'  object should be serialized via `ml_save(object$fit)` and
-#'  separately saved to disk. In a new session, the object can be
-#'  reloaded and reattached to the `parsnip` object.
-#'
-#' @importFrom purrr map_lgl
-#' @seealso [fit()], [set_engine()], [update()]
 #' @examples
 #' show_engines("boost_tree")
 #'
 #' boost_tree(mode = "classification", trees = 20)
-#' # Parameters can be represented by a placeholder:
-#' boost_tree(mode = "regression", mtry = varying())
 #' @export
-
+#' @importFrom purrr map_lgl
 boost_tree <-
   function(mode = "unknown",
            engine = "xgboost",
@@ -573,7 +533,8 @@ xgb_by_tree <- function(tree, object, new_data, type, ...) {
 #'  random proportion of the data should be used to train the model.
 #'  By default, all the samples are used for model training. Samples
 #'  not used for training are used to evaluate the accuracy of the
-#'  model in the printed output.
+#'  model in the printed output. A value of zero means that all the training
+#'  data are used.
 #' @param ... Other arguments to pass.
 #' @return A fitted C5.0 model.
 #' @keywords internal

diff --git a/R/boost_tree_C5.0.R b/R/boost_tree_C5.0.R
@@ -0,0 +1,11 @@
+#' Boosted trees via C5.0
+#'
+#' [C50::C5.0()] creates a series of classification trees forming an
+#' ensemble. Each tree depends on the results of previous trees. All trees in
+#' the ensemble are combined to produce a final prediction.
+#'
+#' @includeRmd man/rmd/boost_tree_C5.0.Rmd details
+#'
+#' @name details_boost_tree_C5.0
+#' @keywords internal
+NULL