Commit 58229d8

|> in backtesting, dropped a section in get started
1 parent f1e8104 commit 58229d8

2 files changed: +77 additions, -41 deletions


vignettes/backtesting.Rmd

Lines changed: 38 additions & 38 deletions
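Most of the changes in this file are a mechanical swap of the magrittr pipe `%>%` for R's native pipe `|>` (available since R 4.1). For simple calls the two are interchangeable; here is a minimal sketch of the difference (illustrative only, not taken from the vignette):

```r
library(dplyr)

# Both pipelines return the same summary
mtcars %>% filter(cyl == 4) %>% summarise(mean_mpg = mean(mpg))
mtcars |> filter(cyl == 4) |> summarise(mean_mpg = mean(mpg))

# One caveat: the base pipe requires a function call on its right-hand side,
# so `mtcars |> names()` works but `mtcars |> names` is a parse error.
```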
@@ -57,9 +57,9 @@ medical insurance claims and the number of new confirmed COVID-19 cases per
 
 ```{r grab-epi-data}
 # Select the `percent_cli` column from the data archive
-doctor_visits <- archive_cases_dv_subset$DT %>%
-  select(geo_value, time_value, version, percent_cli) %>%
-  tidyr::drop_na(percent_cli) %>%
+doctor_visits <- archive_cases_dv_subset$DT |>
+  select(geo_value, time_value, version, percent_cli) |>
+  tidyr::drop_na(percent_cli) |>
   as_epi_archive(compactify = TRUE)
 ```
 
@@ -76,8 +76,8 @@ doctor_visits <- pub_covidcast(
   geo_values = "ca,fl,ny,tx",
   time_values = epirange(20200601, 20211201),
   issues = epirange(20200601, 20211201)
-) %>%
-  rename(version = issue, percent_cli = value) %>%
+) |>
+  rename(version = issue, percent_cli = value) |>
   as_epi_archive(compactify = TRUE)
 ```
 
@@ -98,20 +98,20 @@ percent_cli_data <- bind_rows(
   # Snapshotted data for the version-faithful forecasts
   map(
     forecast_dates,
-    ~ doctor_visits %>%
-      epix_as_of(.x) %>%
+    ~ doctor_visits |>
+      epix_as_of(.x) |>
       mutate(version = .x)
-  ) %>%
-    bind_rows() %>%
+  ) |>
+    bind_rows() |>
     mutate(version_faithful = TRUE),
   # Latest data for the version-faithless forecasts
-  doctor_visits %>%
-    epix_as_of(doctor_visits$versions_end) %>%
+  doctor_visits |>
+    epix_as_of(doctor_visits$versions_end) |>
     mutate(version_faithful = FALSE)
 )
 
 p0 <-
-  ggplot(data = percent_cli_data %>% filter(geo_value == geo_choose)) +
+  ggplot(data = percent_cli_data |> filter(geo_value == geo_choose)) +
   geom_vline(aes(color = factor(version), xintercept = version), lty = 2) +
   geom_line(
     aes(x = time_value, y = percent_cli, color = factor(version)),
@@ -153,9 +153,9 @@ of the red time-series to its left.
 In fact, if we take a snapshot and get the last `time_value`:
 
 ```{r}
-doctor_visits %>%
-  epix_as_of(as.Date("2020-08-01")) %>%
-  pull(time_value) %>%
+doctor_visits |>
+  epix_as_of(as.Date("2020-08-01")) |>
+  pull(time_value) |>
   max()
 ```
 
@@ -184,14 +184,14 @@ One way to do this is by setting the `.version` argument for `epix_slide()`:
 
 ```{r single_version, warn = FALSE}
 forecast_date <- as.Date("2021-04-06")
-forecasts <- doctor_visits %>%
+forecasts <- doctor_visits |>
   epix_slide(
     ~ arx_forecaster(
       .x,
       outcome = "percent_cli",
       predictors = "percent_cli",
       args_list = arx_args_list()
-    )$predictions %>%
+    )$predictions |>
       pivot_quantiles_wider(.pred_distn),
     .versions = forecast_date
   )
@@ -201,12 +201,12 @@ As truth data, we'll compare with the `epix_as_of()` to generate a snapshot of
 the archive at the last date[^1].
 
 ```{r compare_single_with_result}
-forecasts %>%
+forecasts |>
   inner_join(
-    doctor_visits %>%
+    doctor_visits |>
       epix_as_of(doctor_visits$versions_end),
     by = c("geo_value", "target_date" = "time_value")
-  ) %>%
+  ) |>
   select(geo_value, forecast_date, .pred, `0.05`, `0.95`, percent_cli)
 ```
 
@@ -226,9 +226,9 @@ This has the effect of simulating a data set that receives the final version
 updates every day.
 
 ```{r}
-archive_cases_dv_subset_faux <- doctor_visits %>%
-  epix_as_of(doctor_visits$versions_end) %>%
-  mutate(version = time_value) %>%
+archive_cases_dv_subset_faux <- doctor_visits |>
+  epix_as_of(doctor_visits$versions_end) |>
+  mutate(version = time_value) |>
   as_epi_archive()
 ```
 
@@ -250,10 +250,10 @@ forecast_wrapper <- function(
          lags = c(0:7, 14, 21),
          adjust_latency = "extend_ahead"
        )
-      )$predictions %>%
+      )$predictions |>
        pivot_quantiles_wider(.pred_distn)
     }
-  ) %>%
+  ) |>
     bind_rows()
 }
 ```
@@ -275,20 +275,20 @@ forecast_dates <- seq(
 )
 aheads <- c(1, 7, 14, 21, 28)
 
-version_faithless <- archive_cases_dv_subset_faux %>%
+version_faithless <- archive_cases_dv_subset_faux |>
   epix_slide(
     ~ forecast_wrapper(.x, aheads, "percent_cli", "percent_cli"),
     .before = 120,
     .versions = forecast_dates
-  ) %>%
+  ) |>
   mutate(version_faithful = FALSE)
 
-version_faithful <- doctor_visits %>%
+version_faithful <- doctor_visits |>
   epix_slide(
     ~ forecast_wrapper(.x, aheads, "percent_cli", "percent_cli"),
     .before = 120,
     .versions = forecast_dates
-  ) %>%
+  ) |>
   mutate(version_faithful = TRUE)
 
 forecasts <-
@@ -315,8 +315,8 @@ ny), we'll just display the results for two states, California (CA) and Florida
 
 ```{r plot_ca_forecasts, warning = FALSE}
 geo_choose <- "ca"
-forecasts_filtered <- forecasts %>%
-  filter(geo_value == geo_choose) %>%
+forecasts_filtered <- forecasts |>
+  filter(geo_value == geo_choose) |>
   mutate(time_value = version)
 
 p1 <- # first plotting the forecasts as bands, lines and points
@@ -325,10 +325,10 @@ p1 <- # first plotting the forecasts as bands, lines and points
   geom_line(aes(y = .pred, color = factor(time_value)), linetype = 2L) +
   geom_point(aes(y = .pred, color = factor(time_value)), size = 0.75) +
   # the forecast date
-  geom_vline(data = percent_cli_data %>% filter(geo_value == geo_choose) %>% select(-version_faithful), aes(color = factor(version), xintercept = version), lty = 2) +
+  geom_vline(data = percent_cli_data |> filter(geo_value == geo_choose) |> select(-version_faithful), aes(color = factor(version), xintercept = version), lty = 2) +
   # the underlying data
   geom_line(
-    data = percent_cli_data %>% filter(geo_value == geo_choose),
+    data = percent_cli_data |> filter(geo_value == geo_choose),
     aes(x = time_value, y = percent_cli, color = factor(version)),
     inherit.aes = FALSE, na.rm = TRUE
   ) +
@@ -341,8 +341,8 @@ p1 <- # first plotting the forecasts as bands, lines and points
 
 ```{r plot_fl_forecasts, warning = FALSE}
 geo_choose <- "fl"
-forecasts_filtered <- forecasts %>%
-  filter(geo_value == geo_choose) %>%
+forecasts_filtered <- forecasts |>
+  filter(geo_value == geo_choose) |>
   mutate(time_value = version)
 
 p2 <-
@@ -351,11 +351,11 @@ p2 <-
   geom_line(aes(y = .pred, color = factor(time_value)), linetype = 2L) +
   geom_point(aes(y = .pred, color = factor(time_value)), size = 0.75) +
   geom_vline(
-    data = percent_cli_data %>% filter(geo_value == geo_choose) %>% select(-version_faithful),
+    data = percent_cli_data |> filter(geo_value == geo_choose) |> select(-version_faithful),
     aes(color = factor(version), xintercept = version), lty = 2
   ) +
   geom_line(
-    data = percent_cli_data %>% filter(geo_value == geo_choose),
+    data = percent_cli_data |> filter(geo_value == geo_choose),
     aes(x = time_value, y = percent_cli, color = factor(version)),
     inherit.aes = FALSE, na.rm = TRUE
   ) +
@@ -397,7 +397,7 @@ p2
 
 
 [^1]: For forecasting a single day like this, we could have actually just used
-`doctor_visits %>% epix_as_of(forecast_date)` to get the relevant snapshot, and then fed that into `arx_forecaster()` as we did in the [landing
+`doctor_visits |> epix_as_of(forecast_date)` to get the relevant snapshot, and then fed that into `arx_forecaster()` as we did in the [landing
 page](../index.html#motivating-example).
 
 
vignettes/epipredict.Rmd

Lines changed: 39 additions & 3 deletions
@@ -19,6 +19,7 @@ library(recipes)
 library(epidatasets)
 library(epipredict)
 library(ggplot2)
+library(purrr)
 forecast_date <- as.Date("2021-08-01")
 used_locations <- c("ca", "ma", "ny", "tx")
 library(epidatr)
@@ -331,8 +332,41 @@ autoplot(
 The 8 graphs are all pairs of the `geo_values` (`"Quebec"` and `"British Columbia"`), `edu_quals` (`"Undergraduate degree"` and `"Professional degree"`), and age brackets (`"15 to 34 years"` and `"35 to 64 years"`).
 
 ## Fitting a non-geo-pooled model
-The primary difference to avoid geo-pooling is to first `group_by(geo_value)`
-before forecasting
+
+Because our internal methods fit a single model, to fit a non-geo-pooled model
+that has a different fit for each geography, one either needs a multi-level
+engine (which at the moment parsnip doesn't support), or one needs to map over
+geographies.
+
+```{r fit_non_geo_pooled, warning=FALSE}
+geo_values <- covid_case_death_rates |> pull(geo_value) |> unique()
+
+all_fits <-
+  purrr::map(geo_values, \(geo) {
+    covid_case_death_rates |>
+      filter(
+        geo_value == geo,
+        time_value <= forecast_date) |>
+      arx_forecaster(
+        outcome = "death_rate",
+        trainer = linear_reg(),
+        predictors = c("death_rate"),
+        args_list = arx_args_list(
+          lags = list(c(0, 7, 14)),
+          ahead = 14
+        )
+      )
+  })
+map_df(all_fits, ~ pluck(., "predictions"))
+```
+
+This is both 56 times slower[^7], and uses far less data to fit each model.
+If the geographies are at all comparable, for example by normalization, we would
+get much better results by pooling.
+
+If we wanted to build a geo-aware model, such as one that sets the constant in a
+linear regression fit to be different for each geography, we would need to build a [Custom workflow](custom_epiworkflows) with geography as a factor.
+
 # Anatomy of a canned forecaster
 ## Code object
 Let's dissect the forecaster we trained back on the [landing
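The added section above ends by pointing at a geo-aware model in which the regression constant differs by geography. As a rough illustration of that idea only (plain `lm()` on hand-built lags, not the epipredict custom-workflow machinery the vignette links to; it assumes daily observations with no gaps):

```r
library(dplyr)

# Build a 14-day-ahead target and lagged predictors within each geography,
# then let geo_value enter the linear model as a factor: each geography gets
# its own intercept while the lag coefficients are shared.
lagged <- covid_case_death_rates |>
  filter(time_value <= forecast_date) |>
  group_by(geo_value) |>
  arrange(time_value, .by_group = TRUE) |>
  mutate(
    lag_7 = lag(death_rate, 7),
    lag_14 = lag(death_rate, 14),
    target = lead(death_rate, 14) # value 14 days ahead
  ) |>
  ungroup()

geo_aware_fit <- lm(
  target ~ geo_value + death_rate + lag_7 + lag_14,
  data = lagged
)
```

A full version would be expressed as the [Custom workflow](custom_epiworkflows) the vignette links to, with `geo_value` entering the model matrix as a factor.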
@@ -390,7 +424,7 @@ An `epi_workflow()` consists of 3 parts:
 5 of as these well. You can inspect the layers more closely by running
 `epipredict::extract_layers(four_week_ahead$epi_workflow)`.
 
-See the [Guts vignette](preprocessing-and-models) for recreating and then
+See the [Guts vignette](custom_epiworkflows) for recreating and then
 extending `four_week_ahead` using the custom forecaster framework.
 
 ## Mathematical description
@@ -436,3 +470,5 @@ without `NA` values is a training point to fit the coefficients $a_0,\ldots, a_6
 
 [^6]: alternatively, for an unfit version of the preprocessor, you can call
 `hardhat::extract_preprocessor(four_week_ahead$epi_workflow)`
+
+[^7]: the number of geographies
