Commit 58229d8

|> in backtesting, dropped a section in get started
1 parent f1e8104 commit 58229d8

2 files changed: +77 additions, -41 deletions


vignettes/backtesting.Rmd

Lines changed: 38 additions & 38 deletions
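Most of the changes in this file are a mechanical swap of the magrittr pipe `%>%` for R's native pipe `|>` (available since R 4.1). For simple calls the two are interchangeable; here is a minimal sketch of the difference (illustrative only, not taken from the vignette):

```r
library(dplyr)

# Both pipelines return the same summary
mtcars %>% filter(cyl == 4) %>% summarise(mean_mpg = mean(mpg))
mtcars |> filter(cyl == 4) |> summarise(mean_mpg = mean(mpg))

# One caveat: the base pipe requires a function call on its right-hand side,
# so `mtcars |> names()` works but `mtcars |> names` is a parse error.
```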
@@ -57,9 +57,9 @@ medical insurance claims and the number of new confirmed COVID-19 cases per
 
 ```{r grab-epi-data}
 # Select the `percent_cli` column from the data archive
-doctor_visits <- archive_cases_dv_subset$DT %>%
-  select(geo_value, time_value, version, percent_cli) %>%
-  tidyr::drop_na(percent_cli) %>%
+doctor_visits <- archive_cases_dv_subset$DT |>
+  select(geo_value, time_value, version, percent_cli) |>
+  tidyr::drop_na(percent_cli) |>
   as_epi_archive(compactify = TRUE)
 ```
 
@@ -76,8 +76,8 @@ doctor_visits <- pub_covidcast(
   geo_values = "ca,fl,ny,tx",
   time_values = epirange(20200601, 20211201),
   issues = epirange(20200601, 20211201)
-) %>%
-  rename(version = issue, percent_cli = value) %>%
+) |>
+  rename(version = issue, percent_cli = value) |>
   as_epi_archive(compactify = TRUE)
 ```
 
@@ -98,20 +98,20 @@ percent_cli_data <- bind_rows(
   # Snapshotted data for the version-faithful forecasts
   map(
     forecast_dates,
-    ~ doctor_visits %>%
-      epix_as_of(.x) %>%
+    ~ doctor_visits |>
+      epix_as_of(.x) |>
       mutate(version = .x)
-  ) %>%
-    bind_rows() %>%
+  ) |>
+    bind_rows() |>
     mutate(version_faithful = TRUE),
   # Latest data for the version-faithless forecasts
-  doctor_visits %>%
-    epix_as_of(doctor_visits$versions_end) %>%
+  doctor_visits |>
+    epix_as_of(doctor_visits$versions_end) |>
     mutate(version_faithful = FALSE)
 )
 
 p0 <-
-  ggplot(data = percent_cli_data %>% filter(geo_value == geo_choose)) +
+  ggplot(data = percent_cli_data |> filter(geo_value == geo_choose)) +
   geom_vline(aes(color = factor(version), xintercept = version), lty = 2) +
   geom_line(
     aes(x = time_value, y = percent_cli, color = factor(version)),
@@ -153,9 +153,9 @@ of the red time-series to its left.
 In fact, if we take a snapshot and get the last `time_value`:
 
 ```{r}
-doctor_visits %>%
-  epix_as_of(as.Date("2020-08-01")) %>%
-  pull(time_value) %>%
+doctor_visits |>
+  epix_as_of(as.Date("2020-08-01")) |>
+  pull(time_value) |>
   max()
 ```
 
@@ -184,14 +184,14 @@ One way to do this is by setting the `.version` argument for `epix_slide()`:
 
 ```{r single_version, warn = FALSE}
 forecast_date <- as.Date("2021-04-06")
-forecasts <- doctor_visits %>%
+forecasts <- doctor_visits |>
   epix_slide(
     ~ arx_forecaster(
       .x,
       outcome = "percent_cli",
       predictors = "percent_cli",
       args_list = arx_args_list()
-    )$predictions %>%
+    )$predictions |>
       pivot_quantiles_wider(.pred_distn),
     .versions = forecast_date
   )
@@ -201,12 +201,12 @@ As truth data, we'll compare with the `epix_as_of()` to generate a snapshot of
 the archive at the last date[^1].
 
 ```{r compare_single_with_result}
-forecasts %>%
+forecasts |>
   inner_join(
-    doctor_visits %>%
+    doctor_visits |>
       epix_as_of(doctor_visits$versions_end),
     by = c("geo_value", "target_date" = "time_value")
-  ) %>%
+  ) |>
   select(geo_value, forecast_date, .pred, `0.05`, `0.95`, percent_cli)
 ```
 
@@ -226,9 +226,9 @@ This has the effect of simulating a data set that receives the final version
 updates every day.
 
 ```{r}
-archive_cases_dv_subset_faux <- doctor_visits %>%
-  epix_as_of(doctor_visits$versions_end) %>%
-  mutate(version = time_value) %>%
+archive_cases_dv_subset_faux <- doctor_visits |>
+  epix_as_of(doctor_visits$versions_end) |>
+  mutate(version = time_value) |>
   as_epi_archive()
 ```
 
@@ -250,10 +250,10 @@ forecast_wrapper <- function(
          lags = c(0:7, 14, 21),
          adjust_latency = "extend_ahead"
        )
-      )$predictions %>%
+      )$predictions |>
        pivot_quantiles_wider(.pred_distn)
     }
-  ) %>%
+  ) |>
     bind_rows()
 }
 ```
@@ -275,20 +275,20 @@ forecast_dates <- seq(
 )
 aheads <- c(1, 7, 14, 21, 28)
 
-version_faithless <- archive_cases_dv_subset_faux %>%
+version_faithless <- archive_cases_dv_subset_faux |>
   epix_slide(
     ~ forecast_wrapper(.x, aheads, "percent_cli", "percent_cli"),
     .before = 120,
     .versions = forecast_dates
-  ) %>%
+  ) |>
   mutate(version_faithful = FALSE)
 
-version_faithful <- doctor_visits %>%
+version_faithful <- doctor_visits |>
   epix_slide(
     ~ forecast_wrapper(.x, aheads, "percent_cli", "percent_cli"),
     .before = 120,
     .versions = forecast_dates
-  ) %>%
+  ) |>
   mutate(version_faithful = TRUE)
 
 forecasts <-
@@ -315,8 +315,8 @@ ny), we'll just display the results for two states, California (CA) and Florida
 
 ```{r plot_ca_forecasts, warning = FALSE}
 geo_choose <- "ca"
-forecasts_filtered <- forecasts %>%
-  filter(geo_value == geo_choose) %>%
+forecasts_filtered <- forecasts |>
+  filter(geo_value == geo_choose) |>
   mutate(time_value = version)
 
 p1 <- # first plotting the forecasts as bands, lines and points
@@ -325,10 +325,10 @@ p1 <- # first plotting the forecasts as bands, lines and points
   geom_line(aes(y = .pred, color = factor(time_value)), linetype = 2L) +
   geom_point(aes(y = .pred, color = factor(time_value)), size = 0.75) +
   # the forecast date
-  geom_vline(data = percent_cli_data %>% filter(geo_value == geo_choose) %>% select(-version_faithful), aes(color = factor(version), xintercept = version), lty = 2) +
+  geom_vline(data = percent_cli_data |> filter(geo_value == geo_choose) |> select(-version_faithful), aes(color = factor(version), xintercept = version), lty = 2) +
   # the underlying data
   geom_line(
-    data = percent_cli_data %>% filter(geo_value == geo_choose),
+    data = percent_cli_data |> filter(geo_value == geo_choose),
     aes(x = time_value, y = percent_cli, color = factor(version)),
     inherit.aes = FALSE, na.rm = TRUE
   ) +
@@ -341,8 +341,8 @@ p1 <- # first plotting the forecasts as bands, lines and points
 
 ```{r plot_fl_forecasts, warning = FALSE}
 geo_choose <- "fl"
-forecasts_filtered <- forecasts %>%
-  filter(geo_value == geo_choose) %>%
+forecasts_filtered <- forecasts |>
+  filter(geo_value == geo_choose) |>
   mutate(time_value = version)
 
 p2 <-
@@ -351,11 +351,11 @@ p2 <-
   geom_line(aes(y = .pred, color = factor(time_value)), linetype = 2L) +
   geom_point(aes(y = .pred, color = factor(time_value)), size = 0.75) +
   geom_vline(
-    data = percent_cli_data %>% filter(geo_value == geo_choose) %>% select(-version_faithful),
+    data = percent_cli_data |> filter(geo_value == geo_choose) |> select(-version_faithful),
     aes(color = factor(version), xintercept = version), lty = 2
   ) +
   geom_line(
-    data = percent_cli_data %>% filter(geo_value == geo_choose),
+    data = percent_cli_data |> filter(geo_value == geo_choose),
     aes(x = time_value, y = percent_cli, color = factor(version)),
     inherit.aes = FALSE, na.rm = TRUE
   ) +
@@ -397,7 +397,7 @@ p2
 
 
 [^1]: For forecasting a single day like this, we could have actually just used
-`doctor_visits %>% epix_as_of(forecast_date)` to get the relevant snapshot, and then fed that into `arx_forecaster()` as we did in the [landing
+`doctor_visits |> epix_as_of(forecast_date)` to get the relevant snapshot, and then fed that into `arx_forecaster()` as we did in the [landing
 page](../index.html#motivating-example).
 
 
vignettes/epipredict.Rmd

Lines changed: 39 additions & 3 deletions
@@ -19,6 +19,7 @@ library(recipes)
 library(epidatasets)
 library(epipredict)
 library(ggplot2)
+library(purrr)
 forecast_date <- as.Date("2021-08-01")
 used_locations <- c("ca", "ma", "ny", "tx")
 library(epidatr)
@@ -331,8 +332,41 @@ autoplot(
 The 8 graphs are all pairs of the `geo_values` (`"Quebec"` and `"British Columbia"`), `edu_quals` (`"Undergraduate degree"` and `"Professional degree"`), and age brackets (`"15 to 34 years"` and `"35 to 64 years"`).
 
 ## Fitting a non-geo-pooled model
-The primary difference to avoid geo-pooling is to first `group_by(geo_value)`
-before forecasting
+
+Because our internal methods fit a single model, to fit a non-geo-pooled model
+that has a different fit for each geography, one either needs a multi-level
+engine (which at the moment parsnip doesn't support), or one needs to map over
+geographies.
+
+```{r fit_non_geo_pooled, warning=FALSE}
+geo_values <- covid_case_death_rates |> pull(geo_value) |> unique()
+
+all_fits <-
+  purrr::map(geo_values, \(geo) {
+    covid_case_death_rates |>
+      filter(
+        geo_value == geo,
+        time_value <= forecast_date) |>
+      arx_forecaster(
+        outcome = "death_rate",
+        trainer = linear_reg(),
+        predictors = c("death_rate"),
+        args_list = arx_args_list(
+          lags = list(c(0, 7, 14)),
+          ahead = 14
+        )
+      )
+  })
+map_df(all_fits, ~ pluck(., "predictions"))
+```
+
+This is both 56 times slower[^7], and uses far less data to fit each model.
+If the geographies are at all comparable, for example by normalization, we would
+get much better results by pooling.
+
+If we wanted to build a geo-aware model, such as one that sets the constant in a
+linear regression fit to be different for each geography, we would need to build a [Custom workflow](custom_epiworkflows) with geography as a factor.
+
 # Anatomy of a canned forecaster
 ## Code object
 Let's dissect the forecaster we trained back on the [landing
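The added section above ends by pointing at a geo-aware model in which the regression constant differs by geography. As a rough illustration of that idea only (plain `lm()` on hand-built lags, not the epipredict custom-workflow machinery the vignette links to; it assumes daily observations with no gaps):

```r
library(dplyr)

# Build a 14-day-ahead target and lagged predictors within each geography,
# then let geo_value enter the linear model as a factor: each geography gets
# its own intercept while the lag coefficients are shared.
lagged <- covid_case_death_rates |>
  filter(time_value <= forecast_date) |>
  group_by(geo_value) |>
  arrange(time_value, .by_group = TRUE) |>
  mutate(
    lag_7 = lag(death_rate, 7),
    lag_14 = lag(death_rate, 14),
    target = lead(death_rate, 14) # value 14 days ahead
  ) |>
  ungroup()

geo_aware_fit <- lm(
  target ~ geo_value + death_rate + lag_7 + lag_14,
  data = lagged
)
```

A full version would be expressed as the [Custom workflow](custom_epiworkflows) the vignette links to, with `geo_value` entering the model matrix as a factor.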
@@ -390,7 +424,7 @@ An `epi_workflow()` consists of 3 parts:
 5 of as these well. You can inspect the layers more closely by running
 `epipredict::extract_layers(four_week_ahead$epi_workflow)`.
 
-See the [Guts vignette](preprocessing-and-models) for recreating and then
+See the [Guts vignette](custom_epiworkflows) for recreating and then
 extending `four_week_ahead` using the custom forecaster framework.
 
 ## Mathematical description
@@ -436,3 +470,5 @@ without `NA` values is a training point to fit the coefficients $a_0,\ldots, a_6
 
 [^6]: alternatively, for an unfit version of the preprocessor, you can call
 `hardhat::extract_preprocessor(four_week_ahead$epi_workflow)`
+
+[^7]: the number of geographies
