Skip to content

Commit 2c36d3b

Browse files
committed
score notebooks (covid+flu)x(nhsn+nssp)
1 parent dcba5ca commit 2c36d3b

File tree

6 files changed

+680
-210
lines changed

6 files changed

+680
-210
lines changed

R/targets/score_targets.R

Lines changed: 44 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,34 @@
11
get_external_forecasts <- function(external_object_name) {
2-
locations_crosswalk <- get_population_data() %>%
3-
select(state_id, state_code) %>%
4-
filter(state_id != "usa")
5-
s3read_using(
6-
nanoparquet::read_parquet,
7-
object = external_object_name,
8-
bucket = "forecasting-team-data"
9-
) %>%
10-
filter(output_type == "quantile") %>%
11-
select(target, forecaster, geo_value = location, forecast_date, target_end_date, quantile = output_type_id, value) %>%
12-
inner_join(locations_crosswalk, by = c("geo_value" = "state_code")) %>%
13-
mutate(geo_value = state_id) %>%
14-
select(target, forecaster, geo_value, forecast_date, target_end_date, quantile, value)
2+
# try-catch in case that particular date is a 404
3+
tryCatch(
4+
{
5+
external_values <- s3read_using(
6+
nanoparquet::read_parquet,
7+
object = external_object_name,
8+
bucket = "forecasting-team-data"
9+
)
10+
locations_crosswalk <- get_population_data() %>%
11+
select(state_id, state_code) %>%
12+
filter(state_id != "usa")
13+
external_values <- external_values %>%
14+
filter(output_type == "quantile") %>%
15+
select(target, forecaster, geo_value = location, forecast_date, target_end_date, quantile = output_type_id, value) %>%
16+
inner_join(locations_crosswalk, by = c("geo_value" = "state_code")) %>%
17+
mutate(geo_value = state_id) %>%
18+
select(target, forecaster, geo_value, forecast_date, target_end_date, quantile, value)
19+
return(external_values)
20+
},
21+
error = function(e) {
22+
return(tibble())
23+
}
24+
)
1525
}
1626

17-
score_forecasts <- function(latest_data, joined_forecasts_and_ensembles, target) {
27+
score_forecasts <- function(latest_data, forecasts, target) {
28+
browser()
29+
if (length(forecasts) == 0) {
30+
return(tibble())
31+
}
1832
truth_data <-
1933
latest_data %>%
2034
select(geo_value, target_end_date = time_value, oracle_value = value) %>%
@@ -25,17 +39,30 @@ score_forecasts <- function(latest_data, joined_forecasts_and_ensembles, target)
2539
) %>%
2640
drop_na() %>%
2741
rename(location = state_code) %>%
28-
select(-geo_value)
42+
select(-geo_value) %>%
43+
mutate(
44+
target_end_date = round_date(target_end_date, unit = "week", week_start = 6)
45+
)
2946
# limit the forecasts to the same set of forecasting times
3047
max_forecast_date <-
31-
joined_forecasts_and_ensembles %>%
48+
forecasts %>%
3249
group_by(forecaster) %>%
3350
summarize(max_forecast = max(forecast_date)) %>%
3451
pull(max_forecast) %>%
3552
min()
3653
forecasts_formatted <-
37-
joined_forecasts_and_ensembles[joined_forecasts_and_ensembles$forecast_date <= max_forecast_date, ] %>%
54+
forecasts[forecasts$forecast_date <= max_forecast_date, ]
55+
forecasts_formatted <-
56+
forecasts_formatted[forecasts_formatted$target == target, ]
57+
# no forecasts for that target for these forecast dates
58+
if (nrow(forecasts_formatted) == 0) {
59+
return(tibble())
60+
}
61+
forecasts_formatted %<>%
3862
format_scoring_utils(target)
63+
if (target == "wk inc covid prop ed visits") {
64+
forecasts_formatted %<>% mutate(value = value * 100)
65+
}
3966
scores <-
4067
forecasts_formatted %>%
4168
filter(output_type == "quantile") %>%

R/utils.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -726,7 +726,7 @@ get_socrata_updated_at <- function(dataset_url, missing_value) {
726726
#' create a list of valid locations x forecast_dates shared among forecasters
727727
#' which have at least `min_locations` and `min_dates`, and create a list of
728728
#' these for each forecaster
729-
get_unique <- function(forecasts, min_locations = 50, min_dates = 12) {
729+
get_unique <- function(forecasts, min_locations = 50, min_dates = 40) {
730730
forecasters <- forecasts %>%
731731
pull(forecaster) %>%
732732
unique()
@@ -789,7 +789,7 @@ filter_shared_geo_dates <- function(
789789
if (local_forecasts %>% distinct(forecast_date) %>% length() == 1) {
790790
viable_dates <-
791791
external_forecasts %>%
792-
get_unique()
792+
get_unique(min_locations = min_locations, min_dates = min_dates)
793793
} else {
794794
viable_dates <- inner_join(
795795
local_forecasts %>%

scripts/covid_hosp_prod.R

Lines changed: 122 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ g_insufficient_data_geos_nssp <- c(g_insufficient_data_geos, "wy")
1616
g_time_value_adjust <- 3
1717
g_fetch_args <- epidatr::fetch_args_list(return_empty = FALSE, timeout_seconds = 400)
1818
g_disease <- "covid"
19+
g_s3_prefix <- "exploration"
1920
g_external_object_name <- glue::glue("exploration/2024-2025_{g_disease}_hosp_forecasts.parquet")
2021
# date to cut the truth data off at, so we don't have too much of the past
2122
g_truth_data_date <- "2023-09-01"
@@ -153,19 +154,19 @@ parameters_and_date_targets <- rlang::list2(
153154
command = "scripts/reports/ongoing_score_report.Rmd"
154155
),
155156
tar_file(
156-
score_report_rmd,
157+
name = score_report_rmd,
157158
command = "scripts/reports/score_report.Rmd"
158159
),
159160
tar_file(
160-
covid_geo_exclusions,
161+
name = covid_geo_exclusions,
161162
command = "scripts/covid_geo_exclusions.csv"
162163
),
163164
tar_file(
164-
covid_nssp_geo_exclusions,
165+
name = covid_nssp_geo_exclusions,
165166
command = "scripts/covid_nssp_geo_exclusions.csv"
166167
),
167168
tar_file(
168-
covid_data_substitutions,
169+
name = covid_data_substitutions,
169170
command = "scripts/covid_data_substitutions.csv"
170171
),
171172
tar_change(
@@ -176,15 +177,15 @@ parameters_and_date_targets <- rlang::list2(
176177
}
177178
),
178179
tar_target(
179-
nhsn_latest_data,
180+
name = nhsn_latest_data,
180181
command = {
181182
nhsn_archive_data %>%
182183
epix_as_of(min(Sys.Date(), nhsn_archive_data$versions_end)) %>%
183184
filter(geo_value %nin% g_insufficient_data_geos)
184185
}
185186
),
186187
tar_change(
187-
nssp_archive_data,
188+
name = nssp_archive_data,
188189
change = max(
189190
get_covidcast_signal_last_update("nssp", "pct_ed_visits_covid", "state"),
190191
get_socrata_updated_at("https://data.cdc.gov/api/views/mpgq-jmmr", lubridate::now(tz = "UTC"))
@@ -194,7 +195,7 @@ parameters_and_date_targets <- rlang::list2(
194195
}
195196
),
196197
tar_target(
197-
nssp_latest_data,
198+
name = nssp_latest_data,
198199
command = {
199200
nssp_archive_data %>%
200201
epix_as_of(min(Sys.Date(), nssp_archive_data$versions_end))
@@ -647,7 +648,64 @@ ensemble_targets <- tar_map(
647648

648649

649650
# ================================ SCORE TARGETS ================================
650-
score_targets <- list2(
651+
external_forecast_targets <- tar_map(
652+
values = tibble(
653+
forecast_date_int = seq(as.Date("2024-11-23"), round_date(Sys.Date() - 3, "week", 6), by = "week")
654+
) %>%
655+
mutate(
656+
forecast_date_chr = as.character(as.Date(forecast_date_int)),
657+
filename = paste0(g_s3_prefix, "/", forecast_date_chr, "/", g_disease, "_forecasts.parquet"),
658+
),
659+
names = "forecast_date_chr",
660+
tar_change(
661+
name = external_forecasts,
662+
change = get_s3_object_last_modified(filename, "forecasting-team-data"),
663+
command = {
664+
get_external_forecasts(filename)
665+
}
666+
),
667+
tar_target(
668+
name = score_external_nhsn_forecasts,
669+
command = {
670+
score_forecasts(nhsn_latest_data, external_forecasts, "wk inc covid hosp")
671+
}
672+
),
673+
tar_target(
674+
name = score_external_nssp_forecasts,
675+
command = {
676+
score_forecasts(
677+
nssp_latest_data %>% mutate(value = nssp),
678+
external_forecasts,
679+
"wk inc covid prop ed visits")
680+
}
681+
)
682+
)
683+
684+
combined_targets <- list2(
685+
tar_combine(
686+
name = external_forecasts_full,
687+
external_forecast_targets[["external_forecasts"]],
688+
command = {
689+
dplyr::bind_rows(!!!.x)
690+
}
691+
),
692+
tar_combine(
693+
name = external_scores_nhsn_full,
694+
external_forecast_targets[["score_external_nhsn_forecasts"]],
695+
command = {
696+
dplyr::bind_rows(!!!.x)
697+
}
698+
),
699+
tar_combine(
700+
name = external_scores_nssp_full,
701+
external_forecast_targets[["score_external_nssp_forecasts"]],
702+
command = {
703+
dplyr::bind_rows(!!!.x)
704+
}
705+
)
706+
)
707+
708+
list2(
651709
tar_change(
652710
external_forecasts,
653711
change = get_s3_object_last_modified(g_external_object_name, "forecasting-team-data"),
@@ -663,7 +721,9 @@ score_targets <- list2(
663721
dplyr::bind_rows(!!!.x),
664722
external_forecasts %>%
665723
filter(target == "wk inc covid hosp") %>%
666-
select(-target)
724+
select(-target),
725+
min_locations = 52,
726+
min_dates = 40
667727
)
668728
}
669729
),
@@ -675,7 +735,10 @@ score_targets <- list2(
675735
dplyr::bind_rows(!!!.x),
676736
external_forecasts %>%
677737
filter(target == "wk inc covid prop ed visits") %>%
678-
select(-target)
738+
select(-target) %>%
739+
mutate(value = value * 100),
740+
min_locations = 50,
741+
min_dates = 14
679742
)
680743
}
681744
),
@@ -706,27 +769,66 @@ if (g_backtest_mode) {
706769
} else {
707770
# Only render the report if there is only one forecast date
708771
# i.e. we're running this in prod on schedule
709-
score_notebook <- tar_target(
710-
ongoing_score_notebook,
711-
command = {
772+
score_notebook <- list2(
773+
tar_target(
774+
ongoing_nhsn_score_notebook,
775+
command = {
712776
if (!dir.exists(here::here("reports"))) {
713777
dir.create(here::here("reports"))
714778
}
779+
# Don't run if there aren't forecasts in the past 4 weeks to evaluate
780+
if (external_forecasts_full %>%
781+
filter(
782+
forecast_date >= round_date(Sys.Date() - 3, "week", 6) - 4*7,
783+
target == "wk inc covid hosp") %>% distinct(forecast_date) %>% nrow() == 0) {
784+
return()
785+
}
715786
rmarkdown::render(
716787
ongoing_score_report_rmd,
717788
output_file = here::here(
718789
"reports",
719-
sprintf("%s_covid_scoring.html", as.Date(Sys.Date()))
790+
sprintf("%s_covid_nhsn_scoring.html", as.Date(Sys.Date()))
720791
),
721792
params = list(
722793
disease = "covid",
723794
target = "nhsn",
724-
external_forecasts = external_forecasts,
725-
nhsn_archive = nhsn_archive_data,
726-
scores_nhsn = scores_nhsn
795+
external_forecasts = external_forecasts_full %>% filter(target == "wk inc covid hosp") %>% select(-target),
796+
archive = nhsn_archive_data,
797+
scores = external_scores_nhsn_full
727798
)
728799
)
729-
}
800+
}
801+
),
802+
tar_target(
803+
ongoing_nssp_score_notebook,
804+
command = {
805+
if (!dir.exists(here::here("reports"))) {
806+
dir.create(here::here("reports"))
807+
}
808+
# Don't run if there aren't forecasts in the past 4 weeks to evaluate
809+
if (external_forecasts_full %>%
810+
filter(
811+
forecast_date >= round_date(Sys.Date() - 3, "week", 6) - 4*7,
812+
target == "wk inc covid prop ed visits"
813+
) %>% distinct(forecast_date) %>% nrow() == 0) {
814+
return()
815+
}
816+
rmarkdown::render(
817+
ongoing_score_report_rmd,
818+
output_file = here::here(
819+
"reports",
820+
sprintf("%s_covid_nssp_scoring.html", as.Date(Sys.Date()))
821+
),
822+
params = list(
823+
disease = "covid",
824+
target = "nssp",
825+
external_forecasts = external_forecasts_full %>% filter(target == "wk inc covid prop ed visits") %>% select(-target),
826+
archive = nssp_archive_data,
827+
scores = external_scores_nssp_full
828+
)
829+
)
830+
}
831+
),
730832
)
731833
}
732834

@@ -736,6 +838,7 @@ list2(
736838
ensemble_targets,
737839
combined_nhsn_forecasts,
738840
combined_nssp_forecasts,
739-
score_targets,
841+
external_forecast_targets,
842+
combined_targets,
740843
score_notebook
741844
)

0 commit comments

Comments
 (0)