|
1 | | -source(here::here("R", "load_all.R")) |
| 1 | +suppressPackageStartupMessages(source(here::here("R", "load_all.R"))) |
2 | 2 |
|
3 | | -data <- tribble( |
4 | | - ~geo_value, ~time_value, ~version, ~value1, |
5 | | - "us", "2024-11-08", "2024-11-13", 1, |
6 | | - "us", "2024-11-07", "2024-11-13", 2, |
7 | | - "us", "2024-11-06", "2024-11-13", 3, |
8 | | - "us", "2024-11-05", "2024-11-13", 4, |
9 | | - "us", "2024-11-04", "2024-11-13", 5, |
10 | | - "us", "2024-11-03", "2024-11-13", 6, |
11 | | - "us", "2024-11-02", "2024-11-13", 7, |
12 | | - "us", "2024-11-01", "2024-11-13", 8, |
13 | | - "us", "2024-10-31", "2024-11-13", 9, |
14 | | - "us", "2024-10-30", "2024-11-13", 10, |
15 | | - "us", "2024-10-29", "2024-11-13", 11, |
16 | | - "us", "2024-10-28", "2024-11-13", 12, |
17 | | - "us", "2024-10-27", "2024-11-13", 13 |
18 | | -) %>% |
19 | | - mutate(value2 = value1 * 11) %>% |
20 | | - bind_rows((.) %>% mutate(geo_value = "ca", value1 = value1 * 3 + 1, value2 = value2 + 50)) %>% |
21 | | - mutate(time_value = as.Date(time_value), version = as.Date(version)) %>% |
22 | | - as_epi_df() |
23 | 3 |
|
24 | | -r <- epi_recipe(data) %>% |
25 | | - step_YeoJohnson2(value1, value2) %>% |
26 | | - prep(data) |
27 | | -r |
28 | | -r$steps[[1]]$lambdas |
29 | | -outcome <- r %>% bake(data) |
30 | | - |
31 | | -httpgd::hgd() |
32 | | -data %>% |
33 | | - pivot_longer(c(value1, value2), names_to = "variable", values_to = "value") %>% |
34 | | - ggplot(aes(time_value, value, color = variable)) + |
35 | | - geom_line() + |
36 | | - geom_line( |
37 | | - data = outcome %>% pivot_longer(c(value1, value2), names_to = "variable", values_to = "value"), |
38 | | - aes(time_value, value, color = variable), |
39 | | - ) + |
40 | | - facet_wrap(~geo_value, scales = "free_y") + |
41 | | - theme_minimal() + |
42 | | - labs(title = "Yeo-Johnson transformation", x = "Time", y = "Value") |
| 4 | +# Real data test |
| 5 | +Sys.setenv(TAR_PROJECT = "flu_hosp_explore") |
43 | 6 |
|
44 | 7 |
|
| 8 | +# Transform with Yeo-Johnson |
45 | 9 | data <- tar_read(joined_archive_data) %>% |
46 | | - epix_as_of(as.Date("2023-11-01")) %>% |
47 | | - filter(source == "nhsn") %>% |
48 | | - rename(value = hhs) |
49 | | -r <- epi_recipe(data) %>% |
50 | | - step_YeoJohnson2(value) %>% |
51 | | - prep(data) |
| 10 | + epix_as_of(as.Date("2023-11-01")) |
| 11 | +state_geo_values <- data %>% filter(source == "nhsn") %>% pull(geo_value) %>% unique() |
| 12 | +filtered_data <- data %>% |
| 13 | + filter(geo_value %in% state_geo_values) %>% |
| 14 | + select(geo_value, source, time_value, hhs) |
| 15 | +r <- epi_recipe(filtered_data) %>% |
| 16 | + step_YeoJohnson2(hhs) %>% |
| 17 | + prep(filtered_data) |
52 | 18 | r |
| 19 | +# Inspect the lambda values (a few states have default lambda = 0.25, because |
| 20 | +# they have issues) |
53 | 21 | r$steps[[1]]$lambdas %>% print(n = 55) |
54 | | -outcome <- r %>% bake(data) |
| 22 | +out1 <- r %>% bake(filtered_data) |
55 | 23 |
|
56 | | -httpgd::hgd() |
57 | | -data %>% |
58 | | - ggplot(aes(time_value, value)) + |
59 | | - geom_line(color = "blue") + |
60 | | - geom_line(data = outcome, aes(time_value, value), color = "green") + |
| 24 | +# Transform with manual whitening (quarter root scaling) |
| 25 | +# learned_params <- calculate_whitening_params(filtered_data, "hhs", scale_method = "none", center_method = "none", nonlin_method = "quart_root") |
| 26 | +out2 <- filtered_data %>% |
| 27 | + mutate(hhs = (hhs + 0.01)^(1 / 4)) |
| 28 | + |
| 29 | +out1 %>% |
| 30 | + left_join(out2, by = c("geo_value", "source", "time_value")) %>% |
| 31 | + mutate(hhs_diff = hhs.x - hhs.y) %>% |
| 32 | + ggplot(aes(time_value, hhs_diff)) + |
| 33 | + geom_line() + |
61 | 34 | facet_wrap(~geo_value, scales = "free_y") + |
62 | 35 | theme_minimal() + |
63 | | - labs(title = "Yeo-Johnson transformation", x = "Time", y = "Value") |
| 36 | + labs(title = "Yeo-Johnson transformation", x = "Time", y = "HHS") |
| 37 | + |
| 38 | +# Plot the real data before and after transformation |
| 39 | +geo_filter <- "ca" |
| 40 | +filtered_data %>% |
| 41 | + filter(geo_value == geo_filter, source == "nhsn") %>% |
| 42 | + mutate(hhs = log(hhs)) %>% |
| 43 | + ggplot(aes(time_value, hhs)) + |
| 44 | + geom_line(color = "blue") + |
| 45 | + geom_line(data = out1 %>% filter(geo_value == geo_filter, source == "nhsn") %>% mutate(hhs = log(hhs)), aes(time_value, hhs), color = "green") + |
| 46 | + geom_line(data = out2 %>% filter(geo_value == geo_filter, source == "nhsn") %>% mutate(hhs = log(hhs)), aes(time_value, hhs), color = "red") + |
| 47 | + theme_minimal() + |
| 48 | + labs(title = "Yeo-Johnson transformation", x = "Time", y = "HHS") |
64 | 49 |
|
65 | 50 |
|
66 | 51 | # TODO: Test this. |
67 | 52 | ## Layer Yeo-Johnson2 |
68 | 53 | postproc <- frosting() %>% |
69 | 54 | layer_YeoJohnson2() |
70 | 55 |
|
71 | | -wf <- epi_workflow(r, linear_reg()) %>% |
| 56 | +wf <- epi_workflow(r) %>% |
72 | 57 | fit(data) %>% |
73 | 58 | add_frosting(postproc) |
0 commit comments