Skip to content

Commit d068b3e

Browse files
committed
add missingness
1 parent 903e79c commit d068b3e

File tree

1 file changed

+148
-28
lines changed

1 file changed

+148
-28
lines changed

scripts/signal_spreadsheet_updater.R

Lines changed: 148 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -486,7 +486,7 @@ avail_geos <- c(
486486
# # Not available for all indicators. Try nation. Avoid smaller geos because
487487
# # processing later will take a while.
488488
# geo_type <- "state"
489-
489+
#
490490
# # Consider a range of issues. About 2 weeks is probably fine. Not all indicators
491491
# # are available in this time range, so you may need to make another range of
492492
# # dates that is years or months different.
@@ -507,8 +507,8 @@ avail_geos <- c(
507507
# "2021-02-15",
508508
# "2021-02-16"
509509
# )
510-
511-
510+
#
511+
#
512512
# epidata <- pub_covidcast(
513513
# source,
514514
# signal,
@@ -517,16 +517,16 @@ avail_geos <- c(
517517
# time_type = "day",
518518
# issues = about_2weeks_issues
519519
# )
520-
521-
520+
#
521+
#
522522
# # Make sure data is looking reasonable
523523
# # Number of reference dates reported in each issue
524524
# count(epidata, issue)
525-
525+
#
526526
# # Number of locations reported for each issue and reference date
527527
# count(epidata, issue, time_value)
528-
529-
528+
#
529+
#
530530
# ## Revision cadence
531531
# # For each location and reference date, are all reported values the same across
532532
# # all lags we're checking?
@@ -544,8 +544,8 @@ avail_geos <- c(
544544
# # Are all reference dates without any lag?
545545
# all(revision_comparison$no_backfill == "TRUE")
546546
# View(revision_comparison)
547-
548-
547+
#
548+
#
549549
# ## Reporting lag
550550
# # Find how lagged the newest reported value is for each issue.
551551
# epidata_slice <- epidata %>% group_by(issue) %>% slice_min(lag)
@@ -730,7 +730,66 @@ signal_specific_censoring <- tibble::tribble(
730730
)
731731
source_updated[, col] <- data_censoring[source_updated$data_source]
732732

733-
# TODO
733+
# Overlay signal-level censoring notes on the values already in
# `Data Censoring`: rows matching a (data_source, signal) pair in
# `signal_specific_censoring` take that table's `note`; all other rows keep
# their existing value. The helper `note` column is dropped afterwards.
source_updated <- source_updated %>%
  left_join(
    signal_specific_censoring,
    by = c("Signal" = "signal", "data_source")
  ) %>%
  mutate(`Data Censoring` = coalesce(note, `Data Censoring`)) %>%
  select(-note)
740+
741+
742+
# # Tool: Investigate state and county coverage
743+
# suppressPackageStartupMessages({
744+
# library(epidatr) # Access Delphi API
745+
# library(dplyr) # Data handling
746+
# library(ggplot2)
747+
# })
748+
#
749+
#
750+
# # COVIDcast metadata
751+
# # Metadata documentation: https://cmu-delphi.github.io/delphi-epidata/api/covidcast_meta.html
752+
# metadata <- pub_covidcast_meta()
753+
# # Convert `last_update` into a datetime.
754+
# # metadata$last_update <- as.POSIXct(metadata$last_update, origin = "1970-01-01")
755+
# ## If you don't want the hours, etc., truncate with `as.Date`
756+
# metadata$last_update <- as.Date(as.POSIXct(metadata$last_update, origin = "1970-01-01"))
757+
#
758+
# one_sig_per_source <- metadata %>%
759+
# arrange(desc(signal)) %>%
760+
# group_by(data_source) %>%
761+
# slice_head(n = 1)
762+
#
763+
# state_filtered <- metadata %>%
764+
# filter(geo_type == "state") %>%
765+
# select(data_source, signal, geo_type, num_locations) %>%
766+
# mutate(pct_locations = num_locations / 51 * 100)
767+
# first_sig_per_source_state <- state_filtered %>%
768+
# group_by(data_source) %>%
769+
# slice_head(n = 1)
770+
# first_sig_per_source_state
771+
#
772+
# ggplot(
773+
# data = state_filtered,
774+
# aes(x = data_source, y = pct_locations)
775+
# ) + geom_boxplot()
776+
#
777+
#
778+
# county_filtered <- metadata %>%
779+
# filter(geo_type == "county") %>%
780+
# select(data_source, signal, geo_type, num_locations) %>%
781+
# mutate(pct_locations = num_locations / 3143 * 100)
782+
# first_sig_per_source_county <- county_filtered %>%
783+
# group_by(data_source) %>%
784+
# slice_head(n = 1)
785+
# first_sig_per_source_county
786+
#
787+
# ggplot(
788+
# data = county_filtered,
789+
# aes(x = data_source, y = pct_locations)
790+
# ) + geom_boxplot()
791+
792+
734793
col <- "Missingness"
735794
# How much missingness is there, and for what reasons? Is it possible to
736795
# distinguish a missing value from a true zero? This is an unstructured text
@@ -743,27 +802,88 @@ col <- "Missingness"
743802
# not sure what to do. Maybe just summarize the current state, e.g. "85%
744803
# counties available in mid 2020, then gradually declined to 8% of counties
745804
# by April 2024", and leave it at that. We could occasionally update it.
805+
806+
# Reusable sentences describing complete geographic coverage. They are pasted
# after source-specific sentences below, so every fragment that precedes one
# of them must itself end with a period.
all_counties_terr <- "Data is available for all counties and some territorial county equivalents."
all_states <- "Data is available for all states."
all_states_terr <- "Data is available for all states and some territories."

# Missingness description keyed by data source name. `NA_character_` entries
# have no source-wide description.
missingness <- c(
  "chng" = paste("Data is available for nearly all (99%) of counties.", all_states_terr),
  "covid-act-now" = paste("Data is available for nearly all (99%) of counties. A few counties, most notably in California, are not covered by this data source.", all_states),
  "doctor-visits" = paste("Data is available for about 80% of counties.", all_states_terr),
  "dsew-cpr" = paste(all_counties_terr, all_states_terr),
  "fb-survey" = "Geographic coverage varies widely by signal, with anywhere from 0.4% to 25% of counties available and 15% to 100% of states. A handful of signals are available for 40-50% of counties, and all states and some territories. Signals based on questions that were asked to a subset of survey respondents are available for fewer locations. Availability declines over time as survey response rate decreases. A missing value indicates no valid data OR, for test positivity, that the value was censored due to small sample size (<= 5)",
  "ght" = all_states,
  "google-survey" = paste("Data is available for about 20% of counties.", all_states),
  "google-symptoms" = NA_character_, # described per signal in `signal_specific_missingness`
  "hhs" = all_states_terr,
  "hospital-admissions" = paste("Data is available for about 35% of counties.", all_states),
  "indicator-combination" = paste(all_counties_terr, all_states_terr),
  "jhu-csse" = paste(all_counties_terr, all_states_terr),
  "nchs-mortality" = all_states_terr,
  "quidel" = "Geographic coverage for some age groups (e.g. age 0-4 and age 65+) are extremely limited at HRR and MSA level, and can even be limited at state level on weekends.", # TODO
  "safegraph" = paste(all_counties_terr, all_states_terr),
  "usa-facts" = paste(all_counties_terr, all_states),
  "youtube-survey" = NA_character_ # described per signal in `signal_specific_missingness`
)
# Look up each row's description by its data source and write it into the
# column named by `col`.
source_updated[, col] <- missingness[source_updated$data_source]
766829

830+
# Sentence appended to every google-symptoms row below.
google_symptoms_note <- "Signals associated with rarer symptoms (e.g. ageusia) will tend to have fewer locations available, due to upstream privacy censoring. Locations with lower populations will tend to be less available for the same reason"

# Per-signal missingness notes, one row per (data_source, signal) pair.
# Fragments that are pasted in front of another sentence end with a period so
# the combined text does not run two sentences together.
signal_specific_missingness <- tibble::tribble(
  ~data_source, ~signal, ~note,
  "indicator-combination", "nmf_day_doc_fbc_fbs_ght", paste("Data is available for about 80% of counties.", all_states_terr),
  "indicator-combination", "nmf_day_doc_fbs_ght", paste("Data is available for about 70% of counties.", all_states_terr),

  "safegraph", "bars_visit_num", "Data is available for about 10% of counties. Data is available for about 90% of states",
  "safegraph", "bars_visit_prop", "Data is available for about 10% of counties. Data is available for about 90% of states",
  "safegraph", "restaurants_visit_num", paste("Data is available for about 80% of counties.", all_states_terr),
  "safegraph", "restaurants_visit_prop", paste("Data is available for about 80% of counties.", all_states_terr),

  "fb-survey", "smoothed_cli", paste("Data is available for about 50% of counties.", all_states_terr, "Availability declines over time as survey response rate decreases"),
  "fb-survey", "smoothed_ili", paste("Data is available for about 50% of counties.", all_states_terr, "Availability declines over time as survey response rate decreases"),
  "fb-survey", "smoothed_wcli", paste("Data is available for about 50% of counties.", all_states_terr, "Availability declines over time as survey response rate decreases"),
  "fb-survey", "smoothed_wili", paste("Data is available for about 50% of counties.", all_states_terr, "Availability declines over time as survey response rate decreases"),
  "fb-survey", "smoothed_travel_outside_state_5d", paste("Data is available for about 45% of counties.", all_states_terr, "Availability declines over time as survey response rate decreases"),
  "fb-survey", "smoothed_wtravel_outside_state_5d", paste("Data is available for about 45% of counties.", all_states_terr, "Availability declines over time as survey response rate decreases"),
  "fb-survey", "smoothed_nohh_cmnty_cli", paste("Data is available for about 40% of counties.", all_states_terr, "Availability declines over time as survey response rate decreases"),
  "fb-survey", "smoothed_hh_cmnty_cli", paste("Data is available for about 40% of counties.", all_states_terr, "Availability declines over time as survey response rate decreases"),
  "fb-survey", "smoothed_whh_cmnty_cli", paste("Data is available for about 35% of counties.", all_states_terr, "Availability declines over time as survey response rate decreases"),
  "fb-survey", "smoothed_wnohh_cmnty_cli", paste("Data is available for about 35% of counties.", all_states_terr, "Availability declines over time as survey response rate decreases"),

  "youtube-survey", "raw_cli", "Data is available for about 40% of states",
  "youtube-survey", "raw_ili", "Data is available for about 40% of states",
  "youtube-survey", "smoothed_cli", "Data is available for about 80% of states",
  "youtube-survey", "smoothed_ili", "Data is available for about 80% of states",

  "google-symptoms", "ageusia_raw_search", paste("Data is available for about 3-4% of counties. Data is available for about 85% of states.", google_symptoms_note),
  "google-symptoms", "ageusia_smoothed_search", paste("Data is available for about 3-4% of counties. Data is available for about 85% of states.", google_symptoms_note),
  "google-symptoms", "anosmia_raw_search", paste("Data is available for about 3-4% of counties. Data is available for about 85% of states.", google_symptoms_note),
  "google-symptoms", "anosmia_smoothed_search", paste("Data is available for about 3-4% of counties. Data is available for about 85% of states.", google_symptoms_note),
  "google-symptoms", "s01_raw_search", paste("Data is available for about 50% of counties.", all_states, google_symptoms_note),
  "google-symptoms", "s01_smoothed_search", paste("Data is available for about 50% of counties.", all_states, google_symptoms_note),
  "google-symptoms", "s02_raw_search", paste("Data is available for about 65% of counties.", all_states, google_symptoms_note),
  "google-symptoms", "s02_smoothed_search", paste("Data is available for about 65% of counties.", all_states, google_symptoms_note),
  "google-symptoms", "s03_raw_search", paste("Data is available for about 50% of counties.", all_states, google_symptoms_note),
  "google-symptoms", "s03_smoothed_search", paste("Data is available for about 50% of counties.", all_states, google_symptoms_note),
  "google-symptoms", "s04_raw_search", paste("Data is available for about 30% of counties.", all_states, google_symptoms_note),
  "google-symptoms", "s04_smoothed_search", paste("Data is available for about 30% of counties.", all_states, google_symptoms_note),
  "google-symptoms", "s05_raw_search", paste("Data is available for about 3-4% of counties. Data is available for about 90% of states.", google_symptoms_note),
  "google-symptoms", "s05_smoothed_search", paste("Data is available for about 3-4% of counties. Data is available for about 90% of states.", google_symptoms_note),
  "google-symptoms", "s06_raw_search", paste("Data is available for about 30% of counties.", all_states, google_symptoms_note),
  "google-symptoms", "s06_smoothed_search", paste("Data is available for about 30% of counties.", all_states, google_symptoms_note),
  "google-symptoms", "scontrol_raw_search", paste("Data is available for about 45% of counties.", all_states, google_symptoms_note),
  "google-symptoms", "scontrol_smoothed_search", paste("Data is available for about 45% of counties.", all_states, google_symptoms_note),
  "google-symptoms", "sum_anosmia_ageusia_raw_search", paste("Data is available for about 3-4% of counties. Data is available for about 85% of states.", google_symptoms_note),
  "google-symptoms", "sum_anosmia_ageusia_smoothed_search", paste("Data is available for about 3-4% of counties. Data is available for about 85% of states.", google_symptoms_note),
)
878+
879+
# Overlay signal-level missingness notes on the values already in
# `Missingness`: rows matching a (data_source, signal) pair in
# `signal_specific_missingness` take that table's `note`; all other rows keep
# their existing value. The helper `note` column is dropped afterwards.
source_updated <- source_updated %>%
  left_join(
    signal_specific_missingness,
    by = c("Signal" = "signal", "data_source")
  ) %>%
  mutate(`Missingness` = coalesce(note, `Missingness`)) %>%
  select(-note)
886+
767887

768888
col <- "Who may access this signal?"
769889
# Who has the right to access this signal? E.g. "Delphi, CDC" or "Delphi,

0 commit comments

Comments
 (0)