Changes from all commits (28 commits)
911c0e0 Added archiving diffing utility (eujing, Aug 12, 2020)
5d3b8ec Updated unit tests (eujing, Aug 12, 2020)
2d3fb80 Updated ansible template (eujing, Aug 12, 2020)
f1cace3 Merge pull request #235 from cmu-delphi/main (krivard, Aug 27, 2020)
d8c7019 Merge pull request #234 from cmu-delphi/diff-uploads-usafacts (krivard, Aug 31, 2020)
4debc31 fix broken usafacts tests to read from the proper directories (sgsmob, Oct 15, 2020)
06e4b12 replace deprecated functions (chinandrew, Oct 16, 2020)
f6e7b78 Remove uppercase (chinandrew, Oct 16, 2020)
a164e25 Remove print (chinandrew, Oct 16, 2020)
021cbb4 Remove zips static file (chinandrew, Oct 16, 2020)
707535e Fix argument (chinandrew, Oct 17, 2020)
8091221 Add a gap detector to Sir Complainsalot (capnrefsmmat, Oct 17, 2020)
849263c Formatting fix (capnrefsmmat, Oct 17, 2020)
39df546 Add hospital admissions, USAFacts to Sir Complainsalot (capnrefsmmat, Oct 17, 2020)
51c0c03 make new receiving directory in test directory (sgsmob, Oct 19, 2020)
0783ebd Merge pull request #327 from cmu-delphi/sir-gapdetector (krivard, Oct 19, 2020)
bb8e17e Merge pull request #331 from cmu-delphi/deploy-safegraph (krivard, Oct 20, 2020)
0115538 Issue template for feature release (krivard, Oct 20, 2020)
44be069 Change auto-assign of release tasks (krivard, Oct 20, 2020)
a1a1b50 Temporarily skip linting in Jenkins (korlaxxalrok, Oct 20, 2020)
b0bb289 Merge pull request #314 from sgsmob/fix_usa_tests (krivard, Oct 20, 2020)
317a4d5 Merge pull request #334 from cmu-delphi/deploy-usafacts (krivard, Oct 20, 2020)
b21252f Merge pull request #322 from cmu-delphi/geo_refactor_claimshosp (krivard, Oct 20, 2020)
6b96a1a migrate safegraph.run back onto using functools.partial (sgsmob, Oct 21, 2020)
933c6b4 fix output file format from YYYY-MM-DD to YYYYMMDD (sgsmob, Oct 21, 2020)
3d0e5c5 remove wildcard imports from test (chinandrew, Oct 21, 2020)
12aac02 Merge pull request #349 from cmu-delphi/fix-test-imports (krivard, Oct 21, 2020)
117bc9f Merge branch 'main' of github.com:cmu-delphi/covidcast-indicators int… (sgsmob, Oct 21, 2020)
30 changes: 30 additions & 0 deletions .github/ISSUE_TEMPLATE/feature_release.md
@@ -0,0 +1,30 @@
---
name: Feature release
about: Begin the finishing work for features ready to be included in a release
title: 'Release NEW_THING'
labels: 'release'
assignees: 'benjaminysmith'
---

- [Link to issue]()
- [Link to PR]()
- Proposed release version: <!-- eg 1.12 -->

<!-- Additional information about the feature: -->


<!-- relevant for most work -->

- [ ] API [documentation](https://github.com/cmu-delphi/delphi-epidata/tree/main/docs/api) and/or [changelog](https://github.com/cmu-delphi/delphi-epidata/blob/main/docs/api/covidcast_changelog.md)
- [ ] API mailing list notification

<!-- relevant for new signals -->

- [ ] Statistical review (usually [correlations](https://github.com/cmu-delphi/covidcast/tree/main/docs/R-notebooks))
- [ ] Signal / source name review (usually [Roni](https://docs.google.com/document/d/10hGd4Evce4lJ4VkWaQEKFQxvmw2P4xyYGtIAWF52Sf8/edit?usp=sharing))

<!-- relevant for new map signals -->

- [ ] Visual review
- [ ] [Signal description pop-up text](https://docs.google.com/document/d/1kDqRg8EaI4WQXMaUUbbCGPlsUqEql8kgXCNt6AvMA9I/edit?usp=sharing) review
- [ ] [Map release notes](https://docs.google.com/document/d/1BpxGgIma_Lkd2kxtwEo2DBdHQ3zk6dHRz-leUIRlOIA/edit?usp=sharing)
7 changes: 0 additions & 7 deletions ansible/files/usafacts-params-prod.json

This file was deleted.

12 changes: 12 additions & 0 deletions ansible/templates/usafacts-params-prod.json.j2
@@ -0,0 +1,12 @@
{
"export_start_date": "latest",
"static_file_dir": "./static",
"export_dir": "/common/covidcast/receiving/usa-facts",
"cache_dir": "./cache",
"base_url": "https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_{metric}_usafacts.csv",
"aws_credentials": {
"aws_access_key_id": "{{ delphi_aws_access_key_id }}",
"aws_secret_access_key": "{{ delphi_aws_secret_access_key }}"
},
"bucket_name": "delphi-covidcast-indicator-output"
}
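
This template replaces the static ansible/files/usafacts-params-prod.json deleted above, so the AWS credentials are injected from Ansible variables at deploy time instead of being committed to the repo. A minimal sketch of what the rendering step produces, using the standalone jinja2 library with placeholder credentials (the real playbook supplies these from its own variable store):

import json
from jinja2 import Template

with open("ansible/templates/usafacts-params-prod.json.j2") as f:
    template = Template(f.read())

# Placeholder values; in production these come from Ansible's vaulted variables.
rendered = template.render(
    delphi_aws_access_key_id="AKIA-EXAMPLE",
    delphi_aws_secret_access_key="EXAMPLE-SECRET",
)

params = json.loads(rendered)  # the rendered template is plain JSON
assert params["bucket_name"] == "delphi-covidcast-indicator-output"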
2 changes: 1 addition & 1 deletion cdc_covidnet/tests/test_handle_wip_signal.py
@@ -1,6 +1,6 @@
import unittest
from delphi_cdc_covidnet.update_sensor import add_prefix
from delphi_cdc_covidnet.constants import *
from delphi_cdc_covidnet.constants import SIGNALS

def test_handle_wip_signal():
# Test wip_signal = True, add prefix to all signals
19 changes: 13 additions & 6 deletions claims_hosp/delphi_claims_hosp/update_indicator.py
@@ -96,13 +96,21 @@ def geo_reindex(self, data):
"""
geo_map = GeoMapper()
if self.geo == "county":
data_frame = geo_map.county_to_megacounty(
data, Config.MIN_DEN, Config.MAX_BACKWARDS_PAD_LENGTH,
thr_col="den", mega_col=self.geo)
data_frame = geo_map.fips_to_megacounty(data,
Config.MIN_DEN,
Config.MAX_BACKWARDS_PAD_LENGTH,
thr_col="den",
mega_col=self.geo)
elif self.geo == "state":
data_frame = geo_map.county_to_state(data, state_id_col=self.geo)
data_frame = geo_map.replace_geocode(data,
from_code="fips",
new_col=self.geo,
new_code="state_id")
data_frame[self.geo] = data_frame[self.geo]
elif self.geo == "msa":
data_frame = geo_map.county_to_msa(data, msa_col=self.geo)
data_frame = geo_map.replace_geocode(data,
from_code="fips",
new_code=self.geo)
elif self.geo == "hrr":
data_frame = data # data is already adjusted in aggregation step above
else:
@@ -119,7 +127,6 @@ def geo_reindex(self, data):
assert (
len(multiindex) <= (GeoConstants.MAX_GEO[self.geo] * len(self.fit_dates))
), "more loc-date pairs than maximum number of geographies x number of dates"

# fill dataframe with missing dates using 0
data_frame = data_frame.reindex(multiindex, fill_value=0)
data_frame.fillna(0, inplace=True)
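
The geo_reindex changes migrate off the deprecated county_to_* helpers onto the current GeoMapper API: fips_to_megacounty pools low-count counties into state-level megacounties, and replace_geocode handles the generic FIPS-to-state and FIPS-to-MSA crosswalks. A rough sketch of the replace_geocode calls on a toy frame; the column names, values, and the date_col keyword are illustrative and should be checked against the delphi_utils version pinned here:

import pandas as pd
from delphi_utils import GeoMapper

gm = GeoMapper()
df = pd.DataFrame({
    "fips": ["42003", "42005", "06037"],            # county FIPS codes
    "timestamp": pd.to_datetime(["2020-10-01"] * 3),
    "num": [1, 2, 3],
    "den": [10, 12, 8],
})

# County -> state: aggregates the numeric columns up to state_id ("pa", "ca", ...).
state_df = gm.replace_geocode(df, from_code="fips", new_code="state_id",
                              date_col="timestamp")

# County -> MSA: same pattern with a different target geography.
msa_df = gm.replace_geocode(df, from_code="fips", new_code="msa",
                            date_col="timestamp")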
33,100 changes: 0 additions & 33,100 deletions claims_hosp/static/02_20_uszips.csv

This file was deleted.

2 changes: 1 addition & 1 deletion claims_hosp/tests/test_load_data.py
@@ -4,7 +4,7 @@

# first party
from delphi_claims_hosp.config import Config, GeoConstants
from delphi_claims_hosp.load_data import *
from delphi_claims_hosp.load_data import load_data, load_claims_data
from delphi_utils import read_params

CONFIG = Config()
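
Swapping the wildcard import for an explicit list keeps the test namespace predictable and lets pylint flag unused or undefined names; the same cleanup is applied to the other test modules in this PR. The pattern, in brief:

# Before: every public name in load_data leaks into the test module.
from delphi_claims_hosp.load_data import *

# After: only what the tests actually use, and each name is greppable.
from delphi_claims_hosp.load_data import load_data, load_claims_data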
1 change: 0 additions & 1 deletion claims_hosp/tests/test_update_indicator.py
@@ -12,7 +12,6 @@

# first party
from delphi_claims_hosp.config import Config, GeoConstants
from delphi_claims_hosp.load_data import *
from delphi_claims_hosp.update_indicator import ClaimsHospIndicatorUpdater

CONFIG = Config()
4 changes: 3 additions & 1 deletion jenkins/usafacts-jenkins-test.sh
@@ -15,7 +15,9 @@ local_indicator="usafacts"
cd "${WORKSPACE}/${local_indicator}" || exit

# Linter
env/bin/pylint delphi_"${local_indicator}"
#env/bin/pylint delphi_"${local_indicator}"
echo "Skip linting because we have weird breakage :( \
TODO: https://github.com/cmu-delphi/covidcast-indicators/issues/333"

# Unit tests and code coverage
cd tests || exit && \
31 changes: 16 additions & 15 deletions safegraph/delphi_safegraph/process.py
@@ -15,6 +15,7 @@
# Base file name for raw data CSVs.
CSV_NAME = 'social-distancing.csv.gz'


def validate(df):
"""Confirms that a data frame has only one date."""
timestamps = df['date_range_start'].apply(date_from_timestamp)
@@ -235,13 +236,13 @@ def process_window(df_list: List[pd.DataFrame],
f'{signal}_se': 'se',
f'{signal}_n': 'sample_size',
}, axis=1)
df_export.to_csv(f'{export_dir}/{date}_{geo_res}_{signal}.csv',
date_str = date.strftime('%Y%m%d')
df_export.to_csv(f'{export_dir}/{date_str}_{geo_res}_{signal}.csv',
na_rep='NA',
index=False, )


def process(current_filename: str,
previous_filenames: List[str],
def process(filenames: List[str],
signal_names: List[str],
wip_signal,
geo_resolutions: List[str],
@@ -250,11 +251,11 @@ def process(current_filename: str,
as averaged over the previous week.
Parameters
----------
current_filename: str
path to file holding the target date's data.
previous_filenames: List[str]
paths to files holding data from each day in the week preceding the
target date.
filenames: List[str]
paths to files holding data.
The first entry of the list should correspond to the target date, while
the remaining entries should correspond to each day in the week
preceding the target date.
signal_names: List[str]
signal names to be processed for a single date.
A second version of each such signal named {SIGNAL}_7d_avg will be
@@ -274,8 +275,8 @@
one for the data averaged over the previous week to
{export_dir}/{date}_{resolution}_{signal}_7d_avg.csv.
"""
past_week = [pd.read_csv(current_filename)]
for fname in previous_filenames:
past_week = []
for fname in filenames:
if os.path.exists(fname):
past_week.append(pd.read_csv(fname))

Expand All @@ -286,8 +287,8 @@ def process(current_filename: str,
export_dir)
# ...then as part of the whole window.
process_window(past_week,
add_prefix(add_suffix(signal_names, '_7d_avg'),
wip_signal,
'wip_'),
geo_resolutions,
export_dir)
add_prefix(add_suffix(signal_names, '_7d_avg'),
wip_signal,
'wip_'),
geo_resolutions,
export_dir)
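
The process_window change also fixes the export filename: interpolating a date object directly into an f-string yields the ISO form 2020-02-14, while the receiving pipeline expects the compact YYYYMMDD prefix used by the other indicators. A quick illustration:

from datetime import date

d = date(2020, 2, 14)
str(d)                # '2020-02-14' (the old, rejected filename prefix)
d.strftime('%Y%m%d')  # '20200214'  (the corrected prefix)
print(f"{d.strftime('%Y%m%d')}_county_completely_home_prop.csv")
# 20200214_county_completely_home_prop.csv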
32 changes: 15 additions & 17 deletions safegraph/delphi_safegraph/run.py
@@ -3,6 +3,7 @@
when the module is run with `python -m MODULE_NAME`.
"""
import glob
import functools
import multiprocessing as mp
import subprocess

@@ -24,22 +25,13 @@ def run_module():
aws_endpoint = params["aws_endpoint"]
wip_signal = params["wip_signal"]

def process_file(current_filename):
"""Wrapper around `process()` that only takes a single argument.

A single argument function is necessary to use `pool.map()` below.
Because each call to `process()` has two arguments that are dependent
on the input file name (`current_filename` and `previous_filenames`),
we choose to use this wrapper rather than something like
`functools.partial()`.
"""
return process(current_filename,
files_in_past_week(current_filename),
signal_names=SIGNALS,
wip_signal=wip_signal,
geo_resolutions=GEO_RESOLUTIONS,
export_dir=export_dir,
)
single_arg_process = functools.partial(
process,
signal_names=SIGNALS,
wip_signal=wip_signal,
geo_resolutions=GEO_RESOLUTIONS,
export_dir=export_dir,
)

# Update raw data
# Why call subprocess rather than using a native Python client, e.g. boto3?
@@ -60,5 +52,11 @@
files = glob.glob(f'{raw_data_dir}/social-distancing/**/*.csv.gz',
recursive=True)

files_with_previous_weeks = []
for fname in files:
previous_week = [fname]
previous_week.extend(files_in_past_week(fname))
files_with_previous_weeks.append(previous_week)

with mp.Pool(n_core) as pool:
pool.map(process_file, files)
pool.map(single_arg_process, files_with_previous_weeks)
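
Because Pool.map hands each worker exactly one positional argument, the refactor binds all of process()'s fixed settings with functools.partial and packs the per-task filenames into a single list: target file first, then the files from the preceding week. This also avoids the pickling pitfall of the old nested process_file wrapper, since Pool.map pickles the callable it dispatches, and a partial over a module-level function pickles cleanly where a locally defined closure generally does not. A toy sketch of the pattern, with made-up filenames and settings:

import functools
import multiprocessing as mp

def process(filenames, signal_names, wip_signal, geo_resolutions, export_dir):
    """Stand-in for delphi_safegraph.process.process."""
    print(filenames[0], "averaged with", filenames[1:])

single_arg_process = functools.partial(
    process,
    signal_names=["completely_home_prop"],
    wip_signal=False,
    geo_resolutions=["county"],
    export_dir="./receiving",
)

# Each work item is [target_file, *previous_week_files].
work_items = [
    ["day8.csv", "day7.csv", "day6.csv"],
    ["day9.csv", "day8.csv", "day7.csv"],
]

if __name__ == "__main__":
    with mp.Pool(2) as pool:
        pool.map(single_arg_process, work_items)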
12 changes: 6 additions & 6 deletions safegraph/tests/test_process.py
@@ -128,7 +128,7 @@ def test_process_window(self, tmp_path):
'sample_size': [2, 2]
})
actual = pd.read_csv(
export_dir / '2020-02-14_county_completely_home_prop.csv')
export_dir / '20200214_county_completely_home_prop.csv')
pd.testing.assert_frame_equal(expected, actual)

def test_process(self, tmp_path):
@@ -137,11 +137,11 @@ def test_process(self, tmp_path):
export_dir = tmp_path / 'export'
export_dir.mkdir()

process('raw_data/small_raw_data_0.csv',
# File 2 does not exist.
['raw_data/small_raw_data_1.csv',
process(['raw_data/small_raw_data_0.csv',
'raw_data/small_raw_data_1.csv',
# File 2 does not exist.
'raw_data/small_raw_data_2.csv',
'raw_data/small_raw_data_3.csv', ],
'raw_data/small_raw_data_3.csv'],
SIGNALS,
['median_home_dwell_time',
'completely_home_prop_7d_avg'],
@@ -199,7 +199,7 @@ def test_process(self, tmp_path):
})
}
actual = {signal: pd.read_csv(
export_dir / f'2020-06-12_state_{signal}.csv')
export_dir / f'20200612_state_{signal}.csv')
for signal in expected}
for signal in expected:
pd.testing.assert_frame_equal(expected[signal], actual[signal])
68 changes: 62 additions & 6 deletions sir_complainsalot/delphi_sir_complainsalot/check_source.py
@@ -1,6 +1,8 @@
from dataclasses import dataclass
from typing import List

import covidcast
import numpy as np
import pandas as pd

@dataclass
@@ -27,33 +29,87 @@ def to_md(self):
message=self.message, updated=self.last_updated.strftime("%Y-%m-%d"))

def check_source(data_source, meta, params, grace):
"""Iterate over all signals from a source and check if they exceed max age."""
"""Iterate over all signals from a source and check for problems.

Possible problems:

- Newest available data exceeds max age.
- Gap between subsequent data points exceeds max gap.

For example, consider a source with a max age of 5 days and max gap of 1
day. If today is 2020-10-15, and the latest available data is from
2020-10-09, the max age is exceeded. If there is no data available on
2020-10-07, but there is on 2020-10-06 and 2020-10-08, there is a gap of 2
days and the max gap is exceeded.

The gap window controls how much data we check for gaps -- a gap window of
10 days means we check the most recent 10 days of data. Defaults to 7.

"""

source_config = params[data_source]
gap_window = pd.Timedelta(days=source_config.get("gap_window", 7))
max_allowed_gap = source_config.get("max_gap", 1)

signals = meta[meta.data_source == data_source]

now = pd.Timestamp.now()

complaints = {}
age_complaints = {}
gap_complaints = {}

for _, row in signals.iterrows():
if "retired-signals" in source_config and \
row["signal"] in source_config["retired-signals"]:
continue

# Check max age
age = (now - row["max_time"]).days

if age > source_config["max_age"] + grace:
if row["signal"] not in complaints:
complaints[row["signal"]] = Complaint(
if row["signal"] not in age_complaints:
age_complaints[row["signal"]] = Complaint(
"is more than {age} days old".format(age=age),
data_source,
row["signal"],
[row["geo_type"]],
row["max_time"],
source_config["maintainers"])
else:
complaints[row["signal"]].geo_types.append(row["geo_type"])
age_complaints[row["signal"]].geo_types.append(row["geo_type"])

# Check max gap
if max_allowed_gap == -1:
# No gap detection for this source
continue

latest_data = covidcast.signal(
data_source, row["signal"],
start_day=row["max_time"] - gap_window,
end_day=row["max_time"],
geo_type=row["geo_type"]
)

# convert numpy datetime values to pandas datetimes and then to
# datetime.date, so we can work with timedeltas after
unique_dates = [pd.to_datetime(val).date()
for val in latest_data["time_value"].unique()]

gap_days = [(day - prev_day).days
for day, prev_day in zip(unique_dates[1:], unique_dates[:-1])]
gap = max(gap_days)

if gap > max_allowed_gap:
if row["signal"] not in gap_complaints:
gap_complaints[row["signal"]] = Complaint(
"has a {gap}-day gap of missing data in its most recent "
"{gap_window} days of data".format(gap=gap, gap_window=gap_window.days),
data_source,
row["signal"],
[row["geo_type"]],
row["max_time"],
source_config["maintainers"])
else:
gap_complaints[row["signal"]].geo_types.append(row["geo_type"])

return list(complaints.values())
return list(age_complaints.values()) + list(gap_complaints.values())
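
The gap check reduces the fetched window to its unique dates and takes consecutive differences; any difference larger than max_gap days files a complaint. A standalone sketch of the arithmetic on a toy series (sorting defensively here, since the production code relies on covidcast.signal returning time values in order, and note that max() would raise if fewer than two unique dates came back):

import pandas as pd

# 2020-10-07 is missing from this series.
time_values = pd.Series(pd.to_datetime(
    ["2020-10-06", "2020-10-06", "2020-10-08", "2020-10-09"]))

unique_dates = sorted(pd.to_datetime(v).date() for v in time_values.unique())
gap_days = [(day - prev_day).days
            for day, prev_day in zip(unique_dates[1:], unique_dates[:-1])]

print(max(gap_days))  # 2 -> exceeds a max_gap of 1, so a complaint is filed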
2 changes: 1 addition & 1 deletion sir_complainsalot/delphi_sir_complainsalot/run.py
@@ -22,7 +22,7 @@ def run_module():

complaints = []
for data_source in params["sources"].keys():
complaints.extend(check_source(data_source, meta, params["sources"], params.get("grace",0)))
complaints.extend(check_source(data_source, meta, params["sources"], params.get("grace", 0)))

if len(complaints) > 0:
for complaint in complaints: