
Commit 3450dfc

nmdefries and minhkhul authored
Add nchs-mortality raw data backups and backup export utility (#2065)
* add helper fn in utils to save backup data to csv * use helper to save nchs data to disk right after pulling * add backup dir param * import backup utility * update arg name * add gzip + fix old json template log + remove table_name * fix current tests to take backup dirs and custom_run flag into account * add logging * fix log getsize of backup file * lint * lint * lint * lint * add backup test * remove deep copy --------- Co-authored-by: minhkhul <[email protected]> Co-authored-by: minhkhul <[email protected]>
1 parent efeff2f commit 3450dfc

File tree

10 files changed (+245, -19 lines)


_delphi_utils_python/delphi_utils/__init__.py

Lines changed: 6 additions & 7 deletions

@@ -4,15 +4,14 @@
 from __future__ import absolute_import
 
 from .archive import ArchiveDiffer, GitArchiveDiffer, S3ArchiveDiffer
-from .export import create_export_csv
-from .utils import read_params
-
-from .slack_notifier import SlackNotifier
-from .logger import get_structured_logger
+from .export import create_backup_csv, create_export_csv
 from .geomap import GeoMapper
-from .smooth import Smoother
-from .signal import add_prefix
+from .logger import get_structured_logger
 from .nancodes import Nans
+from .signal import add_prefix
+from .slack_notifier import SlackNotifier
+from .smooth import Smoother
+from .utils import read_params
 from .weekday import Weekday
 
 __version__ = "0.3.25"
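
With the imports re-sorted, the new create_backup_csv helper is exposed at the package root alongside create_export_csv. A minimal sanity check, assuming delphi_utils at this commit is installed locally (e.g. pip install -e _delphi_utils_python):

# Both helpers are re-exported from delphi_utils.export via __init__.py.
from delphi_utils import create_backup_csv, create_export_csv

print(create_backup_csv.__module__)  # delphi_utils.export
print(create_export_csv.__module__)  # delphi_utils.export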

_delphi_utils_python/delphi_utils/export.py

Lines changed: 72 additions & 3 deletions

@@ -1,16 +1,18 @@
 """Export data in the format expected by the Delphi API."""
 # -*- coding: utf-8 -*-
+import gzip
+import logging
 from datetime import datetime
-from os.path import join
+from os.path import getsize, join
 from typing import Optional
-import logging
 
-from epiweeks import Week
 import numpy as np
 import pandas as pd
+from epiweeks import Week
 
 from .nancodes import Nans
 
+
 def filter_contradicting_missing_codes(df, sensor, metric, date, logger=None):
     """Find values with contradictory missingness codes, filter them, and log."""
     columns = ["val", "se", "sample_size"]

@@ -132,3 +134,70 @@ def create_export_csv(
         export_df = export_df.sort_values(by="geo_id")
         export_df.to_csv(export_file, index=False, na_rep="NA")
     return dates
+
+
+def create_backup_csv(
+    df: pd.DataFrame,
+    backup_dir: str,
+    custom_run: bool,
+    issue: Optional[str] = None,
+    geo_res: Optional[str] = None,
+    sensor: Optional[str] = None,
+    metric: Optional[str] = None,
+    logger: Optional[logging.Logger] = None,
+):
+    """Save data for use as a backup.
+
+    This function is meant to save raw data fetched from data sources.
+    Therefore, it avoids manipulating the data as much as possible to
+    preserve the input.
+
+    When only required arguments are passed, data will be saved to a file of
+    the format `<export_dir>/<today's date as YYYYMMDD>.csv`. Optional arguments
+    should be passed if the source data is fetched from different tables or
+    in batches by signal, geo, etc.
+
+    Parameters
+    ----------
+    df: pd.DataFrame
+        Columns: geo_id, timestamp, val, se, sample_size
+    backup_dir: str
+        Backup directory
+    custom_run: bool
+        Flag indicating if the current run is a patch, or other run where
+        backups aren't needed. If so, don't save any data to disk
+    issue: Optional[str]
+        The date the data was fetched, in YYYYMMDD format. Defaults to "today"
+        if not provided
+    geo_res: Optional[str]
+        Geographic resolution of the data
+    sensor: Optional[str]
+        Sensor that has been calculated (cumulative_counts vs new_counts)
+    metric: Optional[str]
+        Metric we are considering, if any.
+    logger: Optional[logging.Logger]
+        Pass a logger object here to log information about name and size of the backup file.
+
+    Returns
+    ---------
+    dates: pd.Series[datetime]
+        Series of dates for which CSV files were exported.
+    """
+    if not custom_run:
+        # Label the file with today's date (the date the data was fetched).
+        if not issue:
+            issue = datetime.today().strftime("%Y%m%d")
+
+        backup_filename = [issue, geo_res, metric, sensor]
+        backup_filename = "_".join(filter(None, backup_filename)) + ".csv.gz"
+        backup_file = join(backup_dir, backup_filename)
+
+        with gzip.open(backup_file, "wt", newline="") as f:
+            df.to_csv(f, index=False, na_rep="NA")
+
+        if logger:
+            logger.info(
+                "Backup file created",
+                backup_file=backup_file,
+                backup_size=getsize(backup_file),
+            )
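
For context, a minimal sketch of how an indicator might call the new helper. The DataFrame, directory, and argument values below are illustrative only, and the backup directory must already exist, since gzip.open does not create directories:

import pandas as pd

from delphi_utils import create_backup_csv

# Illustrative stand-in for freshly fetched source data.
df = pd.DataFrame({"geo_id": ["ca", "ny"], "val": [1.0, 2.0]})

# Required arguments only: writes <backup_dir>/<today as YYYYMMDD>.csv.gz.
create_backup_csv(df, backup_dir="./raw_data_backups", custom_run=False)

# Optional name parts are joined with "_" after filter(None, ...) drops the
# unused ones; this writes ./raw_data_backups/20240101_state_new_counts.csv.gz.
create_backup_csv(
    df,
    backup_dir="./raw_data_backups",
    custom_run=False,
    issue="20240101",
    geo_res="state",
    sensor="new_counts",
)

# On a custom run (e.g. a patch), the function writes nothing at all.
create_backup_csv(df, backup_dir="./raw_data_backups", custom_run=True)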

ansible/templates/nchs_mortality-params-prod.json.j2

Lines changed: 1 addition & 0 deletions

@@ -1,6 +1,7 @@
 {
     "common": {
         "daily_export_dir": "./daily_receiving",
+        "backup_dir": "./raw_data_backups",
         "log_filename": "/var/log/indicators/nchs_mortality.log",
         "weekly_export_dir": "/common/covidcast/receiving/nchs-mortality"
     },

nchs_mortality/delphi_nchs_mortality/pull.py

Lines changed: 19 additions & 3 deletions

@@ -1,15 +1,17 @@
 # -*- coding: utf-8 -*-
 """Functions for pulling NCHS mortality data API."""
 
+import logging
 from typing import Optional
 
 import numpy as np
 import pandas as pd
+from delphi_utils import create_backup_csv
+from delphi_utils.geomap import GeoMapper
 from sodapy import Socrata
 
-from delphi_utils.geomap import GeoMapper
+from .constants import METRICS, NEWLINE, RENAME
 
-from .constants import METRICS, RENAME, NEWLINE
 
 def standardize_columns(df):
     """Rename columns to comply with a standard set.

@@ -22,7 +24,13 @@ def standardize_columns(df):
     return df.rename(columns=dict(rename_pairs))
 
 
-def pull_nchs_mortality_data(socrata_token: str, test_file: Optional[str] = None):
+def pull_nchs_mortality_data(
+    socrata_token: str,
+    backup_dir: str,
+    custom_run: bool,
+    logger: Optional[logging.Logger] = None,
+    test_file: Optional[str] = None,
+):
     """Pull the latest NCHS Mortality data, and conforms it into a dataset.
 
     The output dataset has:

@@ -40,6 +48,10 @@ def pull_nchs_mortality_data(socrata_token: str, test_file: Optional[str] = None
     ----------
     socrata_token: str
         My App Token for pulling the NCHS mortality data
+    backup_dir: str
+        Directory to which to save raw backup data
+    custom_run: bool
+        Flag indicating if the current run is a patch. If so, don't save any data to disk
     test_file: Optional[str]
         When not null, name of file from which to read test data
 
@@ -60,6 +72,10 @@ def pull_nchs_mortality_data(socrata_token: str, test_file: Optional[str] = None
         client = Socrata("data.cdc.gov", socrata_token)
         results = client.get("r8kw-7aab", limit=10**10)
         df = pd.DataFrame.from_records(results)
+
+    create_backup_csv(df, backup_dir, custom_run=custom_run, logger=logger)
+
+    if not test_file:
         # drop "By Total" rows
         df = df[df["group"].transform(str.lower) == "by week"]
 
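
Because the backup is written right after pd.DataFrame.from_records(results) and before any filtering, reading a backup file back should reproduce the raw pull. A small sketch, with a hypothetical backup date in the filename:

import pandas as pd

# pandas infers gzip compression from the .gz suffix, so a file written by
# create_backup_csv can be read back directly; the date here is made up.
raw = pd.read_csv("./raw_data_backups/20240101.csv.gz")
print(raw.shape)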

nchs_mortality/delphi_nchs_mortality/run.py

Lines changed: 5 additions & 1 deletion

@@ -59,6 +59,8 @@ def run_module(params: Dict[str, Any]):
                                             days=date.today().weekday() + 2)
     export_start_date = export_start_date.strftime('%Y-%m-%d')
     daily_export_dir = params["common"]["daily_export_dir"]
+    backup_dir = params["common"]["backup_dir"]
+    custom_run = params["common"].get("custom_run", False)
     socrata_token = params["indicator"]["socrata_token"]
     test_file = params["indicator"].get("test_file", None)
 
@@ -70,7 +72,9 @@
         daily_arch_diff.update_cache()
 
     stats = []
-    df_pull = pull_nchs_mortality_data(socrata_token, test_file)
+    df_pull = pull_nchs_mortality_data(
+        socrata_token, backup_dir, custom_run=custom_run, test_file=test_file, logger=logger
+    )
     for metric in METRICS:
         for geo in ["state", "nation"]:
             if metric == 'percent_of_expected_deaths':

nchs_mortality/params.json.template

Lines changed: 2 additions & 1 deletion

@@ -2,7 +2,8 @@
     "common": {
         "daily_export_dir": "./daily_receiving",
         "weekly_export_dir": "./receiving",
-        "log_filename": "/var/log/indicators/nchs_mortality.log",
+        "backup_dir": "./raw_data_backups",
+        "log_filename": "./nchs_mortality.log",
         "log_exceptions": false
     },
     "indicator": {
Lines changed: 120 additions & 0 deletions

@@ -0,0 +1,120 @@
+# You should hard commit a prototype for this file, but we
+# want to avoid accidental adding of API tokens and other
+# private data parameters
+params.json
+
+# Do not commit output files
+receiving/*.csv
+
+# Remove macOS files
+.DS_Store
+
+# virtual environment
+dview/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+coverage.xml
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+.static_storage/
+.media/
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/

nchs_mortality/tests/conftest.py

Lines changed: 3 additions & 1 deletion

@@ -14,8 +14,10 @@
 
 PARAMS = {
     "common": {
+        "custom_run": True,
         "daily_export_dir": "./daily_receiving",
-        "weekly_export_dir": "./receiving"
+        "weekly_export_dir": "./receiving",
+        "backup_dir": "./raw_data_backups"
     },
     "indicator": {
         "export_start_date": "2020-04-11",
Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+*.csv
+*.gz

nchs_mortality/tests/test_pull.py

Lines changed: 15 additions & 3 deletions

@@ -1,3 +1,4 @@
+import os
 import pytest
 
 import pandas as pd

@@ -34,7 +35,7 @@ def test_standardize_columns(self):
         pd.testing.assert_frame_equal(expected, df)
 
     def test_good_file(self):
-        df = pull_nchs_mortality_data(SOCRATA_TOKEN, "test_data.csv")
+        df = pull_nchs_mortality_data(SOCRATA_TOKEN, backup_dir = "", custom_run = True, test_file = "test_data.csv")
 
         # Test columns
         assert (

@@ -90,9 +91,20 @@ def test_good_file(self):
     def test_bad_file_with_inconsistent_time_col(self):
         with pytest.raises(ValueError):
             pull_nchs_mortality_data(
-                SOCRATA_TOKEN, "bad_data_with_inconsistent_time_col.csv"
+                SOCRATA_TOKEN, backup_dir = "", custom_run = True, test_file = "bad_data_with_inconsistent_time_col.csv"
             )
 
     def test_bad_file_with_missing_cols(self):
         with pytest.raises(ValueError):
-            pull_nchs_mortality_data(SOCRATA_TOKEN, "bad_data_with_missing_cols.csv")
+            pull_nchs_mortality_data(SOCRATA_TOKEN, backup_dir = "", custom_run = True, test_file = "bad_data_with_missing_cols.csv")
+
+    def test_backup_today_data(self):
+        today = pd.Timestamp.today().strftime("%Y%m%d")
+        backup_dir = "./raw_data_backups"
+        pull_nchs_mortality_data(SOCRATA_TOKEN, backup_dir = backup_dir, custom_run = False, test_file = "test_data.csv")
+        backup_file = f"{backup_dir}/{today}.csv.gz"
+        backup_df = pd.read_csv(backup_file)
+        source_df = pd.read_csv("test_data/test_data.csv")
+        pd.testing.assert_frame_equal(source_df, backup_df)
+        if os.path.exists(backup_file):
+            os.remove(backup_file)
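
The new test_backup_today_data writes into the repo-level raw_data_backups directory and cleans up with os.remove. An alternative sketch using pytest's built-in tmp_path fixture (an assumption for illustration, not what this commit does) would avoid manual cleanup:

import pandas as pd

from delphi_nchs_mortality.pull import pull_nchs_mortality_data

SOCRATA_TOKEN = ""  # placeholder; the real test module defines this constant


def test_backup_today_data_tmp(tmp_path):
    # tmp_path is a per-test temporary directory that pytest removes for us,
    # so no manual os.remove() is needed.
    today = pd.Timestamp.today().strftime("%Y%m%d")
    pull_nchs_mortality_data(
        SOCRATA_TOKEN, backup_dir=str(tmp_path), custom_run=False,
        test_file="test_data.csv",
    )
    backup_df = pd.read_csv(tmp_path / f"{today}.csv.gz")
    source_df = pd.read_csv("test_data/test_data.csv")
    pd.testing.assert_frame_equal(source_df, backup_df)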
