@@ -1,16 +1,18 @@
 """Export data in the format expected by the Delphi API."""
 # -*- coding: utf-8 -*-
+import gzip
+import logging
 from datetime import datetime
-from os.path import join
+from os.path import getsize, join
 from typing import Optional
-import logging
 
-from epiweeks import Week
 import numpy as np
 import pandas as pd
+from epiweeks import Week
 
 from .nancodes import Nans
 
+
 def filter_contradicting_missing_codes(df, sensor, metric, date, logger=None):
     """Find values with contradictory missingness codes, filter them, and log."""
     columns = ["val", "se", "sample_size"]
@@ -132,3 +134,70 @@ def create_export_csv( |
     export_df = export_df.sort_values(by="geo_id")
     export_df.to_csv(export_file, index=False, na_rep="NA")
     return dates
+
+
+def create_backup_csv(
+    df: pd.DataFrame,
+    backup_dir: str,
+    custom_run: bool,
+    issue: Optional[str] = None,
+    geo_res: Optional[str] = None,
+    sensor: Optional[str] = None,
+    metric: Optional[str] = None,
+    logger: Optional[logging.Logger] = None,
+):
+    """Save data for use as a backup.
+
+    This function is meant to save raw data fetched from data sources.
+    Therefore, it avoids manipulating the data as much as possible to
+    preserve the input.
+
+    When only the required arguments are passed, data is saved to a file named
+    `<backup_dir>/<today's date as YYYYMMDD>.csv.gz`. The optional arguments
+    should be passed if the source data is fetched from different tables or
+    in batches by signal, geo, etc.; each value provided is added to the filename.
+
+    Parameters
+    ----------
+    df: pd.DataFrame
+        Columns: geo_id, timestamp, val, se, sample_size
+    backup_dir: str
+        Backup directory
+    custom_run: bool
+        Flag indicating whether the current run is a patch or another kind of
+        run for which backups aren't needed. If True, no data is saved to disk.
+    issue: Optional[str]
+        The date the data was fetched, in YYYYMMDD format. Defaults to today's
+        date if not provided.
+    geo_res: Optional[str]
+        Geographic resolution of the data.
+    sensor: Optional[str]
+        Sensor that has been calculated (cumulative_counts vs new_counts).
+    metric: Optional[str]
+        Metric we are considering, if any.
+    logger: Optional[logging.Logger]
+        Pass a logger object to log the name and size of the backup file.
+    """
+    if not custom_run:
+        # Label the file with today's date (the date the data was fetched).
+        if not issue:
+            issue = datetime.today().strftime("%Y%m%d")
+
+        backup_filename = [issue, geo_res, metric, sensor]
+        backup_filename = "_".join(filter(None, backup_filename)) + ".csv.gz"
+        backup_file = join(backup_dir, backup_filename)
+
+        with gzip.open(backup_file, "wt", newline="") as f:
+            df.to_csv(f, index=False, na_rep="NA")
+
+        if logger:
+            logger.info(
+                "Backup file created",
+                backup_file=backup_file,
+                backup_size=getsize(backup_file),
+            )
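
For reference, a minimal usage sketch (not part of this diff). It assumes a toy frame, a `backup_dir` that already exists, and the hypothetical import path `delphi_utils.export`; note that the structured `logger.info(..., backup_file=...)` call appears to expect a structlog-style logger rather than a plain stdlib `logging.Logger`, which would reject keyword fields.

```python
import pandas as pd

from delphi_utils.export import create_backup_csv  # hypothetical import path

df = pd.DataFrame(
    {
        "geo_id": ["01000", "01001"],
        "timestamp": ["2024-01-01", "2024-01-01"],
        "val": [1.5, 2.5],
        "se": [0.1, 0.2],
        "sample_size": [100, 200],
    }
)

# [issue, geo_res, metric, sensor] are joined with "_" after filter(None, ...)
# drops the unset metric, so this writes ./backups/20240102_county_new_counts.csv.gz.
create_backup_csv(
    df,
    backup_dir="./backups",  # assumed to exist already
    custom_run=False,  # False: a backup file is actually written
    issue="20240102",
    geo_res="county",
    sensor="new_counts",
)
```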