Skip to content

Commit 6d1e825

Browse files
committed
Acquisition logging:
* Repair and log files with invalid missing codes instead of rejecting them.
* Introduce dtype inference code in acquisition to see whether further refactors are possible.
1 parent cecb6c9 commit 6d1e825

File tree

2 files changed

+35
-37
lines changed

2 files changed

+35
-37
lines changed

src/acquisition/covidcast/csv_importer.py

Lines changed: 25 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ def validate_quantity(row, attr_quantity):
212212
return "Error"
213213

214214
@staticmethod
215-
def validate_missing_code(row, attr_quantity, attr_name):
215+
def validate_missing_code(row, attr_quantity, attr_name, filepath=None, logger=None):
216216
"""Take a row and validate the missing code associated with
217217
a quantity (e.g., val, se, stderr).
218218
@@ -221,27 +221,32 @@ def validate_missing_code(row, attr_quantity, attr_name):
221221
to infer missing codes except for a very simple cases; the default
222222
is to produce an error so that the issue can be fixed in indicators.
223223
"""
224-
if hasattr(row, "missing_" + attr_name):
225-
missing_entry = getattr(row, "missing_" + attr_name)
224+
if logger is None:
225+
logger = get_structured_logger('load_csv')
226+
missing_entry = getattr(row, "missing_" + attr_name, None)
227+
228+
if missing_entry is not None:
226229
try:
227230
missing_entry = int(float(missing_entry)) # convert from string to float to int
228231
except ValueError:
229-
return None
230-
# A missing code should never contradict the quantity being present,
231-
# since that will be filtered in the export_to_csv util in
232-
# covidcast-indicators; nonetheless this code is here for safety.
233-
if attr_quantity is not None and missing_entry != Nans.NOT_MISSING.value:
234-
return None
235-
elif attr_quantity is None and missing_entry == Nans.NOT_MISSING.value:
236-
return None
237-
return missing_entry
238-
else:
239-
if attr_quantity is None:
240-
return Nans.OTHER.value
232+
missing_entry = None
233+
234+
if missing_entry is None and attr_quantity is not None:
241235
return Nans.NOT_MISSING.value
236+
if missing_entry is None and attr_quantity is None:
237+
return Nans.OTHER.value
238+
239+
if missing_entry != Nans.NOT_MISSING.value and attr_quantity is not None:
240+
logger.warning(event = f"missing_{attr_name} column contradicting {attr_name} presence.", detail = (str(row)), file = filepath)
241+
return Nans.NOT_MISSING.value
242+
if missing_entry == Nans.NOT_MISSING.value and attr_quantity is None:
243+
logger.warning(event = f"missing_{attr_name} column contradicting {attr_name} presence.", detail = (str(row)), file = filepath)
244+
return Nans.OTHER.value
245+
246+
return missing_entry
242247

243248
@staticmethod
244-
def extract_and_check_row(row, geo_type):
249+
def extract_and_check_row(row, geo_type, filepath=None):
245250
"""Extract and return `RowValues` from a CSV row, with sanity checks.
246251
247252
Also returns the name of the field which failed sanity check, or None.
@@ -316,15 +321,9 @@ def extract_and_check_row(row, geo_type):
316321
return (None, 'sample_size')
317322

318323
# Validate and write missingness codes
319-
missing_value = CsvImporter.validate_missing_code(row, value, "val")
320-
if missing_value is None:
321-
return (None, 'missing_val')
322-
missing_stderr = CsvImporter.validate_missing_code(row, stderr, "se")
323-
if missing_stderr is None:
324-
return (None, 'missing_se')
325-
missing_sample_size = CsvImporter.validate_missing_code(row, sample_size, "sample_size")
326-
if missing_sample_size is None:
327-
return (None, 'missing_sample_size')
324+
missing_value = CsvImporter.validate_missing_code(row, value, "val", filepath)
325+
missing_stderr = CsvImporter.validate_missing_code(row, stderr, "se", filepath)
326+
missing_sample_size = CsvImporter.validate_missing_code(row, sample_size, "sample_size", filepath)
328327

329328
# return extracted and validated row values
330329
row_values = CsvImporter.RowValues(
@@ -353,7 +352,7 @@ def load_csv(filepath, geo_type, pandas=pandas):
353352
return
354353

355354
for row in table.itertuples(index=False):
356-
row_values, error = CsvImporter.extract_and_check_row(row, geo_type)
355+
row_values, error = CsvImporter.extract_and_check_row(row, geo_type, filepath)
357356
if error:
358357
logger.warning(event = 'invalid value for row', detail=(str(row), error), file=filepath)
359358
yield None

tests/acquisition/covidcast/test_csv_importer.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -200,12 +200,6 @@ def make_row(
200200
(make_row(val='val'), 'val'),
201201
(make_row(se='se'), 'se'),
202202
(make_row(sample_size='sample_size'), 'sample_size'),
203-
(make_row(missing_val='missing_val'), 'missing_val'),
204-
(make_row(missing_se='missing_val'), 'missing_se'),
205-
(make_row(missing_sample_size='missing_val'), 'missing_sample_size'),
206-
(make_row(val='1.2', missing_val=str(float(Nans.OTHER))), 'missing_val'),
207-
(make_row(se='1.2', missing_se=str(float(Nans.OTHER))), 'missing_se'),
208-
(make_row(sample_size='1.2', missing_sample_size=str(float(Nans.OTHER))), 'missing_sample_size'),
209203
]
210204

211205
for ((geo_type, row), field) in failure_cases:
@@ -216,7 +210,8 @@ def make_row(
216210
success_cases = [
217211
(make_row(), CsvImporter.RowValues('vi', 1.23, 4.56, 100.5, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.NOT_MISSING)),
218212
(make_row(val=None, se=np.nan, sample_size='', missing_val=str(float(Nans.DELETED)), missing_se=str(float(Nans.DELETED)), missing_sample_size=str(float(Nans.DELETED))), CsvImporter.RowValues('vi', None, None, None, Nans.DELETED, Nans.DELETED, Nans.DELETED)),
219-
(make_row(se='', sample_size='NA', missing_se=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.OTHER))), CsvImporter.RowValues('vi', 1.23, None, None, Nans.NOT_MISSING, Nans.OTHER, Nans.OTHER))
213+
(make_row(se='', sample_size='NA', missing_se=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.OTHER))), CsvImporter.RowValues('vi', 1.23, None, None, Nans.NOT_MISSING, Nans.OTHER, Nans.OTHER)),
214+
(make_row(sample_size=None, missing_val='missing_val', missing_se=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.NOT_MISSING))), CsvImporter.RowValues('vi', 1.23, 4.56, None, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.OTHER)),
220215
]
221216

222217
for ((geo_type, row), field) in success_cases:
@@ -281,9 +276,7 @@ def test_load_csv_with_valid_header(self):
281276

282277
self.assertIsNone(rows[3])
283278

284-
# now with missing values! the last missing_sample_size
285-
# contains an error code while data is available, which
286-
# should give an error
279+
# now with missing values!
287280
data = {
288281
'geo_id': ['ca', 'tx', 'fl', 'ak'],
289282
'val': [np.nan, '1.2', '1.3', '1.4'],
@@ -328,4 +321,10 @@ def test_load_csv_with_valid_header(self):
328321
self.assertEqual(rows[2].missing_stderr, Nans.NOT_MISSING)
329322
self.assertEqual(rows[2].missing_sample_size, Nans.REGION_EXCEPTION)
330323

331-
self.assertIsNone(rows[3])
324+
self.assertEqual(rows[3].geo_value, 'ak')
325+
self.assertEqual(rows[3].value, 1.4)
326+
self.assertEqual(rows[3].stderr, 2.4)
327+
self.assertEqual(rows[3].sample_size, 304)
328+
self.assertEqual(rows[3].missing_value, Nans.NOT_MISSING)
329+
self.assertEqual(rows[3].missing_stderr, Nans.NOT_MISSING)
330+
self.assertEqual(rows[3].missing_sample_size, Nans.NOT_MISSING)

0 commit comments

Comments
 (0)