Skip to content

Commit c9d7045

Browse files
committed
Hotfix acquisition: fix dtypes of missing columns
* improve test coverage for this case
1 parent f8cb065 commit c9d7045

File tree

2 files changed

+22
-32
lines changed

2 files changed

+22
-32
lines changed

src/acquisition/covidcast/csv_importer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ def validate_missing_code(row, attr_quantity, attr_name):
224224
if hasattr(row, "missing_" + attr_name):
225225
missing_entry = getattr(row, "missing_" + attr_name)
226226
try:
227-
missing_entry = int(missing_entry)
227+
missing_entry = int(float(missing_entry)) # missing codes may arrive as float-formatted strings (e.g. "0.0"), which int() cannot parse directly, so go through float first
228228
except ValueError:
229229
return None
230230
# A missing code should never contradict the quantity being present,

tests/acquisition/covidcast/test_csv_importer.py

Lines changed: 21 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def test_find_issue_specific_csv_files(self,os_isdir_mock):
5959
issuedir_match= CsvImporter.PATTERN_ISSUE_DIR.match(path_prefix.lower())
6060
issue_date_value = int(issuedir_match.group(2))
6161
self.assertTrue(CsvImporter.is_sane_day(issue_date_value))
62-
62+
6363
found = set(CsvImporter.find_issue_specific_csv_files(path_prefix, glob=mock_glob))
6464
self.assertTrue(len(found)>0)
6565

@@ -162,9 +162,9 @@ def make_row(
162162
val='1.23',
163163
se='4.56',
164164
sample_size='100.5',
165-
missing_val=Nans.NOT_MISSING,
166-
missing_se=Nans.NOT_MISSING,
167-
missing_sample_size=Nans.NOT_MISSING):
165+
missing_val=str(float(Nans.NOT_MISSING)),
166+
missing_se=str(float(Nans.NOT_MISSING)),
167+
missing_sample_size=str(float(Nans.NOT_MISSING))):
168168
row = MagicMock(
169169
geo_id=geo_id,
170170
val=val,
@@ -203,40 +203,30 @@ def make_row(
203203
(make_row(missing_val='missing_val'), 'missing_val'),
204204
(make_row(missing_se='missing_val'), 'missing_se'),
205205
(make_row(missing_sample_size='missing_val'), 'missing_sample_size'),
206-
(make_row(val='1.2', missing_val=Nans.OTHER), 'missing_val'),
207-
(make_row(se='1.2', missing_se=Nans.OTHER), 'missing_se'),
208-
(make_row(sample_size='1.2', missing_sample_size=Nans.OTHER), 'missing_sample_size')
206+
(make_row(val='1.2', missing_val=str(float(Nans.OTHER))), 'missing_val'),
207+
(make_row(se='1.2', missing_se=str(float(Nans.OTHER))), 'missing_se'),
208+
(make_row(sample_size='1.2', missing_sample_size=str(float(Nans.OTHER))), 'missing_sample_size'),
209209
]
210210

211211
for ((geo_type, row), field) in failure_cases:
212212
values, error = CsvImporter.extract_and_check_row(row, geo_type)
213213
self.assertIsNone(values)
214214
self.assertEqual(error, field)
215215

216-
# a nominal case without missing values
217-
geo_type, row = make_row()
218-
values, error = CsvImporter.extract_and_check_row(row, geo_type)
219-
220-
self.assertIsInstance(values, CsvImporter.RowValues)
221-
self.assertEqual(str(values.geo_value), row.geo_id)
222-
self.assertEqual(str(values.value), row.val)
223-
self.assertEqual(str(values.stderr), row.se)
224-
self.assertEqual(str(values.sample_size), row.sample_size)
225-
self.assertIsNone(error)
226-
227-
# a nominal case with missing values
228-
geo_type, row = make_row(
229-
se='', sample_size='NA',
230-
missing_se=Nans.OTHER, missing_sample_size=Nans.OTHER
231-
)
232-
values, error = CsvImporter.extract_and_check_row(row, geo_type)
233-
234-
self.assertIsInstance(values, CsvImporter.RowValues)
235-
self.assertEqual(str(values.geo_value), row.geo_id)
236-
self.assertEqual(str(values.value), row.val)
237-
self.assertIsNone(values.stderr)
238-
self.assertIsNone(values.sample_size)
239-
self.assertIsNone(error)
216+
success_cases = [
217+
(make_row(), CsvImporter.RowValues('vi', 1.23, 4.56, 100.5, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.NOT_MISSING)),
218+
(make_row(geo_type='county', geo_id='17000', val=np.nan, se=np.nan, sample_size=np.nan, missing_val=str(float(Nans.DELETED)), missing_se=str(float(Nans.DELETED)), missing_sample_size=str(float(Nans.DELETED))), CsvImporter.RowValues('17000', None, None, None, Nans.DELETED, Nans.DELETED, Nans.DELETED)),
219+
(make_row(se='', sample_size='NA', missing_se=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.OTHER))), CsvImporter.RowValues('vi', 1.23, None, None, Nans.NOT_MISSING, Nans.OTHER, Nans.OTHER))
220+
]
221+
222+
for ((geo_type, row), field) in success_cases:
223+
values, error = CsvImporter.extract_and_check_row(row, geo_type)
224+
self.assertIsNone(error)
225+
self.assertIsInstance(values, CsvImporter.RowValues)
226+
self.assertEqual(values.geo_value, field.geo_value)
227+
self.assertEqual(values.value, field.value)
228+
self.assertEqual(values.stderr, field.stderr)
229+
self.assertEqual(values.sample_size, field.sample_size)
240230

241231
def test_load_csv_with_invalid_header(self):
242232
"""Bail loading a CSV when the header is invalid."""

0 commit comments

Comments
 (0)