From c9d7045031a724150e35a916402eed5925a624dd Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 4 Oct 2021 12:47:33 -0700 Subject: [PATCH 1/4] Hotfix acquisition: fix dtypes of missing columns * improve test coverage for this case --- src/acquisition/covidcast/csv_importer.py | 2 +- .../covidcast/test_csv_importer.py | 52 ++++++++----------- 2 files changed, 22 insertions(+), 32 deletions(-) diff --git a/src/acquisition/covidcast/csv_importer.py b/src/acquisition/covidcast/csv_importer.py index f55327d85..156474290 100644 --- a/src/acquisition/covidcast/csv_importer.py +++ b/src/acquisition/covidcast/csv_importer.py @@ -224,7 +224,7 @@ def validate_missing_code(row, attr_quantity, attr_name): if hasattr(row, "missing_" + attr_name): missing_entry = getattr(row, "missing_" + attr_name) try: - missing_entry = int(missing_entry) + missing_entry = int(float(missing_entry)) # convert from string to float to int except ValueError: return None # A missing code should never contradict the quantity being present, diff --git a/tests/acquisition/covidcast/test_csv_importer.py b/tests/acquisition/covidcast/test_csv_importer.py index b82dec1ce..6ecf292e6 100644 --- a/tests/acquisition/covidcast/test_csv_importer.py +++ b/tests/acquisition/covidcast/test_csv_importer.py @@ -59,7 +59,7 @@ def test_find_issue_specific_csv_files(self,os_isdir_mock): issuedir_match= CsvImporter.PATTERN_ISSUE_DIR.match(path_prefix.lower()) issue_date_value = int(issuedir_match.group(2)) self.assertTrue(CsvImporter.is_sane_day(issue_date_value)) - + found = set(CsvImporter.find_issue_specific_csv_files(path_prefix, glob=mock_glob)) self.assertTrue(len(found)>0) @@ -162,9 +162,9 @@ def make_row( val='1.23', se='4.56', sample_size='100.5', - missing_val=Nans.NOT_MISSING, - missing_se=Nans.NOT_MISSING, - missing_sample_size=Nans.NOT_MISSING): + missing_val=str(float(Nans.NOT_MISSING)), + missing_se=str(float(Nans.NOT_MISSING)), + missing_sample_size=str(float(Nans.NOT_MISSING))): row = MagicMock( geo_id=geo_id, val=val, @@ -203,9 +203,9 @@ def make_row( (make_row(missing_val='missing_val'), 'missing_val'), (make_row(missing_se='missing_val'), 'missing_se'), (make_row(missing_sample_size='missing_val'), 'missing_sample_size'), - (make_row(val='1.2', missing_val=Nans.OTHER), 'missing_val'), - (make_row(se='1.2', missing_se=Nans.OTHER), 'missing_se'), - (make_row(sample_size='1.2', missing_sample_size=Nans.OTHER), 'missing_sample_size') + (make_row(val='1.2', missing_val=str(float(Nans.OTHER))), 'missing_val'), + (make_row(se='1.2', missing_se=str(float(Nans.OTHER))), 'missing_se'), + (make_row(sample_size='1.2', missing_sample_size=str(float(Nans.OTHER))), 'missing_sample_size'), ] for ((geo_type, row), field) in failure_cases: @@ -213,30 +213,20 @@ def make_row( self.assertIsNone(values) self.assertEqual(error, field) - # a nominal case without missing values - geo_type, row = make_row() - values, error = CsvImporter.extract_and_check_row(row, geo_type) - - self.assertIsInstance(values, CsvImporter.RowValues) - self.assertEqual(str(values.geo_value), row.geo_id) - self.assertEqual(str(values.value), row.val) - self.assertEqual(str(values.stderr), row.se) - self.assertEqual(str(values.sample_size), row.sample_size) - self.assertIsNone(error) - - # a nominal case with missing values - geo_type, row = make_row( - se='', sample_size='NA', - missing_se=Nans.OTHER, missing_sample_size=Nans.OTHER - ) - values, error = CsvImporter.extract_and_check_row(row, geo_type) - - self.assertIsInstance(values, CsvImporter.RowValues) - self.assertEqual(str(values.geo_value), row.geo_id) - self.assertEqual(str(values.value), row.val) - self.assertIsNone(values.stderr) - self.assertIsNone(values.sample_size) - self.assertIsNone(error) + success_cases = [ + (make_row(), CsvImporter.RowValues('vi', 1.23, 4.56, 100.5, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.NOT_MISSING)), + (make_row(geo_type='county', geo_id='17000', val=np.nan, se=np.nan, sample_size=np.nan, missing_val=str(float(Nans.DELETED)), missing_se=str(float(Nans.DELETED)), missing_sample_size=str(float(Nans.DELETED))), CsvImporter.RowValues('17000', None, None, None, Nans.DELETED, Nans.DELETED, Nans.DELETED)), + (make_row(se='', sample_size='NA', missing_se=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.OTHER))), CsvImporter.RowValues('vi', 1.23, None, None, Nans.NOT_MISSING, Nans.OTHER, Nans.OTHER)) + ] + + for ((geo_type, row), field) in success_cases: + values, error = CsvImporter.extract_and_check_row(row, geo_type) + self.assertIsNone(error) + self.assertIsInstance(values, CsvImporter.RowValues) + self.assertEqual(values.geo_value, field.geo_value) + self.assertEqual(values.value, field.value) + self.assertEqual(values.stderr, field.stderr) + self.assertEqual(values.sample_size, field.sample_size) def test_load_csv_with_invalid_header(self): """Bail loading a CSV when the header is invalid.""" From cecb6c950e2e2f189d4b901557d9c54833eb85e6 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 4 Oct 2021 12:52:28 -0700 Subject: [PATCH 2/4] Hotfix acquisition: improve test coverage --- tests/acquisition/covidcast/test_csv_importer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/acquisition/covidcast/test_csv_importer.py b/tests/acquisition/covidcast/test_csv_importer.py index 6ecf292e6..4b87cf68b 100644 --- a/tests/acquisition/covidcast/test_csv_importer.py +++ b/tests/acquisition/covidcast/test_csv_importer.py @@ -215,7 +215,7 @@ def make_row( success_cases = [ (make_row(), CsvImporter.RowValues('vi', 1.23, 4.56, 100.5, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.NOT_MISSING)), - (make_row(geo_type='county', geo_id='17000', val=np.nan, se=np.nan, sample_size=np.nan, missing_val=str(float(Nans.DELETED)), missing_se=str(float(Nans.DELETED)), missing_sample_size=str(float(Nans.DELETED))), CsvImporter.RowValues('17000', None, None, None, Nans.DELETED, Nans.DELETED, Nans.DELETED)), + (make_row(val=None, se=np.nan, sample_size='', missing_val=str(float(Nans.DELETED)), missing_se=str(float(Nans.DELETED)), missing_sample_size=str(float(Nans.DELETED))), CsvImporter.RowValues('vi', None, None, None, Nans.DELETED, Nans.DELETED, Nans.DELETED)), (make_row(se='', sample_size='NA', missing_se=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.OTHER))), CsvImporter.RowValues('vi', 1.23, None, None, Nans.NOT_MISSING, Nans.OTHER, Nans.OTHER)) ] From d474a39a22c3060ebc64ac81cacf01ad455873cf Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Tue, 5 Oct 2021 10:19:29 -0700 Subject: [PATCH 3/4] Update tests/acquisition/covidcast/test_csv_importer.py Whitespace for readability Co-authored-by: Katie Mazaitis --- tests/acquisition/covidcast/test_csv_importer.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/acquisition/covidcast/test_csv_importer.py b/tests/acquisition/covidcast/test_csv_importer.py index 4b87cf68b..c25e66fe4 100644 --- a/tests/acquisition/covidcast/test_csv_importer.py +++ b/tests/acquisition/covidcast/test_csv_importer.py @@ -214,9 +214,12 @@ def make_row( self.assertEqual(error, field) success_cases = [ - (make_row(), CsvImporter.RowValues('vi', 1.23, 4.56, 100.5, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.NOT_MISSING)), - (make_row(val=None, se=np.nan, sample_size='', missing_val=str(float(Nans.DELETED)), missing_se=str(float(Nans.DELETED)), missing_sample_size=str(float(Nans.DELETED))), CsvImporter.RowValues('vi', None, None, None, Nans.DELETED, Nans.DELETED, Nans.DELETED)), - (make_row(se='', sample_size='NA', missing_se=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.OTHER))), CsvImporter.RowValues('vi', 1.23, None, None, Nans.NOT_MISSING, Nans.OTHER, Nans.OTHER)) + (make_row(), + CsvImporter.RowValues('vi', 1.23, 4.56, 100.5, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.NOT_MISSING)), + (make_row(val=None, se=np.nan, sample_size='', missing_val=str(float(Nans.DELETED)), missing_se=str(float(Nans.DELETED)), missing_sample_size=str(float(Nans.DELETED))), + CsvImporter.RowValues('vi', None, None, None, Nans.DELETED, Nans.DELETED, Nans.DELETED)), + (make_row(se='', sample_size='NA', missing_se=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.OTHER))), + CsvImporter.RowValues('vi', 1.23, None, None, Nans.NOT_MISSING, Nans.OTHER, Nans.OTHER)) ] for ((geo_type, row), field) in success_cases: From 8b1232d64a42577d2701e2ea55a9bcc95add16e5 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Tue, 5 Oct 2021 10:53:35 -0700 Subject: [PATCH 4/4] Update tests/acquisition/covidcast/test_csv_importer.py Preserve old test cases Co-authored-by: Katie Mazaitis --- tests/acquisition/covidcast/test_csv_importer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/acquisition/covidcast/test_csv_importer.py b/tests/acquisition/covidcast/test_csv_importer.py index c25e66fe4..8048772cf 100644 --- a/tests/acquisition/covidcast/test_csv_importer.py +++ b/tests/acquisition/covidcast/test_csv_importer.py @@ -203,6 +203,9 @@ def make_row( (make_row(missing_val='missing_val'), 'missing_val'), (make_row(missing_se='missing_val'), 'missing_se'), (make_row(missing_sample_size='missing_val'), 'missing_sample_size'), + (make_row(val='1.2', missing_val=Nans.OTHER), 'missing_val'), + (make_row(se='1.2', missing_se=Nans.OTHER), 'missing_se'), + (make_row(sample_size='1.2', missing_sample_size=Nans.OTHER), 'missing_sample_size'), (make_row(val='1.2', missing_val=str(float(Nans.OTHER))), 'missing_val'), (make_row(se='1.2', missing_se=str(float(Nans.OTHER))), 'missing_se'), (make_row(sample_size='1.2', missing_sample_size=str(float(Nans.OTHER))), 'missing_sample_size'),