diff --git a/usafacts/delphi_usafacts/geo.py b/usafacts/delphi_usafacts/geo.py index 53542a6c4..5979a5e21 100644 --- a/usafacts/delphi_usafacts/geo.py +++ b/usafacts/delphi_usafacts/geo.py @@ -107,6 +107,13 @@ def geo_map(df: pd.DataFrame, geo_res: str, map_df: pd.DataFrame, sensor: str): df = df.append(unassigned_counties) geo_mapper = GeoMapper() df = geo_mapper.add_geocode(df, "fips", "state_id", new_col="geo_id") + + # Zero out the state FIPS population to avoid double counting. + df = df.set_index("fips") + state_fips_codes = {str(x).zfill(2) + "000" for x in range(1, 73)} + subset_state_fips_codes = set(df.index.values) & state_fips_codes + df.loc[subset_state_fips_codes, "population"] = 0 + df = df.reset_index() elif geo_res in ("msa", "hrr"): # Map "missing" secondary FIPS to those that are in our canonical set for fips, fips_list in SECONDARY_FIPS: diff --git a/usafacts/tests/test_geo.py b/usafacts/tests/test_geo.py index 33fe7dd3f..c6f52a1ad 100644 --- a/usafacts/tests/test_geo.py +++ b/usafacts/tests/test_geo.py @@ -76,11 +76,11 @@ def test_state(self): """Tests that values are correctly aggregated at the state level.""" df = pd.DataFrame( { - "fips": ["04001", "04003", "04009", "25023"], - "timestamp": ["2020-02-15", "2020-02-15", "2020-02-15", "2020-02-15"], - "new_counts": [10, 15, 2, 13], - "cumulative_counts": [100, 20, 45, 60], - "population": [100, 2100, 300, 25], + "fips": ["04001", "04003", "04009", "25023", "25000"], + "timestamp": ["2020-02-15", "2020-02-15", "2020-02-15", "2020-02-15", "2020-02-15"], + "new_counts": [10, 15, 2, 13, 0], + "cumulative_counts": [100, 20, 45, 60, 0], + "population": [100, 2100, 300, 25, 25], } )