@@ -113,6 +113,7 @@ def __init__(self, params):
113113 """
114114 # Get user settings from params or if not provided, set default.
115115 self .data_source = params ['data_source' ]
116+ self .validator_static_file_dir = params .get ('validator_static_file_dir' , '../validator/static' )
116117
117118 # Date/time settings
118119 self .span_length = timedelta (days = params ['span_length' ])
@@ -244,9 +245,33 @@ def check_df_format(self, df_to_test, nameformat):
244245
245246 self .increment_total_checks ()
246247
247- def check_bad_geo_id (self , df_to_test , nameformat , geo_type ):
def check_bad_geo_id_value(self, df_to_test, filename, geo_type):
    """
    Check for bad geo_id values, by comparing to a list of known values (drawn from historical data).

    The known values are read from the file `<geo_type>_geo.csv` located in
    `self.validator_static_file_dir`. Two checks are performed and counted:
    an error is recorded for geo_ids whose lowercased form is not in the known
    list, and a warning is recorded for geo_ids containing uppercase characters.

    Arguments:
        - df_to_test: pandas dataframe of CSV source data containing the geo_id column to check
        - filename: string identifying the file being checked; stored in the identifier
          of any error or warning that is recorded
        - geo_type: string from CSV name specifying geo type (state, county, msa, etc.) of data

    Returns:
        - None
    """
    # NOTE(review): pd.read_csv raises if no reference file exists for this
    # geo_type -- confirm callers only pass geo types that have one.
    file_path = join(self.validator_static_file_dir, geo_type + '_geo.csv')
    valid_geo_df = pd.read_csv(file_path, dtype={'geo_id': str})

    # Build the lookup once as a set: membership tests against the raw array
    # would rescan every known geo_id for every row being checked.
    valid_geos = set(valid_geo_df['geo_id'])

    # Pair each raw value with its string form so that non-string geo_ids
    # (e.g. NaN or integers) are compared and reported rather than raising
    # AttributeError on `.lower()`. The raw value is what gets reported.
    geo_ids = [(geo, str(geo)) for geo in df_to_test['geo_id']]

    unexpected_geos = [geo for geo, text in geo_ids if text.lower() not in valid_geos]
    if unexpected_geos:
        self.raised_errors.append(ValidationError(
            ("check_bad_geo_id_value", filename),
            unexpected_geos, "Unrecognized geo_ids (not in historical data)"))
    self.increment_total_checks()

    upper_case_geos = [geo for geo, text in geo_ids if text.lower() != text]
    if upper_case_geos:
        self.raised_warnings.append(ValidationError(
            ("check_geo_id_lowercase", filename),
            upper_case_geos, "geo_id contains uppercase characters. Lowercase is preferred."))
    self.increment_total_checks()
271+
272+ def check_bad_geo_id_format (self , df_to_test , nameformat , geo_type ):
273+ """
274+ Check validity of geo_type and format of geo_ids, according to regex pattern.
250275
251276 Arguments:
252277 - df_to_test: pandas dataframe of CSV source data
@@ -720,8 +745,9 @@ def validate(self, export_dir):
720745 data_df = load_csv (join (export_dir , filename ))
721746
722747 self .check_df_format (data_df , filename )
723- self .check_bad_geo_id (
748+ self .check_bad_geo_id_format (
724749 data_df , filename , match .groupdict ()['geo_type' ])
750+ self .check_bad_geo_id_value (data_df , filename , match .groupdict ()['geo_type' ])
725751 self .check_bad_val (data_df , filename , match .groupdict ()['signal' ])
726752 self .check_bad_se (data_df , filename )
727753 self .check_bad_sample_size (data_df , filename )
0 commit comments