@@ -52,18 +52,21 @@ class CsvRowValue:
5252class CsvImporter :
5353 """Finds and parses covidcast CSV files."""
5454
55+ # set of allowed resolutions (aka "geo_type")
56+ GEOGRAPHIC_RESOLUTIONS = {'county' , 'hrr' , 'msa' , 'dma' , 'state' , 'hhs' , 'nation' , 'hsa_nci' }
57+
58+ # regex alternation of the allowed geo types; longer strings are sorted first so that a shorter geo type can never match as a prefix of a longer one
59+ geo_types_pattern = "|" .join (sorted (GEOGRAPHIC_RESOLUTIONS , key = len , reverse = True ))
60+
5561 # .../source/yyyymmdd_geo_signal.csv
56- PATTERN_DAILY = re .compile (r'^.*/([^/]*)/(\d{8})_(\w+? )_(\w +)\.csv$' )
62+ PATTERN_DAILY = re .compile (r'^.*/([^/]*)/(\d{8})_(' + geo_types_pattern + r' )_(. +)\.csv$' )
5763
5864 # .../source/weekly_yyyyww_geo_signal.csv
59- PATTERN_WEEKLY = re .compile (r'^.*/([^/]*)/weekly_(\d{6})_(\w+? )_(\w +)\.csv$' )
65+ PATTERN_WEEKLY = re .compile (r'^.*/([^/]*)/weekly_(\d{6})_(' + geo_types_pattern + r' )_(. +)\.csv$' )
6066
6167 # .../issue_yyyymmdd
6268 PATTERN_ISSUE_DIR = re .compile (r'^.*/([^/]*)/issue_(\d{8})$' )
6369
64- # set of allowed resolutions (aka "geo_type")
65- GEOGRAPHIC_RESOLUTIONS = {'county' , 'hrr' , 'msa' , 'dma' , 'state' , 'hhs' , 'nation' }
66-
6770 # set of required CSV columns
6871 REQUIRED_COLUMNS = {'geo_id' , 'val' , 'se' , 'sample_size' }
6972
@@ -158,7 +161,7 @@ def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today()
158161 daily_match = CsvImporter .PATTERN_DAILY .match (path .lower ())
159162 weekly_match = CsvImporter .PATTERN_WEEKLY .match (path .lower ())
160163 if not daily_match and not weekly_match :
161- logger .warning (event = 'invalid csv path/filename' , detail = path , file = path )
164+ logger .warning (event = 'invalid csv path/filename or geo_type ' , detail = path , file = path )
162165 yield (path , None )
163166 continue
164167
@@ -186,12 +189,8 @@ def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today()
186189 issue_value = issue_epiweek_value
187190 lag_value = delta_epiweeks (time_value_week , issue_epiweek_value )
188191
189- # # extract and validate geographic resolution
192+ # extract geographic resolution
190193 geo_type = match .group (3 ).lower ()
191- if geo_type not in CsvImporter .GEOGRAPHIC_RESOLUTIONS :
192- logger .warning (event = 'invalid geo_type' , detail = geo_type , file = path )
193- yield (path , None )
194- continue
195194
196195 # extract additional values, lowercased for consistency
197196 source = match .group (1 ).lower ()
@@ -300,7 +299,7 @@ def extract_and_check_row(row: DataFrameRow, geo_type: str, filepath: Optional[s
300299 # geo_id was `None`
301300 return (None , 'geo_id' )
302301
303- if geo_type in ('hrr' , 'msa' , 'dma' , 'hhs' ):
302+ if geo_type in ('hrr' , 'msa' , 'dma' , 'hhs' , 'hsa_nci' ):
304303 # these particular ids are prone to be written as ints -- and floats
305304 try :
306305 geo_id = str (CsvImporter .floaty_int (geo_id ))
@@ -339,6 +338,12 @@ def extract_and_check_row(row: DataFrameRow, geo_type: str, filepath: Optional[s
339338 if len (geo_id ) != 2 or not 'aa' <= geo_id <= 'zz' :
340339 return (None , 'geo_id' )
341340
341+ elif geo_type == 'hsa_nci' :
342+ # valid codes should be 1-3 digit numbers, or the special code of "1022" for blank
343+ # https://seer.cancer.gov/seerstat/variables/countyattribs/hsa.html
344+ if not re .match (r'^(1022|\d{1,3})$' , geo_id ):
345+ return (None , 'geo_id' )
346+
342347 else :
343348 return (None , 'geo_type' )
344349
0 commit comments