diff --git a/.flake8 b/.flake8 new file mode 100644 index 000000000..8c0e5fb9f --- /dev/null +++ b/.flake8 @@ -0,0 +1,5 @@ +[flake8] +max-line-length = 158 +exclude = + .env + ./.venv diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 000000000..298d8ead0 --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,4 @@ +[settings] +multi_line_output=3 +include_trailing_comma=True +line_length=158 diff --git a/scripts/report_missing_covidcast_meta.py b/scripts/report_missing_covidcast_meta.py index bd1779070..7600a3cb2 100644 --- a/scripts/report_missing_covidcast_meta.py +++ b/scripts/report_missing_covidcast_meta.py @@ -1,12 +1,14 @@ -from typing import Dict, List, Tuple, Union -from requests import get import sys -import pandas as pd from pathlib import Path +from typing import Dict, List, Tuple + +import pandas as pd +from requests import get base_dir = Path(__file__).parent.parent base_url = 'https://delphi.cmu.edu/epidata' + def is_known_missing(source: str, signal: str) -> bool: if '7dav_cumulative' in signal: return True @@ -14,6 +16,7 @@ def is_known_missing(source: str, signal: str) -> bool: return True return False + def compute_missing_signals() -> List[Tuple[Tuple[str, str], Dict]]: defined_meta = get(f"{base_url}/covidcast/meta").json() defined_signals: Dict[Tuple[str, str], Dict] = {} @@ -27,7 +30,7 @@ def compute_missing_signals() -> List[Tuple[Tuple[str, str], Dict]]: for entry in computed_meta: computed_signals.setdefault((entry['data_source'], entry['signal']), []).append(entry) - missing_signals: List[Tuple[Tuple[str, str], Dict]] = [] + missing_signals: List[Tuple[Tuple[str, str], Dict]] = [] for key, infos in computed_signals.items(): defined_info = defined_signals.get(key) @@ -38,9 +41,9 @@ def compute_missing_signals() -> List[Tuple[Tuple[str, str], Dict]]: def gen_row(source: str, signal: str, info: Dict) -> Dict: - is_weighted = signal.startswith('smoothed_w') and not (signal.startswith('smoothed_wa') or signal.startswith('smoothed_we') or signal.startswith('smoothed_wi') or signal.startswith('smoothed_wo') or signal.startswith('smoothed_wu')) + is_weighted = signal.startswith('smoothed_w') and not signal.startswith(('smoothed_wa', 'smoothed_we', 'smoothed_wi', 'smoothed_wo', 'smoothed_wu')) base_name = signal.replace('smoothed_w', 'smoothed_') if is_weighted else signal - bool_str = lambda x: 'TRUE' if x else 'FALSE' + bool_str = lambda x: 'TRUE' if x else 'FALSE' # noqa return { 'Source Subdivision': source, @@ -66,6 +69,7 @@ def gen_row(source: str, signal: str, info: Dict) -> Dict: 'Link': 'TODO' } + def generate_missing_info_hint(missing_signals: List[Tuple[Tuple[str, str], Dict]]) -> None: missing = pd.DataFrame.from_records([gen_row(s[0], s[1], info) for s, info in missing_signals]) @@ -76,12 +80,12 @@ def generate_missing_info_hint(missing_signals: List[Tuple[Tuple[str, str], Dict guessed: pd.DataFrame = pd.concat([current, missing]) guessed.to_csv(base_dir / 'missing_db_signals.csv', index=False) + missing = compute_missing_signals() if missing: print(f'found {len(missing)} missing signals') generate_missing_info_hint(missing) sys.exit(1) else: - print(f'all signals found') + print('all signals found') sys.exit(0) - diff --git a/src/acquisition/afhsb/afhsb_csv.py b/src/acquisition/afhsb/afhsb_csv.py index b839c4053..6d958f521 100644 --- a/src/acquisition/afhsb/afhsb_csv.py +++ b/src/acquisition/afhsb/afhsb_csv.py @@ -1,6 +1,6 @@ ''' afhsb_csv.py creates CSV files filled_00to13.csv, filled_13to17.csv and simple_DMISID_FY2018.csv -which will be later used to 
create MYSQL data tables. +which will be later used to create MYSQL data tables. Several intermediate files will be created, including: 00to13.pickle 13to17.pickle 00to13.csv 13to17.csv @@ -13,11 +13,10 @@ import csv import os - import pickle -import sas7bdat -import epiweeks as epi +import epiweeks as epi +import sas7bdat DATAPATH = '/home/automation/afhsb_data' SOURCE_DIR = DATAPATH @@ -25,168 +24,180 @@ INVALID_DMISIDS = set() + def get_flu_cat(dx): - # flu1 (influenza) - if len(dx) == 0: - return None - dx = dx.capitalize() - if dx.isnumeric(): - for prefix in ["487", "488"]: - if dx.startswith(prefix): - return 1 - for i in range(0, 7): - prefix = str(480 + i) - if dx.startswith(prefix): - return 2 - for i in range(0, 7): - prefix = str(460 + i) - if dx.startswith(prefix): - return 3 - for prefix in ["07999", "3829", "7806", "7862"]: - if dx.startswith(prefix): - return 3 - elif (dx[0].isalpha() and dx[1:].isnumeric()): - for prefix in ["J09", "J10", "J11"]: - if dx.startswith(prefix): - return 1 - for i in range(12, 19): - prefix = "J{}".format(i) - if dx.startswith(prefix): - return 2 - for i in range(0, 7): - prefix = "J0{}".format(i) - if dx.startswith(prefix): - return 3 - for i in range(20, 23): - prefix = "J{}".format(i) - if dx.startswith(prefix): - return 3 - for prefix in ["J40", "R05", "H669", "R509", "B9789"]: - if dx.startswith(prefix): - return 3 - else: - return None + # flu1 (influenza) + if len(dx) == 0: + return None + dx = dx.capitalize() + if dx.isnumeric(): + for prefix in ["487", "488"]: + if dx.startswith(prefix): + return 1 + for i in range(0, 7): + prefix = str(480 + i) + if dx.startswith(prefix): + return 2 + for i in range(0, 7): + prefix = str(460 + i) + if dx.startswith(prefix): + return 3 + for prefix in ["07999", "3829", "7806", "7862"]: + if dx.startswith(prefix): + return 3 + elif (dx[0].isalpha() and dx[1:].isnumeric()): + for prefix in ["J09", "J10", "J11"]: + if dx.startswith(prefix): + return 1 + for i in range(12, 19): + prefix = "J{}".format(i) + if dx.startswith(prefix): + return 2 + for i in range(0, 7): + prefix = "J0{}".format(i) + if dx.startswith(prefix): + return 3 + for i in range(20, 23): + prefix = "J{}".format(i) + if dx.startswith(prefix): + return 3 + for prefix in ["J40", "R05", "H669", "R509", "B9789"]: + if dx.startswith(prefix): + return 3 + else: + return None + def aggregate_data(sourcefile, targetfile): - reader = sas7bdat.SAS7BDAT(os.path.join(SOURCE_DIR, sourcefile), skip_header=True) - # map column names to column indices - col_2_idx = {column.name.decode('utf-8'): column.col_id for column in reader.columns} - - def get_field(row, column): - return row[col_2_idx[column]] - - def row2flu(row): - for i in range(1, 9): - dx = get_field(row, "dx{}".format(i)) - flu_cat = get_flu_cat(dx) - if flu_cat is not None: - return flu_cat - return 0 - - def row2epiweek(row): - date = get_field(row, 'd_event') - year, month, day = date.year, date.month, date.day - week_tuple = epi.Week.fromdate(year, month, day).weektuple() - year, week_num = week_tuple[0], week_tuple[1] - return year, week_num - - results_dict = {} - for _, row in enumerate(reader): - # if (r >= 1000000): break - if get_field(row, 'type') != "Outpt": - continue - year, week_num = row2epiweek(row) - dmisid = get_field(row, 'DMISID') - flu_cat = row2flu(row) - - key_list = [year, week_num, dmisid, flu_cat] - curr_dict = results_dict - for i, key in enumerate(key_list): - if i == len(key_list) - 1: - if key not in curr_dict: - curr_dict[key] = 0 - curr_dict[key] += 1 - 
else: - if key not in curr_dict: - curr_dict[key] = {} - curr_dict = curr_dict[key] - - results_path = os.path.join(TARGET_DIR, targetfile) - with open(results_path, 'wb') as f: - pickle.dump(results_dict, f, pickle.HIGHEST_PROTOCOL) - - -################# Functions for geographical information #################### + reader = sas7bdat.SAS7BDAT(os.path.join(SOURCE_DIR, sourcefile), skip_header=True) + # map column names to column indices + col_2_idx = {column.name.decode('utf-8'): column.col_id for column in reader.columns} + + def get_field(row, column): + return row[col_2_idx[column]] + + def row2flu(row): + for i in range(1, 9): + dx = get_field(row, "dx{}".format(i)) + flu_cat = get_flu_cat(dx) + if flu_cat is not None: + return flu_cat + return 0 + + def row2epiweek(row): + date = get_field(row, 'd_event') + year, month, day = date.year, date.month, date.day + week_tuple = epi.Week.fromdate(year, month, day).weektuple() + year, week_num = week_tuple[0], week_tuple[1] + return year, week_num + + results_dict = {} + for _, row in enumerate(reader): + # if (r >= 1000000): break + if get_field(row, 'type') != "Outpt": + continue + year, week_num = row2epiweek(row) + dmisid = get_field(row, 'DMISID') + flu_cat = row2flu(row) + + key_list = [year, week_num, dmisid, flu_cat] + curr_dict = results_dict + for i, key in enumerate(key_list): + if i == len(key_list) - 1: + if key not in curr_dict: + curr_dict[key] = 0 + curr_dict[key] += 1 + else: + if key not in curr_dict: + curr_dict[key] = {} + curr_dict = curr_dict[key] + + results_path = os.path.join(TARGET_DIR, targetfile) + with open(results_path, 'wb') as f: + pickle.dump(results_dict, f, pickle.HIGHEST_PROTOCOL) + + +"""################# Functions for geographical information ####################""" + def get_country_mapping(): - filename = "country_codes.csv" - mapping = dict() - with open(os.path.join(TARGET_DIR, filename), "r") as csvfile: - reader = csv.DictReader(csvfile) - for row in reader: - print(row.keys()) - alpha2 = row['alpha-2'] - alpha3 = row['alpha-3'] - mapping[alpha2] = alpha3 - - return mapping + filename = "country_codes.csv" + mapping = dict() + with open(os.path.join(TARGET_DIR, filename), "r") as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + print(row.keys()) + alpha2 = row['alpha-2'] + alpha3 = row['alpha-3'] + mapping[alpha2] = alpha3 + + return mapping + def format_dmisid_csv(filename, target_name): - src_path = os.path.join(TARGET_DIR, "{}.csv".format(filename)) - dst_path = os.path.join(TARGET_DIR, target_name) - - src_csv = open(src_path, "r", encoding='utf-8-sig') - reader = csv.DictReader(src_csv) - - dst_csv = open(dst_path, "w") - fieldnames = ['dmisid', 'country', 'state', 'zip5'] - writer = csv.DictWriter(dst_csv, fieldnames=fieldnames) - writer.writeheader() - - country_mapping = get_country_mapping() - - for row in reader: - country2 = row['Facility ISO Country Code'] - if country2 == "": - country3 = "" - elif country2 not in country_mapping: - for key in row.keys(): - print(key, row[key]) - continue - else: - country3 = country_mapping[country2] - new_row = {'dmisid': row['DMIS ID'], - 'country': country3, - 'state': row['Facility State Code'], - 'zip5': row['Facility 5-Digit ZIP Code']} - writer.writerow(new_row) + src_path = os.path.join(TARGET_DIR, "{}.csv".format(filename)) + dst_path = os.path.join(TARGET_DIR, target_name) + + src_csv = open(src_path, "r", encoding='utf-8-sig') + reader = csv.DictReader(src_csv) + + dst_csv = open(dst_path, "w") + fieldnames = ['dmisid', 
'country', 'state', 'zip5'] + writer = csv.DictWriter(dst_csv, fieldnames=fieldnames) + writer.writeheader() + + country_mapping = get_country_mapping() + + for row in reader: + country2 = row['Facility ISO Country Code'] + if country2 == "": + country3 = "" + elif country2 not in country_mapping: + for key in row.keys(): + print(key, row[key]) + continue + else: + country3 = country_mapping[country2] + new_row = { + 'dmisid': row['DMIS ID'], + 'country': country3, + 'state': row['Facility State Code'], + 'zip5': row['Facility 5-Digit ZIP Code'] + } + writer.writerow(new_row) + def dmisid(): - filename = 'DMISID_FY2018' - target_name = "simple_DMISID_FY2018.csv" - format_dmisid_csv(filename, target_name) - - -cen2states = {'cen1': {'CT', 'ME', 'MA', 'NH', 'RI', 'VT'}, - 'cen2': {'NJ', 'NY', 'PA'}, - 'cen3': {'IL', 'IN', 'MI', 'OH', 'WI'}, - 'cen4': {'IA', 'KS', 'MN', 'MO', 'NE', 'ND', 'SD'}, - 'cen5': {'DE', 'DC', 'FL', 'GA', 'MD', 'NC', 'SC', 'VA', 'WV'}, - 'cen6': {'AL', 'KY', 'MS', 'TN'}, - 'cen7': {'AR', 'LA', 'OK', 'TX'}, - 'cen8': {'AZ', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT', 'WY'}, - 'cen9': {'AK', 'CA', 'HI', 'OR', 'WA'}} - -hhs2states = {'hhs1': {'VT', 'CT', 'ME', 'MA', 'NH', 'RI'}, - 'hhs2': {'NJ', 'NY'}, - 'hhs3': {'DE', 'DC', 'MD', 'PA', 'VA', 'WV'}, - 'hhs4': {'AL', 'FL', 'GA', 'KY', 'MS', 'NC', 'TN', 'SC'}, - 'hhs5': {'IL', 'IN', 'MI', 'MN', 'OH', 'WI'}, - 'hhs6': {'AR', 'LA', 'NM', 'OK', 'TX'}, - 'hhs7': {'IA', 'KS', 'MO', 'NE'}, - 'hhs8': {'CO', 'MT', 'ND', 'SD', 'UT', 'WY'}, - 'hhs9': {'AZ', 'CA', 'HI', 'NV'}, - 'hhs10': {'AK', 'ID', 'OR', 'WA'}} + filename = 'DMISID_FY2018' + target_name = "simple_DMISID_FY2018.csv" + format_dmisid_csv(filename, target_name) + + +cen2states = { + 'cen1': {'CT', 'ME', 'MA', 'NH', 'RI', 'VT'}, + 'cen2': {'NJ', 'NY', 'PA'}, + 'cen3': {'IL', 'IN', 'MI', 'OH', 'WI'}, + 'cen4': {'IA', 'KS', 'MN', 'MO', 'NE', 'ND', 'SD'}, + 'cen5': {'DE', 'DC', 'FL', 'GA', 'MD', 'NC', 'SC', 'VA', 'WV'}, + 'cen6': {'AL', 'KY', 'MS', 'TN'}, + 'cen7': {'AR', 'LA', 'OK', 'TX'}, + 'cen8': {'AZ', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT', 'WY'}, + 'cen9': {'AK', 'CA', 'HI', 'OR', 'WA'} +} + +hhs2states = { + 'hhs1': {'VT', 'CT', 'ME', 'MA', 'NH', 'RI'}, + 'hhs2': {'NJ', 'NY'}, + 'hhs3': {'DE', 'DC', 'MD', 'PA', 'VA', 'WV'}, + 'hhs4': {'AL', 'FL', 'GA', 'KY', 'MS', 'NC', 'TN', 'SC'}, + 'hhs5': {'IL', 'IN', 'MI', 'MN', 'OH', 'WI'}, + 'hhs6': {'AR', 'LA', 'NM', 'OK', 'TX'}, + 'hhs7': {'IA', 'KS', 'MO', 'NE'}, + 'hhs8': {'CO', 'MT', 'ND', 'SD', 'UT', 'WY'}, + 'hhs9': {'AZ', 'CA', 'HI', 'NV'}, + 'hhs10': {'AK', 'ID', 'OR', 'WA'} +} + def state2region(D): results = dict() @@ -197,155 +208,173 @@ def state2region(D): results[state] = region return results + def state2region_csv(): - to_hhs = state2region(hhs2states) - to_cen = state2region(cen2states) - states = to_hhs.keys() - target_name = "state2region.csv" - fieldnames = ['state', 'hhs', 'cen'] - with open(target_name, "w") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() - for state in states: - content = {"state": state, "hhs": to_hhs[state], "cen": to_cen[state]} - writer.writerow(content) - -################# Functions for geographical information #################### - -######################### Functions for AFHSB data ########################## + to_hhs = state2region(hhs2states) + to_cen = state2region(cen2states) + states = to_hhs.keys() + target_name = "state2region.csv" + fieldnames = ['state', 'hhs', 'cen'] + with open(target_name, "w") as csvfile: + writer = 
csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for state in states: + content = {"state": state, "hhs": to_hhs[state], "cen": to_cen[state]} + writer.writerow(content) + + +"""################# Functions for geographical information ####################""" + +"""######################### Functions for AFHSB data ##########################""" + def write_afhsb_csv(period): - flu_mapping = {0: "ili-flu3", 1: "flu1", 2:"flu2-flu1", 3: "flu3-flu2"} - results_dict = pickle.load(open(os.path.join(TARGET_DIR, "{}.pickle".format(period)), 'rb')) - - fieldnames = ["id", "epiweek", "dmisid", "flu_type", "visit_sum"] - with open(os.path.join(TARGET_DIR, "{}.csv".format(period)), 'w') as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() - - i = 0 - for year in sorted(results_dict.keys()): - year_dict = results_dict[year] - for week in sorted(year_dict.keys()): - week_dict = year_dict[week] - for dmisid in sorted(week_dict.keys()): - dmisid_dict = week_dict[dmisid] - for flu in sorted(dmisid_dict.keys()): - visit_sum = dmisid_dict[flu] - i += 1 - epiweek = int("{}{:02d}".format(year, week)) - flu_type = flu_mapping[flu] - - row = {"epiweek": epiweek, "dmisid": None if (not dmisid.isnumeric()) else dmisid, - "flu_type": flu_type, "visit_sum": visit_sum, "id": i} - writer.writerow(row) - if i % 100000 == 0: - print(row) + flu_mapping = {0: "ili-flu3", 1: "flu1", 2: "flu2-flu1", 3: "flu3-flu2"} + results_dict = pickle.load(open(os.path.join(TARGET_DIR, "{}.pickle".format(period)), 'rb')) + + fieldnames = ["id", "epiweek", "dmisid", "flu_type", "visit_sum"] + with open(os.path.join(TARGET_DIR, "{}.csv".format(period)), 'w') as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + + i = 0 + for year in sorted(results_dict.keys()): + year_dict = results_dict[year] + for week in sorted(year_dict.keys()): + week_dict = year_dict[week] + for dmisid in sorted(week_dict.keys()): + dmisid_dict = week_dict[dmisid] + for flu in sorted(dmisid_dict.keys()): + visit_sum = dmisid_dict[flu] + i += 1 + epiweek = int("{}{:02d}".format(year, week)) + flu_type = flu_mapping[flu] + + row = { + "epiweek": epiweek, + "dmisid": None if (not dmisid.isnumeric()) else dmisid, + "flu_type": flu_type, + "visit_sum": visit_sum, + "id": i + } + writer.writerow(row) + if i % 100000 == 0: + print(row) + def dmisid_start_time_from_file(filename): - starttime_record = dict() - with open(filename, 'r') as csvfile: - reader = csv.DictReader(csvfile) - for row in reader: - dmisid = row['dmisid'] - epiweek = int(row['epiweek']) - if dmisid not in starttime_record: - starttime_record[dmisid] = epiweek - else: - starttime_record[dmisid] = min(epiweek, starttime_record[dmisid]) - return starttime_record + starttime_record = dict() + with open(filename, 'r') as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + dmisid = row['dmisid'] + epiweek = int(row['epiweek']) + if dmisid not in starttime_record: + starttime_record[dmisid] = epiweek + else: + starttime_record[dmisid] = min(epiweek, starttime_record[dmisid]) + return starttime_record + def dmisid_start_time(): - record1 = dmisid_start_time_from_file(os.path.join(TARGET_DIR, "00to13.csv")) - record2 = dmisid_start_time_from_file(os.path.join(TARGET_DIR, "13to17.csv")) - record = record1 - for dmisid, epiweek in record2.items(): - if dmisid in record: - record[dmisid] = min(record[dmisid], epiweek) - else: - record[dmisid] = epiweek - return record + record1 = 
dmisid_start_time_from_file(os.path.join(TARGET_DIR, "00to13.csv")) + record2 = dmisid_start_time_from_file(os.path.join(TARGET_DIR, "13to17.csv")) + record = record1 + for dmisid, epiweek in record2.items(): + if dmisid in record: + record[dmisid] = min(record[dmisid], epiweek) + else: + record[dmisid] = epiweek + return record + def fillin_zero_to_csv(period, dmisid_start_record): - src_path = os.path.join(TARGET_DIR, "{}.csv".format(period)) - dst_path = os.path.join(TARGET_DIR, "filled_{}.csv".format(period)) - - # Load data into a dictionary - src_csv = open(src_path, "r") - reader = csv.DictReader(src_csv) - - results_dict = dict() # epiweek -> dmisid -> flu_type: visit_sum - for i, row in enumerate(reader): - epiweek = int(row['epiweek']) - dmisid = row['dmisid'] - flu_type = row['flu_type'] - visit_sum = row['visit_sum'] - if epiweek not in results_dict: - results_dict[epiweek] = dict() - week_dict = results_dict[epiweek] - if dmisid not in week_dict: - week_dict[dmisid] = dict() - dmisid_dict = week_dict[dmisid] - dmisid_dict[flu_type] = visit_sum - - # Fill in zero count records - dmisid_group = dmisid_start_record.keys() - flutype_group = ["ili-flu3", "flu1", "flu2-flu1", "flu3-flu2"] - - for epiweek in results_dict.keys(): - week_dict = results_dict[epiweek] - for dmisid in dmisid_group: - start_week = dmisid_start_record[dmisid] - if start_week > epiweek: - continue - - if dmisid not in week_dict: - week_dict[dmisid] = dict() - - dmisid_dict = week_dict[dmisid] - for flutype in flutype_group: - if flutype not in dmisid_dict: - dmisid_dict[flutype] = 0 - - # Write to csv files - dst_csv = open(dst_path, "w") - fieldnames = ["id", "epiweek", "dmisid", "flu_type", "visit_sum"] - writer = csv.DictWriter(dst_csv, fieldnames=fieldnames) - writer.writeheader() - - i = 1 - for epiweek in results_dict: - for dmisid in results_dict[epiweek]: - for flutype in results_dict[epiweek][dmisid]: - visit_sum = results_dict[epiweek][dmisid][flutype] - row = {"id": i, "epiweek": epiweek, "dmisid": dmisid, - "flu_type": flutype, "visit_sum": visit_sum} - writer.writerow(row) - if i % 100000 == 0: - print(row) - i += 1 - print("Wrote {} rows".format(i)) - -######################### Functions for AFHSB data ########################## + src_path = os.path.join(TARGET_DIR, "{}.csv".format(period)) + dst_path = os.path.join(TARGET_DIR, "filled_{}.csv".format(period)) + + # Load data into a dictionary + src_csv = open(src_path, "r") + reader = csv.DictReader(src_csv) + + results_dict = dict() # epiweek -> dmisid -> flu_type: visit_sum + for i, row in enumerate(reader): + epiweek = int(row['epiweek']) + dmisid = row['dmisid'] + flu_type = row['flu_type'] + visit_sum = row['visit_sum'] + if epiweek not in results_dict: + results_dict[epiweek] = dict() + week_dict = results_dict[epiweek] + if dmisid not in week_dict: + week_dict[dmisid] = dict() + dmisid_dict = week_dict[dmisid] + dmisid_dict[flu_type] = visit_sum + + # Fill in zero count records + dmisid_group = dmisid_start_record.keys() + flutype_group = ["ili-flu3", "flu1", "flu2-flu1", "flu3-flu2"] + + for epiweek in results_dict.keys(): + week_dict = results_dict[epiweek] + for dmisid in dmisid_group: + start_week = dmisid_start_record[dmisid] + if start_week > epiweek: + continue + + if dmisid not in week_dict: + week_dict[dmisid] = dict() + + dmisid_dict = week_dict[dmisid] + for flutype in flutype_group: + if flutype not in dmisid_dict: + dmisid_dict[flutype] = 0 + + # Write to csv files + dst_csv = open(dst_path, "w") + fieldnames = ["id", 
"epiweek", "dmisid", "flu_type", "visit_sum"] + writer = csv.DictWriter(dst_csv, fieldnames=fieldnames) + writer.writeheader() + + i = 1 + for epiweek in results_dict: + for dmisid in results_dict[epiweek]: + for flutype in results_dict[epiweek][dmisid]: + visit_sum = results_dict[epiweek][dmisid][flutype] + row = { + "id": i, + "epiweek": epiweek, + "dmisid": dmisid, + "flu_type": flutype, + "visit_sum": visit_sum + } + writer.writerow(row) + if i % 100000 == 0: + print(row) + i += 1 + print("Wrote {} rows".format(i)) + + +"""######################### Functions for AFHSB data ##########################""" + def main(): - # Build tables containing geographical information - state2region_csv() - dmisid() + # Build tables containing geographical information + state2region_csv() + dmisid() - # Aggregate raw data into pickle files - aggregate_data("ili_1_2000_5_2013_new.sas7bdat", "00to13.pickle") - aggregate_data("ili_1_2013_11_2017_new.sas7bdat", "13to17.pickle") + # Aggregate raw data into pickle files + aggregate_data("ili_1_2000_5_2013_new.sas7bdat", "00to13.pickle") + aggregate_data("ili_1_2013_11_2017_new.sas7bdat", "13to17.pickle") # write pickle content to csv files - write_afhsb_csv("00to13") - write_afhsb_csv("13to17") + write_afhsb_csv("00to13") + write_afhsb_csv("13to17") # Fill in zero count records - dmisid_start_record = dmisid_start_time() - fillin_zero_to_csv("00to13", dmisid_start_record) - fillin_zero_to_csv("13to17", dmisid_start_record) + dmisid_start_record = dmisid_start_time() + fillin_zero_to_csv("00to13", dmisid_start_record) + fillin_zero_to_csv("13to17", dmisid_start_record) if __name__ == '__main__': - main() + main() diff --git a/src/acquisition/afhsb/afhsb_sql.py b/src/acquisition/afhsb/afhsb_sql.py index 278f3fc38..12c77cce4 100644 --- a/src/acquisition/afhsb/afhsb_sql.py +++ b/src/acquisition/afhsb/afhsb_sql.py @@ -1,11 +1,7 @@ -# standard library import os -# third party -import mysql.connector as connector - -# first party import delphi.operations.secrets as secrets +import mysql.connector as connector def init_dmisid_table(sourcefile): @@ -18,21 +14,21 @@ def init_dmisid_table(sourcefile): `country` CHAR(3) NULL, `state` CHAR(2) NULL ); - '''.format(table_name) + '''.format(table_name) populate_table_cmd = ''' LOAD DATA INFILE '{}' INTO TABLE {} - FIELDS TERMINATED BY ',' + FIELDS TERMINATED BY ',' ENCLOSED BY '"' LINES TERMINATED BY '\r\n' IGNORE 1 ROWS (@dmisid, @country, @state, @zip5) - SET + SET dmisid = @dmisid, country = nullif(@country, ''), state = nullif(@state, '') ; - '''.format(sourcefile, table_name) + '''.format(sourcefile, table_name) try: cursor = cnx.cursor() cursor.execute(create_table_cmd) @@ -41,6 +37,7 @@ def init_dmisid_table(sourcefile): finally: cnx.close() + def init_region_table(sourcefile): (u, p) = secrets.db.epi cnx = connector.connect(user=u, passwd=p, database="epidata") @@ -55,13 +52,13 @@ def init_region_table(sourcefile): populate_table_cmd = ''' LOAD DATA INFILE '{}' INTO TABLE {} - FIELDS TERMINATED BY ',' + FIELDS TERMINATED BY ',' ENCLOSED BY '"' LINES TERMINATED BY '\r\n' IGNORE 1 ROWS (@state, @hhs, @cen) SET state=@state, hhs=@hhs, cen=@cen; - '''.format(sourcefile, table_name) + '''.format(sourcefile, table_name) try: cursor = cnx.cursor() cursor.execute(create_table_cmd) @@ -82,28 +79,28 @@ def init_raw_data(table_name, sourcefile): `dmisid` CHAR(4) NULL, `flu_type` CHAR(9) NOT NULL, `visit_sum` INT(11) NOT NULL, - + KEY `epiweek` (`epiweek`), KEY `dmisid` (`dmisid`), KEY `flu_type` (`flu_type`) ); - 
'''.format(table_name) + '''.format(table_name) populate_table_cmd = ''' LOAD DATA INFILE '{}' INTO TABLE {} - FIELDS TERMINATED BY ',' + FIELDS TERMINATED BY ',' ENCLOSED BY '"' LINES TERMINATED BY '\r\n' IGNORE 1 ROWS (@id, @epiweek, @dmisid, @flu, @visits) - SET + SET id = @id, epiweek = @epiweek, dmisid = nullif(@dmisid, 'ZZZZ'), flu_type = @flu, visit_sum = @visits ; - '''.format(sourcefile, table_name) + '''.format(sourcefile, table_name) try: cursor = cnx.cursor() cursor.execute(create_table_cmd) @@ -112,6 +109,7 @@ def init_raw_data(table_name, sourcefile): finally: cnx.close() + def agg_by_state(src_table, dest_table): print("Aggregating records by states...") (u, p) = secrets.db.epi @@ -120,8 +118,8 @@ def agg_by_state(src_table, dest_table): CREATE TABLE {} SELECT a.epiweek, a.flu_type, d.state, d.country, sum(a.visit_sum) visit_sum FROM {} a - LEFT JOIN dmisid_table d - ON a.dmisid = d.dmisid + LEFT JOIN dmisid_table d + ON a.dmisid = d.dmisid GROUP BY a.epiweek, a.flu_type, d.state, d.country; '''.format(dest_table, src_table) try: @@ -131,6 +129,7 @@ def agg_by_state(src_table, dest_table): finally: cnx.close() + def agg_by_region(src_table, dest_table): print("Aggregating records by regions...") (u, p) = secrets.db.epi @@ -150,6 +149,7 @@ def agg_by_region(src_table, dest_table): finally: cnx.close() + def init_all_tables(datapath): init_dmisid_table(os.path.join(datapath, "simple_DMISID_FY2018.csv")) init_region_table(os.path.join(datapath, "state2region.csv")) @@ -164,6 +164,7 @@ def init_all_tables(datapath): agg_by_state(raw_table_name, state_table_name) agg_by_region(state_table_name, region_table_name) + def dangerously_drop_all_afhsb_tables(): (u, p) = secrets.db.epi cnx = connector.connect(user=u, passwd=p, database="epidata") @@ -179,10 +180,11 @@ def dangerously_drop_all_afhsb_tables(): `state2region_table`, `dmisid_table`; ''') - cnx.commit() # (might do nothing; each DROP commits itself anyway) + cnx.commit() # (might do nothing; each DROP commits itself anyway) finally: cnx.close() + def run_cmd(cmd): (u, p) = secrets.db.epi cnx = connector.connect(user=u, passwd=p, database="epidata") diff --git a/src/acquisition/afhsb/afhsb_update.py b/src/acquisition/afhsb/afhsb_update.py index c5a8635c8..7ca241287 100644 --- a/src/acquisition/afhsb/afhsb_update.py +++ b/src/acquisition/afhsb/afhsb_update.py @@ -1,18 +1,25 @@ # standard library import argparse -import tempfile import os -import stat import shutil +import stat +import tempfile # first party from . 
import afhsb_sql DEFAULT_DATAPATH = '/home/automation/afhsb_data' + def main(): parser = argparse.ArgumentParser() - parser.add_argument('--datapath', action='store', type=str, default=DEFAULT_DATAPATH, help='filepath to directory containing csv files to input into database') + parser.add_argument( + '--datapath', + action='store', + type=str, + default=DEFAULT_DATAPATH, + help='filepath to directory containing csv files to input into database' + ) args = parser.parse_args() # MariaDB appears to refuse to LOAD DATA INFILE except on files under # /var/lib/mysql (which seems dedicated to its own files) or /tmp; create a diff --git a/src/acquisition/cdcp/cdc_dropbox_receiver.py b/src/acquisition/cdcp/cdc_dropbox_receiver.py index eb0d97f2a..d95858920 100644 --- a/src/acquisition/cdcp/cdc_dropbox_receiver.py +++ b/src/acquisition/cdcp/cdc_dropbox_receiver.py @@ -16,141 +16,136 @@ - cdc_extract.py """ -# standard library import datetime from zipfile import ZIP_DEFLATED, ZipFile -# third party +import delphi.operations.secrets as secrets import dropbox import mysql.connector -# first party -import delphi.operations.secrets as secrets - - # location constants DROPBOX_BASE_DIR = '/cdc_page_stats' DELPHI_BASE_DIR = '/common/cdc_stage' def get_timestamp_string(): - """ - Return the current local date and time as a string. + """ + Return the current local date and time as a string. - The format is "%Y%m%d_%H%M%S". - """ - return datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + The format is "%Y%m%d_%H%M%S". + """ + return datetime.datetime.now().strftime('%Y%m%d_%H%M%S') def trigger_further_processing(): - """Add CDCP processing scripts to the Automation run queue.""" + """Add CDCP processing scripts to the Automation run queue.""" - # connect - u, p = secrets.db.auto - cnx = mysql.connector.connect(user=u, password=p, database='automation') - cur = cnx.cursor() + # connect + u, p = secrets.db.auto + cnx = mysql.connector.connect(user=u, password=p, database='automation') + cur = cnx.cursor() - # add step "Process CDCP Data" to queue - cur.execute('CALL automation.RunStep(46)') + # add step "Process CDCP Data" to queue + cur.execute('CALL automation.RunStep(46)') - # disconnect - cur.close() - cnx.commit() - cnx.close() + # disconnect + cur.close() + cnx.commit() + cnx.close() def fetch_data(): - """ - Check for new files on dropbox, download them, zip them, cleanup dropbox, and - trigger further processing of new data. - """ - - # initialize dropbox api - dbx = dropbox.Dropbox(secrets.cdcp.dropbox_token) - - # look for new CDC data files - print('checking dropbox:%s' % DROPBOX_BASE_DIR) - save_list = [] - for entry in dbx.files_list_folder(DROPBOX_BASE_DIR).entries: - name = entry.name - if name.endswith('.csv') or name.endswith('.zip'): - print(' download "%s"' % name) - save_list.append(name) - else: - print(' skip "%s"' % name) - - # determine if there's anything to be done - if len(save_list) == 0: - print('did not find any new data files') - return - - # download new files, saving them inside of a new zip file - timestamp = get_timestamp_string() - zip_path = '%s/dropbox_%s.zip' % (DELPHI_BASE_DIR, timestamp) - print('downloading into delphi:%s' % zip_path) - with ZipFile(zip_path, 'w', ZIP_DEFLATED) as zf: + """ + Check for new files on dropbox, download them, zip them, cleanup dropbox, and + trigger further processing of new data. 
+ """ + + # initialize dropbox api + dbx = dropbox.Dropbox(secrets.cdcp.dropbox_token) + + # look for new CDC data files + print('checking dropbox:%s' % DROPBOX_BASE_DIR) + save_list = [] + for entry in dbx.files_list_folder(DROPBOX_BASE_DIR).entries: + name = entry.name + if name.endswith('.csv') or name.endswith('.zip'): + print(' download "%s"' % name) + save_list.append(name) + else: + print(' skip "%s"' % name) + + # determine if there's anything to be done + if len(save_list) == 0: + print('did not find any new data files') + return + + # download new files, saving them inside of a new zip file + timestamp = get_timestamp_string() + zip_path = '%s/dropbox_%s.zip' % (DELPHI_BASE_DIR, timestamp) + print('downloading into delphi:%s' % zip_path) + with ZipFile(zip_path, 'w', ZIP_DEFLATED) as zf: + for name in save_list: + # location of the file on dropbox + dropbox_path = '%s/%s' % (DROPBOX_BASE_DIR, name) + print(' %s' % dropbox_path) + + # start the download + meta, resp = dbx.files_download(dropbox_path) + + # check status and length + if resp.status_code != 200: + raise Exception(['resp.status_code', resp.status_code]) + dropbox_len = meta.size + print(' need %d bytes...' % dropbox_len) + content_len = int(resp.headers.get('Content-Length', -1)) + if dropbox_len != content_len: + info = ['dropbox_len', dropbox_len, 'content_len', content_len] + raise Exception(info) + + # finish the download, holding the data in this variable + filedata = resp.content + + # check the length again + payload_len = len(filedata) + print(' downloaded') + if dropbox_len != payload_len: + info = ['dropbox_len', dropbox_len, 'payload_len', payload_len] + raise Exception(info) + + # add the downloaded file to the zip file + zf.writestr(name, filedata) + print(' added') + + # At this point, all the data is stored and awaiting further processing on + # the delphi server. + print('saved all new data in %s' % zip_path) + + # on dropbox, archive downloaded files so they won't be downloaded again + archive_dir = 'archived_reports/processed_%s' % timestamp + print('archiving files...') for name in save_list: - # location of the file on dropbox - dropbox_path = '%s/%s' % (DROPBOX_BASE_DIR, name) - print(' %s' % dropbox_path) - - # start the download - meta, resp = dbx.files_download(dropbox_path) - - # check status and length - if resp.status_code != 200: - raise Exception(['resp.status_code', resp.status_code]) - dropbox_len = meta.size - print(' need %d bytes...' % dropbox_len) - content_len = int(resp.headers.get('Content-Length', -1)) - if dropbox_len != content_len: - info = ['dropbox_len', dropbox_len, 'content_len', content_len] - raise Exception(info) - - # finish the download, holding the data in this variable - filedata = resp.content - - # check the length again - payload_len = len(filedata) - print(' downloaded') - if dropbox_len != payload_len: - info = ['dropbox_len', dropbox_len, 'payload_len', payload_len] - raise Exception(info) - - # add the downloaded file to the zip file - zf.writestr(name, filedata) - print(' added') - - # At this point, all the data is stored and awaiting further processing on - # the delphi server. 
- print('saved all new data in %s' % zip_path) - - # on dropbox, archive downloaded files so they won't be downloaded again - archive_dir = 'archived_reports/processed_%s' % timestamp - print('archiving files...') - for name in save_list: - # source and destination - dropbox_src = '%s/%s' % (DROPBOX_BASE_DIR, name) - dropbox_dst = '%s/%s/%s' % (DROPBOX_BASE_DIR, archive_dir, name) - print(' "%s" -> "%s"' % (dropbox_src, dropbox_dst)) - - # move the file - meta = dbx.files_move(dropbox_src, dropbox_dst) - - # sanity check - if archive_dir not in meta.path_lower: - raise Exception('failed to move "%s"' % name) - - # finally, trigger the usual processing flow - print('triggering processing flow') - trigger_further_processing() - print('done') + # source and destination + dropbox_src = '%s/%s' % (DROPBOX_BASE_DIR, name) + dropbox_dst = '%s/%s/%s' % (DROPBOX_BASE_DIR, archive_dir, name) + print(' "%s" -> "%s"' % (dropbox_src, dropbox_dst)) + + # move the file + meta = dbx.files_move(dropbox_src, dropbox_dst) + + # sanity check + if archive_dir not in meta.path_lower: + raise Exception('failed to move "%s"' % name) + + # finally, trigger the usual processing flow + print('triggering processing flow') + trigger_further_processing() + print('done') def main(): - # fetch new data - fetch_data() + # fetch new data + fetch_data() if __name__ == '__main__': - main() + main() diff --git a/src/acquisition/cdcp/cdc_extract.py b/src/acquisition/cdcp/cdc_extract.py index 83ed08d5b..ac5b704f3 100644 --- a/src/acquisition/cdcp/cdc_extract.py +++ b/src/acquisition/cdcp/cdc_extract.py @@ -61,163 +61,161 @@ + initial version """ -# standard library + import argparse import sys -# third party -import mysql.connector - -# first party import delphi.operations.secrets as secrets import delphi.utils.epiweek as flu +import mysql.connector + from . 
import cdc_upload def get_num_hits(cur, epiweek, state, page): - sql = ''' - SELECT - sum(c.`num`) `num` - FROM - `cdc` c - JOIN - `cdc_meta` m - ON - m.`date` = c.`date` AND m.`state` = c.`state` - WHERE - m.`epiweek` = %s AND c.`state` = %s AND c.`page` LIKE %s - ''' - num = None - cur.execute(sql, (epiweek, state, page)) - for (num,) in cur: - pass - if num is None: - return 0 - return num + sql = ''' + SELECT + sum(c.`num`) `num` + FROM + `cdc` c + JOIN + `cdc_meta` m + ON + m.`date` = c.`date` AND m.`state` = c.`state` + WHERE + m.`epiweek` = %s AND c.`state` = %s AND c.`page` LIKE %s + ''' + num = None + cur.execute(sql, (epiweek, state, page)) + for (num,) in cur: + pass + if num is None: + return 0 + return num def get_total_hits(cur, epiweek, state): - sql = ''' - SELECT - sum(m.`total`) `total` - FROM - `cdc_meta` m - WHERE - m.`epiweek` = %s AND m.`state` = %s - ''' - total = None - cur.execute(sql, (epiweek, state)) - for (total,) in cur: - pass - if total is None: - raise Exception('missing data for %d-%s' % (epiweek, state)) - return total + sql = ''' + SELECT + sum(m.`total`) `total` + FROM + `cdc_meta` m + WHERE + m.`epiweek` = %s AND m.`state` = %s + ''' + total = None + cur.execute(sql, (epiweek, state)) + for (total,) in cur: + pass + if total is None: + raise Exception('missing data for %d-%s' % (epiweek, state)) + return total def store_result(cur, epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total): - sql = ''' - INSERT INTO - `cdc_extract` (`epiweek`, `state`, `num1`, `num2`, `num3`, `num4`, `num5`, `num6`, `num7`, `num8`, `total`) - VALUES - (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `num1` = %s, - `num2` = %s, - `num3` = %s, - `num4` = %s, - `num5` = %s, - `num6` = %s, - `num7` = %s, - `num8` = %s, - `total` = %s - ''' - values = [num1, num2, num3, num4, num5, num6, num7, num8, total] - args = tuple([epiweek, state] + values + values) - cur.execute(sql, args) + sql = ''' + INSERT INTO + `cdc_extract` (`epiweek`, `state`, `num1`, `num2`, `num3`, `num4`, `num5`, `num6`, `num7`, `num8`, `total`) + VALUES + (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `num1` = %s, + `num2` = %s, + `num3` = %s, + `num4` = %s, + `num5` = %s, + `num6` = %s, + `num7` = %s, + `num8` = %s, + `total` = %s + ''' + values = [num1, num2, num3, num4, num5, num6, num7, num8, total] + args = tuple([epiweek, state] + values + values) + cur.execute(sql, args) def extract(first_week=None, last_week=None, test_mode=False): - # page title templates - pages = [ - '%What You Should Know for the % Influenza Season%', - '%What To Do If You Get Sick%', - '%Flu Symptoms & Severity%', - '%How Flu Spreads%', - '%What You Should Know About Flu Antiviral Drugs%', - '%Weekly US Map%', - '%Basics%', - '%Flu Activity & Surveillance%', - ] - - # location information - states = sorted(cdc_upload.STATES.values()) - - # connect - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() - - # weeks to update - if first_week is None: - cur.execute('SELECT max(`epiweek`) FROM `cdc_extract`') - for (first_week,) in cur: - pass - if last_week is None: - cur.execute('SELECT max(`epiweek`) FROM `cdc_meta`') - for (last_week,) in cur: - pass - print('extracting %d--%d' % (first_week, last_week)) - - # update each epiweek - for epiweek in flu.range_epiweeks(first_week, last_week, inclusive=True): - # update each state - for state in states: - try: - num1 = get_num_hits(cur, epiweek, state, 
pages[0]) - num2 = get_num_hits(cur, epiweek, state, pages[1]) - num3 = get_num_hits(cur, epiweek, state, pages[2]) - num4 = get_num_hits(cur, epiweek, state, pages[3]) - num5 = get_num_hits(cur, epiweek, state, pages[4]) - num6 = get_num_hits(cur, epiweek, state, pages[5]) - num7 = get_num_hits(cur, epiweek, state, pages[6]) - num8 = get_num_hits(cur, epiweek, state, pages[7]) - total = get_total_hits(cur, epiweek, state) - store_result(cur, epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total) - print(' %d-%s: %d %d %d %d %d %d %d %d (%d)' % (epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total)) - except Exception as ex: - print(' %d-%s: failed' % (epiweek, state), ex) - #raise ex - sys.stdout.flush() - - # disconnect - cur.close() - if not test_mode: - cnx.commit() - cnx.close() + # page title templates + pages = [ + '%What You Should Know for the % Influenza Season%', + '%What To Do If You Get Sick%', + '%Flu Symptoms & Severity%', + '%How Flu Spreads%', + '%What You Should Know About Flu Antiviral Drugs%', + '%Weekly US Map%', + '%Basics%', + '%Flu Activity & Surveillance%', + ] + + # location information + states = sorted(cdc_upload.STATES.values()) + + # connect + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database='epidata') + cur = cnx.cursor() + + # weeks to update + if first_week is None: + cur.execute('SELECT max(`epiweek`) FROM `cdc_extract`') + for (first_week,) in cur: + pass + if last_week is None: + cur.execute('SELECT max(`epiweek`) FROM `cdc_meta`') + for (last_week,) in cur: + pass + print('extracting %d--%d' % (first_week, last_week)) + + # update each epiweek + for epiweek in flu.range_epiweeks(first_week, last_week, inclusive=True): + # update each state + for state in states: + try: + num1 = get_num_hits(cur, epiweek, state, pages[0]) + num2 = get_num_hits(cur, epiweek, state, pages[1]) + num3 = get_num_hits(cur, epiweek, state, pages[2]) + num4 = get_num_hits(cur, epiweek, state, pages[3]) + num5 = get_num_hits(cur, epiweek, state, pages[4]) + num6 = get_num_hits(cur, epiweek, state, pages[5]) + num7 = get_num_hits(cur, epiweek, state, pages[6]) + num8 = get_num_hits(cur, epiweek, state, pages[7]) + total = get_total_hits(cur, epiweek, state) + store_result(cur, epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total) + print(' %d-%s: %d %d %d %d %d %d %d %d (%d)' % (epiweek, state, num1, num2, num3, num4, num5, num6, num7, num8, total)) + except Exception as ex: + print(' %d-%s: failed' % (epiweek, state), ex) + # raise ex + sys.stdout.flush() + + # disconnect + cur.close() + if not test_mode: + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('--first', '-f', default=None, type=int, help='first epiweek override') - parser.add_argument('--last', '-l', default=None, type=int, help='last epiweek override') - parser.add_argument('--epiweek', '-w', default=None, type=int, help='epiweek override') - parser.add_argument('--test', '-t', default=False, action='store_true', help='dry run only') - args = parser.parse_args() - - # sanity check - first, last, week = args.first, args.last, args.epiweek - for ew in [first, last, week]: - if ew is not None: - flu.check_epiweek(ew) - if first is not None and last is not None and first > last: - raise Exception('epiweeks in the wrong order') - if week is not None: - first = last = week - - # extract the page hits for all states on the specified weeks - extract(first, last, args.test) + 
# args and usage + parser = argparse.ArgumentParser() + parser.add_argument('--first', '-f', default=None, type=int, help='first epiweek override') + parser.add_argument('--last', '-l', default=None, type=int, help='last epiweek override') + parser.add_argument('--epiweek', '-w', default=None, type=int, help='epiweek override') + parser.add_argument('--test', '-t', default=False, action='store_true', help='dry run only') + args = parser.parse_args() + + # sanity check + first, last, week = args.first, args.last, args.epiweek + for ew in [first, last, week]: + if ew is not None: + flu.check_epiweek(ew) + if first is not None and last is not None and first > last: + raise Exception('epiweeks in the wrong order') + if week is not None: + first = last = week + + # extract the page hits for all states on the specified weeks + extract(first, last, args.test) if __name__ == '__main__': - main() + main() diff --git a/src/acquisition/cdcp/cdc_upload.py b/src/acquisition/cdcp/cdc_upload.py index c9c206dfa..b86105a0d 100644 --- a/src/acquisition/cdcp/cdc_upload.py +++ b/src/acquisition/cdcp/cdc_upload.py @@ -69,209 +69,206 @@ + initial version """ -# standard library + import argparse import csv -from datetime import datetime import glob import io import os import shutil +from datetime import datetime from zipfile import ZipFile -# third party -import mysql.connector - -# first party import delphi.operations.secrets as secrets - +import mysql.connector STATES = { - 'Alabama': 'AL', - 'Alaska': 'AK', - 'Arizona': 'AZ', - 'Arkansas': 'AR', - 'California': 'CA', - 'Colorado': 'CO', - 'Connecticut': 'CT', - 'Delaware': 'DE', - 'District of Columbia': 'DC', - 'Florida': 'FL', - 'Georgia': 'GA', - 'Hawaii': 'HI', - 'Idaho': 'ID', - 'Illinois': 'IL', - 'Indiana': 'IN', - 'Iowa': 'IA', - 'Kansas': 'KS', - 'Kentucky': 'KY', - 'Louisiana': 'LA', - 'Maine': 'ME', - 'Maryland': 'MD', - 'Massachusetts': 'MA', - 'Michigan': 'MI', - 'Minnesota': 'MN', - 'Mississippi': 'MS', - 'Missouri': 'MO', - 'Montana': 'MT', - 'Nebraska': 'NE', - 'Nevada': 'NV', - 'New Hampshire': 'NH', - 'New Jersey': 'NJ', - 'New Mexico': 'NM', - 'New York': 'NY', - 'North Carolina': 'NC', - 'North Dakota': 'ND', - 'Ohio': 'OH', - 'Oklahoma': 'OK', - 'Oregon': 'OR', - 'Pennsylvania': 'PA', - 'Rhode Island': 'RI', - 'South Carolina': 'SC', - 'South Dakota': 'SD', - 'Tennessee': 'TN', - 'Texas': 'TX', - 'Utah': 'UT', - 'Vermont': 'VT', - 'Virginia': 'VA', - 'Washington': 'WA', - 'West Virginia': 'WV', - 'Wisconsin': 'WI', - 'Wyoming': 'WY', - #'Puerto Rico': 'PR', - #'Virgin Islands': 'VI', - #'Guam': 'GU', + 'Alabama': 'AL', + 'Alaska': 'AK', + 'Arizona': 'AZ', + 'Arkansas': 'AR', + 'California': 'CA', + 'Colorado': 'CO', + 'Connecticut': 'CT', + 'Delaware': 'DE', + 'District of Columbia': 'DC', + 'Florida': 'FL', + 'Georgia': 'GA', + 'Hawaii': 'HI', + 'Idaho': 'ID', + 'Illinois': 'IL', + 'Indiana': 'IN', + 'Iowa': 'IA', + 'Kansas': 'KS', + 'Kentucky': 'KY', + 'Louisiana': 'LA', + 'Maine': 'ME', + 'Maryland': 'MD', + 'Massachusetts': 'MA', + 'Michigan': 'MI', + 'Minnesota': 'MN', + 'Mississippi': 'MS', + 'Missouri': 'MO', + 'Montana': 'MT', + 'Nebraska': 'NE', + 'Nevada': 'NV', + 'New Hampshire': 'NH', + 'New Jersey': 'NJ', + 'New Mexico': 'NM', + 'New York': 'NY', + 'North Carolina': 'NC', + 'North Dakota': 'ND', + 'Ohio': 'OH', + 'Oklahoma': 'OK', + 'Oregon': 'OR', + 'Pennsylvania': 'PA', + 'Rhode Island': 'RI', + 'South Carolina': 'SC', + 'South Dakota': 'SD', + 'Tennessee': 'TN', + 'Texas': 'TX', + 'Utah': 'UT', + 'Vermont': 'VT', + 
'Virginia': 'VA', + 'Washington': 'WA', + 'West Virginia': 'WV', + 'Wisconsin': 'WI', + 'Wyoming': 'WY', + # 'Puerto Rico': 'PR', + # 'Virgin Islands': 'VI', + # 'Guam': 'GU', } sql_cdc = ''' - INSERT INTO - `cdc` (`date`, `page`, `state`, `num`) - VALUES - (%s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `num` = %s + INSERT INTO + `cdc` (`date`, `page`, `state`, `num`) + VALUES + (%s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `num` = %s ''' sql_cdc_meta = ''' - INSERT INTO - `cdc_meta` (`date`, `epiweek`, `state`, `total`) - VALUES - (%s, yearweek(%s, 6), %s, %s) - ON DUPLICATE KEY UPDATE - `total` = %s + INSERT INTO + `cdc_meta` (`date`, `epiweek`, `state`, `total`) + VALUES + (%s, yearweek(%s, 6), %s, %s) + ON DUPLICATE KEY UPDATE + `total` = %s ''' def upload(test_mode): - # connect - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() - - # insert (or update) table `cdc` - def insert_cdc(date, page, state, num): - cur.execute(sql_cdc, (date, page, state, num, num)) - - # insert (or update) table `cdc_meta` - def insert_cdc_meta(date, state, total): - cur.execute(sql_cdc_meta, (date, date, state, total, total)) - - # loop over rows until the header row is found - def find_header(reader): - for row in reader: - if len(row) > 0 and row[0] == 'Date': - return True - return False - - # parse csv files for `cdc` and `cdc_meta` - def parse_csv(meta): - def handler(reader): - if not find_header(reader): - raise Exception('header not found') - count = 0 - cols = 3 if meta else 4 - for row in reader: - if len(row) != cols: - continue - if meta: - (a, c, d) = row - else: - (a, b, c, d) = row - c = c[:-16] - if c not in STATES: - continue - a = datetime.strptime(a, '%b %d, %Y').strftime('%Y-%m-%d') - c = STATES[c] - d = int(d) - if meta: - insert_cdc_meta(a, c, d) + """Connect""" + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database='epidata') + cur = cnx.cursor() + + def insert_cdc(date, page, state, num): + """Insert (or update) table `cdc`""" + cur.execute(sql_cdc, (date, page, state, num, num)) + + def insert_cdc_meta(date, state, total): + """Insert (or update) table `cdc_meta`""" + cur.execute(sql_cdc_meta, (date, date, state, total, total)) + + def find_header(reader): + """Loop over rows until the header row is found""" + for row in reader: + if len(row) > 0 and row[0] == 'Date': + return True + return False + + def parse_csv(meta): + """Parse csv files for `cdc` and `cdc_meta`""" + + def handler(reader): + if not find_header(reader): + raise Exception('header not found') + count = 0 + cols = 3 if meta else 4 + for row in reader: + if len(row) != cols: + continue + if meta: + (a, c, d) = row + else: + (a, b, c, d) = row + c = c[:-16] + if c not in STATES: + continue + a = datetime.strptime(a, '%b %d, %Y').strftime('%Y-%m-%d') + c = STATES[c] + d = int(d) + if meta: + insert_cdc_meta(a, c, d) + else: + insert_cdc(a, b, c, d) + count += 1 + return count + return handler + + def parse_zip(zf, level=1): + """Recursively open zip files""" + for name in zf.namelist(): + prefix = ' ' * level + print(prefix, name) + if name[-4:] == '.zip': + with zf.open(name) as temp: + with ZipFile(io.BytesIO(temp.read())) as zf2: + parse_zip(zf2, level + 1) + elif name[-4:] == '.csv': + handler = None + if 'Flu Pages by Region' in name: + handler = parse_csv(False) + elif 'Regions for all CDC' in name: + handler = parse_csv(True) + else: + print(prefix, ' (skipped)') + if handler is not None: + with zf.open(name) as temp: + 
count = handler(csv.reader(io.StringIO(str(temp.read(), 'utf-8')))) + print(prefix, ' %d rows' % count) + else: + print(prefix, ' (ignored)') + + # find, parse, and move zip files + zip_files = glob.glob('/common/cdc_stage/*.zip') + print('searching...') + for f in zip_files: + print(' ', f) + print('parsing...') + for f in zip_files: + with ZipFile(f) as zf: + parse_zip(zf) + print('moving...') + for f in zip_files: + src = f + dst = os.path.join('/home/automation/cdc_page_stats/', os.path.basename(src)) + print(' ', src, '->', dst) + if test_mode: + print(' (test mode enabled - not moved)') else: - insert_cdc(a, b, c, d) - count += 1 - return count - return handler - - # recursively open zip files - def parse_zip(zf, level=1): - for name in zf.namelist(): - prefix = ' ' * level - print(prefix, name) - if name[-4:] == '.zip': - with zf.open(name) as temp: - with ZipFile(io.BytesIO(temp.read())) as zf2: - parse_zip(zf2, level + 1) - elif name[-4:] == '.csv': - handler = None - if 'Flu Pages by Region' in name: - handler = parse_csv(False) - elif 'Regions for all CDC' in name: - handler = parse_csv(True) - else: - print(prefix, ' (skipped)') - if handler is not None: - with zf.open(name) as temp: - count = handler(csv.reader(io.StringIO(str(temp.read(), 'utf-8')))) - print(prefix, ' %d rows' % count) - else: - print(prefix, ' (ignored)') - - # find, parse, and move zip files - zip_files = glob.glob('/common/cdc_stage/*.zip') - print('searching...') - for f in zip_files: - print(' ', f) - print('parsing...') - for f in zip_files: - with ZipFile(f) as zf: - parse_zip(zf) - print('moving...') - for f in zip_files: - src = f - dst = os.path.join('/home/automation/cdc_page_stats/', os.path.basename(src)) - print(' ', src, '->', dst) - if test_mode: - print(' (test mode enabled - not moved)') - else: - shutil.move(src, dst) - if not os.path.isfile(dst): - raise Exception('unable to move file') - - # disconnect - cur.close() - if not test_mode: - cnx.commit() - cnx.close() + shutil.move(src, dst) + if not os.path.isfile(dst): + raise Exception('unable to move file') + + # disconnect + cur.close() + if not test_mode: + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('--test', '-t', default=False, action='store_true', help='dry run only') - args = parser.parse_args() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument('--test', '-t', default=False, action='store_true', help='dry run only') + args = parser.parse_args() - # make it happen - upload(args.test) + # make it happen + upload(args.test) if __name__ == '__main__': - main() + main() diff --git a/src/acquisition/covid_hosp/common/database.py b/src/acquisition/covid_hosp/common/database.py index 8875828fa..cc39d8ca6 100644 --- a/src/acquisition/covid_hosp/common/database.py +++ b/src/acquisition/covid_hosp/common/database.py @@ -1,245 +1,243 @@ """Common database code used by multiple `covid_hosp` scrapers.""" -# standard library + +import math from collections import namedtuple from contextlib import contextmanager -import math -# third party +import delphi.operations.secrets as secrets import mysql.connector import pandas as pd -# first party -import delphi.operations.secrets as secrets - Columndef = namedtuple("Columndef", "csv_name sql_name dtype") + class Database: - def __init__(self, - connection, - table_name=None, - hhs_dataset_id=None, - columns_and_types=None, - key_columns=None, - additional_fields=None): - """Create a new Database object. 
- - Parameters - ---------- - connection - An open connection to a database. - table_name : str - The name of the table which holds the dataset. - hhs_dataset_id : str - The 9-character healthdata.gov identifier for this dataset. - columns_and_types : tuple[str, str, Callable] - List of 3-tuples of (CSV header name, SQL column name, data type) for - all the columns in the CSV file. - additional_fields : tuple[str] - List of 2-tuples of (value, SQL column name) fordditional fields to include - at the end of the row which are not present in the CSV data. - """ - - self.connection = connection - self.table_name = table_name - self.hhs_dataset_id = hhs_dataset_id - self.publication_col_name = "issue" if table_name == 'covid_hosp_state_timeseries' else \ - 'publication_date' - self.columns_and_types = { - c.csv_name: c - for c in (columns_and_types if columns_and_types is not None else []) - } - self.key_columns = key_columns if key_columns is not None else [] - self.additional_fields = additional_fields if additional_fields is not None else [] - - @classmethod - @contextmanager - def connect(database_class, mysql_connector_impl=mysql.connector): - """Connect to a database and provide the connection as a context manager. - - As long as the context manager exits normally, the connection's transaction - will be committed. Otherwise, if the context is exited by an Exception, the - transaction will be rolled back. - - In any case, the connection will be gracefully closed upon exiting the - context manager. - """ - - # connect to the database - user, password = secrets.db.epi - connection = mysql_connector_impl.connect( - host=secrets.db.host, - user=user, - password=password, - database='epidata') - - try: - # provide the connection to the context manager - yield database_class(connection) - - # rollback by default; the following commit will only take place if no - # exception was raised in calling code - connection.commit() - finally: - # close the connection in any case - connection.close() - - @contextmanager - def new_cursor(self): - """Create and provide a database cursor as a context manager. - - The cursor will be gracefully closed upon exiting the context manager. - """ - - cursor = self.connection.cursor() - try: - yield cursor - finally: - cursor.close() - - def contains_revision(self, revision): - """Return whether the given revision already exists in the database. - - Parameters - ---------- - revision : str - Unique revision string. - - Returns - ------- - bool - True iff the revision already exists. - """ - - with self.new_cursor() as cursor: - cursor.execute(''' - SELECT - count(1) > 0 - FROM - `covid_hosp_meta` - WHERE - `hhs_dataset_id` = %s AND `revision_timestamp` = %s - ''', (self.hhs_dataset_id, revision)) - for (result,) in cursor: - return bool(result) - - def insert_metadata(self, publication_date, revision, meta_json): - """Add revision metadata to the database. - - Parameters - ---------- - publication_date : int - Date when the dataset was published in YYYYMMDD format. - revision : str - Unique revision string. - meta_json : str - Metadata serialized as a JSON string. 
- """ - - with self.new_cursor() as cursor: - cursor.execute(''' - INSERT INTO - `covid_hosp_meta` ( - `dataset_name`, - `hhs_dataset_id`, - `publication_date`, - `revision_timestamp`, - `metadata_json`, - `acquisition_datetime` - ) - VALUES - (%s, %s, %s, %s, %s, NOW()) - ''', (self.table_name, self.hhs_dataset_id, publication_date, revision, meta_json)) - - def insert_dataset(self, publication_date, dataframe): - """Add a dataset to the database. - - Parameters - ---------- - publication_date : int - Date when the dataset was published in YYYYMMDD format. - dataframe : pandas.DataFrame - The dataset. - """ - dataframe_columns_and_types = [ - x for x in self.columns_and_types.values() if x.csv_name in dataframe.columns - ] - - def nan_safe_dtype(dtype, value): - if isinstance(value, float) and math.isnan(value): - return None - return dtype(value) - - # first convert keys and save the results; we'll need them later - for csv_name in self.key_columns: - dataframe.loc[:, csv_name] = dataframe[csv_name].map(self.columns_and_types[csv_name].dtype) - - num_columns = 2 + len(dataframe_columns_and_types) + len(self.additional_fields) - value_placeholders = ', '.join(['%s'] * num_columns) - columns = ', '.join(f'`{i.sql_name}`' for i in dataframe_columns_and_types + self.additional_fields) - sql = f'INSERT INTO `{self.table_name}` (`id`, `{self.publication_col_name}`, {columns}) ' \ - f'VALUES ({value_placeholders})' - id_and_publication_date = (0, publication_date) - with self.new_cursor() as cursor: - for _, row in dataframe.iterrows(): - values = [] - for c in dataframe_columns_and_types: - values.append(nan_safe_dtype(c.dtype, row[c.csv_name])) - cursor.execute(sql, - id_and_publication_date + - tuple(values) + - tuple(i.csv_name for i in self.additional_fields)) - - # deal with non/seldomly updated columns used like a fk table (if this database needs it) - if hasattr(self, 'AGGREGATE_KEY_COLS'): - ak_cols = self.AGGREGATE_KEY_COLS - - # restrict data to just the key columns and remove duplicate rows - # sort by key columns to ensure that the last ON DUPLICATE KEY overwrite - # uses the most-recent aggregate key information - ak_data = (dataframe[set(ak_cols + self.key_columns)] - .sort_values(self.key_columns)[ak_cols] - .drop_duplicates()) - # cast types - for col in ak_cols: - ak_data[col] = ak_data[col].map( - lambda value: nan_safe_dtype(self.columns_and_types[col].dtype, value) - ) - # fix NULLs - ak_data = ak_data.to_numpy(na_value=None).tolist() - - # create string of tick-quoted and comma-seperated column list - ak_cols_str = ','.join(f'`{col}`' for col in ak_cols) - # ...and ticked and comma-sep'd "column=column" list for ON UPDATE (to keep only the most recent values for each pk) - ak_updates_str = ','.join(f'`{col}`=v.{col}' for col in ak_cols) - # ...and string of VALUES placeholders - values_str = ','.join( ['%s'] * len(ak_cols) ) - # use aggregate key table alias - ak_table = self.table_name + '_key' - # assemble full SQL statement - ak_insert_sql = f'INSERT INTO `{ak_table}` ({ak_cols_str}) VALUES ({values_str}) AS v ON DUPLICATE KEY UPDATE {ak_updates_str}' - - # commit the data - with self.new_cursor() as cur: - cur.executemany(ak_insert_sql, ak_data) - - - def get_max_issue(self): - """Fetch the most recent issue. - - This is used to bookend what updates we pull in from the HHS metadata. 
- """ - with self.new_cursor() as cursor: - cursor.execute(f''' - SELECT - max(publication_date) - from - `covid_hosp_meta` - WHERE - hhs_dataset_id = "{self.hhs_dataset_id}" - ''') - for (result,) in cursor: - if result is not None: - return pd.Timestamp(str(result)) - return pd.Timestamp("1900/1/1") + def __init__( + self, + connection, + table_name=None, + hhs_dataset_id=None, + columns_and_types=None, + key_columns=None, + additional_fields=None + ): + """ + Create a new Database object. + + Parameters + ---------- + connection + An open connection to a database. + table_name : str + The name of the table which holds the dataset. + hhs_dataset_id : str + The 9-character healthdata.gov identifier for this dataset. + columns_and_types : tuple[str, str, Callable] + List of 3-tuples of (CSV header name, SQL column name, data type) for + all the columns in the CSV file. + additional_fields : tuple[str] + List of 2-tuples of (value, SQL column name) fordditional fields to include + at the end of the row which are not present in the CSV data. + """ + + self.connection = connection + self.table_name = table_name + self.hhs_dataset_id = hhs_dataset_id + self.publication_col_name = "issue" if table_name == 'covid_hosp_state_timeseries' else 'publication_date' + self.columns_and_types = { + c.csv_name: c for c in (columns_and_types if columns_and_types is not None else []) + } + self.key_columns = key_columns if key_columns is not None else [] + self.additional_fields = additional_fields if additional_fields is not None else [] + + @classmethod + @contextmanager + def connect(database_class, mysql_connector_impl=mysql.connector): + """ + Connect to a database and provide the connection as a context manager. + + As long as the context manager exits normally, the connection's transaction + will be committed. Otherwise, if the context is exited by an Exception, the + transaction will be rolled back. + + In any case, the connection will be gracefully closed upon exiting the + context manager. + """ + + # connect to the database + user, password = secrets.db.epi + connection = mysql_connector_impl.connect( + host=secrets.db.host, + user=user, + password=password, + database='epidata') + + try: + # provide the connection to the context manager + yield database_class(connection) + + # rollback by default; the following commit will only take place if no + # exception was raised in calling code + connection.commit() + finally: + # close the connection in any case + connection.close() + + @contextmanager + def new_cursor(self): + """Create and provide a database cursor as a context manager. + + The cursor will be gracefully closed upon exiting the context manager. + """ + + cursor = self.connection.cursor() + try: + yield cursor + finally: + cursor.close() + + def contains_revision(self, revision): + """Return whether the given revision already exists in the database. + + Parameters + ---------- + revision : str + Unique revision string. + + Returns + ------- + bool + True iff the revision already exists. + """ + + with self.new_cursor() as cursor: + cursor.execute(''' + SELECT + count(1) > 0 + FROM + `covid_hosp_meta` + WHERE + `hhs_dataset_id` = %s AND `revision_timestamp` = %s + ''', (self.hhs_dataset_id, revision)) + for (result,) in cursor: + return bool(result) + + def insert_metadata(self, publication_date, revision, meta_json): + """Add revision metadata to the database. + + Parameters + ---------- + publication_date : int + Date when the dataset was published in YYYYMMDD format. 
+ revision : str + Unique revision string. + meta_json : str + Metadata serialized as a JSON string. + """ + + with self.new_cursor() as cursor: + cursor.execute(''' + INSERT INTO + `covid_hosp_meta` ( + `dataset_name`, + `hhs_dataset_id`, + `publication_date`, + `revision_timestamp`, + `metadata_json`, + `acquisition_datetime` + ) + VALUES + (%s, %s, %s, %s, %s, NOW()) + ''', (self.table_name, self.hhs_dataset_id, publication_date, revision, meta_json)) + + def insert_dataset(self, publication_date, dataframe): + """Add a dataset to the database. + + Parameters + ---------- + publication_date : int + Date when the dataset was published in YYYYMMDD format. + dataframe : pandas.DataFrame + The dataset. + """ + dataframe_columns_and_types = [ + x for x in self.columns_and_types.values() if x.csv_name in dataframe.columns + ] + + def nan_safe_dtype(dtype, value): + if isinstance(value, float) and math.isnan(value): + return None + return dtype(value) + + # first convert keys and save the results; we'll need them later + for csv_name in self.key_columns: + dataframe.loc[:, csv_name] = dataframe[csv_name].map(self.columns_and_types[csv_name].dtype) + + num_columns = 2 + len(dataframe_columns_and_types) + len(self.additional_fields) + value_placeholders = ', '.join(['%s'] * num_columns) + columns = ', '.join(f'`{i.sql_name}`' for i in dataframe_columns_and_types + self.additional_fields) + sql = f'INSERT INTO `{self.table_name}` (`id`, `{self.publication_col_name}`, {columns}) ' \ + f'VALUES ({value_placeholders})' + id_and_publication_date = (0, publication_date) + with self.new_cursor() as cursor: + for _, row in dataframe.iterrows(): + values = [] + for c in dataframe_columns_and_types: + values.append(nan_safe_dtype(c.dtype, row[c.csv_name])) + cursor.execute( + sql, + id_and_publication_date + + tuple(values) + + tuple(i.csv_name for i in self.additional_fields) + ) + + # deal with non/seldomly updated columns used like a fk table (if this database needs it) + if hasattr(self, 'AGGREGATE_KEY_COLS'): + ak_cols = self.AGGREGATE_KEY_COLS + + # restrict data to just the key columns and remove duplicate rows + # sort by key columns to ensure that the last ON DUPLICATE KEY overwrite + # uses the most-recent aggregate key information + ak_data = (dataframe[set(ak_cols + self.key_columns)].sort_values(self.key_columns)[ak_cols].drop_duplicates()) + # cast types + for col in ak_cols: + ak_data[col] = ak_data[col].map( + lambda value: nan_safe_dtype(self.columns_and_types[col].dtype, value) + ) + # fix NULLs + ak_data = ak_data.to_numpy(na_value=None).tolist() + + # create string of tick-quoted and comma-seperated column list + ak_cols_str = ','.join(f'`{col}`' for col in ak_cols) + # ...and ticked and comma-sep'd "column=column" list for ON UPDATE (to keep only the most recent values for each pk) + ak_updates_str = ','.join(f'`{col}`=v.{col}' for col in ak_cols) + # ...and string of VALUES placeholders + values_str = ','.join(['%s'] * len(ak_cols)) + # use aggregate key table alias + ak_table = self.table_name + '_key' + # assemble full SQL statement + ak_insert_sql = f'INSERT INTO `{ak_table}` ({ak_cols_str}) VALUES ({values_str}) AS v ON DUPLICATE KEY UPDATE {ak_updates_str}' + + # commit the data + with self.new_cursor() as cur: + cur.executemany(ak_insert_sql, ak_data) + + def get_max_issue(self): + """Fetch the most recent issue. + This is used to bookend what updates we pull in from the HHS metadata. 
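A minimal usage sketch of the context-manager pattern documented above, assuming database credentials are available through delphi secrets exactly as `connect()` expects; the facility `Database` subclass, `publication_date`, and `revision` values are placeholders for illustration. Work done inside the `with` block is committed only if the block exits without raising, and the connection is closed either way.

from delphi.epidata.acquisition.covid_hosp.facility.database import Database

publication_date = 20201116            # placeholder YYYYMMDD issue date
revision = 'Mon, 11/16/2020 - 00:55'   # placeholder healthdata.gov revision string

with Database.connect() as db:
    # committed only if this block exits cleanly; the MySQL connection is
    # closed whether or not an exception is raised
    if db.contains_revision(revision):
        print('revision already acquired; nothing to do')
    else:
        print('latest issue currently stored:', db.get_max_issue())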
+ """ + with self.new_cursor() as cursor: + cursor.execute(f''' + SELECT + max(publication_date) + from + `covid_hosp_meta` + WHERE + hhs_dataset_id = "{self.hhs_dataset_id}" + ''') + for (result,) in cursor: + if result is not None: + return pd.Timestamp(str(result)) + return pd.Timestamp("1900/1/1") diff --git a/src/acquisition/covid_hosp/common/network.py b/src/acquisition/covid_hosp/common/network.py index ba0cca281..1874845a7 100644 --- a/src/acquisition/covid_hosp/common/network.py +++ b/src/acquisition/covid_hosp/common/network.py @@ -1,49 +1,47 @@ -# third party import pandas class Network: - METADATA_URL_TEMPLATE = \ - 'https://healthdata.gov/api/views/%s/rows.csv' - - def fetch_metadata_for_dataset(dataset_id): - """Download and return metadata. - - Parameters - ---------- - dataset_id : str - healthdata.gov dataset identifier of the dataset. - - Returns - ------- - object - The metadata object. - """ - url = Network.METADATA_URL_TEMPLATE % dataset_id - print(f'fetching metadata at {url}') - df = Network.fetch_dataset(url) - df["Update Date"] = pandas.to_datetime(df["Update Date"]) - df.sort_values("Update Date", inplace=True) - df.set_index("Update Date", inplace=True) - return df - - def fetch_dataset(url, pandas_impl=pandas): - """Download and return a dataset. - - Type inference is disabled in favor of explicit type casting at the - database abstraction layer. Pandas behavior is to represent non-missing - values as strings and missing values as `math.nan`. - - Parameters - ---------- - url : str - URL to the dataset in CSV format. - - Returns - ------- - pandas.DataFrame - The dataset. - """ - - print(f'fetching dataset at {url}') - return pandas_impl.read_csv(url, dtype=str) + METADATA_URL_TEMPLATE = 'https://healthdata.gov/api/views/%s/rows.csv' + + def fetch_metadata_for_dataset(dataset_id): + """Download and return metadata. + + Parameters + ---------- + dataset_id : str + healthdata.gov dataset identifier of the dataset. + + Returns + ------- + object + The metadata object. + """ + url = Network.METADATA_URL_TEMPLATE % dataset_id + print(f'fetching metadata at {url}') + df = Network.fetch_dataset(url) + df["Update Date"] = pandas.to_datetime(df["Update Date"]) + df.sort_values("Update Date", inplace=True) + df.set_index("Update Date", inplace=True) + return df + + def fetch_dataset(url, pandas_impl=pandas): + """Download and return a dataset. + + Type inference is disabled in favor of explicit type casting at the + database abstraction layer. Pandas behavior is to represent non-missing + values as strings and missing values as `math.nan`. + + Parameters + ---------- + url : str + URL to the dataset in CSV format. + + Returns + ------- + pandas.DataFrame + The dataset. + """ + + print(f'fetching dataset at {url}') + return pandas_impl.read_csv(url, dtype=str) diff --git a/src/acquisition/covid_hosp/common/test_utils.py b/src/acquisition/covid_hosp/common/test_utils.py index 2a737b383..5a78e8829 100644 --- a/src/acquisition/covid_hosp/common/test_utils.py +++ b/src/acquisition/covid_hosp/common/test_utils.py @@ -8,52 +8,50 @@ dir, hence the existence of this file. 
""" -# standard library from pathlib import Path -# third party import pandas class UnitTestUtils: - # path to `covid_hosp` test data, relative to the top of the repo - PATH_TO_TESTDATA = 'testdata/acquisition/covid_hosp' + # path to `covid_hosp` test data, relative to the top of the repo + PATH_TO_TESTDATA = 'testdata/acquisition/covid_hosp' - def __init__(self, abs_path_to_caller): - # navigate to the root of the delphi-epidata repo - dataset_name = None - current_path = Path(abs_path_to_caller) - while not (current_path / 'testdata').exists(): + def __init__(self, abs_path_to_caller): + # navigate to the root of the delphi-epidata repo + dataset_name = None + current_path = Path(abs_path_to_caller) + while not (current_path / 'testdata').exists(): - # bail if we made it all the way to root - if not current_path.name: - raise Exception('unable to determine path to delphi-epidata repo') + # bail if we made it all the way to root + if not current_path.name: + raise Exception('unable to determine path to delphi-epidata repo') - # looking for a path like .../acquisition/covid_hosp/ - if current_path.parent.name == 'covid_hosp': - dataset_name = current_path.name + # looking for a path like .../acquisition/covid_hosp/ + if current_path.parent.name == 'covid_hosp': + dataset_name = current_path.name - # move up one level - current_path = current_path.parent + # move up one level + current_path = current_path.parent - # the loop above stops at the top of the repo - path_to_repo = current_path + # the loop above stops at the top of the repo + path_to_repo = current_path - if not dataset_name: - raise Exception('unable to determine name of dataset under test') + if not dataset_name: + raise Exception('unable to determine name of dataset under test') - # path dataset-specific test data, relative to the root of the repo - self.data_dir = ( - path_to_repo / UnitTestUtils.PATH_TO_TESTDATA / dataset_name - ).resolve() + # path dataset-specific test data, relative to the root of the repo + self.data_dir = ( + path_to_repo / UnitTestUtils.PATH_TO_TESTDATA / dataset_name + ).resolve() - def load_sample_metadata(self, metadata_name='metadata.csv'): - df = pandas.read_csv(self.data_dir / metadata_name, dtype=str) - df["Update Date"] = pandas.to_datetime(df["Update Date"]) - df.sort_values("Update Date", inplace=True) - df.set_index("Update Date", inplace=True) - return df + def load_sample_metadata(self, metadata_name='metadata.csv'): + df = pandas.read_csv(self.data_dir / metadata_name, dtype=str) + df["Update Date"] = pandas.to_datetime(df["Update Date"]) + df.sort_values("Update Date", inplace=True) + df.set_index("Update Date", inplace=True) + return df - def load_sample_dataset(self, dataset_name='dataset.csv'): - return pandas.read_csv(self.data_dir / dataset_name, dtype=str) + def load_sample_dataset(self, dataset_name='dataset.csv'): + return pandas.read_csv(self.data_dir / dataset_name, dtype=str) diff --git a/src/acquisition/covid_hosp/common/utils.py b/src/acquisition/covid_hosp/common/utils.py index 99a6b4f33..069f3efe6 100644 --- a/src/acquisition/covid_hosp/common/utils.py +++ b/src/acquisition/covid_hosp/common/utils.py @@ -1,189 +1,187 @@ """Code shared among multiple `covid_hosp` scrapers.""" -# standard library import datetime import re import pandas as pd + class CovidHospException(Exception): - """Exception raised exclusively by `covid_hosp` utilities.""" + """Exception raised exclusively by `covid_hosp` utilities.""" class Utils: - # regex to extract issue date from revision field - # 
example revision: "Mon, 11/16/2020 - 00:55" - REVISION_PATTERN = re.compile(r'^.*\s(\d+)/(\d+)/(\d+)\s.*$') - - def launch_if_main(entrypoint, runtime_name): - """Call the given function in the main entry point, otherwise no-op.""" - - if runtime_name == '__main__': - entrypoint() - - def int_from_date(date): - """Convert a YYYY/MM/DD date from a string to a YYYYMMDD int. - - Parameters - ---------- - date : str - Date in "YYYY/MM/DD.*" format. - - Returns - ------- - int - Date in YYYYMMDD format. - """ - if isinstance(date, str): - return int(date[:10].replace('/', '').replace('-', '')) - return date - - def parse_bool(value): - """Convert a string to a boolean. - - Parameters - ---------- - value : str - Boolean-like value, like "true" or "false". - - Returns - ------- - bool - If the string contains some version of "true" or "false". - None - If the string is None or empty. - - Raises - ------ - CovidHospException - If the string constains something other than a version of "true" or - "false". - """ - - if not value: - return None - if value.lower() == 'true': - return True - if value.lower() == 'false': - return False - raise CovidHospException(f'cannot convert "{value}" to bool') - - def issues_to_fetch(metadata, newer_than, older_than): - """ - Construct all issue dates and URLs to be ingested based on metadata. - - Parameters - ---------- - metadata pd.DataFrame - HHS metadata indexed by issue date and with column "Archive Link" - newer_than Date - Lower bound (exclusive) of days to get issues for. - older_than Date - Upper bound (exclusive) of days to get issues for - Returns - ------- - Dictionary of {issue day: list of (download urls, index)} - for issues after newer_than and before older_than - """ - daily_issues = {} - n_beyond = 0 - for index in sorted(set(metadata.index)): - day = index.date() - if day > newer_than and day < older_than: - urls = metadata.loc[index, "Archive Link"] - urls_list = [(urls, index)] if isinstance(urls, str) else [(url, index) for url in urls] - if day not in daily_issues: - daily_issues[day] = urls_list - else: - daily_issues[day] += urls_list - elif day >= older_than: - n_beyond += 1 - if n_beyond > 0: - print(f"{n_beyond} issues available on {older_than} or newer") - return daily_issues - - @staticmethod - def merge_by_key_cols(dfs, key_cols): - """Merge a list of data frames as a series of updates. - - Parameters: - ----------- - dfs : list(pd.DataFrame) - Data frames to merge, ordered from earliest to latest. - key_cols: list(str) - Columns to use as the index. - - Returns a single data frame containing the most recent data for each state+date. - """ - - dfs = [df.set_index(key_cols) for df in dfs - if not all(k in df.index.names for k in key_cols)] - result = dfs[0] - for df in dfs[1:]: - # update values for existing keys - result.update(df) - # add any new keys. - ## repeated concatenation in pandas is expensive, but (1) we don't expect - ## batch sizes to be terribly large (7 files max) and (2) this way we can - ## more easily capture the next iteration's updates to any new keys - new_rows = df.loc[[i for i in df.index.to_list() if i not in result.index.to_list()]] - result = pd.concat([result, new_rows]) - - # convert the index rows back to columns - return result.reset_index(level=key_cols) - - @staticmethod - def update_dataset(database, network, newer_than=None, older_than=None): - """Acquire the most recent dataset, unless it was previously acquired. 
- - Parameters - ---------- - database : delphi.epidata.acquisition.covid_hosp.common.database.Database - A `Database` subclass for a particular dataset. - network : delphi.epidata.acquisition.covid_hosp.common.network.Network - A `Network` subclass for a particular dataset. - newer_than : date - Lower bound (exclusive) of days to get issues for. - older_than : date - Upper bound (exclusive) of days to get issues for - - Returns - ------- - bool - Whether a new dataset was acquired. - """ - metadata = network.fetch_metadata() - datasets = [] - with database.connect() as db: - max_issue = db.get_max_issue() - - older_than = datetime.datetime.today().date() if newer_than is None else older_than - newer_than = max_issue if newer_than is None else newer_than - daily_issues = Utils.issues_to_fetch(metadata, newer_than, older_than) - if not daily_issues: - print("no new issues, nothing to do") - return False - for issue, revisions in daily_issues.items(): - issue_int = int(issue.strftime("%Y%m%d")) - # download the dataset and add it to the database - dataset = Utils.merge_by_key_cols([network.fetch_dataset(url) for url, _ in revisions], - db.KEY_COLS) - # add metadata to the database - all_metadata = [] - for url, index in revisions: - all_metadata.append((url, metadata.loc[index].reset_index().to_json())) - datasets.append(( - issue_int, - dataset, - all_metadata - )) - with database.connect() as db: - for issue_int, dataset, all_metadata in datasets: - db.insert_dataset(issue_int, dataset) - for url, metadata_json in all_metadata: - db.insert_metadata(issue_int, url, metadata_json) - print(f'successfully acquired {len(dataset)} rows') - - # note that the transaction is committed by exiting the `with` block - return True + # regex to extract issue date from revision field + # example revision: "Mon, 11/16/2020 - 00:55" + REVISION_PATTERN = re.compile(r'^.*\s(\d+)/(\d+)/(\d+)\s.*$') + + def launch_if_main(entrypoint, runtime_name): + """Call the given function in the main entry point, otherwise no-op.""" + + if runtime_name == '__main__': + entrypoint() + + def int_from_date(date): + """Convert a YYYY/MM/DD date from a string to a YYYYMMDD int. + + Parameters + ---------- + date : str + Date in "YYYY/MM/DD.*" format. + + Returns + ------- + int + Date in YYYYMMDD format. + """ + if isinstance(date, str): + return int(date[:10].replace('/', '').replace('-', '')) + return date + + def parse_bool(value): + """Convert a string to a boolean. + + Parameters + ---------- + value : str + Boolean-like value, like "true" or "false". + + Returns + ------- + bool + If the string contains some version of "true" or "false". + None + If the string is None or empty. + + Raises + ------ + CovidHospException + If the string constains something other than a version of "true" or + "false". + """ + + if not value: + return None + if value.lower() == 'true': + return True + if value.lower() == 'false': + return False + raise CovidHospException(f'cannot convert "{value}" to bool') + + def issues_to_fetch(metadata, newer_than, older_than): + """ + Construct all issue dates and URLs to be ingested based on metadata. + + Parameters + ---------- + metadata pd.DataFrame + HHS metadata indexed by issue date and with column "Archive Link" + newer_than Date + Lower bound (exclusive) of days to get issues for. 
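Worked examples of the two converters defined above; the inputs are illustrative only.

from delphi.epidata.acquisition.covid_hosp.common.utils import CovidHospException, Utils

assert Utils.int_from_date('2020/11/16 00:55') == 20201116   # 'YYYY/MM/DD...' string -> YYYYMMDD int
assert Utils.int_from_date(20201116) == 20201116             # non-string input is passed through
assert Utils.parse_bool('TRUE') is True                      # case-insensitive
assert Utils.parse_bool('') is None                          # empty or None means "unknown"
try:
    Utils.parse_bool('yes')
except CovidHospException:
    pass                                                     # anything else raises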
+ older_than Date + Upper bound (exclusive) of days to get issues for + Returns + ------- + Dictionary of {issue day: list of (download urls, index)} + for issues after newer_than and before older_than + """ + daily_issues = {} + n_beyond = 0 + for index in sorted(set(metadata.index)): + day = index.date() + if day > newer_than and day < older_than: + urls = metadata.loc[index, "Archive Link"] + urls_list = [(urls, index)] if isinstance(urls, str) else [(url, index) for url in urls] + if day not in daily_issues: + daily_issues[day] = urls_list + else: + daily_issues[day] += urls_list + elif day >= older_than: + n_beyond += 1 + if n_beyond > 0: + print(f"{n_beyond} issues available on {older_than} or newer") + return daily_issues + + @staticmethod + def merge_by_key_cols(dfs, key_cols): + """Merge a list of data frames as a series of updates. + + Parameters: + ----------- + dfs : list(pd.DataFrame) + Data frames to merge, ordered from earliest to latest. + key_cols: list(str) + Columns to use as the index. + + Returns a single data frame containing the most recent data for each state+date. + """ + + dfs = [df.set_index(key_cols) for df in dfs + if not all(k in df.index.names for k in key_cols)] + result = dfs[0] + for df in dfs[1:]: + # update values for existing keys + result.update(df) + # add any new keys. + # repeated concatenation in pandas is expensive, but (1) we don't expect + # batch sizes to be terribly large (7 files max) and (2) this way we can + # more easily capture the next iteration's updates to any new keys + new_rows = df.loc[[i for i in df.index.to_list() if i not in result.index.to_list()]] + result = pd.concat([result, new_rows]) + + # convert the index rows back to columns + return result.reset_index(level=key_cols) + + @staticmethod + def update_dataset(database, network, newer_than=None, older_than=None): + """Acquire the most recent dataset, unless it was previously acquired. + + Parameters + ---------- + database : delphi.epidata.acquisition.covid_hosp.common.database.Database + A `Database` subclass for a particular dataset. + network : delphi.epidata.acquisition.covid_hosp.common.network.Network + A `Network` subclass for a particular dataset. + newer_than : date + Lower bound (exclusive) of days to get issues for. + older_than : date + Upper bound (exclusive) of days to get issues for + + Returns + ------- + bool + Whether a new dataset was acquired. 
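A small sketch of the update semantics of `merge_by_key_cols`: for keys present in more than one frame the latest value wins, and rows with previously unseen keys are appended. The frames, column names, and values below are made up.

import pandas as pd

from delphi.epidata.acquisition.covid_hosp.common.utils import Utils

earlier = pd.DataFrame({'state': ['PA', 'WV'], 'date': [20201115, 20201115], 'beds': [100, 50]})
later = pd.DataFrame({'state': ['PA', 'VA'], 'date': [20201115, 20201115], 'beds': [110, 70]})

merged = Utils.merge_by_key_cols([earlier, later], ['state', 'date'])
# PA takes the later value (110), WV keeps its only value (50), VA is appended (70)
print(merged.sort_values('state'))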
+ """ + metadata = network.fetch_metadata() + datasets = [] + with database.connect() as db: + max_issue = db.get_max_issue() + + older_than = datetime.datetime.today().date() if newer_than is None else older_than + newer_than = max_issue if newer_than is None else newer_than + daily_issues = Utils.issues_to_fetch(metadata, newer_than, older_than) + if not daily_issues: + print("no new issues, nothing to do") + return False + for issue, revisions in daily_issues.items(): + issue_int = int(issue.strftime("%Y%m%d")) + # download the dataset and add it to the database + dataset = Utils.merge_by_key_cols([network.fetch_dataset(url) for url, _ in revisions], db.KEY_COLS) + # add metadata to the database + all_metadata = [] + for url, index in revisions: + all_metadata.append((url, metadata.loc[index].reset_index().to_json())) + datasets.append(( + issue_int, + dataset, + all_metadata + )) + with database.connect() as db: + for issue_int, dataset, all_metadata in datasets: + db.insert_dataset(issue_int, dataset) + for url, metadata_json in all_metadata: + db.insert_metadata(issue_int, url, metadata_json) + print(f'successfully acquired {len(dataset)} rows') + # note that the transaction is committed by exiting the `with` block + return True diff --git a/src/acquisition/covid_hosp/facility/database.py b/src/acquisition/covid_hosp/facility/database.py index 665256a4f..2927a8c8f 100644 --- a/src/acquisition/covid_hosp/facility/database.py +++ b/src/acquisition/covid_hosp/facility/database.py @@ -1,219 +1,197 @@ -# first party -from delphi.epidata.acquisition.covid_hosp.common.database import Database as BaseDatabase from delphi.epidata.acquisition.covid_hosp.common.database import Columndef +from delphi.epidata.acquisition.covid_hosp.common.database import Database as BaseDatabase from delphi.epidata.acquisition.covid_hosp.common.utils import Utils from delphi.epidata.acquisition.covid_hosp.facility.network import Network class Database(BaseDatabase): - TABLE_NAME = 'covid_hosp_facility' - KEY_COLS = ['hospital_pk', 'collection_week'] - AGGREGATE_KEY_COLS = ['address', 'ccn', 'city', 'fips_code', 'geocoded_hospital_address', 'hhs_ids', 'hospital_name', 'hospital_pk', 'hospital_subtype', 'is_metro_micro', 'state', 'zip'] - # These are 3-tuples of ( - # CSV header name, - # SQL db column name, - # data type - # ) for all the columns in the CSV file. - # Note that the corresponding database column names may be shorter - # due to constraints on the length of column names. See - # /src/ddl/covid_hosp.sql for more information. 
- ORDERED_CSV_COLUMNS = [ - Columndef('hospital_pk', 'hospital_pk', str), - Columndef('collection_week', 'collection_week', Utils.int_from_date), - Columndef('address', 'address', str), - Columndef('all_adult_hospital_beds_7_day_avg', 'all_adult_hospital_beds_7_day_avg', float), - Columndef('all_adult_hospital_beds_7_day_coverage', 'all_adult_hospital_beds_7_day_coverage', int), - Columndef('all_adult_hospital_beds_7_day_sum', 'all_adult_hospital_beds_7_day_sum', int), - Columndef('all_adult_hospital_inpatient_bed_occupied_7_day_avg', - 'all_adult_hospital_inpatient_bed_occupied_7_day_avg', float), - Columndef('all_adult_hospital_inpatient_bed_occupied_7_day_coverage', - 'all_adult_hospital_inpatient_bed_occupied_7_day_coverage', int), - Columndef('all_adult_hospital_inpatient_bed_occupied_7_day_sum', - 'all_adult_hospital_inpatient_bed_occupied_7_day_sum', int), - Columndef('all_adult_hospital_inpatient_beds_7_day_avg', 'all_adult_hospital_inpatient_beds_7_day_avg', - float), - Columndef('all_adult_hospital_inpatient_beds_7_day_coverage', - 'all_adult_hospital_inpatient_beds_7_day_coverage', int), - Columndef('all_adult_hospital_inpatient_beds_7_day_sum', 'all_adult_hospital_inpatient_beds_7_day_sum', - int), - Columndef('ccn', 'ccn', str), - Columndef('city', 'city', str), - Columndef('fips_code', 'fips_code', str), - Columndef('geocoded_hospital_address', 'geocoded_hospital_address', str), - Columndef('hhs_ids', 'hhs_ids', str), - Columndef('hospital_name', 'hospital_name', str), - Columndef('hospital_subtype', 'hospital_subtype', str), - Columndef('icu_beds_used_7_day_avg', 'icu_beds_used_7_day_avg', float), - Columndef('icu_beds_used_7_day_coverage', 'icu_beds_used_7_day_coverage', int), - Columndef('icu_beds_used_7_day_sum', 'icu_beds_used_7_day_sum', int), - Columndef('icu_patients_confirmed_influenza_7_day_avg', 'icu_patients_confirmed_influenza_7_day_avg', - float), - Columndef('icu_patients_confirmed_influenza_7_day_coverage', - 'icu_patients_confirmed_influenza_7_day_coverage', int), - Columndef('icu_patients_confirmed_influenza_7_day_sum', 'icu_patients_confirmed_influenza_7_day_sum', - int), - Columndef('inpatient_beds_7_day_avg', 'inpatient_beds_7_day_avg', float), - Columndef('inpatient_beds_7_day_coverage', 'inpatient_beds_7_day_coverage', int), - Columndef('inpatient_beds_7_day_sum', 'inpatient_beds_7_day_sum', int), - Columndef('inpatient_beds_used_7_day_avg', 'inpatient_beds_used_7_day_avg', float), - Columndef('inpatient_beds_used_7_day_coverage', 'inpatient_beds_used_7_day_coverage', int), - Columndef('inpatient_beds_used_7_day_sum', 'inpatient_beds_used_7_day_sum', int), - Columndef('is_corrected', 'is_corrected', Utils.parse_bool), - Columndef('is_metro_micro', 'is_metro_micro', Utils.parse_bool), - Columndef('previous_day_admission_adult_covid_confirmed_18-19_7_day_sum', - 'previous_day_admission_adult_covid_confirmed_18_19_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_confirmed_20-29_7_day_sum', - 'previous_day_admission_adult_covid_confirmed_20_29_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_confirmed_30-39_7_day_sum', - 'previous_day_admission_adult_covid_confirmed_30_39_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_confirmed_40-49_7_day_sum', - 'previous_day_admission_adult_covid_confirmed_40_49_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_confirmed_50-59_7_day_sum', - 'previous_day_admission_adult_covid_confirmed_50_59_7_day_sum', int), - 
Columndef('previous_day_admission_adult_covid_confirmed_60-69_7_day_sum', - 'previous_day_admission_adult_covid_confirmed_60_69_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_confirmed_70-79_7_day_sum', - 'previous_day_admission_adult_covid_confirmed_70_79_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_confirmed_7_day_coverage', - 'previous_day_admission_adult_covid_confirmed_7_day_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_7_day_sum', - 'previous_day_admission_adult_covid_confirmed_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_confirmed_80+_7_day_sum', - 'previous_day_admission_adult_covid_confirmed_80plus_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_confirmed_unknown_7_day_sum', - 'previous_day_admission_adult_covid_confirmed_unknown_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_suspected_18-19_7_day_sum', - 'previous_day_admission_adult_covid_suspected_18_19_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_suspected_20-29_7_day_sum', - 'previous_day_admission_adult_covid_suspected_20_29_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_suspected_30-39_7_day_sum', - 'previous_day_admission_adult_covid_suspected_30_39_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_suspected_40-49_7_day_sum', - 'previous_day_admission_adult_covid_suspected_40_49_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_suspected_50-59_7_day_sum', - 'previous_day_admission_adult_covid_suspected_50_59_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_suspected_60-69_7_day_sum', - 'previous_day_admission_adult_covid_suspected_60_69_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_suspected_70-79_7_day_sum', - 'previous_day_admission_adult_covid_suspected_70_79_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_suspected_7_day_coverage', - 'previous_day_admission_adult_covid_suspected_7_day_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_7_day_sum', - 'previous_day_admission_adult_covid_suspected_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_suspected_80+_7_day_sum', - 'previous_day_admission_adult_covid_suspected_80plus_7_day_sum', int), - Columndef('previous_day_admission_adult_covid_suspected_unknown_7_day_sum', - 'previous_day_admission_adult_covid_suspected_unknown_7_day_sum', int), - Columndef('previous_day_admission_influenza_confirmed_7_day_sum', - 'previous_day_admission_influenza_confirmed_7_day_sum', int), - Columndef('previous_day_admission_pediatric_covid_confirmed_7_day_coverage', - 'previous_day_admission_pediatric_covid_confirmed_7_day_coverage', int), - Columndef('previous_day_admission_pediatric_covid_confirmed_7_day_sum', - 'previous_day_admission_pediatric_covid_confirmed_7_day_sum', int), - Columndef('previous_day_admission_pediatric_covid_suspected_7_day_coverage', - 'previous_day_admission_pediatric_covid_suspected_7_day_coverage', int), - Columndef('previous_day_admission_pediatric_covid_suspected_7_day_sum', - 'previous_day_admission_pediatric_covid_suspected_7_day_sum', int), - Columndef('previous_day_covid_ED_visits_7_day_sum', 'previous_day_covid_ed_visits_7_day_sum', int), - Columndef('previous_day_total_ED_visits_7_day_sum', 'previous_day_total_ed_visits_7_day_sum', int), - Columndef('previous_week_patients_covid_vaccinated_doses_all_7_day', - 
'previous_week_patients_covid_vaccinated_doses_all_7_day', int), - Columndef('previous_week_patients_covid_vaccinated_doses_all_7_day_sum', - 'previous_week_patients_covid_vaccinated_doses_all_7_day_sum', int), - Columndef('previous_week_patients_covid_vaccinated_doses_one_7_day', - 'previous_week_patients_covid_vaccinated_doses_one_7_day', int), - Columndef('previous_week_patients_covid_vaccinated_doses_one_7_day_sum', - 'previous_week_patients_covid_vaccinated_doses_one_7_day_sum', int), - Columndef('previous_week_personnel_covid_vaccinated_doses_administered_7_day', - 'previous_week_personnel_covid_vaccd_doses_administered_7_day', int), - Columndef('previous_week_personnel_covid_vaccinated_doses_administered_7_day_sum', - 'previous_week_personnel_covid_vaccd_doses_administered_7_day_sum', int), - Columndef('staffed_adult_icu_bed_occupancy_7_day_avg', 'staffed_adult_icu_bed_occupancy_7_day_avg', - float), - Columndef('staffed_adult_icu_bed_occupancy_7_day_coverage', - 'staffed_adult_icu_bed_occupancy_7_day_coverage', int), - Columndef('staffed_adult_icu_bed_occupancy_7_day_sum', 'staffed_adult_icu_bed_occupancy_7_day_sum', - int), - Columndef('staffed_icu_adult_patients_confirmed_and_suspected_covid_7_day_avg', - 'staffed_icu_adult_patients_confirmed_suspected_covid_7d_avg', float), - Columndef('staffed_icu_adult_patients_confirmed_and_suspected_covid_7_day_coverage', - 'staffed_icu_adult_patients_confirmed_suspected_covid_7d_cov', int), - Columndef('staffed_icu_adult_patients_confirmed_and_suspected_covid_7_day_sum', - 'staffed_icu_adult_patients_confirmed_suspected_covid_7d_sum', int), - Columndef('staffed_icu_adult_patients_confirmed_covid_7_day_avg', - 'staffed_icu_adult_patients_confirmed_covid_7_day_avg', float), - Columndef('staffed_icu_adult_patients_confirmed_covid_7_day_coverage', - 'staffed_icu_adult_patients_confirmed_covid_7_day_coverage', int), - Columndef('staffed_icu_adult_patients_confirmed_covid_7_day_sum', - 'staffed_icu_adult_patients_confirmed_covid_7_day_sum', int), - Columndef('state', 'state', str), - Columndef('total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_avg', - 'total_adult_patients_hosp_confirmed_suspected_covid_7d_avg', float), - Columndef('total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_coverage', - 'total_adult_patients_hosp_confirmed_suspected_covid_7d_cov', int), - Columndef('total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_sum', - 'total_adult_patients_hosp_confirmed_suspected_covid_7d_sum', int), - Columndef('total_adult_patients_hospitalized_confirmed_covid_7_day_avg', - 'total_adult_patients_hospitalized_confirmed_covid_7_day_avg', float), - Columndef('total_adult_patients_hospitalized_confirmed_covid_7_day_coverage', - 'total_adult_patients_hospitalized_confirmed_covid_7_day_coverage', int), - Columndef('total_adult_patients_hospitalized_confirmed_covid_7_day_sum', - 'total_adult_patients_hospitalized_confirmed_covid_7_day_sum', int), - Columndef('total_beds_7_day_avg', 'total_beds_7_day_avg', float), - Columndef('total_beds_7_day_coverage', 'total_beds_7_day_coverage', int), - Columndef('total_beds_7_day_sum', 'total_beds_7_day_sum', int), - Columndef('total_icu_beds_7_day_avg', 'total_icu_beds_7_day_avg', float), - Columndef('total_icu_beds_7_day_coverage', 'total_icu_beds_7_day_coverage', int), - Columndef('total_icu_beds_7_day_sum', 'total_icu_beds_7_day_sum', int), - Columndef('total_patients_hospitalized_confirmed_influenza_7_day_avg', - 
'total_patients_hospitalized_confirmed_influenza_7_day_avg', float), - Columndef('total_patients_hospitalized_confirmed_influenza_7_day_coverage', - 'total_patients_hospitalized_confirmed_influenza_7_day_coverage', int), - Columndef('total_patients_hospitalized_confirmed_influenza_7_day_sum', - 'total_patients_hospitalized_confirmed_influenza_7_day_sum', int), - Columndef('total_patients_hospitalized_confirmed_influenza_and_covid_7_day_avg', - 'total_patients_hosp_confirmed_influenza_and_covid_7d_avg', float), - Columndef('total_patients_hospitalized_confirmed_influenza_and_covid_7_day_coverage', - 'total_patients_hosp_confirmed_influenza_and_covid_7d_cov', int), - Columndef('total_patients_hospitalized_confirmed_influenza_and_covid_7_day_sum', - 'total_patients_hosp_confirmed_influenza_and_covid_7d_sum', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_7_day_avg', - 'total_pediatric_patients_hosp_confirmed_suspected_covid_7d_avg', float), - Columndef('total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_7_day_coverage', - 'total_pediatric_patients_hosp_confirmed_suspected_covid_7d_cov', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_7_day_sum', - 'total_pediatric_patients_hosp_confirmed_suspected_covid_7d_sum', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_covid_7_day_avg', - 'total_pediatric_patients_hospitalized_confirmed_covid_7_day_avg', float), - Columndef('total_pediatric_patients_hospitalized_confirmed_covid_7_day_coverage', - 'total_pediatric_patients_hosp_confirmed_covid_7d_cov', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_covid_7_day_sum', - 'total_pediatric_patients_hospitalized_confirmed_covid_7_day_sum', int), - Columndef('total_personnel_covid_vaccinated_doses_all_7_day', - 'total_personnel_covid_vaccinated_doses_all_7_day', int), - Columndef('total_personnel_covid_vaccinated_doses_all_7_day_sum', - 'total_personnel_covid_vaccinated_doses_all_7_day_sum', int), - Columndef('total_personnel_covid_vaccinated_doses_none_7_day', - 'total_personnel_covid_vaccinated_doses_none_7_day', int), - Columndef('total_personnel_covid_vaccinated_doses_none_7_day_sum', - 'total_personnel_covid_vaccinated_doses_none_7_day_sum', int), - Columndef('total_personnel_covid_vaccinated_doses_one_7_day', - 'total_personnel_covid_vaccinated_doses_one_7_day', int), - Columndef('total_personnel_covid_vaccinated_doses_one_7_day_sum', - 'total_personnel_covid_vaccinated_doses_one_7_day_sum', int), - Columndef('total_staffed_adult_icu_beds_7_day_avg', 'total_staffed_adult_icu_beds_7_day_avg', float), - Columndef('total_staffed_adult_icu_beds_7_day_coverage', 'total_staffed_adult_icu_beds_7_day_coverage', - int), - Columndef('total_staffed_adult_icu_beds_7_day_sum', 'total_staffed_adult_icu_beds_7_day_sum', int), - Columndef('zip', 'zip', str), - ] + TABLE_NAME = 'covid_hosp_facility' + KEY_COLS = ['hospital_pk', 'collection_week'] + AGGREGATE_KEY_COLS = [ + 'address', + 'ccn', + 'city', + 'fips_code', + 'geocoded_hospital_address', + 'hhs_ids', + 'hospital_name', + 'hospital_pk', + 'hospital_subtype', + 'is_metro_micro', + 'state', + 'zip' + ] + # These are 3-tuples of ( + # CSV header name, + # SQL db column name, + # data type + # ) for all the columns in the CSV file. + # Note that the corresponding database column names may be shorter + # due to constraints on the length of column names. See + # /src/ddl/covid_hosp.sql for more information. 
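A minimal sketch of how one entry of the column list below is consumed by `insert_dataset`, assuming the `Columndef` attribute names (`csv_name`, `sql_name`, `dtype`) referenced there: the CSV header selects the incoming column, the SQL name goes into the INSERT statement, and the dtype converts the raw string value.

from delphi.epidata.acquisition.covid_hosp.common.database import Columndef
from delphi.epidata.acquisition.covid_hosp.common.utils import Utils

col = Columndef('collection_week', 'collection_week', Utils.int_from_date)

raw_value = '2020/11/16'                           # values arrive as strings (see Network.fetch_dataset)
print(f'{col.sql_name} = {col.dtype(raw_value)}')  # collection_week = 20201116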
+ ORDERED_CSV_COLUMNS = [ + Columndef('hospital_pk', 'hospital_pk', str), + Columndef('collection_week', 'collection_week', Utils.int_from_date), + Columndef('address', 'address', str), + Columndef('all_adult_hospital_beds_7_day_avg', 'all_adult_hospital_beds_7_day_avg', float), + Columndef('all_adult_hospital_beds_7_day_coverage', 'all_adult_hospital_beds_7_day_coverage', int), + Columndef('all_adult_hospital_beds_7_day_sum', 'all_adult_hospital_beds_7_day_sum', int), + Columndef('all_adult_hospital_inpatient_bed_occupied_7_day_avg', 'all_adult_hospital_inpatient_bed_occupied_7_day_avg', float), + Columndef('all_adult_hospital_inpatient_bed_occupied_7_day_coverage', 'all_adult_hospital_inpatient_bed_occupied_7_day_coverage', int), + Columndef('all_adult_hospital_inpatient_bed_occupied_7_day_sum', 'all_adult_hospital_inpatient_bed_occupied_7_day_sum', int), + Columndef('all_adult_hospital_inpatient_beds_7_day_avg', 'all_adult_hospital_inpatient_beds_7_day_avg', float), + Columndef('all_adult_hospital_inpatient_beds_7_day_coverage', 'all_adult_hospital_inpatient_beds_7_day_coverage', int), + Columndef('all_adult_hospital_inpatient_beds_7_day_sum', 'all_adult_hospital_inpatient_beds_7_day_sum', int), + Columndef('ccn', 'ccn', str), + Columndef('city', 'city', str), + Columndef('fips_code', 'fips_code', str), + Columndef('geocoded_hospital_address', 'geocoded_hospital_address', str), + Columndef('hhs_ids', 'hhs_ids', str), + Columndef('hospital_name', 'hospital_name', str), + Columndef('hospital_subtype', 'hospital_subtype', str), + Columndef('icu_beds_used_7_day_avg', 'icu_beds_used_7_day_avg', float), + Columndef('icu_beds_used_7_day_coverage', 'icu_beds_used_7_day_coverage', int), + Columndef('icu_beds_used_7_day_sum', 'icu_beds_used_7_day_sum', int), + Columndef('icu_patients_confirmed_influenza_7_day_avg', 'icu_patients_confirmed_influenza_7_day_avg', float), + Columndef('icu_patients_confirmed_influenza_7_day_coverage', 'icu_patients_confirmed_influenza_7_day_coverage', int), + Columndef('icu_patients_confirmed_influenza_7_day_sum', 'icu_patients_confirmed_influenza_7_day_sum', int), + Columndef('inpatient_beds_7_day_avg', 'inpatient_beds_7_day_avg', float), + Columndef('inpatient_beds_7_day_coverage', 'inpatient_beds_7_day_coverage', int), + Columndef('inpatient_beds_7_day_sum', 'inpatient_beds_7_day_sum', int), + Columndef('inpatient_beds_used_7_day_avg', 'inpatient_beds_used_7_day_avg', float), + Columndef('inpatient_beds_used_7_day_coverage', 'inpatient_beds_used_7_day_coverage', int), + Columndef('inpatient_beds_used_7_day_sum', 'inpatient_beds_used_7_day_sum', int), + Columndef('is_corrected', 'is_corrected', Utils.parse_bool), + Columndef('is_metro_micro', 'is_metro_micro', Utils.parse_bool), + Columndef('previous_day_admission_adult_covid_confirmed_18-19_7_day_sum', 'previous_day_admission_adult_covid_confirmed_18_19_7_day_sum', int), + Columndef('previous_day_admission_adult_covid_confirmed_20-29_7_day_sum', 'previous_day_admission_adult_covid_confirmed_20_29_7_day_sum', int), + Columndef('previous_day_admission_adult_covid_confirmed_30-39_7_day_sum', 'previous_day_admission_adult_covid_confirmed_30_39_7_day_sum', int), + Columndef('previous_day_admission_adult_covid_confirmed_40-49_7_day_sum', 'previous_day_admission_adult_covid_confirmed_40_49_7_day_sum', int), + Columndef('previous_day_admission_adult_covid_confirmed_50-59_7_day_sum', 'previous_day_admission_adult_covid_confirmed_50_59_7_day_sum', int), + 
Columndef('previous_day_admission_adult_covid_confirmed_60-69_7_day_sum', 'previous_day_admission_adult_covid_confirmed_60_69_7_day_sum', int), + Columndef('previous_day_admission_adult_covid_confirmed_70-79_7_day_sum', 'previous_day_admission_adult_covid_confirmed_70_79_7_day_sum', int), + Columndef('previous_day_admission_adult_covid_confirmed_7_day_coverage', 'previous_day_admission_adult_covid_confirmed_7_day_coverage', int), + Columndef('previous_day_admission_adult_covid_confirmed_7_day_sum', 'previous_day_admission_adult_covid_confirmed_7_day_sum', int), + Columndef('previous_day_admission_adult_covid_confirmed_80+_7_day_sum', 'previous_day_admission_adult_covid_confirmed_80plus_7_day_sum', int), + Columndef('previous_day_admission_adult_covid_confirmed_unknown_7_day_sum', 'previous_day_admission_adult_covid_confirmed_unknown_7_day_sum', int), + Columndef('previous_day_admission_adult_covid_suspected_18-19_7_day_sum', 'previous_day_admission_adult_covid_suspected_18_19_7_day_sum', int), + Columndef('previous_day_admission_adult_covid_suspected_20-29_7_day_sum', 'previous_day_admission_adult_covid_suspected_20_29_7_day_sum', int), + Columndef('previous_day_admission_adult_covid_suspected_30-39_7_day_sum', 'previous_day_admission_adult_covid_suspected_30_39_7_day_sum', int), + Columndef('previous_day_admission_adult_covid_suspected_40-49_7_day_sum', 'previous_day_admission_adult_covid_suspected_40_49_7_day_sum', int), + Columndef('previous_day_admission_adult_covid_suspected_50-59_7_day_sum', 'previous_day_admission_adult_covid_suspected_50_59_7_day_sum', int), + Columndef('previous_day_admission_adult_covid_suspected_60-69_7_day_sum', 'previous_day_admission_adult_covid_suspected_60_69_7_day_sum', int), + Columndef('previous_day_admission_adult_covid_suspected_70-79_7_day_sum', 'previous_day_admission_adult_covid_suspected_70_79_7_day_sum', int), + Columndef('previous_day_admission_adult_covid_suspected_7_day_coverage', 'previous_day_admission_adult_covid_suspected_7_day_coverage', int), + Columndef('previous_day_admission_adult_covid_suspected_7_day_sum', 'previous_day_admission_adult_covid_suspected_7_day_sum', int), + Columndef('previous_day_admission_adult_covid_suspected_80+_7_day_sum', 'previous_day_admission_adult_covid_suspected_80plus_7_day_sum', int), + Columndef('previous_day_admission_adult_covid_suspected_unknown_7_day_sum', 'previous_day_admission_adult_covid_suspected_unknown_7_day_sum', int), + Columndef('previous_day_admission_influenza_confirmed_7_day_sum', 'previous_day_admission_influenza_confirmed_7_day_sum', int), + Columndef('previous_day_admission_pediatric_covid_confirmed_7_day_coverage', 'previous_day_admission_pediatric_covid_confirmed_7_day_coverage', int), + Columndef('previous_day_admission_pediatric_covid_confirmed_7_day_sum', 'previous_day_admission_pediatric_covid_confirmed_7_day_sum', int), + Columndef('previous_day_admission_pediatric_covid_suspected_7_day_coverage', 'previous_day_admission_pediatric_covid_suspected_7_day_coverage', int), + Columndef('previous_day_admission_pediatric_covid_suspected_7_day_sum', 'previous_day_admission_pediatric_covid_suspected_7_day_sum', int), + Columndef('previous_day_covid_ED_visits_7_day_sum', 'previous_day_covid_ed_visits_7_day_sum', int), + Columndef('previous_day_total_ED_visits_7_day_sum', 'previous_day_total_ed_visits_7_day_sum', int), + Columndef('previous_week_patients_covid_vaccinated_doses_all_7_day', 'previous_week_patients_covid_vaccinated_doses_all_7_day', int), + 
Columndef('previous_week_patients_covid_vaccinated_doses_all_7_day_sum', 'previous_week_patients_covid_vaccinated_doses_all_7_day_sum', int), + Columndef('previous_week_patients_covid_vaccinated_doses_one_7_day', 'previous_week_patients_covid_vaccinated_doses_one_7_day', int), + Columndef('previous_week_patients_covid_vaccinated_doses_one_7_day_sum', 'previous_week_patients_covid_vaccinated_doses_one_7_day_sum', int), + Columndef('previous_week_personnel_covid_vaccinated_doses_administered_7_day', 'previous_week_personnel_covid_vaccd_doses_administered_7_day', int), + Columndef( + 'previous_week_personnel_covid_vaccinated_doses_administered_7_day_sum', + 'previous_week_personnel_covid_vaccd_doses_administered_7_day_sum', + int + ), + Columndef('staffed_adult_icu_bed_occupancy_7_day_avg', 'staffed_adult_icu_bed_occupancy_7_day_avg', float), + Columndef('staffed_adult_icu_bed_occupancy_7_day_coverage', 'staffed_adult_icu_bed_occupancy_7_day_coverage', int), + Columndef('staffed_adult_icu_bed_occupancy_7_day_sum', 'staffed_adult_icu_bed_occupancy_7_day_sum', int), + Columndef('staffed_icu_adult_patients_confirmed_and_suspected_covid_7_day_avg', 'staffed_icu_adult_patients_confirmed_suspected_covid_7d_avg', float), + Columndef( + 'staffed_icu_adult_patients_confirmed_and_suspected_covid_7_day_coverage', + 'staffed_icu_adult_patients_confirmed_suspected_covid_7d_cov', + int + ), + Columndef('staffed_icu_adult_patients_confirmed_and_suspected_covid_7_day_sum', 'staffed_icu_adult_patients_confirmed_suspected_covid_7d_sum', int), + Columndef('staffed_icu_adult_patients_confirmed_covid_7_day_avg', 'staffed_icu_adult_patients_confirmed_covid_7_day_avg', float), + Columndef('staffed_icu_adult_patients_confirmed_covid_7_day_coverage', 'staffed_icu_adult_patients_confirmed_covid_7_day_coverage', int), + Columndef('staffed_icu_adult_patients_confirmed_covid_7_day_sum', 'staffed_icu_adult_patients_confirmed_covid_7_day_sum', int), + Columndef('state', 'state', str), + Columndef( + 'total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_avg', + 'total_adult_patients_hosp_confirmed_suspected_covid_7d_avg', + float + ), + Columndef( + 'total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_coverage', + 'total_adult_patients_hosp_confirmed_suspected_covid_7d_cov', + int + ), + Columndef( + 'total_adult_patients_hospitalized_confirmed_and_suspected_covid_7_day_sum', + 'total_adult_patients_hosp_confirmed_suspected_covid_7d_sum', + int + ), + Columndef('total_adult_patients_hospitalized_confirmed_covid_7_day_avg', 'total_adult_patients_hospitalized_confirmed_covid_7_day_avg', float), + Columndef( + 'total_adult_patients_hospitalized_confirmed_covid_7_day_coverage', + 'total_adult_patients_hospitalized_confirmed_covid_7_day_coverage', + int + ), + Columndef('total_adult_patients_hospitalized_confirmed_covid_7_day_sum', 'total_adult_patients_hospitalized_confirmed_covid_7_day_sum', int), + Columndef('total_beds_7_day_avg', 'total_beds_7_day_avg', float), + Columndef('total_beds_7_day_coverage', 'total_beds_7_day_coverage', int), + Columndef('total_beds_7_day_sum', 'total_beds_7_day_sum', int), + Columndef('total_icu_beds_7_day_avg', 'total_icu_beds_7_day_avg', float), + Columndef('total_icu_beds_7_day_coverage', 'total_icu_beds_7_day_coverage', int), + Columndef('total_icu_beds_7_day_sum', 'total_icu_beds_7_day_sum', int), + Columndef('total_patients_hospitalized_confirmed_influenza_7_day_avg', 'total_patients_hospitalized_confirmed_influenza_7_day_avg', float), + 
Columndef('total_patients_hospitalized_confirmed_influenza_7_day_coverage', 'total_patients_hospitalized_confirmed_influenza_7_day_coverage', int), + Columndef('total_patients_hospitalized_confirmed_influenza_7_day_sum', 'total_patients_hospitalized_confirmed_influenza_7_day_sum', int), + Columndef('total_patients_hospitalized_confirmed_influenza_and_covid_7_day_avg', 'total_patients_hosp_confirmed_influenza_and_covid_7d_avg', float), + Columndef( + 'total_patients_hospitalized_confirmed_influenza_and_covid_7_day_coverage', + 'total_patients_hosp_confirmed_influenza_and_covid_7d_cov', + int), + Columndef('total_patients_hospitalized_confirmed_influenza_and_covid_7_day_sum', 'total_patients_hosp_confirmed_influenza_and_covid_7d_sum', int), + Columndef( + 'total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_7_day_avg', + 'total_pediatric_patients_hosp_confirmed_suspected_covid_7d_avg', + float), + Columndef( + 'total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_7_day_coverage', + 'total_pediatric_patients_hosp_confirmed_suspected_covid_7d_cov', + int), + Columndef( + 'total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_7_day_sum', + 'total_pediatric_patients_hosp_confirmed_suspected_covid_7d_sum', + int + ), + Columndef( + 'total_pediatric_patients_hospitalized_confirmed_covid_7_day_avg', + 'total_pediatric_patients_hospitalized_confirmed_covid_7_day_avg', + float + ), + Columndef('total_pediatric_patients_hospitalized_confirmed_covid_7_day_coverage', 'total_pediatric_patients_hosp_confirmed_covid_7d_cov', int), + Columndef('total_pediatric_patients_hospitalized_confirmed_covid_7_day_sum', 'total_pediatric_patients_hospitalized_confirmed_covid_7_day_sum', int), + Columndef('total_personnel_covid_vaccinated_doses_all_7_day', 'total_personnel_covid_vaccinated_doses_all_7_day', int), + Columndef('total_personnel_covid_vaccinated_doses_all_7_day_sum', 'total_personnel_covid_vaccinated_doses_all_7_day_sum', int), + Columndef('total_personnel_covid_vaccinated_doses_none_7_day', 'total_personnel_covid_vaccinated_doses_none_7_day', int), + Columndef('total_personnel_covid_vaccinated_doses_none_7_day_sum', 'total_personnel_covid_vaccinated_doses_none_7_day_sum', int), + Columndef('total_personnel_covid_vaccinated_doses_one_7_day', 'total_personnel_covid_vaccinated_doses_one_7_day', int), + Columndef('total_personnel_covid_vaccinated_doses_one_7_day_sum', 'total_personnel_covid_vaccinated_doses_one_7_day_sum', int), + Columndef('total_staffed_adult_icu_beds_7_day_avg', 'total_staffed_adult_icu_beds_7_day_avg', float), + Columndef('total_staffed_adult_icu_beds_7_day_coverage', 'total_staffed_adult_icu_beds_7_day_coverage', int), + Columndef('total_staffed_adult_icu_beds_7_day_sum', 'total_staffed_adult_icu_beds_7_day_sum', int), + Columndef('zip', 'zip', str), + ] - def __init__(self, *args, **kwargs): - super().__init__( - *args, - **kwargs, - table_name=Database.TABLE_NAME, - hhs_dataset_id=Network.DATASET_ID, - key_columns=Database.KEY_COLS, - columns_and_types=Database.ORDERED_CSV_COLUMNS) + def __init__(self, *args, **kwargs): + super().__init__( + *args, + **kwargs, + table_name=Database.TABLE_NAME, + hhs_dataset_id=Network.DATASET_ID, + key_columns=Database.KEY_COLS, + columns_and_types=Database.ORDERED_CSV_COLUMNS + ) diff --git a/src/acquisition/covid_hosp/facility/network.py b/src/acquisition/covid_hosp/facility/network.py index 6a0092c7f..c318db7cf 100644 --- a/src/acquisition/covid_hosp/facility/network.py +++ 
b/src/acquisition/covid_hosp/facility/network.py @@ -1,17 +1,16 @@ -# first party from delphi.epidata.acquisition.covid_hosp.common.network import Network as BaseNetwork class Network(BaseNetwork): - DATASET_ID = 'anag-cw7u' - METADATA_ID = 'j4ip-wfsv' + DATASET_ID = 'anag-cw7u' + METADATA_ID = 'j4ip-wfsv' - def fetch_metadata(*args, **kwags): - """Download and return metadata. + def fetch_metadata(*args, **kwags): + """Download and return metadata. - See `fetch_metadata_for_dataset`. - """ + See `fetch_metadata_for_dataset`. + """ - return Network.fetch_metadata_for_dataset( - *args, **kwags, dataset_id=Network.METADATA_ID) + return Network.fetch_metadata_for_dataset( + *args, **kwags, dataset_id=Network.METADATA_ID) diff --git a/src/acquisition/covid_hosp/facility/update.py b/src/acquisition/covid_hosp/facility/update.py index b2b96c2e3..3f19da08a 100644 --- a/src/acquisition/covid_hosp/facility/update.py +++ b/src/acquisition/covid_hosp/facility/update.py @@ -4,7 +4,6 @@ healthdata.gov. """ -# first party from delphi.epidata.acquisition.covid_hosp.common.utils import Utils from delphi.epidata.acquisition.covid_hosp.facility.database import Database from delphi.epidata.acquisition.covid_hosp.facility.network import Network @@ -12,16 +11,16 @@ class Update: - def run(network=Network): - """Acquire the most recent dataset, unless it was previously acquired. + def run(network=Network): + """Acquire the most recent dataset, unless it was previously acquired. - Returns - ------- - bool - Whether a new dataset was acquired. - """ + Returns + ------- + bool + Whether a new dataset was acquired. + """ - return Utils.update_dataset(Database, network) + return Utils.update_dataset(Database, network) # main entry point diff --git a/src/acquisition/covid_hosp/state_daily/database.py b/src/acquisition/covid_hosp/state_daily/database.py index 6a8228994..16c67c823 100644 --- a/src/acquisition/covid_hosp/state_daily/database.py +++ b/src/acquisition/covid_hosp/state_daily/database.py @@ -1,230 +1,162 @@ -# first party -from delphi.epidata.acquisition.covid_hosp.common.database import Database as BaseDatabase from delphi.epidata.acquisition.covid_hosp.common.database import Columndef +from delphi.epidata.acquisition.covid_hosp.common.database import Database as BaseDatabase from delphi.epidata.acquisition.covid_hosp.common.utils import Utils from delphi.epidata.acquisition.covid_hosp.state_daily.network import Network class Database(BaseDatabase): - # note we share a database with state_timeseries - TABLE_NAME = 'covid_hosp_state_timeseries' - KEY_COLS = ['state', 'reporting_cutoff_start'] - # These are 3-tuples of (CSV header name, SQL db column name, data type) for - # all the columns in the CSV file. - # Note that the corresponding database column names may be shorter - # due to constraints on the length of column names. See - # /src/ddl/covid_hosp.sql for more information. - # Additionally, all column names below are shared with state_timeseries, - # except for reporting_cutoff_start (here) and date (there). If you need - # to update a column name, do it in both places. 
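A short sketch of the column sharing described in the comment above, using the same `Columndef` attributes assumed earlier: the daily file's `reporting_cutoff_start` header is written to the shared `date` column, and because the table is `covid_hosp_state_timeseries` the base `Database.__init__` selects `issue` rather than `publication_date` as the publication column.

from delphi.epidata.acquisition.covid_hosp.common.database import Columndef
from delphi.epidata.acquisition.covid_hosp.common.utils import Utils

col = Columndef('reporting_cutoff_start', 'date', Utils.int_from_date)
print(col.csv_name, '->', col.sql_name)            # reporting_cutoff_start -> date

table_name = 'covid_hosp_state_timeseries'
publication_col = 'issue' if table_name == 'covid_hosp_state_timeseries' else 'publication_date'
print('publication column:', publication_col)      # issue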
- ORDERED_CSV_COLUMNS = [ - Columndef('state', 'state', str), - Columndef('reporting_cutoff_start', 'date', Utils.int_from_date), - Columndef('adult_icu_bed_covid_utilization', 'adult_icu_bed_covid_utilization', float), - Columndef('adult_icu_bed_covid_utilization_coverage', 'adult_icu_bed_covid_utilization_coverage', int), - Columndef('adult_icu_bed_covid_utilization_denominator', 'adult_icu_bed_covid_utilization_denominator', - int), - Columndef('adult_icu_bed_covid_utilization_numerator', 'adult_icu_bed_covid_utilization_numerator', - int), - Columndef('adult_icu_bed_utilization', 'adult_icu_bed_utilization', float), - Columndef('adult_icu_bed_utilization_coverage', 'adult_icu_bed_utilization_coverage', int), - Columndef('adult_icu_bed_utilization_denominator', 'adult_icu_bed_utilization_denominator', int), - Columndef('adult_icu_bed_utilization_numerator', 'adult_icu_bed_utilization_numerator', int), - Columndef('critical_staffing_shortage_anticipated_within_week_no', - 'critical_staffing_shortage_anticipated_within_week_no', int), - Columndef('critical_staffing_shortage_anticipated_within_week_not_reported', - 'critical_staffing_shortage_anticipated_within_week_not_reported', int), - Columndef('critical_staffing_shortage_anticipated_within_week_yes', - 'critical_staffing_shortage_anticipated_within_week_yes', int), - Columndef('critical_staffing_shortage_today_no', 'critical_staffing_shortage_today_no', int), - Columndef('critical_staffing_shortage_today_not_reported', - 'critical_staffing_shortage_today_not_reported', int), - Columndef('critical_staffing_shortage_today_yes', 'critical_staffing_shortage_today_yes', int), - Columndef('deaths_covid', 'deaths_covid', int), - Columndef('deaths_covid_coverage', 'deaths_covid_coverage', int), - Columndef('geocoded_state', 'geocoded_state', str), - Columndef('hospital_onset_covid', 'hospital_onset_covid', int), - Columndef('hospital_onset_covid_coverage', 'hospital_onset_covid_coverage', int), - Columndef('icu_patients_confirmed_influenza', 'icu_patients_confirmed_influenza', int), - Columndef('icu_patients_confirmed_influenza_coverage', 'icu_patients_confirmed_influenza_coverage', - int), - Columndef('inpatient_bed_covid_utilization', 'inpatient_bed_covid_utilization', float), - Columndef('inpatient_bed_covid_utilization_coverage', 'inpatient_bed_covid_utilization_coverage', int), - Columndef('inpatient_bed_covid_utilization_denominator', 'inpatient_bed_covid_utilization_denominator', - int), - Columndef('inpatient_bed_covid_utilization_numerator', 'inpatient_bed_covid_utilization_numerator', - int), - Columndef('inpatient_beds', 'inpatient_beds', int), - Columndef('inpatient_beds_coverage', 'inpatient_beds_coverage', int), - Columndef('inpatient_beds_used', 'inpatient_beds_used', int), - Columndef('inpatient_beds_used_coverage', 'inpatient_beds_used_coverage', int), - Columndef('inpatient_beds_used_covid', 'inpatient_beds_used_covid', int), - Columndef('inpatient_beds_used_covid_coverage', 'inpatient_beds_used_covid_coverage', int), - Columndef('inpatient_beds_utilization', 'inpatient_beds_utilization', float), - Columndef('inpatient_beds_utilization_coverage', 'inpatient_beds_utilization_coverage', int), - Columndef('inpatient_beds_utilization_denominator', 'inpatient_beds_utilization_denominator', int), - Columndef('inpatient_beds_utilization_numerator', 'inpatient_beds_utilization_numerator', int), - Columndef('on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses', - 'on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses', 
int), - Columndef('on_hand_supply_therapeutic_b_bamlanivimab_courses', - 'on_hand_supply_therapeutic_b_bamlanivimab_courses', int), - Columndef('on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses', - 'on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses', int), - Columndef('percent_of_inpatients_with_covid', 'percent_of_inpatients_with_covid', float), - Columndef('percent_of_inpatients_with_covid_coverage', 'percent_of_inpatients_with_covid_coverage', - int), - Columndef('percent_of_inpatients_with_covid_denominator', - 'percent_of_inpatients_with_covid_denominator', int), - Columndef('percent_of_inpatients_with_covid_numerator', 'percent_of_inpatients_with_covid_numerator', - int), - Columndef('previous_day_admission_adult_covid_confirmed', - 'previous_day_admission_adult_covid_confirmed', int), - Columndef('previous_day_admission_adult_covid_confirmed_18-19', - 'previous_day_admission_adult_covid_confirmed_18_19', int), - Columndef('previous_day_admission_adult_covid_confirmed_18-19_coverage', - 'previous_day_admission_adult_covid_confirmed_18_19_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_20-29', - 'previous_day_admission_adult_covid_confirmed_20_29', int), - Columndef('previous_day_admission_adult_covid_confirmed_20-29_coverage', - 'previous_day_admission_adult_covid_confirmed_20_29_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_30-39', - 'previous_day_admission_adult_covid_confirmed_30_39', int), - Columndef('previous_day_admission_adult_covid_confirmed_30-39_coverage', - 'previous_day_admission_adult_covid_confirmed_30_39_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_40-49', - 'previous_day_admission_adult_covid_confirmed_40_49', int), - Columndef('previous_day_admission_adult_covid_confirmed_40-49_coverage', - 'previous_day_admission_adult_covid_confirmed_40_49_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_50-59', - 'previous_day_admission_adult_covid_confirmed_50_59', int), - Columndef('previous_day_admission_adult_covid_confirmed_50-59_coverage', - 'previous_day_admission_adult_covid_confirmed_50_59_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_60-69', - 'previous_day_admission_adult_covid_confirmed_60_69', int), - Columndef('previous_day_admission_adult_covid_confirmed_60-69_coverage', - 'previous_day_admission_adult_covid_confirmed_60_69_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_70-79', - 'previous_day_admission_adult_covid_confirmed_70_79', int), - Columndef('previous_day_admission_adult_covid_confirmed_70-79_coverage', - 'previous_day_admission_adult_covid_confirmed_70_79_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_80+', - 'previous_day_admission_adult_covid_confirmed_80plus', int), - Columndef('previous_day_admission_adult_covid_confirmed_80+_coverage', - 'previous_day_admission_adult_covid_confirmed_80plus_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_coverage', - 'previous_day_admission_adult_covid_confirmed_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_unknown', - 'previous_day_admission_adult_covid_confirmed_unknown', int), - Columndef('previous_day_admission_adult_covid_confirmed_unknown_coverage', - 'previous_day_admission_adult_covid_confirmed_unknown_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected', - 'previous_day_admission_adult_covid_suspected', int), - 
Columndef('previous_day_admission_adult_covid_suspected_18-19', - 'previous_day_admission_adult_covid_suspected_18_19', int), - Columndef('previous_day_admission_adult_covid_suspected_18-19_coverage', - 'previous_day_admission_adult_covid_suspected_18_19_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_20-29', - 'previous_day_admission_adult_covid_suspected_20_29', int), - Columndef('previous_day_admission_adult_covid_suspected_20-29_coverage', - 'previous_day_admission_adult_covid_suspected_20_29_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_30-39', - 'previous_day_admission_adult_covid_suspected_30_39', int), - Columndef('previous_day_admission_adult_covid_suspected_30-39_coverage', - 'previous_day_admission_adult_covid_suspected_30_39_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_40-49', - 'previous_day_admission_adult_covid_suspected_40_49', int), - Columndef('previous_day_admission_adult_covid_suspected_40-49_coverage', - 'previous_day_admission_adult_covid_suspected_40_49_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_50-59', - 'previous_day_admission_adult_covid_suspected_50_59', int), - Columndef('previous_day_admission_adult_covid_suspected_50-59_coverage', - 'previous_day_admission_adult_covid_suspected_50_59_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_60_69', #this is correct; csv header is irregular - 'previous_day_admission_adult_covid_suspected_60_69', int), - Columndef('previous_day_admission_adult_covid_suspected_60-69_coverage', - 'previous_day_admission_adult_covid_suspected_60_69_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_70-79', - 'previous_day_admission_adult_covid_suspected_70_79', int), - Columndef('previous_day_admission_adult_covid_suspected_70-79_coverage', - 'previous_day_admission_adult_covid_suspected_70_79_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_80', - 'previous_day_admission_adult_covid_suspected_80plus', int), - Columndef('previous_day_admission_adult_covid_suspected_80+_coverage', - 'previous_day_admission_adult_covid_suspected_80plus_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_coverage', - 'previous_day_admission_adult_covid_suspected_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_unknown', - 'previous_day_admission_adult_covid_suspected_unknown', int), - Columndef('previous_day_admission_adult_covid_suspected_unknown_coverage', - 'previous_day_admission_adult_covid_suspected_unknown_coverage', int), - Columndef('previous_day_admission_influenza_confirmed', 'previous_day_admission_influenza_confirmed', - int), - Columndef('previous_day_admission_influenza_confirmed_coverage', - 'previous_day_admission_influenza_confirmed_coverage', int), - Columndef('previous_day_admission_pediatric_covid_confirmed', - 'previous_day_admission_pediatric_covid_confirmed', int), - Columndef('previous_day_admission_pediatric_covid_confirmed_coverage', - 'previous_day_admission_pediatric_covid_confirmed_coverage', int), - Columndef('previous_day_admission_pediatric_covid_suspected', - 'previous_day_admission_pediatric_covid_suspected', int), - Columndef('previous_day_admission_pediatric_covid_suspected_coverage', - 'previous_day_admission_pediatric_covid_suspected_coverage', int), - Columndef('previous_day_deaths_covid_and_influenza', 'previous_day_deaths_covid_and_influenza', int), - 
Columndef('previous_day_deaths_covid_and_influenza_coverage', - 'previous_day_deaths_covid_and_influenza_coverage', int), - Columndef('previous_day_deaths_influenza', 'previous_day_deaths_influenza', int), - Columndef('previous_day_deaths_influenza_coverage', 'previous_day_deaths_influenza_coverage', int), - Columndef('previous_week_therapeutic_a_casirivimab_imdevimab_courses_used', - 'previous_week_therapeutic_a_casirivimab_imdevimab_courses_used', int), - Columndef('previous_week_therapeutic_b_bamlanivimab_courses_used', - 'previous_week_therapeutic_b_bamlanivimab_courses_used', int), - Columndef('previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used', - 'previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used', int), - Columndef('staffed_adult_icu_bed_occupancy', 'staffed_adult_icu_bed_occupancy', int), - Columndef('staffed_adult_icu_bed_occupancy_coverage', 'staffed_adult_icu_bed_occupancy_coverage', int), - Columndef('staffed_icu_adult_patients_confirmed_and_suspected_covid', - 'staffed_icu_adult_patients_confirmed_suspected_covid', int), - Columndef('staffed_icu_adult_patients_confirmed_and_suspected_covid_coverage', - 'staffed_icu_adult_patients_confirmed_suspected_covid_coverage', int), - Columndef('staffed_icu_adult_patients_confirmed_covid', 'staffed_icu_adult_patients_confirmed_covid', - int), - Columndef('staffed_icu_adult_patients_confirmed_covid_coverage', - 'staffed_icu_adult_patients_confirmed_covid_coverage', int), - Columndef('total_adult_patients_hospitalized_confirmed_and_suspected_covid', - 'total_adult_patients_hosp_confirmed_suspected_covid', int), - Columndef('total_adult_patients_hospitalized_confirmed_and_suspected_covid_coverage', - 'total_adult_patients_hosp_confirmed_suspected_covid_coverage', int), - Columndef('total_adult_patients_hospitalized_confirmed_covid', - 'total_adult_patients_hosp_confirmed_covid', int), - Columndef('total_adult_patients_hospitalized_confirmed_covid_coverage', - 'total_adult_patients_hosp_confirmed_covid_coverage', int), - Columndef('total_patients_hospitalized_confirmed_influenza', - 'total_patients_hospitalized_confirmed_influenza', int), - Columndef('total_patients_hospitalized_confirmed_influenza_coverage', - 'total_patients_hospitalized_confirmed_influenza_coverage', int), - Columndef('total_patients_hospitalized_confirmed_influenza_covid', - 'total_patients_hospitalized_confirmed_influenza_covid', int), - Columndef('total_patients_hospitalized_confirmed_influenza_covid_coverage', - 'total_patients_hospitalized_confirmed_influenza_covid_coverage', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_and_suspected_covid', - 'total_pediatric_patients_hosp_confirmed_suspected_covid', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_coverage', - 'total_pediatric_patients_hosp_confirmed_suspected_covid_coverage', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_covid', - 'total_pediatric_patients_hosp_confirmed_covid', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_covid_coverage', - 'total_pediatric_patients_hosp_confirmed_covid_coverage', int), - Columndef('total_staffed_adult_icu_beds', 'total_staffed_adult_icu_beds', int), - Columndef('total_staffed_adult_icu_beds_coverage', 'total_staffed_adult_icu_beds_coverage', int), - ] + # note we share a database with state_timeseries + TABLE_NAME = 'covid_hosp_state_timeseries' + KEY_COLS = ['state', 'reporting_cutoff_start'] + # These are 3-tuples of (CSV header name, SQL db 
column name, data type) for + # all the columns in the CSV file. + # Note that the corresponding database column names may be shorter + # due to constraints on the length of column names. See + # /src/ddl/covid_hosp.sql for more information. + # Additionally, all column names below are shared with state_timeseries, + # except for reporting_cutoff_start (here) and date (there). If you need + # to update a column name, do it in both places. + ORDERED_CSV_COLUMNS = [ + Columndef('state', 'state', str), + Columndef('reporting_cutoff_start', 'date', Utils.int_from_date), + Columndef('adult_icu_bed_covid_utilization', 'adult_icu_bed_covid_utilization', float), + Columndef('adult_icu_bed_covid_utilization_coverage', 'adult_icu_bed_covid_utilization_coverage', int), + Columndef('adult_icu_bed_covid_utilization_denominator', 'adult_icu_bed_covid_utilization_denominator', int), + Columndef('adult_icu_bed_covid_utilization_numerator', 'adult_icu_bed_covid_utilization_numerator', int), + Columndef('adult_icu_bed_utilization', 'adult_icu_bed_utilization', float), + Columndef('adult_icu_bed_utilization_coverage', 'adult_icu_bed_utilization_coverage', int), + Columndef('adult_icu_bed_utilization_denominator', 'adult_icu_bed_utilization_denominator', int), + Columndef('adult_icu_bed_utilization_numerator', 'adult_icu_bed_utilization_numerator', int), + Columndef('critical_staffing_shortage_anticipated_within_week_no', 'critical_staffing_shortage_anticipated_within_week_no', int), + Columndef('critical_staffing_shortage_anticipated_within_week_not_reported', 'critical_staffing_shortage_anticipated_within_week_not_reported', int), + Columndef('critical_staffing_shortage_anticipated_within_week_yes', 'critical_staffing_shortage_anticipated_within_week_yes', int), + Columndef('critical_staffing_shortage_today_no', 'critical_staffing_shortage_today_no', int), + Columndef('critical_staffing_shortage_today_not_reported', 'critical_staffing_shortage_today_not_reported', int), + Columndef('critical_staffing_shortage_today_yes', 'critical_staffing_shortage_today_yes', int), + Columndef('deaths_covid', 'deaths_covid', int), + Columndef('deaths_covid_coverage', 'deaths_covid_coverage', int), + Columndef('geocoded_state', 'geocoded_state', str), + Columndef('hospital_onset_covid', 'hospital_onset_covid', int), + Columndef('hospital_onset_covid_coverage', 'hospital_onset_covid_coverage', int), + Columndef('icu_patients_confirmed_influenza', 'icu_patients_confirmed_influenza', int), + Columndef('icu_patients_confirmed_influenza_coverage', 'icu_patients_confirmed_influenza_coverage', int), + Columndef('inpatient_bed_covid_utilization', 'inpatient_bed_covid_utilization', float), + Columndef('inpatient_bed_covid_utilization_coverage', 'inpatient_bed_covid_utilization_coverage', int), + Columndef('inpatient_bed_covid_utilization_denominator', 'inpatient_bed_covid_utilization_denominator', int), + Columndef('inpatient_bed_covid_utilization_numerator', 'inpatient_bed_covid_utilization_numerator', int), + Columndef('inpatient_beds', 'inpatient_beds', int), + Columndef('inpatient_beds_coverage', 'inpatient_beds_coverage', int), + Columndef('inpatient_beds_used', 'inpatient_beds_used', int), + Columndef('inpatient_beds_used_coverage', 'inpatient_beds_used_coverage', int), + Columndef('inpatient_beds_used_covid', 'inpatient_beds_used_covid', int), + Columndef('inpatient_beds_used_covid_coverage', 'inpatient_beds_used_covid_coverage', int), + Columndef('inpatient_beds_utilization', 'inpatient_beds_utilization', float), + 
Columndef('inpatient_beds_utilization_coverage', 'inpatient_beds_utilization_coverage', int), + Columndef('inpatient_beds_utilization_denominator', 'inpatient_beds_utilization_denominator', int), + Columndef('inpatient_beds_utilization_numerator', 'inpatient_beds_utilization_numerator', int), + Columndef('on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses', 'on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses', int), + Columndef('on_hand_supply_therapeutic_b_bamlanivimab_courses', 'on_hand_supply_therapeutic_b_bamlanivimab_courses', int), + Columndef('on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses', 'on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses', int), + Columndef('percent_of_inpatients_with_covid', 'percent_of_inpatients_with_covid', float), + Columndef('percent_of_inpatients_with_covid_coverage', 'percent_of_inpatients_with_covid_coverage', int), + Columndef('percent_of_inpatients_with_covid_denominator', 'percent_of_inpatients_with_covid_denominator', int), + Columndef('percent_of_inpatients_with_covid_numerator', 'percent_of_inpatients_with_covid_numerator', int), + Columndef('previous_day_admission_adult_covid_confirmed', 'previous_day_admission_adult_covid_confirmed', int), + Columndef('previous_day_admission_adult_covid_confirmed_18-19', 'previous_day_admission_adult_covid_confirmed_18_19', int), + Columndef('previous_day_admission_adult_covid_confirmed_18-19_coverage', 'previous_day_admission_adult_covid_confirmed_18_19_coverage', int), + Columndef('previous_day_admission_adult_covid_confirmed_20-29', 'previous_day_admission_adult_covid_confirmed_20_29', int), + Columndef('previous_day_admission_adult_covid_confirmed_20-29_coverage', 'previous_day_admission_adult_covid_confirmed_20_29_coverage', int), + Columndef('previous_day_admission_adult_covid_confirmed_30-39', 'previous_day_admission_adult_covid_confirmed_30_39', int), + Columndef('previous_day_admission_adult_covid_confirmed_30-39_coverage', 'previous_day_admission_adult_covid_confirmed_30_39_coverage', int), + Columndef('previous_day_admission_adult_covid_confirmed_40-49', 'previous_day_admission_adult_covid_confirmed_40_49', int), + Columndef('previous_day_admission_adult_covid_confirmed_40-49_coverage', 'previous_day_admission_adult_covid_confirmed_40_49_coverage', int), + Columndef('previous_day_admission_adult_covid_confirmed_50-59', 'previous_day_admission_adult_covid_confirmed_50_59', int), + Columndef('previous_day_admission_adult_covid_confirmed_50-59_coverage', 'previous_day_admission_adult_covid_confirmed_50_59_coverage', int), + Columndef('previous_day_admission_adult_covid_confirmed_60-69', 'previous_day_admission_adult_covid_confirmed_60_69', int), + Columndef('previous_day_admission_adult_covid_confirmed_60-69_coverage', 'previous_day_admission_adult_covid_confirmed_60_69_coverage', int), + Columndef('previous_day_admission_adult_covid_confirmed_70-79', 'previous_day_admission_adult_covid_confirmed_70_79', int), + Columndef('previous_day_admission_adult_covid_confirmed_70-79_coverage', 'previous_day_admission_adult_covid_confirmed_70_79_coverage', int), + Columndef('previous_day_admission_adult_covid_confirmed_80+', 'previous_day_admission_adult_covid_confirmed_80plus', int), + Columndef('previous_day_admission_adult_covid_confirmed_80+_coverage', 'previous_day_admission_adult_covid_confirmed_80plus_coverage', int), + Columndef('previous_day_admission_adult_covid_confirmed_coverage', 'previous_day_admission_adult_covid_confirmed_coverage', int), + 
Columndef('previous_day_admission_adult_covid_confirmed_unknown', 'previous_day_admission_adult_covid_confirmed_unknown', int), + Columndef('previous_day_admission_adult_covid_confirmed_unknown_coverage', 'previous_day_admission_adult_covid_confirmed_unknown_coverage', int), + Columndef('previous_day_admission_adult_covid_suspected', 'previous_day_admission_adult_covid_suspected', int), + Columndef('previous_day_admission_adult_covid_suspected_18-19', 'previous_day_admission_adult_covid_suspected_18_19', int), + Columndef('previous_day_admission_adult_covid_suspected_18-19_coverage', 'previous_day_admission_adult_covid_suspected_18_19_coverage', int), + Columndef('previous_day_admission_adult_covid_suspected_20-29', 'previous_day_admission_adult_covid_suspected_20_29', int), + Columndef('previous_day_admission_adult_covid_suspected_20-29_coverage', 'previous_day_admission_adult_covid_suspected_20_29_coverage', int), + Columndef('previous_day_admission_adult_covid_suspected_30-39', 'previous_day_admission_adult_covid_suspected_30_39', int), + Columndef('previous_day_admission_adult_covid_suspected_30-39_coverage', 'previous_day_admission_adult_covid_suspected_30_39_coverage', int), + Columndef('previous_day_admission_adult_covid_suspected_40-49', 'previous_day_admission_adult_covid_suspected_40_49', int), + Columndef('previous_day_admission_adult_covid_suspected_40-49_coverage', 'previous_day_admission_adult_covid_suspected_40_49_coverage', int), + Columndef('previous_day_admission_adult_covid_suspected_50-59', 'previous_day_admission_adult_covid_suspected_50_59', int), + Columndef('previous_day_admission_adult_covid_suspected_50-59_coverage', 'previous_day_admission_adult_covid_suspected_50_59_coverage', int), + # this is correct; csv header is irregular + Columndef('previous_day_admission_adult_covid_suspected_60_69', 'previous_day_admission_adult_covid_suspected_60_69', int), + Columndef('previous_day_admission_adult_covid_suspected_60-69_coverage', 'previous_day_admission_adult_covid_suspected_60_69_coverage', int), + Columndef('previous_day_admission_adult_covid_suspected_70-79', 'previous_day_admission_adult_covid_suspected_70_79', int), + Columndef('previous_day_admission_adult_covid_suspected_70-79_coverage', 'previous_day_admission_adult_covid_suspected_70_79_coverage', int), + Columndef('previous_day_admission_adult_covid_suspected_80', 'previous_day_admission_adult_covid_suspected_80plus', int), + Columndef('previous_day_admission_adult_covid_suspected_80+_coverage', 'previous_day_admission_adult_covid_suspected_80plus_coverage', int), + Columndef('previous_day_admission_adult_covid_suspected_coverage', 'previous_day_admission_adult_covid_suspected_coverage', int), + Columndef('previous_day_admission_adult_covid_suspected_unknown', 'previous_day_admission_adult_covid_suspected_unknown', int), + Columndef('previous_day_admission_adult_covid_suspected_unknown_coverage', 'previous_day_admission_adult_covid_suspected_unknown_coverage', int), + Columndef('previous_day_admission_influenza_confirmed', 'previous_day_admission_influenza_confirmed', int), + Columndef('previous_day_admission_influenza_confirmed_coverage', 'previous_day_admission_influenza_confirmed_coverage', int), + Columndef('previous_day_admission_pediatric_covid_confirmed', 'previous_day_admission_pediatric_covid_confirmed', int), + Columndef('previous_day_admission_pediatric_covid_confirmed_coverage', 'previous_day_admission_pediatric_covid_confirmed_coverage', int), + 
Columndef('previous_day_admission_pediatric_covid_suspected', 'previous_day_admission_pediatric_covid_suspected', int), + Columndef('previous_day_admission_pediatric_covid_suspected_coverage', 'previous_day_admission_pediatric_covid_suspected_coverage', int), + Columndef('previous_day_deaths_covid_and_influenza', 'previous_day_deaths_covid_and_influenza', int), + Columndef('previous_day_deaths_covid_and_influenza_coverage', 'previous_day_deaths_covid_and_influenza_coverage', int), + Columndef('previous_day_deaths_influenza', 'previous_day_deaths_influenza', int), + Columndef('previous_day_deaths_influenza_coverage', 'previous_day_deaths_influenza_coverage', int), + Columndef('previous_week_therapeutic_a_casirivimab_imdevimab_courses_used', 'previous_week_therapeutic_a_casirivimab_imdevimab_courses_used', int), + Columndef('previous_week_therapeutic_b_bamlanivimab_courses_used', 'previous_week_therapeutic_b_bamlanivimab_courses_used', int), + Columndef( + 'previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used', + 'previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used', + int + ), + Columndef('staffed_adult_icu_bed_occupancy', 'staffed_adult_icu_bed_occupancy', int), + Columndef('staffed_adult_icu_bed_occupancy_coverage', 'staffed_adult_icu_bed_occupancy_coverage', int), + Columndef('staffed_icu_adult_patients_confirmed_and_suspected_covid', 'staffed_icu_adult_patients_confirmed_suspected_covid', int), + Columndef('staffed_icu_adult_patients_confirmed_and_suspected_covid_coverage', 'staffed_icu_adult_patients_confirmed_suspected_covid_coverage', int), + Columndef('staffed_icu_adult_patients_confirmed_covid', 'staffed_icu_adult_patients_confirmed_covid', int), + Columndef('staffed_icu_adult_patients_confirmed_covid_coverage', 'staffed_icu_adult_patients_confirmed_covid_coverage', int), + Columndef('total_adult_patients_hospitalized_confirmed_and_suspected_covid', 'total_adult_patients_hosp_confirmed_suspected_covid', int), + Columndef( + 'total_adult_patients_hospitalized_confirmed_and_suspected_covid_coverage', + 'total_adult_patients_hosp_confirmed_suspected_covid_coverage', + int + ), + Columndef('total_adult_patients_hospitalized_confirmed_covid', 'total_adult_patients_hosp_confirmed_covid', int), + Columndef('total_adult_patients_hospitalized_confirmed_covid_coverage', 'total_adult_patients_hosp_confirmed_covid_coverage', int), + Columndef('total_patients_hospitalized_confirmed_influenza', 'total_patients_hospitalized_confirmed_influenza', int), + Columndef('total_patients_hospitalized_confirmed_influenza_coverage', 'total_patients_hospitalized_confirmed_influenza_coverage', int), + Columndef('total_patients_hospitalized_confirmed_influenza_covid', 'total_patients_hospitalized_confirmed_influenza_covid', int), + Columndef('total_patients_hospitalized_confirmed_influenza_covid_coverage', 'total_patients_hospitalized_confirmed_influenza_covid_coverage', int), + Columndef('total_pediatric_patients_hospitalized_confirmed_and_suspected_covid', 'total_pediatric_patients_hosp_confirmed_suspected_covid', int), + Columndef( + 'total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_coverage', + 'total_pediatric_patients_hosp_confirmed_suspected_covid_coverage', + int + ), + Columndef('total_pediatric_patients_hospitalized_confirmed_covid', 'total_pediatric_patients_hosp_confirmed_covid', int), + Columndef('total_pediatric_patients_hospitalized_confirmed_covid_coverage', 'total_pediatric_patients_hosp_confirmed_covid_coverage', int), + 
Columndef('total_staffed_adult_icu_beds', 'total_staffed_adult_icu_beds', int), + Columndef('total_staffed_adult_icu_beds_coverage', 'total_staffed_adult_icu_beds_coverage', int), + ] - def __init__(self, *args, **kwargs): - super().__init__( - *args, - **kwargs, - table_name=Database.TABLE_NAME, - hhs_dataset_id=Network.DATASET_ID, - columns_and_types=Database.ORDERED_CSV_COLUMNS, - key_columns=Database.KEY_COLS, - additional_fields=[Columndef('D', 'record_type', None)]) + def __init__(self, *args, **kwargs): + super().__init__( + *args, + **kwargs, + table_name=Database.TABLE_NAME, + hhs_dataset_id=Network.DATASET_ID, + columns_and_types=Database.ORDERED_CSV_COLUMNS, + key_columns=Database.KEY_COLS, + additional_fields=[Columndef('D', 'record_type', None)] + ) diff --git a/src/acquisition/covid_hosp/state_daily/network.py b/src/acquisition/covid_hosp/state_daily/network.py index f4678cc9b..7ef6f52e4 100644 --- a/src/acquisition/covid_hosp/state_daily/network.py +++ b/src/acquisition/covid_hosp/state_daily/network.py @@ -1,36 +1,37 @@ # first party from delphi.epidata.acquisition.covid_hosp.common.network import Network as BaseNetwork + class Network(BaseNetwork): - DATASET_ID = '6xf2-c3ie' - METADATA_ID = '4cnb-m4rz' + DATASET_ID = '6xf2-c3ie' + METADATA_ID = '4cnb-m4rz' - @staticmethod - def fetch_metadata(*args, **kwags): - """Download and return metadata. + @staticmethod + def fetch_metadata(*args, **kwags): + """ + Download and return metadata. - See `fetch_metadata_for_dataset`. - """ + See `fetch_metadata_for_dataset`. + """ - return Network.fetch_metadata_for_dataset( - *args, **kwags, dataset_id=Network.METADATA_ID) + return Network.fetch_metadata_for_dataset(*args, **kwags, dataset_id=Network.METADATA_ID) - @staticmethod - def fetch_revisions(metadata, newer_than): - """ - Extract all dataset URLs from metadata for issues after newer_than. + @staticmethod + def fetch_revisions(metadata, newer_than): + """ + Extract all dataset URLs from metadata for issues after newer_than. - Parameters - ---------- - metadata DataFrame - Metadata DF containing all rows of metadata from data source page. + Parameters + ---------- + metadata DataFrame + Metadata DF containing all rows of metadata from data source page. - newer_than Timestamp or datetime - Date and time of issue to use as lower bound for new URLs. + newer_than Timestamp or datetime + Date and time of issue to use as lower bound for new URLs. - Returns - ------- - List of URLs of issues after newer_than - """ - return list(metadata.loc[metadata.index > newer_than, "Archive Link"]) + Returns + ------- + List of URLs of issues after newer_than + """ + return list(metadata.loc[metadata.index > newer_than, "Archive Link"]) diff --git a/src/acquisition/covid_hosp/state_daily/update.py b/src/acquisition/covid_hosp/state_daily/update.py index 12a51e6c3..fde0ab4c7 100644 --- a/src/acquisition/covid_hosp/state_daily/update.py +++ b/src/acquisition/covid_hosp/state_daily/update.py @@ -3,7 +3,7 @@ dataset provided by the US Department of Health & Human Services via healthdata.gov. """ -# first party + from delphi.epidata.acquisition.covid_hosp.common.utils import Utils from delphi.epidata.acquisition.covid_hosp.state_daily.database import Database from delphi.epidata.acquisition.covid_hosp.state_daily.network import Network @@ -11,17 +11,17 @@ class Update: - @staticmethod - def run(network=Network): - """Acquire the most recent dataset, unless it was previously acquired. 
+ @staticmethod + def run(network=Network): + """Acquire the most recent dataset, unless it was previously acquired. - Returns - ------- - bool - Whether a new dataset was acquired. - """ + Returns + ------- + bool + Whether a new dataset was acquired. + """ - return Utils.update_dataset(Database, network) + return Utils.update_dataset(Database, network) # main entry point diff --git a/src/acquisition/covid_hosp/state_timeseries/database.py b/src/acquisition/covid_hosp/state_timeseries/database.py index 348d9fc0b..9accb9cf3 100644 --- a/src/acquisition/covid_hosp/state_timeseries/database.py +++ b/src/acquisition/covid_hosp/state_timeseries/database.py @@ -1,229 +1,160 @@ # first party -from delphi.epidata.acquisition.covid_hosp.common.database import Database as BaseDatabase from delphi.epidata.acquisition.covid_hosp.common.database import Columndef +from delphi.epidata.acquisition.covid_hosp.common.database import Database as BaseDatabase from delphi.epidata.acquisition.covid_hosp.common.utils import Utils from delphi.epidata.acquisition.covid_hosp.state_timeseries.network import Network class Database(BaseDatabase): - TABLE_NAME = 'covid_hosp_state_timeseries' - KEY_COLS = ['state', 'date'] - # These are 3-tuples of (CSV header name, SQL db column name, data type) for - # all the columns in the CSV file. - # Note that the corresponding database column names may be shorter - # due to constraints on the length of column names. See - # /src/ddl/covid_hosp.sql for more information. - # Additionally, all column names below are shared with state_daily, - # except for reporting_cutoff_start (there) and date (here). If you need - # to update a column name, do it in both places. - ORDERED_CSV_COLUMNS = [ - Columndef('state', 'state', str), - Columndef('date', 'date', Utils.int_from_date), - Columndef('adult_icu_bed_covid_utilization', 'adult_icu_bed_covid_utilization', float), - Columndef('adult_icu_bed_covid_utilization_coverage', 'adult_icu_bed_covid_utilization_coverage', int), - Columndef('adult_icu_bed_covid_utilization_denominator', 'adult_icu_bed_covid_utilization_denominator', - int), - Columndef('adult_icu_bed_covid_utilization_numerator', 'adult_icu_bed_covid_utilization_numerator', - int), - Columndef('adult_icu_bed_utilization', 'adult_icu_bed_utilization', float), - Columndef('adult_icu_bed_utilization_coverage', 'adult_icu_bed_utilization_coverage', int), - Columndef('adult_icu_bed_utilization_denominator', 'adult_icu_bed_utilization_denominator', int), - Columndef('adult_icu_bed_utilization_numerator', 'adult_icu_bed_utilization_numerator', int), - Columndef('critical_staffing_shortage_anticipated_within_week_no', - 'critical_staffing_shortage_anticipated_within_week_no', int), - Columndef('critical_staffing_shortage_anticipated_within_week_not_reported', - 'critical_staffing_shortage_anticipated_within_week_not_reported', int), - Columndef('critical_staffing_shortage_anticipated_within_week_yes', - 'critical_staffing_shortage_anticipated_within_week_yes', int), - Columndef('critical_staffing_shortage_today_no', 'critical_staffing_shortage_today_no', int), - Columndef('critical_staffing_shortage_today_not_reported', - 'critical_staffing_shortage_today_not_reported', int), - Columndef('critical_staffing_shortage_today_yes', 'critical_staffing_shortage_today_yes', int), - Columndef('deaths_covid', 'deaths_covid', int), - Columndef('deaths_covid_coverage', 'deaths_covid_coverage', int), - Columndef('geocoded_state', 'geocoded_state', str), - Columndef('hospital_onset_covid', 
'hospital_onset_covid', int), - Columndef('hospital_onset_covid_coverage', 'hospital_onset_covid_coverage', int), - Columndef('icu_patients_confirmed_influenza', 'icu_patients_confirmed_influenza', int), - Columndef('icu_patients_confirmed_influenza_coverage', 'icu_patients_confirmed_influenza_coverage', - int), - Columndef('inpatient_bed_covid_utilization', 'inpatient_bed_covid_utilization', float), - Columndef('inpatient_bed_covid_utilization_coverage', 'inpatient_bed_covid_utilization_coverage', int), - Columndef('inpatient_bed_covid_utilization_denominator', 'inpatient_bed_covid_utilization_denominator', - int), - Columndef('inpatient_bed_covid_utilization_numerator', 'inpatient_bed_covid_utilization_numerator', - int), - Columndef('inpatient_beds', 'inpatient_beds', int), - Columndef('inpatient_beds_coverage', 'inpatient_beds_coverage', int), - Columndef('inpatient_beds_used', 'inpatient_beds_used', int), - Columndef('inpatient_beds_used_coverage', 'inpatient_beds_used_coverage', int), - Columndef('inpatient_beds_used_covid', 'inpatient_beds_used_covid', int), - Columndef('inpatient_beds_used_covid_coverage', 'inpatient_beds_used_covid_coverage', int), - Columndef('inpatient_beds_utilization', 'inpatient_beds_utilization', float), - Columndef('inpatient_beds_utilization_coverage', 'inpatient_beds_utilization_coverage', int), - Columndef('inpatient_beds_utilization_denominator', 'inpatient_beds_utilization_denominator', int), - Columndef('inpatient_beds_utilization_numerator', 'inpatient_beds_utilization_numerator', int), - Columndef('on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses', - 'on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses', int), - Columndef('on_hand_supply_therapeutic_b_bamlanivimab_courses', - 'on_hand_supply_therapeutic_b_bamlanivimab_courses', int), - Columndef('on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses', - 'on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses', int), - Columndef('percent_of_inpatients_with_covid', 'percent_of_inpatients_with_covid', float), - Columndef('percent_of_inpatients_with_covid_coverage', 'percent_of_inpatients_with_covid_coverage', - int), - Columndef('percent_of_inpatients_with_covid_denominator', - 'percent_of_inpatients_with_covid_denominator', int), - Columndef('percent_of_inpatients_with_covid_numerator', 'percent_of_inpatients_with_covid_numerator', - int), - Columndef('previous_day_admission_adult_covid_confirmed', - 'previous_day_admission_adult_covid_confirmed', int), - Columndef('previous_day_admission_adult_covid_confirmed_18-19', - 'previous_day_admission_adult_covid_confirmed_18_19', int), - Columndef('previous_day_admission_adult_covid_confirmed_18-19_coverage', - 'previous_day_admission_adult_covid_confirmed_18_19_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_20-29', - 'previous_day_admission_adult_covid_confirmed_20_29', int), - Columndef('previous_day_admission_adult_covid_confirmed_20-29_coverage', - 'previous_day_admission_adult_covid_confirmed_20_29_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_30-39', - 'previous_day_admission_adult_covid_confirmed_30_39', int), - Columndef('previous_day_admission_adult_covid_confirmed_30-39_coverage', - 'previous_day_admission_adult_covid_confirmed_30_39_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_40-49', - 'previous_day_admission_adult_covid_confirmed_40_49', int), - Columndef('previous_day_admission_adult_covid_confirmed_40-49_coverage', - 
'previous_day_admission_adult_covid_confirmed_40_49_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_50-59', - 'previous_day_admission_adult_covid_confirmed_50_59', int), - Columndef('previous_day_admission_adult_covid_confirmed_50-59_coverage', - 'previous_day_admission_adult_covid_confirmed_50_59_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_60-69', - 'previous_day_admission_adult_covid_confirmed_60_69', int), - Columndef('previous_day_admission_adult_covid_confirmed_60-69_coverage', - 'previous_day_admission_adult_covid_confirmed_60_69_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_70-79', - 'previous_day_admission_adult_covid_confirmed_70_79', int), - Columndef('previous_day_admission_adult_covid_confirmed_70-79_coverage', - 'previous_day_admission_adult_covid_confirmed_70_79_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_80+', - 'previous_day_admission_adult_covid_confirmed_80plus', int), - Columndef('previous_day_admission_adult_covid_confirmed_80+_coverage', - 'previous_day_admission_adult_covid_confirmed_80plus_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_coverage', - 'previous_day_admission_adult_covid_confirmed_coverage', int), - Columndef('previous_day_admission_adult_covid_confirmed_unknown', - 'previous_day_admission_adult_covid_confirmed_unknown', int), - Columndef('previous_day_admission_adult_covid_confirmed_unknown_coverage', - 'previous_day_admission_adult_covid_confirmed_unknown_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected', - 'previous_day_admission_adult_covid_suspected', int), - Columndef('previous_day_admission_adult_covid_suspected_18-19', - 'previous_day_admission_adult_covid_suspected_18_19', int), - Columndef('previous_day_admission_adult_covid_suspected_18-19_coverage', - 'previous_day_admission_adult_covid_suspected_18_19_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_20-29', - 'previous_day_admission_adult_covid_suspected_20_29', int), - Columndef('previous_day_admission_adult_covid_suspected_20-29_coverage', - 'previous_day_admission_adult_covid_suspected_20_29_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_30-39', - 'previous_day_admission_adult_covid_suspected_30_39', int), - Columndef('previous_day_admission_adult_covid_suspected_30-39_coverage', - 'previous_day_admission_adult_covid_suspected_30_39_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_40-49', - 'previous_day_admission_adult_covid_suspected_40_49', int), - Columndef('previous_day_admission_adult_covid_suspected_40-49_coverage', - 'previous_day_admission_adult_covid_suspected_40_49_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_50-59', - 'previous_day_admission_adult_covid_suspected_50_59', int), - Columndef('previous_day_admission_adult_covid_suspected_50-59_coverage', - 'previous_day_admission_adult_covid_suspected_50_59_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_60-69', - 'previous_day_admission_adult_covid_suspected_60_69', int), - Columndef('previous_day_admission_adult_covid_suspected_60-69_coverage', - 'previous_day_admission_adult_covid_suspected_60_69_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_70-79', - 'previous_day_admission_adult_covid_suspected_70_79', int), - Columndef('previous_day_admission_adult_covid_suspected_70-79_coverage', - 
'previous_day_admission_adult_covid_suspected_70_79_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_80+', - 'previous_day_admission_adult_covid_suspected_80plus', int), - Columndef('previous_day_admission_adult_covid_suspected_80+_coverage', - 'previous_day_admission_adult_covid_suspected_80plus_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_coverage', - 'previous_day_admission_adult_covid_suspected_coverage', int), - Columndef('previous_day_admission_adult_covid_suspected_unknown', - 'previous_day_admission_adult_covid_suspected_unknown', int), - Columndef('previous_day_admission_adult_covid_suspected_unknown_coverage', - 'previous_day_admission_adult_covid_suspected_unknown_coverage', int), - Columndef('previous_day_admission_influenza_confirmed', 'previous_day_admission_influenza_confirmed', - int), - Columndef('previous_day_admission_influenza_confirmed_coverage', - 'previous_day_admission_influenza_confirmed_coverage', int), - Columndef('previous_day_admission_pediatric_covid_confirmed', - 'previous_day_admission_pediatric_covid_confirmed', int), - Columndef('previous_day_admission_pediatric_covid_confirmed_coverage', - 'previous_day_admission_pediatric_covid_confirmed_coverage', int), - Columndef('previous_day_admission_pediatric_covid_suspected', - 'previous_day_admission_pediatric_covid_suspected', int), - Columndef('previous_day_admission_pediatric_covid_suspected_coverage', - 'previous_day_admission_pediatric_covid_suspected_coverage', int), - Columndef('previous_day_deaths_covid_and_influenza', 'previous_day_deaths_covid_and_influenza', int), - Columndef('previous_day_deaths_covid_and_influenza_coverage', - 'previous_day_deaths_covid_and_influenza_coverage', int), - Columndef('previous_day_deaths_influenza', 'previous_day_deaths_influenza', int), - Columndef('previous_day_deaths_influenza_coverage', 'previous_day_deaths_influenza_coverage', int), - Columndef('previous_week_therapeutic_a_casirivimab_imdevimab_courses_used', - 'previous_week_therapeutic_a_casirivimab_imdevimab_courses_used', int), - Columndef('previous_week_therapeutic_b_bamlanivimab_courses_used', - 'previous_week_therapeutic_b_bamlanivimab_courses_used', int), - Columndef('previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used', - 'previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used', int), - Columndef('staffed_adult_icu_bed_occupancy', 'staffed_adult_icu_bed_occupancy', int), - Columndef('staffed_adult_icu_bed_occupancy_coverage', 'staffed_adult_icu_bed_occupancy_coverage', int), - Columndef('staffed_icu_adult_patients_confirmed_and_suspected_covid', - 'staffed_icu_adult_patients_confirmed_suspected_covid', int), - Columndef('staffed_icu_adult_patients_confirmed_and_suspected_covid_coverage', - 'staffed_icu_adult_patients_confirmed_suspected_covid_coverage', int), - Columndef('staffed_icu_adult_patients_confirmed_covid', 'staffed_icu_adult_patients_confirmed_covid', - int), - Columndef('staffed_icu_adult_patients_confirmed_covid_coverage', - 'staffed_icu_adult_patients_confirmed_covid_coverage', int), - Columndef('total_adult_patients_hospitalized_confirmed_and_suspected_covid', - 'total_adult_patients_hosp_confirmed_suspected_covid', int), - Columndef('total_adult_patients_hospitalized_confirmed_and_suspected_covid_coverage', - 'total_adult_patients_hosp_confirmed_suspected_covid_coverage', int), - Columndef('total_adult_patients_hospitalized_confirmed_covid', - 'total_adult_patients_hosp_confirmed_covid', int), - 
Columndef('total_adult_patients_hospitalized_confirmed_covid_coverage', - 'total_adult_patients_hosp_confirmed_covid_coverage', int), - Columndef('total_patients_hospitalized_confirmed_influenza', - 'total_patients_hospitalized_confirmed_influenza', int), - Columndef('total_patients_hospitalized_confirmed_influenza_coverage', - 'total_patients_hospitalized_confirmed_influenza_coverage', int), - Columndef('total_patients_hospitalized_confirmed_influenza_covid', - 'total_patients_hospitalized_confirmed_influenza_covid', int), - Columndef('total_patients_hospitalized_confirmed_influenza_covid_coverage', - 'total_patients_hospitalized_confirmed_influenza_covid_coverage', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_and_suspected_covid', - 'total_pediatric_patients_hosp_confirmed_suspected_covid', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_coverage', - 'total_pediatric_patients_hosp_confirmed_suspected_covid_coverage', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_covid', - 'total_pediatric_patients_hosp_confirmed_covid', int), - Columndef('total_pediatric_patients_hospitalized_confirmed_covid_coverage', - 'total_pediatric_patients_hosp_confirmed_covid_coverage', int), - Columndef('total_staffed_adult_icu_beds', 'total_staffed_adult_icu_beds', int), - Columndef('total_staffed_adult_icu_beds_coverage', 'total_staffed_adult_icu_beds_coverage', int), - ] + TABLE_NAME = 'covid_hosp_state_timeseries' + KEY_COLS = ['state', 'date'] + # These are 3-tuples of (CSV header name, SQL db column name, data type) for + # all the columns in the CSV file. + # Note that the corresponding database column names may be shorter + # due to constraints on the length of column names. See + # /src/ddl/covid_hosp.sql for more information. + # Additionally, all column names below are shared with state_daily, + # except for reporting_cutoff_start (there) and date (here). If you need + # to update a column name, do it in both places. 
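As the comments note, state_daily and state_timeseries share the covid_hosp_state_timeseries table: both date-like CSV fields (reporting_cutoff_start and date) land in the SQL column date, and the record_type marker ('D' vs 'T' from additional_fields) tells the two streams apart. A hedged sketch of how a consumer might separate them follows; only the table and column names come from this diff, the query shape is an assumption:

# Assumed illustration: filtering the shared table by record_type.
QUERY = (
    "SELECT state, date, inpatient_beds_used_covid "
    "FROM covid_hosp_state_timeseries "
    "WHERE record_type = %s AND state = %s "
    "ORDER BY date"
)

def fetch_state_rows(cursor, record_type, state):
    # record_type 'D' -> rows loaded by state_daily
    # record_type 'T' -> rows loaded by state_timeseries
    cursor.execute(QUERY, (record_type, state))
    return cursor.fetchall()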
+ ORDERED_CSV_COLUMNS = [ + Columndef('state', 'state', str), + Columndef('date', 'date', Utils.int_from_date), + Columndef('adult_icu_bed_covid_utilization', 'adult_icu_bed_covid_utilization', float), + Columndef('adult_icu_bed_covid_utilization_coverage', 'adult_icu_bed_covid_utilization_coverage', int), + Columndef('adult_icu_bed_covid_utilization_denominator', 'adult_icu_bed_covid_utilization_denominator', int), + Columndef('adult_icu_bed_covid_utilization_numerator', 'adult_icu_bed_covid_utilization_numerator', int), + Columndef('adult_icu_bed_utilization', 'adult_icu_bed_utilization', float), + Columndef('adult_icu_bed_utilization_coverage', 'adult_icu_bed_utilization_coverage', int), + Columndef('adult_icu_bed_utilization_denominator', 'adult_icu_bed_utilization_denominator', int), + Columndef('adult_icu_bed_utilization_numerator', 'adult_icu_bed_utilization_numerator', int), + Columndef('critical_staffing_shortage_anticipated_within_week_no', 'critical_staffing_shortage_anticipated_within_week_no', int), + Columndef('critical_staffing_shortage_anticipated_within_week_not_reported', 'critical_staffing_shortage_anticipated_within_week_not_reported', int), + Columndef('critical_staffing_shortage_anticipated_within_week_yes', 'critical_staffing_shortage_anticipated_within_week_yes', int), + Columndef('critical_staffing_shortage_today_no', 'critical_staffing_shortage_today_no', int), + Columndef('critical_staffing_shortage_today_not_reported', 'critical_staffing_shortage_today_not_reported', int), + Columndef('critical_staffing_shortage_today_yes', 'critical_staffing_shortage_today_yes', int), + Columndef('deaths_covid', 'deaths_covid', int), + Columndef('deaths_covid_coverage', 'deaths_covid_coverage', int), + Columndef('geocoded_state', 'geocoded_state', str), + Columndef('hospital_onset_covid', 'hospital_onset_covid', int), + Columndef('hospital_onset_covid_coverage', 'hospital_onset_covid_coverage', int), + Columndef('icu_patients_confirmed_influenza', 'icu_patients_confirmed_influenza', int), + Columndef('icu_patients_confirmed_influenza_coverage', 'icu_patients_confirmed_influenza_coverage', int), + Columndef('inpatient_bed_covid_utilization', 'inpatient_bed_covid_utilization', float), + Columndef('inpatient_bed_covid_utilization_coverage', 'inpatient_bed_covid_utilization_coverage', int), + Columndef('inpatient_bed_covid_utilization_denominator', 'inpatient_bed_covid_utilization_denominator', int), + Columndef('inpatient_bed_covid_utilization_numerator', 'inpatient_bed_covid_utilization_numerator', int), + Columndef('inpatient_beds', 'inpatient_beds', int), + Columndef('inpatient_beds_coverage', 'inpatient_beds_coverage', int), + Columndef('inpatient_beds_used', 'inpatient_beds_used', int), + Columndef('inpatient_beds_used_coverage', 'inpatient_beds_used_coverage', int), + Columndef('inpatient_beds_used_covid', 'inpatient_beds_used_covid', int), + Columndef('inpatient_beds_used_covid_coverage', 'inpatient_beds_used_covid_coverage', int), + Columndef('inpatient_beds_utilization', 'inpatient_beds_utilization', float), + Columndef('inpatient_beds_utilization_coverage', 'inpatient_beds_utilization_coverage', int), + Columndef('inpatient_beds_utilization_denominator', 'inpatient_beds_utilization_denominator', int), + Columndef('inpatient_beds_utilization_numerator', 'inpatient_beds_utilization_numerator', int), + Columndef('on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses', 'on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses', int), + 
Columndef('on_hand_supply_therapeutic_b_bamlanivimab_courses', 'on_hand_supply_therapeutic_b_bamlanivimab_courses', int), + Columndef('on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses', 'on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses', int), + Columndef('percent_of_inpatients_with_covid', 'percent_of_inpatients_with_covid', float), + Columndef('percent_of_inpatients_with_covid_coverage', 'percent_of_inpatients_with_covid_coverage', int), + Columndef('percent_of_inpatients_with_covid_denominator', 'percent_of_inpatients_with_covid_denominator', int), + Columndef('percent_of_inpatients_with_covid_numerator', 'percent_of_inpatients_with_covid_numerator', int), + Columndef('previous_day_admission_adult_covid_confirmed', 'previous_day_admission_adult_covid_confirmed', int), + Columndef('previous_day_admission_adult_covid_confirmed_18-19', 'previous_day_admission_adult_covid_confirmed_18_19', int), + Columndef('previous_day_admission_adult_covid_confirmed_18-19_coverage', 'previous_day_admission_adult_covid_confirmed_18_19_coverage', int), + Columndef('previous_day_admission_adult_covid_confirmed_20-29', 'previous_day_admission_adult_covid_confirmed_20_29', int), + Columndef('previous_day_admission_adult_covid_confirmed_20-29_coverage', 'previous_day_admission_adult_covid_confirmed_20_29_coverage', int), + Columndef('previous_day_admission_adult_covid_confirmed_30-39', 'previous_day_admission_adult_covid_confirmed_30_39', int), + Columndef('previous_day_admission_adult_covid_confirmed_30-39_coverage', 'previous_day_admission_adult_covid_confirmed_30_39_coverage', int), + Columndef('previous_day_admission_adult_covid_confirmed_40-49', 'previous_day_admission_adult_covid_confirmed_40_49', int), + Columndef('previous_day_admission_adult_covid_confirmed_40-49_coverage', 'previous_day_admission_adult_covid_confirmed_40_49_coverage', int), + Columndef('previous_day_admission_adult_covid_confirmed_50-59', 'previous_day_admission_adult_covid_confirmed_50_59', int), + Columndef('previous_day_admission_adult_covid_confirmed_50-59_coverage', 'previous_day_admission_adult_covid_confirmed_50_59_coverage', int), + Columndef('previous_day_admission_adult_covid_confirmed_60-69', 'previous_day_admission_adult_covid_confirmed_60_69', int), + Columndef('previous_day_admission_adult_covid_confirmed_60-69_coverage', 'previous_day_admission_adult_covid_confirmed_60_69_coverage', int), + Columndef('previous_day_admission_adult_covid_confirmed_70-79', 'previous_day_admission_adult_covid_confirmed_70_79', int), + Columndef('previous_day_admission_adult_covid_confirmed_70-79_coverage', 'previous_day_admission_adult_covid_confirmed_70_79_coverage', int), + Columndef('previous_day_admission_adult_covid_confirmed_80+', 'previous_day_admission_adult_covid_confirmed_80plus', int), + Columndef('previous_day_admission_adult_covid_confirmed_80+_coverage', 'previous_day_admission_adult_covid_confirmed_80plus_coverage', int), + Columndef('previous_day_admission_adult_covid_confirmed_coverage', 'previous_day_admission_adult_covid_confirmed_coverage', int), + Columndef('previous_day_admission_adult_covid_confirmed_unknown', 'previous_day_admission_adult_covid_confirmed_unknown', int), + Columndef('previous_day_admission_adult_covid_confirmed_unknown_coverage', 'previous_day_admission_adult_covid_confirmed_unknown_coverage', int), + Columndef('previous_day_admission_adult_covid_suspected', 'previous_day_admission_adult_covid_suspected', int), + Columndef('previous_day_admission_adult_covid_suspected_18-19', 
'previous_day_admission_adult_covid_suspected_18_19', int), + Columndef('previous_day_admission_adult_covid_suspected_18-19_coverage', 'previous_day_admission_adult_covid_suspected_18_19_coverage', int), + Columndef('previous_day_admission_adult_covid_suspected_20-29', 'previous_day_admission_adult_covid_suspected_20_29', int), + Columndef('previous_day_admission_adult_covid_suspected_20-29_coverage', 'previous_day_admission_adult_covid_suspected_20_29_coverage', int), + Columndef('previous_day_admission_adult_covid_suspected_30-39', 'previous_day_admission_adult_covid_suspected_30_39', int), + Columndef('previous_day_admission_adult_covid_suspected_30-39_coverage', 'previous_day_admission_adult_covid_suspected_30_39_coverage', int), + Columndef('previous_day_admission_adult_covid_suspected_40-49', 'previous_day_admission_adult_covid_suspected_40_49', int), + Columndef('previous_day_admission_adult_covid_suspected_40-49_coverage', 'previous_day_admission_adult_covid_suspected_40_49_coverage', int), + Columndef('previous_day_admission_adult_covid_suspected_50-59', 'previous_day_admission_adult_covid_suspected_50_59', int), + Columndef('previous_day_admission_adult_covid_suspected_50-59_coverage', 'previous_day_admission_adult_covid_suspected_50_59_coverage', int), + Columndef('previous_day_admission_adult_covid_suspected_60-69', 'previous_day_admission_adult_covid_suspected_60_69', int), + Columndef('previous_day_admission_adult_covid_suspected_60-69_coverage', 'previous_day_admission_adult_covid_suspected_60_69_coverage', int), + Columndef('previous_day_admission_adult_covid_suspected_70-79', 'previous_day_admission_adult_covid_suspected_70_79', int), + Columndef('previous_day_admission_adult_covid_suspected_70-79_coverage', 'previous_day_admission_adult_covid_suspected_70_79_coverage', int), + Columndef('previous_day_admission_adult_covid_suspected_80+', 'previous_day_admission_adult_covid_suspected_80plus', int), + Columndef('previous_day_admission_adult_covid_suspected_80+_coverage', 'previous_day_admission_adult_covid_suspected_80plus_coverage', int), + Columndef('previous_day_admission_adult_covid_suspected_coverage', 'previous_day_admission_adult_covid_suspected_coverage', int), + Columndef('previous_day_admission_adult_covid_suspected_unknown', 'previous_day_admission_adult_covid_suspected_unknown', int), + Columndef('previous_day_admission_adult_covid_suspected_unknown_coverage', 'previous_day_admission_adult_covid_suspected_unknown_coverage', int), + Columndef('previous_day_admission_influenza_confirmed', 'previous_day_admission_influenza_confirmed', int), + Columndef('previous_day_admission_influenza_confirmed_coverage', 'previous_day_admission_influenza_confirmed_coverage', int), + Columndef('previous_day_admission_pediatric_covid_confirmed', 'previous_day_admission_pediatric_covid_confirmed', int), + Columndef('previous_day_admission_pediatric_covid_confirmed_coverage', 'previous_day_admission_pediatric_covid_confirmed_coverage', int), + Columndef('previous_day_admission_pediatric_covid_suspected', 'previous_day_admission_pediatric_covid_suspected', int), + Columndef('previous_day_admission_pediatric_covid_suspected_coverage', 'previous_day_admission_pediatric_covid_suspected_coverage', int), + Columndef('previous_day_deaths_covid_and_influenza', 'previous_day_deaths_covid_and_influenza', int), + Columndef('previous_day_deaths_covid_and_influenza_coverage', 'previous_day_deaths_covid_and_influenza_coverage', int), + Columndef('previous_day_deaths_influenza', 
'previous_day_deaths_influenza', int), + Columndef('previous_day_deaths_influenza_coverage', 'previous_day_deaths_influenza_coverage', int), + Columndef('previous_week_therapeutic_a_casirivimab_imdevimab_courses_used', 'previous_week_therapeutic_a_casirivimab_imdevimab_courses_used', int), + Columndef('previous_week_therapeutic_b_bamlanivimab_courses_used', 'previous_week_therapeutic_b_bamlanivimab_courses_used', int), + Columndef( + 'previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used', + 'previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used', + int + ), + Columndef('staffed_adult_icu_bed_occupancy', 'staffed_adult_icu_bed_occupancy', int), + Columndef('staffed_adult_icu_bed_occupancy_coverage', 'staffed_adult_icu_bed_occupancy_coverage', int), + Columndef('staffed_icu_adult_patients_confirmed_and_suspected_covid', 'staffed_icu_adult_patients_confirmed_suspected_covid', int), + Columndef('staffed_icu_adult_patients_confirmed_and_suspected_covid_coverage', 'staffed_icu_adult_patients_confirmed_suspected_covid_coverage', int), + Columndef('staffed_icu_adult_patients_confirmed_covid', 'staffed_icu_adult_patients_confirmed_covid', int), + Columndef('staffed_icu_adult_patients_confirmed_covid_coverage', 'staffed_icu_adult_patients_confirmed_covid_coverage', int), + Columndef('total_adult_patients_hospitalized_confirmed_and_suspected_covid', 'total_adult_patients_hosp_confirmed_suspected_covid', int), + Columndef( + 'total_adult_patients_hospitalized_confirmed_and_suspected_covid_coverage', + 'total_adult_patients_hosp_confirmed_suspected_covid_coverage', + int + ), + Columndef('total_adult_patients_hospitalized_confirmed_covid', 'total_adult_patients_hosp_confirmed_covid', int), + Columndef('total_adult_patients_hospitalized_confirmed_covid_coverage', 'total_adult_patients_hosp_confirmed_covid_coverage', int), + Columndef('total_patients_hospitalized_confirmed_influenza', 'total_patients_hospitalized_confirmed_influenza', int), + Columndef('total_patients_hospitalized_confirmed_influenza_coverage', 'total_patients_hospitalized_confirmed_influenza_coverage', int), + Columndef('total_patients_hospitalized_confirmed_influenza_covid', 'total_patients_hospitalized_confirmed_influenza_covid', int), + Columndef('total_patients_hospitalized_confirmed_influenza_covid_coverage', 'total_patients_hospitalized_confirmed_influenza_covid_coverage', int), + Columndef('total_pediatric_patients_hospitalized_confirmed_and_suspected_covid', 'total_pediatric_patients_hosp_confirmed_suspected_covid', int), + Columndef( + 'total_pediatric_patients_hospitalized_confirmed_and_suspected_covid_coverage', + 'total_pediatric_patients_hosp_confirmed_suspected_covid_coverage', + int + ), + Columndef('total_pediatric_patients_hospitalized_confirmed_covid', 'total_pediatric_patients_hosp_confirmed_covid', int), + Columndef('total_pediatric_patients_hospitalized_confirmed_covid_coverage', 'total_pediatric_patients_hosp_confirmed_covid_coverage', int), + Columndef('total_staffed_adult_icu_beds', 'total_staffed_adult_icu_beds', int), + Columndef('total_staffed_adult_icu_beds_coverage', 'total_staffed_adult_icu_beds_coverage', int), + ] - def __init__(self, *args, **kwargs): - super().__init__( - *args, - **kwargs, - table_name=Database.TABLE_NAME, - hhs_dataset_id=Network.DATASET_ID, - columns_and_types=Database.ORDERED_CSV_COLUMNS, - key_columns=Database.KEY_COLS, - additional_fields=[Columndef('T', 'record_type', None)]) + def __init__(self, *args, **kwargs): + super().__init__( + *args, + **kwargs, + 
table_name=Database.TABLE_NAME, + hhs_dataset_id=Network.DATASET_ID, + columns_and_types=Database.ORDERED_CSV_COLUMNS, + key_columns=Database.KEY_COLS, + additional_fields=[Columndef('T', 'record_type', None)]) diff --git a/src/acquisition/covid_hosp/state_timeseries/network.py b/src/acquisition/covid_hosp/state_timeseries/network.py index 7bd5082a8..a074154aa 100644 --- a/src/acquisition/covid_hosp/state_timeseries/network.py +++ b/src/acquisition/covid_hosp/state_timeseries/network.py @@ -1,17 +1,16 @@ -# first party from delphi.epidata.acquisition.covid_hosp.common.network import Network as BaseNetwork class Network(BaseNetwork): - DATASET_ID = 'g62h-syeh' - METADATA_ID = 'qqte-vkut' + DATASET_ID = 'g62h-syeh' + METADATA_ID = 'qqte-vkut' - def fetch_metadata(*args, **kwags): - """Download and return metadata. + def fetch_metadata(*args, **kwags): + """Download and return metadata. - See `fetch_metadata_for_dataset`. - """ + See `fetch_metadata_for_dataset`. + """ - return Network.fetch_metadata_for_dataset( - *args, **kwags, dataset_id=Network.METADATA_ID) + return Network.fetch_metadata_for_dataset( + *args, **kwags, dataset_id=Network.METADATA_ID) diff --git a/src/acquisition/covid_hosp/state_timeseries/update.py b/src/acquisition/covid_hosp/state_timeseries/update.py index 7c8e79941..1cf85b4d9 100644 --- a/src/acquisition/covid_hosp/state_timeseries/update.py +++ b/src/acquisition/covid_hosp/state_timeseries/update.py @@ -3,8 +3,6 @@ Timeseries" dataset provided by the US Department of Health & Human Services via healthdata.gov. """ - -# first party from delphi.epidata.acquisition.covid_hosp.common.utils import Utils from delphi.epidata.acquisition.covid_hosp.state_timeseries.database import Database from delphi.epidata.acquisition.covid_hosp.state_timeseries.network import Network @@ -12,16 +10,16 @@ class Update: - def run(network=Network): - """Acquire the most recent dataset, unless it was previously acquired. + def run(network=Network): + """Acquire the most recent dataset, unless it was previously acquired. - Returns - ------- - bool - Whether a new dataset was acquired. - """ + Returns + ------- + bool + Whether a new dataset was acquired. 
+ """ - return Utils.update_dataset(Database, network) + return Utils.update_dataset(Database, network) # main entry point diff --git a/src/acquisition/covidcast/covidcast_meta_cache_updater.py b/src/acquisition/covidcast/covidcast_meta_cache_updater.py index a46345b62..1445482a3 100644 --- a/src/acquisition/covidcast/covidcast_meta_cache_updater.py +++ b/src/acquisition/covidcast/covidcast_meta_cache_updater.py @@ -1,83 +1,82 @@ """Updates the cache for the `covidcast_meta` endpiont.""" -# standard library + import argparse import sys import time -# first party from delphi.epidata.acquisition.covidcast.database import Database from delphi.epidata.acquisition.covidcast.logger import get_structured_logger from delphi.epidata.client.delphi_epidata import Epidata + def get_argument_parser(): - """Define command line arguments.""" + """Define command line arguments.""" - parser = argparse.ArgumentParser() - parser.add_argument("--log_file", help="filename for log output") - parser.add_argument("--num_threads", type=int, help="number of worker threads to spawn for processing source/signal pairs") - return parser + parser = argparse.ArgumentParser() + parser.add_argument("--log_file", help="filename for log output") + parser.add_argument("--num_threads", type=int, help="number of worker threads to spawn for processing source/signal pairs") + return parser def main(args, epidata_impl=Epidata, database_impl=Database): - """Update the covidcast metadata cache. - - `args`: parsed command-line arguments - """ - log_file = None - num_threads = None - if (args): - log_file = args.log_file - num_threads = args.num_threads - - logger = get_structured_logger( - "metadata_cache_updater", - filename=log_file) - start_time = time.time() - database = database_impl() - database.connect() - - # fetch metadata - try: - metadata_calculation_start_time = time.time() - metadata = database.compute_covidcast_meta(n_threads=num_threads) - metadata_calculation_interval_in_seconds = time.time() - metadata_calculation_start_time - except: - # clean up before failing - database.disconnect(True) - raise - - args = ("success",1) - if len(metadata)==0: - args = ("no results",-2) - - logger.info('covidcast_meta result: %s (code %d)' % args) - - if args[-1] != 1: - logger.error('unable to cache epidata') - return False - - # update the cache - try: - metadata_update_start_time = time.time() - database.update_covidcast_meta_cache(metadata) - metadata_update_interval_in_seconds = time.time() - metadata_update_start_time - logger.info('successfully cached epidata') - finally: - # no catch block so that an exception above will cause the program to - # fail after the following cleanup - database.disconnect(True) - - logger.info( - "Generated and updated covidcast metadata", - metadata_calculation_interval_in_seconds=round( - metadata_calculation_interval_in_seconds, 2), - metadata_update_interval_in_seconds=round( - metadata_update_interval_in_seconds, 2), - total_runtime_in_seconds=round(time.time() - start_time, 2)) - return True + """Update the covidcast metadata cache. 
+ + `args`: parsed command-line arguments + """ + log_file = None + num_threads = None + if (args): + log_file = args.log_file + num_threads = args.num_threads + + logger = get_structured_logger( + "metadata_cache_updater", + filename=log_file + ) + start_time = time.time() + database = database_impl() + database.connect() + + # fetch metadata + try: + metadata_calculation_start_time = time.time() + metadata = database.compute_covidcast_meta(n_threads=num_threads) + metadata_calculation_interval_in_seconds = time.time() - metadata_calculation_start_time + except: # noqa + # clean up before failing + database.disconnect(True) + raise + + args = ("success", 1) + if len(metadata) == 0: + args = ("no results", -2) + + logger.info('covidcast_meta result: %s (code %d)' % args) + + if args[-1] != 1: + logger.error('unable to cache epidata') + return False + + # update the cache + try: + metadata_update_start_time = time.time() + database.update_covidcast_meta_cache(metadata) + metadata_update_interval_in_seconds = time.time() - metadata_update_start_time + logger.info('successfully cached epidata') + finally: + # no catch block so that an exception above will cause the program to + # fail after the following cleanup + database.disconnect(True) + + logger.info( + "Generated and updated covidcast metadata", + metadata_calculation_interval_in_seconds=round(metadata_calculation_interval_in_seconds, 2), + metadata_update_interval_in_seconds=round(metadata_update_interval_in_seconds, 2), + total_runtime_in_seconds=round(time.time() - start_time, 2)) + return True if __name__ == '__main__': - if not main(get_argument_parser().parse_args()): - sys.exit(1) + if not main(get_argument_parser().parse_args()): + sys.exit(1) diff --git a/src/acquisition/covidcast/covidcast_row.py b/src/acquisition/covidcast/covidcast_row.py index 23e19eb57..ca877cc56 100644 --- a/src/acquisition/covidcast/covidcast_row.py +++ b/src/acquisition/covidcast/covidcast_row.py @@ -3,7 +3,6 @@ import pandas as pd - PANDAS_DTYPES = { "source": str, "signal": str, @@ -25,6 +24,7 @@ "value_updated_timestamp": "Int64", } + @dataclass class CovidcastRow: """A container for the values of a single covidcast database row. 
@@ -72,7 +72,7 @@ def as_dict(self, ignore_fields: Optional[List[str]] = None) -> dict: for key in ignore_fields: del d[key] return d - + def as_api_row_dict(self, ignore_fields: Optional[List[str]] = None) -> dict: """Returns a dict view into the row with the fields returned by the API server.""" return self.as_dict(ignore_fields=self._api_row_ignore_fields + (ignore_fields or [])) @@ -113,7 +113,6 @@ def time_pair(self): return f"{self.time_type}:{self.time_value}" - def check_valid_dtype(dtype): try: pd.api.types.pandas_dtype(dtype) diff --git a/src/acquisition/covidcast/csv_importer.py b/src/acquisition/covidcast/csv_importer.py index 0fa936802..8249eb35a 100644 --- a/src/acquisition/covidcast/csv_importer.py +++ b/src/acquisition/covidcast/csv_importer.py @@ -1,6 +1,6 @@ """Collects and reads covidcast data from a set of local CSV files.""" -# standard library + import os import re from dataclasses import dataclass @@ -8,409 +8,395 @@ from glob import glob from typing import Iterator, NamedTuple, Optional, Tuple -# third party import epiweeks as epi import pandas as pd - -# first party -from delphi_utils import Nans -from delphi.utils.epiweek import delta_epiweeks from delphi.epidata.acquisition.covidcast.covidcast_row import CovidcastRow from delphi.epidata.acquisition.covidcast.logger import get_structured_logger +from delphi.utils.epiweek import delta_epiweeks +from delphi_utils import Nans DataFrameRow = NamedTuple('DFRow', [ - ('geo_id', str), - ('value', float), - ('stderr', float), - ('sample_size', float), - ('missing_value', int), - ('missing_stderr', int), - ('missing_sample_size', int) + ('geo_id', str), + ('value', float), + ('stderr', float), + ('sample_size', float), + ('missing_value', int), + ('missing_stderr', int), + ('missing_sample_size', int) ]) PathDetails = NamedTuple('PathDetails', [ - ('issue', int), - ('lag', int), - ('source', str), - ("signal", str), - ('time_type', str), - ('time_value', int), - ('geo_type', str), + ('issue', int), + ('lag', int), + ('source', str), + ("signal", str), + ('time_type', str), + ('time_value', int), + ('geo_type', str), ]) @dataclass class CsvRowValue: - """A container for the values of a single validated covidcast CSV row.""" - geo_value: str - value: float - stderr: float - sample_size: float - missing_value: int - missing_stderr: int - missing_sample_size: int + """A container for the values of a single validated covidcast CSV row.""" + geo_value: str + value: float + stderr: float + sample_size: float + missing_value: int + missing_stderr: int + missing_sample_size: int class CsvImporter: - """Finds and parses covidcast CSV files.""" - - # .../source/yyyymmdd_geo_signal.csv - PATTERN_DAILY = re.compile(r'^.*/([^/]*)/(\d{8})_(\w+?)_(\w+)\.csv$') - - # .../source/weekly_yyyyww_geo_signal.csv - PATTERN_WEEKLY = re.compile(r'^.*/([^/]*)/weekly_(\d{6})_(\w+?)_(\w+)\.csv$') - - # .../issue_yyyymmdd - PATTERN_ISSUE_DIR = re.compile(r'^.*/([^/]*)/issue_(\d{8})$') - - # set of allowed resolutions (aka "geo_type") - GEOGRAPHIC_RESOLUTIONS = {'county', 'hrr', 'msa', 'dma', 'state', 'hhs', 'nation'} - - # set of required CSV columns - REQUIRED_COLUMNS = {'geo_id', 'val', 'se', 'sample_size'} - - # reasonable time bounds for sanity checking time values - MIN_YEAR = 2019 - MAX_YEAR = 2030 - - # The datatypes expected by pandas.read_csv. Int64 is like float in that it can handle both numbers and nans. 
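As an aside to the comment above, a minimal standalone sketch (hypothetical CSV contents, not part of this changeset) of why the nullable "Int64" dtype is used for the missing_* columns: like float, it tolerates blank cells (stored as <NA>) while still holding true integers.

import io

import pandas as pd

# Hypothetical one-row CSV with a blank missing_se cell.
csv_text = (
    "geo_id,val,se,sample_size,missing_val,missing_se,missing_sample_size\n"
    "01000,1.5,0.1,100,0,,0\n"
)
dtypes = {
    "geo_id": str,
    "val": float,
    "se": float,
    "sample_size": float,
    "missing_val": "Int64",
    "missing_se": "Int64",
    "missing_sample_size": "Int64",
}
table = pd.read_csv(io.StringIO(csv_text), dtype=dtypes)
print(table.dtypes)            # missing_* columns report Int64
print(table["missing_se"][0])  # <NA> rather than a float NaN
print(table["geo_id"][0])      # '01000' keeps its leading zero as a str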
- DTYPES = { - "geo_id": str, - "val": float, - "se": float, - "sample_size": float, - "missing_val": "Int64", - "missing_se": "Int64", - "missing_sample_size": "Int64" - } - - - @staticmethod - def is_sane_day(value): - """Return whether `value` is a sane (maybe not valid) YYYYMMDD date. - - Truthy return is is a datetime.date object representing `value`.""" - - year, month, day = value // 10000, (value % 10000) // 100, value % 100 - - nearby_year = CsvImporter.MIN_YEAR <= year <= CsvImporter.MAX_YEAR - valid_month = 1 <= month <= 12 - sensible_day = 1 <= day <= 31 - - if not (nearby_year and valid_month and sensible_day): - return False - return date(year=year,month=month,day=day) - - - @staticmethod - def is_sane_week(value): - """Return whether `value` is a sane (maybe not valid) YYYYWW epiweek. - - Truthy return is `value`.""" - - year, week = value // 100, value % 100 - - nearby_year = CsvImporter.MIN_YEAR <= year <= CsvImporter.MAX_YEAR - sensible_week = 1 <= week <= 53 - - if not (nearby_year and sensible_week): - return False - return value - - - @staticmethod - def find_issue_specific_csv_files(scan_dir): - logger = get_structured_logger('find_issue_specific_csv_files') - for path in sorted(glob(os.path.join(scan_dir, '*'))): - issuedir_match = CsvImporter.PATTERN_ISSUE_DIR.match(path.lower()) - if issuedir_match and os.path.isdir(path): - issue_date_value = int(issuedir_match.group(2)) - issue_date = CsvImporter.is_sane_day(issue_date_value) - if issue_date: - logger.info(event='processing csv files from issue', detail=issue_date, file=path) - yield from CsvImporter.find_csv_files(path, issue=(issue_date, epi.Week.fromdate(issue_date))) + """Finds and parses covidcast CSV files.""" + + # .../source/yyyymmdd_geo_signal.csv + PATTERN_DAILY = re.compile(r'^.*/([^/]*)/(\d{8})_(\w+?)_(\w+)\.csv$') + + # .../source/weekly_yyyyww_geo_signal.csv + PATTERN_WEEKLY = re.compile(r'^.*/([^/]*)/weekly_(\d{6})_(\w+?)_(\w+)\.csv$') + + # .../issue_yyyymmdd + PATTERN_ISSUE_DIR = re.compile(r'^.*/([^/]*)/issue_(\d{8})$') + + # set of allowed resolutions (aka "geo_type") + GEOGRAPHIC_RESOLUTIONS = {'county', 'hrr', 'msa', 'dma', 'state', 'hhs', 'nation'} + + # set of required CSV columns + REQUIRED_COLUMNS = {'geo_id', 'val', 'se', 'sample_size'} + + # reasonable time bounds for sanity checking time values + MIN_YEAR = 2019 + MAX_YEAR = 2030 + + # The datatypes expected by pandas.read_csv. Int64 is like float in that it can handle both numbers and nans. + DTYPES = { + "geo_id": str, + "val": float, + "se": float, + "sample_size": float, + "missing_val": "Int64", + "missing_se": "Int64", + "missing_sample_size": "Int64" + } + + @staticmethod + def is_sane_day(value): + """Return whether `value` is a sane (maybe not valid) YYYYMMDD date. + + Truthy return is is a datetime.date object representing `value`.""" + + year, month, day = value // 10000, (value % 10000) // 100, value % 100 + + nearby_year = CsvImporter.MIN_YEAR <= year <= CsvImporter.MAX_YEAR + valid_month = 1 <= month <= 12 + sensible_day = 1 <= day <= 31 + + if not (nearby_year and valid_month and sensible_day): + return False + return date(year=year, month=month, day=day) + + @staticmethod + def is_sane_week(value): + """Return whether `value` is a sane (maybe not valid) YYYYWW epiweek. 
+ + Truthy return is `value`.""" + + year, week = value // 100, value % 100 + + nearby_year = CsvImporter.MIN_YEAR <= year <= CsvImporter.MAX_YEAR + sensible_week = 1 <= week <= 53 + + if not (nearby_year and sensible_week): + return False + return value + + @staticmethod + def find_issue_specific_csv_files(scan_dir): + logger = get_structured_logger('find_issue_specific_csv_files') + for path in sorted(glob(os.path.join(scan_dir, '*'))): + issuedir_match = CsvImporter.PATTERN_ISSUE_DIR.match(path.lower()) + if issuedir_match and os.path.isdir(path): + issue_date_value = int(issuedir_match.group(2)) + issue_date = CsvImporter.is_sane_day(issue_date_value) + if issue_date: + logger.info(event='processing csv files from issue', detail=issue_date, file=path) + yield from CsvImporter.find_csv_files(path, issue=(issue_date, epi.Week.fromdate(issue_date))) + else: + logger.warning(event='invalid issue directory day', detail=issue_date_value, file=path) + + @staticmethod + def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today()))): + """Recursively search for and yield covidcast-format CSV files. + + scan_dir: the directory to scan (recursively) + + The return value is a tuple of (path, details), where, if the path was + valid, details is a tuple of (source, signal, time_type, geo_type, + time_value, issue, lag) (otherwise None). + """ + logger = get_structured_logger('find_csv_files') + issue_day, issue_epiweek = issue + issue_day_value = int(issue_day.strftime("%Y%m%d")) + issue_epiweek_value = int(str(issue_epiweek)) + issue_value = -1 + lag_value = -1 + + for path in sorted(glob(os.path.join(scan_dir, '*', '*'))): + # safe to ignore this file + if not path.lower().endswith('.csv'): + continue + + # match a daily or weekly naming pattern + daily_match = CsvImporter.PATTERN_DAILY.match(path.lower()) + weekly_match = CsvImporter.PATTERN_WEEKLY.match(path.lower()) + if not daily_match and not weekly_match: + logger.warning(event='invalid csv path/filename', detail=path, file=path) + yield (path, None) + continue + + # extract and validate time resolution + if daily_match: + time_type = 'day' + time_value = int(daily_match.group(2)) + match = daily_match + time_value_day = CsvImporter.is_sane_day(time_value) + if not time_value_day: + logger.warning(event='invalid filename day', detail=time_value, file=path) + yield (path, None) + continue + issue_value = issue_day_value + lag_value = (issue_day-time_value_day).days + else: + time_type = 'week' + time_value = int(weekly_match.group(2)) + match = weekly_match + time_value_week = CsvImporter.is_sane_week(time_value) + if not time_value_week: + logger.warning(event='invalid filename week', detail=time_value, file=path) + yield (path, None) + continue + issue_value = issue_epiweek_value + lag_value = delta_epiweeks(time_value_week, issue_epiweek_value) + + # extract and validate geographic resolution + geo_type = match.group(3).lower() + if geo_type not in CsvImporter.GEOGRAPHIC_RESOLUTIONS: + logger.warning(event='invalid geo_type', detail=geo_type, file=path) + yield (path, None) + continue + + # extract additional values, lowercased for consistency + source = match.group(1).lower() + signal = match.group(4).lower() + if len(signal) > 64: + logger.warning(event='invalid signal name (64 char limit)', detail=signal, file=path) + yield (path, None) + continue + + yield (path, PathDetails(issue_value, lag_value, source, signal, time_type, time_value, geo_type)) + + @staticmethod + def is_header_valid(columns): + """Return 
whether the given pandas columns contains the required fields.""" + + return set(columns) >= CsvImporter.REQUIRED_COLUMNS + + @staticmethod + def floaty_int(value: str) -> int: + """Cast a string to an int, even if it looks like a float. + + For example, "-1" and "-1.0" should both result in -1. Non-integer floats + will cause `ValueError` to be reaised. + """ + + float_value = float(value) + if not float_value.is_integer(): + raise ValueError('not an int: "%s"' % str(value)) + return int(float_value) + + @staticmethod + def maybe_apply(func, quantity): + """Apply the given function to the given quantity if not null-ish.""" + if str(quantity).lower() in ('inf', '-inf'): + raise ValueError("Quantity given was an inf.") + elif str(quantity).lower() in ('', 'na', 'nan', 'none'): + return None else: - logger.warning(event='invalid issue directory day', detail=issue_date_value, file=path) - - - @staticmethod - def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today()))): - """Recursively search for and yield covidcast-format CSV files. - - scan_dir: the directory to scan (recursively) - - The return value is a tuple of (path, details), where, if the path was - valid, details is a tuple of (source, signal, time_type, geo_type, - time_value, issue, lag) (otherwise None). - """ - logger = get_structured_logger('find_csv_files') - issue_day,issue_epiweek=issue - issue_day_value=int(issue_day.strftime("%Y%m%d")) - issue_epiweek_value=int(str(issue_epiweek)) - issue_value=-1 - lag_value=-1 - - for path in sorted(glob(os.path.join(scan_dir, '*', '*'))): - # safe to ignore this file - if not path.lower().endswith('.csv'): - continue - - # match a daily or weekly naming pattern - daily_match = CsvImporter.PATTERN_DAILY.match(path.lower()) - weekly_match = CsvImporter.PATTERN_WEEKLY.match(path.lower()) - if not daily_match and not weekly_match: - logger.warning(event='invalid csv path/filename', detail=path, file=path) - yield (path, None) - continue - - # extract and validate time resolution - if daily_match: - time_type = 'day' - time_value = int(daily_match.group(2)) - match = daily_match - time_value_day = CsvImporter.is_sane_day(time_value) - if not time_value_day: - logger.warning(event='invalid filename day', detail=time_value, file=path) - yield (path, None) - continue - issue_value=issue_day_value - lag_value=(issue_day-time_value_day).days - else: - time_type = 'week' - time_value = int(weekly_match.group(2)) - match = weekly_match - time_value_week=CsvImporter.is_sane_week(time_value) - if not time_value_week: - logger.warning(event='invalid filename week', detail=time_value, file=path) - yield (path, None) - continue - issue_value=issue_epiweek_value - lag_value=delta_epiweeks(time_value_week, issue_epiweek_value) - - # # extract and validate geographic resolution - geo_type = match.group(3).lower() - if geo_type not in CsvImporter.GEOGRAPHIC_RESOLUTIONS: - logger.warning(event='invalid geo_type', detail=geo_type, file=path) - yield (path, None) - continue - - # extract additional values, lowercased for consistency - source = match.group(1).lower() - signal = match.group(4).lower() - if len(signal) > 64: - logger.warning(event='invalid signal name (64 char limit)',detail=signal, file=path) - yield (path, None) - continue - - yield (path, PathDetails(issue_value, lag_value, source, signal, time_type, time_value, geo_type)) - - - @staticmethod - def is_header_valid(columns): - """Return whether the given pandas columns contains the required fields.""" - - return 
set(columns) >= CsvImporter.REQUIRED_COLUMNS - - - @staticmethod - def floaty_int(value: str) -> int: - """Cast a string to an int, even if it looks like a float. - - For example, "-1" and "-1.0" should both result in -1. Non-integer floats - will cause `ValueError` to be reaised. - """ - - float_value = float(value) - if not float_value.is_integer(): - raise ValueError('not an int: "%s"' % str(value)) - return int(float_value) - - - @staticmethod - def maybe_apply(func, quantity): - """Apply the given function to the given quantity if not null-ish.""" - if str(quantity).lower() in ('inf', '-inf'): - raise ValueError("Quantity given was an inf.") - elif str(quantity).lower() in ('', 'na', 'nan', 'none'): - return None - else: - return func(quantity) - - - @staticmethod - def validate_quantity(row, attr_quantity): - """Take a row and validate a given associated quantity (e.g., val, se, stderr). - - Returns either a float, a None, or "Error". - """ - try: - quantity = CsvImporter.maybe_apply(float, getattr(row, attr_quantity)) - return quantity - except (ValueError, AttributeError): - # val was a string or another data - return "Error" - - - @staticmethod - def validate_missing_code(row, attr_quantity, attr_name, filepath=None, logger=None): - """Take a row and validate the missing code associated with - a quantity (e.g., val, se, stderr). - - Returns either a nan code for assignment to the missing quantity - or a None to signal an error with the missing code. We decline - to infer missing codes except for a very simple cases; the default - is to produce an error so that the issue can be fixed in indicators. - """ - logger = get_structured_logger('load_csv') if logger is None else logger - missing_entry = getattr(row, "missing_" + attr_name, None) - - try: - missing_entry = CsvImporter.floaty_int(missing_entry) # convert from string to float to int - except (ValueError, TypeError): - missing_entry = None - - if missing_entry is None and attr_quantity is not None: - return Nans.NOT_MISSING.value - if missing_entry is None and attr_quantity is None: - return Nans.OTHER.value - - if missing_entry != Nans.NOT_MISSING.value and attr_quantity is not None: - logger.warning(event = f"missing_{attr_name} column contradicting {attr_name} presence.", detail = (str(row)), file = filepath) - return Nans.NOT_MISSING.value - if missing_entry == Nans.NOT_MISSING.value and attr_quantity is None: - logger.warning(event = f"missing_{attr_name} column contradicting {attr_name} presence.", detail = (str(row)), file = filepath) - return Nans.OTHER.value - - return missing_entry - - - @staticmethod - def extract_and_check_row(row: DataFrameRow, geo_type: str, filepath: Optional[str] = None) -> Tuple[Optional[CsvRowValue], Optional[str]]: - """Extract and return `CsvRowValue` from a CSV row, with sanity checks. - - Also returns the name of the field which failed sanity check, or None. - - row: the pandas table row to extract - geo_type: the geographic resolution of the file - """ - - # use consistent capitalization (e.g. 
for states) - try: - geo_id = row.geo_id.lower() - except AttributeError: - # geo_id was `None` - return (None, 'geo_id') - - if geo_type in ('hrr', 'msa', 'dma', 'hhs'): - # these particular ids are prone to be written as ints -- and floats - try: - geo_id = str(CsvImporter.floaty_int(geo_id)) - except ValueError: - # expected a number, but got a string - return (None, 'geo_id') - - # sanity check geo_id with respect to geo_type - if geo_type == 'county': - if len(geo_id) != 5 or not '01000' <= geo_id <= '80000': - return (None, 'geo_id') - - elif geo_type == 'hrr': - if not 1 <= int(geo_id) <= 500: - return (None, 'geo_id') - - elif geo_type == 'msa': - if len(geo_id) != 5 or not '10000' <= geo_id <= '99999': - return (None, 'geo_id') - - elif geo_type == 'dma': - if not 450 <= int(geo_id) <= 950: - return (None, 'geo_id') - - elif geo_type == 'state': - # note that geo_id is lowercase - if len(geo_id) != 2 or not 'aa' <= geo_id <= 'zz': - return (None, 'geo_id') - - elif geo_type == 'hhs': - if not 1 <= int(geo_id) <= 10: - return (None, 'geo_id') - - elif geo_type == 'nation': - # geo_id is lowercase - if len(geo_id) != 2 or not 'aa' <= geo_id <= 'zz': - return (None, 'geo_id') - - else: - return (None, 'geo_type') - - # Validate row values - value = CsvImporter.validate_quantity(row, "value") - # value was a string or another dtype - if value == "Error": - return (None, 'value') - stderr = CsvImporter.validate_quantity(row, "stderr") - # stderr is a string, another dtype, or negative - if stderr == "Error" or (stderr is not None and stderr < 0): - return (None, 'stderr') - sample_size = CsvImporter.validate_quantity(row, "sample_size") - # sample_size is a string, another dtype, or negative - if sample_size == "Error" or (sample_size is not None and sample_size < 0): - return (None, 'sample_size') - - # Validate and write missingness codes - missing_value = CsvImporter.validate_missing_code(row, value, "value", filepath) - missing_stderr = CsvImporter.validate_missing_code(row, stderr, "stderr", filepath) - missing_sample_size = CsvImporter.validate_missing_code(row, sample_size, "sample_size", filepath) - - # return extracted and validated row values - return (CsvRowValue(geo_id, value, stderr, sample_size, missing_value, missing_stderr, missing_sample_size), None) - - - @staticmethod - def load_csv(filepath: str, details: PathDetails) -> Iterator[Optional[CovidcastRow]]: - """Load, validate, and yield data as `RowValues` from a CSV file. - - filepath: the CSV file to be loaded - geo_type: the geographic resolution (e.g. county) - - In case of a validation error, `None` is yielded for the offending row, - including the header. 
- """ - logger = get_structured_logger('load_csv') - - try: - table = pd.read_csv(filepath, dtype=CsvImporter.DTYPES) - except ValueError as e: - logger.warning(event='Failed to open CSV with specified dtypes, switching to str', detail=str(e), file=filepath) - table = pd.read_csv(filepath, dtype='str') - - if not CsvImporter.is_header_valid(table.columns): - logger.warning(event='invalid header', detail=table.columns, file=filepath) - yield None - return - - table.rename(columns={"val": "value", "se": "stderr", "missing_val": "missing_value", "missing_se": "missing_stderr"}, inplace=True) - - for row in table.itertuples(index=False): - csv_row_values, error = CsvImporter.extract_and_check_row(row, details.geo_type, filepath) - - if error: - logger.warning(event = 'invalid value for row', detail=(str(row), error), file=filepath) - yield None - continue - - yield CovidcastRow( - details.source, - details.signal, - details.time_type, - details.geo_type, - details.time_value, - csv_row_values.geo_value, - csv_row_values.value, - csv_row_values.stderr, - csv_row_values.sample_size, - csv_row_values.missing_value, - csv_row_values.missing_stderr, - csv_row_values.missing_sample_size, - details.issue, - details.lag, - ) + return func(quantity) + + @staticmethod + def validate_quantity(row, attr_quantity): + """Take a row and validate a given associated quantity (e.g., val, se, stderr). + + Returns either a float, a None, or "Error". + """ + try: + quantity = CsvImporter.maybe_apply(float, getattr(row, attr_quantity)) + return quantity + except (ValueError, AttributeError): + # val was a string or another data + return "Error" + + @staticmethod + def validate_missing_code(row, attr_quantity, attr_name, filepath=None, logger=None): + """Take a row and validate the missing code associated with + a quantity (e.g., val, se, stderr). + + Returns either a nan code for assignment to the missing quantity + or a None to signal an error with the missing code. We decline + to infer missing codes except for a very simple cases; the default + is to produce an error so that the issue can be fixed in indicators. + """ + logger = get_structured_logger('load_csv') if logger is None else logger + missing_entry = getattr(row, "missing_" + attr_name, None) + + try: + missing_entry = CsvImporter.floaty_int(missing_entry) # convert from string to float to int + except (ValueError, TypeError): + missing_entry = None + + if missing_entry is None and attr_quantity is not None: + return Nans.NOT_MISSING.value + if missing_entry is None and attr_quantity is None: + return Nans.OTHER.value + + if missing_entry != Nans.NOT_MISSING.value and attr_quantity is not None: + logger.warning(event=f"missing_{attr_name} column contradicting {attr_name} presence.", detail=(str(row)), file=filepath) + return Nans.NOT_MISSING.value + if missing_entry == Nans.NOT_MISSING.value and attr_quantity is None: + logger.warning(event=f"missing_{attr_name} column contradicting {attr_name} presence.", detail=(str(row)), file=filepath) + return Nans.OTHER.value + + return missing_entry + + @staticmethod + def extract_and_check_row(row: DataFrameRow, geo_type: str, filepath: Optional[str] = None) -> Tuple[Optional[CsvRowValue], Optional[str]]: + """Extract and return `CsvRowValue` from a CSV row, with sanity checks. + + Also returns the name of the field which failed sanity check, or None. + + row: the pandas table row to extract + geo_type: the geographic resolution of the file + """ + + # use consistent capitalization (e.g. 
for states) + try: + geo_id = row.geo_id.lower() + except AttributeError: + # geo_id was `None` + return (None, 'geo_id') + + if geo_type in ('hrr', 'msa', 'dma', 'hhs'): + # these particular ids are prone to be written as ints -- and floats + try: + geo_id = str(CsvImporter.floaty_int(geo_id)) + except ValueError: + # expected a number, but got a string + return (None, 'geo_id') + + # sanity check geo_id with respect to geo_type + if geo_type == 'county': + if len(geo_id) != 5 or not '01000' <= geo_id <= '80000': + return (None, 'geo_id') + + elif geo_type == 'hrr': + if not 1 <= int(geo_id) <= 500: + return (None, 'geo_id') + + elif geo_type == 'msa': + if len(geo_id) != 5 or not '10000' <= geo_id <= '99999': + return (None, 'geo_id') + + elif geo_type == 'dma': + if not 450 <= int(geo_id) <= 950: + return (None, 'geo_id') + + elif geo_type == 'state': + # note that geo_id is lowercase + if len(geo_id) != 2 or not 'aa' <= geo_id <= 'zz': + return (None, 'geo_id') + + elif geo_type == 'hhs': + if not 1 <= int(geo_id) <= 10: + return (None, 'geo_id') + + elif geo_type == 'nation': + # geo_id is lowercase + if len(geo_id) != 2 or not 'aa' <= geo_id <= 'zz': + return (None, 'geo_id') + + else: + return (None, 'geo_type') + + # Validate row values + value = CsvImporter.validate_quantity(row, "value") + # value was a string or another dtype + if value == "Error": + return (None, 'value') + stderr = CsvImporter.validate_quantity(row, "stderr") + # stderr is a string, another dtype, or negative + if stderr == "Error" or (stderr is not None and stderr < 0): + return (None, 'stderr') + sample_size = CsvImporter.validate_quantity(row, "sample_size") + # sample_size is a string, another dtype, or negative + if sample_size == "Error" or (sample_size is not None and sample_size < 0): + return (None, 'sample_size') + + # Validate and write missingness codes + missing_value = CsvImporter.validate_missing_code(row, value, "value", filepath) + missing_stderr = CsvImporter.validate_missing_code(row, stderr, "stderr", filepath) + missing_sample_size = CsvImporter.validate_missing_code(row, sample_size, "sample_size", filepath) + + # return extracted and validated row values + return (CsvRowValue(geo_id, value, stderr, sample_size, missing_value, missing_stderr, missing_sample_size), None) + + @staticmethod + def load_csv(filepath: str, details: PathDetails) -> Iterator[Optional[CovidcastRow]]: + """Load, validate, and yield data as `RowValues` from a CSV file. + + filepath: the CSV file to be loaded + geo_type: the geographic resolution (e.g. county) + + In case of a validation error, `None` is yielded for the offending row, + including the header. 
+ """ + logger = get_structured_logger('load_csv') + + try: + table = pd.read_csv(filepath, dtype=CsvImporter.DTYPES) + except ValueError as e: + logger.warning(event='Failed to open CSV with specified dtypes, switching to str', detail=str(e), file=filepath) + table = pd.read_csv(filepath, dtype='str') + + if not CsvImporter.is_header_valid(table.columns): + logger.warning(event='invalid header', detail=table.columns, file=filepath) + yield None + return + + table.rename(columns={"val": "value", "se": "stderr", "missing_val": "missing_value", "missing_se": "missing_stderr"}, inplace=True) + + for row in table.itertuples(index=False): + csv_row_values, error = CsvImporter.extract_and_check_row(row, details.geo_type, filepath) + + if error: + logger.warning(event='invalid value for row', detail=(str(row), error), file=filepath) + yield None + continue + + yield CovidcastRow( + details.source, + details.signal, + details.time_type, + details.geo_type, + details.time_value, + csv_row_values.geo_value, + csv_row_values.value, + csv_row_values.stderr, + csv_row_values.sample_size, + csv_row_values.missing_value, + csv_row_values.missing_stderr, + csv_row_values.missing_sample_size, + details.issue, + details.lag, + ) diff --git a/src/acquisition/covidcast/csv_to_database.py b/src/acquisition/covidcast/csv_to_database.py index 842e820c9..16db01719 100644 --- a/src/acquisition/covidcast/csv_to_database.py +++ b/src/acquisition/covidcast/csv_to_database.py @@ -1,13 +1,12 @@ """Imports covidcast CSVs and stores them in the epidata database.""" -# standard library + import argparse import os import time from logging import Logger from typing import Callable, Iterable, Optional, Tuple -# first party from delphi.epidata.acquisition.covidcast.csv_importer import CsvImporter, PathDetails from delphi.epidata.acquisition.covidcast.database import Database, DBLoadStateException from delphi.epidata.acquisition.covidcast.file_archiver import FileArchiver @@ -15,166 +14,169 @@ def get_argument_parser(): - """Define command line arguments.""" - - parser = argparse.ArgumentParser() - parser.add_argument( - '--data_dir', - help='top-level directory where CSVs are stored') - parser.add_argument( - '--specific_issue_date', - action='store_true', - help='indicates argument is where issuedate-specific subdirectories can be found.') - parser.add_argument( - '--log_file', - help="filename for log output (defaults to stdout)") - return parser + """Define command line arguments.""" + + parser = argparse.ArgumentParser() + parser.add_argument( + '--data_dir', + help='top-level directory where CSVs are stored' + ) + parser.add_argument( + '--specific_issue_date', + action='store_true', + help='indicates argument is where issuedate-specific subdirectories can be found.' 
+ ) + parser.add_argument( + '--log_file', + help="filename for log output (defaults to stdout)" + ) + return parser def collect_files(data_dir: str, specific_issue_date: bool): - """Fetch path and data profile details for each file to upload.""" - logger= get_structured_logger('collect_files') - if specific_issue_date: - results = list(CsvImporter.find_issue_specific_csv_files(data_dir)) - else: - results = list(CsvImporter.find_csv_files(os.path.join(data_dir, 'receiving'))) - logger.info(f'found {len(results)} files') - return results + """Fetch path and data profile details for each file to upload.""" + logger = get_structured_logger('collect_files') + if specific_issue_date: + results = list(CsvImporter.find_issue_specific_csv_files(data_dir)) + else: + results = list(CsvImporter.find_csv_files(os.path.join(data_dir, 'receiving'))) + logger.info(f'found {len(results)} files') + return results def make_handlers(data_dir: str, specific_issue_date: bool): - if specific_issue_date: - # issue-specific uploads are always one-offs, so we can leave all - # files in place without worrying about cleaning up - def handle_failed(path_src, filename, source, logger): - logger.info(event='leaving failed file alone', dest=source, file=filename) - - def handle_successful(path_src, filename, source, logger): - logger.info(event='archiving as successful',file=filename) - FileArchiver.archive_inplace(path_src, filename) - else: - # normal automation runs require some shuffling to remove files - # from receiving and place them in the archive - archive_successful_dir = os.path.join(data_dir, 'archive', 'successful') - archive_failed_dir = os.path.join(data_dir, 'archive', 'failed') - - # helper to archive a failed file without compression - def handle_failed(path_src, filename, source, logger): - logger.info(event='archiving as failed - ', detail=source, file=filename) - path_dst = os.path.join(archive_failed_dir, source) - compress = False - FileArchiver.archive_file(path_src, path_dst, filename, compress) - - # helper to archive a successful file with compression - def handle_successful(path_src, filename, source, logger): - logger.info(event='archiving as successful',file=filename) - path_dst = os.path.join(archive_successful_dir, source) - compress = True - FileArchiver.archive_file(path_src, path_dst, filename, compress) - - return handle_successful, handle_failed + if specific_issue_date: + # issue-specific uploads are always one-offs, so we can leave all + # files in place without worrying about cleaning up + def handle_failed(path_src, filename, source, logger): + logger.info(event='leaving failed file alone', dest=source, file=filename) + + def handle_successful(path_src, filename, source, logger): + logger.info(event='archiving as successful', file=filename) + FileArchiver.archive_inplace(path_src, filename) + else: + # normal automation runs require some shuffling to remove files + # from receiving and place them in the archive + archive_successful_dir = os.path.join(data_dir, 'archive', 'successful') + archive_failed_dir = os.path.join(data_dir, 'archive', 'failed') + # helper to archive a failed file without compression + def handle_failed(path_src, filename, source, logger): + logger.info(event='archiving as failed - ', detail=source, file=filename) + path_dst = os.path.join(archive_failed_dir, source) + compress = False + FileArchiver.archive_file(path_src, path_dst, filename, compress) -def upload_archive( - path_details: Iterable[Tuple[str, Optional[PathDetails]]], - database: Database, - 
handlers: Tuple[Callable], - logger: Logger - ): - """Upload CSVs to the database and archive them using the specified handlers. - - :path_details: output from CsvImporter.find*_csv_files - - :database: an open connection to the epidata database - - :handlers: functions for archiving (successful, failed) files - - :return: the number of modified rows - """ - archive_as_successful, archive_as_failed = handlers - total_modified_row_count = 0 - # iterate over each file - for path, details in path_details: - logger.info(event='handling', dest=path) - path_src, filename = os.path.split(path) - - # file path or name was invalid, source is unknown - if not details: - archive_as_failed(path_src, filename, 'unknown',logger) - continue - - csv_rows = CsvImporter.load_csv(path, details) - rows_list = list(csv_rows) - all_rows_valid = rows_list and all(r is not None for r in rows_list) - if all_rows_valid: - try: - modified_row_count = database.insert_or_update_bulk(rows_list) - logger.info(f"insert_or_update_bulk {filename} returned {modified_row_count}") - logger.info( - "Inserted database rows", - row_count = modified_row_count, - source = details.source, - signal = details.signal, - geo_type = details.geo_type, - time_value = details.time_value, - issue = details.issue, - lag = details.lag - ) - if modified_row_count is None or modified_row_count: # else would indicate zero rows inserted - total_modified_row_count += (modified_row_count if modified_row_count else 0) - database.commit() - except DBLoadStateException as e: - # if the db is in a state that is not fit for loading new data, - # then we should stop processing any more files - raise e - except Exception as e: - all_rows_valid = False - logger.exception('exception while inserting rows', exc_info=e) - database.rollback() - - # archive the current file based on validation results - if all_rows_valid: - archive_as_successful(path_src, filename, details.source, logger) - else: - archive_as_failed(path_src, filename, details.source, logger) + # helper to archive a successful file with compression + def handle_successful(path_src, filename, source, logger): + logger.info(event='archiving as successful', file=filename) + path_dst = os.path.join(archive_successful_dir, source) + compress = True + FileArchiver.archive_file(path_src, path_dst, filename, compress) - return total_modified_row_count + return handle_successful, handle_failed + + +def upload_archive( + path_details: Iterable[Tuple[str, Optional[PathDetails]]], + database: Database, + handlers: Tuple[Callable], + logger: Logger +): + """Upload CSVs to the database and archive them using the specified handlers. 
+ + :path_details: output from CsvImporter.find*_csv_files + + :database: an open connection to the epidata database + + :handlers: functions for archiving (successful, failed) files + + :return: the number of modified rows + """ + archive_as_successful, archive_as_failed = handlers + total_modified_row_count = 0 + # iterate over each file + for path, details in path_details: + logger.info(event='handling', dest=path) + path_src, filename = os.path.split(path) + + # file path or name was invalid, source is unknown + if not details: + archive_as_failed(path_src, filename, 'unknown', logger) + continue + + csv_rows = CsvImporter.load_csv(path, details) + rows_list = list(csv_rows) + all_rows_valid = rows_list and all(r is not None for r in rows_list) + if all_rows_valid: + try: + modified_row_count = database.insert_or_update_bulk(rows_list) + logger.info(f"insert_or_update_bulk {filename} returned {modified_row_count}") + logger.info( + "Inserted database rows", + row_count=modified_row_count, + source=details.source, + signal=details.signal, + geo_type=details.geo_type, + time_value=details.time_value, + issue=details.issue, + lag=details.lag + ) + if modified_row_count is None or modified_row_count: # else would indicate zero rows inserted + total_modified_row_count += (modified_row_count if modified_row_count else 0) + database.commit() + except DBLoadStateException as e: + # if the db is in a state that is not fit for loading new data, + # then we should stop processing any more files + raise e + except Exception as e: + all_rows_valid = False + logger.exception('exception while inserting rows', exc_info=e) + database.rollback() + + # archive the current file based on validation results + if all_rows_valid: + archive_as_successful(path_src, filename, details.source, logger) + else: + archive_as_failed(path_src, filename, details.source, logger) + + return total_modified_row_count def main(args): - """Find, parse, and upload covidcast signals.""" + """Find, parse, and upload covidcast signals.""" - logger = get_structured_logger("csv_ingestion", filename=args.log_file) - start_time = time.time() + logger = get_structured_logger("csv_ingestion", filename=args.log_file) + start_time = time.time() - # shortcut escape without hitting db if nothing to do - path_details = collect_files(args.data_dir, args.specific_issue_date) - if not path_details: - logger.info('nothing to do; exiting...') - return + # shortcut escape without hitting db if nothing to do + path_details = collect_files(args.data_dir, args.specific_issue_date) + if not path_details: + logger.info('nothing to do; exiting...') + return - logger.info("Ingesting CSVs", csv_count = len(path_details)) + logger.info("Ingesting CSVs", csv_count=len(path_details)) - database = Database() - database.connect() + database = Database() + database.connect() - try: - modified_row_count = upload_archive( - path_details, - database, - make_handlers(args.data_dir, args.specific_issue_date), - logger - ) - logger.info("Finished inserting/updating database rows", row_count = modified_row_count) - finally: - database.do_analyze() - # unconditionally commit database changes since CSVs have been archived - database.disconnect(True) + try: + modified_row_count = upload_archive( + path_details, + database, + make_handlers(args.data_dir, args.specific_issue_date), + logger + ) + logger.info("Finished inserting/updating database rows", row_count=modified_row_count) + finally: + database.do_analyze() + # unconditionally commit database changes since CSVs 
have been archived + database.disconnect(True) - logger.info( - "Ingested CSVs into database", - total_runtime_in_seconds=round(time.time() - start_time, 2)) + logger.info( + "Ingested CSVs into database", + total_runtime_in_seconds=round(time.time() - start_time, 2)) if __name__ == '__main__': - main(get_argument_parser().parse_args()) + main(get_argument_parser().parse_args()) diff --git a/src/acquisition/covidcast/database.py b/src/acquisition/covidcast/database.py index 3beedac82..5ed99dea9 100644 --- a/src/acquisition/covidcast/database.py +++ b/src/acquisition/covidcast/database.py @@ -2,568 +2,567 @@ See src/ddl/covidcast.sql for an explanation of each field. """ + +import json import threading from math import ceil from multiprocessing import cpu_count -from queue import Queue, Empty +from queue import Empty, Queue from typing import List -# third party -import json -import mysql.connector - -# first party import delphi.operations.secrets as secrets -from delphi.epidata.acquisition.covidcast.logger import get_structured_logger +import mysql.connector from delphi.epidata.acquisition.covidcast.covidcast_row import CovidcastRow +from delphi.epidata.acquisition.covidcast.logger import get_structured_logger class DBLoadStateException(Exception): - pass + pass class Database: - """A collection of covidcast database operations.""" - - DATABASE_NAME = 'covid' - - load_table = "epimetric_load" - # if you want to deal with foreign key ids: use table - # if you want to deal with source/signal names, geo type/values, etc: use view - latest_table = "epimetric_latest" - latest_view = latest_table + "_v" - history_table = "epimetric_full" - history_view = history_table + "_v" - # TODO: consider using class variables like this for dimension table names too - # TODO: also consider that for composite key tuples, like short_comp_key and long_comp_key as used in delete_batch() - - - def connect(self, connector_impl=mysql.connector): - """Establish a connection to the database.""" - - u, p = secrets.db.epi - self._connector_impl = connector_impl - self._connection = self._connector_impl.connect( - host=secrets.db.host, - user=u, - password=p, - database=Database.DATABASE_NAME) - self._cursor = self._connection.cursor() - - def commit(self): - self._connection.commit() - - def rollback(self): - self._connection.rollback() - - def disconnect(self, commit): - """Close the database connection. - - commit: if true, commit changes, otherwise rollback - """ - - self._cursor.close() - if commit: - self._connection.commit() - self._connection.close() - - - - def count_all_load_rows(self): - self._cursor.execute(f'SELECT count(1) FROM `{self.load_table}`') - for (num,) in self._cursor: - return num - - def _reset_load_table_ai_counter(self): - """Corrects the AUTO_INCREMENT counter in the load table. - - To be used in emergencies only, if the load table was accidentally TRUNCATEd. - This ensures any `epimetric_id`s generated by the load table will not collide with the history or latest tables. - This is also destructive to any data in the load table. - """ - - self._cursor.execute('DELETE FROM epimetric_load') - # NOTE: 'ones' are used as filler here for the (required) NOT NULL columns. 
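For reference, a hedged standalone sketch of running the same emergency counter reset by hand; the host, user, and password below are placeholders (in practice credentials come from delphi.operations.secrets), and it simply reproduces the filler-row trick used in the method rather than introducing a different approach.

import mysql.connector

# Placeholder connection details; assumes direct access to the covid database.
conn = mysql.connector.connect(host="localhost", user="user", password="pass", database="covid")
cur = conn.cursor()
cur.execute("DELETE FROM epimetric_load")
# Inserting a row keyed just past MAX(epimetric_id) advances AUTO_INCREMENT;
# the filler row is removed immediately afterwards.
cur.execute("""
    INSERT INTO epimetric_load
      (epimetric_id, source, `signal`, geo_type, geo_value, time_type,
       time_value, issue, `lag`, value_updated_timestamp)
    VALUES
      ((SELECT 1 + MAX(epimetric_id) FROM epimetric_full),
       '1', '1', '1', '1', '1', 1, 1, 1, 1)
""")
cur.execute("DELETE FROM epimetric_load")
conn.commit()
cur.close()
conn.close()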
- self._cursor.execute(""" - INSERT INTO epimetric_load - (epimetric_id, - source, `signal`, geo_type, geo_value, time_type, time_value, issue, `lag`, value_updated_timestamp) - VALUES - ((SELECT 1+MAX(epimetric_id) FROM epimetric_full), - '1', '1', '1', '1', '1', 1, 1, 1, 1);""") - self._cursor.execute('DELETE FROM epimetric_load') - - def do_analyze(self): - """performs and stores key distribution analyses, used for join order and index selection""" - # TODO: consider expanding this to update columns' histograms - # https://dev.mysql.com/doc/refman/8.0/en/analyze-table.html#analyze-table-histogram-statistics-analysis - self._cursor.execute( - f'''ANALYZE TABLE - signal_dim, geo_dim, - {self.load_table}, {self.history_table}, {self.latest_table}''') - output = [self._cursor.column_names] + self._cursor.fetchall() - get_structured_logger('do_analyze').info("ANALYZE results", results=str(output)) - - def insert_or_update_bulk(self, cc_rows): - return self.insert_or_update_batch(cc_rows) - - def insert_or_update_batch(self, cc_rows: List[CovidcastRow], batch_size=2**20, commit_partial=False, suppress_jobs=False): - """ - Insert new rows into the load table and dispatch into dimension and fact tables. - """ - - if 0 != self.count_all_load_rows(): - err_msg = "Non-zero count in the load table!!! This indicates a previous acquisition run may have failed, another acquisition is in progress, or this process does not otherwise have exclusive access to the db!" - get_structured_logger("insert_or_update_batch").fatal(err_msg) - raise DBLoadStateException(err_msg) - - # NOTE: `value_update_timestamp` is hardcoded to "NOW" (which is appropriate) and - # `is_latest_issue` is hardcoded to 1 (which is temporary and addressed later in this method) - insert_into_loader_sql = f''' - INSERT INTO `{self.load_table}` - (`source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`, - `value_updated_timestamp`, `value`, `stderr`, `sample_size`, `issue`, `lag`, - `is_latest_issue`, `missing_value`, `missing_stderr`, `missing_sample_size`) - VALUES - (%s, %s, %s, %s, %s, %s, - UNIX_TIMESTAMP(NOW()), %s, %s, %s, %s, %s, - 1, %s, %s, %s) - ''' - - # all load table entries are already marked "is_latest_issue". - # if an entry in the load table is NOT in the latest table, it is clearly now the latest value for that key (so we do nothing (thanks to INNER join)). - # if an entry *IS* in both load and latest tables, but latest table issue is newer, unmark is_latest_issue in load. 
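For intuition only, a small pure-Python sketch (made-up keys and issues, not part of the acquisition code) of the rule those comments describe; the SQL that follows applies the same condition with a JOIN and an UPDATE.

# Latest-table issues, keyed by (source, signal, geo_type, geo_value, time_type, time_value).
latest_issue_by_key = {
    ("src", "sig", "county", "01000", "day", 20200401): 20200407,
}

load_rows = [
    {"key": ("src", "sig", "county", "01000", "day", 20200401), "issue": 20200405, "is_latest_issue": 1},
    {"key": ("src", "sig", "county", "01000", "day", 20200402), "issue": 20200405, "is_latest_issue": 1},
]

for row in load_rows:
    latest_issue = latest_issue_by_key.get(row["key"])
    # mirrors: WHERE load.issue < latest.issue  (a no-op when the key is absent from latest)
    if latest_issue is not None and row["issue"] < latest_issue:
        row["is_latest_issue"] = 0

print([row["is_latest_issue"] for row in load_rows])  # [0, 1]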
- fix_is_latest_issue_sql = f''' - UPDATE - `{self.load_table}` JOIN `{self.latest_view}` - USING (`source`, `signal`, `geo_type`, `geo_value`, `time_type`, `time_value`) - SET `{self.load_table}`.`is_latest_issue`=0 - WHERE `{self.load_table}`.`issue` < `{self.latest_view}`.`issue` - ''' - - # TODO: consider handling cc_rows as a generator instead of a list - - try: - num_rows = len(cc_rows) - total = 0 - if not batch_size: - batch_size = num_rows - num_batches = ceil(num_rows/batch_size) - for batch_num in range(num_batches): - start = batch_num * batch_size - end = min(num_rows, start + batch_size) - - args = [( - row.source, - row.signal, - row.time_type, - row.geo_type, - row.time_value, - row.geo_value, - row.value, - row.stderr, - row.sample_size, - row.issue, - row.lag, - row.missing_value, - row.missing_stderr, - row.missing_sample_size - ) for row in cc_rows[start:end]] - - - self._cursor.executemany(insert_into_loader_sql, args) - modified_row_count = self._cursor.rowcount - self._cursor.execute(fix_is_latest_issue_sql) - if not suppress_jobs: - self.run_dbjobs() # TODO: incorporate the logic of dbjobs() into this method [once calls to dbjobs() are no longer needed for migrations] - - if modified_row_count is None or modified_row_count == -1: - # the SQL connector does not support returning number of rows affected (see PEP 249) - total = None - else: - total += modified_row_count - if commit_partial: - self._connection.commit() - except Exception as e: - # rollback is handled in csv_to_database; if you're calling this yourself, handle your own rollback - raise e - return total - - def run_dbjobs(self): - - # we do this LEFT JOIN trick because mysql cant do set difference (aka EXCEPT or MINUS) - # (as in " select distinct source, signal from signal_dim minus select distinct source, signal from epimetric_load ") - signal_dim_add_new_load = f''' - INSERT INTO signal_dim (`source`, `signal`) - SELECT DISTINCT sl.source, sl.signal - FROM {self.load_table} AS sl LEFT JOIN signal_dim AS sd - USING (`source`, `signal`) - WHERE sd.source IS NULL - ''' - - # again, same trick to get around lack of EXCEPT/MINUS - geo_dim_add_new_load = f''' - INSERT INTO geo_dim (`geo_type`, `geo_value`) - SELECT DISTINCT sl.geo_type, sl.geo_value - FROM {self.load_table} AS sl LEFT JOIN geo_dim AS gd - USING (`geo_type`, `geo_value`) - WHERE gd.geo_type IS NULL - ''' - - epimetric_full_load = f''' - INSERT INTO {self.history_table} - (epimetric_id, signal_key_id, geo_key_id, issue, data_as_of_dt, - time_type, time_value, `value`, stderr, sample_size, `lag`, value_updated_timestamp, - computation_as_of_dt, missing_value, missing_stderr, missing_sample_size) - SELECT - epimetric_id, sd.signal_key_id, gd.geo_key_id, issue, data_as_of_dt, + """A collection of covidcast database operations.""" + + DATABASE_NAME = 'covid' + + load_table = "epimetric_load" + # if you want to deal with foreign key ids: use table + # if you want to deal with source/signal names, geo type/values, etc: use view + latest_table = "epimetric_latest" + latest_view = latest_table + "_v" + history_table = "epimetric_full" + history_view = history_table + "_v" + # TODO: consider using class variables like this for dimension table names too + # TODO: also consider that for composite key tuples, like short_comp_key and long_comp_key as used in delete_batch() + + def connect(self, connector_impl=mysql.connector): + """Establish a connection to the database.""" + + u, p = secrets.db.epi + self._connector_impl = connector_impl + 
self._connection = self._connector_impl.connect( + host=secrets.db.host, + user=u, + password=p, + database=Database.DATABASE_NAME) + self._cursor = self._connection.cursor() + + def commit(self): + self._connection.commit() + + def rollback(self): + self._connection.rollback() + + def disconnect(self, commit): + """Close the database connection. + + commit: if true, commit changes, otherwise rollback + """ + + self._cursor.close() + if commit: + self._connection.commit() + self._connection.close() + + def count_all_load_rows(self): + self._cursor.execute(f'SELECT count(1) FROM `{self.load_table}`') + for (num,) in self._cursor: + return num + + def _reset_load_table_ai_counter(self): + """Corrects the AUTO_INCREMENT counter in the load table. + + To be used in emergencies only, if the load table was accidentally TRUNCATEd. + This ensures any `epimetric_id`s generated by the load table will not collide with the history or latest tables. + This is also destructive to any data in the load table. + """ + + self._cursor.execute('DELETE FROM epimetric_load') + # NOTE: 'ones' are used as filler here for the (required) NOT NULL columns. + self._cursor.execute( + """ + INSERT INTO epimetric_load + (epimetric_id, + source, `signal`, geo_type, geo_value, time_type, time_value, issue, `lag`, value_updated_timestamp) + VALUES + ((SELECT 1+MAX(epimetric_id) FROM epimetric_full), + '1', '1', '1', '1', '1', 1, 1, 1, 1); + """ + ) + self._cursor.execute('DELETE FROM epimetric_load') + + def do_analyze(self): + """performs and stores key distribution analyses, used for join order and index selection""" + # TODO: consider expanding this to update columns' histograms + # https://dev.mysql.com/doc/refman/8.0/en/analyze-table.html#analyze-table-histogram-statistics-analysis + self._cursor.execute( + f''' + ANALYZE TABLE + signal_dim, geo_dim, + {self.load_table}, {self.history_table}, {self.latest_table} + ''' + ) + output = [self._cursor.column_names] + self._cursor.fetchall() + get_structured_logger('do_analyze').info("ANALYZE results", results=str(output)) + + def insert_or_update_bulk(self, cc_rows): + return self.insert_or_update_batch(cc_rows) + + def insert_or_update_batch(self, cc_rows: List[CovidcastRow], batch_size=2**20, commit_partial=False, suppress_jobs=False): + """ + Insert new rows into the load table and dispatch into dimension and fact tables. + """ + + if 0 != self.count_all_load_rows(): + err_msg = "Non-zero count in the load table!!! This indicates a previous acquisition run may have failed, \ + another acquisition is in progress, or this process does not otherwise have exclusive access to the db!" + get_structured_logger("insert_or_update_batch").fatal(err_msg) + raise DBLoadStateException(err_msg) + + # NOTE: `value_update_timestamp` is hardcoded to "NOW" (which is appropriate) and + # `is_latest_issue` is hardcoded to 1 (which is temporary and addressed later in this method) + insert_into_loader_sql = f''' + INSERT INTO `{self.load_table}` + (`source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`, + `value_updated_timestamp`, `value`, `stderr`, `sample_size`, `issue`, `lag`, + `is_latest_issue`, `missing_value`, `missing_stderr`, `missing_sample_size`) + VALUES + (%s, %s, %s, %s, %s, %s, + UNIX_TIMESTAMP(NOW()), %s, %s, %s, %s, %s, + 1, %s, %s, %s) + ''' + + # all load table entries are already marked "is_latest_issue". + # if an entry in the load table is NOT in the latest table, it is clearly now the latest value for that key (so we do nothing (thanks to INNER join)). 
+ # if an entry *IS* in both load and latest tables, but latest table issue is newer, unmark is_latest_issue in load. + fix_is_latest_issue_sql = f''' + UPDATE + `{self.load_table}` JOIN `{self.latest_view}` + USING (`source`, `signal`, `geo_type`, `geo_value`, `time_type`, `time_value`) + SET `{self.load_table}`.`is_latest_issue`=0 + WHERE `{self.load_table}`.`issue` < `{self.latest_view}`.`issue` + ''' + + # TODO: consider handling cc_rows as a generator instead of a list + + try: + num_rows = len(cc_rows) + total = 0 + if not batch_size: + batch_size = num_rows + num_batches = ceil(num_rows/batch_size) + for batch_num in range(num_batches): + start = batch_num * batch_size + end = min(num_rows, start + batch_size) + + args = [( + row.source, + row.signal, + row.time_type, + row.geo_type, + row.time_value, + row.geo_value, + row.value, + row.stderr, + row.sample_size, + row.issue, + row.lag, + row.missing_value, + row.missing_stderr, + row.missing_sample_size + ) for row in cc_rows[start:end]] + + self._cursor.executemany(insert_into_loader_sql, args) + modified_row_count = self._cursor.rowcount + self._cursor.execute(fix_is_latest_issue_sql) + if not suppress_jobs: + self.run_dbjobs() # TODO: incorporate the logic of dbjobs() into this method [once calls to dbjobs() are no longer needed for migrations] + + if modified_row_count is None or modified_row_count == -1: + # the SQL connector does not support returning number of rows affected (see PEP 249) + total = None + else: + total += modified_row_count + if commit_partial: + self._connection.commit() + except Exception as e: + # rollback is handled in csv_to_database; if you're calling this yourself, handle your own rollback + raise e + return total + + def run_dbjobs(self): + + # we do this LEFT JOIN trick because mysql cant do set difference (aka EXCEPT or MINUS) + # (as in " select distinct source, signal from signal_dim minus select distinct source, signal from epimetric_load ") + signal_dim_add_new_load = f''' + INSERT INTO signal_dim (`source`, `signal`) + SELECT DISTINCT sl.source, sl.signal + FROM {self.load_table} AS sl LEFT JOIN signal_dim AS sd + USING (`source`, `signal`) + WHERE sd.source IS NULL + ''' + + # again, same trick to get around lack of EXCEPT/MINUS + geo_dim_add_new_load = f''' + INSERT INTO geo_dim (`geo_type`, `geo_value`) + SELECT DISTINCT sl.geo_type, sl.geo_value + FROM {self.load_table} AS sl LEFT JOIN geo_dim AS gd + USING (`geo_type`, `geo_value`) + WHERE gd.geo_type IS NULL + ''' + + epimetric_full_load = f''' + INSERT INTO {self.history_table} + (epimetric_id, signal_key_id, geo_key_id, issue, data_as_of_dt, time_type, time_value, `value`, stderr, sample_size, `lag`, value_updated_timestamp, - computation_as_of_dt, missing_value, missing_stderr, missing_sample_size - FROM `{self.load_table}` sl - INNER JOIN signal_dim sd USING (source, `signal`) - INNER JOIN geo_dim gd USING (geo_type, geo_value) - ON DUPLICATE KEY UPDATE - `epimetric_id` = sl.`epimetric_id`, - `value_updated_timestamp` = sl.`value_updated_timestamp`, - `value` = sl.`value`, - `stderr` = sl.`stderr`, - `sample_size` = sl.`sample_size`, - `lag` = sl.`lag`, - `missing_value` = sl.`missing_value`, - `missing_stderr` = sl.`missing_stderr`, - `missing_sample_size` = sl.`missing_sample_size` - ''' - - epimetric_latest_load = f''' - INSERT INTO {self.latest_table} - (epimetric_id, signal_key_id, geo_key_id, issue, data_as_of_dt, - time_type, time_value, `value`, stderr, sample_size, `lag`, value_updated_timestamp, - computation_as_of_dt, 
missing_value, missing_stderr, missing_sample_size) - SELECT - epimetric_id, sd.signal_key_id, gd.geo_key_id, issue, data_as_of_dt, + computation_as_of_dt, missing_value, missing_stderr, missing_sample_size) + SELECT + epimetric_id, sd.signal_key_id, gd.geo_key_id, issue, data_as_of_dt, + time_type, time_value, `value`, stderr, sample_size, `lag`, value_updated_timestamp, + computation_as_of_dt, missing_value, missing_stderr, missing_sample_size + FROM `{self.load_table}` sl + INNER JOIN signal_dim sd USING (source, `signal`) + INNER JOIN geo_dim gd USING (geo_type, geo_value) + ON DUPLICATE KEY UPDATE + `epimetric_id` = sl.`epimetric_id`, + `value_updated_timestamp` = sl.`value_updated_timestamp`, + `value` = sl.`value`, + `stderr` = sl.`stderr`, + `sample_size` = sl.`sample_size`, + `lag` = sl.`lag`, + `missing_value` = sl.`missing_value`, + `missing_stderr` = sl.`missing_stderr`, + `missing_sample_size` = sl.`missing_sample_size` + ''' + + epimetric_latest_load = f''' + INSERT INTO {self.latest_table} + (epimetric_id, signal_key_id, geo_key_id, issue, data_as_of_dt, time_type, time_value, `value`, stderr, sample_size, `lag`, value_updated_timestamp, - computation_as_of_dt, missing_value, missing_stderr, missing_sample_size - FROM `{self.load_table}` sl - INNER JOIN signal_dim sd USING (source, `signal`) - INNER JOIN geo_dim gd USING (geo_type, geo_value) - WHERE is_latest_issue = 1 - ON DUPLICATE KEY UPDATE - `epimetric_id` = sl.`epimetric_id`, - `value_updated_timestamp` = sl.`value_updated_timestamp`, - `value` = sl.`value`, - `stderr` = sl.`stderr`, - `sample_size` = sl.`sample_size`, - `issue` = sl.`issue`, - `lag` = sl.`lag`, - `missing_value` = sl.`missing_value`, - `missing_stderr` = sl.`missing_stderr`, - `missing_sample_size` = sl.`missing_sample_size` - ''' - - # NOTE: DO NOT `TRUNCATE` THIS TABLE! doing so will ruin the AUTO_INCREMENT counter that the history and latest tables depend on... - epimetric_load_delete_processed = f''' - DELETE FROM `{self.load_table}` - ''' - - logger = get_structured_logger("run_dbjobs") - import time - time_q = [time.time()] - - try: - self._cursor.execute(signal_dim_add_new_load) - time_q.append(time.time()) - logger.debug('signal_dim_add_new_load', rows=self._cursor.rowcount, elapsed=time_q[-1]-time_q[-2]) - - self._cursor.execute(geo_dim_add_new_load) - time_q.append(time.time()) - logger.debug('geo_dim_add_new_load', rows=self._cursor.rowcount, elapsed=time_q[-1]-time_q[-2]) - - self._cursor.execute(epimetric_full_load) - time_q.append(time.time()) - logger.debug('epimetric_full_load', rows=self._cursor.rowcount, elapsed=time_q[-1]-time_q[-2]) - - self._cursor.execute(epimetric_latest_load) - time_q.append(time.time()) - logger.debug('epimetric_latest_load', rows=self._cursor.rowcount, elapsed=time_q[-1]-time_q[-2]) - - self._cursor.execute(epimetric_load_delete_processed) - time_q.append(time.time()) - logger.debug('epimetric_load_delete_processed', rows=self._cursor.rowcount, elapsed=time_q[-1]-time_q[-2]) - except Exception as e: - raise e - - return self - - - def delete_batch(self, cc_deletions): - """ - Remove rows specified by a csv file or list of tuples. 
- - If cc_deletions is a filename, the file should include a header row and use the following field order: - - geo_id - - value (ignored) - - stderr (ignored) - - sample_size (ignored) - - issue (YYYYMMDD format) - - time_value (YYYYMMDD format) - - geo_type - - signal - - source - - If cc_deletions is a list of tuples, the tuples should use the following field order (=same as above, plus time_type): - - geo_id - - value (ignored) - - stderr (ignored) - - sample_size (ignored) - - issue (YYYYMMDD format) - - time_value (YYYYMMDD format) - - geo_type - - signal - - source - - time_type - """ - - tmp_table_name = "tmp_delete_table" - # composite keys: - short_comp_key = "`source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`" - long_comp_key = short_comp_key + ", `issue`" - - create_tmp_table_sql = f''' -CREATE TABLE {tmp_table_name} LIKE {self.load_table}; -''' - - amend_tmp_table_sql = f''' -ALTER TABLE {tmp_table_name} ADD COLUMN delete_history_id BIGINT UNSIGNED, - ADD COLUMN delete_latest_id BIGINT UNSIGNED, - ADD COLUMN update_latest BINARY(1) DEFAULT 0; -''' - - load_tmp_table_infile_sql = f''' -LOAD DATA INFILE "{cc_deletions}" -INTO TABLE {tmp_table_name} -FIELDS TERMINATED BY "," -IGNORE 1 LINES -(`geo_value`, `value`, `stderr`, `sample_size`, `issue`, `time_value`, `geo_type`, `signal`, `source`) -SET time_type="day"; -''' - - load_tmp_table_insert_sql = f''' -INSERT INTO {tmp_table_name} -(`geo_value`, `value`, `stderr`, `sample_size`, `issue`, `time_value`, `geo_type`, `signal`, `source`, `time_type`, -`value_updated_timestamp`, `lag`, `is_latest_issue`) -VALUES -(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, -0, 0, 0) -''' - - add_history_id_sql = f''' -UPDATE {tmp_table_name} d INNER JOIN {self.history_view} h USING ({long_comp_key}) -SET d.delete_history_id=h.epimetric_id; -''' - - # if a row we are deleting also appears in the 'latest' table (with a matching 'issue')... - mark_for_update_latest_sql = f''' -UPDATE {tmp_table_name} d INNER JOIN {self.latest_view} ell USING ({long_comp_key}) -SET d.update_latest=1, d.delete_latest_id=ell.epimetric_id; -''' - - delete_history_sql = f''' -DELETE h FROM {tmp_table_name} d INNER JOIN {self.history_table} h ON d.delete_history_id=h.epimetric_id; -''' - - # ...remove it from 'latest'... - delete_latest_sql = f''' -DELETE ell FROM {tmp_table_name} d INNER JOIN {self.latest_table} ell ON d.delete_latest_id=ell.epimetric_id; -''' - - # ...and re-write that record with its next-latest issue (from 'history') instead. - # NOTE: this must be executed *AFTER* `delete_history_sql` to ensure we get the correct `issue` - # AND also after `delete_latest_sql` so that we dont get a key collision on insert. 
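The comment above pins down the ordering `delete_batch` depends on: history rows are dropped first, then the matching latest rows, and only then is the next-newest issue re-inserted into the latest table. A minimal sketch of that sequence, assuming a connected cursor and the `*_sql` strings defined in the method (names as in the patch):

def run_deletion_sequence(cursor, add_history_id_sql, mark_for_update_latest_sql,
                          delete_history_sql, delete_latest_sql, update_latest_sql):
    cursor.execute(add_history_id_sql)          # map each requested deletion to its history row id
    cursor.execute(mark_for_update_latest_sql)  # flag deletions that also hit the latest table
    cursor.execute(delete_history_sql)          # 1) drop the targeted issue from history
    cursor.execute(delete_latest_sql)           # 2) drop it from latest, freeing the key slot
    cursor.execute(update_latest_sql)           # 3) re-promote the next-newest surviving issue

Running step 3 before steps 1 and 2 would either pick up the issue being deleted or collide on the primary key, which is exactly what the comment warns about.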
- update_latest_sql = f''' -INSERT INTO {self.latest_table} - (epimetric_id, - signal_key_id, geo_key_id, time_type, time_value, issue, - value, stderr, sample_size, `lag`, value_updated_timestamp, - missing_value, missing_stderr, missing_sample_size) -SELECT - h.epimetric_id, - h.signal_key_id, h.geo_key_id, h.time_type, h.time_value, h.issue, - h.value, h.stderr, h.sample_size, h.`lag`, h.value_updated_timestamp, - h.missing_value, h.missing_stderr, h.missing_sample_size -FROM {self.history_view} h JOIN ( - SELECT {short_comp_key}, MAX(hh.issue) AS issue - FROM {self.history_view} hh JOIN {tmp_table_name} dd USING ({short_comp_key}) - WHERE dd.update_latest=1 GROUP BY {short_comp_key} - ) d USING ({long_comp_key}); -''' - - drop_tmp_table_sql = f'DROP TABLE IF EXISTS {tmp_table_name}' - - total = None - try: - self._cursor.execute(drop_tmp_table_sql) - self._cursor.execute(create_tmp_table_sql) - self._cursor.execute(amend_tmp_table_sql) - if isinstance(cc_deletions, str): - self._cursor.execute(load_tmp_table_infile_sql) - elif isinstance(cc_deletions, list): - def split_list(lst, n): - for i in range(0, len(lst), n): - yield lst[i:(i+n)] - for deletions_batch in split_list(cc_deletions, 100000): - self._cursor.executemany(load_tmp_table_insert_sql, deletions_batch) - print(f"load_tmp_table_insert_sql:{self._cursor.rowcount}") - else: - raise Exception(f"Bad deletions argument: need a filename or a list of tuples; got a {type(cc_deletions)}") - self._cursor.execute(add_history_id_sql) - print(f"add_history_id_sql:{self._cursor.rowcount}") - self._cursor.execute(mark_for_update_latest_sql) - print(f"mark_for_update_latest_sql:{self._cursor.rowcount}") - self._cursor.execute(delete_history_sql) - print(f"delete_history_sql:{self._cursor.rowcount}") - total = self._cursor.rowcount - # TODO: consider reporting rows removed and/or replaced in latest table as well - self._cursor.execute(delete_latest_sql) - print(f"delete_latest_sql:{self._cursor.rowcount}") - self._cursor.execute(update_latest_sql) - print(f"update_latest_sql:{self._cursor.rowcount}") - self._connection.commit() - - if total == -1: - # the SQL connector does not support returning number of rows affected (see PEP 249) + computation_as_of_dt, missing_value, missing_stderr, missing_sample_size) + SELECT + epimetric_id, sd.signal_key_id, gd.geo_key_id, issue, data_as_of_dt, + time_type, time_value, `value`, stderr, sample_size, `lag`, value_updated_timestamp, + computation_as_of_dt, missing_value, missing_stderr, missing_sample_size + FROM `{self.load_table}` sl + INNER JOIN signal_dim sd USING (source, `signal`) + INNER JOIN geo_dim gd USING (geo_type, geo_value) + WHERE is_latest_issue = 1 + ON DUPLICATE KEY UPDATE + `epimetric_id` = sl.`epimetric_id`, + `value_updated_timestamp` = sl.`value_updated_timestamp`, + `value` = sl.`value`, + `stderr` = sl.`stderr`, + `sample_size` = sl.`sample_size`, + `issue` = sl.`issue`, + `lag` = sl.`lag`, + `missing_value` = sl.`missing_value`, + `missing_stderr` = sl.`missing_stderr`, + `missing_sample_size` = sl.`missing_sample_size` + ''' + + # NOTE: DO NOT `TRUNCATE` THIS TABLE! doing so will ruin the AUTO_INCREMENT counter that the history and latest tables depend on... 
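The TRUNCATE warning exists because MySQL resets a table's AUTO_INCREMENT counter on TRUNCATE, while the history and latest tables assume `epimetric_id`s only ever grow; `_reset_load_table_ai_counter` earlier in this file repairs that by inserting, then deleting, a filler row keyed just past the maximum id already in `epimetric_full`. A hedged sketch of the same recovery idea with a hypothetical cursor:

def bump_auto_increment(cursor, load_table="epimetric_load", full_table="epimetric_full"):
    # destructive to the load table: it must be empty/expendable, as noted in the patch
    cursor.execute(f"DELETE FROM {load_table}")
    # inserting an explicit id of 1+MAX(id) pushes the AUTO_INCREMENT counter past every id
    # already handed out to the history table; the filler row ('1's) is then removed
    cursor.execute(f"""
        INSERT INTO {load_table}
            (epimetric_id, source, `signal`, geo_type, geo_value,
             time_type, time_value, issue, `lag`, value_updated_timestamp)
        VALUES ((SELECT 1 + MAX(epimetric_id) FROM {full_table}),
                '1', '1', '1', '1', '1', 1, 1, 1, 1)
    """)
    cursor.execute(f"DELETE FROM {load_table}")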
+ epimetric_load_delete_processed = f''' + DELETE FROM `{self.load_table}` + ''' + + logger = get_structured_logger("run_dbjobs") + import time + time_q = [time.time()] + + try: + self._cursor.execute(signal_dim_add_new_load) + time_q.append(time.time()) + logger.debug('signal_dim_add_new_load', rows=self._cursor.rowcount, elapsed=time_q[-1]-time_q[-2]) + + self._cursor.execute(geo_dim_add_new_load) + time_q.append(time.time()) + logger.debug('geo_dim_add_new_load', rows=self._cursor.rowcount, elapsed=time_q[-1]-time_q[-2]) + + self._cursor.execute(epimetric_full_load) + time_q.append(time.time()) + logger.debug('epimetric_full_load', rows=self._cursor.rowcount, elapsed=time_q[-1]-time_q[-2]) + + self._cursor.execute(epimetric_latest_load) + time_q.append(time.time()) + logger.debug('epimetric_latest_load', rows=self._cursor.rowcount, elapsed=time_q[-1]-time_q[-2]) + + self._cursor.execute(epimetric_load_delete_processed) + time_q.append(time.time()) + logger.debug('epimetric_load_delete_processed', rows=self._cursor.rowcount, elapsed=time_q[-1]-time_q[-2]) + except Exception as e: + raise e + + return self + + def delete_batch(self, cc_deletions): + """ + Remove rows specified by a csv file or list of tuples. + + If cc_deletions is a filename, the file should include a header row and use the following field order: + - geo_id + - value (ignored) + - stderr (ignored) + - sample_size (ignored) + - issue (YYYYMMDD format) + - time_value (YYYYMMDD format) + - geo_type + - signal + - source + + If cc_deletions is a list of tuples, the tuples should use the following field order (=same as above, plus time_type): + - geo_id + - value (ignored) + - stderr (ignored) + - sample_size (ignored) + - issue (YYYYMMDD format) + - time_value (YYYYMMDD format) + - geo_type + - signal + - source + - time_type + """ + + tmp_table_name = "tmp_delete_table" + # composite keys: + short_comp_key = "`source`, `signal`, `time_type`, `geo_type`, `time_value`, `geo_value`" + long_comp_key = short_comp_key + ", `issue`" + + create_tmp_table_sql = f''' + CREATE TABLE {tmp_table_name} LIKE {self.load_table}; + ''' + + amend_tmp_table_sql = f''' + ALTER TABLE {tmp_table_name} ADD COLUMN delete_history_id BIGINT UNSIGNED, + ADD COLUMN delete_latest_id BIGINT UNSIGNED, + ADD COLUMN update_latest BINARY(1) DEFAULT 0; + ''' + + load_tmp_table_infile_sql = f''' + LOAD DATA INFILE "{cc_deletions}" + INTO TABLE {tmp_table_name} + FIELDS TERMINATED BY "," + IGNORE 1 LINES + (`geo_value`, `value`, `stderr`, `sample_size`, `issue`, `time_value`, `geo_type`, `signal`, `source`) + SET time_type="day"; + ''' + + load_tmp_table_insert_sql = f''' + INSERT INTO {tmp_table_name} + (`geo_value`, `value`, `stderr`, `sample_size`, `issue`, `time_value`, `geo_type`, `signal`, `source`, `time_type`, + `value_updated_timestamp`, `lag`, `is_latest_issue`) + VALUES + (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, + 0, 0, 0) + ''' + + add_history_id_sql = f''' + UPDATE {tmp_table_name} d INNER JOIN {self.history_view} h USING ({long_comp_key}) + SET d.delete_history_id=h.epimetric_id; + ''' + + # if a row we are deleting also appears in the 'latest' table (with a matching 'issue')... + mark_for_update_latest_sql = f''' + UPDATE {tmp_table_name} d INNER JOIN {self.latest_view} ell USING ({long_comp_key}) + SET d.update_latest=1, d.delete_latest_id=ell.epimetric_id; + ''' + + delete_history_sql = f''' + DELETE h FROM {tmp_table_name} d INNER JOIN {self.history_table} h ON d.delete_history_id=h.epimetric_id; + ''' + + # ...remove it from 'latest'... 
+ delete_latest_sql = f''' + DELETE ell FROM {tmp_table_name} d INNER JOIN {self.latest_table} ell ON d.delete_latest_id=ell.epimetric_id; + ''' + + # ...and re-write that record with its next-latest issue (from 'history') instead. + # NOTE: this must be executed *AFTER* `delete_history_sql` to ensure we get the correct `issue` + # AND also after `delete_latest_sql` so that we dont get a key collision on insert. + update_latest_sql = f''' + INSERT INTO {self.latest_table} + (epimetric_id, + signal_key_id, geo_key_id, time_type, time_value, issue, + value, stderr, sample_size, `lag`, value_updated_timestamp, + missing_value, missing_stderr, missing_sample_size) + SELECT + h.epimetric_id, + h.signal_key_id, h.geo_key_id, h.time_type, h.time_value, h.issue, + h.value, h.stderr, h.sample_size, h.`lag`, h.value_updated_timestamp, + h.missing_value, h.missing_stderr, h.missing_sample_size + FROM {self.history_view} h JOIN ( + SELECT {short_comp_key}, MAX(hh.issue) AS issue + FROM {self.history_view} hh JOIN {tmp_table_name} dd USING ({short_comp_key}) + WHERE dd.update_latest=1 GROUP BY {short_comp_key} + ) d USING ({long_comp_key}); + ''' + + drop_tmp_table_sql = f'DROP TABLE IF EXISTS {tmp_table_name}' + total = None - except Exception as e: - raise e - finally: - self._cursor.execute(drop_tmp_table_sql) - return total - - - def compute_covidcast_meta(self, table_name=None, n_threads=None): - """Compute and return metadata on all COVIDcast signals.""" - logger = get_structured_logger("compute_covidcast_meta") - - if table_name is None: - table_name = self.latest_view - - if n_threads is None: - logger.info("n_threads unspecified, automatically choosing based on number of detected cores...") - n_threads = max(1, cpu_count()*9//10) # aka number of concurrent db connections, which [sh|c]ould be ~<= 90% of the #cores available to SQL server - # NOTE: this may present a small problem if this job runs on different hardware than the db, - # which is why this value can be overriden by optional argument. - logger.info(f"using {n_threads} workers") - - srcsigs = Queue() # multi-consumer threadsafe! 
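`update_latest_sql` above is a groupwise-maximum pattern: group history by the short composite key, take MAX(issue) per group, then join back to pull the full row for that issue. A rough pure-Python analogue, for illustration only (not how the database evaluates it):

def next_latest(history_rows, deleted_keys):
    """history_rows: dicts with a 'key' tuple (source/signal/time/geo fields) and an 'issue'.
    Returns, for each key in deleted_keys, the surviving history row with the largest issue."""
    best = {}
    for row in history_rows:
        k = row["key"]
        if k in deleted_keys and (k not in best or row["issue"] > best[k]["issue"]):
            best[k] = row
    return list(best.values())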
- sql = f'SELECT `source`, `signal` FROM `{table_name}` GROUP BY `source`, `signal` ORDER BY `source` ASC, `signal` ASC;' - self._cursor.execute(sql) - for source, signal in self._cursor: - srcsigs.put((source, signal)) - - inner_sql = f''' - SELECT - `source` AS `data_source`, - `signal`, - `time_type`, - `geo_type`, - MIN(`time_value`) AS `min_time`, - MAX(`time_value`) AS `max_time`, - COUNT(DISTINCT `geo_value`) AS `num_locations`, - MIN(`value`) AS `min_value`, - MAX(`value`) AS `max_value`, - ROUND(AVG(`value`),7) AS `mean_value`, - ROUND(STD(`value`),7) AS `stdev_value`, - MAX(`value_updated_timestamp`) AS `last_update`, - MAX(`issue`) as `max_issue`, - MIN(`lag`) as `min_lag`, - MAX(`lag`) as `max_lag` - FROM - `{table_name}` - WHERE - `source` = %s AND - `signal` = %s - GROUP BY - `time_type`, - `geo_type` - ORDER BY - `time_type` ASC, - `geo_type` ASC - ''' - - meta = [] - meta_lock = threading.Lock() - - def worker(): - name = threading.current_thread().name - logger.info("starting thread", thread=name) - # set up new db connection for thread - worker_dbc = Database() - worker_dbc.connect(connector_impl=self._connector_impl) - w_cursor = worker_dbc._cursor - try: - while True: - (source, signal) = srcsigs.get_nowait() # this will throw the Empty caught below - logger.info("starting pair", thread=name, pair=f"({source}, {signal})") - w_cursor.execute(inner_sql, (source, signal)) - with meta_lock: - meta.extend(list( - dict(zip(w_cursor.column_names, x)) for x in w_cursor - )) - srcsigs.task_done() - except Empty: - logger.info("no jobs left, thread terminating", thread=name) - finally: - worker_dbc.disconnect(False) # cleanup - - threads = [] - for n in range(n_threads): - t = threading.Thread(target=worker, name='MetacacheThread-'+str(n)) - t.start() - threads.append(t) - - srcsigs.join() - logger.info("jobs complete") - for t in threads: - t.join() - logger.info("all threads terminated") - - # sort the metadata because threaded workers dgaf - sorting_fields = "data_source signal time_type geo_type".split() - sortable_fields_fn = lambda x: [(field, x[field]) for field in sorting_fields] - prepended_sortables_fn = lambda x: sortable_fields_fn(x) + list(x.items()) - tuple_representation = list(map(prepended_sortables_fn, meta)) - tuple_representation.sort() - meta = list(map(dict, tuple_representation)) # back to dict form - - return meta - - - def update_covidcast_meta_cache(self, metadata): - """Updates the `covidcast_meta_cache` table.""" - - sql = ''' - UPDATE - `covidcast_meta_cache` - SET - `timestamp` = UNIX_TIMESTAMP(NOW()), - `epidata` = %s - ''' - epidata_json = json.dumps(metadata) - - self._cursor.execute(sql, (epidata_json,)) - - def retrieve_covidcast_meta_cache(self): - """Useful for viewing cache entries (was used in debugging)""" - - sql = ''' - SELECT `epidata` - FROM `covidcast_meta_cache` - ORDER BY `timestamp` DESC - LIMIT 1; - ''' - self._cursor.execute(sql) - cache_json = self._cursor.fetchone()[0] - cache = json.loads(cache_json) - cache_hash = {} - for entry in cache: - cache_hash[(entry['data_source'], entry['signal'], entry['time_type'], entry['geo_type'])] = entry - return cache_hash + try: + self._cursor.execute(drop_tmp_table_sql) + self._cursor.execute(create_tmp_table_sql) + self._cursor.execute(amend_tmp_table_sql) + if isinstance(cc_deletions, str): + self._cursor.execute(load_tmp_table_infile_sql) + elif isinstance(cc_deletions, list): + def split_list(lst, n): + for i in range(0, len(lst), n): + yield lst[i:(i+n)] + for deletions_batch in 
split_list(cc_deletions, 100000): + self._cursor.executemany(load_tmp_table_insert_sql, deletions_batch) + print(f"load_tmp_table_insert_sql:{self._cursor.rowcount}") + else: + raise Exception(f"Bad deletions argument: need a filename or a list of tuples; got a {type(cc_deletions)}") + self._cursor.execute(add_history_id_sql) + print(f"add_history_id_sql:{self._cursor.rowcount}") + self._cursor.execute(mark_for_update_latest_sql) + print(f"mark_for_update_latest_sql:{self._cursor.rowcount}") + self._cursor.execute(delete_history_sql) + print(f"delete_history_sql:{self._cursor.rowcount}") + total = self._cursor.rowcount + # TODO: consider reporting rows removed and/or replaced in latest table as well + self._cursor.execute(delete_latest_sql) + print(f"delete_latest_sql:{self._cursor.rowcount}") + self._cursor.execute(update_latest_sql) + print(f"update_latest_sql:{self._cursor.rowcount}") + self._connection.commit() + + if total == -1: + # the SQL connector does not support returning number of rows affected (see PEP 249) + total = None + except Exception as e: + raise e + finally: + self._cursor.execute(drop_tmp_table_sql) + return total + + def compute_covidcast_meta(self, table_name=None, n_threads=None): + """Compute and return metadata on all COVIDcast signals.""" + logger = get_structured_logger("compute_covidcast_meta") + + if table_name is None: + table_name = self.latest_view + + if n_threads is None: + logger.info("n_threads unspecified, automatically choosing based on number of detected cores...") + # aka number of concurrent db connections, which [sh|c]ould be ~<= 90% of the #cores available to SQL server + n_threads = max(1, cpu_count()*9//10) + # NOTE: this may present a small problem if this job runs on different hardware than the db, + # which is why this value can be overriden by optional argument. + logger.info(f"using {n_threads} workers") + + srcsigs = Queue() # multi-consumer threadsafe! 
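The metadata computation fans a queue of (source, signal) pairs out to worker threads, each holding its own database connection, and merges results under a lock. Roughly the following shape, as a sketch with a hypothetical `query_pair` standing in for the per-pair SQL:

import threading
from queue import Queue, Empty

def parallel_meta(pairs, query_pair, n_threads=4):
    jobs = Queue()
    for pair in pairs:
        jobs.put(pair)
    results, lock = [], threading.Lock()

    def worker():
        # each worker would open its own DB connection here, as compute_covidcast_meta does
        try:
            while True:
                source, signal = jobs.get_nowait()  # raises Empty once the queue is drained
                rows = query_pair(source, signal)
                with lock:
                    results.extend(rows)
                jobs.task_done()
        except Empty:
            pass

    threads = [threading.Thread(target=worker) for _ in range(n_threads)]
    for t in threads:
        t.start()
    jobs.join()
    for t in threads:
        t.join()
    return results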
+ sql = f'SELECT `source`, `signal` FROM `{table_name}` GROUP BY `source`, `signal` ORDER BY `source` ASC, `signal` ASC;' + self._cursor.execute(sql) + for source, signal in self._cursor: + srcsigs.put((source, signal)) + + inner_sql = f''' + SELECT + `source` AS `data_source`, + `signal`, + `time_type`, + `geo_type`, + MIN(`time_value`) AS `min_time`, + MAX(`time_value`) AS `max_time`, + COUNT(DISTINCT `geo_value`) AS `num_locations`, + MIN(`value`) AS `min_value`, + MAX(`value`) AS `max_value`, + ROUND(AVG(`value`),7) AS `mean_value`, + ROUND(STD(`value`),7) AS `stdev_value`, + MAX(`value_updated_timestamp`) AS `last_update`, + MAX(`issue`) as `max_issue`, + MIN(`lag`) as `min_lag`, + MAX(`lag`) as `max_lag` + FROM + `{table_name}` + WHERE + `source` = %s AND + `signal` = %s + GROUP BY + `time_type`, + `geo_type` + ORDER BY + `time_type` ASC, + `geo_type` ASC + ''' + + meta = [] + meta_lock = threading.Lock() + + def worker(): + name = threading.current_thread().name + logger.info("starting thread", thread=name) + # set up new db connection for thread + worker_dbc = Database() + worker_dbc.connect(connector_impl=self._connector_impl) + w_cursor = worker_dbc._cursor + try: + while True: + (source, signal) = srcsigs.get_nowait() # this will throw the Empty caught below + logger.info("starting pair", thread=name, pair=f"({source}, {signal})") + w_cursor.execute(inner_sql, (source, signal)) + with meta_lock: + meta.extend(list( + dict(zip(w_cursor.column_names, x)) for x in w_cursor + )) + srcsigs.task_done() + except Empty: + logger.info("no jobs left, thread terminating", thread=name) + finally: + worker_dbc.disconnect(False) # cleanup + + threads = [] + for n in range(n_threads): + t = threading.Thread(target=worker, name='MetacacheThread-'+str(n)) + t.start() + threads.append(t) + + srcsigs.join() + logger.info("jobs complete") + for t in threads: + t.join() + logger.info("all threads terminated") + + # sort the metadata because threaded workers dgaf + sorting_fields = "data_source signal time_type geo_type".split() + sortable_fields_fn = lambda x: [(field, x[field]) for field in sorting_fields] # noqa + prepended_sortables_fn = lambda x: sortable_fields_fn(x) + list(x.items()) # noqa + tuple_representation = list(map(prepended_sortables_fn, meta)) + tuple_representation.sort() + meta = list(map(dict, tuple_representation)) # back to dict form + + return meta + + def update_covidcast_meta_cache(self, metadata): + """Updates the `covidcast_meta_cache` table.""" + + sql = ''' + UPDATE + `covidcast_meta_cache` + SET + `timestamp` = UNIX_TIMESTAMP(NOW()), + `epidata` = %s + ''' + epidata_json = json.dumps(metadata) + + self._cursor.execute(sql, (epidata_json,)) + + def retrieve_covidcast_meta_cache(self): + """Useful for viewing cache entries (was used in debugging)""" + + sql = ''' + SELECT `epidata` + FROM `covidcast_meta_cache` + ORDER BY `timestamp` DESC + LIMIT 1; + ''' + self._cursor.execute(sql) + cache_json = self._cursor.fetchone()[0] + cache = json.loads(cache_json) + cache_hash = {} + for entry in cache: + cache_hash[(entry['data_source'], entry['signal'], entry['time_type'], entry['geo_type'])] = entry + return cache_hash diff --git a/src/acquisition/covidcast/delete_batch.py b/src/acquisition/covidcast/delete_batch.py index fe40897fd..ece97c233 100644 --- a/src/acquisition/covidcast/delete_batch.py +++ b/src/acquisition/covidcast/delete_batch.py @@ -23,13 +23,15 @@ def get_argument_parser(): help="filename for log output (defaults to stdout)") return parser + def 
handle_file(deletion_file, database, logger): logger.info("Deleting from csv file", filename=deletion_file) rows = [] with open(deletion_file) as f: for line in f: fields = line.strip().split(",") - if len(fields) < 9: continue + if len(fields) < 9: + continue rows.append(fields + ["day"]) rows = rows[1:] try: @@ -41,6 +43,7 @@ def handle_file(deletion_file, database, logger): database.rollback() return 0 + def main(args): """Delete rows from covidcast.""" @@ -64,5 +67,6 @@ def main(args): "Deleted CSVs from database", total_runtime_in_seconds=round(time.time() - start_time, 2), row_count=all_n) + if __name__ == '__main__': main(get_argument_parser().parse_args()) diff --git a/src/acquisition/covidcast/file_archiver.py b/src/acquisition/covidcast/file_archiver.py index 92686f3cf..ab586c248 100644 --- a/src/acquisition/covidcast/file_archiver.py +++ b/src/acquisition/covidcast/file_archiver.py @@ -1,72 +1,76 @@ """Moves files into various archival directories.""" -# standard library + import gzip import os import shutil -# first party from delphi.epidata.acquisition.covidcast.logger import get_structured_logger + class FileArchiver: - """Archives files by moving and compressing.""" - - @staticmethod - def archive_inplace(path, filename, - gzip=gzip, - os=os, - shutil=shutil, - open_impl=open): - return FileArchiver.archive_file(path, path, filename, True, gzip, os, shutil, open_impl) - - @staticmethod - def archive_file( - path_src, - path_dst, - filename, - compress, - gzip=gzip, - os=os, - shutil=shutil, - open_impl=open): - """Archive a file and return the path and `stat` of the destination file. - - WARNING: This is a potentially destructive operation. See details below. - - path_src: the directory which contains the file to be archived - path_dst: the directory into which the file should be moved - filename: the name of the file within `path_src` - compress: gzips the file if true, otherise moves the file unmodified - - The destination directory will be created if necessary. If the destination - file already exists, it will be overwritten. - """ - - logger = get_structured_logger("file_archiver") - src = os.path.join(path_src, filename) - dst = os.path.join(path_dst, filename) - - if compress: - dst += '.gz' - - # make sure the destination directory exists - os.makedirs(path_dst, exist_ok=True) - - if os.path.exists(dst): - # warn that destination is about to be overwritten - logger.warning(event='destination exists, will overwrite', file=dst) - - if compress: - # make a compressed copy - with open_impl(src, 'rb') as f_in: - with gzip.open(dst, 'wb') as f_out: - shutil.copyfileobj(f_in, f_out) - - # delete the original - os.remove(src) - else: - # just move (i.e. rename) the original - shutil.move(src, dst) - - # return filesystem information about the destination file - return (dst, os.stat(dst)) + """Archives files by moving and compressing.""" + + @staticmethod + def archive_inplace( + path, + filename, + gzip=gzip, + os=os, + shutil=shutil, + open_impl=open + ): + return FileArchiver.archive_file(path, path, filename, True, gzip, os, shutil, open_impl) + + @staticmethod + def archive_file( + path_src, + path_dst, + filename, + compress, + gzip=gzip, + os=os, + shutil=shutil, + open_impl=open + ): + """Archive a file and return the path and `stat` of the destination file. + + WARNING: This is a potentially destructive operation. See details below. 
+ + path_src: the directory which contains the file to be archived + path_dst: the directory into which the file should be moved + filename: the name of the file within `path_src` + compress: gzips the file if true, otherise moves the file unmodified + + The destination directory will be created if necessary. If the destination + file already exists, it will be overwritten. + """ + + logger = get_structured_logger("file_archiver") + src = os.path.join(path_src, filename) + dst = os.path.join(path_dst, filename) + + if compress: + dst += '.gz' + + # make sure the destination directory exists + os.makedirs(path_dst, exist_ok=True) + + if os.path.exists(dst): + # warn that destination is about to be overwritten + logger.warning(event='destination exists, will overwrite', file=dst) + + if compress: + # make a compressed copy + with open_impl(src, 'rb') as f_in: + with gzip.open(dst, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + + # delete the original + os.remove(src) + else: + # just move (i.e. rename) the original + shutil.move(src, dst) + + # return filesystem information about the destination file + return (dst, os.stat(dst)) diff --git a/src/acquisition/covidcast/generate_islatest_fix_sql.py b/src/acquisition/covidcast/generate_islatest_fix_sql.py index 115a7d131..e012c9808 100644 --- a/src/acquisition/covidcast/generate_islatest_fix_sql.py +++ b/src/acquisition/covidcast/generate_islatest_fix_sql.py @@ -1,56 +1,60 @@ # what data to operate on base_where_clause = "WHERE `source`='jhu-csse' AND `time_type`='day'" -### base_where_clause = "WHERE `source`='src2' AND `time_type`='day'" ### +# base_where_clause = "WHERE `source`='src2' AND `time_type`='day'" # signal name construction # NOTE: selecting these (unique) from the database takes 7-8 mins, so reconstructing here for efficiency # TODO: maybe just put the damn UNIQUE query in here so you dont fat-finger it again george. -# also these hardcoded signals are unique to JHU data, or are at least not used by all sources. +# also these hardcoded signals are unique to JHU data, or are at least not used by all sources. 
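The nested loops just below build the JHU signal names as a cross product of case, smoothing period, count type, and num/prop. An equivalent sketch using itertools.product (not part of the patch, just a restatement of the same cross product):

from itertools import product

signals = [
    case + period + count + typ
    for case, period, count, typ in product(
        ('confirmed_', 'deaths_'),
        ('7dav_', ''),          # "7dav" = 7-day average
        ('cumulative_', 'incidence_'),
        ('num', 'prop'),
    )
]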
signals = [] for case in ('confirmed_', 'deaths_'): - for period in ('7dav_', ''): # NOTE: that is a V as in "7 Day AVerage", not a "Y" as in "7 DAY" - for count in ('cumulative_', 'incidence_'): - for typ in ('num', 'prop'): - signals.append(case+period+count+typ) -### signals = ['sig2'] ### + for period in ('7dav_', ''): # NOTE: that is a V as in "7 Day AVerage", not a "Y" as in "7 DAY" + for count in ('cumulative_', 'incidence_'): + for typ in ('num', 'prop'): + signals.append(case+period+count+typ) +# signals = ['sig2'] # variable to split on, 'time_value' is good because its high cardinality is suitable for chunking PARTITION_VARIABLE = 'time_value' -PARTITION_SPLITS = [20200101 + i*100 for i in range(10)] # first day of the month for jan - oct 2020 in YYYYMMDD form -### PARTITION_SPLITS = [1,2] ### +PARTITION_SPLITS = [20200101 + i*100 for i in range(10)] # first day of the month for jan - oct 2020 in YYYYMMDD form +# PARTITION_SPLITS = [1,2] -print(''' --- --- run this as: --- python3 generate_islatest_fix_sql.py > islatest_fix.sql --- mysql -vvv -p epidata < islatest_fix.sql --- or: --- date ; (python3 generate_islatest_fix_sql.py | mysql -vvv -p epidata ) ; date --- -''') +print( + ''' + -- + -- run this as: + -- python3 generate_islatest_fix_sql.py > islatest_fix.sql + -- mysql -vvv -p epidata < islatest_fix.sql + -- or: + -- date ; (python3 generate_islatest_fix_sql.py | mysql -vvv -p epidata ) ; date + -- + ''' +) # create temp table print("CREATE TABLE `islatest_fix` (`latest_id` INT(11) NOT NULL, PRIMARY KEY (`latest_id`)) ENGINE=InnoDB DEFAULT CHARSET=utf8;") # find latest issue by partition (and by signal) and save primary ids into temp table for partition_index in range(len(PARTITION_SPLITS)+1): - ge_condition = 'TRUE' if partition_index == 0 else f'`{PARTITION_VARIABLE}` >= {PARTITION_SPLITS[partition_index - 1]}' - l_condition = 'TRUE' if partition_index == len(PARTITION_SPLITS) else f'`{PARTITION_VARIABLE}` < {PARTITION_SPLITS[partition_index]}' - partition_condition = f'({ge_condition}) AND ({l_condition})' - for sig in signals: - where_clause = base_where_clause + " AND `signal`='%s' AND %s" % (sig, partition_condition) - - print(''' -INSERT INTO `islatest_fix` - SELECT id FROM - ( SELECT `source`, `signal`, `time_type`, `geo_type`, `geo_value`, `time_value`, MAX(`issue`) AS `issue` FROM `covidcast` - ''' + where_clause + ''' - GROUP BY `source`, `signal`, `time_type`, `geo_type`, `geo_value`, `time_value` - ) b - LEFT JOIN `covidcast` a - USING (`source`, `signal`, `time_type`, `geo_type`, `geo_value`, `time_value`, `issue`); -''') + ge_condition = 'TRUE' if partition_index == 0 else f'`{PARTITION_VARIABLE}` >= {PARTITION_SPLITS[partition_index - 1]}' + l_condition = 'TRUE' if partition_index == len(PARTITION_SPLITS) else f'`{PARTITION_VARIABLE}` < {PARTITION_SPLITS[partition_index]}' + partition_condition = f'({ge_condition}) AND ({l_condition})' + for sig in signals: + where_clause = base_where_clause + " AND `signal`='%s' AND %s" % (sig, partition_condition) + + print( + ''' + INSERT INTO `islatest_fix` + SELECT id FROM + ( SELECT `source`, `signal`, `time_type`, `geo_type`, `geo_value`, `time_value`, MAX(`issue`) AS `issue` FROM `covidcast` + ''' + where_clause + ''' + GROUP BY `source`, `signal`, `time_type`, `geo_type`, `geo_value`, `time_value` + ) b + LEFT JOIN `covidcast` a + USING (`source`, `signal`, `time_type`, `geo_type`, `geo_value`, `time_value`, `issue`); + ''' + ) # clear any current (potentially erroneous) is_latest_issue flags print("UPDATE 
`covidcast` SET `is_latest_issue`=0 " + base_where_clause + " AND `is_latest_issue`=1;") @@ -60,4 +64,4 @@ # clean up temp table print("-- TODO: drop this table") -print("-- DROP TABLE `islatest_fix`;") \ No newline at end of file +print("-- DROP TABLE `islatest_fix`;") diff --git a/src/acquisition/covidcast/logger.py b/src/acquisition/covidcast/logger.py index ad3b3679f..f69b357e4 100644 --- a/src/acquisition/covidcast/logger.py +++ b/src/acquisition/covidcast/logger.py @@ -3,8 +3,10 @@ import os import sys import threading + import structlog + def handle_exceptions(logger): """Handle exceptions using the provided logger.""" def exception_handler(etype, value, traceback): diff --git a/src/acquisition/covidcast/migrate_epidata_to_v4.py b/src/acquisition/covidcast/migrate_epidata_to_v4.py index f5522337e..1694c684f 100644 --- a/src/acquisition/covidcast/migrate_epidata_to_v4.py +++ b/src/acquisition/covidcast/migrate_epidata_to_v4.py @@ -1,6 +1,7 @@ import argparse import sys import time + from delphi.epidata.acquisition.covidcast.database import Database # run as: @@ -8,9 +9,9 @@ # ("-u" allows unbuffered print statements so we can watch timing in closer-to-real-time) -#####import delphi.operations.secrets as secrets -#####secrets.db.host = '172.30.n.n' # aka 'epidata-db-qa-01' -#####secrets.db.epi = ('delphi', 'xxxxxxxx') +# ####import delphi.operations.secrets as secrets +# ####secrets.db.host = '172.30.n.n' # aka 'epidata-db-qa-01' +# ####secrets.db.epi = ('delphi', 'xxxxxxxx') # ^ these are already set appropriately on qa-automation in/by the operations module ^ @@ -60,37 +61,41 @@ ''' + def start_tx(cursor): - cursor.execute('SET SESSION TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;') - cursor.execute('SET autocommit=0;') # starts a transaction as suggested in https://dev.mysql.com/doc/refman/8.0/en/lock-tables.html - # NOTE: locks must be specified for any aliases of table names that are used - cursor.execute('''LOCK TABLES epidata.covidcast AS cc READ, - epimetric_load WRITE, epimetric_load AS sl WRITE, - epimetric_full WRITE, - epimetric_latest WRITE, - signal_dim WRITE, signal_dim AS sd READ, - geo_dim WRITE, geo_dim AS gd READ;''') - cursor.execute('SET unique_checks=0;') + cursor.execute('SET SESSION TRANSACTION ISOLATION LEVEL READ UNCOMMITTED;') + cursor.execute('SET autocommit=0;') # starts a transaction as suggested in https://dev.mysql.com/doc/refman/8.0/en/lock-tables.html + # NOTE: locks must be specified for any aliases of table names that are used + cursor.execute('''LOCK TABLES epidata.covidcast AS cc READ, + epimetric_load WRITE, epimetric_load AS sl WRITE, + epimetric_full WRITE, + epimetric_latest WRITE, + signal_dim WRITE, signal_dim AS sd READ, + geo_dim WRITE, geo_dim AS gd READ;''') + cursor.execute('SET unique_checks=0;') + def finish_tx(cursor): - cursor.execute('SET unique_checks=1;') - cursor.execute('COMMIT;') - cursor.execute('UNLOCK TABLES;') + cursor.execute('SET unique_checks=1;') + cursor.execute('COMMIT;') + cursor.execute('UNLOCK TABLES;') def do_batches(db, start, upper_lim, batch_size): - # NOTE: upper_lim is not actually selected for ; make sure it exceeds any ids you want to include - batch_lower = start + # NOTE: upper_lim is not actually selected for ; make sure it exceeds any ids you want to include + batch_lower = start - while batch_lower < upper_lim: - batch_upper = min(batch_lower + batch_size, upper_lim) + while batch_lower < upper_lim: + batch_upper = min(batch_lower + batch_size, upper_lim) # NOTE: first rows of column names are 
identical, second rows are for specifying a rename and a literal batch_sql = f""" INSERT INTO epimetric_load ( - `issue`, `source`, `signal`, geo_type, geo_value, time_type, time_value, `value`, stderr, sample_size, `lag`, value_updated_timestamp, is_latest_issue, missing_value, missing_stderr, missing_sample_size + `issue`, `source`, `signal`, geo_type, geo_value, time_type, time_value, `value`, stderr, sample_size, + `lag`, value_updated_timestamp, is_latest_issue, missing_value, missing_stderr, missing_sample_size ) SELECT - `issue`, `source`, `signal`, geo_type, geo_value, time_type, time_value, `value`, stderr, sample_size, `lag`, value_updated_timestamp, is_latest_issue, missing_value, missing_stderr, missing_sample_size + `issue`, `source`, `signal`, geo_type, geo_value, time_type, time_value, `value`, stderr, sample_size, + `lag`, value_updated_timestamp, is_latest_issue, missing_value, missing_stderr, missing_sample_size FROM epidata.covidcast AS cc USE INDEX(`PRIMARY`) WHERE {batch_lower} <= cc.id AND cc.id < {batch_upper}; """ @@ -98,10 +103,10 @@ def do_batches(db, start, upper_lim, batch_size): # TODO: might it be worth adding "ORDER BY id ASC" ? if use_transaction_wrappers: - start_tx(db._cursor) + start_tx(db._cursor) print(f"-=-=-=-=-=-=-=- RUNNING BATCH STARTING AT {batch_lower} -=-=-=-=-=-=-=-") - print(f"-=-=-=-=-=-=-=- RUNNING ''INSERT INTO SELECT FROM''... ", end="") + print("-=-=-=-=-=-=-=- RUNNING ''INSERT INTO SELECT FROM''... ", end="") t = time.time() db._cursor.execute(batch_sql) print(f"elapsed: {time.time()-t} sec, rows: {db._cursor.rowcount} -=-=-=-=-=-=-=-") @@ -114,7 +119,7 @@ def do_batches(db, start, upper_lim, batch_size): t = time.time() db.commit() if use_transaction_wrappers: - finish_tx(db._cursor) + finish_tx(db._cursor) print(f"elapsed: {time.time()-t} sec -=-=-=-=-=-=-=-") print("\n\n") @@ -123,65 +128,65 @@ def do_batches(db, start, upper_lim, batch_size): def main(destination_schema, batch_size, start_id, upper_lim_override): - Database.DATABASE_NAME = destination_schema - db = Database() - db.connect() - if use_autocommit: - db._connection.autocommit = True - - if upper_lim_override: - upper_lim = upper_lim_override - else: - # find upper limit for data to be imported - db._cursor.execute("SELECT MAX(id) FROM epidata.covidcast;") + Database.DATABASE_NAME = destination_schema + db = Database() + db.connect() + if use_autocommit: + db._connection.autocommit = True + + if upper_lim_override: + upper_lim = upper_lim_override + else: + # find upper limit for data to be imported + db._cursor.execute("SELECT MAX(id) FROM epidata.covidcast;") for (max_id,) in db._cursor: - upper_lim = 1 + max_id - - print(f"migrating data to schema '{destination_schema}', with batch size {batch_size} and {start_id} <= ids < {upper_lim}") - if start_id==0: - print("this WILL truncate any existing v4 tables") - print() - if input("type 'yes' to continue: ") != 'yes': - sys.exit('operation cancelled!') - - print(f"starting run at: {time.strftime('%c')}") - - if start_id==0: - # clear tables in the v4 schema - print("truncating tables...") - for table in "epimetric_load epimetric_latest epimetric_full geo_dim signal_dim".split(): - db._cursor.execute(f"TRUNCATE TABLE {table}") - db.commit() - start_id = 1 - - # run batch loop - do_batches(db, start_id, upper_lim, batch_size) - - # get table counts [the quick and dirty way] - print("-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-") - db._cursor.execute(f"SELECT MAX(epimetric_id) FROM epimetric_full;") - for (max_id,) in 
db._cursor: - print(f"epimetric_full: {max_id}") - db._cursor.execute(f"SELECT MAX(epimetric_id) FROM epimetric_latest;") - for (max_id,) in db._cursor: - print(f"epimetric_latest: {max_id} (this should be <= the number above)") - db._cursor.execute(f"SELECT COUNT(signal_key_id), MAX(signal_key_id) FROM signal_dim;") - for (count_id, max_id) in db._cursor: - print(f"signal_dim: count {count_id} / max {max_id}") - db._cursor.execute(f"SELECT COUNT(geo_key_id), MAX(geo_key_id) FROM geo_dim;") - for (count_id, max_id) in db._cursor: - print(f"geo_dim: count {count_id} / max {max_id}") + upper_lim = 1 + max_id + + print(f"migrating data to schema '{destination_schema}', with batch size {batch_size} and {start_id} <= ids < {upper_lim}") + if start_id == 0: + print("this WILL truncate any existing v4 tables") + print() + if input("type 'yes' to continue: ") != 'yes': + sys.exit('operation cancelled!') + + print(f"starting run at: {time.strftime('%c')}") + + if start_id == 0: + # clear tables in the v4 schema + print("truncating tables...") + for table in "epimetric_load epimetric_latest epimetric_full geo_dim signal_dim".split(): + db._cursor.execute(f"TRUNCATE TABLE {table}") + db.commit() + start_id = 1 + + # run batch loop + do_batches(db, start_id, upper_lim, batch_size) + + # get table counts [the quick and dirty way] + print("-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-") + db._cursor.execute("SELECT MAX(epimetric_id) FROM epimetric_full;") + for (max_id,) in db._cursor: + print(f"epimetric_full: {max_id}") + db._cursor.execute("SELECT MAX(epimetric_id) FROM epimetric_latest;") + for (max_id,) in db._cursor: + print(f"epimetric_latest: {max_id} (this should be <= the number above)") + db._cursor.execute("SELECT COUNT(signal_key_id), MAX(signal_key_id) FROM signal_dim;") + for (count_id, max_id) in db._cursor: + print(f"signal_dim: count {count_id} / max {max_id}") + db._cursor.execute("SELECT COUNT(geo_key_id), MAX(geo_key_id) FROM geo_dim;") + for (count_id, max_id) in db._cursor: + print(f"geo_dim: count {count_id} / max {max_id}") - return upper_lim + return upper_lim if __name__ == '__main__': - argparser = argparse.ArgumentParser() - argparser.add_argument('--destination_schema', type=str, default='covid') - argparser.add_argument('--batch_size', type=int, default=20_000_000) - argparser.add_argument('--start_id', type=int, default=0) - argparser.add_argument('--upper_lim_override', type=int) # should default to None - args = argparser.parse_args() - - upper_lim = main(args.destination_schema, args.batch_size, args.start_id, args.upper_lim_override) - print(f"the next execution of this program should include argument: --start_id={upper_lim}") + argparser = argparse.ArgumentParser() + argparser.add_argument('--destination_schema', type=str, default='covid') + argparser.add_argument('--batch_size', type=int, default=20_000_000) + argparser.add_argument('--start_id', type=int, default=0) + argparser.add_argument('--upper_lim_override', type=int) # should default to None + args = argparser.parse_args() + + upper_lim = main(args.destination_schema, args.batch_size, args.start_id, args.upper_lim_override) + print(f"the next execution of this program should include argument: --start_id={upper_lim}") diff --git a/src/acquisition/covidcast/signal_dash_data_generator.py b/src/acquisition/covidcast/signal_dash_data_generator.py index 2e7467487..75246337e 100644 --- a/src/acquisition/covidcast/signal_dash_data_generator.py +++ b/src/acquisition/covidcast/signal_dash_data_generator.py @@ -1,27 
+1,24 @@ """Updates the signal dashboard data.""" -# standard library import argparse +import datetime import sys import time -import datetime -import mysql.connector -import pandas as pd - from dataclasses import dataclass -from epiweeks import Week from typing import List -# first party import covidcast import delphi.operations.secrets as secrets +import mysql.connector +import pandas as pd from delphi.epidata.acquisition.covidcast.logger import get_structured_logger - +from epiweeks import Week LOOKBACK_DAYS_FOR_COVERAGE = 56 BASE_COVIDCAST = covidcast.covidcast.Epidata.BASE_URL[:-len("api.php")] + "covidcast" COVERAGE_URL = f"{BASE_COVIDCAST}/coverage?format=csv&signal={{source}}:{{signal}}&days={LOOKBACK_DAYS_FOR_COVERAGE}" + @dataclass class DashboardSignal: """Container class for information about dashboard signals.""" @@ -150,11 +147,11 @@ def write_coverage( def get_enabled_signals(self) -> List[DashboardSignal]: """Retrieve all enabled signals from the database""" - select_statement = f'''SELECT `id`, + select_statement = f'''SELECT `id`, `name`, `source`, `covidcast_signal`, - `latest_coverage_update`, + `latest_coverage_update`, `latest_status_update` FROM `{Database.SIGNAL_TABLE_NAME}` WHERE `enabled` @@ -203,12 +200,12 @@ def get_coverage(dashboard_signal: DashboardSignal) -> List[DashboardSignalCover try: count_by_geo_type_df["time_value"] = count_by_geo_type_df["time_value"].apply( lambda x: pd.to_datetime(str(x), format="%Y%m%d")) - except: + except: # noqa count_by_geo_type_df["time_value"] = count_by_geo_type_df["time_value"].apply( lambda x: pd.to_datetime(Week(x // 100, x % 100).startdate())) signal_coverage_list = [] - + for _, row in count_by_geo_type_df.iterrows(): signal_coverage = DashboardSignalCoverage( signal_id=dashboard_signal.db_id, diff --git a/src/acquisition/covidcast/test_utils.py b/src/acquisition/covidcast/test_utils.py index 96db2c164..83b2a0d16 100644 --- a/src/acquisition/covidcast/test_utils.py +++ b/src/acquisition/covidcast/test_utils.py @@ -1,15 +1,14 @@ +import unittest from dataclasses import fields from datetime import date from typing import Any, Dict, Iterable, List, Optional, Sequence -import unittest +import delphi.operations.secrets as secrets import pandas as pd - -from delphi_utils import Nans from delphi.epidata.acquisition.covidcast.covidcast_row import CovidcastRow from delphi.epidata.acquisition.covidcast.database import Database from delphi.epidata.server.utils.dates import day_to_time_value, time_value_to_day -import delphi.operations.secrets as secrets +from delphi_utils import Nans # all the Nans we use here are just one value, so this is a shortcut to it: nmv = Nans.NOT_MISSING.value @@ -73,8 +72,8 @@ def covidcast_rows_from_args(sanitize_fields: bool = False, test_mode: bool = Tr Example: covidcast_rows_from_args(value=[1, 2, 3], time_value=[1, 2, 3]) will yield - [CovidcastTestRow.make_default_row(value=1, time_value=1), CovidcastTestRow.make_default_row(value=2, time_value=2), CovidcastTestRow.make_default_row(value=3, time_value=3)] - with all the defaults from CovidcastTestRow. + [CovidcastTestRow.make_default_row(value=1, time_value=1), CovidcastTestRow.make_default_row(value=2, time_value=2), + CovidcastTestRow.make_default_row(value=3, time_value=3)] with all the defaults from CovidcastTestRow. """ # If any iterables were passed instead of lists, convert them to lists. 
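Per the docstring above, `covidcast_rows_from_args` zips the keyword lists positionally and builds one default row per position; conceptually something like the sketch below, with `CovidcastTestRow.make_default_row` taken from this module:

def rows_from_parallel_lists(**kwargs):
    # all keyword values are assumed to be equal-length lists; one row is built per index
    keys = list(kwargs.keys())
    return [
        CovidcastTestRow.make_default_row(**dict(zip(keys, values)))
        for values in zip(*kwargs.values())
    ]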
kwargs = {key: list(value) for key, value in kwargs.items()} @@ -96,7 +95,8 @@ def covidcast_rows_from_records(records: Iterable[dict], sanity_check: bool = Fa You can use csv.DictReader before this to read a CSV file. """ records = list(records) - return [CovidcastTestRow.make_default_row(**record) if not sanity_check else CovidcastTestRow.make_default_row(**record)._sanitize_fields() for record in records] + return [CovidcastTestRow.make_default_row(**record) if not sanity_check else + CovidcastTestRow.make_default_row(**record)._sanitize_fields() for record in records] def covidcast_rows_as_dicts(rows: Iterable[CovidcastTestRow], ignore_fields: Optional[List[str]] = None) -> List[dict]: @@ -168,12 +168,12 @@ def tearDown(self): del self._db def localSetUp(self): - # stub; override in subclasses to perform custom setup. + # stub; override in subclasses to perform custom setup. # runs after tables have been truncated but before database changes have been committed pass def localTearDown(self): - # stub; override in subclasses to perform custom teardown. + # stub; override in subclasses to perform custom teardown. # runs after database changes have been committed pass @@ -181,7 +181,9 @@ def _insert_rows(self, rows: Sequence[CovidcastTestRow]): # inserts rows into the database using the full acquisition process, including 'dbjobs' load into history & latest tables n = self._db.insert_or_update_bulk(rows) print(f"{n} rows added to load table & dispatched to v4 schema") - self._db._connection.commit() # NOTE: this isnt expressly needed for our test cases, but would be if using external access (like through client lib) to ensure changes are visible outside of this db session + # NOTE: this isnt expressly needed for our test cases, but would be if using external access (like through client lib) + # To ensure changes are visible outside of this db session + self._db._connection.commit() def params_from_row(self, row: CovidcastTestRow, **kwargs): ret = { @@ -193,4 +195,4 @@ def params_from_row(self, row: CovidcastTestRow, **kwargs): 'geo_value': row.geo_value, } ret.update(kwargs) - return ret \ No newline at end of file + return ret diff --git a/src/acquisition/ecdc/ecdc_db_update.py b/src/acquisition/ecdc/ecdc_db_update.py index 63689c1d5..5f8fb405c 100644 --- a/src/acquisition/ecdc/ecdc_db_update.py +++ b/src/acquisition/ecdc/ecdc_db_update.py @@ -14,7 +14,7 @@ | Field | Type | Null | Key | Default | Extra | +----------------+-------------+------+-----+---------+----------------+ | id | int(11) | NO | PRI | NULL | auto_increment | -| release_date | date | NO | MUL | NULL | | +| release_date | date | NO | MUL | NULL | | | issue | int(11) | NO | MUL | NULL | | | epiweek | int(11) | NO | MUL | NULL | | | region | varchar(12) | NO | MUL | NULL | | @@ -33,22 +33,20 @@ import argparse import datetime import glob -import subprocess -import random import os +import random +import subprocess -# third party -import mysql.connector - -# first party import delphi.operations.secrets as secrets +import mysql.connector from delphi.epidata.acquisition.ecdc.ecdc_ili import download_ecdc_data -from delphi.utils.epiweek import delta_epiweeks from delphi.utils.epidate import EpiDate +from delphi.utils.epiweek import delta_epiweeks + def ensure_tables_exist(): - (u,p) = secrets.db.epi - cnx = mysql.connector.connect(user=u,password=p,database='epidata') + (u, p) = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database='epidata') try: cursor = cnx.cursor() cursor.execute(''' @@ -62,31 +60,35 
@@ def ensure_tables_exist(): `incidence_rate` DOUBLE NOT NULL, UNIQUE KEY (`issue`, `epiweek`, `region`) ); - '''); + ''') cnx.commit() finally: cnx.close() + def safe_float(f): try: - return float(f.replace(',','')) - except: + return float(f.replace(',', '')) + except: # noqa return 0 + def safe_int(i): try: - return int(i.replace(',','')) - except: + return int(i.replace(',', '')) + except: # noqa return 0 + def get_rows(cnx, table='ecdc_ili'): - # Count and return the number of rows in the `ecdc_ili` table. - select = cnx.cursor() - select.execute('SELECT count(1) num FROM %s' % table) - for (num,) in select: - pass - select.close() - return num + # Count and return the number of rows in the `ecdc_ili` table. + select = cnx.cursor() + select.execute('SELECT count(1) num FROM %s' % table) + for (num,) in select: + pass + select.close() + return num + def update_from_file(issue, date, dir, test_mode=False): # Read ECDC data from CSVs and insert into (or update) the database. @@ -98,12 +100,12 @@ def update_from_file(issue, date, dir, test_mode=False): insert = cnx.cursor() # load the data, ignoring empty rows - files = glob.glob(os.path.join(dir,"*.csv")) + files = glob.glob(os.path.join(dir, "*.csv")) rows = [] for filename in files: - with open(filename,'r') as f: - for l in f: - data = list(map(lambda s: s.strip().replace('"',''),l.split(','))) + with open(filename, 'r') as f: + for l in f: # noqa + data = list(map(lambda s: s.strip().replace('"', ''), l.split(','))) row = {} row['epiweek'] = int(data[1][:4] + data[1][5:]) row['region'] = data[4] @@ -114,25 +116,25 @@ def update_from_file(issue, date, dir, test_mode=False): print(' found %d entries' % len(entries)) sql = ''' - INSERT INTO - `ecdc_ili` (`release_date`, `issue`, `epiweek`, `region`, `lag`, - `incidence_rate`) - VALUES - ('%s', %s, %s, '%s', %s, %s) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, '%s'), - `incidence_rate` = %s + INSERT INTO + `ecdc_ili` (`release_date`, `issue`, `epiweek`, `region`, `lag`, + `incidence_rate`) + VALUES + ('%s', %s, %s, '%s', %s, %s) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, '%s'), + `incidence_rate` = %s ''' for row in entries: lag = delta_epiweeks(row['epiweek'], issue) data_args = [row['incidence_rate']] - insert_args = [date,issue,row['epiweek'],row['region'],lag] + data_args + insert_args = [date, issue, row['epiweek'], row['region'], lag] + data_args update_args = [date] + data_args try: insert.execute(sql % tuple(insert_args + update_args)) - except: + except: # noqa pass # cleanup @@ -143,9 +145,10 @@ def update_from_file(issue, date, dir, test_mode=False): else: cnx.commit() rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2,rows2-rows1)) + print('rows after: %d (added %d)' % (rows2, rows2-rows1)) cnx.close() + def main(): # args and usage parser = argparse.ArgumentParser() @@ -184,27 +187,28 @@ def main(): flag = flag + 1 tmp_dir = ''.join(random.choice('0123456789abcdefghijklmnopqrstuvwxyz') for i in range(8)) tmp_dir = 'downloads_' + tmp_dir - subprocess.call(["mkdir",tmp_dir]) + subprocess.call(["mkdir", tmp_dir]) # Use temporary directory to avoid data from different time # downloaded to same folder download_ecdc_data(download_dir=tmp_dir) issue = EpiDate.today().get_ew() files = glob.glob('%s/*.csv' % tmp_dir) for filename in files: - with open(filename,'r') as f: + with open(filename, 'r') as f: _ = f.readline() db_error = False for filename in files: try: update_from_file(issue, date, filename, 
test_mode=args.test) - subprocess.call(["rm",filename]) - except: + subprocess.call(["rm", filename]) + except: # noqa db_error = True - subprocess.call(["rm","-r",tmp_dir]) + subprocess.call(["rm", "-r", tmp_dir]) if not db_error: - break # Exit loop with success + break # Exit loop with success if flag >= max_tries: print('WARNING: Database `ecdc_ili` did not update successfully') + if __name__ == '__main__': main() diff --git a/src/acquisition/ecdc/ecdc_ili.py b/src/acquisition/ecdc/ecdc_ili.py index 1dd0505d1..8a39a9f6a 100644 --- a/src/acquisition/ecdc/ecdc_ili.py +++ b/src/acquisition/ecdc/ecdc_ili.py @@ -5,42 +5,41 @@ import os import re -import requests import time +import requests from bs4 import BeautifulSoup from selenium import webdriver -from selenium.webdriver.support.ui import Select -from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import Select, WebDriverWait -def download_ecdc_data(download_dir = "downloads"): +def download_ecdc_data(download_dir="downloads"): url = 'https://flunewseurope.org/PrimaryCareData' resp = requests.get(url) soup = BeautifulSoup(resp.content, 'lxml') mydivs = soup.findAll('div') for div in mydivs: dic = div.attrs - if dic.get('class')== ['graph-container'] and dic.get('id')== 'dinfl06': + if dic.get('class') == ['graph-container'] and dic.get('id') == 'dinfl06': break # get new url of the ILI chunck url = div.contents[1].attrs['src'] opts = webdriver.firefox.options.Options() opts.set_headless() fp = webdriver.FirefoxProfile() - fp.set_preference("browser.download.folderList",2) - fp.set_preference("browser.download.manager.showWhenStarting",False) - fp.set_preference("browser.download.dir",os.path.abspath(download_dir)) - fp.set_preference("browser.helperApps.neverAsk.saveToDisk","text/csv") + fp.set_preference("browser.download.folderList", 2) + fp.set_preference("browser.download.manager.showWhenStarting", False) + fp.set_preference("browser.download.dir", os.path.abspath(download_dir)) + fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv") try: - driver = webdriver.Firefox(options=opts,firefox_profile=fp) + driver = webdriver.Firefox(options=opts, firefox_profile=fp) driver.get(url) for i in range(2, 54): # select country try: - WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'fluNewsReportViewer_ctl04_ctl03_ddValue'))) + WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.ID, 'fluNewsReportViewer_ctl04_ctl03_ddValue'))) Select(driver.find_element_by_tag_name('select')).select_by_value(str(i)) time.sleep(3) soup = BeautifulSoup(driver.page_source, 'html.parser') @@ -53,18 +52,18 @@ def download_ecdc_data(download_dir = "downloads"): break if type(ind) == str: # select clinical tyle - WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'fluNewsReportViewer_ctl04_ctl05_ddValue'))) + WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.ID, 'fluNewsReportViewer_ctl04_ctl05_ddValue'))) Select(driver.find_element_by_id('fluNewsReportViewer_ctl04_ctl05_ddValue')).select_by_value(ind) - WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'btnSelectExportType'))) + WebDriverWait(driver, 30).until(EC.element_to_be_clickable((By.ID, 'btnSelectExportType'))) driver.find_element_by_id('btnSelectExportType').click() - WebDriverWait(driver,30).until(EC.element_to_be_clickable((By.ID,'btnExportToCsv'))) + WebDriverWait(driver, 
30).until(EC.element_to_be_clickable((By.ID, 'btnExportToCsv'))) driver.find_element_by_id('btnExportToCsv').click() time.sleep(3) - except: + except: # noqa driver.get(url) - except: + except: # noqa print('WARNING: ECDC Scraper may not have downloaded all of the available data.') - #cleanup + # cleanup os.system('''pkill "firefox" ''') os.system('''pkill "(firefox-bin)"''') os.system('''pkill "geckodriver*"''') diff --git a/src/acquisition/flusurv/flusurv.py b/src/acquisition/flusurv/flusurv.py index 6b8d247ae..ad69f9fa3 100644 --- a/src/acquisition/flusurv/flusurv.py +++ b/src/acquisition/flusurv/flusurv.py @@ -35,182 +35,178 @@ + initial version """ -# standard library -from datetime import datetime + import json import time +from datetime import datetime -# third party import requests - -# first party from delphi.utils.epidate import EpiDate - # all currently available FluSurv locations and their associated codes # the number pair represents NetworkID and CatchmentID location_codes = { - 'CA': (2, 1), - 'CO': (2, 2), - 'CT': (2, 3), - 'GA': (2, 4), - 'IA': (3, 5), - 'ID': (3, 6), - 'MD': (2, 7), - 'MI': (3, 8), - 'MN': (2, 9), - 'NM': (2, 11), - 'NY_albany': (2, 13), - 'NY_rochester': (2, 14), - 'OH': (3, 15), - 'OK': (3, 16), - 'OR': (2, 17), - 'RI': (3, 18), - 'SD': (3, 19), - 'TN': (2, 20), - 'UT': (3, 21), - 'network_all': (1, 22), - 'network_eip': (2, 22), - 'network_ihsp': (3, 22), + 'CA': (2, 1), + 'CO': (2, 2), + 'CT': (2, 3), + 'GA': (2, 4), + 'IA': (3, 5), + 'ID': (3, 6), + 'MD': (2, 7), + 'MI': (3, 8), + 'MN': (2, 9), + 'NM': (2, 11), + 'NY_albany': (2, 13), + 'NY_rochester': (2, 14), + 'OH': (3, 15), + 'OK': (3, 16), + 'OR': (2, 17), + 'RI': (3, 18), + 'SD': (3, 19), + 'TN': (2, 20), + 'UT': (3, 21), + 'network_all': (1, 22), + 'network_eip': (2, 22), + 'network_ihsp': (3, 22), } def fetch_json(path, payload, call_count=1, requests_impl=requests): - """Send a request to the server and return the parsed JSON response.""" - - # it's polite to self-identify this "bot" - delphi_url = 'https://delphi.cmu.edu/index.html' - user_agent = 'Mozilla/5.0 (compatible; delphibot/1.0; +%s)' % delphi_url - - # the FluSurv AMF server - flusurv_url = 'https://gis.cdc.gov/GRASP/Flu3/' + path - - # request headers - headers = { - 'Accept-Encoding': 'gzip', - 'User-Agent': user_agent, - } - if payload is not None: - headers['Content-Type'] = 'application/json;charset=UTF-8' - - # send the request and read the response - if payload is None: - method = requests_impl.get - data = None - else: - method = requests_impl.post - data = json.dumps(payload) - resp = method(flusurv_url, headers=headers, data=data) - - # check the HTTP status code - if resp.status_code == 500 and call_count <= 2: - # the server often fails with this status, so wait and retry - delay = 10 * call_count - print('got status %d, will retry in %d sec...' 
% (resp.status_code, delay)) - time.sleep(delay) - return fetch_json(path, payload, call_count=call_count + 1) - elif resp.status_code != 200: - raise Exception(['status code != 200', resp.status_code]) - - # check response mime type - if 'application/json' not in resp.headers.get('Content-Type', ''): - raise Exception('response is not json') - - # return the decoded json object - return resp.json() + """Send a request to the server and return the parsed JSON response.""" + + # it's polite to self-identify this "bot" + delphi_url = 'https://delphi.cmu.edu/index.html' + user_agent = 'Mozilla/5.0 (compatible; delphibot/1.0; +%s)' % delphi_url + + # the FluSurv AMF server + flusurv_url = 'https://gis.cdc.gov/GRASP/Flu3/' + path + + # request headers + headers = { + 'Accept-Encoding': 'gzip', + 'User-Agent': user_agent, + } + if payload is not None: + headers['Content-Type'] = 'application/json;charset=UTF-8' + + # send the request and read the response + if payload is None: + method = requests_impl.get + data = None + else: + method = requests_impl.post + data = json.dumps(payload) + resp = method(flusurv_url, headers=headers, data=data) + + # check the HTTP status code + if resp.status_code == 500 and call_count <= 2: + # the server often fails with this status, so wait and retry + delay = 10 * call_count + print('got status %d, will retry in %d sec...' % (resp.status_code, delay)) + time.sleep(delay) + return fetch_json(path, payload, call_count=call_count + 1) + elif resp.status_code != 200: + raise Exception(['status code != 200', resp.status_code]) + + # check response mime type + if 'application/json' not in resp.headers.get('Content-Type', ''): + raise Exception('response is not json') + + # return the decoded json object + return resp.json() def fetch_flusurv_object(location_code): - """Return decoded FluSurv JSON object for the given location.""" - return fetch_json('PostPhase03GetData', { - 'appversion': 'Public', - 'networkid': location_code[0], - 'cacthmentid': location_code[1], - }) + """Return decoded FluSurv JSON object for the given location.""" + return fetch_json('PostPhase03GetData', { + 'appversion': 'Public', + 'networkid': location_code[0], + 'cacthmentid': location_code[1], + }) def mmwrid_to_epiweek(mmwrid): - """Convert a CDC week index into an epiweek.""" + """Convert a CDC week index into an epiweek.""" - # Add the difference in IDs, which are sequential, to a reference epiweek, - # which is 2003w40 in this case. - epiweek_200340 = EpiDate(2003, 9, 28) - mmwrid_200340 = 2179 - return epiweek_200340.add_weeks(mmwrid - mmwrid_200340).get_ew() + # Add the difference in IDs, which are sequential, to a reference epiweek, + # which is 2003w40 in this case. + epiweek_200340 = EpiDate(2003, 9, 28) + mmwrid_200340 = 2179 + return epiweek_200340.add_weeks(mmwrid - mmwrid_200340).get_ew() def extract_from_object(data_in): - """ - Given a FluSurv data object, return hospitaliation rates. - - The returned object is indexed first by epiweek, then by zero-indexed age - group. 
- """ - - # an object to hold the result - data_out = {} - - # iterate over all seasons and age groups - for obj in data_in['busdata']['dataseries']: - if obj['age'] in (10, 11, 12): - # TODO(https://github.com/cmu-delphi/delphi-epidata/issues/242): - # capture as-of-yet undefined age groups 10, 11, and 12 - continue - age_index = obj['age'] - 1 - # iterage over weeks - for mmwrid, _, _, rate in obj['data']: - epiweek = mmwrid_to_epiweek(mmwrid) - if epiweek not in data_out: - # weekly rate of each age group - data_out[epiweek] = [None] * 9 - prev_rate = data_out[epiweek][age_index] - if prev_rate is None: - # this is the first time to see a rate for this epiweek/age - data_out[epiweek][age_index] = rate - elif prev_rate != rate: - # a different rate was already found for this epiweek/age - format_args = (epiweek, obj['age'], prev_rate, rate) - print('warning: %d %d %f != %f' % format_args) - - # sanity check the result - if len(data_out) == 0: - raise Exception('no data found') - - # print the result and return flu data - print('found data for %d weeks' % len(data_out)) - return data_out + """ + Given a FluSurv data object, return hospitaliation rates. + + The returned object is indexed first by epiweek, then by zero-indexed age + group. + """ + + # an object to hold the result + data_out = {} + + # iterate over all seasons and age groups + for obj in data_in['busdata']['dataseries']: + if obj['age'] in (10, 11, 12): + # TODO(https://github.com/cmu-delphi/delphi-epidata/issues/242): + # capture as-of-yet undefined age groups 10, 11, and 12 + continue + age_index = obj['age'] - 1 + # iterage over weeks + for mmwrid, _, _, rate in obj['data']: + epiweek = mmwrid_to_epiweek(mmwrid) + if epiweek not in data_out: + # weekly rate of each age group + data_out[epiweek] = [None] * 9 + prev_rate = data_out[epiweek][age_index] + if prev_rate is None: + # this is the first time to see a rate for this epiweek/age + data_out[epiweek][age_index] = rate + elif prev_rate != rate: + # a different rate was already found for this epiweek/age + format_args = (epiweek, obj['age'], prev_rate, rate) + print('warning: %d %d %f != %f' % format_args) + + # sanity check the result + if len(data_out) == 0: + raise Exception('no data found') + + # print the result and return flu data + print('found data for %d weeks' % len(data_out)) + return data_out def get_data(location_code): - """ - Fetch and parse flu data for the given location. + """ + Fetch and parse flu data for the given location. 
- This method performs the following operations: - - fetches FluSurv data from CDC - - extracts and returns hospitaliation rates - """ + This method performs the following operations: + - fetches FluSurv data from CDC + - extracts and returns hospitaliation rates + """ - # fetch - print('[fetching flusurv data...]') - data_in = fetch_flusurv_object(location_code) + # fetch + print('[fetching flusurv data...]') + data_in = fetch_flusurv_object(location_code) - # extract - print('[extracting values...]') - data_out = extract_from_object(data_in) + # extract + print('[extracting values...]') + data_out = extract_from_object(data_in) - # return - print('[scraped successfully]') - return data_out + # return + print('[scraped successfully]') + return data_out def get_current_issue(): - """Scrape the current issue from the FluSurv main page.""" + """Scrape the current issue from the FluSurv main page.""" - # fetch - data = fetch_json('GetPhase03InitApp?appVersion=Public', None) + # fetch + data = fetch_json('GetPhase03InitApp?appVersion=Public', None) - # extract - date = datetime.strptime(data['loaddatetime'], '%b %d, %Y') + # extract + date = datetime.strptime(data['loaddatetime'], '%b %d, %Y') - # convert and return - return EpiDate(date.year, date.month, date.day).get_ew() + # convert and return + return EpiDate(date.year, date.month, date.day).get_ew() diff --git a/src/acquisition/flusurv/flusurv_update.py b/src/acquisition/flusurv/flusurv_update.py index 35fadba05..1ce35e868 100644 --- a/src/acquisition/flusurv/flusurv_update.py +++ b/src/acquisition/flusurv/flusurv_update.py @@ -68,122 +68,119 @@ + initial version """ -# standard library + import argparse -# third party +import delphi.operations.secrets as secrets import mysql.connector - -# first party from delphi.epidata.acquisition.flusurv import flusurv -import delphi.operations.secrets as secrets from delphi.utils.epidate import EpiDate from delphi.utils.epiweek import delta_epiweeks def get_rows(cur): - """Return the number of rows in the `flusurv` table.""" + """Return the number of rows in the `flusurv` table.""" - # count all rows - cur.execute('SELECT count(1) `num` FROM `flusurv`') - for (num,) in cur: - return num + # count all rows + cur.execute('SELECT count(1) `num` FROM `flusurv`') + for (num,) in cur: + return num def update(issue, location_name, test_mode=False): - """Fetch and store the currently avialble weekly FluSurv dataset.""" - - # fetch data - location_code = flusurv.location_codes[location_name] - print('fetching data for', location_name, location_code) - data = flusurv.get_data(location_code) - - # metadata - epiweeks = sorted(data.keys()) - location = location_name - release_date = str(EpiDate.today()) - - # connect to the database - u, p = secrets.db.epi - cnx = mysql.connector.connect( - host=secrets.db.host, user=u, password=p, database='epidata') - cur = cnx.cursor() - rows1 = get_rows(cur) - print('rows before: %d' % rows1) - - # SQL for insert/update - sql = ''' - INSERT INTO `flusurv` ( - `release_date`, `issue`, `epiweek`, `location`, `lag`, `rate_age_0`, - `rate_age_1`, `rate_age_2`, `rate_age_3`, `rate_age_4`, `rate_overall`, - `rate_age_5`, `rate_age_6`, `rate_age_7` - ) - VALUES ( - %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s - ) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, %s), - `rate_age_0` = coalesce(%s, `rate_age_0`), - `rate_age_1` = coalesce(%s, `rate_age_1`), - `rate_age_2` = coalesce(%s, `rate_age_2`), - `rate_age_3` = coalesce(%s, `rate_age_3`), - 
`rate_age_4` = coalesce(%s, `rate_age_4`), - `rate_overall` = coalesce(%s, `rate_overall`), - `rate_age_5` = coalesce(%s, `rate_age_5`), - `rate_age_6` = coalesce(%s, `rate_age_6`), - `rate_age_7` = coalesce(%s, `rate_age_7`) - ''' - - # insert/update each row of data (one per epiweek) - for epiweek in epiweeks: - lag = delta_epiweeks(epiweek, issue) - if lag > 52: - # Ignore values older than one year, as (1) they are assumed not to - # change, and (2) it would adversely affect database performance if all - # values (including duplicates) were stored on each run. - continue - args_meta = [release_date, issue, epiweek, location, lag] - args_insert = data[epiweek] - args_update = [release_date] + data[epiweek] - cur.execute(sql, tuple(args_meta + args_insert + args_update)) - - # commit and disconnect - rows2 = get_rows(cur) - print('rows after: %d (+%d)' % (rows2, rows2 - rows1)) - cur.close() - if test_mode: - print('test mode: not committing database changes') - else: - cnx.commit() - cnx.close() + """Fetch and store the currently avialble weekly FluSurv dataset.""" + + # fetch data + location_code = flusurv.location_codes[location_name] + print('fetching data for', location_name, location_code) + data = flusurv.get_data(location_code) + + # metadata + epiweeks = sorted(data.keys()) + location = location_name + release_date = str(EpiDate.today()) + + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect( + host=secrets.db.host, user=u, password=p, database='epidata') + cur = cnx.cursor() + rows1 = get_rows(cur) + print('rows before: %d' % rows1) + + # SQL for insert/update + sql = ''' + INSERT INTO `flusurv` ( + `release_date`, `issue`, `epiweek`, `location`, `lag`, `rate_age_0`, + `rate_age_1`, `rate_age_2`, `rate_age_3`, `rate_age_4`, `rate_overall`, + `rate_age_5`, `rate_age_6`, `rate_age_7` + ) + VALUES ( + %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s + ) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, %s), + `rate_age_0` = coalesce(%s, `rate_age_0`), + `rate_age_1` = coalesce(%s, `rate_age_1`), + `rate_age_2` = coalesce(%s, `rate_age_2`), + `rate_age_3` = coalesce(%s, `rate_age_3`), + `rate_age_4` = coalesce(%s, `rate_age_4`), + `rate_overall` = coalesce(%s, `rate_overall`), + `rate_age_5` = coalesce(%s, `rate_age_5`), + `rate_age_6` = coalesce(%s, `rate_age_6`), + `rate_age_7` = coalesce(%s, `rate_age_7`) + ''' + + # insert/update each row of data (one per epiweek) + for epiweek in epiweeks: + lag = delta_epiweeks(epiweek, issue) + if lag > 52: + # Ignore values older than one year, as (1) they are assumed not to + # change, and (2) it would adversely affect database performance if all + # values (including duplicates) were stored on each run. + continue + args_meta = [release_date, issue, epiweek, location, lag] + args_insert = data[epiweek] + args_update = [release_date] + data[epiweek] + cur.execute(sql, tuple(args_meta + args_insert + args_update)) + + # commit and disconnect + rows2 = get_rows(cur) + print('rows after: %d (+%d)' % (rows2, rows2 - rows1)) + cur.close() + if test_mode: + print('test mode: not committing database changes') + else: + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument( - 'location', - help='location for which data should be scraped (e.g. 
"CA" or "all")' - ) - parser.add_argument( - '--test', '-t', - default=False, action='store_true', help='do not commit database changes' - ) - args = parser.parse_args() - - # scrape current issue from the main page - issue = flusurv.get_current_issue() - print('current issue: %d' % issue) - - # fetch flusurv data - if args.location == 'all': - # all locations - for location in flusurv.location_codes.keys(): - update(issue, location, args.test) - else: - # single location - update(issue, args.location, args.test) + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument( + 'location', + help='location for which data should be scraped (e.g. "CA" or "all")' + ) + parser.add_argument( + '--test', '-t', + default=False, action='store_true', help='do not commit database changes' + ) + args = parser.parse_args() + + # scrape current issue from the main page + issue = flusurv.get_current_issue() + print('current issue: %d' % issue) + + # fetch flusurv data + if args.location == 'all': + # all locations + for location in flusurv.location_codes.keys(): + update(issue, location, args.test) + else: + # single location + update(issue, args.location, args.test) if __name__ == '__main__': - main() + main() diff --git a/src/acquisition/fluview/fluview.py b/src/acquisition/fluview/fluview.py index d723cbc59..c1b04c2e5 100644 --- a/src/acquisition/fluview/fluview.py +++ b/src/acquisition/fluview/fluview.py @@ -4,11 +4,11 @@ =============== Fetches ILINet data (surveillance of outpatient influenza-like illness) from -CDC. +CDC. -This script provides functions for first fetching metadata from Fluview which -are then used to build a request that will get all data for the different tier -types (national, hhs regions, census divisions and states). This data is +This script provides functions for first fetching metadata from Fluview which +are then used to build a request that will get all data for the different tier +types (national, hhs regions, census divisions and states). This data is downloaded as one zip file per tier type (locally). This file replaces scrape_flu_data.sh, which performed a similar function for @@ -21,196 +21,195 @@ Changes: - 10/03/18: added field for 'WHO_NREVSS' data to download data from clinical - labs as well as public health labs. + labs as well as public health labs. """ -# standard library + import datetime import os import time -# third party import requests class Key: - """ - Constants for navigating the metadata object contained in the web response - from CDC. - """ + """ + Constants for navigating the metadata object contained in the web response + from CDC. 
+ """ - class TierType: - nat = 'National' - hhs = 'HHS Regions' - cen = 'Census Divisions' - sta = 'State' + class TierType: + nat = 'National' + hhs = 'HHS Regions' + cen = 'Census Divisions' + sta = 'State' - class TierListEntry: - hhs = 'hhsregion' - cen = 'censusregions' - sta = 'states' + class TierListEntry: + hhs = 'hhsregion' + cen = 'censusregions' + sta = 'states' - class TierIdEntry: - hhs = 'hhsregionid' - cen = 'censusregionid' - sta = 'stateid' + class TierIdEntry: + hhs = 'hhsregionid' + cen = 'censusregionid' + sta = 'stateid' def check_status(resp, status, content_type): - """Raise an exception if the status code or content type is unexpected.""" - if resp.status_code != status: - raise Exception('got unexpected status code: ' + str(resp.status_code)) - actual_type = resp.headers.get('Content-Type', None) - if actual_type is None or content_type not in actual_type.lower(): - raise Exception('got unexpected content type: ' + str(actual_type)) + """Raise an exception if the status code or content type is unexpected.""" + if resp.status_code != status: + raise Exception('got unexpected status code: ' + str(resp.status_code)) + actual_type = resp.headers.get('Content-Type', None) + if actual_type is None or content_type not in actual_type.lower(): + raise Exception('got unexpected content type: ' + str(actual_type)) def fetch_metadata(sess): - """ - Return metadata indicating the current issue and also numeric constants - representing the various locations. - """ - url = 'https://gis.cdc.gov/grasp/flu2/GetPhase02InitApp?appVersion=Public' - resp = sess.get(url) - check_status(resp, 200, 'application/json') - return resp.json() + """ + Return metadata indicating the current issue and also numeric constants + representing the various locations. 
+ """ + url = 'https://gis.cdc.gov/grasp/flu2/GetPhase02InitApp?appVersion=Public' + resp = sess.get(url) + check_status(resp, 200, 'application/json') + return resp.json() def get_issue_and_locations(data): - """Extract the issue and per-tier location lists from the metadata object.""" - - def get_tier_ids(name): - for row in data['regiontypes']: - if row['description'] == name: - return row['regiontypeid'] - raise Exception() - - tier_ids = dict((name, get_tier_ids(name)) for name in ( - Key.TierType.nat, - Key.TierType.hhs, - Key.TierType.cen, - Key.TierType.sta, - )) - - location_ids = { - Key.TierType.nat: [0], - Key.TierType.hhs: [], - Key.TierType.cen: [], - Key.TierType.sta: [], - } - - # add location ids for HHS - for row in data[Key.TierListEntry.hhs]: - location_ids[Key.TierType.hhs].append(row[Key.TierIdEntry.hhs]) - location_ids[Key.TierType.hhs] = sorted(set(location_ids[Key.TierType.hhs])) - num = len(location_ids[Key.TierType.hhs]) - if num != 10: - raise Exception('expected 10 hhs regions, found %d' % num) - - # add location ids for census divisions - for row in data[Key.TierListEntry.cen]: - location_ids[Key.TierType.cen].append(row[Key.TierIdEntry.cen]) - location_ids[Key.TierType.cen] = sorted(set(location_ids[Key.TierType.cen])) - num = len(location_ids[Key.TierType.cen]) - if num != 9: - raise Exception('expected 9 census divisions, found %d' % num) - - # add location ids for states - for row in data[Key.TierListEntry.sta]: - location_ids[Key.TierType.sta].append(row[Key.TierIdEntry.sta]) - location_ids[Key.TierType.sta] = sorted(set(location_ids[Key.TierType.sta])) - num = len(location_ids[Key.TierType.sta]) - if num != 57: - raise Exception('expected 57 states/territories/cities, found %d' % num) - - # return a useful subset of the metadata - # (latest epiweek, latest season, tier ids, location ids) - return { - 'epiweek': data['mmwr'][-1]['yearweek'], - 'season_id': data['mmwr'][-1]['seasonid'], - 'tier_ids': tier_ids, - 'location_ids': location_ids, - } + """Extract the issue and per-tier location lists from the metadata object.""" + + def get_tier_ids(name): + for row in data['regiontypes']: + if row['description'] == name: + return row['regiontypeid'] + raise Exception() + + tier_ids = dict((name, get_tier_ids(name)) for name in ( + Key.TierType.nat, + Key.TierType.hhs, + Key.TierType.cen, + Key.TierType.sta, + )) + + location_ids = { + Key.TierType.nat: [0], + Key.TierType.hhs: [], + Key.TierType.cen: [], + Key.TierType.sta: [], + } + + # add location ids for HHS + for row in data[Key.TierListEntry.hhs]: + location_ids[Key.TierType.hhs].append(row[Key.TierIdEntry.hhs]) + location_ids[Key.TierType.hhs] = sorted(set(location_ids[Key.TierType.hhs])) + num = len(location_ids[Key.TierType.hhs]) + if num != 10: + raise Exception('expected 10 hhs regions, found %d' % num) + + # add location ids for census divisions + for row in data[Key.TierListEntry.cen]: + location_ids[Key.TierType.cen].append(row[Key.TierIdEntry.cen]) + location_ids[Key.TierType.cen] = sorted(set(location_ids[Key.TierType.cen])) + num = len(location_ids[Key.TierType.cen]) + if num != 9: + raise Exception('expected 9 census divisions, found %d' % num) + + # add location ids for states + for row in data[Key.TierListEntry.sta]: + location_ids[Key.TierType.sta].append(row[Key.TierIdEntry.sta]) + location_ids[Key.TierType.sta] = sorted(set(location_ids[Key.TierType.sta])) + num = len(location_ids[Key.TierType.sta]) + if num != 57: + raise Exception('expected 57 states/territories/cities, found %d' % 
num) + + # return a useful subset of the metadata + # (latest epiweek, latest season, tier ids, location ids) + return { + 'epiweek': data['mmwr'][-1]['yearweek'], + 'season_id': data['mmwr'][-1]['seasonid'], + 'tier_ids': tier_ids, + 'location_ids': location_ids, + } def download_data(tier_id, location_ids, season_ids, filename): - """Download zipped ILINet data for the given locations and seasons.""" - - def get_entry(num, name=None): - return {'ID': num, 'Name': (name if name else num)} - - # download the data (in memory) - url = 'https://gis.cdc.gov/grasp/flu2/PostPhase02DataDownload' - data = { - 'AppVersion': 'Public', - 'DatasourceDT': [get_entry(1, 'ILINet'), get_entry(0, 'WHO_NREVSS')], - 'RegionTypeId': tier_id, - 'SubRegionsDT': [get_entry(loc) for loc in sorted(location_ids)], - 'SeasonsDT': [get_entry(season) for season in sorted(season_ids)], - } - resp = requests.post(url, json=data) - check_status(resp, 200, 'application/octet-stream') - payload = resp.content - - # save the data to file and return the file length - with open(filename, 'wb') as f: - f.write(payload) - return len(payload) + """Download zipped ILINet data for the given locations and seasons.""" + + def get_entry(num, name=None): + return {'ID': num, 'Name': (name if name else num)} + + # download the data (in memory) + url = 'https://gis.cdc.gov/grasp/flu2/PostPhase02DataDownload' + data = { + 'AppVersion': 'Public', + 'DatasourceDT': [get_entry(1, 'ILINet'), get_entry(0, 'WHO_NREVSS')], + 'RegionTypeId': tier_id, + 'SubRegionsDT': [get_entry(loc) for loc in sorted(location_ids)], + 'SeasonsDT': [get_entry(season) for season in sorted(season_ids)], + } + resp = requests.post(url, json=data) + check_status(resp, 200, 'application/octet-stream') + payload = resp.content + + # save the data to file and return the file length + with open(filename, 'wb') as f: + f.write(payload) + return len(payload) def save_latest(path=None): - """ - Save the latest two seasons of data for all locations, separately for each - location tier (i.e. national, HHS, census, and states). 
- """ - - # set up the session - sess = requests.session() - sess.headers.update({ - # it's polite to self-identify this "bot" - 'User-Agent': 'delphibot/1.0 (+https://delphi.cmu.edu/)', - }) - - # get metatdata - print('looking up ilinet metadata') - data = fetch_metadata(sess) - info = get_issue_and_locations(data) - issue = info['epiweek'] - print('current issue: %d' % issue) - - # establish timing - dt = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') - current_season = info['season_id'] - seasons = [s for s in range(current_season - 1, current_season + 1)] - - # make the destination path if it doesn't already exist - if path is not None: - os.makedirs(path, exist_ok=True) - - # download the data file for each tier - files = [] - for delphi_name, cdc_name in ( - ('nat', Key.TierType.nat), - ('hhs', Key.TierType.hhs), - ('cen', Key.TierType.cen), - ('sta', Key.TierType.sta), - ): - name = 'ilinet_%s_%d_%s.zip' % (delphi_name, issue, dt) - if path is None: - filename = name - else: - filename = os.path.join(path, name) - tier_id = info['tier_ids'][cdc_name] - locations = info['location_ids'][cdc_name] - - # download and show timing information - print('downloading %s' % delphi_name) - t0 = time.time() - size = download_data(tier_id, locations, seasons, filename) - t1 = time.time() - - print(' saved %s (%d bytes in %.1f seconds)' % (filename, size, t1 - t0)) - files.append(filename) - - # return the current issue and the list of downloaded files - return issue, files + """ + Save the latest two seasons of data for all locations, separately for each + location tier (i.e. national, HHS, census, and states). + """ + + # set up the session + sess = requests.session() + sess.headers.update({ + # it's polite to self-identify this "bot" + 'User-Agent': 'delphibot/1.0 (+https://delphi.cmu.edu/)', + }) + + # get metatdata + print('looking up ilinet metadata') + data = fetch_metadata(sess) + info = get_issue_and_locations(data) + issue = info['epiweek'] + print('current issue: %d' % issue) + + # establish timing + dt = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + current_season = info['season_id'] + seasons = [s for s in range(current_season - 1, current_season + 1)] + + # make the destination path if it doesn't already exist + if path is not None: + os.makedirs(path, exist_ok=True) + + # download the data file for each tier + files = [] + for delphi_name, cdc_name in ( + ('nat', Key.TierType.nat), + ('hhs', Key.TierType.hhs), + ('cen', Key.TierType.cen), + ('sta', Key.TierType.sta), + ): + name = 'ilinet_%s_%d_%s.zip' % (delphi_name, issue, dt) + if path is None: + filename = name + else: + filename = os.path.join(path, name) + tier_id = info['tier_ids'][cdc_name] + locations = info['location_ids'][cdc_name] + + # download and show timing information + print('downloading %s' % delphi_name) + t0 = time.time() + size = download_data(tier_id, locations, seasons, filename) + t1 = time.time() + + print(' saved %s (%d bytes in %.1f seconds)' % (filename, size, t1 - t0)) + files.append(filename) + + # return the current issue and the list of downloaded files + return issue, files diff --git a/src/acquisition/fluview/fluview_locations.py b/src/acquisition/fluview/fluview_locations.py index 9c851bc6f..0b38c355f 100644 --- a/src/acquisition/fluview/fluview_locations.py +++ b/src/acquisition/fluview/fluview_locations.py @@ -15,100 +15,98 @@ # https://gis.cdc.gov/grasp/flu2/GetPhase02InitApp?appVersion=Public # The values are used in queries of Delphi's Epidata API. 
cdc_to_delphi = { - 'national': { - 'x': 'nat', - }, - 'hhs regions': { - 'region 1': 'hhs1', - 'region 2': 'hhs2', - 'region 3': 'hhs3', - 'region 4': 'hhs4', - 'region 5': 'hhs5', - 'region 6': 'hhs6', - 'region 7': 'hhs7', - 'region 8': 'hhs8', - 'region 9': 'hhs9', - 'region 10': 'hhs10', - }, - 'census regions': { - 'new england': 'cen1', - 'mid-atlantic': 'cen2', - 'east north central': 'cen3', - 'west north central': 'cen4', - 'south atlantic': 'cen5', - 'east south central': 'cen6', - 'west south central': 'cen7', - 'mountain': 'cen8', - 'pacific': 'cen9', - }, - 'states': { - # states/territories: two-letter ISO 3166 - 'alabama': 'al', - 'alaska': 'ak', - 'arizona': 'az', - 'arkansas': 'ar', - 'california': 'ca', - 'colorado': 'co', - 'connecticut': 'ct', - 'delaware': 'de', - 'florida': 'fl', - 'georgia': 'ga', - 'hawaii': 'hi', - 'idaho': 'id', - 'illinois': 'il', - 'indiana': 'in', - 'iowa': 'ia', - 'kansas': 'ks', - 'kentucky': 'ky', - 'louisiana': 'la', - 'maine': 'me', - 'maryland': 'md', - 'massachusetts': 'ma', - 'michigan': 'mi', - 'minnesota': 'mn', - 'mississippi': 'ms', - 'missouri': 'mo', - 'montana': 'mt', - 'nebraska': 'ne', - 'nevada': 'nv', - 'new hampshire': 'nh', - 'new jersey': 'nj', - 'new mexico': 'nm', - # Even though it's called "New York", this location doesn't include New - # York City ("jfk"). New York ("ny") is actually this *plus* jfk. - 'new york': 'ny_minus_jfk', - 'north carolina': 'nc', - 'north dakota': 'nd', - 'ohio': 'oh', - 'oklahoma': 'ok', - 'oregon': 'or', - 'pennsylvania': 'pa', - 'rhode island': 'ri', - 'south carolina': 'sc', - 'south dakota': 'sd', - 'tennessee': 'tn', - 'texas': 'tx', - 'utah': 'ut', - 'vermont': 'vt', - 'virginia': 'va', - 'washington': 'wa', - 'west virginia': 'wv', - 'wisconsin': 'wi', - 'wyoming': 'wy', - 'american samoa': 'as', - 'commonwealth of the northern mariana islands': 'mp', - 'district of columbia': 'dc', - 'guam': 'gu', - 'puerto rico': 'pr', - 'virgin islands': 'vi', - # cities: three-letter IATA - 'chicago': 'ord', - 'los angeles': 'lax', - 'new york city': 'jfk', - }, + 'national': {'x': 'nat'}, + 'hhs regions': { + 'region 1': 'hhs1', + 'region 2': 'hhs2', + 'region 3': 'hhs3', + 'region 4': 'hhs4', + 'region 5': 'hhs5', + 'region 6': 'hhs6', + 'region 7': 'hhs7', + 'region 8': 'hhs8', + 'region 9': 'hhs9', + 'region 10': 'hhs10', + }, + 'census regions': { + 'new england': 'cen1', + 'mid-atlantic': 'cen2', + 'east north central': 'cen3', + 'west north central': 'cen4', + 'south atlantic': 'cen5', + 'east south central': 'cen6', + 'west south central': 'cen7', + 'mountain': 'cen8', + 'pacific': 'cen9', + }, + 'states': { + # states/territories: two-letter ISO 3166 + 'alabama': 'al', + 'alaska': 'ak', + 'arizona': 'az', + 'arkansas': 'ar', + 'california': 'ca', + 'colorado': 'co', + 'connecticut': 'ct', + 'delaware': 'de', + 'florida': 'fl', + 'georgia': 'ga', + 'hawaii': 'hi', + 'idaho': 'id', + 'illinois': 'il', + 'indiana': 'in', + 'iowa': 'ia', + 'kansas': 'ks', + 'kentucky': 'ky', + 'louisiana': 'la', + 'maine': 'me', + 'maryland': 'md', + 'massachusetts': 'ma', + 'michigan': 'mi', + 'minnesota': 'mn', + 'mississippi': 'ms', + 'missouri': 'mo', + 'montana': 'mt', + 'nebraska': 'ne', + 'nevada': 'nv', + 'new hampshire': 'nh', + 'new jersey': 'nj', + 'new mexico': 'nm', + # Even though it's called "New York", this location doesn't include New + # York City ("jfk"). New York ("ny") is actually this *plus* jfk. 
+ 'new york': 'ny_minus_jfk', + 'north carolina': 'nc', + 'north dakota': 'nd', + 'ohio': 'oh', + 'oklahoma': 'ok', + 'oregon': 'or', + 'pennsylvania': 'pa', + 'rhode island': 'ri', + 'south carolina': 'sc', + 'south dakota': 'sd', + 'tennessee': 'tn', + 'texas': 'tx', + 'utah': 'ut', + 'vermont': 'vt', + 'virginia': 'va', + 'washington': 'wa', + 'west virginia': 'wv', + 'wisconsin': 'wi', + 'wyoming': 'wy', + 'american samoa': 'as', + 'commonwealth of the northern mariana islands': 'mp', + 'district of columbia': 'dc', + 'guam': 'gu', + 'puerto rico': 'pr', + 'virgin islands': 'vi', + # cities: three-letter IATA + 'chicago': 'ord', + 'los angeles': 'lax', + 'new york city': 'jfk', + }, } def get_location_name(region_type, region_name): - """Convert a CDC location type and name pair into a Delphi location name.""" - return cdc_to_delphi[region_type.lower()][region_name.lower()] + """Convert a CDC location type and name pair into a Delphi location name.""" + return cdc_to_delphi[region_type.lower()][region_name.lower()] diff --git a/src/acquisition/fluview/fluview_notify.py b/src/acquisition/fluview/fluview_notify.py index 13f0f3559..c63d8a05f 100644 --- a/src/acquisition/fluview/fluview_notify.py +++ b/src/acquisition/fluview/fluview_notify.py @@ -21,51 +21,46 @@ + Initial version """ -# standard library import argparse -# third party -import mysql.connector - -# first party import delphi.operations.secrets as secrets - +import mysql.connector if __name__ == '__main__': - # Args and usage - parser = argparse.ArgumentParser() - parser.add_argument('-t', '--test', action='store_const', const=True, default=False, help="do dry run only, don't update the database") - args = parser.parse_args() + # Args and usage + parser = argparse.ArgumentParser() + parser.add_argument('-t', '--test', action='store_const', const=True, default=False, help="do dry run only, don't update the database") + args = parser.parse_args() - # connect - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() + # connect + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database='epidata') + cur = cnx.cursor() - # get the last known issue from the automation table `variables` - cur.execute('SELECT `value` FROM automation.`variables` WHERE `name` = %s', ('most_recent_issue',)) - for (issue1,) in cur: - issue1 = int(issue1) - print('last known issue:', issue1) - # get the most recent issue from the epidata table `fluview` - cur.execute('SELECT max(`issue`) FROM `fluview`') - for (issue2,) in cur: - issue2 = int(issue2) - print('most recent issue:', issue2) + # get the last known issue from the automation table `variables` + cur.execute('SELECT `value` FROM automation.`variables` WHERE `name` = %s', ('most_recent_issue',)) + for (issue1,) in cur: + issue1 = int(issue1) + print('last known issue:', issue1) + # get the most recent issue from the epidata table `fluview` + cur.execute('SELECT max(`issue`) FROM `fluview`') + for (issue2,) in cur: + issue2 = int(issue2) + print('most recent issue:', issue2) - if issue2 > issue1: - print('new data is available!') - if args.test: - print('test mode - not making any changes') - else: - # update the variable - cur.execute('UPDATE automation.`variables` SET `value` = %s WHERE `name` = %s', (issue2, 'most_recent_issue')) - # queue the 'New FluView Available' flow - cur.execute('CALL automation.RunStep(36)') - elif issue2 < issue2: - raise Exception('most recent issue is older than the last known issue') 
+ if issue2 > issue1: + print('new data is available!') + if args.test: + print('test mode - not making any changes') + else: + # update the variable + cur.execute('UPDATE automation.`variables` SET `value` = %s WHERE `name` = %s', (issue2, 'most_recent_issue')) + # queue the 'New FluView Available' flow + cur.execute('CALL automation.RunStep(36)') + elif issue2 < issue2: # TODO: expression has no any sense + raise Exception('most recent issue is older than the last known issue') - # cleanup - cnx.commit() - cur.close() - cnx.close() + # cleanup + cnx.commit() + cur.close() + cnx.close() diff --git a/src/acquisition/fluview/fluview_update.py b/src/acquisition/fluview/fluview_update.py index 65bec7a40..054ba182e 100644 --- a/src/acquisition/fluview/fluview_update.py +++ b/src/acquisition/fluview/fluview_update.py @@ -120,14 +120,13 @@ import io import zipfile -# third party -import mysql.connector - # first party import delphi.operations.secrets as secrets +# third party +import mysql.connector from delphi.utils.epiweek import delta_epiweeks, join_epiweek -from . import fluview -from . import fluview_locations + +from . import fluview, fluview_locations # sheet names ILINET_SHEET = 'ILINet.csv' @@ -137,391 +136,404 @@ CL_TABLE = 'fluview_clinical' PHL_TABLE = 'fluview_public' + def optional_int(i): - return int(i) if i not in ('', 'X') else None + return int(i) if i not in ('', 'X') else None + def optional_float(i, j): - return float(i) if i not in ('', 'X') else float(j) + return float(i) if i not in ('', 'X') else float(j) + def nullable_float(i): - return float(i) if i not in ('', 'X') else None + return float(i) if i not in ('', 'X') else None + def get_ilinet_data(row): - if row[0] == 'REGION TYPE' and row != [ - 'REGION TYPE', - 'REGION', - 'YEAR', - 'WEEK', - '% WEIGHTED ILI', - '%UNWEIGHTED ILI', - 'AGE 0-4', - 'AGE 25-49', - 'AGE 25-64', - 'AGE 5-24', - 'AGE 50-64', - 'AGE 65', - 'ILITOTAL', - 'NUM. OF PROVIDERS', - 'TOTAL PATIENTS' - ]: - raise Exception('header row has changed') - if len(row) == 1 or row[0] == 'REGION TYPE': - # this is a header row - return None - if row[5] == 'X': - # ILI isn't reported, ignore this row - return None - return { - 'location': fluview_locations.get_location_name(*row[:2]), - 'epiweek': join_epiweek(int(row[2]), int(row[3])), - 'wili': optional_float(*row[4:6]), - 'ili': float(row[5]), - 'age0': optional_int(row[6]), - 'age1': optional_int(row[9]), - 'age2': optional_int(row[8]), - 'age3': optional_int(row[7]), - 'age4': optional_int(row[10]), - 'age5': optional_int(row[11]), - 'n_ili': optional_int(row[12]), - 'n_providers': optional_int(row[13]), - 'n_patients': optional_int(row[14]), - } + if row[0] == 'REGION TYPE' and row != [ + 'REGION TYPE', + 'REGION', + 'YEAR', + 'WEEK', + '% WEIGHTED ILI', + '%UNWEIGHTED ILI', + 'AGE 0-4', + 'AGE 25-49', + 'AGE 25-64', + 'AGE 5-24', + 'AGE 50-64', + 'AGE 65', + 'ILITOTAL', + 'NUM. 
OF PROVIDERS', + 'TOTAL PATIENTS' + ]: + raise Exception('header row has changed') + if len(row) == 1 or row[0] == 'REGION TYPE': + # this is a header row + return None + if row[5] == 'X': + # ILI isn't reported, ignore this row + return None + return { + 'location': fluview_locations.get_location_name(*row[:2]), + 'epiweek': join_epiweek(int(row[2]), int(row[3])), + 'wili': optional_float(*row[4:6]), + 'ili': float(row[5]), + 'age0': optional_int(row[6]), + 'age1': optional_int(row[9]), + 'age2': optional_int(row[8]), + 'age3': optional_int(row[7]), + 'age4': optional_int(row[10]), + 'age5': optional_int(row[11]), + 'n_ili': optional_int(row[12]), + 'n_providers': optional_int(row[13]), + 'n_patients': optional_int(row[14]), + } + def get_clinical_data(row): - if row[0] == 'REGION TYPE' and row != [ - 'REGION TYPE', - 'REGION', - 'YEAR', - 'WEEK', - 'TOTAL SPECIMENS', - 'TOTAL A', - 'TOTAL B', - 'PERCENT POSITIVE', - 'PERCENT A', - 'PERCENT B' - ]: - raise Exception('header row has changed for clinical lab data.') - if len(row) == 1 or row[0] == 'REGION TYPE': - # this is a header row - return None - if row[4] == 'X': - # data is not reported, ignore this row - return None - # ignore percentage calculations for now - return { - 'location': fluview_locations.get_location_name(*row[:2]), - 'epiweek': join_epiweek(int(row[2]), int(row[3])), - 'total_specimens': int(row[4]), - 'total_a': optional_int(row[5]), - 'total_b': optional_int(row[6]), - 'percent_positive': nullable_float(row[7]), - 'percent_a': nullable_float(row[8]), - 'percent_b': nullable_float(row[9]) - } + if row[0] == 'REGION TYPE' and row != [ + 'REGION TYPE', + 'REGION', + 'YEAR', + 'WEEK', + 'TOTAL SPECIMENS', + 'TOTAL A', + 'TOTAL B', + 'PERCENT POSITIVE', + 'PERCENT A', + 'PERCENT B' + ]: + raise Exception('header row has changed for clinical lab data.') + if len(row) == 1 or row[0] == 'REGION TYPE': + # this is a header row + return None + if row[4] == 'X': + # data is not reported, ignore this row + return None + # ignore percentage calculations for now + return { + 'location': fluview_locations.get_location_name(*row[:2]), + 'epiweek': join_epiweek(int(row[2]), int(row[3])), + 'total_specimens': int(row[4]), + 'total_a': optional_int(row[5]), + 'total_b': optional_int(row[6]), + 'percent_positive': nullable_float(row[7]), + 'percent_a': nullable_float(row[8]), + 'percent_b': nullable_float(row[9]) + } + def get_public_data(row): - hrow1 = [ - 'REGION TYPE', - 'REGION', - 'SEASON_DESCRIPTION', - 'TOTAL SPECIMENS', - 'A (2009 H1N1)', - 'A (H3)', - 'A (Subtyping not Performed)', - 'B', - 'BVic', - 'BYam', - 'H3N2v' - ] - hrow2 = [ - 'REGION TYPE', - 'REGION', - 'YEAR', - 'WEEK', - 'TOTAL SPECIMENS', - 'A (2009 H1N1)', - 'A (H3)', - 'A (Subtyping not Performed)', - 'B', - 'BVic', - 'BYam', - 'H3N2v' - ] - if row[0] == 'REGION TYPE' and row != hrow1 and row != hrow2: - raise Exception('header row has changed for public health lab data.') - if len(row) == 1 or row[0] == 'REGION TYPE': - # header row - return None - if row[3] == 'X': - # data is not reported, ignore this row - return None - # handle case where data is reported by season, not by epiweek - is_weekly = len(row) == len(hrow2) - # set epiweek - if is_weekly: - epiweek = join_epiweek(int(row[2]), int(row[3])) - else: - epiweek = int(row[2][7:11]) * 100 + 40 - # row offset - offset = 1 if is_weekly else 0 - return { - 'location': fluview_locations.get_location_name(*row[:2]), - 'epiweek': epiweek, - 'total_specimens': int(row[3 + offset]), - 'total_a_h1n1': 
optional_int(row[4+ offset]), - 'total_a_h3': optional_int(row[5 + offset]), - 'total_a_h3n2v': optional_int(row[10 + offset]), - 'total_a_no_sub': optional_int(row[6 + offset]), - 'total_b': optional_int(row[7 + offset]), - 'total_b_vic': optional_int(row[8 + offset]), - 'total_b_yam': optional_int(row[9 + offset]) - } + hrow1 = [ + 'REGION TYPE', + 'REGION', + 'SEASON_DESCRIPTION', + 'TOTAL SPECIMENS', + 'A (2009 H1N1)', + 'A (H3)', + 'A (Subtyping not Performed)', + 'B', + 'BVic', + 'BYam', + 'H3N2v' + ] + hrow2 = [ + 'REGION TYPE', + 'REGION', + 'YEAR', + 'WEEK', + 'TOTAL SPECIMENS', + 'A (2009 H1N1)', + 'A (H3)', + 'A (Subtyping not Performed)', + 'B', + 'BVic', + 'BYam', + 'H3N2v' + ] + if row[0] == 'REGION TYPE' and row != hrow1 and row != hrow2: + raise Exception('header row has changed for public health lab data.') + if len(row) == 1 or row[0] == 'REGION TYPE': + # header row + return None + if row[3] == 'X': + # data is not reported, ignore this row + return None + # handle case where data is reported by season, not by epiweek + is_weekly = len(row) == len(hrow2) + # set epiweek + if is_weekly: + epiweek = join_epiweek(int(row[2]), int(row[3])) + else: + epiweek = int(row[2][7:11]) * 100 + 40 + # row offset + offset = 1 if is_weekly else 0 + return { + 'location': fluview_locations.get_location_name(*row[:2]), + 'epiweek': epiweek, + 'total_specimens': int(row[3 + offset]), + 'total_a_h1n1': optional_int(row[4 + offset]), + 'total_a_h3': optional_int(row[5 + offset]), + 'total_a_h3n2v': optional_int(row[10 + offset]), + 'total_a_no_sub': optional_int(row[6 + offset]), + 'total_b': optional_int(row[7 + offset]), + 'total_b_vic': optional_int(row[8 + offset]), + 'total_b_yam': optional_int(row[9 + offset]) + } + def load_zipped_csv(filename, sheetname='ILINet.csv'): - """Read rows from a zipped CSV, which is expected to be named as specified - by the sheetname parameter. Default is ILINet.csv, for the default flu data.""" - with zipfile.ZipFile(filename) as f: - with f.open(sheetname) as ff: - return [row for row in csv.reader(io.StringIO(str(ff.read(), 'utf-8')))] + """Read rows from a zipped CSV, which is expected to be named as specified + by the sheetname parameter. Default is ILINet.csv, for the default flu data.""" + with zipfile.ZipFile(filename) as f: + with f.open(sheetname) as ff: + return [row for row in csv.reader(io.StringIO(str(ff.read(), 'utf-8')))] + def get_rows(cnx, table='fluview'): - """Count and return the number of rows in the `fluview` table. - Looking at the fluview table by default, but may pass parameter - to look at public health or clinical lab data instead.""" - select = cnx.cursor() - select.execute('SELECT count(1) num FROM %s' % table) - for (num,) in select: - pass - select.close() - return num + """Count and return the number of rows in the `fluview` table. + Looking at the fluview table by default, but may pass parameter + to look at public health or clinical lab data instead.""" + select = cnx.cursor() + select.execute('SELECT count(1) num FROM %s' % table) + for (num,) in select: + pass + select.close() + return num + def update_from_file_clinical(issue, date, filename, test_mode=False): - """ - Read WHO/NREVSS data from a zipped CSV and insert into (or update) the database. 
- """ - - # database connection - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx, CL_TABLE) - print('rows before: %d' % (rows1)) - insert = cnx.cursor() - - # load the data, ignoring empty rows - print('loading data from %s as issued on %d' % (filename, issue)) - rows = load_zipped_csv(filename, CL_SHEET) - print(' loaded %d rows' % len(rows)) - data = [get_clinical_data(row) for row in rows] - entries = [obj for obj in data if obj] - print(' found %d entries' % len(entries)) - - sql = ''' - INSERT INTO - `fluview_clinical` (`release_date`, `issue`, `epiweek`, `region`, `lag`, - `total_specimens`, `total_a`, `total_b`, `percent_positive`, `percent_a`, - `percent_b`) - VALUES - (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, %s), - `total_specimens` = %s, - `total_a` = %s, - `total_b` = %s, - `percent_positive` = %s, - `percent_a` = %s, - `percent_b` = %s - ''' - - # insert each row - insert = cnx.cursor() - for row in entries: - lag = delta_epiweeks(row['epiweek'], issue) - args = [ - row['total_specimens'], row['total_a'], row['total_b'], - row['percent_positive'], row['percent_a'], row['percent_b'] - ] - ins_args = [date, issue, row['epiweek'], row['location'], lag] + args - upd_args = [date] + args - insert.execute(sql, ins_args + upd_args) - - # cleanup - insert.close() - if test_mode: - print('test mode, not committing') - rows2 = rows1 - else: - cnx.commit() - rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2, rows2 - rows1)) - cnx.close() + """ + Read WHO/NREVSS data from a zipped CSV and insert into (or update) the database. + """ + + # database connection + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database='epidata') + rows1 = get_rows(cnx, CL_TABLE) + print('rows before: %d' % (rows1)) + insert = cnx.cursor() + + # load the data, ignoring empty rows + print('loading data from %s as issued on %d' % (filename, issue)) + rows = load_zipped_csv(filename, CL_SHEET) + print(' loaded %d rows' % len(rows)) + data = [get_clinical_data(row) for row in rows] + entries = [obj for obj in data if obj] + print(' found %d entries' % len(entries)) + + sql = ''' + INSERT INTO + `fluview_clinical` (`release_date`, `issue`, `epiweek`, `region`, `lag`, + `total_specimens`, `total_a`, `total_b`, `percent_positive`, `percent_a`, + `percent_b`) + VALUES + (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, %s), + `total_specimens` = %s, + `total_a` = %s, + `total_b` = %s, + `percent_positive` = %s, + `percent_a` = %s, + `percent_b` = %s + ''' + + # insert each row + insert = cnx.cursor() + for row in entries: + lag = delta_epiweeks(row['epiweek'], issue) + args = [ + row['total_specimens'], row['total_a'], row['total_b'], + row['percent_positive'], row['percent_a'], row['percent_b'] + ] + ins_args = [date, issue, row['epiweek'], row['location'], lag] + args + upd_args = [date] + args + insert.execute(sql, ins_args + upd_args) + + # cleanup + insert.close() + if test_mode: + print('test mode, not committing') + rows2 = rows1 + else: + cnx.commit() + rows2 = get_rows(cnx) + print('rows after: %d (added %d)' % (rows2, rows2 - rows1)) + cnx.close() + def update_from_file_public(issue, date, filename, test_mode=False): - """ - Read WHO/NREVSS data from a zipped CSV and insert into (or update) the database. 
- """ - - # database connection - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx, PHL_TABLE) - print('rows before: %d' % (rows1)) - insert = cnx.cursor() - - # load the data, ignoring empty rows - print('loading data from %s as issued on %d' % (filename, issue)) - rows = load_zipped_csv(filename, PHL_SHEET) - print(' loaded %d rows' % len(rows)) - data = [get_public_data(row) for row in rows] - entries = [obj for obj in data if obj] - print(' found %d entries' % len(entries)) - - sql = ''' - INSERT INTO - `fluview_public` (`release_date`, `issue`, `epiweek`, `region`, `lag`, - `total_specimens`, `total_a_h1n1`, `total_a_h3`, `total_a_h3n2v`, - `total_a_no_sub`, `total_b`, `total_b_vic`, `total_b_yam`) - VALUES - (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, %s), - `total_specimens` = %s, - `total_a_h1n1` = %s, - `total_a_h3` = %s, - `total_a_h3n2v` = %s, - `total_a_no_sub` = %s, - `total_b` = %s, - `total_b_vic` = %s, - `total_b_yam` = %s - ''' - - # insert each row - insert = cnx.cursor() - for row in entries: - lag = delta_epiweeks(row['epiweek'], issue) - args = [ - row['total_specimens'], row['total_a_h1n1'], row['total_a_h3'], - row['total_a_h3n2v'], row['total_a_no_sub'], row['total_b'], - row['total_b_vic'], row['total_b_yam'] - ] - ins_args = [date, issue, row['epiweek'], row['location'], lag] + args - upd_args = [date] + args - insert.execute(sql, ins_args + upd_args) - - # cleanup - insert.close() - if test_mode: - print('test mode, not committing') - rows2 = rows1 - else: - cnx.commit() - rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2, rows2 - rows1)) - cnx.close() + """ + Read WHO/NREVSS data from a zipped CSV and insert into (or update) the database. 
+ """ + + # database connection + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database='epidata') + rows1 = get_rows(cnx, PHL_TABLE) + print('rows before: %d' % (rows1)) + insert = cnx.cursor() + + # load the data, ignoring empty rows + print('loading data from %s as issued on %d' % (filename, issue)) + rows = load_zipped_csv(filename, PHL_SHEET) + print(' loaded %d rows' % len(rows)) + data = [get_public_data(row) for row in rows] + entries = [obj for obj in data if obj] + print(' found %d entries' % len(entries)) + + sql = ''' + INSERT INTO + `fluview_public` (`release_date`, `issue`, `epiweek`, `region`, `lag`, + `total_specimens`, `total_a_h1n1`, `total_a_h3`, `total_a_h3n2v`, + `total_a_no_sub`, `total_b`, `total_b_vic`, `total_b_yam`) + VALUES + (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, %s), + `total_specimens` = %s, + `total_a_h1n1` = %s, + `total_a_h3` = %s, + `total_a_h3n2v` = %s, + `total_a_no_sub` = %s, + `total_b` = %s, + `total_b_vic` = %s, + `total_b_yam` = %s + ''' + + # insert each row + insert = cnx.cursor() + for row in entries: + lag = delta_epiweeks(row['epiweek'], issue) + args = [ + row['total_specimens'], row['total_a_h1n1'], row['total_a_h3'], + row['total_a_h3n2v'], row['total_a_no_sub'], row['total_b'], + row['total_b_vic'], row['total_b_yam'] + ] + ins_args = [date, issue, row['epiweek'], row['location'], lag] + args + upd_args = [date] + args + insert.execute(sql, ins_args + upd_args) + + # cleanup + insert.close() + if test_mode: + print('test mode, not committing') + rows2 = rows1 + else: + cnx.commit() + rows2 = get_rows(cnx) + print('rows after: %d (added %d)' % (rows2, rows2 - rows1)) + cnx.close() + def update_from_file(issue, date, filename, test_mode=False): - """ - Read ILINet data from a zipped CSV and insert into (or update) the database. 
- """ - - # database connection - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx) - print('rows before: %d' % (rows1)) - insert = cnx.cursor() - - # load the data, ignoring empty rows - print('loading data from %s as issued on %d' % (filename, issue)) - rows = load_zipped_csv(filename) - print(' loaded %d rows' % len(rows)) - data = [get_ilinet_data(row) for row in rows] - entries = [obj for obj in data if obj] - print(' found %d entries' % len(entries)) - - sql = ''' - INSERT INTO - `fluview` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `num_ili`, - `num_patients`, `num_providers`, `wili`, `ili`, `num_age_0`, `num_age_1`, - `num_age_2`, `num_age_3`, `num_age_4`, `num_age_5`) - VALUES - (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, %s), - `num_ili` = %s, - `num_patients` = %s, - `num_providers` = %s, - `wili` = %s, - `ili` = %s, - `num_age_0` = coalesce(%s, `num_age_0`), - `num_age_1` = coalesce(%s, `num_age_1`), - `num_age_2` = coalesce(%s, `num_age_2`), - `num_age_3` = coalesce(%s, `num_age_3`), - `num_age_4` = coalesce(%s, `num_age_4`), - `num_age_5` = coalesce(%s, `num_age_5`) - ''' - - # insert each row - insert = cnx.cursor() - for row in entries: - lag = delta_epiweeks(row['epiweek'], issue) - args = [ - row['n_ili'], row['n_patients'], row['n_providers'], row['wili'], - row['ili'], row['age0'], row['age1'], row['age2'], row['age3'], - row['age4'], row['age5'] - ] - ins_args = [date, issue, row['epiweek'], row['location'], lag] + args - upd_args = [date] + args - insert.execute(sql, ins_args + upd_args) - - # cleanup - insert.close() - if test_mode: - print('test mode, not committing') - rows2 = rows1 - else: - cnx.commit() - rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2, rows2 - rows1)) - cnx.close() + """ + Read ILINet data from a zipped CSV and insert into (or update) the database. 
+ """ + + # database connection + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database='epidata') + rows1 = get_rows(cnx) + print('rows before: %d' % (rows1)) + insert = cnx.cursor() + + # load the data, ignoring empty rows + print('loading data from %s as issued on %d' % (filename, issue)) + rows = load_zipped_csv(filename) + print(' loaded %d rows' % len(rows)) + data = [get_ilinet_data(row) for row in rows] + entries = [obj for obj in data if obj] + print(' found %d entries' % len(entries)) + + sql = ''' + INSERT INTO + `fluview` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `num_ili`, + `num_patients`, `num_providers`, `wili`, `ili`, `num_age_0`, `num_age_1`, + `num_age_2`, `num_age_3`, `num_age_4`, `num_age_5`) + VALUES + (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, %s), + `num_ili` = %s, + `num_patients` = %s, + `num_providers` = %s, + `wili` = %s, + `ili` = %s, + `num_age_0` = coalesce(%s, `num_age_0`), + `num_age_1` = coalesce(%s, `num_age_1`), + `num_age_2` = coalesce(%s, `num_age_2`), + `num_age_3` = coalesce(%s, `num_age_3`), + `num_age_4` = coalesce(%s, `num_age_4`), + `num_age_5` = coalesce(%s, `num_age_5`) + ''' + + # insert each row + insert = cnx.cursor() + for row in entries: + lag = delta_epiweeks(row['epiweek'], issue) + args = [ + row['n_ili'], row['n_patients'], row['n_providers'], row['wili'], + row['ili'], row['age0'], row['age1'], row['age2'], row['age3'], + row['age4'], row['age5'] + ] + ins_args = [date, issue, row['epiweek'], row['location'], lag] + args + upd_args = [date] + args + insert.execute(sql, ins_args + upd_args) + + # cleanup + insert.close() + if test_mode: + print('test mode, not committing') + rows2 = rows1 + else: + cnx.commit() + rows2 = get_rows(cnx) + print('rows after: %d (added %d)' % (rows2, rows2 - rows1)) + cnx.close() + def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument( - '--test', - action='store_true', - help='do dry run only, do not update the database' - ) - parser.add_argument( - '--file', - type=str, - help='load an existing zip file (otherwise fetch current data)' - ) - parser.add_argument( - '--issue', - type=int, - help='issue of the file (e.g. 
201740); used iff --file is given' - ) - args = parser.parse_args() - - if (args.file is None) != (args.issue is None): - raise Exception('--file and --issue must both be present or absent') - - date = datetime.datetime.now().strftime('%Y-%m-%d') - print('assuming release date is today, %s' % date) - - if args.file: - update_from_file(args.issue, date, args.file, test_mode=args.test) - update_from_file_clinical(args.issue, date, args.file, test_mode=args.test) - # TODO: header row has changed for public health lab data - # update_from_file_public(args.issue, date, args.file, test_mode=args.test) - else: - issue, files = fluview.save_latest(path='flu_data') - for filename in files: - update_from_file(issue, date, filename, test_mode=args.test) - update_from_file_clinical(issue, date, filename, test_mode=args.test) - # TODO: header row has changed for public health lab data - # update_from_file_public(issue, date, filename, test_mode=args.test) + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument( + '--test', + action='store_true', + help='do dry run only, do not update the database' + ) + parser.add_argument( + '--file', + type=str, + help='load an existing zip file (otherwise fetch current data)' + ) + parser.add_argument( + '--issue', + type=int, + help='issue of the file (e.g. 201740); used iff --file is given' + ) + args = parser.parse_args() + + if (args.file is None) != (args.issue is None): + raise Exception('--file and --issue must both be present or absent') + + date = datetime.datetime.now().strftime('%Y-%m-%d') + print('assuming release date is today, %s' % date) + + if args.file: + update_from_file(args.issue, date, args.file, test_mode=args.test) + update_from_file_clinical(args.issue, date, args.file, test_mode=args.test) + # TODO: header row has changed for public health lab data + # update_from_file_public(args.issue, date, args.file, test_mode=args.test) + else: + issue, files = fluview.save_latest(path='flu_data') + for filename in files: + update_from_file(issue, date, filename, test_mode=args.test) + update_from_file_clinical(issue, date, filename, test_mode=args.test) + # TODO: header row has changed for public health lab data + # update_from_file_public(issue, date, filename, test_mode=args.test) + if __name__ == '__main__': - main() + main() diff --git a/src/acquisition/fluview/impute_missing_values.py b/src/acquisition/fluview/impute_missing_values.py index 7f9a23231..fceac6910 100644 --- a/src/acquisition/fluview/impute_missing_values.py +++ b/src/acquisition/fluview/impute_missing_values.py @@ -48,301 +48,300 @@ # standard library import argparse +# first party +import delphi.operations.secrets as secrets # third party import mysql.connector import numpy as np - -# first party -import delphi.operations.secrets as secrets from delphi.utils.epiweek import delta_epiweeks from delphi.utils.geo.locations import Locations class Database: - """Database wrapper and abstraction layer.""" + """Database wrapper and abstraction layer.""" - class Sql: - """Container for SQL constants.""" + class Sql: + """Container for SQL constants.""" - # Count the total number of imputed rows. - count_rows = ''' - SELECT - count(1) `num` - FROM - `fluview_imputed` - ''' + # Count the total number of imputed rows. + count_rows = ''' + SELECT + count(1) `num` + FROM + `fluview_imputed` + ''' # Find (issue, epiweek) pairs that exist in table `fluview` but not in # table `fluview_imputed`. 
Note that only issues >= 201740 are selected # because that's when CDC first started posting state-level ILINet data. # This assumes that `fluview` is always missing at least one location. find_missing_rows = ''' - SELECT - fv.`issue`, fv.`epiweek` - FROM ( SELECT - `issue`, `epiweek` - FROM - `fluview` + fv.`issue`, fv.`epiweek` + FROM ( + SELECT + `issue`, `epiweek` + FROM + `fluview` + WHERE + `issue` >= 201740 + GROUP BY + `issue`, `epiweek` + ) fv + LEFT JOIN ( + SELECT + `issue`, `epiweek` + FROM + `fluview_imputed` + GROUP BY + `issue`, `epiweek` + ) fvi + ON + fvi.`issue` = fv.`issue` AND fvi.`epiweek` = fv.`epiweek` WHERE - `issue` >= 201740 - GROUP BY - `issue`, `epiweek` - ) fv - LEFT JOIN ( - SELECT - `issue`, `epiweek` - FROM - `fluview_imputed` - GROUP BY - `issue`, `epiweek` - ) fvi - ON - fvi.`issue` = fv.`issue` AND fvi.`epiweek` = fv.`epiweek` - WHERE - fvi.`issue` IS NULL + fvi.`issue` IS NULL ''' # Read all location rows from the `fluview` table for a given issue and # epiweek. get_known_values = ''' - SELECT - `region`, `num_ili`, `num_patients`, `num_providers` - FROM - `fluview` - WHERE - `issue` = %s AND `epiweek` = %s + SELECT + `region`, `num_ili`, `num_patients`, `num_providers` + FROM + `fluview` + WHERE + `issue` = %s AND `epiweek` = %s ''' # Insert location rows into the `fluview_imputed` table for a given issue # and epiweek. add_imputed_values = ''' - INSERT INTO - `fluview_imputed` ( - `issue`, - `epiweek`, - `region`, - `lag`, - `num_ili`, - `num_patients`, - `num_providers`, - `ili` - ) - VALUES - (%s, %s, %s, %s, %s, %s, %s, %s) + INSERT INTO + `fluview_imputed` ( + `issue`, + `epiweek`, + `region`, + `lag`, + `num_ili`, + `num_patients`, + `num_providers`, + `ili` + ) + VALUES + (%s, %s, %s, %s, %s, %s, %s, %s) ''' - def connect(self): - """Connect to the database.""" - u, p = secrets.db.epi - self.cnx = mysql.connector.connect(user=u, password=p, database='epidata') - self.cur = self.cnx.cursor() - - def close(self, commit): - """ - Close the connection to the database, committing or rolling back changes as - indicated. - """ - self.cur.close() - if commit: - self.cnx.commit() - else: - print('test mode, not committing') - self.cnx.close() - - def count_rows(self): - """Count and return the number of rows in the `fluview_imputed` table.""" - self.cur.execute(Database.Sql.count_rows) - for (num,) in self.cur: - return num - - def find_missing_rows(self): - """ - Find rows that still have missing values. Each missing row is uniquely - identified by an (issue, epiweek, location) tuple. This function finds the - first two. - """ + def connect(self): + """Connect to the database.""" + u, p = secrets.db.epi + self.cnx = mysql.connector.connect(user=u, password=p, database='epidata') + self.cur = self.cnx.cursor() + + def close(self, commit): + """ + Close the connection to the database, committing or rolling back changes as + indicated. + """ + self.cur.close() + if commit: + self.cnx.commit() + else: + print('test mode, not committing') + self.cnx.close() + + def count_rows(self): + """Count and return the number of rows in the `fluview_imputed` table.""" + self.cur.execute(Database.Sql.count_rows) + for (num,) in self.cur: + return num + + def find_missing_rows(self): + """ + Find rows that still have missing values. Each missing row is uniquely + identified by an (issue, epiweek, location) tuple. This function finds the + first two. 
+ """ + + self.cur.execute(Database.Sql.find_missing_rows) + return [(issue, epiweek) for (issue, epiweek) in self.cur] + + def get_known_values(self, issue, epiweek): + """ + Fetch ILINet data for all locations available for the given issue and + epiweek. The returned value is a dict mapping from locations to ILI data. + """ + + self.cur.execute(Database.Sql.get_known_values, (issue, epiweek)) + return dict([ + (loc, (n_ili, n_pat, n_prov)) + for + (loc, n_ili, n_pat, n_prov) + in self.cur + ]) + + def add_imputed_values(self, issue, epiweek, imputed): + """ + Store imputed ILINet data for the given locations on the given issue and + epiweek. The imputed value is a dict mapping from locations to ILI data. + """ + + for loc in imputed.keys(): + lag, n_ili, n_pat, n_prov, ili = imputed[loc] + args = (issue, epiweek, loc, lag, n_ili, n_pat, n_prov, ili) + self.cur.execute(Database.Sql.add_imputed_values, args) - self.cur.execute(Database.Sql.find_missing_rows) - return [(issue, epiweek) for (issue, epiweek) in self.cur] - def get_known_values(self, issue, epiweek): - """ - Fetch ILINet data for all locations available for the given issue and - epiweek. The returned value is a dict mapping from locations to ILI data. - """ +class StatespaceException(Exception): + """Used to indicate that imputation is not possible with the given inputs.""" - self.cur.execute(Database.Sql.get_known_values, (issue, epiweek)) - return dict([ - (loc, (n_ili, n_pat, n_prov)) - for - (loc, n_ili, n_pat, n_prov) - in self.cur - ]) - def add_imputed_values(self, issue, epiweek, imputed): +def get_location_graph(): """ - Store imputed ILINet data for the given locations on the given issue and - epiweek. The imputed value is a dict mapping from locations to ILI data. + Return a matrix where rows represent regions, columns represent atoms, and + each entry is a 1 if the region contains the atom, otherwise 0. The + corresponding lists of regions and atoms are also returned. """ - for loc in imputed.keys(): - lag, n_ili, n_pat, n_prov, ili = imputed[loc] - args = (issue, epiweek, loc, lag, n_ili, n_pat, n_prov, ili) - self.cur.execute(Database.Sql.add_imputed_values, args) - - -class StatespaceException(Exception): - """Used to indicate that imputation is not possible with the given inputs.""" - - -def get_location_graph(): - """ - Return a matrix where rows represent regions, columns represent atoms, and - each entry is a 1 if the region contains the atom, otherwise 0. The - corresponding lists of regions and atoms are also returned. - """ - - regions = sorted(Locations.region_list) - atoms = sorted(Locations.atom_list) - graph = np.zeros((len(regions), len(atoms))) - for i, r in enumerate(regions): - for a in Locations.region_map[r]: - j = atoms.index(a) - graph[i, j] = 1 - return graph, regions, atoms + regions = sorted(Locations.region_list) + atoms = sorted(Locations.atom_list) + graph = np.zeros((len(regions), len(atoms))) + for i, r in enumerate(regions): + for a in Locations.region_map[r]: + j = atoms.index(a) + graph[i, j] = 1 + return graph, regions, atoms def get_fusion_parameters(known_locations): - """ - Return a matrix that fuses known ILI values into unknown ILI values. The - corresponding lists of known and unknown locations are also returned. + """ + Return a matrix that fuses known ILI values into unknown ILI values. The + corresponding lists of known and unknown locations are also returned. - The goal is to infer ILI data in all locations, given ILI data in some - partial set of locations. 
This function takes a sensor fusion approach. + The goal is to infer ILI data in all locations, given ILI data in some + partial set of locations. This function takes a sensor fusion approach. - Let $z$ be a column vector of values in reported locations. Let $y$ be the - desired column vector of values in unreported locations. With matrices $H$ - (mapping from latent state to reported values), $W$ (mapping from latent - state to unreported values), and $R = I$ (covariance, which is identity): + Let $z$ be a column vector of values in reported locations. Let $y$ be the + desired column vector of values in unreported locations. With matrices $H$ + (mapping from latent state to reported values), $W$ (mapping from latent + state to unreported values), and $R = I$ (covariance, which is identity): - $y = W (H^T R^{-1} H)^{-1} H^T R^{-1} z$ - $y = W (H^T H)^{-1} H^T z$ + $y = W (H^T R^{-1} H)^{-1} H^T R^{-1} z$ + $y = W (H^T H)^{-1} H^T z$ - This is equavalent to OLS regression with an added translation from atomic - locations to missing locations. Unknown values are computed as a linear - combination of known values. - """ + This is equavalent to OLS regression with an added translation from atomic + locations to missing locations. Unknown values are computed as a linear + combination of known values. + """ - graph, regions, atoms = get_location_graph() - is_known = np.array([r in known_locations for r in regions]) - is_unknown = np.logical_not(is_known) - if not np.any(is_known): - raise StatespaceException('no values are known') - if not np.any(is_unknown): - raise StatespaceException('no values are unknown') + graph, regions, atoms = get_location_graph() + is_known = np.array([r in known_locations for r in regions]) + is_unknown = np.logical_not(is_known) + if not np.any(is_known): + raise StatespaceException('no values are known') + if not np.any(is_unknown): + raise StatespaceException('no values are unknown') - H = graph[is_known, :] - W = graph[is_unknown, :] - if np.linalg.matrix_rank(H) != len(atoms): - raise StatespaceException('system is underdetermined') + H = graph[is_known, :] + W = graph[is_unknown, :] + if np.linalg.matrix_rank(H) != len(atoms): + raise StatespaceException('system is underdetermined') - HtH = np.dot(H.T, H) - HtH_inv = np.linalg.inv(HtH) - H_pseudo_inv = np.dot(HtH_inv, H.T) - fuser = np.dot(W, H_pseudo_inv) + HtH = np.dot(H.T, H) + HtH_inv = np.linalg.inv(HtH) + H_pseudo_inv = np.dot(HtH_inv, H.T) + fuser = np.dot(W, H_pseudo_inv) - locations = np.array(regions) - filter_locations = lambda selected: list(map(str, locations[selected])) - return fuser, filter_locations(is_known), filter_locations(is_unknown) + locations = np.array(regions) + filter_locations = lambda selected: list(map(str, locations[selected])) # noqa + return fuser, filter_locations(is_known), filter_locations(is_unknown) def get_lag_and_ili(issue, epiweek, num_ili, num_patients): - """ - Compute and return reporting lag and percent ILI from imputed ILINet data. - """ - lag = delta_epiweeks(epiweek, issue) - ili = 100.0 * (0 if num_patients == 0 else num_ili / num_patients) - return lag, ili + """ + Compute and return reporting lag and percent ILI from imputed ILINet data. + """ + lag = delta_epiweeks(epiweek, issue) + ili = 100.0 * (0 if num_patients == 0 else num_ili / num_patients) + return lag, ili def impute_missing_values(database, test_mode=False): - """ - Determine whether values are missing for any states and territories. If so, - impute them and store them in the database. 
- """ - - # database connection - database.connect() - rows1 = database.count_rows() - print('rows before: %d' % (rows1)) - - # iterate over missing epiweeks - missing_rows = database.find_missing_rows() - print('missing data for %d epiweeks' % len(missing_rows)) - for issue, epiweek in missing_rows: - print('i=%d e=%d' % (issue, epiweek)) - - # get known values from table `fluview` - known_values = database.get_known_values(issue, epiweek) - - # Unlike most other state-level data, which typically begins publicly on - # 2010w40, data for PR begins on 2013w40. Before this, there are no reports - # for PR. Here we assume that no report is equivalent to a report of all - # zeros (number of ILI, patients, and providers). That's mostly true, with - # the notable exception of wILI, but that's not relevant here. By assuming - # that PR reports zero on those weeks, it's possible to impute values for - # VI, which are otherwise not reported until 2015w40. - assume_pr_zero = epiweek < 201340 and 'pr' not in known_values - if assume_pr_zero: - known_values['pr'] = (0, 0, 0) - - # get the imputation matrix and lists of known and unknown locations - F, known, unknown = get_fusion_parameters(known_values.keys()) - - # finally, impute the missing values - z = np.array([known_values[k] for k in known]) - y = np.dot(F, z) - - # possibly also record the assumptions made for PR - if assume_pr_zero: - unknown.append('pr') - y = np.vstack((y, [known_values['pr']])) - - # add lag and percent ILI to the data for each imputed location - imputed_values = {} - for loc, values in zip(unknown, y): - n_ili, n_pat, n_prov = map(int, np.rint(values)) - lag, ili = get_lag_and_ili(issue, epiweek, n_ili, n_pat) - imputed_values[loc] = (lag, n_ili, n_pat, n_prov, ili) - print(' %s: %s' % (loc, str(imputed_values[loc]))) - - # save all imputed values in table `fluview_imputed` - database.add_imputed_values(issue, epiweek, imputed_values) - - # database cleanup - rows2 = database.count_rows() - print('rows after: %d (added %d)' % (rows2, rows2 - rows1)) - commit = not test_mode - database.close(commit) + """ + Determine whether values are missing for any states and territories. If so, + impute them and store them in the database. + """ + + # database connection + database.connect() + rows1 = database.count_rows() + print('rows before: %d' % (rows1)) + + # iterate over missing epiweeks + missing_rows = database.find_missing_rows() + print('missing data for %d epiweeks' % len(missing_rows)) + for issue, epiweek in missing_rows: + print('i=%d e=%d' % (issue, epiweek)) + + # get known values from table `fluview` + known_values = database.get_known_values(issue, epiweek) + + # Unlike most other state-level data, which typically begins publicly on + # 2010w40, data for PR begins on 2013w40. Before this, there are no reports + # for PR. Here we assume that no report is equivalent to a report of all + # zeros (number of ILI, patients, and providers). That's mostly true, with + # the notable exception of wILI, but that's not relevant here. By assuming + # that PR reports zero on those weeks, it's possible to impute values for + # VI, which are otherwise not reported until 2015w40. 
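# Editorial aside, not part of the patch: a toy, self-contained illustration of
# the statespace fusion y = W (H^T H)^{-1} H^T z that get_fusion_parameters()
# sets up for the imputation step below. The three "atoms" and the reported
# values here are hypothetical; the real graph comes from
# delphi.utils.geo.locations.
import numpy as np

# rows are regions, columns are atoms; a 1 means the region contains the atom
graph = np.array([
    [1, 1, 1],   # 'all' = a + b + c  (reported)
    [1, 0, 0],   # 'a'                (reported)
    [0, 1, 0],   # 'b'                (reported)
    [0, 0, 1],   # 'c'                (missing)
    [0, 1, 1],   # 'bc' = b + c       (missing)
])
is_known = np.array([True, True, True, False, False])

H = graph[is_known, :]           # latent atoms -> reported rows
W = graph[~is_known, :]          # latent atoms -> missing rows
z = np.array([10.0, 2.0, 3.0])   # reported values for 'all', 'a', 'b'

# otherwise the system is underdetermined and imputation is not possible
assert np.linalg.matrix_rank(H) == graph.shape[1]

fuser = np.dot(W, np.dot(np.linalg.inv(np.dot(H.T, H)), H.T))
y = np.dot(fuser, z)
print(y)  # ~[5. 8.]: 'c' = 10 - 2 - 3, 'bc' = 3 + 5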
+ assume_pr_zero = epiweek < 201340 and 'pr' not in known_values + if assume_pr_zero: + known_values['pr'] = (0, 0, 0) + + # get the imputation matrix and lists of known and unknown locations + F, known, unknown = get_fusion_parameters(known_values.keys()) + + # finally, impute the missing values + z = np.array([known_values[k] for k in known]) + y = np.dot(F, z) + + # possibly also record the assumptions made for PR + if assume_pr_zero: + unknown.append('pr') + y = np.vstack((y, [known_values['pr']])) + + # add lag and percent ILI to the data for each imputed location + imputed_values = {} + for loc, values in zip(unknown, y): + n_ili, n_pat, n_prov = map(int, np.rint(values)) + lag, ili = get_lag_and_ili(issue, epiweek, n_ili, n_pat) + imputed_values[loc] = (lag, n_ili, n_pat, n_prov, ili) + print(' %s: %s' % (loc, str(imputed_values[loc]))) + + # save all imputed values in table `fluview_imputed` + database.add_imputed_values(issue, epiweek, imputed_values) + + # database cleanup + rows2 = database.count_rows() + print('rows after: %d (added %d)' % (rows2, rows2 - rows1)) + commit = not test_mode + database.close(commit) def get_argument_parser(): - """Set up command line arguments and usage.""" - parser = argparse.ArgumentParser() - parser.add_argument( - '--test', - action='store_true', - help='do dry run only, do not update the database' - ) - return parser + """Set up command line arguments and usage.""" + parser = argparse.ArgumentParser() + parser.add_argument( + '--test', + action='store_true', + help='do dry run only, do not update the database' + ) + return parser def main(): - """Run this script from the command line.""" - args = get_argument_parser().parse_args() - impute_missing_values(Database(), test_mode=args.test) + """Run this script from the command line.""" + args = get_argument_parser().parse_args() + impute_missing_values(Database(), test_mode=args.test) if __name__ == '__main__': - main() + main() diff --git a/src/acquisition/ght/ght_update.py b/src/acquisition/ght/ght_update.py index c1e9b8d94..81e570358 100644 --- a/src/acquisition/ght/ght_update.py +++ b/src/acquisition/ght/ght_update.py @@ -69,16 +69,13 @@ import argparse import time +import delphi.operations.secrets as secrets +import delphi.utils.epiweek as flu # third party import mysql.connector -from apiclient.discovery import build # first party -from .google_health_trends import GHT -from .google_health_trends import NO_LOCATION_STR -import delphi.operations.secrets as secrets -import delphi.utils.epiweek as flu - +from .google_health_trends import GHT, NO_LOCATION_STR # secret key for accessing Google's health trends APIs # see: https://console.developers.google.com/apis/credentials?project=delphi-epi-trends @@ -249,143 +246,143 @@ def update(locations, terms, first=None, last=None, countries=['US']): - # connect to the database - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() - - def get_num_rows(): - cur.execute('SELECT count(1) `num` FROM `ght`') - for (num,) in cur: - pass - return num - - # check from 4 weeks preceeding the last week with data through this week - cur.execute('SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `ght`') - for (ew0, ew1) in cur: - ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4) - ew0 = ew0 if first is None else first - ew1 = ew1 if last is None else last - print('Checking epiweeks between %d and %d...' 
% (ew0, ew1)) - - # keep track of how many rows were added - rows_before = get_num_rows() - - # check Google Trends for new and/or revised data - sql = ''' - INSERT INTO - `ght` (`query`, `location`, `epiweek`, `value`) - VALUES - (%s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `value` = %s - ''' - total_rows = 0 - ght = GHT(API_KEY) - for term in terms: - print(' [%s] using term' % term) - ll, cl = len(locations), len(countries) - for i in range(max(ll,cl)): - location = locations[i] if i < ll else locations[0] - country = countries[i] if i < cl else countries[0] - try: - #term2 = ('"%s"' % term) if ' ' in term else term - term2 = term - attempt = 0 - while True: - attempt += 1 - try: - result = ght.get_data(ew0, ew1, location, term2, country=country) - break - except Exception as ex: - if attempt >= 5: - raise ex - else: - delay = 2 ** attempt - print(' [%s|%s] caught exception (will retry in %ds):' % (term, location, delay), ex) - time.sleep(delay) - values = [p['value'] for p in result['data']['lines'][0]['points']] - ew = result['start_week'] - num_missing = 0 - for v in values: - # Default SQL location value for US country for backwards compatibility - # i.e. California's location is still stored as 'CA', - # and having location == 'US' is still stored as 'US' - sql_location = location if location != NO_LOCATION_STR else country - - # Change SQL location for non-US countries - if country != 'US': - # Underscore added to distinguish countries from 2-letter US states - sql_location = country + "_" - if location != NO_LOCATION_STR: - sql_location = sql_location + location - sql_data = (term, sql_location, ew, v, v) - cur.execute(sql, sql_data) - total_rows += 1 - if v == 0: - num_missing += 1 - #print(' [%s|%s|%d] missing value' % (term, location, ew)) - ew = flu.add_epiweeks(ew, 1) - if num_missing > 0: - print(' [%s|%s] missing %d/%d value(s)' % (term, location, num_missing, len(values))) - except Exception as ex: - print(' [%s|%s] caught exception (will NOT retry):' % (term, location), ex) - - # keep track of how many rows were added - rows_after = get_num_rows() - print('Inserted %d/%d row(s)'%(rows_after - rows_before, total_rows)) - - # cleanup - cur.close() - cnx.commit() - cnx.close() + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database='epidata') + cur = cnx.cursor() + + def get_num_rows(): + cur.execute('SELECT count(1) `num` FROM `ght`') + for (num,) in cur: + pass + return num + + # check from 4 weeks preceeding the last week with data through this week + cur.execute('SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `ght`') + for (ew0, ew1) in cur: + ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4) + ew0 = ew0 if first is None else first + ew1 = ew1 if last is None else last + print('Checking epiweeks between %d and %d...' 
% (ew0, ew1)) + + # keep track of how many rows were added + rows_before = get_num_rows() + + # check Google Trends for new and/or revised data + sql = ''' + INSERT INTO + `ght` (`query`, `location`, `epiweek`, `value`) + VALUES + (%s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `value` = %s + ''' + total_rows = 0 + ght = GHT(API_KEY) + for term in terms: + print(' [%s] using term' % term) + ll, cl = len(locations), len(countries) + for i in range(max(ll, cl)): + location = locations[i] if i < ll else locations[0] + country = countries[i] if i < cl else countries[0] + try: + # term2 = ('"%s"' % term) if ' ' in term else term + term2 = term + attempt = 0 + while True: + attempt += 1 + try: + result = ght.get_data(ew0, ew1, location, term2, country=country) + break + except Exception as ex: + if attempt >= 5: + raise ex + else: + delay = 2 ** attempt + print(' [%s|%s] caught exception (will retry in %ds):' % (term, location, delay), ex) + time.sleep(delay) + values = [p['value'] for p in result['data']['lines'][0]['points']] + ew = result['start_week'] + num_missing = 0 + for v in values: + # Default SQL location value for US country for backwards compatibility + # i.e. California's location is still stored as 'CA', + # and having location == 'US' is still stored as 'US' + sql_location = location if location != NO_LOCATION_STR else country + + # Change SQL location for non-US countries + if country != 'US': + # Underscore added to distinguish countries from 2-letter US states + sql_location = country + "_" + if location != NO_LOCATION_STR: + sql_location = sql_location + location + sql_data = (term, sql_location, ew, v, v) + cur.execute(sql, sql_data) + total_rows += 1 + if v == 0: + num_missing += 1 + # print(' [%s|%s|%d] missing value' % (term, location, ew)) + ew = flu.add_epiweeks(ew, 1) + if num_missing > 0: + print(' [%s|%s] missing %d/%d value(s)' % (term, location, num_missing, len(values))) + except Exception as ex: + print(' [%s|%s] caught exception (will NOT retry):' % (term, location), ex) + + # keep track of how many rows were added + rows_after = get_num_rows() + print('Inserted %d/%d row(s)' % (rows_after - rows_before, total_rows)) + + # cleanup + cur.close() + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('location', action='store', type=str, default=None, help='location(s) (ex: all; US; TX; CA,LA,WY)') - parser.add_argument('term', action='store', type=str, default=None, help='term/query/topic (ex: all; /m/0cycc; "flu fever")') - parser.add_argument('--first', '-f', default=None, type=int, help='first epiweek override') - parser.add_argument('--last', '-l', default=None, type=int, help='last epiweek override') - parser.add_argument('--country', '-c', default='US', type=str, help='location country (ex: US; BR)') - args = parser.parse_args() - - # sanity check - first, last = args.first, args.last - if first is not None: - flu.check_epiweek(first) - if last is not None: - flu.check_epiweek(last) - if first is not None and last is not None and first > last: - raise Exception('epiweeks in the wrong order') - - # decide what to update - if args.location.lower() == 'all': - locations = LOCATIONS - elif args.location.lower() == 'none': - locations = [NO_LOCATION_STR] - else: - locations = args.location.upper().split(',') - if args.term.lower() == 'all': - terms = TERMS - else: - terms = [args.term] - - # country argument - # Check that country follows ISO 1366 Alpha-2 code. 
- # See https://www.iso.org/obp/ui/#search. - countries = args.country.upper().split(',') - if not all(map(lambda x: len(x) == 2, countries)): - raise Exception('country name must be two letters (ISO 1366 Alpha-2)') - - # if length of locations and countries is > 1, need to be the same - if len(locations) > 1 and len(countries) > 1 and len(locations) != len(countries): - raise Exception('locations and countries must be length 1, or same length') - - # run the update - update(locations, terms, first, last, countries) + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument('location', action='store', type=str, default=None, help='location(s) (ex: all; US; TX; CA,LA,WY)') + parser.add_argument('term', action='store', type=str, default=None, help='term/query/topic (ex: all; /m/0cycc; "flu fever")') + parser.add_argument('--first', '-f', default=None, type=int, help='first epiweek override') + parser.add_argument('--last', '-l', default=None, type=int, help='last epiweek override') + parser.add_argument('--country', '-c', default='US', type=str, help='location country (ex: US; BR)') + args = parser.parse_args() + + # sanity check + first, last = args.first, args.last + if first is not None: + flu.check_epiweek(first) + if last is not None: + flu.check_epiweek(last) + if first is not None and last is not None and first > last: + raise Exception('epiweeks in the wrong order') + + # decide what to update + if args.location.lower() == 'all': + locations = LOCATIONS + elif args.location.lower() == 'none': + locations = [NO_LOCATION_STR] + else: + locations = args.location.upper().split(',') + if args.term.lower() == 'all': + terms = TERMS + else: + terms = [args.term] + + # country argument + # Check that country follows ISO 1366 Alpha-2 code. + # See https://www.iso.org/obp/ui/#search. 
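# Editorial aside, not part of the patch: how update() pairs locations with
# countries when the two lists differ in length -- the shorter list is padded
# with its first element, which is why main() (just below) only requires the
# lists to have length 1 or equal lengths. The values here are hypothetical.
locations = ['CA', 'TX', 'WY']
countries = ['US']

ll, cl = len(locations), len(countries)
pairs = [
    (locations[i] if i < ll else locations[0],
     countries[i] if i < cl else countries[0])
    for i in range(max(ll, cl))
]
print(pairs)  # [('CA', 'US'), ('TX', 'US'), ('WY', 'US')]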
+ countries = args.country.upper().split(',') + if not all(map(lambda x: len(x) == 2, countries)): + raise Exception('country name must be two letters (ISO 1366 Alpha-2)') + + # if length of locations and countries is > 1, need to be the same + if len(locations) > 1 and len(countries) > 1 and len(locations) != len(countries): + raise Exception('locations and countries must be length 1, or same length') + + # run the update + update(locations, terms, first, last, countries) if __name__ == '__main__': - main() + main() diff --git a/src/acquisition/ght/google_health_trends.py b/src/acquisition/ght/google_health_trends.py index 66a11c227..31aaa84b5 100644 --- a/src/acquisition/ght/google_health_trends.py +++ b/src/acquisition/ght/google_health_trends.py @@ -20,120 +20,120 @@ * separated GHT class from ght_update.py ''' -# standard library + import argparse import time -# third party +import delphi.utils.epiweek as flu from apiclient.discovery import build - -# first party from delphi.utils.epidate import EpiDate -import delphi.utils.epiweek as flu NO_LOCATION_STR = 'none' + class GHT: - # Google Trends API endpoint - DISCOVERY_URL = 'https://www.googleapis.com/discovery/v1/apis/trends/v1beta/rest' - - def __init__(self, key, delay=1): - self.service = build('trends', 'v1beta', developerKey=key, discoveryServiceUrl=GHT.DISCOVERY_URL) - self.delay = delay - - # converts a YYYYWW week into a YYYY-MM-DD date (using Sunday of the week) - @staticmethod - def _ew2date(ew): - # parse the epiweek - year, week = flu.split_epiweek(ew) - # get the date object (middle of the week; Wednesday) - date = EpiDate.from_epiweek(year, week) - # go to the first day of the week (Sunday) - date = date.add_days(-3) - # date as string - return str(date) - - # get data from Google APIs - # see: https://developers.google.com/apis-explorer/#p/trends/v1beta/trends.getTimelinesForHealth - def get_data(self, start_week, end_week, location, term, resolution='week', country='US'): - start_date = GHT._ew2date(start_week) - end_date = GHT._ew2date(end_week) - num_weeks = flu.delta_epiweeks(start_week, end_week) + 1 - - # getTimelinesForHealth parameters - params = { - 'terms': term, - 'time_startDate': start_date, - 'time_endDate': end_date, - 'timelineResolution': resolution, - } - # We have a special check for the US for backwards compatibility. - # i.e. if the country is 'US' AND the location is 'US', just put the geo-restriction for country. - # In contrast, another country might have a sub-region with initials 'US' and we want the region restriction instead. 
- if country == 'US': - if location == 'US' or location == NO_LOCATION_STR: - params['geoRestriction_country'] = 'US' - else: - params['geoRestriction_region'] = 'US-' + location - else: - if location == NO_LOCATION_STR: - params['geoRestriction_country'] = country - else: - params['geoRestriction_region'] = country + '-' + location - - # make the API call - data = self.service.getTimelinesForHealth(**params).execute() - - # extract the values - try: - values = [p['value'] for p in data['lines'][0]['points']] - except: - values = None - - # throttle request rate - time.sleep(self.delay) - - # return the results - return { - 'start_week': start_week, - 'end_week': end_week, - 'num_weeks': num_weeks, - 'location': location, - 'country' : country, - 'term': term, - 'resolution': resolution, - 'data': data, - 'values': values, - } + # Google Trends API endpoint + DISCOVERY_URL = 'https://www.googleapis.com/discovery/v1/apis/trends/v1beta/rest' + + def __init__(self, key, delay=1): + self.service = build('trends', 'v1beta', developerKey=key, discoveryServiceUrl=GHT.DISCOVERY_URL) + self.delay = delay + + @staticmethod + def _ew2date(ew): + """Converts a YYYYWW week into a YYYY-MM-DD date (using Sunday of the week)""" + # parse the epiweek + year, week = flu.split_epiweek(ew) + # get the date object (middle of the week; Wednesday) + date = EpiDate.from_epiweek(year, week) + # go to the first day of the week (Sunday) + date = date.add_days(-3) + # date as string + return str(date) + + def get_data(self, start_week, end_week, location, term, resolution='week', country='US'): + """ + Get data from Google APIs + see: https://developers.google.com/apis-explorer/#p/trends/v1beta/trends.getTimelinesForHealth + """ + start_date = GHT._ew2date(start_week) + end_date = GHT._ew2date(end_week) + num_weeks = flu.delta_epiweeks(start_week, end_week) + 1 + + # getTimelinesForHealth parameters + params = { + 'terms': term, + 'time_startDate': start_date, + 'time_endDate': end_date, + 'timelineResolution': resolution, + } + # We have a special check for the US for backwards compatibility. + # i.e. if the country is 'US' AND the location is 'US', just put the geo-restriction for country. + # In contrast, another country might have a sub-region with initials 'US' and we want the region restriction instead. 
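# Editorial aside, not part of the patch: the branch below chooses between a
# country-level and a region-level geo restriction for getTimelinesForHealth.
# The logic is factored into a standalone helper purely for illustration; the
# helper name is hypothetical.
NO_LOCATION_STR = 'none'

def geo_restriction(country, location):
    if country == 'US':
        # backwards compatibility: plain 'US' keeps the country restriction
        if location in ('US', NO_LOCATION_STR):
            return {'geoRestriction_country': 'US'}
        return {'geoRestriction_region': 'US-' + location}
    if location == NO_LOCATION_STR:
        return {'geoRestriction_country': country}
    return {'geoRestriction_region': country + '-' + location}

print(geo_restriction('US', 'CA'))    # {'geoRestriction_region': 'US-CA'}
print(geo_restriction('US', 'none'))  # {'geoRestriction_country': 'US'}
print(geo_restriction('BR', 'SP'))    # {'geoRestriction_region': 'BR-SP'}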
+ if country == 'US': + if location == 'US' or location == NO_LOCATION_STR: + params['geoRestriction_country'] = 'US' + else: + params['geoRestriction_region'] = 'US-' + location + else: + if location == NO_LOCATION_STR: + params['geoRestriction_country'] = country + else: + params['geoRestriction_region'] = country + '-' + location + + # make the API call + data = self.service.getTimelinesForHealth(**params).execute() + + # extract the values + try: + values = [p['value'] for p in data['lines'][0]['points']] + except: # noqa + values = None + + # throttle request rate + time.sleep(self.delay) + + # return the results + return { + 'start_week': start_week, + 'end_week': end_week, + 'num_weeks': num_weeks, + 'location': location, + 'country': country, + 'term': term, + 'resolution': resolution, + 'data': data, + 'values': values, + } def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('apikey', action='store', type=str, default=None, help='API key') - parser.add_argument('startweek', action='store', type=int, default=None, help='first week (ex: 201440)') - parser.add_argument('endweek', action='store', type=int, default=None, help='last week (ex: 201520)') - parser.add_argument('location', action='store', type=str, default=None, help='location (ex: US)') - parser.add_argument('term', action='store', type=str, default=None, help='term/query/topic (ex: /m/0cycc)') - args = parser.parse_args() - - # get the data - ght = GHT(args.apikey) - result = ght.get_data(args.startweek, args.endweek, args.location, args.term) - values = result['values'] - - # sanity check - expected_weeks = result['num_weeks'] - received_weeks = len([v for v in values if v is not None and type(v) == float and v >= 0]) - if expected_weeks != received_weeks: - raise Exception('expected %d weeks, received %d' % (expected_weeks, received_weeks)) - - # results - epiweeks = [ew for ew in flu.range_epiweeks(args.startweek, args.endweek, inclusive=True)] - for (epiweek, value) in zip(epiweeks, values): - print('%6d: %.3f' % (epiweek, value)) + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument('apikey', action='store', type=str, default=None, help='API key') + parser.add_argument('startweek', action='store', type=int, default=None, help='first week (ex: 201440)') + parser.add_argument('endweek', action='store', type=int, default=None, help='last week (ex: 201520)') + parser.add_argument('location', action='store', type=str, default=None, help='location (ex: US)') + parser.add_argument('term', action='store', type=str, default=None, help='term/query/topic (ex: /m/0cycc)') + args = parser.parse_args() + + # get the data + ght = GHT(args.apikey) + result = ght.get_data(args.startweek, args.endweek, args.location, args.term) + values = result['values'] + + # sanity check + expected_weeks = result['num_weeks'] + received_weeks = len([v for v in values if v is not None and type(v) == float and v >= 0]) + if expected_weeks != received_weeks: + raise Exception('expected %d weeks, received %d' % (expected_weeks, received_weeks)) + + # results + epiweeks = [ew for ew in flu.range_epiweeks(args.startweek, args.endweek, inclusive=True)] + for (epiweek, value) in zip(epiweeks, values): + print('%6d: %.3f' % (epiweek, value)) if __name__ == '__main__': - main() + main() diff --git a/src/acquisition/kcdc/kcdc_update.py b/src/acquisition/kcdc/kcdc_update.py index 70c167738..b88c9e78f 100644 --- a/src/acquisition/kcdc/kcdc_update.py +++ b/src/acquisition/kcdc/kcdc_update.py @@ -14,7 
+14,7 @@ | Field | Type | Null | Key | Default | Extra | +--------------+-------------+------+-----+---------+----------------+ | id | int(11) | NO | PRI | NULL | auto_increment | -| release_date | date | NO | MUL | NULL | | +| release_date | date | NO | MUL | NULL | | | issue | int(11) | NO | MUL | NULL | | | epiweek | int(11) | NO | MUL | NULL | | | region | varchar(12) | NO | MUL | NULL | | @@ -32,19 +32,17 @@ import argparse import datetime -import requests - -# third party -import mysql.connector -# first party import delphi.operations.secrets as secrets -from delphi.utils.epiweek import delta_epiweeks, range_epiweeks, add_epiweeks +import mysql.connector +import requests from delphi.utils.epidate import EpiDate +from delphi.utils.epiweek import add_epiweeks, delta_epiweeks, range_epiweeks + def ensure_tables_exist(): - (u,p) = secrets.db.epi - cnx = mysql.connector.connect(user=u,password=p,database='epidata') + (u, p) = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database='epidata') try: cursor = cnx.cursor() cursor.execute(''' @@ -58,40 +56,44 @@ def ensure_tables_exist(): `ili` DOUBLE NOT NULL, UNIQUE KEY (`issue`, `epiweek`, `region`) ); - '''); + ''') cnx.commit() finally: cnx.close() + def safe_float(f): try: - return float(f.replace(',','')) - except: + return float(f.replace(',', '')) + except: # noqa return 0 + def safe_int(i): try: - return int(i.replace(',','')) - except: + return int(i.replace(',', '')) + except: # noqa return 0 + def get_rows(cnx, table='kcdc_ili'): - # Count and return the number of rows in the `kcdc_ili` table. - select = cnx.cursor() - select.execute('SELECT count(1) num FROM %s' % table) - for (num,) in select: - pass - select.close() - return num + # Count and return the number of rows in the `kcdc_ili` table. 
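# Editorial aside, not part of the patch: get_rows() follows the same pattern
# as the other acquisition scripts -- count rows before and after the insert
# loop so the log can report how many rows were added. A self-contained sketch
# using sqlite3 in place of mysql.connector, purely for illustration:
import sqlite3

cnx = sqlite3.connect(':memory:')
cnx.execute('CREATE TABLE kcdc_ili (epiweek INTEGER, ili REAL)')

def count_rows(cnx, table='kcdc_ili'):
    (num,) = cnx.execute('SELECT count(1) num FROM %s' % table).fetchone()
    return num

rows1 = count_rows(cnx)
cnx.executemany('INSERT INTO kcdc_ili VALUES (?, ?)', [(202001, 1.2), (202002, 3.4)])
rows2 = count_rows(cnx)
print('rows after: %d (added %d)' % (rows2, rows2 - rows1))  # rows after: 2 (added 2)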
+ select = cnx.cursor() + select.execute('SELECT count(1) num FROM %s' % table) + for (num,) in select: + pass + select.close() + return num + def get_kcdc_data(): issue = EpiDate.today().get_ew() last_season = issue//100 + (1 if issue % 100 > 35 else 0) url = 'http://www.cdc.go.kr/npt/biz/npp/iss/influenzaListAjax.do' params = { - 'icdNm': 'influenza', - 'startYear': '2004', # Started in 2004 - 'endYear': str(last_season) + 'icdNm': 'influenza', + 'startYear': '2004', # Started in 2004 + 'endYear': str(last_season) } response = requests.post(url, params) datas = response.json() @@ -99,20 +101,21 @@ def get_kcdc_data(): ews = [] ilis = [] ew1 = 200436 - for year in range(2004,last_season): + for year in range(2004, last_season): year_data = data[year-2004] if year > 2004: ew1 = ews[-1] + 1 ili_yr = year_data["VALUE"].split('`') ili_yr = [float(f) for f in ili_yr if f != ''] - ew2 = add_epiweeks(ew1,len(ili_yr)) - new_ews = list(range_epiweeks(ew1,ew2)) + ew2 = add_epiweeks(ew1, len(ili_yr)) + new_ews = list(range_epiweeks(ew1, ew2)) for i in range(len(new_ews)): j = float(ili_yr[i]) ilis.append(j) ews.append(new_ews[i]) return ews, ilis + def update_from_data(ews, ilis, date, issue, test_mode=False): u, p = secrets.db.epi cnx = mysql.connector.connect(user=u, password=p, database='epidata') @@ -136,8 +139,8 @@ def update_from_data(ews, ilis, date, issue, test_mode=False): ili = ilis[i] lag = delta_epiweeks(ews[i], issue) - insert_args = [date,issue,ew,'ROK',lag,ili] - update_args = [date,ili] + insert_args = [date, issue, ew, 'ROK', lag, ili] + update_args = [date, ili] try: insert.execute(sql % tuple(insert_args + update_args)) except Exception: @@ -151,9 +154,10 @@ def update_from_data(ews, ilis, date, issue, test_mode=False): else: cnx.commit() rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2,rows2-rows1)) + print('rows after: %d (added %d)' % (rows2, rows2-rows1)) cnx.close() + def main(): # args and usage parser = argparse.ArgumentParser() @@ -170,7 +174,7 @@ def main(): ensure_tables_exist() - ews,ilis = get_kcdc_data() + ews, ilis = get_kcdc_data() update_from_data(ews, ilis, date, issue, test_mode=args.test) diff --git a/src/acquisition/nidss/taiwan_nidss.py b/src/acquisition/nidss/taiwan_nidss.py index 27da863e1..1a861bc16 100644 --- a/src/acquisition/nidss/taiwan_nidss.py +++ b/src/acquisition/nidss/taiwan_nidss.py @@ -24,246 +24,243 @@ * Original version, inspired by healthtweets.py """ -# standard library import argparse import base64 import re -# third party import requests - -# first party -from delphi.utils.epiweek import range_epiweeks, add_epiweeks, check_epiweek +from delphi.utils.epiweek import add_epiweeks, check_epiweek, range_epiweeks class NIDSS: - """An API for scraping the NIDSS site.""" + """An API for scraping the NIDSS site.""" - # The page where the flu data is kept - FLU_URL = 'https://nidss.cdc.gov.tw/en/CDCWNH01.aspx?dc=wnh' + # The page where the flu data is kept + FLU_URL = 'https://nidss.cdc.gov.tw/en/CDCWNH01.aspx?dc=wnh' - # Link to the dengue data - DENGUE_URL = 'http://nidss.cdc.gov.tw/Download/Weekly_Age_County_Gender_061.csv' + # Link to the dengue data + DENGUE_URL = 'http://nidss.cdc.gov.tw/Download/Weekly_Age_County_Gender_061.csv' - # Translate location names to English - # https://en.wikipedia.org/wiki/List_of_administrative_divisions_of_Taiwan - _TRANSLATED = { - b'5Y2X5oqV57ij': 'Nantou_County', - b'5Y+w5Lit5biC': 'Taichung_City', - b'5Y+w5YyX5biC': 'Taipei_City', - b'5Y+w5Y2X5biC': 'Tainan_City', - b'5Y+w5p2x57ij': 
'Taitung_County', - b'5ZiJ576p5biC': 'Chiayi_City', - b'5ZiJ576p57ij': 'Chiayi_County', - b'5Z+66ZqG5biC': 'Keelung_City', - b'5a6c6Jit57ij': 'Yilan_County', - b'5bGP5p2x57ij': 'Pingtung_County', - b'5b2w5YyW57ij': 'Changhua_County', - b'5paw5YyX5biC': 'New_Taipei_City', - b'5paw56u55biC': 'Hsinchu_City', - b'5paw56u557ij': 'Hsinchu_County', - b'5qGD5ZyS5biC': 'Taoyuan_City', - b'5r6O5rmW57ij': 'Penghu_County', - b'6Iqx6JOu57ij': 'Hualien_County', - b'6IuX5qCX57ij': 'Miaoli_County', - b'6YeR6ZaA57ij': 'Kinmen_County', - b'6Zuy5p6X57ij': 'Yunlin_County', - b'6auY6ZuE5biC': 'Kaohsiung_City', - b'6YCj5rGf57ij': 'Lienchiang_County', - } + # Translate location names to English + # https://en.wikipedia.org/wiki/List_of_administrative_divisions_of_Taiwan + _TRANSLATED = { + b'5Y2X5oqV57ij': 'Nantou_County', + b'5Y+w5Lit5biC': 'Taichung_City', + b'5Y+w5YyX5biC': 'Taipei_City', + b'5Y+w5Y2X5biC': 'Tainan_City', + b'5Y+w5p2x57ij': 'Taitung_County', + b'5ZiJ576p5biC': 'Chiayi_City', + b'5ZiJ576p57ij': 'Chiayi_County', + b'5Z+66ZqG5biC': 'Keelung_City', + b'5a6c6Jit57ij': 'Yilan_County', + b'5bGP5p2x57ij': 'Pingtung_County', + b'5b2w5YyW57ij': 'Changhua_County', + b'5paw5YyX5biC': 'New_Taipei_City', + b'5paw56u55biC': 'Hsinchu_City', + b'5paw56u557ij': 'Hsinchu_County', + b'5qGD5ZyS5biC': 'Taoyuan_City', + b'5r6O5rmW57ij': 'Penghu_County', + b'6Iqx6JOu57ij': 'Hualien_County', + b'6IuX5qCX57ij': 'Miaoli_County', + b'6YeR6ZaA57ij': 'Kinmen_County', + b'6Zuy5p6X57ij': 'Yunlin_County', + b'6auY6ZuE5biC': 'Kaohsiung_City', + b'6YCj5rGf57ij': 'Lienchiang_County', + } - # Map locations to regions - # https://en.wikipedia.org/wiki/List_of_administrative_divisions_of_Taiwan - # https://en.wikipedia.org/wiki/Regions_of_Taiwan#Hexchotomy - LOCATION_TO_REGION = { - # Taipei - 'Taipei_City': 'Taipei', - 'Keelung_City': 'Taipei', - 'New_Taipei_City': 'Taipei', - 'Yilan_County': 'Taipei', - 'Kinmen_County': 'Taipei', - 'Lienchiang_County': 'Taipei', - # Northern - 'Hsinchu_City': 'Northern', - 'Taoyuan_City': 'Northern', - 'Hsinchu_County': 'Northern', - 'Miaoli_County': 'Northern', - # Central - 'Taichung_City': 'Central', - 'Changhua_County': 'Central', - 'Nantou_County': 'Central', - # Southern - 'Tainan_City': 'Southern', - 'Chiayi_City': 'Southern', - 'Yunlin_County': 'Southern', - 'Chiayi_County': 'Southern', - # Kaoping - 'Kaohsiung_City': 'Kaoping', - 'Pingtung_County': 'Kaoping', - 'Penghu_County': 'Kaoping', - # Eastern - 'Hualien_County': 'Eastern', - 'Taitung_County': 'Eastern', - } + # Map locations to regions + # https://en.wikipedia.org/wiki/List_of_administrative_divisions_of_Taiwan + # https://en.wikipedia.org/wiki/Regions_of_Taiwan#Hexchotomy + LOCATION_TO_REGION = { + # Taipei + 'Taipei_City': 'Taipei', + 'Keelung_City': 'Taipei', + 'New_Taipei_City': 'Taipei', + 'Yilan_County': 'Taipei', + 'Kinmen_County': 'Taipei', + 'Lienchiang_County': 'Taipei', + # Northern + 'Hsinchu_City': 'Northern', + 'Taoyuan_City': 'Northern', + 'Hsinchu_County': 'Northern', + 'Miaoli_County': 'Northern', + # Central + 'Taichung_City': 'Central', + 'Changhua_County': 'Central', + 'Nantou_County': 'Central', + # Southern + 'Tainan_City': 'Southern', + 'Chiayi_City': 'Southern', + 'Yunlin_County': 'Southern', + 'Chiayi_County': 'Southern', + # Kaoping + 'Kaohsiung_City': 'Kaoping', + 'Pingtung_County': 'Kaoping', + 'Penghu_County': 'Kaoping', + # Eastern + 'Hualien_County': 'Eastern', + 'Taitung_County': 'Eastern', + } - @staticmethod - def _get_metadata(html): - issue_pattern = re.compile('^.*Latest available data: Week 
(\\d+), (\\d{4})\\..*$') - release_pattern = re.compile('^.*Data as of \\d+:\\d+:\\d+, (\\d{4})/(\\d{2})/(\\d{2})\\..*$') - issue, release = None, None - for line in html.split('\n'): - match = issue_pattern.match(line) - if match is not None: - year, week = int(match.group(2)), int(match.group(1)) - issue = year * 100 + week - match = release_pattern.match(line) - if match is not None: - year, month, day = int(match.group(1)), int(match.group(2)), int(match.group(3)) - release = '%04d-%02d-%02d' % (year, month, day) - if issue is None or release is None: - raise Exception('metadata not found') - return issue, release + @staticmethod + def _get_metadata(html): + issue_pattern = re.compile('^.*Latest available data: Week (\\d+), (\\d{4})\\..*$') + release_pattern = re.compile('^.*Data as of \\d+:\\d+:\\d+, (\\d{4})/(\\d{2})/(\\d{2})\\..*$') + issue, release = None, None + for line in html.split('\n'): + match = issue_pattern.match(line) + if match is not None: + year, week = int(match.group(2)), int(match.group(1)) + issue = year * 100 + week + match = release_pattern.match(line) + if match is not None: + year, month, day = int(match.group(1)), int(match.group(2)), int(match.group(3)) + release = '%04d-%02d-%02d' % (year, month, day) + if issue is None or release is None: + raise Exception('metadata not found') + return issue, release - @staticmethod - def _get_flu_data(html): - week_pattern = re.compile('^categories: \\[(.*)\\],$') - value_pattern = re.compile('^series: \\[(.*)\\],$') - data = {} - parsing_ili = True - for line in html.split('\n'): - line = line.strip() - match = week_pattern.match(line) - if match is not None: - weeks = [int(x[1:-1]) for x in match.group(1).split(',')] - for week in weeks: - check_epiweek(week) - if week not in data: - data[week] = {} - match = value_pattern.match(line) - if match is not None: - for item in match.group(1).split('},{'): - parts = item.replace('{', '').replace('}', '').strip().split(' ') - location = parts[1][1:-2] - def num(value): - if parsing_ili: - return float(value) - else: - if '.' 
in value: - raise Exception('expected type int for visits') - return int(value) - values = [num(x) for x in parts[3][1:-1].split(',')] - unit = 'ili' if parsing_ili else 'visits' - if len(weeks) != len(values): - raise Exception('len(weeks) != len(values)') - for week, value in zip(weeks, values): - if location not in data[week]: - data[week][location] = {} - data[week][location][unit] = value - parsing_ili = False - if len(data) == 0: - raise Exception('no data') - return data + @staticmethod + def _get_flu_data(html): + week_pattern = re.compile('^categories: \\[(.*)\\],$') + value_pattern = re.compile('^series: \\[(.*)\\],$') + data = {} + parsing_ili = True + for line in html.split('\n'): + line = line.strip() + match = week_pattern.match(line) + if match is not None: + weeks = [int(x[1:-1]) for x in match.group(1).split(',')] + for week in weeks: + check_epiweek(week) + if week not in data: + data[week] = {} + match = value_pattern.match(line) + if match is not None: + for item in match.group(1).split('},{'): + parts = item.replace('{', '').replace('}', '').strip().split(' ') + location = parts[1][1:-2] - @staticmethod - def get_flu_data(): - # Fetch the flu page - response = requests.get(NIDSS.FLU_URL) - if response.status_code != 200: - raise Exception('request failed [%d]' % response.status_code) - html = response.text - # Parse metadata - latest_week, release_date = NIDSS._get_metadata(html) - # Parse flu data - data = NIDSS._get_flu_data(html) - # Return results indexed by week and location - return latest_week, release_date, data + def num(value): + if parsing_ili: + return float(value) + else: + if '.' in value: + raise Exception('expected type int for visits') + return int(value) + values = [num(x) for x in parts[3][1:-1].split(',')] + unit = 'ili' if parsing_ili else 'visits' + if len(weeks) != len(values): + raise Exception('len(weeks) != len(values)') + for week, value in zip(weeks, values): + if location not in data[week]: + data[week][location] = {} + data[week][location][unit] = value + parsing_ili = False + if len(data) == 0: + raise Exception('no data') + return data - @staticmethod - def get_dengue_data(first_week, last_week): - # Check week order - if first_week > last_week: - first_week, last_week = last_week, first_week - # Bounds check - if first_week < 200301 or last_week < 200301: - raise Exception('week out of range') - # Initialize data by week and location (zeroes are not reported) - data = {} - for week in range_epiweeks(first_week, add_epiweeks(last_week, 1)): - data[week] = {} - for location in NIDSS.LOCATION_TO_REGION.keys(): - data[week][location] = 0 - # Download CSV - response = requests.get(NIDSS.DENGUE_URL) - if response.status_code != 200: - raise Exception('export Dengue failed [%d]' % response.status_code) - csv = response.content.decode('big5-tw') - # Parse the data - lines = [l.strip() for l in csv.split('\n')[1:] if l.strip() != ''] - for line in lines: - fields = line.split(',') - location_b64 = base64.b64encode(fields[3].encode('utf-8')) - location = NIDSS._TRANSLATED[location_b64] - # Fields currently unused: - # region = NIDSS.LOCATION_TO_REGION[location] - # imported_b64 = base64.b64encode(fields[6].encode('utf-8')) - # imported = imported_b64 == b'5piv' - # sex = fields[5] - # age = fields[7] - count = int(fields[8]) - year = int(fields[1]) - week = int(fields[2]) - # Week 53 was reported each year in 2003-2007 - if year < 2008 and year != 2003 and week > 52: - week = 52 - # Epiweek system change in 2009 - # See also: 
http://research.undefinedx.com/forum/index.php?topic=300.0 - if year == 2009: - week -= 1 - if week == 0: - year, week = 2008, 53 - epiweek = year * 100 + week - if epiweek < first_week or epiweek > last_week: - # Outside of the requested range - continue - if epiweek not in data or location not in data[epiweek]: - # Not a vaild U.S. epiweek - raise Exception('data missing %d-%s' % (epiweek, location)) - # Add the counts to the location on this epiweek - data[epiweek][location] += count - # Return results indexed by week and location - return data + @staticmethod + def get_flu_data(): + # Fetch the flu page + response = requests.get(NIDSS.FLU_URL) + if response.status_code != 200: + raise Exception('request failed [%d]' % response.status_code) + html = response.text + # Parse metadata + latest_week, release_date = NIDSS._get_metadata(html) + # Parse flu data + data = NIDSS._get_flu_data(html) + # Return results indexed by week and location + return latest_week, release_date, data + + @staticmethod + def get_dengue_data(first_week, last_week): + # Check week order + if first_week > last_week: + first_week, last_week = last_week, first_week + # Bounds check + if first_week < 200301 or last_week < 200301: + raise Exception('week out of range') + # Initialize data by week and location (zeroes are not reported) + data = {} + for week in range_epiweeks(first_week, add_epiweeks(last_week, 1)): + data[week] = {} + for location in NIDSS.LOCATION_TO_REGION.keys(): + data[week][location] = 0 + # Download CSV + response = requests.get(NIDSS.DENGUE_URL) + if response.status_code != 200: + raise Exception('export Dengue failed [%d]' % response.status_code) + csv = response.content.decode('big5-tw') + # Parse the data + lines = [l.strip() for l in csv.split('\n')[1:] if l.strip() != ''] # noqa + for line in lines: + fields = line.split(',') + location_b64 = base64.b64encode(fields[3].encode('utf-8')) + location = NIDSS._TRANSLATED[location_b64] + # Fields currently unused: + # region = NIDSS.LOCATION_TO_REGION[location] + # imported_b64 = base64.b64encode(fields[6].encode('utf-8')) + # imported = imported_b64 == b'5piv' + # sex = fields[5] + # age = fields[7] + count = int(fields[8]) + year = int(fields[1]) + week = int(fields[2]) + # Week 53 was reported each year in 2003-2007 + if year < 2008 and year != 2003 and week > 52: + week = 52 + # Epiweek system change in 2009 + # See also: http://research.undefinedx.com/forum/index.php?topic=300.0 + if year == 2009: + week -= 1 + if week == 0: + year, week = 2008, 53 + epiweek = year * 100 + week + if epiweek < first_week or epiweek > last_week: + # Outside of the requested range + continue + if epiweek not in data or location not in data[epiweek]: + # Not a vaild U.S. 
epiweek + raise Exception('data missing %d-%s' % (epiweek, location)) + # Add the counts to the location on this epiweek + data[epiweek][location] += count + # Return results indexed by week and location + return data def main(): - # Args and usage - parser = argparse.ArgumentParser() - parser.add_argument( - 'epiweek', - action='store', - type=int, - help='fetch data on this epiweek (ex: 201537)' - ) - args = parser.parse_args() - ew = args.epiweek + # Args and usage + parser = argparse.ArgumentParser() + parser.add_argument( + 'epiweek', + action='store', + type=int, + help='fetch data on this epiweek (ex: 201537)' + ) + args = parser.parse_args() + ew = args.epiweek - # Get the data - latest_week, release_date, fdata = NIDSS.get_flu_data() - ddata = NIDSS.get_dengue_data(ew, ew) + # Get the data + latest_week, release_date, fdata = NIDSS.get_flu_data() + ddata = NIDSS.get_dengue_data(ew, ew) - # Print the results - print('*** Meta ***') - print('latest_week:', latest_week) - print('release_date:', release_date) - print('*** Flu ***') - for region in sorted(list(fdata[ew].keys())): - visits, ili = fdata[ew][region]['visits'], fdata[ew][region]['ili'] - print('region=%s | visits=%d | ili=%.3f' % (region, visits, ili)) - print('*** Dengue ***') - for location in sorted(list(ddata[ew].keys())): - region = NIDSS.LOCATION_TO_REGION[location] - count = ddata[ew][location] - print('location=%s | region=%s | count=%d' % (location, region, count)) + # Print the results + print('*** Meta ***') + print('latest_week:', latest_week) + print('release_date:', release_date) + print('*** Flu ***') + for region in sorted(list(fdata[ew].keys())): + visits, ili = fdata[ew][region]['visits'], fdata[ew][region]['ili'] + print('region=%s | visits=%d | ili=%.3f' % (region, visits, ili)) + print('*** Dengue ***') + for location in sorted(list(ddata[ew].keys())): + region = NIDSS.LOCATION_TO_REGION[location] + count = ddata[ew][location] + print('location=%s | region=%s | count=%d' % (location, region, count)) if __name__ == '__main__': - main() + main() diff --git a/src/acquisition/nidss/taiwan_update.py b/src/acquisition/nidss/taiwan_update.py index 830a7738d..c4779ea65 100644 --- a/src/acquisition/nidss/taiwan_update.py +++ b/src/acquisition/nidss/taiwan_update.py @@ -73,106 +73,104 @@ * Original version, inspired by load_epidata_fluview.py """ -# standard library + import argparse -# third party +import delphi.operations.secrets as secrets import mysql.connector +from delphi.utils.epiweek import * -# first party from .taiwan_nidss import NIDSS -import delphi.operations.secrets as secrets -from delphi.utils.epiweek import * # Get a row count just to know how many new rows are inserted def get_rows(cnx): - select = cnx.cursor() - select.execute('SELECT count(1) num FROM nidss_flu') - for (num,) in select: - rows_flu = num - select.execute('SELECT count(1) num FROM nidss_dengue') - for (num,) in select: - rows_dengue = num - select.close() - return (rows_flu, rows_dengue) + select = cnx.cursor() + select.execute('SELECT count(1) num FROM nidss_flu') + for (num,) in select: + rows_flu = num + select.execute('SELECT count(1) num FROM nidss_dengue') + for (num,) in select: + rows_dengue = num + select.close() + return (rows_flu, rows_dengue) def update(test_mode=False): - # test mode - if test_mode: - print('test mode enabled: changes will not be saved') - - # Database connection - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - rows1 = get_rows(cnx) - print('rows 
before (flu): %d' % (rows1[0])) - print('rows before (dengue): %d' % (rows1[1])) - insert = cnx.cursor() - sql_flu = ''' - INSERT INTO - `nidss_flu` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `visits`, `ili`) - VALUES - (%s, %s, %s, %s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `release_date` = least(`release_date`, %s), `visits` = %s, `ili` = %s - ''' - sql_dengue = ''' - INSERT INTO - `nidss_dengue` (`epiweek`, `location`, `region`, `count`) - VALUES - (%s, %s, %s, %s) - ON DUPLICATE KEY UPDATE - `count` = %s - ''' - - # Scrape flu data - current_week, release_date, data = NIDSS.get_flu_data() - for epiweek in sorted(list(data.keys())): - lag = delta_epiweeks(epiweek, current_week) - for region in data[epiweek].keys(): - visits, ili = data[epiweek][region]['visits'], data[epiweek][region]['ili'] - params1 = [release_date, current_week, epiweek, region, lag, visits, ili] - params2 = [release_date, visits, ili] - insert.execute(sql_flu, tuple(params1 + params2)) - - # Scrape dengue data from the past year - data = NIDSS.get_dengue_data(add_epiweeks(current_week, -51), current_week) - for epiweek in sorted(list(data.keys())): - for location in sorted(list(data[epiweek].keys())): - region = NIDSS.LOCATION_TO_REGION[location] - count = data[epiweek][location] - params = (epiweek, location, region, count, count) - insert.execute(sql_dengue, params) - - # Cleanup - insert.close() - rows2 = get_rows(cnx) - print('rows after (flu): %d (added %d)' % (rows2[0], rows2[0] - rows1[0])) - print('rows after (dengue): %d (added %d)' % (rows2[1], rows2[1] - rows1[1])) - if test_mode: - print('test mode: changes not commited') - else: - cnx.commit() - cnx.close() + # test mode + if test_mode: + print('test mode enabled: changes will not be saved') + + # Database connection + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database='epidata') + rows1 = get_rows(cnx) + print('rows before (flu): %d' % (rows1[0])) + print('rows before (dengue): %d' % (rows1[1])) + insert = cnx.cursor() + sql_flu = ''' + INSERT INTO + `nidss_flu` (`release_date`, `issue`, `epiweek`, `region`, `lag`, `visits`, `ili`) + VALUES + (%s, %s, %s, %s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `release_date` = least(`release_date`, %s), `visits` = %s, `ili` = %s + ''' + sql_dengue = ''' + INSERT INTO + `nidss_dengue` (`epiweek`, `location`, `region`, `count`) + VALUES + (%s, %s, %s, %s) + ON DUPLICATE KEY UPDATE + `count` = %s + ''' + + # Scrape flu data + current_week, release_date, data = NIDSS.get_flu_data() + for epiweek in sorted(list(data.keys())): + lag = delta_epiweeks(epiweek, current_week) + for region in data[epiweek].keys(): + visits, ili = data[epiweek][region]['visits'], data[epiweek][region]['ili'] + params1 = [release_date, current_week, epiweek, region, lag, visits, ili] + params2 = [release_date, visits, ili] + insert.execute(sql_flu, tuple(params1 + params2)) + + # Scrape dengue data from the past year + data = NIDSS.get_dengue_data(add_epiweeks(current_week, -51), current_week) + for epiweek in sorted(list(data.keys())): + for location in sorted(list(data[epiweek].keys())): + region = NIDSS.LOCATION_TO_REGION[location] + count = data[epiweek][location] + params = (epiweek, location, region, count, count) + insert.execute(sql_dengue, params) + + # Cleanup + insert.close() + rows2 = get_rows(cnx) + print('rows after (flu): %d (added %d)' % (rows2[0], rows2[0] - rows1[0])) + print('rows after (dengue): %d (added %d)' % (rows2[1], rows2[1] - rows1[1])) + if test_mode: + print('test mode: 
changes not commited') + else: + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument( - '--test', - '-t', - action='store_true', - default=False, - help='test mode, do not commit changes' - ) - args = parser.parse_args() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument( + '--test', + '-t', + action='store_true', + default=False, + help='test mode, do not commit changes' + ) + args = parser.parse_args() - # fetch and store NIDSS data - update(args.test) + # fetch and store NIDSS data + update(args.test) if __name__ == '__main__': - main() + main() diff --git a/src/acquisition/norostat/norostat_add_history.py b/src/acquisition/norostat/norostat_add_history.py index 64fd11ff7..a1a91cbc8 100644 --- a/src/acquisition/norostat/norostat_add_history.py +++ b/src/acquisition/norostat/norostat_add_history.py @@ -7,39 +7,41 @@ script as well. """ -# standard library -import re +import collections import os +import re import time -import collections - -# first party -from . import norostat_sql -from . import norostat_raw +from . import norostat_raw, norostat_sql def main(): - norostat_sql.ensure_tables_exist() - snapshot_dir = os.path.expanduser("~/norostat_history/wayback/websites/www.cdc.gov/norovirus/reporting/norostat/data-table.html/") - snapshot_version_counter = collections.Counter() - for subdir in os.listdir(snapshot_dir): - if re.match(r'[0-9]+', subdir) is not None: - # appears to be snapshot dir - snapshot_version_counter[subdir] = 0 # register that loop found this snapshot directory - for norostat_capitalization in ["norostat","noroSTAT"]: - time.sleep(0.002) # ensure parse times are unique, assuming OS can accurately sleep and measure to ms precision - path = os.path.join(snapshot_dir,subdir,"norovirus","reporting",norostat_capitalization,"data-table.html") - if os.path.isfile(path): - print("Processing file ", path) - with open(path, 'r') as datatable_file: - content = datatable_file.read() - wide_raw = norostat_raw.parse_content_to_wide_raw(content) - long_raw = norostat_raw.melt_wide_raw_to_long_raw(wide_raw) - norostat_sql.record_long_raw(long_raw) - snapshot_version_counter[subdir] += 1 - print('Successfully uploaded the following snapshots, with the count indicating the number of data-table versions found inside each snapshot (expected to be 1, or maybe 2 if there was a change in capitalization; 0 indicates the NoroSTAT page was not found within a snapshot directory); just "Counter()" indicates no snapshot directories were found:', snapshot_version_counter) - norostat_sql.update_point() + norostat_sql.ensure_tables_exist() + snapshot_dir = os.path.expanduser("~/norostat_history/wayback/websites/www.cdc.gov/norovirus/reporting/norostat/data-table.html/") + snapshot_version_counter = collections.Counter() + for subdir in os.listdir(snapshot_dir): + if re.match(r'[0-9]+', subdir) is not None: + # appears to be snapshot dir + snapshot_version_counter[subdir] = 0 # register that loop found this snapshot directory + for norostat_capitalization in ["norostat", "noroSTAT"]: + time.sleep(0.002) # ensure parse times are unique, assuming OS can accurately sleep and measure to ms precision + path = os.path.join(snapshot_dir, subdir, "norovirus", "reporting", norostat_capitalization, "data-table.html") + if os.path.isfile(path): + print("Processing file ", path) + with open(path, 'r') as datatable_file: + content = datatable_file.read() + wide_raw = norostat_raw.parse_content_to_wide_raw(content) + 
long_raw = norostat_raw.melt_wide_raw_to_long_raw(wide_raw) + norostat_sql.record_long_raw(long_raw) + snapshot_version_counter[subdir] += 1 + print( + 'Successfully uploaded the following snapshots, with the count indicating the number of data-table versions found inside each snapshot \ + (expected to be 1, or maybe 2 if there was a change in capitalization; 0 indicates the NoroSTAT page was not found within a snapshot directory); \ + just "Counter()" indicates no snapshot directories were found:', + snapshot_version_counter + ) + norostat_sql.update_point() + if __name__ == '__main__': - main() + main() diff --git a/src/acquisition/norostat/norostat_raw.py b/src/acquisition/norostat/norostat_raw.py index 582de9684..10a5e648e 100644 --- a/src/acquisition/norostat/norostat_raw.py +++ b/src/acquisition/norostat/norostat_raw.py @@ -6,107 +6,110 @@ and (constant) location. Here, the location will be (a str representing) a set of states. """ - - - -# standard library import datetime -import re import pickle +import re -# third party -import requests import lxml.html import pandas as pd +import requests + +from .norostat_utils import dtype_kind, expect_result_eq, expect_value_eq -# first party -from .norostat_utils import * def fetch_content(norostat_datatable_url="https://www.cdc.gov/norovirus/reporting/norostat/data-table.html"): - """Download NoroSTAT data-table. Returns the html content.""" - headers = { - 'User-Agent': 'delphibot/1.0 (+https://delphi.cmu.edu/)', - } - resp = requests.get(norostat_datatable_url, headers=headers) - expect_value_eq(resp.status_code, 200, - 'Wanted status code {}. Received: ') - expect_value_eq(resp.headers.get("Content-Type"), "text/html", - 'Expected Content-Type "{}"; Received ') - return resp.content + """Download NoroSTAT data-table. Returns the html content.""" + headers = { + 'User-Agent': 'delphibot/1.0 (+https://delphi.cmu.edu/)', + } + resp = requests.get(norostat_datatable_url, headers=headers) + expect_value_eq(resp.status_code, 200, + 'Wanted status code {}. Received: ') + expect_value_eq(resp.headers.get("Content-Type"), "text/html", + 'Expected Content-Type "{}"; Received ') + return resp.content + def save_sample_content(content, f="sample_content.pickle"): - """Save the content from fetch_content into a pickle file for most testing (don't download unnecessarily).""" - with open(f, "wb") as handle: - pickle.dump(content, handle) + """Save the content from fetch_content into a pickle file for most testing (don't download unnecessarily).""" + with open(f, "wb") as handle: + pickle.dump(content, handle) + def load_sample_content(f="sample_content.pickle"): - """Load data from a past call to fetch_content from a pickle file for most testing (don't download unnecessarily).""" - with open(f, "rb") as handle: - content = pickle.load(handle) - return content + """Load data from a past call to fetch_content from a pickle file for most testing (don't download unnecessarily).""" + with open(f, "rb") as handle: + content = pickle.load(handle) + return content + def parse_content_to_wide_raw(content): - """Convert the html content for the data-table into a wide data frame, then stick it in a tuple along with the release_date, parse_time, and (constant) location.""" - parse_time = datetime.datetime.now() - html_root = lxml.html.fromstring(content) - # Extract the release date, a.k.a. dateModified, a.k.a. 
"Page last updated" date; ~Dec 2018 this is only available in a meta tag; previously, it was available in a visible span - dateModified_meta_elts = html_root.xpath('//meta[@property="cdc:last_updated"]') - dateModified_span_elts = html_root.xpath('//span[@itemprop="dateModified"]') - if len(dateModified_meta_elts) == 1: - [dateModified_meta_elt] = dateModified_meta_elts - dateModified = dateModified_meta_elt.attrib['content'] - elif len(dateModified_span_elts) == 1: - [dateModified_span_elt] = dateModified_span_elts - dateModified = dateModified_span_elt.text - else: - raise Exception("Could not find the expected number of dateModified meta or span tags.") - # FIXME check/enforce locale - release_date = datetime.datetime.strptime(dateModified, "%B %d, %Y").date() - # Check that table description still specifies suspected&confirmed norovirus - # outbreaks (insensitive to case of certain letters and allowing for both old - # "to the" and new "through the" text), then extract list of states from the - # description: - [description_elt] = html_root.xpath('''//p[ - contains(translate(text(), "SCNORHD", "scnorhd"), "suspected and confirmed norovirus outbreaks reported by state health departments in") and - ( - contains(text(), "to the") or - contains(text(), "through the") + """ + Convert the html content for the data-table into a wide data frame, + then stick it in a tuple along with the release_date, parse_time, and (constant) location. + """ + parse_time = datetime.datetime.now() + html_root = lxml.html.fromstring(content) + # Extract the release date, a.k.a. dateModified, a.k.a. "Page last updated" date; + # ~Dec 2018 this is only available in a meta tag; previously, it was available in a visible span + dateModified_meta_elts = html_root.xpath('//meta[@property="cdc:last_updated"]') + dateModified_span_elts = html_root.xpath('//span[@itemprop="dateModified"]') + if len(dateModified_meta_elts) == 1: + [dateModified_meta_elt] = dateModified_meta_elts + dateModified = dateModified_meta_elt.attrib['content'] + elif len(dateModified_span_elts) == 1: + [dateModified_span_elt] = dateModified_span_elts + dateModified = dateModified_span_elt.text + else: + raise Exception("Could not find the expected number of dateModified meta or span tags.") + # FIXME check/enforce locale + release_date = datetime.datetime.strptime(dateModified, "%B %d, %Y").date() + # Check that table description still specifies suspected&confirmed norovirus + # outbreaks (insensitive to case of certain letters and allowing for both old + # "to the" and new "through the" text), then extract list of states from the + # description: + [description_elt] = html_root.xpath('''//p[ + contains(translate(text(), "SCNORHD", "scnorhd"), "suspected and confirmed norovirus outbreaks reported by state health departments in") and + ( + contains(text(), "to the") or + contains(text(), "through the") + ) + ]''') + location = re.match(".*?[Dd]epartments in (.*?) (?:to)|(?:through) the.*$", description_elt.text).group(1) + # Attempt to find exactly 1 table (note: it would be nice to filter on the + # associated caption, but no such caption is present in earlier versions): + [table] = html_root.xpath('//table') + # Convert html table to DataFrame: + # Directly reading in the table with pd.read_html performs unwanted dtype + # inference, but reveals the column names: + [wide_raw_df_with_unwanted_conversions] = pd.read_html(lxml.html.tostring(table)) + # We want all columns to be string columns. 
However, there does not appear + # to be an option to disable dtype inference in pd.read_html. Hide all + # entries inside 1-tuple wrappers using pre-dtype-inference converters, + # then unpack afterward (the entries fed to the converters should already + # be strings, but "convert" them to strings just in case): + [wide_raw_df_with_wrappers] = pd.read_html( + lxml.html.tostring(table), + converters={col: lambda entry: (str(entry),) + for col in wide_raw_df_with_unwanted_conversions.columns} ) - ]''') - location = re.match(".*?[Dd]epartments in (.*?) (?:to)|(?:through) the.*$", description_elt.text).group(1) - # Attempt to find exactly 1 table (note: it would be nice to filter on the - # associated caption, but no such caption is present in earlier versions): - [table] = html_root.xpath('//table') - # Convert html table to DataFrame: - # Directly reading in the table with pd.read_html performs unwanted dtype - # inference, but reveals the column names: - [wide_raw_df_with_unwanted_conversions] = pd.read_html(lxml.html.tostring(table)) - # We want all columns to be string columns. However, there does not appear - # to be an option to disable dtype inference in pd.read_html. Hide all - # entries inside 1-tuple wrappers using pre-dtype-inference converters, - # then unpack afterward (the entries fed to the converters should already - # be strings, but "convert" them to strings just in case): - [wide_raw_df_with_wrappers] = pd.read_html( - lxml.html.tostring(table), - converters= {col: lambda entry: (str(entry),) - for col in wide_raw_df_with_unwanted_conversions.columns} - ) - # Unwrap entries: - wide_raw_df = wide_raw_df_with_wrappers.applymap(lambda wrapper: wrapper[0]) - # Check format: - expect_value_eq(wide_raw_df.columns[0], "Week", - 'Expected raw_colnames[0] to be "{}"; encountered ') - for colname in wide_raw_df.columns: - expect_result_eq(dtype_kind, wide_raw_df[colname].head(), "O", - 'Expected (head of) "%s" column to have dtype kind "{}"; instead had dtype kind & head '%(colname)) - # Pack up df with metadata: - wide_raw = (wide_raw_df, release_date, parse_time, location) - return wide_raw + # Unwrap entries: + wide_raw_df = wide_raw_df_with_wrappers.applymap(lambda wrapper: wrapper[0]) + # Check format: + expect_value_eq(wide_raw_df.columns[0], "Week", + 'Expected raw_colnames[0] to be "{}"; encountered ') + for colname in wide_raw_df.columns: + expect_result_eq(dtype_kind, wide_raw_df[colname].head(), "O", + 'Expected (head of) "%s" column to have dtype kind "{}"; instead had dtype kind & head ' % (colname)) + # Pack up df with metadata: + wide_raw = (wide_raw_df, release_date, parse_time, location) + return wide_raw + def melt_wide_raw_to_long_raw(wide_raw): - (wide_raw_df, release_date, parse_time, location) = wide_raw - long_raw_df = wide_raw_df \ - .melt(id_vars=["Week"], var_name="measurement_type", value_name="value") \ - .rename(index=str, columns={"Week": "week"}) - long_raw = (long_raw_df, release_date, parse_time, location) - return long_raw + (wide_raw_df, release_date, parse_time, location) = wide_raw + long_raw_df = wide_raw_df \ + .melt(id_vars=["Week"], var_name="measurement_type", value_name="value") \ + .rename(index=str, columns={"Week": "week"}) + long_raw = (long_raw_df, release_date, parse_time, location) + return long_raw diff --git a/src/acquisition/paho/paho_db_update.py b/src/acquisition/paho/paho_db_update.py index d07885f79..76a86434a 100644 --- a/src/acquisition/paho/paho_db_update.py +++ b/src/acquisition/paho/paho_db_update.py @@ -50,23 +50,21 @@ 
import csv import datetime import glob -import subprocess import random +import subprocess from io import StringIO -# third party +import delphi.operations.secrets as secrets import mysql.connector import pycountry - -# first party -import delphi.operations.secrets as secrets from delphi.epidata.acquisition.paho.paho_download import get_paho_data -from delphi.utils.epiweek import delta_epiweeks, check_epiweek from delphi.utils.epidate import EpiDate +from delphi.utils.epiweek import check_epiweek, delta_epiweeks + def ensure_tables_exist(): - (u,p) = secrets.db.epi - cnx = mysql.connector.connect(user=u,password=p,database='epidata') + (u, p) = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database='epidata') try: cursor = cnx.cursor() cursor.execute(''' @@ -85,34 +83,38 @@ def ensure_tables_exist(): `num_deaths` INT(11) NOT NULL, UNIQUE KEY (`issue`, `epiweek`, `region`) ); - '''); + ''') cnx.commit() finally: cnx.close() + def safe_float(f): try: return float(f.replace(',','')) except: return 0 + def safe_int(i): try: return int(i.replace(',','')) except: return 0 + def get_rows(cnx, table='paho_dengue'): - # Count and return the number of rows in the `fluview` table. - select = cnx.cursor() - select.execute('SELECT count(1) num FROM %s' % table) - for (num,) in select: - pass - select.close() - return num + # Count and return the number of rows in the `fluview` table. + select = cnx.cursor() + select.execute('SELECT count(1) num FROM %s' % table) + for (num,) in select: + pass + select.close() + return num + def get_paho_row(row): - if row[0] == "\ufeffIncidence Rate (c)" and row != "\ufeffIncidence Rate (c),(SD/D) x100 (e),CFR (f),ID,Country or Subregion,Deaths,EW,Confirmed,Epidemiological Week (a),Pop (no usar),Serotype,Severe Dengue (d),Total of Dengue Cases (b),Year,Population x 1000".split(","): + if row[0] == "\ufeffIncidence Rate (c)" and row != "\ufeffIncidence Rate (c),(SD/D) x100 (e),CFR (f),ID,Country or Subregion,Deaths,EW,Confirmed,Epidemiological Week (a),Pop (no usar),Serotype,Severe Dengue (d),Total of Dengue Cases (b),Year,Population x 1000".split(","): # noqa raise Exception('PAHO header row has changed') if len(row) == 1 or row[0] == "Incidence Rate (c)": # this is a header row @@ -145,6 +147,7 @@ def get_paho_row(row): 'cfr': safe_float(row[2]) } + def update_from_file(issue, date, filename, test_mode=False): # Read PAHO data from CSV and insert into (or update) the database. 
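For context on the two helpers above: `safe_float` and `safe_int` appear to exist because the PAHO CSV reports counts with comma grouping and occasionally blank or malformed fields, so they strip the commas and fall back to 0 instead of raising. A minimal standalone sketch of that intended behavior (written with `except Exception` rather than the bare `except` used in the patch):

def safe_float(f):
    # drop thousands separators ("12,345.6" -> "12345.6"); fall back to 0 on blank/unparsable input
    try:
        return float(f.replace(',', ''))
    except Exception:
        return 0


def safe_int(i):
    # same idea for integer counts ("1,234" -> 1234)
    try:
        return int(i.replace(',', ''))
    except Exception:
        return 0


assert safe_int('1,234') == 1234
assert safe_float('12,345.6') == 12345.6
assert safe_int('') == 0  # blank field becomes 0 rather than raising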
@@ -163,10 +166,10 @@ def update_from_file(issue, date, filename, test_mode=False): # load the data, ignoring empty rows print('loading data from %s as issued on %d' % (filename, issue)) - with open(filename,'r',encoding='utf-8') as f: + with open(filename, 'r', encoding='utf-8') as f: c = f.read() rows = [] - for l in csv.reader(StringIO(c), delimiter=','): + for l in csv.reader(StringIO(c), delimiter=','): # noqa rows.append(get_paho_row(l)) print(' loaded %d rows' % len(rows)) entries = [obj for obj in rows if obj] @@ -190,13 +193,13 @@ def update_from_file(issue, date, filename, test_mode=False): ''' for row in entries: - if row['issue'] > issue: # Issued in a week that hasn't happened yet + if row['issue'] > issue: # Issued in a week that hasn't happened yet continue lag = delta_epiweeks(row['epiweek'], issue) data_args = [row['total_pop'], row['serotype'], row['num_dengue'], row['incidence_rate'], row['num_severe'], row['num_deaths']] - insert_args = [date,issue,row['epiweek'],row['region'],lag] + data_args + insert_args = [date, issue, row['epiweek'], row['region'], lag] + data_args update_args = [date] + data_args insert.execute(sql % tuple(insert_args + update_args)) @@ -208,9 +211,10 @@ def update_from_file(issue, date, filename, test_mode=False): else: cnx.commit() rows2 = get_rows(cnx) - print('rows after: %d (added %d)' % (rows2,rows2-rows1)) + print('rows after: %d (added %d)' % (rows2, rows2-rows1)) cnx.close() + def main(): # args and usage parser = argparse.ArgumentParser() @@ -249,7 +253,7 @@ def main(): flag = flag + 1 tmp_dir = ''.join(random.choice('0123456789abcdefghijklmnopqrstuvwxyz') for i in range(8)) tmp_dir = 'downloads_' + tmp_dir - subprocess.call(["mkdir",tmp_dir]) + subprocess.call(["mkdir", tmp_dir]) # Use temporary directory to avoid data from different time # downloaded to same folder get_paho_data(dir=tmp_dir) @@ -258,23 +262,24 @@ def main(): issueset = set() files = glob.glob('%s/*.csv' % tmp_dir) for filename in files: - with open(filename,'r') as f: + with open(filename, 'r') as f: _ = f.readline() data = f.readline().split(',') issueset.add(data[6]) db_error = False - if len(issueset) >= 53: # Shouldn't be more than 53 + if len(issueset) >= 53: # Shouldn't be more than 53 for filename in files: try: update_from_file(issue, date, filename, test_mode=args.test) - subprocess.call(["rm",filename]) + subprocess.call(["rm", filename]) except: db_error = True - subprocess.call(["rm","-r",tmp_dir]) + subprocess.call(["rm", "-r", tmp_dir]) if not db_error: - break # Exit loop with success + break # Exit loop with success if flag >= max_tries: print('WARNING: Database `paho_dengue` did not update successfully') + if __name__ == '__main__': main() diff --git a/src/acquisition/paho/paho_download.py b/src/acquisition/paho/paho_download.py index 60dd13ae8..1a3631241 100644 --- a/src/acquisition/paho/paho_download.py +++ b/src/acquisition/paho/paho_download.py @@ -4,17 +4,16 @@ import os -# Start up a browser -from selenium.webdriver import Firefox -from selenium.webdriver import FirefoxProfile -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.common.by import By from selenium.common.exceptions import TimeoutException +from selenium.webdriver import Firefox, FirefoxProfile +from selenium.webdriver.common.by import By from selenium.webdriver.firefox.options import Options +from selenium.webdriver.support import expected_conditions as EC +from 
selenium.webdriver.support.ui import WebDriverWait headerheight = 0 + def wait_for(browser, css_selector, delay=10): try: WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.CSS_SELECTOR, css_selector))) @@ -22,35 +21,37 @@ def wait_for(browser, css_selector, delay=10): print('Success Loading %s' % (css_selector)) except TimeoutException: print("Loading %s took too much time!" % (css_selector)) - + + def find_and_click(browser, element): element.location_once_scrolled_into_view browser.switch_to.default_content() - browser.execute_script("window.scrollBy(0,-%d)"%headerheight) + browser.execute_script("window.scrollBy(0,-%d)" % headerheight) browser.switch_to.frame(browser.find_element_by_tag_name("iframe")) browser.switch_to.frame(browser.find_element_by_tag_name("iframe")) element.click() + def get_paho_data(offset=0, dir='downloads'): opts = Options() opts.set_headless() assert opts.headless # Operating in headless mode fp = FirefoxProfile() - fp.set_preference("browser.download.folderList",2) - fp.set_preference("browser.download.manager.showWhenStarting",False) - fp.set_preference("browser.download.dir",os.path.abspath(dir)) - fp.set_preference("browser.helperApps.neverAsk.saveToDisk","text/csv") + fp.set_preference("browser.download.folderList", 2) + fp.set_preference("browser.download.manager.showWhenStarting", False) + fp.set_preference("browser.download.dir", os.path.abspath(dir)) + fp.set_preference("browser.helperApps.neverAsk.saveToDisk", "text/csv") - browser = Firefox(options=opts,firefox_profile=fp) + browser = Firefox(options=opts, firefox_profile=fp) browser.get('http://www.paho.org/data/index.php/en/mnu-topics/indicadores-dengue-en/dengue-nacional-en/252-dengue-pais-ano-en.html?showall=&start=1') tab1 = browser.window_handles[0] browser.execute_script('''window.open("","_blank");''') tab2 = browser.window_handles[1] browser.switch_to.window(tab1) - + curr_offset = offset - + wait_for(browser, "div.rt-top-inner", delay=30) header = browser.find_element_by_css_selector("div.rt-top-inner") global headerheight @@ -59,7 +60,7 @@ def get_paho_data(offset=0, dir='downloads'): # The actual content of the data of this webpage is within 2 iframes, so we need to navigate into them first browser.switch_to.frame(browser.find_element_by_tag_name("iframe")) browser.switch_to.frame(browser.find_element_by_tag_name("iframe")) - + # Locate the button that allows to download the table downloadoption = browser.find_elements_by_css_selector("div.tabToolbarButton.tab-widget.download")[0] find_and_click(browser, downloadoption) @@ -78,10 +79,10 @@ def get_paho_data(offset=0, dir='downloads'): # Extract session ID href = downloadbutton.get_attribute("href") startidx = href.index("sessions/") + len("sessions/") - endidx = href.index("/",startidx) + endidx = href.index("/", startidx) sessionid = href[startidx:endidx] - dataurl = "http://phip.paho.org/vizql/w/Casosdedengue_tben/v/ByLastAvailableEpiWeek/viewData/sessions/%s/views/18076444178507886853_9530488980060483892?maxrows=200&viz=%%7B%%22worksheet%%22:%%22W%%20By%%20Last%%20Available%%20EpiWeek%%22,%%22dashboard%%22:%%22By%%20Last%%20Available%%20Epi%%20Week%%22%%7D"%sessionid + dataurl = "http://phip.paho.org/vizql/w/Casosdedengue_tben/v/ByLastAvailableEpiWeek/viewData/sessions/%s/views/18076444178507886853_9530488980060483892?maxrows=200&viz=%%7B%%22worksheet%%22:%%22W%%20By%%20Last%%20Available%%20EpiWeek%%22,%%22dashboard%%22:%%22By%%20Last%%20Available%%20Epi%%20Week%%22%%7D" % sessionid # noqa wait_for(browser, 
"div[data-tb-test-id='CancelBtn-Button']") @@ -107,18 +108,12 @@ def get_paho_data(offset=0, dir='downloads'): for i in range(offset): gp = browser.find_element_by_css_selector("div.wcGlassPane") - #print gp.is_enabled() - #print gp.is_selected() - #print gp.is_displayed() try: WebDriverWait(browser, 10).until(EC.staleness_of(gp)) print("Loaded next week % d" % (53-offset)) except TimeoutException: print("Loading next week %d took too much time!" % (53-offset)) gp = browser.find_element_by_css_selector("div.wcGlassPane") - #print gp.is_enabled() - #print gp.is_selected() - #print gp.is_displayed() x = browser.find_elements_by_css_selector("div.dijitReset.dijitSliderButtonContainer.dijitSliderButtonContainerH.tableauArrowDec")[0] find_and_click(browser, x) @@ -137,7 +132,7 @@ def get_paho_data(offset=0, dir='downloads'): full_data_tab = browser.find_elements_by_css_selector("li[id='tab-view-full-data']")[0] full_data_tab.click() - wait_for(browser, "a.csvLink") # Sometimes this fails but the button is successfully clicked anyway, not sure why + wait_for(browser, "a.csvLink") # Sometimes this fails but the button is successfully clicked anyway, not sure why # Actually download the data as a .csv (Will be downloaded to Firefox's default download destination) data_links = browser.find_elements_by_css_selector("a.csvLink") data_link = None @@ -155,10 +150,11 @@ def get_paho_data(offset=0, dir='downloads'): find_and_click(browser, x) curr_offset += 1 except Exception as e: - print('Got exception %s\nTrying again from week %d' % (e,53-offset)) + print('Got exception %s\nTrying again from week %d' % (e, 53-offset)) browser.quit() get_paho_data(offset=curr_offset) browser.quit() + if __name__ == '__main__': get_paho_data(dir='downloads/') diff --git a/src/acquisition/quidel/quidel.py b/src/acquisition/quidel/quidel.py index a7c9a2918..29b1c8dda 100644 --- a/src/acquisition/quidel/quidel.py +++ b/src/acquisition/quidel/quidel.py @@ -17,113 +17,122 @@ * original version ''' -# standard library -from collections import defaultdict import email import imaplib import os +import re +from collections import defaultdict from os import listdir from os.path import isfile, join -import re - -# third party -import numpy as np -import pandas as pd -# first party import delphi.operations.secrets as secrets import delphi.utils.epidate as ED +import numpy as np +import pandas as pd from delphi.utils.geo.locations import Locations -def word_map(row,terms): - for (k,v) in terms.items(): - row = row.replace(k,v) + +def word_map(row, terms): + for (k, v) in terms.items(): + row = row.replace(k, v) return row -def date_less_than(d1,d2): - y1,m1,d1 = [int(x) for x in d1.split('-')] - y2,m2,d2 = [int(x) for x in d2.split('-')] - if y1*10000+m1*100+d10: shifted to future def date_to_epiweek(date, shift=0): - y,m,d = [int(x) for x in date.split('-')] + y, m, d = [int(x) for x in date.split('-')] - epidate = ED.EpiDate(y,m,d) + epidate = ED.EpiDate(y, m, d) epidate = epidate.add_days(shift) ew = epidate.get_ew() return ew + # convert measurment to time series format # startweek and endweek are inclusive -def measurement_to_ts(m,index,startweek=None,endweek=None): +def measurement_to_ts(m, index, startweek=None, endweek=None): if startweek is None: startweek = 0 if endweek is None: endweek = 999999 res = {} - for r,rdict in m.items(): - res[r]={} - for t,vals in rdict.items(): - if index>=len(vals): + for r, rdict in m.items(): + res[r] = {} + for t, vals in rdict.items(): + if index >= len(vals): raise Exception("Index is 
invalid") - if t>=startweek and t<=endweek: + if t >= startweek and t <= endweek: res[r][t] = vals[index] return res + class QuidelData: def __init__(self, raw_path, load_email=True): self.data_path = raw_path - self.excel_uptodate_path = join(raw_path,'excel/uptodate') - self.excel_history_path = join(raw_path,'excel/history') - self.csv_path = join(raw_path,'csv') + self.excel_uptodate_path = join(raw_path, 'excel/uptodate') + self.excel_history_path = join(raw_path, 'excel/history') + self.csv_path = join(raw_path, 'csv') self.xlsx_uptodate_list = [ - f[:-5] for f in listdir(self.excel_uptodate_path) if isfile(join(self.excel_uptodate_path, f)) and f[-5:]=='.xlsx' + f[:-5] for f in listdir(self.excel_uptodate_path) if isfile(join(self.excel_uptodate_path, f)) and f[-5:] == '.xlsx' ] self.xlsx_history_list = [ - f[:-5] for f in listdir(self.excel_history_path) if isfile(join(self.excel_history_path, f)) and f[-5:]=='.xlsx' + f[:-5] for f in listdir(self.excel_history_path) if isfile(join(self.excel_history_path, f)) and f[-5:] == '.xlsx' ] - self.csv_list = [f[:-4] for f in listdir(self.csv_path) if isfile(join(self.csv_path, f)) and f[-4:]=='.csv'] - self.map_terms = { - ' FL 34637"':'FL', - } + self.csv_list = [f[:-4] for f in listdir(self.csv_path) if isfile(join(self.csv_path, f)) and f[-4:] == '.csv'] + self.map_terms = {' FL 34637"': 'FL'} # hardcoded parameters self.date_dim = 1 self.state_dim = 4 self.fields = [ - 'sofia_ser','date','fac_id','city','state','zip','age', - 'fluA','fluB','fluAll','county','fac_type' + 'sofia_ser', + 'date', + 'fac_id', + 'city', + 'state', + 'zip', + 'age', + 'fluA', + 'fluB', + 'fluAll', + 'county', + 'fac_type' ] - self.fields_to_keep = ['fac_id','fluA','fluB','fluAll'] + self.fields_to_keep = ['fac_id', 'fluA', 'fluB', 'fluAll'] self.dims_to_keep = [self.fields.index(x) for x in self.fields_to_keep] if load_email: self.retrieve_excels() self.prepare_csv() def retrieve_excels(self): - detach_dir = self.excel_uptodate_path # directory where to save attachments (default: current) + detach_dir = self.excel_uptodate_path # directory where to save attachments (default: current) # connecting to the gmail imap server m = imaplib.IMAP4_SSL("imap.gmail.com") - m.login(secrets.quidel.email_addr,secrets.quidel.email_pwd) - m.select("INBOX") # here you a can choose a mail box like INBOX instead + m.login(secrets.quidel.email_addr, secrets.quidel.email_pwd) + m.select("INBOX") # here you a can choose a mail box like INBOX instead # use m.list() to get all the mailboxes - _, items = m.search(None, "ALL") # you could filter using the IMAP rules here (check http://www.example-code.com/csharp/imap-search-critera.asp) - items = items[0].split() # getting the mails id + _, items = m.search(None, "ALL") # you could filter using the IMAP rules here (check http://www.example-code.com/csharp/imap-search-critera.asp) + items = items[0].split() # getting the mails id # The emailids are ordered from past to now for emailid in items: - _, data = m.fetch(emailid, "(RFC822)") # fetching the mail, "`(RFC822)`" means "get the whole stuff", but you can ask for headers only, etc - email_body = data[0][1].decode('utf-8') # getting the mail content - mail = email.message_from_string(email_body) # parsing the mail content to get a mail object + _, data = m.fetch(emailid, "(RFC822)") # fetching the mail, "`(RFC822)`" means "get the whole stuff", but you can ask for headers only, etc + email_body = data[0][1].decode('utf-8') # getting the mail content + mail = 
email.message_from_string(email_body) # parsing the mail content to get a mail object - #Check if any attachments at all + # Check if any attachments at all if mail.get_content_maintype() != 'multipart': continue @@ -145,38 +154,38 @@ def retrieve_excels(self): self.xlsx_uptodate_list.append(filename[:-5]) att_path = os.path.join(detach_dir, filename) - #Check if its already there - if not os.path.isfile(att_path) : + # Check if its already there + if not os.path.isfile(att_path): # finally write the stuff fp = open(att_path, 'wb') fp.write(part.get_payload(decode=True)) fp.close() def prepare_csv(self): - need_update=False + need_update = False for f in self.xlsx_uptodate_list: if f in self.csv_list: continue else: - need_update=True + need_update = True date_regex = '\d{2}-\d{2}-\d{4}' - date_items = re.findall(date_regex,f) + date_items = re.findall(date_regex, f) if date_items: - end_date = '-'.join(date_items[-1].split('-')[x] for x in [2,0,1]) + end_date = '-'.join(date_items[-1].split('-')[x] for x in [2, 0, 1]) else: print("End date not found in file name:"+f) end_date = None df_dict = pd.read_excel(join(self.excel_uptodate_path, f+'.xlsx'), sheet_name=None) - for (_,df) in df_dict.items(): + for (_, df) in df_dict.items(): df = df.dropna(axis=0, how='all') df['TestDate'] = df['TestDate'].apply(lambda x: x.strftime('%Y-%m-%d')) - df_filtered = df[df['TestDate']!=''] + df_filtered = df[df['TestDate'] != ''] if end_date is not None: - df_filtered = df_filtered[df.apply(lambda x: date_less_than(end_date,x['TestDate'])!=1, axis=1)] + df_filtered = df_filtered[df.apply(lambda x: date_less_than(end_date, x['TestDate']) != 1, axis=1)] df_filtered.to_csv(join(self.csv_path, f+'.csv'), index=False, encoding='utf-8') - self.csv_list = [f[:-4] for f in listdir(self.csv_path) if isfile(join(self.csv_path, f)) and f[-4:]=='.csv'] + self.csv_list = [f[:-4] for f in listdir(self.csv_path) if isfile(join(self.csv_path, f)) and f[-4:] == '.csv'] self.need_update = need_update def load_csv(self, dims=None): @@ -186,11 +195,11 @@ def load_csv(self, dims=None): for f in self.csv_list: if f in self.xlsx_history_list: continue - rf = open(join(self.csv_path,f+'.csv')) + rf = open(join(self.csv_path, f+'.csv')) lines = rf.readlines() - for l in lines[1:]: - l = word_map(l,self.map_terms) + for l in lines[1:]: # noqa + l = word_map(l, self.map_terms) # noqa row = l.strip().split(',') date = row[self.date_dim] state = row[self.state_dim] @@ -202,7 +211,7 @@ def load_csv(self, dims=None): # hardcoded aggregation function # output: [#unique_device,fluA,fluB,fluAll,total] - def prepare_measurements(self,data_dict,use_hhs=True,start_weekday=6): + def prepare_measurements(self, data_dict, use_hhs=True, start_weekday=6): buffer_dict = {} if use_hhs: region_list = Locations.hhs_list @@ -210,34 +219,33 @@ def prepare_measurements(self,data_dict,use_hhs=True,start_weekday=6): region_list = Locations.atom_list def get_hhs_region(atom): - for region in Locations.hhs_list: - if atom.lower() in Locations.hhs_map[region]: - return region - if atom.lower() == 'ny': - return 'hhs2' - return atom + for region in Locations.hhs_list: + if atom.lower() in Locations.hhs_map[region]: + return region + if atom.lower() == 'ny': + return 'hhs2' + return atom day_shift = 6 - start_weekday - time_map = lambda x:date_to_epiweek(x,shift=day_shift) - region_map = lambda x:get_hhs_region(x) \ - if use_hhs and x not in Locations.hhs_list else x # a bit hacky + time_map = lambda x:date_to_epiweek(x, shift=day_shift) # noqa + region_map = 
lambda x:get_hhs_region(x) if use_hhs and x not in Locations.hhs_list else x # a bit hacky # noqa end_date = sorted(data_dict.keys())[-1] # count the latest week in only if Thurs data is included - end_epiweek = date_to_epiweek(end_date,shift=-4) + end_epiweek = date_to_epiweek(end_date, shift=-4) # first pass: prepare device_id set device_dict = {} - for (date,daily_dict) in data_dict.items(): + for (date, daily_dict) in data_dict.items(): if not date: continue ew = time_map(date) - if ew == -1 or ew>end_epiweek: + if ew == -1 or ew > end_epiweek: continue if ew not in device_dict: - device_dict[ew]={} + device_dict[ew] = {} for r in region_list: device_dict[ew][r] = set() - for (state,rec_list) in daily_dict.items(): + for (state, rec_list) in daily_dict.items(): region = region_map(state) # get rid of non-US regions if region not in region_list: @@ -247,38 +255,40 @@ def get_hhs_region(atom): device_dict[ew][region].add(fac) # second pass: prepare all measurements - for (date,daily_dict) in data_dict.items(): + for (date, daily_dict) in data_dict.items(): ew = time_map(date) - if ew == -1 or ew>end_epiweek: + if ew == -1 or ew > end_epiweek: continue if ew not in buffer_dict: - buffer_dict[ew]={} + buffer_dict[ew] = {} for r in region_list: buffer_dict[ew][r] = [0.0]*8 - for (state,rec_list) in daily_dict.items(): + for (state, rec_list) in daily_dict.items(): region = region_map(state) # get rid of non-US regions if region not in region_list: continue for rec in rec_list: fac_num = float(len(device_dict[ew][region])) - buffer_dict[ew][region]= np.add( - buffer_dict[ew][region],[ - rec[1]=='positive', - rec[2]=='positive', - rec[3]=='positive', + buffer_dict[ew][region] = np.add( + buffer_dict[ew][region], + [ + rec[1] == 'positive', + rec[2] == 'positive', + rec[3] == 'positive', 1.0, - float(rec[1]=='positive')/fac_num, - float(rec[2]=='positive')/fac_num, - float(rec[3]=='positive')/fac_num, + float(rec[1] == 'positive')/fac_num, + float(rec[2] == 'positive')/fac_num, + float(rec[3] == 'positive')/fac_num, 1.0/fac_num, - ]).tolist() + ] + ).tolist() # switch two dims of dict result_dict = {} for r in region_list: - result_dict[r]={} - for (k,v) in buffer_dict.items(): - result_dict[r][k]=v[r] + result_dict[r] = {} + for (k, v) in buffer_dict.items(): + result_dict[r][k] = v[r] return result_dict diff --git a/src/acquisition/quidel/quidel_update.py b/src/acquisition/quidel/quidel_update.py index b6303533c..3ffc4a62e 100644 --- a/src/acquisition/quidel/quidel_update.py +++ b/src/acquisition/quidel/quidel_update.py @@ -38,117 +38,116 @@ # standard library import argparse +import delphi.operations.secrets as secrets +import delphi.utils.epiweek as flu # third party import mysql.connector - # first party from delphi.epidata.acquisition.quidel import quidel -import delphi.operations.secrets as secrets -from delphi.utils.epidate import EpiDate -import delphi.utils.epiweek as flu from delphi.utils.geo.locations import Locations LOCATIONS = Locations.hhs_list DATAPATH = '/home/automation/quidel_data' + def update(locations, first=None, last=None, force_update=False, load_email=True): - # download and prepare data first - qd = quidel.QuidelData(DATAPATH,load_email) - if not qd.need_update and not force_update: - print('Data not updated, nothing needs change.') - return - - qd_data = qd.load_csv() - qd_measurements = qd.prepare_measurements(qd_data,start_weekday=4) - qd_ts = quidel.measurement_to_ts(qd_measurements,7,startweek=first,endweek=last) - # connect to the database - u, p = 
secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() - - def get_num_rows(): - cur.execute('SELECT count(1) `num` FROM `quidel`') - for (num,) in cur: - pass - return num - - # check from 4 weeks preceeding the last week with data through this week - cur.execute('SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `quidel`') - for (ew0, ew1) in cur: - ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4) - ew0 = ew0 if first is None else first - ew1 = ew1 if last is None else last - print('Checking epiweeks between %d and %d...' % (ew0, ew1)) - - # keep track of how many rows were added - rows_before = get_num_rows() - - # check Quidel for new and/or revised data - sql = ''' - INSERT INTO - `quidel` (`location`, `epiweek`, `value`) - VALUES - (%s, %s, %s) - ON DUPLICATE KEY UPDATE - `value` = %s - ''' - - total_rows = 0 - - for location in locations: - if location not in qd_ts: - continue - ews = sorted(qd_ts[location].keys()) - num_missing = 0 - for ew in ews: - v = qd_ts[location][ew] - sql_data = (location, ew, v, v) - cur.execute(sql, sql_data) - total_rows += 1 - if v == 0: - num_missing += 1 - if num_missing > 0: - print(' [%s] missing %d/%d value(s)' % (location, num_missing, len(ews))) - - # keep track of how many rows were added - rows_after = get_num_rows() - print('Inserted %d/%d row(s)'%(rows_after - rows_before, total_rows)) - - # cleanup - cur.close() - cnx.commit() - cnx.close() + # download and prepare data first + qd = quidel.QuidelData(DATAPATH, load_email) + if not qd.need_update and not force_update: + print('Data not updated, nothing needs change.') + return + + qd_data = qd.load_csv() + qd_measurements = qd.prepare_measurements(qd_data, start_weekday=4) + qd_ts = quidel.measurement_to_ts(qd_measurements, 7, startweek=first, endweek=last) + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database='epidata') + cur = cnx.cursor() + + def get_num_rows(): + cur.execute('SELECT count(1) `num` FROM `quidel`') + for (num,) in cur: + pass + return num + + # check from 4 weeks preceeding the last week with data through this week + cur.execute('SELECT max(`epiweek`) `ew0`, yearweek(now(), 6) `ew1` FROM `quidel`') + for (ew0, ew1) in cur: + ew0 = 200401 if ew0 is None else flu.add_epiweeks(ew0, -4) + ew0 = ew0 if first is None else first + ew1 = ew1 if last is None else last + print('Checking epiweeks between %d and %d...' 
% (ew0, ew1)) + + # keep track of how many rows were added + rows_before = get_num_rows() + + # check Quidel for new and/or revised data + sql = ''' + INSERT INTO + `quidel` (`location`, `epiweek`, `value`) + VALUES + (%s, %s, %s) + ON DUPLICATE KEY UPDATE + `value` = %s + ''' + + total_rows = 0 + + for location in locations: + if location not in qd_ts: + continue + ews = sorted(qd_ts[location].keys()) + num_missing = 0 + for ew in ews: + v = qd_ts[location][ew] + sql_data = (location, ew, v, v) + cur.execute(sql, sql_data) + total_rows += 1 + if v == 0: + num_missing += 1 + if num_missing > 0: + print(' [%s] missing %d/%d value(s)' % (location, num_missing, len(ews))) + + # keep track of how many rows were added + rows_after = get_num_rows() + print('Inserted %d/%d row(s)' % (rows_after - rows_before, total_rows)) + + # cleanup + cur.close() + cnx.commit() + cnx.close() def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('--location', action='store', type=str, default=None, help='location(s) (ex: all; any of hhs1-10)') - parser.add_argument('--first', '-f', default=None, type=int, help='first epiweek override') - parser.add_argument('--last', '-l', default=None, type=int, help='last epiweek override') - parser.add_argument('--force_update', '-u', action='store_true', help='force update db values') - parser.add_argument('--skip_email', '-s', action='store_true', help='skip email downloading step') - args = parser.parse_args() - - # sanity check - first, last, force_update, skip_email = args.first, args.last, args.force_update, args.skip_email - load_email = not skip_email - if first is not None: - flu.check_epiweek(first) - if last is not None: - flu.check_epiweek(last) - if first is not None and last is not None and first > last: - raise Exception('epiweeks in the wrong order') - - # decide what to update - if args.location.lower() == 'all': - locations = LOCATIONS - else: - locations = args.location.lower().split(',') - - # run the update - update(locations, first, last, force_update, load_email) + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument('--location', action='store', type=str, default=None, help='location(s) (ex: all; any of hhs1-10)') + parser.add_argument('--first', '-f', default=None, type=int, help='first epiweek override') + parser.add_argument('--last', '-l', default=None, type=int, help='last epiweek override') + parser.add_argument('--force_update', '-u', action='store_true', help='force update db values') + parser.add_argument('--skip_email', '-s', action='store_true', help='skip email downloading step') + args = parser.parse_args() + + # sanity check + first, last, force_update, skip_email = args.first, args.last, args.force_update, args.skip_email + load_email = not skip_email + if first is not None: + flu.check_epiweek(first) + if last is not None: + flu.check_epiweek(last) + if first is not None and last is not None and first > last: + raise Exception('epiweeks in the wrong order') + + # decide what to update + if args.location.lower() == 'all': + locations = LOCATIONS + else: + locations = args.location.lower().split(',') + + # run the update + update(locations, first, last, force_update, load_email) if __name__ == '__main__': - main() + main() diff --git a/src/acquisition/twtr/healthtweets.py b/src/acquisition/twtr/healthtweets.py index 78eb2b3ec..a5cdd754a 100644 --- a/src/acquisition/twtr/healthtweets.py +++ b/src/acquisition/twtr/healthtweets.py @@ -22,146 +22,194 @@ * Original version ''' -# 
standard library + import argparse -from datetime import datetime, timedelta import json +from datetime import datetime -# third party` import requests -# first party from .pageparser import PageParser class HealthTweets: - # mapping from state abbreviations to location codes used by healthtweets.org - STATE_CODES = {'AL': 3024, 'AK': 3025, 'AZ': 3026, 'AR': 3027, 'CA': 440, 'CO': 3029, 'CT': 3030, 'DE': 3031, 'DC': 3032, 'FL': 3033, 'GA': 3034, 'HI': 3035, 'ID': 3036, 'IL': 3037, 'IN': 3038, 'IA': 3039, 'KS': 3040, 'KY': 3041, 'LA': 2183, 'ME': 3043, 'MD': 3044, 'MA': 450, 'MI': 3046, 'MN': 3047, 'MS': 3048, 'MO': 3049, 'MT': 3050, 'NE': 3051, 'NV': 3052, 'NH': 3053, 'NJ': 478, 'NM': 2225, 'NY': 631, 'NC': 3057, 'ND': 3058, 'OH': 3059, 'OK': 3060, 'OR': 281, 'PA': 3062, 'RI': 3063, 'SC': 3064, 'SD': 3065, 'TN': 3066, 'TX': 3067, 'UT': 2272, 'VT': 3069, 'VA': 3070, 'WA': 3071, 'WV': 3072, 'WI': 3073, 'WY': 3074} - - def __init__(self, username, password, debug=False): - self.debug = debug - self.session = requests.Session() - # spoof a web browser - self.session.headers.update({ - 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36', - }) - # get the login token - response = self._go('http://www.healthtweets.org/accounts/login') - token = self._get_token(response.text) - if self.debug: - print('token=%s'%(token)) - data = { - 'csrfmiddlewaretoken': token, - 'username': username, - 'password': password, - 'next': '/', + # mapping from state abbreviations to location codes used by healthtweets.org + STATE_CODES = { + 'AL': 3024, + 'AK': 3025, + 'AZ': 3026, + 'AR': 3027, + 'CA': 440, + 'CO': 3029, + 'CT': 3030, + 'DE': 3031, + 'DC': 3032, + 'FL': 3033, + 'GA': 3034, + 'HI': 3035, + 'ID': 3036, + 'IL': 3037, + 'IN': 3038, + 'IA': 3039, + 'KS': 3040, + 'KY': 3041, + 'LA': 2183, + 'ME': 3043, + 'MD': 3044, + 'MA': 450, + 'MI': 3046, + 'MN': 3047, + 'MS': 3048, + 'MO': 3049, + 'MT': 3050, + 'NE': 3051, + 'NV': 3052, + 'NH': 3053, + 'NJ': 478, + 'NM': 2225, + 'NY': 631, + 'NC': 3057, + 'ND': 3058, + 'OH': 3059, + 'OK': 3060, + 'OR': 281, + 'PA': 3062, + 'RI': 3063, + 'SC': 3064, + 'SD': 3065, + 'TN': 3066, + 'TX': 3067, + 'UT': 2272, + 'VT': 3069, + 'VA': 3070, + 'WA': 3071, + 'WV': 3072, + 'WI': 3073, + 'WY': 3074 } - # login to the site - response = self._go('http://www.healthtweets.org/accounts/login', data=data) - if response.status_code != 200 or 'Your username and password' in response.text: - raise Exception('login failed') - - def get_values(self, state, date1, date2): - ''' - state: two-letter state abbreviation (see STATE_CODES) - date1: the first date in the range, inclusive (format: YYYY-MM-DD) - date2: the last date in the range, inclusive (format: YYYY-MM-DD) - returns a dictionary (by date) of number of flu tweets (num) and total tweets (total) - ''' - # get raw values (number of flu tweets) and normalized values (flu tweets as a percent of total tweets) - raw_values = self._get_values(state, date1, date2, False) - normalized_values = self._get_values(state, date1, date2, True) - values = {} - # save the raw number and calculate the total - for date in raw_values.keys(): - if normalized_values[date] == 0: - continue - values[date] = { - 'num': round(raw_values[date]), - 'total': round(100 * raw_values[date] / normalized_values[date]), - } - print(date, raw_values[date], normalized_values[date]) - return values - - def _get_values(self, state, date1, date2, normalized): - if state not in HealthTweets.STATE_CODES: - raise 
Exception('invalid state') - state_code = HealthTweets.STATE_CODES[state] - d1, d2 = datetime.strptime(date1, '%Y-%m-%d'), datetime.strptime(date2, '%Y-%m-%d') - s1, s2 = d1.strftime('%m%%2F%d%%2F%Y'), d2.strftime('%m%%2F%d%%2F%Y') - count_type = 'normalized' if normalized else 'raw' - url = 'http://www.healthtweets.org/trends/plot?resolution=Day&count_type=%s&dayNum=%d&from=%s&to=%s&plot1_disease=65&location_plot1=%d'%(count_type, (d2 - d1).days, s1, s2, state_code) - response = self._go('http://www.healthtweets.org/trends/plot?resolution=Day&count_type=%s&dayNum=%d&from=%s&to=%s&plot1_disease=65&location_plot1=%d'%(count_type, (d2 - d1).days, s1, s2, state_code)) - #print(state, date1, date2, normalized) - #print(url) - #print(response.status_code) - if response.status_code != 200: - raise Exception('plot status is ' + str(response.status_code) + ' (when was data last updated?)') - lines = [line.strip() for line in response.text.split('\n')] - data_line = [line for line in lines if line[:16] == 'var chartData = '] - if len(data_line) != 1: - raise Exception('lookup failed') - values = json.loads(data_line[0][16:-1]) - return dict([(datetime.strptime(v[0], '%m/%d/%Y').strftime('%Y-%m-%d'), float(v[1])) for v in values]) - - def check_state(self, state): - ''' - Sanity checks state code mapping. - state: two-letter state abbreviation (see STATE_CODES) - returns the full state name associated with the state abbreviation - ''' - if state not in HealthTweets.STATE_CODES: - raise Exception('invalid state') - state_code = HealthTweets.STATE_CODES[state] - response = self._go('http://www.healthtweets.org/trends/plot?resolution=Day&count_type=normalized&dayNum=7&from=01%%2F01%%2F2015&to=01%%2F07%%2F2015&plot1_disease=65&location_plot1=%d'%(state_code)) - lines = [line.strip() for line in response.text.split('\n')] - data_line = [line for line in lines if line[:29] == 'var plotNames = ["Influenza ('] - if len(data_line) == 0: - raise Exception('check failed') - name = data_line[0][29:] - name = name.split('(')[0] - return name.strip() - - def _get_token(self, html): - page = PageParser.parse(html) - hidden = PageParser.filter_all(page, [('html',), ('body',), ('div',), ('div',), ('div',), ('form',), ('input',)]) - return hidden['attrs']['value'] - - def _go(self, url, method=None, referer=None, data=None): - if self.debug: - print('%s'%(url)) - if method is None: - if data is None: - method = self.session.get - else: - method = self.session.post - response = method(url, headers={'referer': referer}, data=data) - html = response.text - if self.debug: - for item in response.history: - print(' [%d to %s]'%(item.status_code, item.headers['Location'])) - print(' %d (%d bytes)'%(response.status_code, len(html))) - return response + + def __init__(self, username, password, debug=False): + self.debug = debug + self.session = requests.Session() + # spoof a web browser + self.session.headers.update({ + 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36', + }) + # get the login token + response = self._go('http://www.healthtweets.org/accounts/login') + token = self._get_token(response.text) + if self.debug: + print('token=%s' % (token)) + data = { + 'csrfmiddlewaretoken': token, + 'username': username, + 'password': password, + 'next': '/', + } + # login to the site + response = self._go('http://www.healthtweets.org/accounts/login', data=data) + if response.status_code != 200 or 'Your username and password' in response.text: + raise 
Exception('login failed') + + def get_values(self, state, date1, date2): + ''' + state: two-letter state abbreviation (see STATE_CODES) + date1: the first date in the range, inclusive (format: YYYY-MM-DD) + date2: the last date in the range, inclusive (format: YYYY-MM-DD) + returns a dictionary (by date) of number of flu tweets (num) and total tweets (total) + ''' + # get raw values (number of flu tweets) and normalized values (flu tweets as a percent of total tweets) + raw_values = self._get_values(state, date1, date2, False) + normalized_values = self._get_values(state, date1, date2, True) + values = {} + # save the raw number and calculate the total + for date in raw_values.keys(): + if normalized_values[date] == 0: + continue + values[date] = { + 'num': round(raw_values[date]), + 'total': round(100 * raw_values[date] / normalized_values[date]), + } + print(date, raw_values[date], normalized_values[date]) + return values + + def _get_values(self, state, date1, date2, normalized): + if state not in HealthTweets.STATE_CODES: + raise Exception('invalid state') + state_code = HealthTweets.STATE_CODES[state] + d1, d2 = datetime.strptime(date1, '%Y-%m-%d'), datetime.strptime(date2, '%Y-%m-%d') + s1, s2 = d1.strftime('%m%%2F%d%%2F%Y'), d2.strftime('%m%%2F%d%%2F%Y') + count_type = 'normalized' if normalized else 'raw' + # TODO: I cant see difference between `url` and url string used in `response` posibly just copy-paste + url = 'http://www.healthtweets.org/trends/plot?resolution=Day&count_type=%s&dayNum=%d&from=%s&to=%s&plot1_disease=65&location_plot1=%d' % (count_type, (d2 - d1).days, s1, s2, state_code) # noqa + response = self._go('http://www.healthtweets.org/trends/plot?resolution=Day&count_type=%s&dayNum=%d&from=%s&to=%s&plot1_disease=65&location_plot1=%d' % (count_type, (d2 - d1).days, s1, s2, state_code)) # noqa + if response.status_code != 200: + raise Exception('plot status is ' + str(response.status_code) + ' (when was data last updated?)') + lines = [line.strip() for line in response.text.split('\n')] + data_line = [line for line in lines if line[:16] == 'var chartData = '] + if len(data_line) != 1: + raise Exception('lookup failed') + values = json.loads(data_line[0][16:-1]) + return dict([(datetime.strptime(v[0], '%m/%d/%Y').strftime('%Y-%m-%d'), float(v[1])) for v in values]) + + def check_state(self, state): + ''' + Sanity checks state code mapping. 
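get_values() above recovers total tweet volume from the two series healthtweets.org exposes: the raw number of flu tweets and the same counts normalized as a percentage of all tweets. A minimal standalone sketch of that arithmetic, using hypothetical inputs rather than the class's scraped values:

    # Recover (num, total) from a raw flu-tweet count and its normalized percentage,
    # where normalized_pct == 100 * num / total. Inputs here are hypothetical.
    def combine_counts(raw_count, normalized_pct):
        if normalized_pct == 0:
            return None  # get_values() skips dates with a zero normalized value
        num = round(raw_count)
        total = round(100 * raw_count / normalized_pct)
        return num, total

    # Example: 50 flu tweets at 0.25% of all tweets implies about 20000 total tweets.
    print(combine_counts(50, 0.25))  # (50, 20000)
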
+ state: two-letter state abbreviation (see STATE_CODES) + returns the full state name associated with the state abbreviation + ''' + if state not in HealthTweets.STATE_CODES: + raise Exception('invalid state') + state_code = HealthTweets.STATE_CODES[state] + response = self._go('http://www.healthtweets.org/trends/plot?resolution=Day&count_type=normalized&dayNum=7&from=01%%2F01%%2F2015&to=01%%2F07%%2F2015&plot1_disease=65&location_plot1=%d' % (state_code)) # noqa + lines = [line.strip() for line in response.text.split('\n')] + data_line = [line for line in lines if line[:29] == 'var plotNames = ["Influenza ('] + if len(data_line) == 0: + raise Exception('check failed') + name = data_line[0][29:] + name = name.split('(')[0] + return name.strip() + + def _get_token(self, html): + page = PageParser.parse(html) + hidden = PageParser.filter_all(page, [('html',), ('body',), ('div',), ('div',), ('div',), ('form',), ('input',)]) + return hidden['attrs']['value'] + + def _go(self, url, method=None, referer=None, data=None): + if self.debug: + print('%s' % (url)) + if method is None: + if data is None: + method = self.session.get + else: + method = self.session.post + response = method(url, headers={'referer': referer}, data=data) + html = response.text + if self.debug: + for item in response.history: + print(' [%d to %s]' % (item.status_code, item.headers['Location'])) + print(' %d (%d bytes)' % (response.status_code, len(html))) + return response def main(): - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('username', action='store', type=str, help='healthtweets.org username') - parser.add_argument('password', action='store', type=str, help='healthtweets.org password') - parser.add_argument('state', action='store', type=str, choices=list(HealthTweets.STATE_CODES.keys()), help='U.S. state (ex: TX)') - parser.add_argument('date1', action='store', type=str, help='first date, inclusive (ex: 2015-01-01)') - parser.add_argument('date2', action='store', type=str, help='last date, inclusive (ex: 2015-01-01)') - parser.add_argument('-d', '--debug', action='store_const', const=True, default=False, help='enable debug mode') - args = parser.parse_args() - - ht = HealthTweets(args.username, args.password, debug=args.debug) - values = ht.get_values(args.state, args.date1, args.date2) - print('Daily counts in %s from %s to %s:'%(ht.check_state(args.state), args.date1, args.date2)) - for date in sorted(list(values.keys())): - print('%s: num=%-4d total=%-5d (%.3f%%)'%(date, values[date]['num'], values[date]['total'], 100 * values[date]['num'] / values[date]['total'])) + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument('username', action='store', type=str, help='healthtweets.org username') + parser.add_argument('password', action='store', type=str, help='healthtweets.org password') + parser.add_argument('state', action='store', type=str, choices=list(HealthTweets.STATE_CODES.keys()), help='U.S. 
state (ex: TX)') + parser.add_argument('date1', action='store', type=str, help='first date, inclusive (ex: 2015-01-01)') + parser.add_argument('date2', action='store', type=str, help='last date, inclusive (ex: 2015-01-01)') + parser.add_argument('-d', '--debug', action='store_const', const=True, default=False, help='enable debug mode') + args = parser.parse_args() + + ht = HealthTweets(args.username, args.password, debug=args.debug) + values = ht.get_values(args.state, args.date1, args.date2) + print('Daily counts in %s from %s to %s:' % (ht.check_state(args.state), args.date1, args.date2)) + for date in sorted(list(values.keys())): + print('%s: num=%-4d total=%-5d (%.3f%%)' % (date, values[date]['num'], values[date]['total'], 100 * values[date]['num'] / values[date]['total'])) if __name__ == '__main__': - main() + main() diff --git a/src/acquisition/twtr/pageparser.py b/src/acquisition/twtr/pageparser.py index 5e9aaaea1..bd552bf02 100644 --- a/src/acquisition/twtr/pageparser.py +++ b/src/acquisition/twtr/pageparser.py @@ -1,78 +1,76 @@ """A small library for parsing HTML.""" -# standard library from html.parser import HTMLParser class PageParser(HTMLParser): - ''' - This is an HTML parser! All of the hard work is done by the superclass - (which is a Python built-in). This class puts the HTML into a hierarchy - that's (hopefully) easier to work with than raw string parsing. - ''' + ''' + This is an HTML parser! All of the hard work is done by the superclass + (which is a Python built-in). This class puts the HTML into a hierarchy + that's (hopefully) easier to work with than raw string parsing. + ''' - @staticmethod - def parse(html): - parser = PageParser() - parser.feed(html) - return parser.get_root_node() + @staticmethod + def parse(html): + parser = PageParser() + parser.feed(html) + return parser.get_root_node() - @staticmethod - def banlist(): - '''Commonly unclosed tags''' - return ('br', 'img', 'meta') + @staticmethod + def banlist(): + '''Commonly unclosed tags''' + return ('br', 'img', 'meta') - @staticmethod - def new_node(type): - '''An empty node of the HTML tree''' - return {'type': type, 'attrs': {}, 'nodes': [], 'data': ''} + @staticmethod + def new_node(type): + '''An empty node of the HTML tree''' + return {'type': type, 'attrs': {}, 'nodes': [], 'data': ''} - @staticmethod - def filter_all(node, filters): - '''Applies all filters''' - for f in filters: - node = PageParser.filter(node, *f) - return node + @staticmethod + def filter_all(node, filters): + '''Applies all filters''' + for f in filters: + node = PageParser.filter(node, *f) + return node - @staticmethod - def filter(node, type, index=0): - '''Finds a sub-node of the given type, specified by index''' - i = 0 - for node in node['nodes']: - if node['type'] == type: - if i == index: - return node - i += 1 - return None + @staticmethod + def filter(node, type, index=0): + '''Finds a sub-node of the given type, specified by index''' + i = 0 + for node in node['nodes']: + if node['type'] == type: + if i == index: + return node + i += 1 + return None - def __init__(self): - HTMLParser.__init__(self) - self.root = PageParser.new_node(None) - self.stack = [self.root] - self.indent = 0 + def __init__(self): + HTMLParser.__init__(self) + self.root = PageParser.new_node(None) + self.stack = [self.root] + self.indent = 0 - def get_root_node(self): - '''After parsing, returns the abstract root node (which contains the html node)''' - return self.root + def get_root_node(self): + '''After parsing, returns the abstract root 
node (which contains the html node)''' + return self.root - def handle_starttag(self, tag, attrs): - '''Inherited - called when a start tag is found''' - if tag in PageParser.banlist(): - return - element = PageParser.new_node(tag) - for (k, v) in attrs: - element['attrs'][k] = v - self.stack[-1]['nodes'].append(element) - self.stack.append(element) + def handle_starttag(self, tag, attrs): + '''Inherited - called when a start tag is found''' + if tag in PageParser.banlist(): + return + element = PageParser.new_node(tag) + for (k, v) in attrs: + element['attrs'][k] = v + self.stack[-1]['nodes'].append(element) + self.stack.append(element) - def handle_endtag(self, tag): - '''Inherited - called when an end tag is found''' - if tag in PageParser.banlist(): - return - self.stack.pop() + def handle_endtag(self, tag): + '''Inherited - called when an end tag is found''' + if tag in PageParser.banlist(): + return + self.stack.pop() - - def handle_data(self, data): - '''Inherited - called when a data string is found''' - element = self.stack[-1] - element['data'] += data + def handle_data(self, data): + '''Inherited - called when a data string is found''' + element = self.stack[-1] + element['data'] += data diff --git a/src/acquisition/twtr/twitter_update.py b/src/acquisition/twtr/twitter_update.py index 5c1f3f45b..93460627e 100644 --- a/src/acquisition/twtr/twitter_update.py +++ b/src/acquisition/twtr/twitter_update.py @@ -51,55 +51,53 @@ * Original version ''' -# third party +import delphi.operations.secrets as secrets import mysql.connector -# first party from .healthtweets import HealthTweets -import delphi.operations.secrets as secrets def run(): - # connect to the database - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() - - def get_num_rows(): - cur.execute('SELECT count(1) `num` FROM `twitter`') - for (num,) in cur: - pass - return num - - # check from 7 days preceeding the last date with data through yesterday (healthtweets.org 404's if today's date is part of the range) - cur.execute('SELECT date_sub(max(`date`), INTERVAL 7 DAY) `date1`, date_sub(date(now()), INTERVAL 1 DAY) `date2` FROM `twitter`') - for (date1, date2) in cur: - date1, date2 = date1.strftime('%Y-%m-%d'), date2.strftime('%Y-%m-%d') - print('Checking dates between %s and %s...'%(date1, date2)) - - # keep track of how many rows were added - rows_before = get_num_rows() - - # check healthtweets.org for new and/or revised data - ht = HealthTweets(*secrets.healthtweets.login) - sql = 'INSERT INTO `twitter` (`date`, `state`, `num`, `total`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `num` = %s, `total` = %s' - total_rows = 0 - for state in sorted(HealthTweets.STATE_CODES.keys()): - values = ht.get_values(state, date1, date2) - for date in sorted(list(values.keys())): - sql_data = (date, state, values[date]['num'], values[date]['total'], values[date]['num'], values[date]['total']) - cur.execute(sql, sql_data) - total_rows += 1 - - # keep track of how many rows were added - rows_after = get_num_rows() - print('Inserted %d/%d row(s)'%(rows_after - rows_before, total_rows)) - - # cleanup - cur.close() - cnx.commit() - cnx.close() + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database='epidata') + cur = cnx.cursor() + + def get_num_rows(): + cur.execute('SELECT count(1) `num` FROM `twitter`') + for (num,) in cur: + pass + return num + + # check from 7 days preceeding the last date with data through 
yesterday (healthtweets.org 404's if today's date is part of the range) + cur.execute('SELECT date_sub(max(`date`), INTERVAL 7 DAY) `date1`, date_sub(date(now()), INTERVAL 1 DAY) `date2` FROM `twitter`') + for (date1, date2) in cur: + date1, date2 = date1.strftime('%Y-%m-%d'), date2.strftime('%Y-%m-%d') + print('Checking dates between %s and %s...' % (date1, date2)) + + # keep track of how many rows were added + rows_before = get_num_rows() + + # check healthtweets.org for new and/or revised data + ht = HealthTweets(*secrets.healthtweets.login) + sql = 'INSERT INTO `twitter` (`date`, `state`, `num`, `total`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `num` = %s, `total` = %s' + total_rows = 0 + for state in sorted(HealthTweets.STATE_CODES.keys()): + values = ht.get_values(state, date1, date2) + for date in sorted(list(values.keys())): + sql_data = (date, state, values[date]['num'], values[date]['total'], values[date]['num'], values[date]['total']) + cur.execute(sql, sql_data) + total_rows += 1 + + # keep track of how many rows were added + rows_after = get_num_rows() + print('Inserted %d/%d row(s)' % (rows_after - rows_before, total_rows)) + + # cleanup + cur.close() + cnx.commit() + cnx.close() if __name__ == '__main__': - run() + run() diff --git a/src/acquisition/wiki/wiki.py b/src/acquisition/wiki/wiki.py index 602e21102..4be6dbe83 100644 --- a/src/acquisition/wiki/wiki.py +++ b/src/acquisition/wiki/wiki.py @@ -1,4 +1,4 @@ -""" +""" =============== === Purpose === =============== @@ -8,7 +8,7 @@ 2. Uses wiki_download.py to download the access logs 3. Uses wiki_extract.py to store article access counts -See also: master.php +See also: master.php ======================= @@ -91,55 +91,53 @@ === Changelog === ================= -2017-02-24 - * secrets and small improvements -2016-08-14 - * Increased job limit (6 -> 12) (pageviews files are ~2x smaller) +2017-02-24 + * secrets and small improvements +2016-08-14 + * Increased job limit (6 -> 12) (pageviews files are ~2x smaller) 2015-08-26 - * Reduced job limit (8 -> 6) + * Reduced job limit (8 -> 6) 2015-08-14 - * Reduced job limit (10 -> 8) + * Reduced job limit (10 -> 8) 2015-08-11 - + New table `wiki_meta` + + New table `wiki_meta` 2015-05-22 - * Updated status codes for `wiki_raw` table + * Updated status codes for `wiki_raw` table 2015-05-21 - * Original version -""" - -# first party -from . import wiki_update -from . import wiki_download -from . import wiki_extract -import delphi.operations.secrets as secrets - - -def main(): - # step 1: find new access logs (aka "jobs") - print('looking for new jobs...') - try: - wiki_update.run() - except: - print('wiki_update failed') - - # step 2: run a few jobs - print('running jobs...') - try: - wiki_download.run( - secrets.wiki.hmac, - download_limit=1024 * 1024 * 1024, - job_limit=12 - ) - except: - print('wiki_download failed') - - # step 3: extract counts from the staging data - print('extracting counts...') - try: - wiki_extract.run(job_limit=100) - except: - print('wiki_extract failed') - - -if __name__ == '__main__': - main() + * Original version +""" + +import delphi.operations.secrets as secrets + +from . 
import wiki_download, wiki_extract, wiki_update + + +def main(): + # step 1: find new access logs (aka "jobs") + print('looking for new jobs...') + try: + wiki_update.run() + except: + print('wiki_update failed') + + # step 2: run a few jobs + print('running jobs...') + try: + wiki_download.run( + secrets.wiki.hmac, + download_limit=1024 * 1024 * 1024, + job_limit=12 + ) + except: + print('wiki_download failed') + + # step 3: extract counts from the staging data + print('extracting counts...') + try: + wiki_extract.run(job_limit=100) + except: + print('wiki_extract failed') + + +if __name__ == '__main__': + main() diff --git a/src/acquisition/wiki/wiki_download.py b/src/acquisition/wiki/wiki_download.py index 1a01b7f8e..f1ac5fba8 100644 --- a/src/acquisition/wiki/wiki_download.py +++ b/src/acquisition/wiki/wiki_download.py @@ -28,15 +28,18 @@ # python 2 and 3 from __future__ import print_function + import sys + if sys.version_info.major == 2: - # python 2 libraries - from urllib import urlencode - from urllib2 import urlopen + # python 2 libraries + from urllib import urlencode + + from urllib2 import urlopen else: - # python 3 libraries - from urllib.parse import urlencode - from urllib.request import urlopen + # python 3 libraries + from urllib.parse import urlencode + from urllib.request import urlopen # common libraries import argparse @@ -44,243 +47,242 @@ import hashlib import hmac import json +import os import subprocess import time -import os from sys import platform from . import wiki_util - VERSION = 10 MASTER_URL = 'https://delphi.cmu.edu/~automation/public/wiki/master.php' + def text(data_string): - return str(data_string.decode('utf-8')) + return str(data_string.decode('utf-8')) def data(text_string): - if sys.version_info.major == 2: - return text_string - else: - return bytes(text_string, 'utf-8') + if sys.version_info.major == 2: + return text_string + else: + return bytes(text_string, 'utf-8') def get_hmac_sha256(key, msg): - key_bytes, msg_bytes = key.encode('utf-8'), msg.encode('utf-8') - return hmac.new(key_bytes, msg_bytes, hashlib.sha256).hexdigest() + key_bytes, msg_bytes = key.encode('utf-8'), msg.encode('utf-8') + return hmac.new(key_bytes, msg_bytes, hashlib.sha256).hexdigest() def extract_article_counts(filename, language, articles, debug_mode): - """ - Support multiple languages ('en' | 'es' | 'pt') - Running time optimized to O(M), which means only need to scan the whole file once - :param filename: - :param language: Different languages such as 'en', 'es', and 'pt' - :param articles: - :param debug_mode: - :return: - """ - counts = {} - articles_set = set(map(lambda x: x.lower(), articles)) - total = 0 - with open(filename, "r", encoding="utf8") as f: - for line in f: - content = line.strip().split() - if len(content) != 4: - print('unexpected article format: {0}'.format(line)) - continue - article_title = content[1].lower() - article_count = int(content[2]) - if content[0] == language: - total += article_count - if content[0] == language and article_title in articles_set: - if debug_mode: - print("Find article {0}: {1}".format(article_title, line)) - counts[article_title] = article_count - if debug_mode: - print("Total number of counts for language {0} is {1}".format(language, total)) - counts['total'] = total - return counts + """ + Support multiple languages ('en' | 'es' | 'pt') + Running time optimized to O(M), which means only need to scan the whole file once + :param filename: + :param language: Different languages such as 'en', 'es', and 'pt' + :param 
articles: + :param debug_mode: + :return: + """ + counts = {} + articles_set = set(map(lambda x: x.lower(), articles)) + total = 0 + with open(filename, "r", encoding="utf8") as f: + for line in f: + content = line.strip().split() + if len(content) != 4: + print('unexpected article format: {0}'.format(line)) + continue + article_title = content[1].lower() + article_count = int(content[2]) + if content[0] == language: + total += article_count + if content[0] == language and article_title in articles_set: + if debug_mode: + print("Find article {0}: {1}".format(article_title, line)) + counts[article_title] = article_count + if debug_mode: + print("Total number of counts for language {0} is {1}".format(language, total)) + counts['total'] = total + return counts def extract_article_counts_orig(articles, debug_mode): - """ - The original method which extracts article counts by shell command grep (only support en articles). - As it is difficult to deal with other languages (utf-8 encoding), we choose to use python read files. - Another things is that it is slower to go over the whole file once and once again, the time complexity is O(NM), - where N is the number of articles and M is the lines in the file - In our new implementation extract_article_counts(), the time complexity is O(M), and it can cope with utf8 encoding - :param articles: - :param debug_mode: - :return: - """ - counts = {} - for article in articles: - if debug_mode: - print(' %s' % (article)) - out = text( - subprocess.check_output('LC_ALL=C grep -a -i "^en %s " raw2 | cat' % (article.lower()), shell=True)).strip() - count = 0 - if len(out) > 0: - for line in out.split('\n'): - fields = line.split() - if len(fields) != 4: - print('unexpected article format: [%s]' % (line)) - else: - count += int(fields[2]) - # print ' %4d %s'%(count, article) - counts[article.lower()] = count + """ + The original method which extracts article counts by shell command grep (only support en articles). + As it is difficult to deal with other languages (utf-8 encoding), we choose to use python read files. 
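The docstrings above contrast the new single-pass extraction (O(M) in the number of lines in the pageviews file) with the older grep-per-article approach (O(NM)). A self-contained sketch of the single-pass idea over pageviews-style records; the sample lines and helper name are made up for illustration:

    # One pass over "<lang> <article> <count> <bytes>" records, counting only the
    # requested articles plus a per-language total. Sample data is hypothetical.
    def count_articles(lines, language, articles):
        wanted = {a.lower() for a in articles}   # set gives O(1) membership tests
        counts = {'total': 0}
        for line in lines:
            fields = line.strip().split()
            if len(fields) != 4:
                continue                         # skip malformed records
            lang, title, count = fields[0], fields[1].lower(), int(fields[2])
            if lang != language:
                continue
            counts['total'] += count             # language-wide total
            if title in wanted:
                counts[title] = counts.get(title, 0) + count
        return counts

    sample = ['en influenza 12 0', 'en fever 3 0', 'es influenza 5 0']
    print(count_articles(sample, 'en', ['Influenza']))  # {'total': 15, 'influenza': 12}
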
+ Another things is that it is slower to go over the whole file once and once again, the time complexity is O(NM), + where N is the number of articles and M is the lines in the file + In our new implementation extract_article_counts(), the time complexity is O(M), and it can cope with utf8 encoding + :param articles: + :param debug_mode: + :return: + """ + counts = {} + for article in articles: + if debug_mode: + print(' %s' % (article)) + out = text(subprocess.check_output('LC_ALL=C grep -a -i "^en %s " raw2 | cat' % (article.lower()), shell=True)).strip() + count = 0 + if len(out) > 0: + for line in out.split('\n'): + fields = line.split() + if len(fields) != 4: + print('unexpected article format: [%s]' % (line)) + else: + count += int(fields[2]) + # print ' %4d %s'%(count, article) + counts[article.lower()] = count + if debug_mode: + print(' %d' % (count)) + print('getting total count...') + out = text(subprocess.check_output( + 'cat raw2 | LC_ALL=C grep -a -i "^en " | cut -d" " -f 3 | awk \'{s+=$1} END {printf "%.0f", s}\'', shell=True)) + total = int(out) if debug_mode: - print(' %d' % (count)) - print('getting total count...') - out = text(subprocess.check_output( - 'cat raw2 | LC_ALL=C grep -a -i "^en " | cut -d" " -f 3 | awk \'{s+=$1} END {printf "%.0f", s}\'', shell=True)) - total = int(out) - if debug_mode: - print(total) - counts['total'] = total - return counts + print(total) + counts['total'] = total + return counts def run(secret, download_limit=None, job_limit=None, sleep_time=1, job_type=0, debug_mode=False): - worker = text(subprocess.check_output("echo `whoami`@`hostname`", shell=True)).strip() - print('this is [%s]'%(worker)) - if debug_mode: - print('*** running in debug mode ***') - - total_download = 0 - passed_jobs = 0 - failed_jobs = 0 - while (download_limit is None or total_download < download_limit) and (job_limit is None or (passed_jobs + failed_jobs) < job_limit): - try: - time_start = datetime.datetime.now() - req = urlopen(MASTER_URL + '?get=x&type=%s'%(job_type)) - code = req.getcode() - if code != 200: - if code == 201: - print('no jobs available') - if download_limit is None and job_limit is None: - time.sleep(60) - continue - else: - print('nothing to do, exiting') - return - else: - raise Exception('server response code (get) was %d'%(code)) - # Make the code compatible with mac os system - if platform == "darwin": - job_content = text(req.readlines()[1]) - else: - job_content = text(req.readlines()[0]) - if job_content == 'no jobs': - print('no jobs available') - if download_limit is None and job_limit is None: - time.sleep(60) - continue - else: - print('nothing to do, exiting') - return - job = json.loads(job_content) - print('received job [%d|%s]'%(job['id'], job['name'])) - # updated parsing for pageviews - maybe use a regex in the future - #year, month = int(job['name'][11:15]), int(job['name'][15:17]) - year, month = int(job['name'][10:14]), int(job['name'][14:16]) - #print 'year=%d | month=%d'%(year, month) - url = 'https://dumps.wikimedia.org/other/pageviews/%d/%d-%02d/%s'%(year, year, month, job['name']) - print('downloading file [%s]...'%(url)) - subprocess.check_call('curl -s %s > raw.gz'%(url), shell=True) - print('checking file size...') - # Make the code cross-platfrom, so use python to get the size of the file - # size = int(text(subprocess.check_output('ls -l raw.gz | cut -d" " -f 5', shell=True))) - size = os.stat("raw.gz").st_size - if debug_mode: - print(size) - total_download += size - if job['hash'] != 
'00000000000000000000000000000000': - print('checking hash...') - out = text(subprocess.check_output('md5sum raw.gz', shell=True)) - result = out[0:32] - if result != job['hash']: - raise Exception('wrong hash [expected %s, got %s]'%(job['hash'], result)) - if debug_mode: - print(result) - print('decompressing...') - subprocess.check_call('gunzip -f raw.gz', shell=True) - #print 'converting case...' - #subprocess.check_call('cat raw | tr "[:upper:]" "[:lower:]" > raw2', shell=True) - #subprocess.check_call('rm raw', shell=True) - subprocess.check_call('mv raw raw2', shell=True) - print('extracting article counts...') - - # Use python to read the file and extract counts, if you want to use the original shell method, please use - counts = {} - for language in wiki_util.Articles.available_languages: - lang2articles = {'en': wiki_util.Articles.en_articles, 'es': wiki_util.Articles.es_articles, 'pt': wiki_util.Articles.pt_articles} - articles = lang2articles[language] - articles = sorted(articles) - if debug_mode: - print("Language is {0} and target articles are {1}".format(language, articles)) - temp_counts = extract_article_counts("raw2", language, articles, debug_mode) - counts[language] = temp_counts - - if not debug_mode: - print('deleting files...') - subprocess.check_call('rm raw2', shell=True) - print('saving results...') - time_stop = datetime.datetime.now() - result = { - 'id': job['id'], - 'size': size, - 'data': json.dumps(counts), - 'worker': worker, - 'elapsed': (time_stop - time_start).total_seconds(), - } - payload = json.dumps(result) - hmac_str = get_hmac_sha256(secret, payload) - if debug_mode: - print(' hmac: %s' % hmac_str) - post_data = urlencode({'put': payload, 'hmac': hmac_str}) - req = urlopen(MASTER_URL, data=data(post_data)) - code = req.getcode() - if code != 200: - raise Exception('server response code (put) was %d'%(code)) - print('done! 
(dl=%d)'%(total_download)) - passed_jobs += 1 - except Exception as ex: - print('***** Caught Exception: %s *****'%(str(ex))) - failed_jobs += 1 - time.sleep(30) - print('passed=%d | failed=%d | total=%d'%(passed_jobs, failed_jobs, passed_jobs + failed_jobs)) - time.sleep(sleep_time) - - if download_limit is not None and total_download >= download_limit: - print('download limit has been reached [%d >= %d]'%(total_download, download_limit)) - if job_limit is not None and (passed_jobs + failed_jobs) >= job_limit: - print('job limit has been reached [%d >= %d]'%(passed_jobs + failed_jobs, job_limit)) + worker = text(subprocess.check_output("echo `whoami`@`hostname`", shell=True)).strip() + print('this is [%s]' % (worker)) + if debug_mode: + print('*** running in debug mode ***') + + total_download = 0 + passed_jobs = 0 + failed_jobs = 0 + while (download_limit is None or total_download < download_limit) and (job_limit is None or (passed_jobs + failed_jobs) < job_limit): + try: + time_start = datetime.datetime.now() + req = urlopen(MASTER_URL + '?get=x&type=%s' % (job_type)) + code = req.getcode() + if code != 200: + if code == 201: + print('no jobs available') + if download_limit is None and job_limit is None: + time.sleep(60) + continue + else: + print('nothing to do, exiting') + return + else: + raise Exception('server response code (get) was %d' % (code)) + # Make the code compatible with mac os system + if platform == "darwin": + job_content = text(req.readlines()[1]) + else: + job_content = text(req.readlines()[0]) + if job_content == 'no jobs': + print('no jobs available') + if download_limit is None and job_limit is None: + time.sleep(60) + continue + else: + print('nothing to do, exiting') + return + job = json.loads(job_content) + print('received job [%d|%s]' % (job['id'], job['name'])) + # updated parsing for pageviews - maybe use a regex in the future + # year, month = int(job['name'][11:15]), int(job['name'][15:17]) + year, month = int(job['name'][10:14]), int(job['name'][14:16]) + # print 'year=%d | month=%d'%(year, month) + url = 'https://dumps.wikimedia.org/other/pageviews/%d/%d-%02d/%s' % (year, year, month, job['name']) + print('downloading file [%s]...' % (url)) + subprocess.check_call('curl -s %s > raw.gz' % (url), shell=True) + print('checking file size...') + # Make the code cross-platfrom, so use python to get the size of the file + # size = int(text(subprocess.check_output('ls -l raw.gz | cut -d" " -f 5', shell=True))) + size = os.stat("raw.gz").st_size + if debug_mode: + print(size) + total_download += size + if job['hash'] != '00000000000000000000000000000000': + print('checking hash...') + out = text(subprocess.check_output('md5sum raw.gz', shell=True)) + result = out[0:32] + if result != job['hash']: + raise Exception('wrong hash [expected %s, got %s]' % (job['hash'], result)) + if debug_mode: + print(result) + print('decompressing...') + subprocess.check_call('gunzip -f raw.gz', shell=True) + # print 'converting case...' 
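The hash check above shells out to md5sum and compares the first 32 characters of its output against the job's expected digest. An equivalent, more portable sketch using hashlib; the path and expected digest are placeholders for 'raw.gz' and job['hash'], and the all-zero sentinel mirrors the "no hash provided" case in the loop:

    import hashlib

    # Verify a downloaded dump against the MD5 digest supplied with the job.
    def verify_md5(path, expected, chunk_size=1 << 20):
        digest = hashlib.md5()
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(chunk_size), b''):
                digest.update(chunk)             # hash the file incrementally
        result = digest.hexdigest()
        if expected != '0' * 32 and result != expected:
            raise Exception('wrong hash [expected %s, got %s]' % (expected, result))
        return result
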
+ # subprocess.check_call('cat raw | tr "[:upper:]" "[:lower:]" > raw2', shell=True) + # subprocess.check_call('rm raw', shell=True) + subprocess.check_call('mv raw raw2', shell=True) + print('extracting article counts...') + + # Use python to read the file and extract counts, if you want to use the original shell method, please use + counts = {} + for language in wiki_util.Articles.available_languages: + lang2articles = {'en': wiki_util.Articles.en_articles, 'es': wiki_util.Articles.es_articles, 'pt': wiki_util.Articles.pt_articles} + articles = lang2articles[language] + articles = sorted(articles) + if debug_mode: + print("Language is {0} and target articles are {1}".format(language, articles)) + temp_counts = extract_article_counts("raw2", language, articles, debug_mode) + counts[language] = temp_counts + + if not debug_mode: + print('deleting files...') + subprocess.check_call('rm raw2', shell=True) + print('saving results...') + time_stop = datetime.datetime.now() + result = { + 'id': job['id'], + 'size': size, + 'data': json.dumps(counts), + 'worker': worker, + 'elapsed': (time_stop - time_start).total_seconds(), + } + payload = json.dumps(result) + hmac_str = get_hmac_sha256(secret, payload) + if debug_mode: + print(' hmac: %s' % hmac_str) + post_data = urlencode({'put': payload, 'hmac': hmac_str}) + req = urlopen(MASTER_URL, data=data(post_data)) + code = req.getcode() + if code != 200: + raise Exception('server response code (put) was %d' % (code)) + print('done! (dl=%d)' % (total_download)) + passed_jobs += 1 + except Exception as ex: + print('***** Caught Exception: %s *****' % (str(ex))) + failed_jobs += 1 + time.sleep(30) + print('passed=%d | failed=%d | total=%d' % (passed_jobs, failed_jobs, passed_jobs + failed_jobs)) + time.sleep(sleep_time) + + if download_limit is not None and total_download >= download_limit: + print('download limit has been reached [%d >= %d]' % (total_download, download_limit)) + if job_limit is not None and (passed_jobs + failed_jobs) >= job_limit: + print('job limit has been reached [%d >= %d]' % (passed_jobs + failed_jobs, job_limit)) def main(): - # version info - print('version', VERSION) + # version info + print('version', VERSION) - # args and usage - parser = argparse.ArgumentParser() - parser.add_argument('secret', type=str, help='hmac secret key') - parser.add_argument('-b', '--blimit', action='store', type=int, default=None, help='download limit, in bytes') - parser.add_argument('-j', '--jlimit', action='store', type=int, default=None, help='job limit') - parser.add_argument('-s', '--sleep', action='store', type=int, default=1, help='seconds to sleep between each job') - parser.add_argument('-t', '--type', action='store', type=int, default=0, help='type of job') - parser.add_argument('-d', '--debug', action='store_const', const=True, default=False, help='enable debug mode') - args = parser.parse_args() + # args and usage + parser = argparse.ArgumentParser() + parser.add_argument('secret', type=str, help='hmac secret key') + parser.add_argument('-b', '--blimit', action='store', type=int, default=None, help='download limit, in bytes') + parser.add_argument('-j', '--jlimit', action='store', type=int, default=None, help='job limit') + parser.add_argument('-s', '--sleep', action='store', type=int, default=1, help='seconds to sleep between each job') + parser.add_argument('-t', '--type', action='store', type=int, default=0, help='type of job') + parser.add_argument('-d', '--debug', action='store_const', const=True, default=False, help='enable 
debug mode') + args = parser.parse_args() - # runtime options - secret, download_limit, job_limit, sleep_time, job_type, debug_mode = args.secret, args.blimit, args.jlimit, args.sleep, args.type, args.debug + # runtime options + secret, download_limit, job_limit, sleep_time, job_type, debug_mode = args.secret, args.blimit, args.jlimit, args.sleep, args.type, args.debug - # run - run(secret, download_limit, job_limit, sleep_time, job_type, debug_mode) + # run + run(secret, download_limit, job_limit, sleep_time, job_type, debug_mode) if __name__ == '__main__': - main() + main() diff --git a/src/acquisition/wiki/wiki_extract.py b/src/acquisition/wiki/wiki_extract.py index 839d7d6dc..8736a4d9d 100644 --- a/src/acquisition/wiki/wiki_extract.py +++ b/src/acquisition/wiki/wiki_extract.py @@ -23,86 +23,82 @@ * Original version """ -# standard library -from datetime import datetime, timedelta import json +from datetime import datetime, timedelta -# third party -import mysql.connector - -# first party import delphi.operations.secrets as secrets +import mysql.connector def floor_timestamp(timestamp): - return datetime(timestamp.year, timestamp.month, timestamp.day, timestamp.hour) + return datetime(timestamp.year, timestamp.month, timestamp.day, timestamp.hour) def ceil_timestamp(timestamp): - return floor_timestamp(timestamp) + timedelta(hours=1) + return floor_timestamp(timestamp) + timedelta(hours=1) def round_timestamp(timestamp): - before = floor_timestamp(timestamp) - after = ceil_timestamp(timestamp) - if (timestamp - before) < (after - timestamp): - return before - else: - return after + before = floor_timestamp(timestamp) + after = ceil_timestamp(timestamp) + if (timestamp - before) < (after - timestamp): + return before + else: + return after def get_timestamp(name): - # new parsing for pageviews compared to pagecounts - maybe switch to regex in the future - #return datetime(int(name[11:15]), int(name[15:17]), int(name[17:19]), int(name[20:22]), int(name[22:24]), int(name[24:26])) - return datetime(int(name[10:14]), int(name[14:16]), int(name[16:18]), int(name[19:21]), int(name[21:23]), int(name[23:25])) + # new parsing for pageviews compared to pagecounts - maybe switch to regex in the future + # return datetime(int(name[11:15]), int(name[15:17]), int(name[17:19]), int(name[20:22]), int(name[22:24]), int(name[24:26])) + return datetime(int(name[10:14]), int(name[14:16]), int(name[16:18]), int(name[19:21]), int(name[21:23]), int(name[23:25])) def run(job_limit=100): - # connect to the database - u, p = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - cur = cnx.cursor() + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database='epidata') + cur = cnx.cursor() - # # Some preparation for utf-8, and it is a temporary trick solution. 
The real solution should change those char set and collation encoding to utf8 permanently - # cur.execute("SET NAMES utf8;") - # cur.execute("SET CHARACTER SET utf8;") - # # I print SHOW SESSION VARIABLES LIKE 'character\_set\_%'; and SHOW SESSION VARIABLES LIKE 'collation\_%'; on my local computer - # cur.execute("SET character_set_client=utf8mb4;") - # cur.execute("SET character_set_connection=utf8mb4;") - # cur.execute("SET character_set_database=utf8;") - # cur.execute("SET character_set_results=utf8mb4;") - # cur.execute("SET character_set_server=utf8;") - # cur.execute("SET collation_connection=utf8mb4_general_ci;") - # cur.execute("SET collation_database=utf8_general_ci;") - # cur.execute("SET collation_server=utf8_general_ci;") - - # find jobs that are queued for extraction - cur.execute('SELECT `id`, `name`, `data` FROM `wiki_raw` WHERE `status` = 2 ORDER BY `name` ASC LIMIT %s', (job_limit,)) - jobs = [] - for (id, name, data_str) in cur: - jobs.append((id, name, json.loads(data_str))) - print('Processing data from %d jobs'%(len(jobs))) - - # get the counts from the json object and insert into (or update) the database - # Notice that data_collect contains data with different languages - for (id, name, data_collect) in jobs: - print('processing job [%d|%s]...'%(id, name)) - timestamp = round_timestamp(get_timestamp(name)) - for language in data_collect.keys(): - data = data_collect[language] - for article in sorted(data.keys()): - count = data[article] - cur.execute('INSERT INTO `wiki` (`datetime`, `article`, `count`, `language`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `count` = `count` + %s', (str(timestamp), article.encode('utf-8').decode('latin-1'), count, language, count)) - if article == 'total': - cur.execute('INSERT INTO `wiki_meta` (`datetime`, `date`, `epiweek`, `total`, `language`) VALUES (%s, date(%s), yearweek(%s, 6), %s, %s) ON DUPLICATE KEY UPDATE `total` = `total` + %s', (str(timestamp), str(timestamp), str(timestamp), count, language, count)) - # update the job - cur.execute('UPDATE `wiki_raw` SET `status` = 3 WHERE `id` = %s', (id,)) - - # cleanup - cur.close() - cnx.commit() - cnx.close() + # Some preparation for utf-8, and it is a temporary trick solution. The real solution should change those char set and collation encoding to utf8 permanently + # cur.execute("SET NAMES utf8;") + # cur.execute("SET CHARACTER SET utf8;") + # # I print SHOW SESSION VARIABLES LIKE 'character\_set\_%'; and SHOW SESSION VARIABLES LIKE 'collation\_%'; on my local computer + # cur.execute("SET character_set_client=utf8mb4;") + # cur.execute("SET character_set_connection=utf8mb4;") + # cur.execute("SET character_set_database=utf8;") + # cur.execute("SET character_set_results=utf8mb4;") + # cur.execute("SET character_set_server=utf8;") + # cur.execute("SET collation_connection=utf8mb4_general_ci;") + # cur.execute("SET collation_database=utf8_general_ci;") + # cur.execute("SET collation_server=utf8_general_ci;") + + # find jobs that are queued for extraction + cur.execute('SELECT `id`, `name`, `data` FROM `wiki_raw` WHERE `status` = 2 ORDER BY `name` ASC LIMIT %s', (job_limit,)) + jobs = [] + for (id, name, data_str) in cur: + jobs.append((id, name, json.loads(data_str))) + print('Processing data from %d jobs' % (len(jobs))) + + # get the counts from the json object and insert into (or update) the database + # Notice that data_collect contains data with different languages + for (id, name, data_collect) in jobs: + print('processing job [%d|%s]...' 
% (id, name)) + timestamp = round_timestamp(get_timestamp(name)) + for language in data_collect.keys(): + data = data_collect[language] + for article in sorted(data.keys()): + count = data[article] + cur.execute('INSERT INTO `wiki` (`datetime`, `article`, `count`, `language`) VALUES (%s, %s, %s, %s) ON DUPLICATE KEY UPDATE `count` = `count` + %s', (str(timestamp), article.encode('utf-8').decode('latin-1'), count, language, count)) + if article == 'total': + cur.execute('INSERT INTO `wiki_meta` (`datetime`, `date`, `epiweek`, `total`, `language`) VALUES (%s, date(%s), yearweek(%s, 6), %s, %s) ON DUPLICATE KEY UPDATE `total` = `total` + %s', (str(timestamp), str(timestamp), str(timestamp), count, language, count)) + # update the job + cur.execute('UPDATE `wiki_raw` SET `status` = 3 WHERE `id` = %s', (id,)) + + # cleanup + cur.close() + cnx.commit() + cnx.close() if __name__ == '__main__': - run() + run() diff --git a/src/acquisition/wiki/wiki_update.py b/src/acquisition/wiki/wiki_update.py index 411544810..1d3ae01b5 100644 --- a/src/acquisition/wiki/wiki_update.py +++ b/src/acquisition/wiki/wiki_update.py @@ -20,99 +20,96 @@ * Original version """ -# standard library + from datetime import datetime, timedelta -# third party +import delphi.operations.secrets as secrets import mysql.connector import requests -# first party -import delphi.operations.secrets as secrets - def floor_timestamp(timestamp): - return datetime(timestamp.year, timestamp.month, timestamp.day, timestamp.hour) + return datetime(timestamp.year, timestamp.month, timestamp.day, timestamp.hour) def ceil_timestamp(timestamp): - return floor_timestamp(timestamp) + timedelta(hours=1) + return floor_timestamp(timestamp) + timedelta(hours=1) def round_timestamp(timestamp): - before = floor_timestamp(timestamp) - after = ceil_timestamp(timestamp) - if (timestamp - before) < (after - timestamp): - return before - else: - return after + before = floor_timestamp(timestamp) + after = ceil_timestamp(timestamp) + if (timestamp - before) < (after - timestamp): + return before + else: + return after def get_timestamp(name): - # If the program is cold start (there are no previous names in the table, and the name will be None) - if name is None: - curr = datetime.now() - return datetime(curr.year, curr.month, curr.day, curr.hour, curr.minute, curr.second) - # new parsing for pageviews compared to pagecounts - maybe switch to regex in the future - #return datetime(int(name[11:15]), int(name[15:17]), int(name[17:19]), int(name[20:22]), int(name[22:24]), int(name[24:26])) - return datetime(int(name[10:14]), int(name[14:16]), int(name[16:18]), int(name[19:21]), int(name[21:23]), int(name[23:25])) + # If the program is cold start (there are no previous names in the table, and the name will be None) + if name is None: + curr = datetime.now() + return datetime(curr.year, curr.month, curr.day, curr.hour, curr.minute, curr.second) + # new parsing for pageviews compared to pagecounts - maybe switch to regex in the future + # return datetime(int(name[11:15]), int(name[15:17]), int(name[17:19]), int(name[20:22]), int(name[22:24]), int(name[24:26])) + return datetime(int(name[10:14]), int(name[14:16]), int(name[16:18]), int(name[19:21]), int(name[21:23]), int(name[23:25])) def get_manifest(year, month, optional=False): - # unlike pagecounts-raw, pageviews doesn't provide hashes - #url = 'https://dumps.wikimedia.org/other/pagecounts-raw/%d/%d-%02d/md5sums.txt'%(year, year, month) - url = 'https://dumps.wikimedia.org/other/pageviews/%d/%d-%02d/' % (year, 
year, month) - print('Checking manifest at %s...'%(url)) - response = requests.get(url) - if response.status_code == 200: - #manifest = [line.strip().split() for line in response.text.split('\n') if 'pagecounts' in line] - manifest = [('00000000000000000000000000000000', line[9:37]) for line in response.text.split('\n') if ' max_name: - new_logs[name] = hash - print(' New job: %s [%s]'%(name, hash)) - print('Found %d new job(s)'%(len(new_logs))) - - # store metadata for new jobs - for name in sorted(new_logs.keys()): - cur.execute('INSERT INTO `wiki_raw` (`name`, `hash`) VALUES (%s, %s)', (name, new_logs[name])) - - # cleanup - cur.close() - cnx.commit() - cnx.close() + # connect to the database + u, p = secrets.db.epi + cnx = mysql.connector.connect(user=u, password=p, database='epidata') + cur = cnx.cursor() + + # get the most recent job in wiki_raw + # luckily, "pageviews" is lexicographically greater than "pagecounts-raw" + cur.execute('SELECT max(`name`) FROM `wiki_raw`') + for (max_name,) in cur: + pass + print('Last known file: %s' % (max_name)) + timestamp = get_timestamp(max_name) + + # crawl dumps.wikimedia.org to find more recent access logs + t1, t2 = floor_timestamp(timestamp), ceil_timestamp(timestamp) + manifest = get_manifest(t1.year, t1.month, optional=False) + if t2.month != t1.month: + manifest += get_manifest(t2.year, t2.month, optional=True) + + # find access logs newer than the most recent job + new_logs = {} + for (hash, name) in manifest: + if max_name is None or name > max_name: + new_logs[name] = hash + print(' New job: %s [%s]' % (name, hash)) + print('Found %d new job(s)' % (len(new_logs))) + + # store metadata for new jobs + for name in sorted(new_logs.keys()): + cur.execute('INSERT INTO `wiki_raw` (`name`, `hash`) VALUES (%s, %s)', (name, new_logs[name])) + + # cleanup + cur.close() + cnx.commit() + cnx.close() if __name__ == '__main__': - run() + run() diff --git a/src/acquisition/wiki/wiki_util.py b/src/acquisition/wiki/wiki_util.py index ed3c743bc..d72fea764 100644 --- a/src/acquisition/wiki/wiki_util.py +++ b/src/acquisition/wiki/wiki_util.py @@ -1,6 +1,3 @@ - - - class Articles: # Notice that all languages must be two chars, because that `language` column in table `wiki` is CHAR(2) diff --git a/src/client/delphi_epidata.py b/src/client/delphi_epidata.py index 42f670ad4..a5d899c48 100644 --- a/src/client/delphi_epidata.py +++ b/src/client/delphi_epidata.py @@ -8,734 +8,731 @@ - Compatible with Python 2 and 3. """ -# External modules -import requests import asyncio -from tenacity import retry, stop_after_attempt +# External modules +import requests from aiohttp import ClientSession, TCPConnector -from pkg_resources import get_distribution, DistributionNotFound +from pkg_resources import DistributionNotFound, get_distribution +from tenacity import retry, stop_after_attempt # Obtain package version for the user-agent. Uses the installed version by # preference, even if you've installed it and then use this script independently # by accident. 
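wiki_update.run() above relies on pageviews filenames sorting lexicographically in chronological order (and on 'pageviews' sorting after the retired 'pagecounts-raw' prefix), so "newer than the last known file" reduces to a plain string comparison against the maximum name stored in wiki_raw. A small sketch of that selection step with made-up manifest entries:

    # Keep manifest entries whose names sort after the most recent job already queued.
    # The filenames below are hypothetical pageviews dumps.
    def find_new_jobs(manifest, max_name):
        new_logs = {}
        for file_hash, name in manifest:
            if max_name is None or name > max_name:   # lexicographic == chronological
                new_logs[name] = file_hash
        return new_logs

    manifest = [
        ('0' * 32, 'pageviews-20240101-010000'),
        ('0' * 32, 'pageviews-20240101-020000'),
    ]
    print(find_new_jobs(manifest, 'pageviews-20240101-010000'))
    # {'pageviews-20240101-020000': '00000000000000000000000000000000'}
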
try: - _version = get_distribution('delphi-epidata').version + _version = get_distribution('delphi-epidata').version except DistributionNotFound: - _version = "0.script" + _version = "0.script" _HEADERS = { - "user-agent": "delphi_epidata/" + _version + "user-agent": "delphi_epidata/" + _version } + # Because the API is stateless, the Epidata class only contains static methods class Epidata: - """An interface to DELPHI's Epidata API.""" - - # API base url - BASE_URL = 'https://delphi.cmu.edu/epidata/api.php' - - client_version = _version - - # Helper function to cast values and/or ranges to strings - @staticmethod - def _listitem(value): - """Cast values and/or range to a string.""" - if isinstance(value, dict) and 'from' in value and 'to' in value: - return str(value['from']) + '-' + str(value['to']) - else: - return str(value) - - # Helper function to build a list of values and/or ranges - @staticmethod - def _list(values): - """Turn a list/tuple of values/ranges into a comma-separated string.""" - if not isinstance(values, (list, tuple)): - values = [values] - return ','.join([Epidata._listitem(value) for value in values]) - - @staticmethod - @retry(reraise=True, stop=stop_after_attempt(2)) - def _request_with_retry(params): - """Make request with a retry if an exception is thrown.""" - req = requests.get(Epidata.BASE_URL, params, headers=_HEADERS) - if req.status_code == 414: - req = requests.post(Epidata.BASE_URL, params, headers=_HEADERS) - return req - - @staticmethod - def _request(params): - """Request and parse epidata. - - We default to GET since it has better caching and logging - capabilities, but fall back to POST if the request is too - long and returns a 414. - """ - try: - return Epidata._request_with_retry(params).json() - except Exception as e: - return {'result': 0, 'message': 'error: ' + str(e)} - - # Raise an Exception on error, otherwise return epidata - @staticmethod - def check(resp): - """Raise an Exception on error, otherwise return epidata.""" - if resp['result'] != 1: - msg, code = resp['message'], resp['result'] - raise Exception('Error fetching epidata: %s. 
(result=%d)' % (msg, code)) - return resp['epidata'] - - # Build a `range` object (ex: dates/epiweeks) - @staticmethod - def range(from_, to_): - """Build a `range` object (ex: dates/epiweeks).""" - if to_ <= from_: - from_, to_ = to_, from_ - return {'from': from_, 'to': to_} - - # Fetch FluView data - @staticmethod - def fluview(regions, epiweeks, issues=None, lag=None, auth=None): - """Fetch FluView data.""" - # Check parameters - if regions is None or epiweeks is None: - raise Exception('`regions` and `epiweeks` are both required') - if issues is not None and lag is not None: - raise Exception('`issues` and `lag` are mutually exclusive') - # Set up request - params = { - 'endpoint': 'fluview', - 'regions': Epidata._list(regions), - 'epiweeks': Epidata._list(epiweeks), - } - if issues is not None: - params['issues'] = Epidata._list(issues) - if lag is not None: - params['lag'] = lag - if auth is not None: - params['auth'] = auth - # Make the API call - return Epidata._request(params) - - # Fetch FluView metadata - @staticmethod - def fluview_meta(): - """Fetch FluView metadata.""" - # Set up request - params = { - 'endpoint': 'fluview_meta', - } - # Make the API call - return Epidata._request(params) - - # Fetch FluView clinical data - @staticmethod - def fluview_clinical(regions, epiweeks, issues=None, lag=None): - """Fetch FluView clinical data.""" - # Check parameters - if regions is None or epiweeks is None: - raise Exception('`regions` and `epiweeks` are both required') - if issues is not None and lag is not None: - raise Exception('`issues` and `lag` are mutually exclusive') - # Set up request - params = { - 'endpoint': 'fluview_clinical', - 'regions': Epidata._list(regions), - 'epiweeks': Epidata._list(epiweeks), - } - if issues is not None: - params['issues'] = Epidata._list(issues) - if lag is not None: - params['lag'] = lag - # Make the API call - return Epidata._request(params) - - # Fetch FluSurv data - @staticmethod - def flusurv(locations, epiweeks, issues=None, lag=None): - """Fetch FluSurv data.""" - # Check parameters - if locations is None or epiweeks is None: - raise Exception('`locations` and `epiweeks` are both required') - if issues is not None and lag is not None: - raise Exception('`issues` and `lag` are mutually exclusive') - # Set up request - params = { - 'endpoint': 'flusurv', - 'locations': Epidata._list(locations), - 'epiweeks': Epidata._list(epiweeks), - } - if issues is not None: - params['issues'] = Epidata._list(issues) - if lag is not None: - params['lag'] = lag - # Make the API call - return Epidata._request(params) - - # Fetch PAHO Dengue data - @staticmethod - def paho_dengue(regions, epiweeks, issues=None, lag=None): - """Fetch PAHO Dengue data.""" - # Check parameters - if regions is None or epiweeks is None: - raise Exception('`regions` and `epiweeks` are both required') - if issues is not None and lag is not None: - raise Exception('`issues` and `lag` are mutually exclusive') - # Set up request - params = { - 'endpoint': 'paho_dengue', - 'regions': Epidata._list(regions), - 'epiweeks': Epidata._list(epiweeks), - } - if issues is not None: - params['issues'] = Epidata._list(issues) - if lag is not None: - params['lag'] = lag - # Make the API call - return Epidata._request(params) - - # Fetch ECDC ILI data - @staticmethod - def ecdc_ili(regions, epiweeks, issues=None, lag=None): - """Fetch ECDC ILI data.""" - # Check parameters - if regions is None or epiweeks is None: - raise Exception('`regions` and `epiweeks` are both required') - if issues is 
not None and lag is not None: - raise Exception('`issues` and `lag` are mutually exclusive') - # Set up request - params = { - 'endpoint': 'ecdc_ili', - 'regions': Epidata._list(regions), - 'epiweeks': Epidata._list(epiweeks), - } - if issues is not None: - params['issues'] = Epidata._list(issues) - if lag is not None: - params['lag'] = lag - # Make the API call - return Epidata._request(params) - - # Fetch KCDC ILI data - @staticmethod - def kcdc_ili(regions, epiweeks, issues=None, lag=None): - """Fetch KCDC ILI data.""" - # Check parameters - if regions is None or epiweeks is None: - raise Exception('`regions` and `epiweeks` are both required') - if issues is not None and lag is not None: - raise Exception('`issues` and `lag` are mutually exclusive') - # Set up request - params = { - 'endpoint': 'kcdc_ili', - 'regions': Epidata._list(regions), - 'epiweeks': Epidata._list(epiweeks), - } - if issues is not None: - params['issues'] = Epidata._list(issues) - if lag is not None: - params['lag'] = lag - # Make the API call - return Epidata._request(params) - - # Fetch Google Flu Trends data - @staticmethod - def gft(locations, epiweeks): - """Fetch Google Flu Trends data.""" - # Check parameters - if locations is None or epiweeks is None: - raise Exception('`locations` and `epiweeks` are both required') - # Set up request - params = { - 'endpoint': 'gft', - 'locations': Epidata._list(locations), - 'epiweeks': Epidata._list(epiweeks), - } - # Make the API call - return Epidata._request(params) - - # Fetch Google Health Trends data - @staticmethod - def ght(auth, locations, epiweeks, query): - """Fetch Google Health Trends data.""" - # Check parameters - if auth is None or locations is None or epiweeks is None or query is None: - raise Exception('`auth`, `locations`, `epiweeks`, and `query` are all required') - # Set up request - params = { - 'endpoint': 'ght', - 'auth': auth, - 'locations': Epidata._list(locations), - 'epiweeks': Epidata._list(epiweeks), - 'query': query, - } - # Make the API call - return Epidata._request(params) - - # Fetch HealthTweets data - @staticmethod - def twitter(auth, locations, dates=None, epiweeks=None): - """Fetch HealthTweets data.""" - # Check parameters - if auth is None or locations is None: - raise Exception('`auth` and `locations` are both required') - if not ((dates is None) ^ (epiweeks is None)): - raise Exception('exactly one of `dates` and `epiweeks` is required') - # Set up request - params = { - 'endpoint': 'twitter', - 'auth': auth, - 'locations': Epidata._list(locations), - } - if dates is not None: - params['dates'] = Epidata._list(dates) - if epiweeks is not None: - params['epiweeks'] = Epidata._list(epiweeks) - # Make the API call - return Epidata._request(params) - - # Fetch Wikipedia access data - @staticmethod - def wiki(articles, dates=None, epiweeks=None, hours=None, language='en'): - """Fetch Wikipedia access data.""" - # Check parameters - if articles is None: - raise Exception('`articles` is required') - if not ((dates is None) ^ (epiweeks is None)): - raise Exception('exactly one of `dates` and `epiweeks` is required') - # Set up request - params = { - 'endpoint': 'wiki', - 'articles': Epidata._list(articles), - 'language': language, - } - if dates is not None: - params['dates'] = Epidata._list(dates) - if epiweeks is not None: - params['epiweeks'] = Epidata._list(epiweeks) - if hours is not None: - params['hours'] = Epidata._list(hours) - # Make the API call - return Epidata._request(params) - - # Fetch CDC page hits - @staticmethod - 
def cdc(auth, epiweeks, locations): - """Fetch CDC page hits.""" - # Check parameters - if auth is None or epiweeks is None or locations is None: - raise Exception('`auth`, `epiweeks`, and `locations` are all required') - # Set up request - params = { - 'endpoint': 'cdc', - 'auth': auth, - 'epiweeks': Epidata._list(epiweeks), - 'locations': Epidata._list(locations), - } - # Make the API call - return Epidata._request(params) - - # Fetch Quidel data - @staticmethod - def quidel(auth, epiweeks, locations): - """Fetch Quidel data.""" - # Check parameters - if auth is None or epiweeks is None or locations is None: - raise Exception('`auth`, `epiweeks`, and `locations` are all required') - # Set up request - params = { - 'endpoint': 'quidel', - 'auth': auth, - 'epiweeks': Epidata._list(epiweeks), - 'locations': Epidata._list(locations), - } - # Make the API call - return Epidata._request(params) - - # Fetch NoroSTAT data (point data, no min/max) - @staticmethod - def norostat(auth, location, epiweeks): - """Fetch NoroSTAT data (point data, no min/max).""" - # Check parameters - if auth is None or location is None or epiweeks is None: - raise Exception('`auth`, `location`, and `epiweeks` are all required') - # Set up request - params = { - 'endpoint': 'norostat', - 'auth': auth, - 'location': location, - 'epiweeks': Epidata._list(epiweeks), - } - # Make the API call - return Epidata._request(params) - - # Fetch NoroSTAT metadata - @staticmethod - def meta_norostat(auth): - """Fetch NoroSTAT metadata.""" - # Check parameters - if auth is None: - raise Exception('`auth` is required') - # Set up request - params = { - 'endpoint': 'meta_norostat', - 'auth': auth, - } - # Make the API call - return Epidata._request(params) - - # Fetch AFHSB data - @staticmethod - def afhsb(auth, locations, epiweeks, flu_types): - """Fetch AFHSB data (point data, no min/max).""" - # Check parameters - if auth is None or locations is None or epiweeks is None or flu_types is None: - raise Exception('`auth`, `locations`, `epiweeks` and `flu_types` are all required') - - loc_exception = 'Location parameter `{}` is invalid. Valid `location` parameters are: '\ - '`hhs[1-10]`, `cen[1-9]`, 2-letter state code or 3-letter country code.' 
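The location check in afhsb() (the loop that follows, and its re-indented counterpart later in this diff) enforces the rule described by loc_exception: hhs1 through hhs10, cen1 through cen9, a 2-letter state code, or a 3-letter country code. A compact standalone sketch of that rule as written in the error message, not the client's exact checks; the test values are hypothetical:

    import re

    # Accept hhs1..hhs10, cen1..cen9, a 2-letter state code, or a 3-letter
    # country code -- the rule described by loc_exception.
    def is_valid_afhsb_location(location):
        loc = location.lower()
        if re.fullmatch(r'hhs(10|[1-9])', loc) or re.fullmatch(r'cen[1-9]', loc):
            return True
        return loc.isalpha() and len(loc) in (2, 3)

    for loc in ('hhs9', 'hhs10', 'cen10', 'pa', 'usa', 'hhs11'):
        print(loc, is_valid_afhsb_location(loc))
    # hhs9/hhs10/pa/usa pass; cen10 and hhs11 do not
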
- for location in locations: - location = location.lower() - if (location.startswith('hhs') or location.startswith('cen')): - prefix, postfix = location[:3], location[3:] - if (postfix.isnumeric()): - region_num = int(postfix) - if (region_num < 1 or region_num > 10 or (region_num == 10 and prefix == 'cen')): - raise Exception(loc_exception.format(location)) + """An interface to DELPHI's Epidata API.""" + + # API base url + BASE_URL = 'https://delphi.cmu.edu/epidata/api.php' + + client_version = _version + + # Helper function to cast values and/or ranges to strings + @staticmethod + def _listitem(value): + """Cast values and/or range to a string.""" + if isinstance(value, dict) and 'from' in value and 'to' in value: + return str(value['from']) + '-' + str(value['to']) + else: + return str(value) + + # Helper function to build a list of values and/or ranges + @staticmethod + def _list(values): + """Turn a list/tuple of values/ranges into a comma-separated string.""" + if not isinstance(values, (list, tuple)): + values = [values] + return ','.join([Epidata._listitem(value) for value in values]) + + @staticmethod + @retry(reraise=True, stop=stop_after_attempt(2)) + def _request_with_retry(params): + """Make request with a retry if an exception is thrown.""" + req = requests.get(Epidata.BASE_URL, params, headers=_HEADERS) + if req.status_code == 414: + req = requests.post(Epidata.BASE_URL, params, headers=_HEADERS) + return req + + @staticmethod + def _request(params): + """Request and parse epidata. + + We default to GET since it has better caching and logging + capabilities, but fall back to POST if the request is too + long and returns a 414. + """ + try: + return Epidata._request_with_retry(params).json() + except Exception as e: + return {'result': 0, 'message': 'error: ' + str(e)} + + # Raise an Exception on error, otherwise return epidata + @staticmethod + def check(resp): + """Raise an Exception on error, otherwise return epidata.""" + if resp['result'] != 1: + msg, code = resp['message'], resp['result'] + raise Exception('Error fetching epidata: %s. 
(result=%d)' % (msg, code)) + return resp['epidata'] + + # Build a `range` object (ex: dates/epiweeks) + @staticmethod + def range(from_, to_): + """Build a `range` object (ex: dates/epiweeks).""" + if to_ <= from_: + from_, to_ = to_, from_ + return {'from': from_, 'to': to_} + + # Fetch FluView data + @staticmethod + def fluview(regions, epiweeks, issues=None, lag=None, auth=None): + """Fetch FluView data.""" + # Check parameters + if regions is None or epiweeks is None: + raise Exception('`regions` and `epiweeks` are both required') + if issues is not None and lag is not None: + raise Exception('`issues` and `lag` are mutually exclusive') + # Set up request + params = { + 'endpoint': 'fluview', + 'regions': Epidata._list(regions), + 'epiweeks': Epidata._list(epiweeks), + } + if issues is not None: + params['issues'] = Epidata._list(issues) + if lag is not None: + params['lag'] = lag + if auth is not None: + params['auth'] = auth + # Make the API call + return Epidata._request(params) + + # Fetch FluView metadata + @staticmethod + def fluview_meta(): + """Fetch FluView metadata.""" + # Set up request + params = { + 'endpoint': 'fluview_meta', + } + # Make the API call + return Epidata._request(params) + + # Fetch FluView clinical data + @staticmethod + def fluview_clinical(regions, epiweeks, issues=None, lag=None): + """Fetch FluView clinical data.""" + # Check parameters + if regions is None or epiweeks is None: + raise Exception('`regions` and `epiweeks` are both required') + if issues is not None and lag is not None: + raise Exception('`issues` and `lag` are mutually exclusive') + # Set up request + params = { + 'endpoint': 'fluview_clinical', + 'regions': Epidata._list(regions), + 'epiweeks': Epidata._list(epiweeks), + } + if issues is not None: + params['issues'] = Epidata._list(issues) + if lag is not None: + params['lag'] = lag + # Make the API call + return Epidata._request(params) + + # Fetch FluSurv data + @staticmethod + def flusurv(locations, epiweeks, issues=None, lag=None): + """Fetch FluSurv data.""" + # Check parameters + if locations is None or epiweeks is None: + raise Exception('`locations` and `epiweeks` are both required') + if issues is not None and lag is not None: + raise Exception('`issues` and `lag` are mutually exclusive') + # Set up request + params = { + 'endpoint': 'flusurv', + 'locations': Epidata._list(locations), + 'epiweeks': Epidata._list(epiweeks), + } + if issues is not None: + params['issues'] = Epidata._list(issues) + if lag is not None: + params['lag'] = lag + # Make the API call + return Epidata._request(params) + + # Fetch PAHO Dengue data + @staticmethod + def paho_dengue(regions, epiweeks, issues=None, lag=None): + """Fetch PAHO Dengue data.""" + # Check parameters + if regions is None or epiweeks is None: + raise Exception('`regions` and `epiweeks` are both required') + if issues is not None and lag is not None: + raise Exception('`issues` and `lag` are mutually exclusive') + # Set up request + params = { + 'endpoint': 'paho_dengue', + 'regions': Epidata._list(regions), + 'epiweeks': Epidata._list(epiweeks), + } + if issues is not None: + params['issues'] = Epidata._list(issues) + if lag is not None: + params['lag'] = lag + # Make the API call + return Epidata._request(params) + + # Fetch ECDC ILI data + @staticmethod + def ecdc_ili(regions, epiweeks, issues=None, lag=None): + """Fetch ECDC ILI data.""" + # Check parameters + if regions is None or epiweeks is None: + raise Exception('`regions` and `epiweeks` are both required') + if issues is 
not None and lag is not None: + raise Exception('`issues` and `lag` are mutually exclusive') + # Set up request + params = { + 'endpoint': 'ecdc_ili', + 'regions': Epidata._list(regions), + 'epiweeks': Epidata._list(epiweeks), + } + if issues is not None: + params['issues'] = Epidata._list(issues) + if lag is not None: + params['lag'] = lag + # Make the API call + return Epidata._request(params) + + # Fetch KCDC ILI data + @staticmethod + def kcdc_ili(regions, epiweeks, issues=None, lag=None): + """Fetch KCDC ILI data.""" + # Check parameters + if regions is None or epiweeks is None: + raise Exception('`regions` and `epiweeks` are both required') + if issues is not None and lag is not None: + raise Exception('`issues` and `lag` are mutually exclusive') + # Set up request + params = { + 'endpoint': 'kcdc_ili', + 'regions': Epidata._list(regions), + 'epiweeks': Epidata._list(epiweeks), + } + if issues is not None: + params['issues'] = Epidata._list(issues) + if lag is not None: + params['lag'] = lag + # Make the API call + return Epidata._request(params) + + # Fetch Google Flu Trends data + @staticmethod + def gft(locations, epiweeks): + """Fetch Google Flu Trends data.""" + # Check parameters + if locations is None or epiweeks is None: + raise Exception('`locations` and `epiweeks` are both required') + # Set up request + params = { + 'endpoint': 'gft', + 'locations': Epidata._list(locations), + 'epiweeks': Epidata._list(epiweeks), + } + # Make the API call + return Epidata._request(params) + + # Fetch Google Health Trends data + @staticmethod + def ght(auth, locations, epiweeks, query): + """Fetch Google Health Trends data.""" + # Check parameters + if auth is None or locations is None or epiweeks is None or query is None: + raise Exception('`auth`, `locations`, `epiweeks`, and `query` are all required') + # Set up request + params = { + 'endpoint': 'ght', + 'auth': auth, + 'locations': Epidata._list(locations), + 'epiweeks': Epidata._list(epiweeks), + 'query': query, + } + # Make the API call + return Epidata._request(params) + + # Fetch HealthTweets data + @staticmethod + def twitter(auth, locations, dates=None, epiweeks=None): + """Fetch HealthTweets data.""" + # Check parameters + if auth is None or locations is None: + raise Exception('`auth` and `locations` are both required') + if not ((dates is None) ^ (epiweeks is None)): + raise Exception('exactly one of `dates` and `epiweeks` is required') + # Set up request + params = { + 'endpoint': 'twitter', + 'auth': auth, + 'locations': Epidata._list(locations), + } + if dates is not None: + params['dates'] = Epidata._list(dates) + if epiweeks is not None: + params['epiweeks'] = Epidata._list(epiweeks) + # Make the API call + return Epidata._request(params) + + # Fetch Wikipedia access data + @staticmethod + def wiki(articles, dates=None, epiweeks=None, hours=None, language='en'): + """Fetch Wikipedia access data.""" + # Check parameters + if articles is None: + raise Exception('`articles` is required') + if not ((dates is None) ^ (epiweeks is None)): + raise Exception('exactly one of `dates` and `epiweeks` is required') + # Set up request + params = { + 'endpoint': 'wiki', + 'articles': Epidata._list(articles), + 'language': language, + } + if dates is not None: + params['dates'] = Epidata._list(dates) + if epiweeks is not None: + params['epiweeks'] = Epidata._list(epiweeks) + if hours is not None: + params['hours'] = Epidata._list(hours) + # Make the API call + return Epidata._request(params) + + # Fetch CDC page hits + @staticmethod + 
def cdc(auth, epiweeks, locations): + """Fetch CDC page hits.""" + # Check parameters + if auth is None or epiweeks is None or locations is None: + raise Exception('`auth`, `epiweeks`, and `locations` are all required') + # Set up request + params = { + 'endpoint': 'cdc', + 'auth': auth, + 'epiweeks': Epidata._list(epiweeks), + 'locations': Epidata._list(locations), + } + # Make the API call + return Epidata._request(params) + + # Fetch Quidel data + @staticmethod + def quidel(auth, epiweeks, locations): + """Fetch Quidel data.""" + # Check parameters + if auth is None or epiweeks is None or locations is None: + raise Exception('`auth`, `epiweeks`, and `locations` are all required') + # Set up request + params = { + 'endpoint': 'quidel', + 'auth': auth, + 'epiweeks': Epidata._list(epiweeks), + 'locations': Epidata._list(locations), + } + # Make the API call + return Epidata._request(params) + + # Fetch NoroSTAT data (point data, no min/max) + @staticmethod + def norostat(auth, location, epiweeks): + """Fetch NoroSTAT data (point data, no min/max).""" + # Check parameters + if auth is None or location is None or epiweeks is None: + raise Exception('`auth`, `location`, and `epiweeks` are all required') + # Set up request + params = { + 'endpoint': 'norostat', + 'auth': auth, + 'location': location, + 'epiweeks': Epidata._list(epiweeks), + } + # Make the API call + return Epidata._request(params) + + # Fetch NoroSTAT metadata + @staticmethod + def meta_norostat(auth): + """Fetch NoroSTAT metadata.""" + # Check parameters + if auth is None: + raise Exception('`auth` is required') + # Set up request + params = { + 'endpoint': 'meta_norostat', + 'auth': auth, + } + # Make the API call + return Epidata._request(params) + + # Fetch AFHSB data + @staticmethod + def afhsb(auth, locations, epiweeks, flu_types): + """Fetch AFHSB data (point data, no min/max).""" + # Check parameters + if auth is None or locations is None or epiweeks is None or flu_types is None: + raise Exception('`auth`, `locations`, `epiweeks` and `flu_types` are all required') + + loc_exception = 'Location parameter `{}` is invalid. Valid `location` parameters are: ' \ + '`hhs[1-10]`, `cen[1-9]`, 2-letter state code or 3-letter country code.' + for location in locations: + location = location.lower() + if (location.startswith('hhs') or location.startswith('cen')): + prefix, postfix = location[:3], location[3:] + if (postfix.isnumeric()): + region_num = int(postfix) + if (region_num < 1 or region_num > 10 or (region_num == 10 and prefix == 'cen')): + raise Exception(loc_exception.format(location)) + else: + raise Exception(loc_exception.format(location)) + elif (len(location) < 2 or len(location) > 3): + raise Exception(loc_exception.format(location)) + + flu_exception = 'Flu-type parameters `{}` is invalid. Valid flu-type parameters are: ' \ + '`flu1`, `flu2`, `flu3`, `ili`, `flu2-flu1`, `flu3-flu2`, `ili-flu3`.' 
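For orientation only (not part of the patch), a minimal sketch of how the `Epidata.range` and `Epidata._list` helpers shown earlier in this file serialize epiweek and location arguments; the `from delphi_epidata import Epidata` import path is assumed and the values are illustrative:

from delphi_epidata import Epidata  # assumed client import path

Epidata.range(202001, 202010)
# -> {'from': 202001, 'to': 202010}  (arguments are swapped if given out of order)

Epidata._list([202001, Epidata.range(202003, 202005)])
# -> '202001,202003-202005'  (scalars pass through, dict ranges become 'from-to')

Epidata._list('nat')
# -> 'nat'  (a bare value is first wrapped in a one-element list)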
+ valid_flu_types = ['flu1', 'flu2', 'flu3', 'ili', 'flu2-flu1', 'flu3-flu2', 'ili-flu3'] + for flu_type in flu_types: + if flu_type not in valid_flu_types: + raise Exception(flu_exception.format(flu_type)) + + # Set up request + params = { + 'endpoint': 'afhsb', + 'auth': auth, + 'locations': Epidata._list(locations), + 'epiweeks': Epidata._list(epiweeks), + 'flu_types': Epidata._list(flu_types) + } + # Make the API call + return Epidata._request(params) + + # Fetch AFHSB metadata + @staticmethod + def meta_afhsb(auth): + """Fetch AFHSB metadata.""" + # Check parameters + if auth is None: + raise Exception('`auth` is required') + # Set up request + params = { + 'endpoint': 'meta_afhsb', + 'auth': auth, + } + # Make the API call + return Epidata._request(params) + + # Fetch NIDSS flu data + @staticmethod + def nidss_flu(regions, epiweeks, issues=None, lag=None): + """Fetch NIDSS flu data.""" + # Check parameters + if regions is None or epiweeks is None: + raise Exception('`regions` and `epiweeks` are both required') + if issues is not None and lag is not None: + raise Exception('`issues` and `lag` are mutually exclusive') + # Set up request + params = { + 'endpoint': 'nidss_flu', + 'regions': Epidata._list(regions), + 'epiweeks': Epidata._list(epiweeks), + } + if issues is not None: + params['issues'] = Epidata._list(issues) + if lag is not None: + params['lag'] = lag + # Make the API call + return Epidata._request(params) + + # Fetch NIDSS dengue data + @staticmethod + def nidss_dengue(locations, epiweeks): + """Fetch NIDSS dengue data.""" + # Check parameters + if locations is None or epiweeks is None: + raise Exception('`locations` and `epiweeks` are both required') + # Set up request + params = { + 'endpoint': 'nidss_dengue', + 'locations': Epidata._list(locations), + 'epiweeks': Epidata._list(epiweeks), + } + # Make the API call + return Epidata._request(params) + + # Fetch Delphi's forecast + @staticmethod + def delphi(system, epiweek): + """Fetch Delphi's forecast.""" + # Check parameters + if system is None or epiweek is None: + raise Exception('`system` and `epiweek` are both required') + # Set up request + params = { + 'endpoint': 'delphi', + 'system': system, + 'epiweek': epiweek, + } + # Make the API call + return Epidata._request(params) + + # Fetch Delphi's digital surveillance sensors + @staticmethod + def sensors(auth, names, locations, epiweeks): + """Fetch Delphi's digital surveillance sensors.""" + # Check parameters + if auth is None or names is None or locations is None or epiweeks is None: + raise Exception('`auth`, `names`, `locations`, and `epiweeks` are all required') + # Set up request + params = { + 'endpoint': 'sensors', + 'auth': auth, + 'names': Epidata._list(names), + 'locations': Epidata._list(locations), + 'epiweeks': Epidata._list(epiweeks), + } + # Make the API call + return Epidata._request(params) + + # Fetch Delphi's dengue digital surveillance sensors + @staticmethod + def dengue_sensors(auth, names, locations, epiweeks): + """Fetch Delphi's digital surveillance sensors.""" + # Check parameters + if auth is None or names is None or locations is None or epiweeks is None: + raise Exception('`auth`, `names`, `locations`, and `epiweeks` are all required') + # Set up request + params = { + 'endpoint': 'dengue_sensors', + 'auth': auth, + 'names': Epidata._list(names), + 'locations': Epidata._list(locations), + 'epiweeks': Epidata._list(epiweeks), + } + # Make the API call + return Epidata._request(params) + + # Fetch Delphi's wILI nowcast + @staticmethod + 
def nowcast(locations, epiweeks): + """Fetch Delphi's wILI nowcast.""" + # Check parameters + if locations is None or epiweeks is None: + raise Exception('`locations` and `epiweeks` are both required') + # Set up request + params = { + 'endpoint': 'nowcast', + 'locations': Epidata._list(locations), + 'epiweeks': Epidata._list(epiweeks), + } + # Make the API call + return Epidata._request(params) + + # Fetch Delphi's dengue nowcast + @staticmethod + def dengue_nowcast(locations, epiweeks): + """Fetch Delphi's dengue nowcast.""" + # Check parameters + if locations is None or epiweeks is None: + raise Exception('`locations` and `epiweeks` are both required') + # Set up request + params = { + 'endpoint': 'dengue_nowcast', + 'locations': Epidata._list(locations), + 'epiweeks': Epidata._list(epiweeks), + } + # Make the API call + return Epidata._request(params) + + # Fetch API metadata + @staticmethod + def meta(): + """Fetch API metadata.""" + return Epidata._request({'endpoint': 'meta'}) + + # Fetch Delphi's COVID-19 Surveillance Streams + @staticmethod + def covidcast(data_source, signals, time_type, geo_type, time_values, geo_value, as_of=None, issues=None, lag=None, **kwargs): + """Fetch Delphi's COVID-19 Surveillance Streams""" + # also support old parameter name + if signals is None and 'signal' in kwargs: + signals = kwargs['signal'] + # Check parameters + if data_source is None or signals is None or time_type is None or geo_type is None or time_values is None or geo_value is None: + raise Exception('`data_source`, `signals`, `time_type`, `geo_type`, `time_values`, and `geo_value` are all required') + if issues is not None and lag is not None: + raise Exception('`issues` and `lag` are mutually exclusive') + # Set up request + params = { + 'endpoint': 'covidcast', + 'data_source': data_source, + 'signals': Epidata._list(signals), + 'time_type': time_type, + 'geo_type': geo_type, + 'time_values': Epidata._list(time_values) + } + + if isinstance(geo_value, (list, tuple)): + params['geo_values'] = ','.join(geo_value) else: - raise Exception(loc_exception.format(location)) - elif (len(location) < 2 or len(location) > 3): - raise Exception(loc_exception.format(location)) - - flu_exception = 'Flu-type parameters `{}` is invalid. Valid flu-type parameters are: '\ - '`flu1`, `flu2`, `flu3`, `ili`, `flu2-flu1`, `flu3-flu2`, `ili-flu3`.' 
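As a hypothetical usage sketch of the `covidcast` wrapper reformatted above (not part of the patch; the source, signal, and geo values are illustrative), a response follows the `result`/`message`/`epidata` convention enforced by `check()`:

from delphi_epidata import Epidata  # assumed client import path

res = Epidata.covidcast('fb-survey', 'smoothed_cli', 'day', 'county',
                        Epidata.range(20200401, 20200407), '42003')
if res['result'] == 1:
    rows = res['epidata']                      # list of row dicts on success
else:
    print('request failed:', res['result'], res['message'])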
- valid_flu_types = ['flu1', 'flu2', 'flu3', 'ili', 'flu2-flu1', 'flu3-flu2', 'ili-flu3'] - for flu_type in flu_types: - if (not flu_type in valid_flu_types): - raise Exception(flu_exception.format(flu_type)) - - # Set up request - params = { - 'endpoint': 'afhsb', - 'auth': auth, - 'locations': Epidata._list(locations), - 'epiweeks': Epidata._list(epiweeks), - 'flu_types': Epidata._list(flu_types) - } - # Make the API call - return Epidata._request(params) - - # Fetch AFHSB metadata - @staticmethod - def meta_afhsb(auth): - """Fetch AFHSB metadata.""" - # Check parameters - if auth is None: - raise Exception('`auth` is required') - # Set up request - params = { - 'endpoint': 'meta_afhsb', - 'auth': auth, - } - # Make the API call - return Epidata._request(params) - - # Fetch NIDSS flu data - @staticmethod - def nidss_flu(regions, epiweeks, issues=None, lag=None): - """Fetch NIDSS flu data.""" - # Check parameters - if regions is None or epiweeks is None: - raise Exception('`regions` and `epiweeks` are both required') - if issues is not None and lag is not None: - raise Exception('`issues` and `lag` are mutually exclusive') - # Set up request - params = { - 'endpoint': 'nidss_flu', - 'regions': Epidata._list(regions), - 'epiweeks': Epidata._list(epiweeks), - } - if issues is not None: - params['issues'] = Epidata._list(issues) - if lag is not None: - params['lag'] = lag - # Make the API call - return Epidata._request(params) - - # Fetch NIDSS dengue data - @staticmethod - def nidss_dengue(locations, epiweeks): - """Fetch NIDSS dengue data.""" - # Check parameters - if locations is None or epiweeks is None: - raise Exception('`locations` and `epiweeks` are both required') - # Set up request - params = { - 'endpoint': 'nidss_dengue', - 'locations': Epidata._list(locations), - 'epiweeks': Epidata._list(epiweeks), - } - # Make the API call - return Epidata._request(params) - - # Fetch Delphi's forecast - @staticmethod - def delphi(system, epiweek): - """Fetch Delphi's forecast.""" - # Check parameters - if system is None or epiweek is None: - raise Exception('`system` and `epiweek` are both required') - # Set up request - params = { - 'endpoint': 'delphi', - 'system': system, - 'epiweek': epiweek, - } - # Make the API call - return Epidata._request(params) - - # Fetch Delphi's digital surveillance sensors - @staticmethod - def sensors(auth, names, locations, epiweeks): - """Fetch Delphi's digital surveillance sensors.""" - # Check parameters - if auth is None or names is None or locations is None or epiweeks is None: - raise Exception('`auth`, `names`, `locations`, and `epiweeks` are all required') - # Set up request - params = { - 'endpoint': 'sensors', - 'auth': auth, - 'names': Epidata._list(names), - 'locations': Epidata._list(locations), - 'epiweeks': Epidata._list(epiweeks), - } - # Make the API call - return Epidata._request(params) - - # Fetch Delphi's dengue digital surveillance sensors - @staticmethod - def dengue_sensors(auth, names, locations, epiweeks): - """Fetch Delphi's digital surveillance sensors.""" - # Check parameters - if auth is None or names is None or locations is None or epiweeks is None: - raise Exception('`auth`, `names`, `locations`, and `epiweeks` are all required') - # Set up request - params = { - 'endpoint': 'dengue_sensors', - 'auth': auth, - 'names': Epidata._list(names), - 'locations': Epidata._list(locations), - 'epiweeks': Epidata._list(epiweeks), - } - # Make the API call - return Epidata._request(params) - - # Fetch Delphi's wILI nowcast - @staticmethod 
- def nowcast(locations, epiweeks): - """Fetch Delphi's wILI nowcast.""" - # Check parameters - if locations is None or epiweeks is None: - raise Exception('`locations` and `epiweeks` are both required') - # Set up request - params = { - 'endpoint': 'nowcast', - 'locations': Epidata._list(locations), - 'epiweeks': Epidata._list(epiweeks), - } - # Make the API call - return Epidata._request(params) - - # Fetch Delphi's dengue nowcast - @staticmethod - def dengue_nowcast(locations, epiweeks): - """Fetch Delphi's dengue nowcast.""" - # Check parameters - if locations is None or epiweeks is None: - raise Exception('`locations` and `epiweeks` are both required') - # Set up request - params = { - 'endpoint': 'dengue_nowcast', - 'locations': Epidata._list(locations), - 'epiweeks': Epidata._list(epiweeks), - } - # Make the API call - return Epidata._request(params) - - # Fetch API metadata - @staticmethod - def meta(): - """Fetch API metadata.""" - return Epidata._request({'endpoint': 'meta'}) - - # Fetch Delphi's COVID-19 Surveillance Streams - @staticmethod - def covidcast( - data_source, signals, time_type, geo_type, - time_values, geo_value, as_of=None, issues=None, lag=None, **kwargs): - """Fetch Delphi's COVID-19 Surveillance Streams""" - # also support old parameter name - if signals is None and 'signal' in kwargs: - signals=kwargs['signal'] - # Check parameters - if data_source is None or signals is None or time_type is None or geo_type is None or time_values is None or geo_value is None: - raise Exception('`data_source`, `signals`, `time_type`, `geo_type`, `time_values`, and `geo_value` are all required') - if issues is not None and lag is not None: - raise Exception('`issues` and `lag` are mutually exclusive') - # Set up request - params = { - 'endpoint': 'covidcast', - 'data_source': data_source, - 'signals': Epidata._list(signals), - 'time_type': time_type, - 'geo_type': geo_type, - 'time_values': Epidata._list(time_values) - } - - if isinstance(geo_value, (list, tuple)): - params['geo_values'] = ','.join(geo_value) - else: - params['geo_value'] = geo_value - if as_of is not None: - params['as_of'] = as_of - if issues is not None: - params['issues'] = Epidata._list(issues) - if lag is not None: - params['lag'] = lag - - if 'format' in kwargs: - params['format'] = kwargs['format'] - - # Make the API call - return Epidata._request(params) - - # Fetch Delphi's COVID-19 Surveillance Streams metadata - @staticmethod - def covidcast_meta(): - """Fetch Delphi's COVID-19 Surveillance Streams metadata""" - return Epidata._request({'endpoint': 'covidcast_meta'}) - - # Fetch COVID hospitalization data - @staticmethod - def covid_hosp(states, dates, issues=None, as_of=None): - """Fetch COVID hospitalization data.""" - # Check parameters - if states is None or dates is None: - raise Exception('`states` and `dates` are both required') - # Set up request - params = { - 'endpoint': 'covid_hosp', - 'states': Epidata._list(states), - 'dates': Epidata._list(dates), - } - if issues is not None: - params['issues'] = Epidata._list(issues) - if as_of is not None: - params['as_of'] = as_of - # Make the API call - return Epidata._request(params) - - # Fetch COVID hospitalization data for specific facilities - @staticmethod - def covid_hosp_facility( - hospital_pks, collection_weeks, publication_dates=None): - """Fetch COVID hospitalization data for specific facilities.""" - # Check parameters - if hospital_pks is None or collection_weeks is None: - raise Exception('`hospital_pks` and `collection_weeks` are 
both required') - # Set up request - params = { - 'source': 'covid_hosp_facility', - 'hospital_pks': Epidata._list(hospital_pks), - 'collection_weeks': Epidata._list(collection_weeks), - } - if publication_dates is not None: - params['publication_dates'] = Epidata._list(publication_dates) - # Make the API call - return Epidata._request(params) - - # Lookup COVID hospitalization facility identifiers - @staticmethod - def covid_hosp_facility_lookup( - state=None, ccn=None, city=None, zip=None, fips_code=None): - """Lookup COVID hospitalization facility identifiers.""" - # Set up request - params = {'source': 'covid_hosp_facility_lookup'} - if state is not None: - params['state'] = state - elif ccn is not None: - params['ccn'] = ccn - elif city is not None: - params['city'] = city - elif zip is not None: - params['zip'] = zip - elif fips_code is not None: - params['fips_code'] = fips_code - else: - raise Exception('one of `state`, `ccn`, `city`, `zip`, or `fips_code` is required') - # Make the API call - return Epidata._request(params) - - # Fetch Delphi's COVID-19 Nowcast sensors - @staticmethod - def covidcast_nowcast( - data_source, signals, sensor_names, time_type, geo_type, - time_values, geo_value, as_of=None, issues=None, lag=None, **kwargs): - """Fetch Delphi's COVID-19 Nowcast sensors""" - # Check parameters - if data_source is None or signals is None or time_type is None or geo_type is None or time_values is None or geo_value is None or sensor_names is None: - raise Exception('`data_source`, `signals`, `sensor_names`, `time_type`, `geo_type`, `time_values`, and `geo_value` are all required') - if issues is not None and lag is not None: - raise Exception('`issues` and `lag` are mutually exclusive') - # Set up request - params = { - 'source': 'covidcast_nowcast', - 'data_source': data_source, - 'signals': Epidata._list(signals), - 'sensor_names': Epidata._list(sensor_names), - 'time_type': time_type, - 'geo_type': geo_type, - 'time_values': Epidata._list(time_values) - } - - if isinstance(geo_value, (list, tuple)): - params['geo_values'] = ','.join(geo_value) - else: - params['geo_value'] = geo_value - if as_of is not None: - params['as_of'] = as_of - if issues is not None: - params['issues'] = Epidata._list(issues) - if lag is not None: - params['lag'] = lag - - if 'format' in kwargs: - params['format'] = kwargs['format'] - - # Make the API call - return Epidata._request(params) - - @staticmethod - def async_epidata(param_list, batch_size=50): - """Make asynchronous Epidata calls for a list of parameters.""" - async def async_get(params, session): - """Helper function to make Epidata GET requests.""" - async with session.get(Epidata.BASE_URL, params=params) as response: - response.raise_for_status() - return await response.json(), params - - async def async_make_calls(param_combos): - """Helper function to asynchronously make and aggregate Epidata GET requests.""" - tasks = [] - connector = TCPConnector(limit=batch_size) - async with ClientSession(connector=connector, headers=_HEADERS) as session: - for param in param_combos: - task = asyncio.ensure_future(async_get(param, session)) - tasks.append(task) - responses = await asyncio.gather(*tasks) + params['geo_value'] = geo_value + if as_of is not None: + params['as_of'] = as_of + if issues is not None: + params['issues'] = Epidata._list(issues) + if lag is not None: + params['lag'] = lag + + if 'format' in kwargs: + params['format'] = kwargs['format'] + + # Make the API call + return Epidata._request(params) + + # Fetch Delphi's 
COVID-19 Surveillance Streams metadata + @staticmethod + def covidcast_meta(): + """Fetch Delphi's COVID-19 Surveillance Streams metadata""" + return Epidata._request({'endpoint': 'covidcast_meta'}) + + # Fetch COVID hospitalization data + @staticmethod + def covid_hosp(states, dates, issues=None, as_of=None): + """Fetch COVID hospitalization data.""" + # Check parameters + if states is None or dates is None: + raise Exception('`states` and `dates` are both required') + # Set up request + params = { + 'endpoint': 'covid_hosp', + 'states': Epidata._list(states), + 'dates': Epidata._list(dates), + } + if issues is not None: + params['issues'] = Epidata._list(issues) + if as_of is not None: + params['as_of'] = as_of + # Make the API call + return Epidata._request(params) + + # Fetch COVID hospitalization data for specific facilities + @staticmethod + def covid_hosp_facility(hospital_pks, collection_weeks, publication_dates=None): + """Fetch COVID hospitalization data for specific facilities.""" + # Check parameters + if hospital_pks is None or collection_weeks is None: + raise Exception('`hospital_pks` and `collection_weeks` are both required') + # Set up request + params = { + 'source': 'covid_hosp_facility', + 'hospital_pks': Epidata._list(hospital_pks), + 'collection_weeks': Epidata._list(collection_weeks), + } + if publication_dates is not None: + params['publication_dates'] = Epidata._list(publication_dates) + # Make the API call + return Epidata._request(params) + + # Lookup COVID hospitalization facility identifiers + @staticmethod + def covid_hosp_facility_lookup(state=None, ccn=None, city=None, zip=None, fips_code=None): + """Lookup COVID hospitalization facility identifiers.""" + # Set up request + params = {'source': 'covid_hosp_facility_lookup'} + if state is not None: + params['state'] = state + elif ccn is not None: + params['ccn'] = ccn + elif city is not None: + params['city'] = city + elif zip is not None: + params['zip'] = zip + elif fips_code is not None: + params['fips_code'] = fips_code + else: + raise Exception('one of `state`, `ccn`, `city`, `zip`, or `fips_code` is required') + # Make the API call + return Epidata._request(params) + + # Fetch Delphi's COVID-19 Nowcast sensors + @staticmethod + def covidcast_nowcast(data_source, signals, sensor_names, time_type, geo_type, time_values, + geo_value, as_of=None, issues=None, lag=None, **kwargs): + """Fetch Delphi's COVID-19 Nowcast sensors""" + # Check parameters + if data_source is None or signals is None or time_type is None or geo_type is None \ + or time_values is None or geo_value is None or sensor_names is None: + raise Exception('`data_source`, `signals`, `sensor_names`, `time_type`, `geo_type`, `time_values`, and `geo_value` are all required') + if issues is not None and lag is not None: + raise Exception('`issues` and `lag` are mutually exclusive') + # Set up request + params = { + 'source': 'covidcast_nowcast', + 'data_source': data_source, + 'signals': Epidata._list(signals), + 'sensor_names': Epidata._list(sensor_names), + 'time_type': time_type, + 'geo_type': geo_type, + 'time_values': Epidata._list(time_values) + } + + if isinstance(geo_value, (list, tuple)): + params['geo_values'] = ','.join(geo_value) + else: + params['geo_value'] = geo_value + if as_of is not None: + params['as_of'] = as_of + if issues is not None: + params['issues'] = Epidata._list(issues) + if lag is not None: + params['lag'] = lag + + if 'format' in kwargs: + params['format'] = kwargs['format'] + + # Make the API call + return 
Epidata._request(params) + + @staticmethod + def async_epidata(param_list, batch_size=50): + """Make asynchronous Epidata calls for a list of parameters.""" + async def async_get(params, session): + """Helper function to make Epidata GET requests.""" + async with session.get(Epidata.BASE_URL, params=params) as response: + response.raise_for_status() + return await response.json(), params + + async def async_make_calls(param_combos): + """Helper function to asynchronously make and aggregate Epidata GET requests.""" + tasks = [] + connector = TCPConnector(limit=batch_size) + async with ClientSession(connector=connector, headers=_HEADERS) as session: + for param in param_combos: + task = asyncio.ensure_future(async_get(param, session)) + tasks.append(task) + responses = await asyncio.gather(*tasks) + return responses + + loop = asyncio.get_event_loop() + future = asyncio.ensure_future(async_make_calls(param_list)) + responses = loop.run_until_complete(future) return responses - - loop = asyncio.get_event_loop() - future = asyncio.ensure_future(async_make_calls(param_list)) - responses = loop.run_until_complete(future) - return responses diff --git a/src/server/_config.py b/src/server/_config.py index 187d4581a..618532da0 100644 --- a/src/server/_config.py +++ b/src/server/_config.py @@ -1,5 +1,6 @@ import json import os + from dotenv import load_dotenv load_dotenv() @@ -13,8 +14,8 @@ # defaults SQLALCHEMY_ENGINE_OPTIONS = { - "pool_pre_ping": True, # enable ping test for validity of recycled pool connections on connect() calls - "pool_recycle": 5 # seconds after which a recycled pool connection is considered invalid + "pool_pre_ping": True, # enable ping test for validity of recycled pool connections on connect() calls + "pool_recycle": 5 # seconds after which a recycled pool connection is considered invalid } # update with overrides of defaults or additions from external configs SQLALCHEMY_ENGINE_OPTIONS.update( @@ -35,7 +36,8 @@ } # begin sensor query authentication configuration -# A multimap of sensor names to the "granular" auth tokens that can be used to access them; excludes the "global" sensor auth key that works for all sensors: +# A multimap of sensor names to the "granular" auth tokens that can be used to access them; +# excludes the "global" sensor auth key that works for all sensors: GRANULAR_SENSOR_AUTH_TOKENS = { "twtr": os.environ.get("SECRET_SENSOR_TWTR", "").split(","), "gft": os.environ.get("SECRET_SENSOR_GFT", "").split(","), @@ -46,7 +48,7 @@ "wiki": os.environ.get("SECRET_SENSOR_WIKI", "").split(","), } -# A set of sensors that do not require an auth key to access: +# A set of sensors that do not require an auth key to access: OPEN_SENSORS = [ "sar3", "epic", diff --git a/src/server/_pandas.py b/src/server/_pandas.py index 68cbc8833..82ffd0dde 100644 --- a/src/server/_pandas.py +++ b/src/server/_pandas.py @@ -1,18 +1,24 @@ -from typing import Dict, Any, Optional -import pandas as pd +from typing import Any, Dict, Optional +import pandas as pd from flask import request from sqlalchemy import text from sqlalchemy.engine.base import Engine from ._common import engine from ._config import MAX_RESULTS +from ._exceptions import DatabaseErrorException from ._printer import create_printer from ._query import filter_fields, limit_query -from ._exceptions import DatabaseErrorException -def as_pandas(query: str, params: Dict[str, Any], db_engine: Engine = engine, parse_dates: Optional[Dict[str, str]] = None, limit_rows = MAX_RESULTS+1) -> pd.DataFrame: +def as_pandas( + query: str, 
+ params: Dict[str, Any], + db_engine: Engine = engine, + parse_dates: Optional[Dict[str, str]] = None, + limit_rows=MAX_RESULTS+1 +) -> pd.DataFrame: try: query = limit_query(query, limit_rows) return pd.read_sql_query(text(str(query)), db_engine, params=params, parse_dates=parse_dates) diff --git a/src/server/endpoints/sensors.py b/src/server/endpoints/sensors.py index cd76ca4d8..c92f2f817 100644 --- a/src/server/endpoints/sensors.py +++ b/src/server/endpoints/sensors.py @@ -1,23 +1,19 @@ +from typing import List + from flask import Blueprint, Request, request from .._config import AUTH, GRANULAR_SENSOR_AUTH_TOKENS, OPEN_SENSORS from .._exceptions import EpiDataException -from .._params import ( - extract_strings, - extract_integers, -) -from .._query import filter_strings, execute_query, filter_integers -from .._validate import ( - require_all, - resolve_auth_token, -) -from typing import List +from .._params import extract_integers, extract_strings +from .._query import execute_query, filter_integers, filter_strings +from .._validate import require_all, resolve_auth_token # first argument is the endpoint name bp = Blueprint("sensors", __name__) alias = "signals" -# Limits on the number of effective auth token equality checks performed per sensor query; generate auth tokens with appropriate levels of entropy according to the limits below: +# Limits on the number of effective auth token equality checks performed per sensor query; +# generate auth tokens with appropriate levels of entropy according to the limits below: MAX_GLOBAL_AUTH_CHECKS_PER_SENSOR_QUERY = 1 # (but imagine is larger to futureproof) MAX_GRANULAR_AUTH_CHECKS_PER_SENSOR_QUERY = 30 # (but imagine is larger to futureproof) # A (currently redundant) limit on the number of auth tokens that can be provided: @@ -38,18 +34,22 @@ def _authenticate(req: Request, names: List[str]): len(v) for v in GRANULAR_SENSOR_AUTH_TOKENS.values() ) - # The number of valid granular tokens is related to the number of auth token checks that a single query could perform. Use the max number of valid granular auth tokens per name in the check below as a way to prevent leakage of sensor names (but revealing the number of sensor names) via this interface. Treat all sensors as non-open for convenience of calculation. + # The number of valid granular tokens is related to the number of auth token checks that a single query could perform. + # Use the max number of valid granular auth tokens per name in the check below as a way to prevent leakage of sensor names + # (but revealing the number of sensor names) via this interface. Treat all sensors as non-open for convenience of calculation. 
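The comment above bounds the worst-case number of token comparisons a single query can trigger; as a hedged illustration of that arithmetic (variable names mirror the handler, and the full product term sits partly in unchanged context lines outside this hunk):

n_names = 5                        # sensor names in the query
n_auth_tokens_presented = 1        # tokens supplied by the caller
max_granular_tokens_per_name = 3   # largest token list in GRANULAR_SENSOR_AUTH_TOKENS

worst_case_granular_checks = n_names * n_auth_tokens_presented * max_granular_tokens_per_name
assert worst_case_granular_checks <= 30   # MAX_GRANULAR_AUTH_CHECKS_PER_SENSOR_QUERY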
if n_names == 0: # Check whether no names were provided to prevent edge-case issues in error message below, and in case surrounding behavior changes in the future: raise EpiDataException("no sensor names provided") if n_auth_tokens_presented > 1: raise EpiDataException( - "currently, only a single auth token is allowed to be presented at a time; please issue a separate query for each sensor name using only the corresponding token" + "currently, only a single auth token is allowed to be presented at a time; \ + please issue a separate query for each sensor name using only the corresponding token" ) - # Check whether max number of presented-vs.-acceptable token comparisons that would be performed is over the set limits, avoiding calculation of numbers > PHP_INT_MAX/100: - # Global auth token comparison limit check: - # Granular auth token comparison limit check: + # Check whether max number of presented-vs.-acceptable token comparisons that would be performed is over the set limits, + # avoiding calculation of numbers > PHP_INT_MAX/100: + # Global auth token comparison limit check: + # Granular auth token comparison limit check: if ( n_auth_tokens_presented > MAX_GLOBAL_AUTH_CHECKS_PER_SENSOR_QUERY or n_names @@ -60,7 +60,8 @@ def _authenticate(req: Request, names: List[str]): > MAX_GRANULAR_AUTH_CHECKS_PER_SENSOR_QUERY ): raise EpiDataException( - "too many sensors requested and/or auth tokens presented; please divide sensors into batches and/or use only the tokens needed for the sensors requested" + "too many sensors requested and/or auth tokens presented; \ + please divide sensors into batches and/or use only the tokens needed for the sensors requested" ) if len(auth_tokens_presented) > MAX_AUTH_KEYS_PROVIDED_PER_SENSOR_QUERY: diff --git a/src/server/simulate_api_response.py b/src/server/simulate_api_response.py index b07e32abc..420708806 100644 --- a/src/server/simulate_api_response.py +++ b/src/server/simulate_api_response.py @@ -1,32 +1,45 @@ # standard library +import json import os.path import subprocess -import json + def dangerously_simulate_api_response(request_dict): - """*SHOULD NOT RECEIVE USER INPUT*. Simulates the API output for the specified request using server files located within this repository's directory structure. + """ + *SHOULD NOT RECEIVE USER INPUT*. + Simulates the API output for the specified request using server files located within this repository's directory structure. + The request should be in the form of a dictionary specifying the html query parameters / php $_REQUEST entries for the request. + Used to construct tests of API behavior. + *Security note*: + the request argument should not be supplied by an outside user, + only carefully specified during development, or usage thoroughly vetted, + as, depending on usage (pre-deploy, during deploy, post-deploy), potential risks may vary. - The request should be in the form of a dictionary specifying the html query parameters / php $_REQUEST entries for the request. Used to construct tests of API behavior. *Security note*: the request argument should not be supplied by an outside user, only carefully specified during development, or usage thoroughly vetted, as, depending on usage (pre-deploy, during deploy, post-deploy), potential risks may vary. + The API output is simulated using files from ../../src/server/ in this repository's directory structure. + No web requests are issued, nor (when run on the server) does it use the currently deployed version of the API. 
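A usage sketch matching the docstring above (not part of the patch); the import path follows the `__test_target__` used by the dev tests later in this diff, and the request dict mirrors one of those tests rather than any user-supplied input:

import delphi.epidata.server.simulate_api_response as sim_api  # assumed import path

raw = sim_api.dangerously_simulate_api_response({
    'endpoint': 'sensors',
    'names': 'sar3',
    'locations': 'nat',
    'epiweeks': '201410',
})
response = sim_api.extract_response_json(raw)  # raises if PHP exited nonzero or wrote to stderr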
- The API output is simulated using files from ../../src/server/ in this repository's directory structure. No web requests are issued, nor (when run on the server) does it use the currently deployed version of the API. + Returns a tuple (returncode, stderr_bytes, stdout_bytes). + """ + request_json = json.dumps(request_dict) + process = subprocess.Popen( + cwd=os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'src', 'server'), + args=['php', '-r', '$_REQUEST = json_decode(file_get_contents("php://stdin"), true); require("api.php");'], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + (stdout_bytes, stderr_bytes) = process.communicate(input=request_json.encode('UTF-8')) + returncode = process.returncode + simulated_api_response = (returncode, stderr_bytes, stdout_bytes) + return simulated_api_response - Returns a tuple (returncode, stderr_bytes, stdout_bytes). - """ - request_json = json.dumps(request_dict) - process = subprocess.Popen( - cwd=os.path.join(os.path.dirname(os.path.realpath(__file__)),'..','..','src','server'), - args=['php', '-r', '$_REQUEST = json_decode(file_get_contents("php://stdin"), true); require("api.php");'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE - ) - (stdout_bytes, stderr_bytes) = process.communicate(input=request_json.encode('UTF-8')) - returncode = process.returncode - simulated_api_response = (returncode, stderr_bytes, stdout_bytes) - return simulated_api_response def extract_response_json(simulated_api_response): - (returncode, stderr_bytes, stdout_bytes) = simulated_api_response - if returncode != 0 or len(stderr_bytes)!=0: - raise Exception(['Simulated API request appears to have generated an internal error, returning a nonzero error code and/or producing output to stderr:',returncode,stderr_bytes]) - else: - unpacked_json = json.loads(stdout_bytes.decode("UTF-8")) - return unpacked_json + (returncode, stderr_bytes, stdout_bytes) = simulated_api_response + if returncode != 0 or len(stderr_bytes) != 0: + raise Exception([ + 'Simulated API request appears to have generated an internal error, returning a nonzero error code and/or producing output to stderr:', + returncode, + stderr_bytes + ]) + else: + unpacked_json = json.loads(stdout_bytes.decode("UTF-8")) + return unpacked_json diff --git a/src/server/utils/dates.py b/src/server/utils/dates.py index 126f79383..f513795bb 100644 --- a/src/server/utils/dates.py +++ b/src/server/utils/dates.py @@ -1,11 +1,5 @@ from datetime import date, timedelta -from typing import ( - Callable, - Optional, - Sequence, - Tuple, - Union -) +from typing import Callable, Optional, Sequence, Tuple, Union from epiweeks import Week, Year from typing_extensions import TypeAlias @@ -16,6 +10,7 @@ IntRange: TypeAlias = Union[Tuple[int, int], int] TimeValues: TypeAlias = Sequence[IntRange] + def time_value_to_day(value: int) -> date: year, month, day = value // 10000, (value % 10000) // 100, value % 100 if year < date.min.year: @@ -24,6 +19,7 @@ def time_value_to_day(value: int) -> date: return date.max return date(year=year, month=month, day=day) + def time_value_to_week(value: int) -> Week: year, week = value // 100, value % 100 if year < date.min.year: @@ -32,23 +28,29 @@ def time_value_to_week(value: int) -> Week: return Week(date.max.year - 1, 1) # minus 1 since internally it does some checks with a year + 1 return Week(year=year, week=week) + def guess_time_value_is_day(value: int) -> bool: # YYYYMMDD type and not YYYYMM return len(str(value)) == 8 + 
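Hedged examples of the time-value helpers defined in this module (illustrative values; `Week` is the class from the `epiweeks` package imported at the top of the file):

time_value_to_day(20200315)         # -> datetime.date(2020, 3, 15)
time_value_to_week(202012)          # -> Week(year=2020, week=12)
guess_time_value_is_day(20200315)   # -> True   (8 digits, YYYYMMDD)
guess_time_value_is_day(202012)     # -> False  (6 digits, YYYYWW)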
def guess_time_value_is_week(value: int) -> bool: # YYYYWW type and not YYYYMMDD return len(str(value)) == 6 + def day_to_time_value(d: date) -> int: return int(d.strftime("%Y%m%d")) + def week_to_time_value(w: Week) -> int: return w.year * 100 + w.week + def time_value_to_iso(value: int) -> str: return time_value_to_day(value).strftime("%Y-%m-%d") + def shift_day_value(time_value: int, days: int) -> int: if days == 0: return time_value @@ -56,6 +58,7 @@ def shift_day_value(time_value: int, days: int) -> int: shifted = d + timedelta(days=days) return day_to_time_value(shifted) + def shift_week_value(week_value: int, weeks: int) -> int: if weeks == 0: return week_value @@ -63,6 +66,7 @@ def shift_week_value(week_value: int, weeks: int) -> int: shifted = week + weeks return week_to_time_value(shifted) + def days_in_range(range: Tuple[int, int]) -> int: """ returns the days within this time range @@ -73,6 +77,7 @@ def days_in_range(range: Tuple[int, int]) -> int: delta = end - start return delta.days + 1 # same date should lead to 1 day that will be queried + def weeks_in_range(week_range: Tuple[int, int]) -> int: start = time_value_to_week(week_range[0]) end = time_value_to_week(week_range[1]) @@ -83,6 +88,7 @@ def weeks_in_range(week_range: Tuple[int, int]) -> int: acc += year.totalweeks() return acc + 1 # same week should lead to 1 week that will be queried + def time_values_to_ranges(values: Optional[TimeValues]) -> Optional[TimeValues]: """ Converts a mixed list of dates and date ranges to an optimized list where dates are merged into ranges where possible. @@ -107,12 +113,15 @@ def time_values_to_ranges(values: Optional[TimeValues]) -> Optional[TimeValues]: logger.info("Time value unclear, not optimizing", time_value=first_element) return values + def days_to_ranges(values: TimeValues) -> TimeValues: return _to_ranges(values, time_value_to_day, day_to_time_value, timedelta(days=1)) + def weeks_to_ranges(values: TimeValues) -> TimeValues: return _to_ranges(values, time_value_to_week, week_to_time_value, 1) + def _to_ranges(values: TimeValues, value_to_date: Callable, date_to_value: Callable, time_unit: Union[int, timedelta]) -> TimeValues: try: intervals = [] @@ -122,7 +131,7 @@ def _to_ranges(values: TimeValues, value_to_date: Callable, date_to_value: Calla if isinstance(v, int): # 20200101 -> [20200101, 20200101] intervals.append([value_to_date(v), value_to_date(v)]) - else: # tuple + else: # tuple # (20200101, 20200102) -> [20200101, 20200102] intervals.append([value_to_date(v[0]), value_to_date(v[1])]) @@ -147,7 +156,12 @@ def _to_ranges(values: TimeValues, value_to_date: Callable, date_to_value: Calla else: ranges.append((date_to_value(m[0]), date_to_value(m[1]))) - get_structured_logger('server_utils').info("Optimized list of date values", original=values, optimized=ranges, original_length=len(values), optimized_length=len(ranges)) + get_structured_logger('server_utils').info( + "Optimized list of date values", + original=values, optimized=ranges, + original_length=len(values), + optimized_length=len(ranges) + ) return ranges except Exception as e: diff --git a/tests/acquisition/covid_hosp/common/test_utils.py b/tests/acquisition/covid_hosp/common/test_utils.py index 85dbd110c..f3b5a170f 100644 --- a/tests/acquisition/covid_hosp/common/test_utils.py +++ b/tests/acquisition/covid_hosp/common/test_utils.py @@ -1,16 +1,12 @@ """Unit tests for utils.py.""" -# standard library -from datetime import date import unittest +from datetime import date from unittest.mock import MagicMock, 
PropertyMock, patch -# first party -from delphi.epidata.acquisition.covid_hosp.common.test_utils import UnitTestUtils -from delphi.epidata.acquisition.covid_hosp.common.utils import Utils, CovidHospException - -#third party import pandas as pd +from delphi.epidata.acquisition.covid_hosp.common.test_utils import UnitTestUtils +from delphi.epidata.acquisition.covid_hosp.common.utils import CovidHospException, Utils # py3tester coverage target __test_target__ = 'delphi.epidata.acquisition.covid_hosp.common.utils' @@ -18,125 +14,133 @@ class UtilsTests(unittest.TestCase): - def setUp(self): - """Perform per-test setup.""" - - # configure test data - self.test_utils = UnitTestUtils(__file__) - - def test_launch_if_main_when_main(self): - """Launch the main entry point.""" - - mock_entry = MagicMock() + def setUp(self): + """Perform per-test setup.""" - Utils.launch_if_main(mock_entry, '__main__') + # configure test data + self.test_utils = UnitTestUtils(__file__) - mock_entry.assert_called_once() - - def test_launch_if_main_when_not_main(self): - """Don't launch the main entry point.""" - - mock_entry = MagicMock() - - Utils.launch_if_main(mock_entry, '__test__') - - mock_entry.assert_not_called() - - def test_int_from_date(self): - """Convert a YYY-MM-DD date to a YYYYMMDD int.""" - - self.assertEqual(Utils.int_from_date('2020-11-17'), 20201117) - self.assertEqual(Utils.int_from_date('2020/11/17'), 20201117) - self.assertEqual(Utils.int_from_date('2020/11/17 10:00:00'), 20201117) - - def test_parse_bool(self): - """Parse a boolean value from a string.""" - - with self.subTest(name='None'): - self.assertIsNone(Utils.parse_bool(None)) - - with self.subTest(name='empty'): - self.assertIsNone(Utils.parse_bool('')) - - with self.subTest(name='true'): - self.assertTrue(Utils.parse_bool('true')) - self.assertTrue(Utils.parse_bool('True')) - self.assertTrue(Utils.parse_bool('tRuE')) - - with self.subTest(name='false'): - self.assertFalse(Utils.parse_bool('false')) - self.assertFalse(Utils.parse_bool('False')) - self.assertFalse(Utils.parse_bool('fAlSe')) - - with self.subTest(name='exception'): - with self.assertRaises(CovidHospException): - Utils.parse_bool('maybe') - - def test_issues_to_fetch(self): - test_metadata = pd.DataFrame({ - "date": [pd.Timestamp("2021-03-13 00:00:00"), - pd.Timestamp("2021-03-14 00:00:00"), - pd.Timestamp("2021-03-15 00:00:01"), - pd.Timestamp("2021-03-15 00:00:00"), - pd.Timestamp("2021-03-16 00:00:00") - ], - "Archive Link": ["a", "b", "d", "c", "e"] - }).set_index("date") - - issues = Utils.issues_to_fetch(test_metadata, pd.Timestamp("2021-3-13"), pd.Timestamp("2021-3-16")) - self.assertEqual(issues, - {date(2021, 3, 14): [("b", pd.Timestamp("2021-03-14 00:00:00"))], - date(2021, 3, 15): [("c", pd.Timestamp("2021-03-15 00:00:00")), - ("d", pd.Timestamp("2021-03-15 00:00:01"))] - } - ) - - def test_run_skip_old_dataset(self): - """Don't re-acquire an old dataset.""" - - mock_network = MagicMock() - mock_network.fetch_metadata.return_value = \ - self.test_utils.load_sample_metadata() - mock_database = MagicMock() - with mock_database.connect() as mock_connection: - pass - mock_connection.get_max_issue.return_value = pd.Timestamp("2200/1/1") - - result = Utils.update_dataset(database=mock_database, network=mock_network) - - self.assertFalse(result) - mock_network.fetch_dataset.assert_not_called() - mock_connection.insert_metadata.assert_not_called() - mock_connection.insert_dataset.assert_not_called() - - def test_run_acquire_new_dataset(self): - """Acquire a new 
dataset.""" - - mock_network = MagicMock() - mock_network.fetch_metadata.return_value = \ - self.test_utils.load_sample_metadata() - fake_dataset = pd.DataFrame({"date": [pd.Timestamp("2020/1/1")], "state": ["ca"]}) - mock_network.fetch_dataset.return_value = fake_dataset - mock_database = MagicMock() - with mock_database.connect() as mock_connection: - pass - type(mock_connection).KEY_COLS = PropertyMock(return_value=["state", "date"]) - mock_connection.get_max_issue.return_value = pd.Timestamp("1900/1/1") - with patch.object(Utils, 'issues_to_fetch') as mock_issues: - mock_issues.return_value = {pd.Timestamp("2021/3/15"): [("url1", pd.Timestamp("2021-03-15 00:00:00")), - ("url2", pd.Timestamp("2021-03-15 00:00:00"))]} - result = Utils.update_dataset(database=mock_database, network=mock_network) - - self.assertTrue(result) - - # should have been called twice - mock_connection.insert_metadata.assert_called() - assert mock_connection.insert_metadata.call_count == 2 - # most recent call should be for the final revision at url2 - args = mock_connection.insert_metadata.call_args[0] - self.assertEqual(args[:2], (20210315, "url2")) - pd.testing.assert_frame_equal( - mock_connection.insert_dataset.call_args[0][1], - pd.DataFrame({"state": ["ca"], "date": [pd.Timestamp("2020/1/1")]}) - ) - self.assertEqual(mock_connection.insert_dataset.call_args[0][0], 20210315) + def test_launch_if_main_when_main(self): + """Launch the main entry point.""" + + mock_entry = MagicMock() + + Utils.launch_if_main(mock_entry, '__main__') + + mock_entry.assert_called_once() + + def test_launch_if_main_when_not_main(self): + """Don't launch the main entry point.""" + + mock_entry = MagicMock() + + Utils.launch_if_main(mock_entry, '__test__') + + mock_entry.assert_not_called() + + def test_int_from_date(self): + """Convert a YYY-MM-DD date to a YYYYMMDD int.""" + + self.assertEqual(Utils.int_from_date('2020-11-17'), 20201117) + self.assertEqual(Utils.int_from_date('2020/11/17'), 20201117) + self.assertEqual(Utils.int_from_date('2020/11/17 10:00:00'), 20201117) + + def test_parse_bool(self): + """Parse a boolean value from a string.""" + + with self.subTest(name='None'): + self.assertIsNone(Utils.parse_bool(None)) + + with self.subTest(name='empty'): + self.assertIsNone(Utils.parse_bool('')) + + with self.subTest(name='true'): + self.assertTrue(Utils.parse_bool('true')) + self.assertTrue(Utils.parse_bool('True')) + self.assertTrue(Utils.parse_bool('tRuE')) + + with self.subTest(name='false'): + self.assertFalse(Utils.parse_bool('false')) + self.assertFalse(Utils.parse_bool('False')) + self.assertFalse(Utils.parse_bool('fAlSe')) + + with self.subTest(name='exception'): + with self.assertRaises(CovidHospException): + Utils.parse_bool('maybe') + + def test_issues_to_fetch(self): + test_metadata = pd.DataFrame({ + "date": [ + pd.Timestamp("2021-03-13 00:00:00"), + pd.Timestamp("2021-03-14 00:00:00"), + pd.Timestamp("2021-03-15 00:00:01"), + pd.Timestamp("2021-03-15 00:00:00"), + pd.Timestamp("2021-03-16 00:00:00") + ], + "Archive Link": ["a", "b", "d", "c", "e"] + }).set_index("date") + + issues = Utils.issues_to_fetch(test_metadata, pd.Timestamp("2021-3-13"), pd.Timestamp("2021-3-16")) + self.assertEqual( + issues, + { + date(2021, 3, 14): [("b", pd.Timestamp("2021-03-14 00:00:00"))], + date(2021, 3, 15): [ + ("c", pd.Timestamp("2021-03-15 00:00:00")), + ("d", pd.Timestamp("2021-03-15 00:00:01")) + ] + } + ) + + def test_run_skip_old_dataset(self): + """Don't re-acquire an old dataset.""" + + mock_network = MagicMock() + 
mock_network.fetch_metadata.return_value = self.test_utils.load_sample_metadata() + mock_database = MagicMock() + with mock_database.connect() as mock_connection: + pass + mock_connection.get_max_issue.return_value = pd.Timestamp("2200/1/1") + + result = Utils.update_dataset(database=mock_database, network=mock_network) + + self.assertFalse(result) + mock_network.fetch_dataset.assert_not_called() + mock_connection.insert_metadata.assert_not_called() + mock_connection.insert_dataset.assert_not_called() + + def test_run_acquire_new_dataset(self): + """Acquire a new dataset.""" + + mock_network = MagicMock() + mock_network.fetch_metadata.return_value = \ + self.test_utils.load_sample_metadata() + fake_dataset = pd.DataFrame({"date": [pd.Timestamp("2020/1/1")], "state": ["ca"]}) + mock_network.fetch_dataset.return_value = fake_dataset + mock_database = MagicMock() + with mock_database.connect() as mock_connection: + pass + type(mock_connection).KEY_COLS = PropertyMock(return_value=["state", "date"]) + mock_connection.get_max_issue.return_value = pd.Timestamp("1900/1/1") + with patch.object(Utils, 'issues_to_fetch') as mock_issues: + mock_issues.return_value = { + pd.Timestamp("2021/3/15"): [ + ("url1", pd.Timestamp("2021-03-15 00:00:00")), + ("url2", pd.Timestamp("2021-03-15 00:00:00")) + ] + } + result = Utils.update_dataset(database=mock_database, network=mock_network) + + self.assertTrue(result) + + # should have been called twice + mock_connection.insert_metadata.assert_called() + assert mock_connection.insert_metadata.call_count == 2 + # most recent call should be for the final revision at url2 + args = mock_connection.insert_metadata.call_args[0] + self.assertEqual(args[:2], (20210315, "url2")) + pd.testing.assert_frame_equal( + mock_connection.insert_dataset.call_args[0][1], + pd.DataFrame({"state": ["ca"], "date": [pd.Timestamp("2020/1/1")]}) + ) + self.assertEqual(mock_connection.insert_dataset.call_args[0][0], 20210315) diff --git a/tests/server/dev_test_granular_sensor_authentication.py b/tests/server/dev_test_granular_sensor_authentication.py index bc742392a..fe701dfb6 100644 --- a/tests/server/dev_test_granular_sensor_authentication.py +++ b/tests/server/dev_test_granular_sensor_authentication.py @@ -11,251 +11,275 @@ __test_target__ = 'delphi.epidata.server.simulate_api_response' - class UnitTests(unittest.TestCase): - """Basic unit tests.""" + """Basic unit tests.""" - def test_twtr_auth_blocked_on_self_plus_open_plus_other_closed_plus_bogus_plus_repeat_sensors(self): - """Test that TWTR key doesn't authenticate request for TWTR data + open data + other closed data + nonexistent sensor data + repeated sensor names:""" - response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ - 'endpoint': 'sensors', - 'auth': secrets.api.sensor_subsets.twtr_sensor, - 'names': 'ght,ghtj,gft,arch,sar3,arch,epic,twtr,quid,wiki,does_not_exist,does_not_exist,does_not_exist2,twtr,ght', - 'locations': 'nat', - 'epiweeks': '201410', - })) - self.assertEqual(response['message'], 'unauthenticated/nonexistent sensor(s): ght,ghtj,gft,quid,wiki,does_not_exist,does_not_exist,does_not_exist2,ght') - self.assertEqual(response['result'], -1) + def test_twtr_auth_blocked_on_self_plus_open_plus_other_closed_plus_bogus_plus_repeat_sensors(self): + """Test that TWTR key doesn't authenticate request for TWTR data + open data + other closed data + nonexistent sensor data + repeated sensor names:""" + response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ + 
'endpoint': 'sensors', + 'auth': secrets.api.sensor_subsets.twtr_sensor, + 'names': 'ght,ghtj,gft,arch,sar3,arch,epic,twtr,quid,wiki,does_not_exist,does_not_exist,does_not_exist2,twtr,ght', + 'locations': 'nat', + 'epiweeks': '201410', + })) + self.assertEqual( + response['message'], + 'unauthenticated/nonexistent sensor(s): ght,ghtj,gft,quid,wiki,does_not_exist,does_not_exist,does_not_exist2,ght' + ) + self.assertEqual(response['result'], -1) - def test_no_auth_blocked_on_empty_sensor(self): - """Test that a request with zero auth for zero sensors is blocked.""" - response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ - 'endpoint': 'sensors', - # no auth - 'names': '', - 'locations': 'nat', - 'epiweeks': '201410', - })) - self.assertEqual(response['message'], 'no sensor names provided') - self.assertEqual(response['result'], -1) + def test_no_auth_blocked_on_empty_sensor(self): + """Test that a request with zero auth for zero sensors is blocked.""" + response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ + 'endpoint': 'sensors', + # no auth + 'names': '', + 'locations': 'nat', + 'epiweeks': '201410', + })) + self.assertEqual(response['message'], 'no sensor names provided') + self.assertEqual(response['result'], -1) - def test_no_auth_blocked_on_closed_ght_sensor(self): - """Test that providing no auth token doesn't authenticate request for GHT sensor.""" - response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ - 'endpoint': 'sensors', - # no auth - 'names': 'ght', - 'locations': 'nat', - 'epiweeks': '201410', - })) - self.assertEqual(response['message'], 'unauthenticated/nonexistent sensor(s): ght') - self.assertEqual(response['result'], -1) + def test_no_auth_blocked_on_closed_ght_sensor(self): + """Test that providing no auth token doesn't authenticate request for GHT sensor.""" + response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ + 'endpoint': 'sensors', + # no auth + 'names': 'ght', + 'locations': 'nat', + 'epiweeks': '201410', + })) + self.assertEqual(response['message'], 'unauthenticated/nonexistent sensor(s): ght') + self.assertEqual(response['result'], -1) - def test_bogus_auth_blocked_on_closed_ght_sensor(self): - """Test that providing a bogus auth token doesn't authenticate request for GHT sensor.""" - response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ - 'endpoint': 'sensors', - 'auth': 'bogusauth', - 'names': 'ght', - 'locations': 'nat', - 'epiweeks': '201410', - })) - self.assertEqual(response['message'], 'unauthenticated/nonexistent sensor(s): ght') - self.assertEqual(response['result'], -1) + def test_bogus_auth_blocked_on_closed_ght_sensor(self): + """Test that providing a bogus auth token doesn't authenticate request for GHT sensor.""" + response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ + 'endpoint': 'sensors', + 'auth': 'bogusauth', + 'names': 'ght', + 'locations': 'nat', + 'epiweeks': '201410', + })) + self.assertEqual(response['message'], 'unauthenticated/nonexistent sensor(s): ght') + self.assertEqual(response['result'], -1) - def test_no_auth_succeeds_on_open_sar3_sensor(self): - """Test that providing no auth token succeeds in retrieving a single open sensor (SAR3).""" - response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ - 'endpoint': 'sensors', - # no auth - 'names': 'sar3', - 'locations': 'nat', - 'epiweeks': '201410', - })) - 
self.assertNotEqual(response['result'], -1) # no auth failure + def test_no_auth_succeeds_on_open_sar3_sensor(self): + """Test that providing no auth token succeeds in retrieving a single open sensor (SAR3).""" + response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ + 'endpoint': 'sensors', + # no auth + 'names': 'sar3', + 'locations': 'nat', + 'epiweeks': '201410', + })) + self.assertNotEqual(response['result'], -1) # no auth failure - def test_no_auth_succeeds_on_open_sar3_arch_epic_sensor(self): - """Test that providing no auth token succeeds in retrieving multiple open sensors.""" - response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ - 'endpoint': 'sensors', - # no auth - 'names': 'sar3,arch,epic', - 'locations': 'nat', - 'epiweeks': '201410', - })) - self.assertNotEqual(response['result'], -1) # no auth failure + def test_no_auth_succeeds_on_open_sar3_arch_epic_sensor(self): + """Test that providing no auth token succeeds in retrieving multiple open sensors.""" + response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ + 'endpoint': 'sensors', + # no auth + 'names': 'sar3,arch,epic', + 'locations': 'nat', + 'epiweeks': '201410', + })) + self.assertNotEqual(response['result'], -1) # no auth failure - def test_bogus_auth_succeeds_on_open_sar3_sensor(self): - """Test that even a bogus auth token succeeds in retrieving an open sensor (SAR3).""" - response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ - 'endpoint': 'sensors', - 'auth': 'bogus', - 'names': 'sar3', - 'locations': 'nat', - 'epiweeks': '201410', - })) - self.assertNotEqual(response['result'], -1) # no auth failure + def test_bogus_auth_succeeds_on_open_sar3_sensor(self): + """Test that even a bogus auth token succeeds in retrieving an open sensor (SAR3).""" + response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ + 'endpoint': 'sensors', + 'auth': 'bogus', + 'names': 'sar3', + 'locations': 'nat', + 'epiweeks': '201410', + })) + self.assertNotEqual(response['result'], -1) # no auth failure - def test_no_auth_blocked_on_open_arch_epic_closed_quid_sensor(self): - """Test that not providing an auth token doesn't authenticate a mix of open and closed sensors.""" - response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ - 'endpoint': 'sensors', - # no auth - 'names': 'arch,epic,quid', - 'locations': 'nat', - 'epiweeks': '201410', - })) - self.assertEqual(response['message'], 'unauthenticated/nonexistent sensor(s): quid') - self.assertEqual(response['result'], -1) + def test_no_auth_blocked_on_open_arch_epic_closed_quid_sensor(self): + """Test that not providing an auth token doesn't authenticate a mix of open and closed sensors.""" + response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ + 'endpoint': 'sensors', + # no auth + 'names': 'arch,epic,quid', + 'locations': 'nat', + 'epiweeks': '201410', + })) + self.assertEqual(response['message'], 'unauthenticated/nonexistent sensor(s): quid') + self.assertEqual(response['result'], -1) - def test_two_auth_blocked_even_on_open_arch_sensor(self): - """Test that providing two auth tokens is blocked before considering openness/closedness of sensors.""" - # NOTE: This tests the global auth check limit and the direct limit on the number of auth tokens. 
If these are changed later, this check should be changed to test the new global auth check limit, and more tests should be added to test the granular auth check limits.
-    response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({
-      'endpoint': 'sensors',
-      'auth': 'auth1,auth2',
-      'names': 'arch',
-      'locations': 'nat',
-      'epiweeks': '201410',
-    }))
-    self.assertEqual(response['message'], 'currently, only a single auth token is allowed to be presented at a time; please issue a separate query for each sensor name using only the corresponding token')
-    self.assertEqual(response['result'], -1)
+    def test_two_auth_blocked_even_on_open_arch_sensor(self):
+        """Test that providing two auth tokens is blocked before considering openness/closedness of sensors."""
+        # NOTE: This tests the global auth check limit and the direct limit on the number of auth tokens. If these are changed later,
+        # this check should be changed to test the new global auth check limit, and more tests should be added to test the granular auth check limits.
+        response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({
+            'endpoint': 'sensors',
+            'auth': 'auth1,auth2',
+            'names': 'arch',
+            'locations': 'nat',
+            'epiweeks': '201410',
+        }))
+        self.assertEqual(
+            response['message'],
+            'currently, only a single auth token is allowed to be presented at a time; '
+            'please issue a separate query for each sensor name using only the corresponding token'
+        )
+        self.assertEqual(response['result'], -1)

-  def test_two_auth_blocked_on_closed_ghtj_sensor(self):
-    """Test that providing two auth tokens is blocked before considering openness/closedness of sensors."""
-    # NOTE: This tests the global auth check limit and the direct limit on the number of auth tokens. If these are changed later, this check should be changed to test the new global auth check limit, and more tests should be added to test the granular auth check limits.
-    response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({
-      'endpoint': 'sensors',
-      'auth': 'auth1,auth2',
-      'names': 'ghtj',
-      'locations': 'nat',
-      'epiweeks': '201410',
-    }))
-    self.assertEqual(response['message'], 'currently, only a single auth token is allowed to be presented at a time; please issue a separate query for each sensor name using only the corresponding token')
-    self.assertEqual(response['result'], -1)
+    def test_two_auth_blocked_on_closed_ghtj_sensor(self):
+        """Test that providing two auth tokens is blocked before considering openness/closedness of sensors."""
+        # NOTE: This tests the global auth check limit and the direct limit on the number of auth tokens. If these are changed later,
+        # this check should be changed to test the new global auth check limit, and more tests should be added to test the granular auth check limits.
+        response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({
+            'endpoint': 'sensors',
+            'auth': 'auth1,auth2',
+            'names': 'ghtj',
+            'locations': 'nat',
+            'epiweeks': '201410',
+        }))
+        self.assertEqual(
+            response['message'],
+            'currently, only a single auth token is allowed to be presented at a time; '
+            'please issue a separate query for each sensor name using only the corresponding token'
+        )
+        self.assertEqual(response['result'], -1)

-  def test_bogus_auth_blocked_on_31_sensors(self):
-    """Test that providing a bogus auth token does not allow us to request more than 30 sensors, using 31."""
-    response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({
-      'endpoint': 'sensors',
-      'auth': 'auth1',
-      'names': ','*30,
-      'locations': 'nat',
-      'epiweeks': '201410',
-    }))
-    self.assertEqual(response['message'], 'too many sensors requested and/or auth tokens presented; please divide sensors into batches and/or use only the tokens needed for the sensors requested')
-    self.assertEqual(response['result'], -1)
+    def test_bogus_auth_blocked_on_31_sensors(self):
+        """Test that providing a bogus auth token does not allow us to request more than 30 sensors, using 31."""
+        response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({
+            'endpoint': 'sensors',
+            'auth': 'auth1',
+            'names': ','*30,
+            'locations': 'nat',
+            'epiweeks': '201410',
+        }))
+        self.assertEqual(
+            response['message'],
+            'too many sensors requested and/or auth tokens presented; '
+            'please divide sensors into batches and/or use only the tokens needed for the sensors requested'
+        )
+        self.assertEqual(response['result'], -1)

-  def test_bogus_auth_blocked_on_61_sensors(self):
-    """Test that providing a bogus auth token does not allow us to request more than 30 sensors, using 61."""
-    response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({
-      'endpoint': 'sensors',
-      'auth': 'auth1',
-      'names': ','*60,
-      'locations': 'nat',
-      'epiweeks': '201410',
-    }))
-    self.assertEqual(response['message'], 'too many sensors requested and/or auth tokens presented; please divide sensors into batches and/or use only the tokens needed for the sensors requested')
-    self.assertEqual(response['result'], -1)
+    def test_bogus_auth_blocked_on_61_sensors(self):
+        """Test that providing a bogus auth token does not allow us to request more than 30 sensors, using 61."""
+        response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({
+            'endpoint': 'sensors',
+            'auth': 'auth1',
+            'names': ','*60,
+            'locations': 'nat',
+            'epiweeks': '201410',
+        }))
+        self.assertEqual(
+            response['message'],
+            'too many sensors requested and/or auth tokens presented; '
+            'please divide sensors into batches and/or use only the tokens needed for the sensors requested'
+        )
+        self.assertEqual(response['result'], -1)

-  def test_twtr_auth_blocked_on_31_twtr_sensors(self):
-    """Test that providing a valid granular auth token does not succeed when we request too many sensors."""
-    # NOTE: This tests the granular auth check limits. If the global auth check limit is changed later, this testing should be more extensive.
-    response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({
-      'endpoint': 'sensors',
-      'auth': secrets.api.sensor_subsets.twtr_sensor,
-      'names': 'twtr,'*30+'twtr',
-      'locations': 'nat',
-      'epiweeks': '201410',
-    }))
-    self.assertEqual(response['message'], 'too many sensors requested and/or auth tokens presented; please divide sensors into batches and/or use only the tokens needed for the sensors requested')
-    self.assertEqual(response['result'], -1)
+    def test_twtr_auth_blocked_on_31_twtr_sensors(self):
+        """Test that providing a valid granular auth token does not succeed when we request too many sensors."""
+        # NOTE: This tests the granular auth check limits. If the global auth check limit is changed later, this testing should be more extensive.
+        response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({
+            'endpoint': 'sensors',
+            'auth': secrets.api.sensor_subsets.twtr_sensor,
+            'names': 'twtr,'*30+'twtr',
+            'locations': 'nat',
+            'epiweeks': '201410',
+        }))
+        self.assertEqual(
+            response['message'],
+            'too many sensors requested and/or auth tokens presented; '
+            'please divide sensors into batches and/or use only the tokens needed for the sensors requested'
+        )
+        self.assertEqual(response['result'], -1)

-  def test_twtr_auth_succeeds_on_twtr_sensor(self):
-    """Test that the TWTR auth token authenticates a request for (a single copy of) the TWTR sensor."""
-    response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({
-      'endpoint': 'sensors',
-      'auth': secrets.api.sensor_subsets.twtr_sensor,
-      'names': 'twtr',
-      'locations': 'nat',
-      'epiweeks': '201410',
-    }))
-    self.assertNotEqual(response['result'], -1)
+    def test_twtr_auth_succeeds_on_twtr_sensor(self):
+        """Test that the TWTR auth token authenticates a request for (a single copy of) the TWTR sensor."""
+        response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({
+            'endpoint': 'sensors',
+            'auth': secrets.api.sensor_subsets.twtr_sensor,
+            'names': 'twtr',
+            'locations': 'nat',
+            'epiweeks': '201410',
+        }))
+        self.assertNotEqual(response['result'], -1)

-  def test_gft_auth_succeeds_on_gft_sensor(self):
-    """Test that the GFT auth token authenticates a request for the GFT sensor."""
-    response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({
-      'endpoint': 'sensors',
-      'auth': secrets.api.sensor_subsets.gft_sensor,
-      'names': 'gft',
-      'locations': 'nat',
-      'epiweeks': '201410',
-    }))
-    self.assertNotEqual(response['result'], -1)
+    def test_gft_auth_succeeds_on_gft_sensor(self):
+        """Test that the GFT auth token authenticates a request for the GFT sensor."""
+        response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({
+            'endpoint': 'sensors',
+            'auth': secrets.api.sensor_subsets.gft_sensor,
+            'names': 'gft',
+            'locations': 'nat',
+            'epiweeks': '201410',
+        }))
+        self.assertNotEqual(response['result'], -1)

-  def test_ght_auth_succeeds_on_ght_sensors(self):
-    """Test that the GHT auth token authenticates a request for the GHT sensors."""
-    response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({
-      'endpoint': 'sensors',
-      'auth': secrets.api.sensor_subsets.ght_sensors,
-      'names': 'ght,ghtj',
-      'locations': 'nat',
-      'epiweeks': '201410',
-    }))
-    self.assertNotEqual(response['result'], -1)
+    def test_ght_auth_succeeds_on_ght_sensors(self):
+        """Test that the GHT auth token authenticates a request for the GHT sensors."""
+        response =
sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ + 'endpoint': 'sensors', + 'auth': secrets.api.sensor_subsets.ght_sensors, + 'names': 'ght,ghtj', + 'locations': 'nat', + 'epiweeks': '201410', + })) + self.assertNotEqual(response['result'], -1) - def test_cdc_auth_succeeds_on_cdc_sensor(self): - """Test that the CDC auth token authenticates a request for the CDC sensor.""" - response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ - 'endpoint': 'sensors', - 'auth': secrets.api.sensor_subsets.cdc_sensor, - 'names': 'cdc', - 'locations': 'nat', - 'epiweeks': '201410', - })) - self.assertNotEqual(response['result'], -1) + def test_cdc_auth_succeeds_on_cdc_sensor(self): + """Test that the CDC auth token authenticates a request for the CDC sensor.""" + response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ + 'endpoint': 'sensors', + 'auth': secrets.api.sensor_subsets.cdc_sensor, + 'names': 'cdc', + 'locations': 'nat', + 'epiweeks': '201410', + })) + self.assertNotEqual(response['result'], -1) - def test_wiki_auth_succeeds_on_wiki_sensor(self): - """Test that the WIKI auth token authenticates a request for the WIKI sensor.""" - response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ - 'endpoint': 'sensors', - 'auth': secrets.api.sensor_subsets.wiki_sensor, - 'names': 'wiki', - 'locations': 'nat', - 'epiweeks': '201410', - })) - self.assertNotEqual(response['result'], -1) + def test_wiki_auth_succeeds_on_wiki_sensor(self): + """Test that the WIKI auth token authenticates a request for the WIKI sensor.""" + response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ + 'endpoint': 'sensors', + 'auth': secrets.api.sensor_subsets.wiki_sensor, + 'names': 'wiki', + 'locations': 'nat', + 'epiweeks': '201410', + })) + self.assertNotEqual(response['result'], -1) - def test_quid_auth_succeeds_on_quid_sensor(self): - """Test that the QUID auth token authenticates a request for the QUID sensor.""" - response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ - 'endpoint': 'sensors', - 'auth': secrets.api.sensor_subsets.quid_sensor, - 'names': 'quid', - 'locations': 'nat', - 'epiweeks': '201410', - })) - self.assertNotEqual(response['result'], -1) + def test_quid_auth_succeeds_on_quid_sensor(self): + """Test that the QUID auth token authenticates a request for the QUID sensor.""" + response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ + 'endpoint': 'sensors', + 'auth': secrets.api.sensor_subsets.quid_sensor, + 'names': 'quid', + 'locations': 'nat', + 'epiweeks': '201410', + })) + self.assertNotEqual(response['result'], -1) - def test_quid_auth_blocked_on_cdc_sensor(self): - """Test that the QUIDEL auth token doesn't authenticate a request for the CDC sensor.""" - response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ - 'endpoint': 'sensors', - 'auth': secrets.api.sensor_subsets.quid_sensor, - 'names': 'cdc', - 'locations': 'nat', - 'epiweeks': '201410', - })) - self.assertEqual(response['message'], 'unauthenticated/nonexistent sensor(s): cdc') - self.assertEqual(response['result'], -1) + def test_quid_auth_blocked_on_cdc_sensor(self): + """Test that the QUIDEL auth token doesn't authenticate a request for the CDC sensor.""" + response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ + 'endpoint': 'sensors', + 'auth': secrets.api.sensor_subsets.quid_sensor, + 
'names': 'cdc', + 'locations': 'nat', + 'epiweeks': '201410', + })) + self.assertEqual(response['message'], 'unauthenticated/nonexistent sensor(s): cdc') + self.assertEqual(response['result'], -1) - def test_global_auth_succeeds_on_open_closed_sensors(self): - """Test that the global sensor auth token authenticates a request for open and closed sensors.""" - response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ - 'endpoint': 'sensors', - 'auth': secrets.api.sensors, - 'names': 'sar3,arch,ghtj,ght,epic,ght,quidel,cdc,wiki', - 'locations': 'nat', - 'epiweeks': '201410', - })) - self.assertNotEqual(response['result'], -1) + def test_global_auth_succeeds_on_open_closed_sensors(self): + """Test that the global sensor auth token authenticates a request for open and closed sensors.""" + response = sim_api.extract_response_json(sim_api.dangerously_simulate_api_response({ + 'endpoint': 'sensors', + 'auth': secrets.api.sensors, + 'names': 'sar3,arch,ghtj,ght,epic,ght,quidel,cdc,wiki', + 'locations': 'nat', + 'epiweeks': '201410', + })) + self.assertNotEqual(response['result'], -1) diff --git a/tests/server/test_exceptions.py b/tests/server/test_exceptions.py index 94cdc34f1..b4e3989e2 100644 --- a/tests/server/test_exceptions.py +++ b/tests/server/test_exceptions.py @@ -5,30 +5,31 @@ # from flask.testing import FlaskClient from delphi.epidata.server._common import app -from delphi.epidata.server._exceptions import _is_using_status_codes +from delphi.epidata.server._exceptions import _is_using_status_codes # py3tester coverage target __test_target__ = 'delphi.epidata.server._exceptions' + class UnitTests(unittest.TestCase): - """Basic unit tests.""" - # app: FlaskClient + """Basic unit tests.""" + # app: FlaskClient - def setUp(self): - app.config['TESTING'] = True - app.config['WTF_CSRF_ENABLED'] = False - app.config['DEBUG'] = False + def setUp(self): + app.config['TESTING'] = True + app.config['WTF_CSRF_ENABLED'] = False + app.config['DEBUG'] = False - def test_is_using_status_codes(self): - with app.test_request_context('/?format=csv'): - self.assertTrue(_is_using_status_codes()) - with app.test_request_context('/?format=json'): - self.assertTrue(_is_using_status_codes()) - with app.test_request_context('/?format=jsonl'): - self.assertTrue(_is_using_status_codes()) - with app.test_request_context('/'): - self.assertFalse(_is_using_status_codes()) - with app.test_request_context('/?format=classic'): - self.assertFalse(_is_using_status_codes()) - with app.test_request_context('/?format=tree'): - self.assertFalse(_is_using_status_codes()) + def test_is_using_status_codes(self): + with app.test_request_context('/?format=csv'): + self.assertTrue(_is_using_status_codes()) + with app.test_request_context('/?format=json'): + self.assertTrue(_is_using_status_codes()) + with app.test_request_context('/?format=jsonl'): + self.assertTrue(_is_using_status_codes()) + with app.test_request_context('/'): + self.assertFalse(_is_using_status_codes()) + with app.test_request_context('/?format=classic'): + self.assertFalse(_is_using_status_codes()) + with app.test_request_context('/?format=tree'): + self.assertFalse(_is_using_status_codes())
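
Note: the reformatted test_is_using_status_codes above relies on Flask's test_request_context to fake an incoming request before calling a request-dependent helper. Below is a minimal, self-contained sketch of that pattern, not part of this patch; wants_status_codes is a hypothetical stand-in for _is_using_status_codes (whose real logic lives in the server code), shown only to illustrate how such helpers can be unit tested in isolation.

import unittest

from flask import Flask, request

app = Flask(__name__)


def wants_status_codes() -> bool:
    # stand-in logic mirroring the cases asserted in the test above:
    # csv/json/jsonl use plain HTTP status codes; classic/tree/missing do not
    return request.values.get('format', 'classic') in ('csv', 'json', 'jsonl')


class SketchTests(unittest.TestCase):

    def test_wants_status_codes(self):
        # test_request_context pushes a fake request so `request` is usable
        with app.test_request_context('/?format=csv'):
            self.assertTrue(wants_status_codes())
        with app.test_request_context('/'):
            self.assertFalse(wants_status_codes())


if __name__ == '__main__':
    unittest.main()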