# -*- coding: utf-8 -*-
"""Functions to call when running this module.

This module should contain a function called `run_module` that is executed when
the module is run with `python -m delphi_combo_cases_and_deaths`.

This module produces a combined signal for jhu-csse and usa-facts. This signal
is only used for visualization. It sources Puerto Rico from jhu-csse and
everything else from usa-facts.

"""
from datetime import date, timedelta, datetime
from itertools import product
import re
import sys

import covidcast
import pandas as pd

from delphi_utils import read_params, create_export_csv


METRICS = [
    "confirmed",
    "deaths",
]
SMOOTH_TYPES = [
    "",
    "7dav",
]
SENSORS = [
    "incidence_num",
    "cumulative_num",
    "incidence_prop",
    "cumulative_prop",
]
GEO_RESOLUTIONS = [
    "county",
    "state",
    "msa",
    "hrr",
]

def check_not_none(data_frame, label, date_range):
    """Exit gracefully if a data frame we attempted to retrieve is empty."""
    if data_frame is None:
        print(f"{label} not available in range {date_range}")
        sys.exit(1)

def combine_usafacts_and_jhu(signal, geo, date_range):
    """Add rows for Puerto Rico from the JHU signals to the USA-FACTS signals."""
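    # Puerto Rico rows are taken from jhu-csse (geo_value "pr" at the state
    # level, "72000" at the county level); all other rows come from usa-facts.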
    usafacts_df = covidcast.signal("usa-facts", signal, date_range[0], date_range[1], geo)
    jhu_df = covidcast.signal("jhu-csse", signal, date_range[0], date_range[1], geo)
    check_not_none(usafacts_df, "USA-FACTS", date_range)
    check_not_none(jhu_df, "JHU", date_range)

    # State level
    if geo == 'state':
        combined_df = usafacts_df.append(jhu_df[jhu_df["geo_value"] == 'pr'])
    # County level
    elif geo == 'county':
        combined_df = usafacts_df.append(jhu_df[jhu_df["geo_value"] == '72000'])
    # At the MSA and HRR levels the combined signal is the same as USA-FACTS
    else:
        combined_df = usafacts_df

    combined_df = combined_df.drop(["direction"], axis=1)
    combined_df = combined_df.rename({"time_value": "timestamp",
                                      "geo_value": "geo_id",
                                      "value": "val",
                                      "stderr": "se"},
                                     axis=1)
    return combined_df

def extend_raw_date_range(params, sensor_name):
    """Extend the date range of raw sensors to cover the smoother's inputs.

    A complete issue includes smoothed signals as well as all raw data
    that contributed to the smoothed values, so that it's possible to use
    the raw values in the API to reconstruct the smoothed signal at will.

    The smoother we're currently using incorporates the previous 7
    days of data, so we must extend the date range of the raw data
    backwards by 7 days.
    """
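    # Illustrative example: for a raw sensor such as "incidence_num" with a
    # requested range of [2020-09-01, 2020-09-10], the returned range starts at
    # 2020-08-25; a smoothed sensor such as "7dav_incidence_num" keeps the
    # original range.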
    if "7dav" not in sensor_name:
        return [
            params['date_range'][0] - timedelta(days=7),
            params['date_range'][-1]
        ]
    return params['date_range']

def next_missing_day(source, signals):
    """Fetch the first day for which we want to generate new data."""
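    # Illustrative example: if the most lagged of the requested signals has a
    # max_time of 2020-09-13 in the metadata, this returns 2020-09-14.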
    meta_df = covidcast.metadata()
    meta_df = meta_df[meta_df["data_source"] == source]
    meta_df = meta_df[meta_df["signal"].isin(signals)]
    # min: use the max_time of the most lagged signal, in case they differ
    # +timedelta: the subsequent day is the first day of new data to generate
    day = min(meta_df["max_time"]) + timedelta(days=1)
    return day

def sensor_signal(metric, sensor, smoother):
    """Generate the signal name for a particular configuration."""
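    # For example, sensor_signal("confirmed", "incidence_num", "7dav") returns
    # ("7dav_incidence_num", "confirmed_7dav_incidence_num").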
    if smoother == "7dav":
        sensor_name = "_".join([smoother, sensor])
    else:
        sensor_name = sensor
    signal = "_".join([metric, sensor_name])
    return sensor_name, signal

def run_module():
    """Produce a combined cases and deaths signal using data from JHU and USA Facts."""
    variants = [tuple((metric, geo_res) + sensor_signal(metric, sensor, smoother))
                for (metric, geo_res, sensor, smoother) in
                product(METRICS, GEO_RESOLUTIONS, SENSORS, SMOOTH_TYPES)]
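    # Each variant is a (metric, geo_res, sensor_name, signal) tuple, e.g.
    # ("confirmed", "county", "7dav_incidence_num", "confirmed_7dav_incidence_num").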

    params = read_params()
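    # As used below, params is expected to provide "export_start_date" as a
    # [year, month, day] list, "date_range" as "new", "all", or
    # "yyyymmdd-yyyymmdd", the metadata "source" to check for new data, and
    # the "export_dir" to write CSVs to.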
    params['export_start_date'] = date(*params['export_start_date'])
    yesterday = date.today() - timedelta(days=1)
    if params['date_range'] == 'new':
        # only create combined file for the newest update
        # (usually for yesterday, but check just in case)
        params['date_range'] = [
            min(
                yesterday,
                next_missing_day(
                    params["source"],
                    set(variant[-1] for variant in variants)
                )
            ),
            yesterday
        ]
    elif params['date_range'] == 'all':
        # create combined files for all of the historical reports
        params['date_range'] = [params['export_start_date'], yesterday]
    else:
        pattern = re.compile(r'^\d{8}-\d{8}$')
        match_res = re.findall(pattern, params['date_range'])
        if len(match_res) == 0:
            raise ValueError(
                "Invalid date_range parameter. Please choose from (new, all, yyyymmdd-yyyymmdd).")
        try:
            date1 = datetime.strptime(params['date_range'][:8], '%Y%m%d').date()
        except ValueError:
            raise ValueError("Invalid date_range parameter. Please check the first date.")
        try:
            date2 = datetime.strptime(params['date_range'][-8:], '%Y%m%d').date()
        except ValueError:
            raise ValueError("Invalid date_range parameter. Please check the second date.")

        # Use the valid start date
        if date1 < params['export_start_date']:
            date1 = params['export_start_date']
        params['date_range'] = [date1, date2]

    for metric, geo_res, sensor_name, signal in variants:
        create_export_csv(
            combine_usafacts_and_jhu(signal, geo_res, extend_raw_date_range(params, sensor_name)),
            export_dir=params['export_dir'],
            start_date=pd.to_datetime(params['export_start_date']),
            metric=metric,
            geo_res=geo_res,
            sensor=sensor_name,
        )