Skip to content

Commit bd92813

Browse files
authored
Merge pull request #718 from cmu-delphi/sgratzl/missing_signals_detector
Add a missing signal detector helper job
2 parents bf037d9 + 69e7e40 commit bd92813

File tree

3 files changed

+116
-0
lines changed

3 files changed

+116
-0
lines changed
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
name: Missing Signal Detector

# Manually triggered helper job. It runs the detector script and, when the
# script fails, uploads the CSV hint file the script generated.
on:
  workflow_dispatch:

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Check out code
        uses: actions/checkout@v2
        with:
          ref: dev
      - name: Set up Python 3.8
        uses: actions/setup-python@v2
        with:
          # Quoted so YAML does not parse the version as a float.
          python-version: '3.8'
      - name: Install Dependencies
        run: pip install requests pandas
      - name: Run Missing Signals Detector
        run: python scripts/report_missing_covidcast_meta.py
      - name: Upload Missing Artifact
        # The detector exits with status 1 when it finds missing signals, so
        # the report only exists (and is only uploaded) on failure.
        if: failure()
        # v1/v2 of the artifact actions have been retired by GitHub.
        uses: actions/upload-artifact@v4
        with:
          name: missing_db_signals.csv
          # Must match the file name written by the detector script.
          path: missing_db_signals.csv
28+

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ __pycache__/
77
/build
88
/node_modules
99
.mypy_cache
10+
/missing_db_signals.csv
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
from typing import Dict, List, Tuple, Union
2+
from requests import get
3+
import sys
4+
import pandas as pd
5+
from pathlib import Path
6+
7+
# Parent of the directory that contains this script; input and output CSV
# paths below are resolved relative to it.
base_dir = Path(__file__).parent.parent
8+
# Base URL of the Epidata API that the metadata endpoints are fetched from.
base_url = 'https://delphi.cmu.edu/epidata'
9+
10+
def is_known_missing(source: str, signal: str) -> bool:
    """Return True when a (source, signal) pair is on the ignore list.

    A pair is ignored when the signal name contains ``7dav_cumulative`` or
    when the source is one of the explicitly listed data sources.
    """
    ignored_sources = ('youtube-survey', 'indicator-combination')
    return '7dav_cumulative' in signal or source in ignored_sources
16+
17+
def _fetch_json(url: str, timeout: float = 60):
    """GET *url* and return the decoded JSON body.

    A timeout is always applied so an unresponsive server cannot hang the
    job, and HTTP error statuses raise instead of being parsed as JSON.
    """
    response = get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()


def compute_missing_signals() -> List[Tuple[Tuple[str, str], Dict]]:
    """Find computed signals that have no definition in the signal metadata.

    Fetches the defined signals from ``covidcast/meta`` and the computed
    metadata from ``covidcast_meta`` and compares them by
    ``(data source, signal)`` key.

    Returns:
        One ``((source, signal), entry)`` tuple per computed signal that is
        neither defined nor covered by ``is_known_missing``; ``entry`` is the
        first computed metadata entry seen for that key.

    Raises:
        requests.RequestException: if a request fails, times out, or returns
            an HTTP error status.
    """
    defined_signals: Dict[Tuple[str, str], Dict] = {}
    for source in _fetch_json(f"{base_url}/covidcast/meta"):
        for signal in source['signals']:
            # Register each signal under both its own source and the
            # enclosing db_source so either spelling counts as defined.
            defined_signals[(signal['source'], signal['signal'])] = signal
            defined_signals[(source['db_source'], signal['signal'])] = signal

    # Group computed entries by key; a key can have several entries.
    computed_signals: Dict[Tuple[str, str], List[Dict]] = {}
    for entry in _fetch_json(f"{base_url}/covidcast_meta/?format=json"):
        computed_signals.setdefault((entry['data_source'], entry['signal']), []).append(entry)

    return [
        (key, infos[0])
        for key, infos in computed_signals.items()
        if not defined_signals.get(key) and not is_known_missing(key[0], key[1])
    ]
38+
39+
40+
def gen_row(source: str, signal: str, info: Dict) -> Dict:
    """Build a best-guess signal-definition row for an undocumented signal.

    Args:
        source: data source the signal belongs to.
        signal: signal name.
        info: computed metadata entry for the signal; only ``time_type`` is read.

    Returns:
        A dict keyed by column name. Boolean-like columns hold the strings
        ``'TRUE'``/``'FALSE'`` (or ``''``), and values that cannot be guessed
        are set to ``'TODO'``.
    """
    weighted_prefix = 'smoothed_w'
    # Prefixes that start with ``smoothed_w`` but are excluded from the
    # weighted heuristic (presumably ordinary words after ``smoothed_`` -
    # TODO confirm against the real signal list).
    unweighted_prefixes = (
        'smoothed_wa',
        'smoothed_we',
        'smoothed_wi',
        'smoothed_wo',
        'smoothed_wu',
    )
    is_weighted = signal.startswith(weighted_prefix) and not signal.startswith(unweighted_prefixes)
    # Strip only the leading ``w`` marker; later occurrences of the prefix
    # text inside the name must stay untouched.
    base_name = ('smoothed_' + signal[len(weighted_prefix):]) if is_weighted else signal
    is_survey = source == 'fb-survey'

    def bool_str(flag) -> str:
        return 'TRUE' if flag else 'FALSE'

    return {
        'Source Subdivision': source,
        'Signal BaseName': base_name,
        'base_is_other': bool_str(False),
        'Signal': signal,
        # Written as a TRUE/FALSE string like every other flag column.
        'Compute From Base': bool_str(False),
        'Name': f"{base_name} (Weighted)" if is_weighted else signal,
        'Active': bool_str(True),
        # Derived (weighted) signals leave the descriptions empty.
        'Short Description': 'TODO' if base_name == signal else '',
        'Description': 'TODO' if base_name == signal else '',
        'Time Type': info['time_type'],
        'Time Label': 'Week' if info['time_type'] == 'week' else 'Day',
        'Value Label': 'Percentage' if is_survey else 'Value',
        'Format': 'percent' if is_survey else 'raw',
        'Category': 'public' if is_survey else 'other',
        'High Values Are': 'neutral',
        'Is Smoothed': bool_str(signal.startswith('smoothed') or '7dav' in signal),
        'Is Weighted': bool_str(is_weighted),
        'Is Cumulative': bool_str('cumulative' in signal),
        'Has StdErr': 'TRUE' if is_survey else '',
        'Has Sample Size': 'TRUE' if is_survey else '',
        'Link': 'TODO',
    }
68+
69+
def generate_missing_info_hint(missing_signals: List[Tuple[Tuple[str, str], Dict]]) -> None:
    """Write guessed definition rows for the missing signals to a CSV file.

    Each ``((source, signal), info)`` item is turned into a row via
    ``gen_row``. The existing ``db_signals.csv`` is read only to borrow its
    column order, and the result is written to ``missing_db_signals.csv``
    under ``base_dir``.
    """
    guessed_rows = [gen_row(key[0], key[1], info) for key, info in missing_signals]
    missing_frame = pd.DataFrame.from_records(guessed_rows)

    # Keep the header of the current definitions but none of their rows, so
    # the concatenated output follows the established column order.
    template_path = base_dir / 'src/server/endpoints/covidcast_utils/db_signals.csv'
    template = pd.read_csv(template_path)
    template = template[0:0]

    guessed: pd.DataFrame = pd.concat([template, missing_frame])
    output_path = base_dir / 'missing_db_signals.csv'
    guessed.to_csv(output_path, index=False)
78+
79+
# Script driver: exit 0 when every computed signal is defined; otherwise
# write the hint CSV and exit 1 so the calling job is marked as failed.
missing = compute_missing_signals()
if not missing:
    print('all signals found')
    sys.exit(0)

print(f'found {len(missing)} missing signals')
generate_missing_info_hint(missing)
sys.exit(1)
87+

0 commit comments

Comments
 (0)