Skip to content

Commit f55b7ec

Browse files
authored
Merge pull request #149 from krivard/feature/data-versioning-and-optimization
Feature/data versioning and optimization
2 parents 17089cd + c9cf2b4 commit f55b7ec

24 files changed

+1903
-547
lines changed

docs/api/covidcast.md

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,38 @@ and lists.
9494
The current set of signals available for each data source is returned by the
9595
[`covidcast_meta`](covidcast_meta.md) endpoint.
9696

97+
#### Optional
98+
99+
The default API behavior is to return the most recently issued value for each `time_value` selected.
100+
101+
We also provide access to previous versions of data using the optional parameters below.
102+
103+
| Parameter | Description | Type |
104+
| --- | --- | --- |
105+
| `as_of` | maximum time unit (e.g., date) when the signal data were published (return most recent for each `time_value`) | time value (e.g., 20200401) |
106+
| `issues` | time unit (e.g., date) when the signal data were published (return all matching records for each `time_value`) | `list` of time values (e.g., 20200401) |
107+
| `lag` | time delta (e.g. days) between when the underlying events happened and when the data were published | integer |
108+
109+
Use cases:
110+
111+
* To pretend like you queried the API on June 1, such that the returned results
112+
do not include any updates which became available after June 1, use
113+
`as_of=20200601`.
114+
* To retrieve only data that was published or updated on June 1, and exclude
115+
records whose most recent update occurred earlier than June 1, use
116+
`issues=20200601`.
117+
* To retrieve all data that was published between May 1 and June 1, and exclude
118+
records whose most recent update occurred earlier than May 1, use
119+
`issues=20200501-20200601`. The results will include all matching issues for
120+
each `time_value`, not just the most recent.
121+
* To retrieve only data that was published or updated exactly 3 days after the
122+
underlying events occurred, use `lag=3`.
123+
124+
NB: Each issue in the versioning system contains only the records that were
125+
added or updated during that time unit; we exclude records whose values remain
126+
the same as a previous issue. If you have a research problem that would require
127+
knowing when an unchanged value was last confirmed, please get in touch.
128+
97129
### Response
98130

99131
| Field | Description | Type |
@@ -106,6 +138,8 @@ The current set of signals available for each data source is returned by the
106138
| `epidata[].value` | value (statistic) derived from the underlying data source | float |
107139
| `epidata[].stderr` | approximate standard error of the statistic with respect to its sampling distribution, `null` when not applicable | float |
108140
| `epidata[].sample_size` | number of "data points" used in computing the statistic, `null` when not applicable | float |
141+
| `epidata[].issue` | time unit (e.g. date) when this statistic was published | integer |
142+
| `epidata[].lag` | time delta (e.g. days) between when the underlying events happened and when this statistic was published | integer |
109143
| `message` | `success` or error message | string |
110144

111145
**Note:** `result` code 2, "too many results", means that the number of results

integrations/acquisition/covidcast/test_covidcast_meta_caching.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -66,12 +66,14 @@ def test_caching(self):
6666
self.cur.execute('''
6767
insert into covidcast values
6868
(0, 'src', 'sig', 'day', 'state', 20200422, 'pa',
69-
123, 1, 2, 3, 456, 1)
69+
123, 1, 2, 3, 456, 1, 20200422, 0),
70+
(0, 'src', 'sig', 'day', 'state', 20200422, 'wa',
71+
789, 1, 2, 3, 456, 1, 20200423, 1)
7072
''')
7173
self.cur.execute('''
7274
insert into covidcast values
7375
(100, 'src', 'wip_sig', 'day', 'state', 20200422, 'pa',
74-
456, 4, 5, 6, 789, -1)
76+
456, 4, 5, 6, 789, -1, 20200422, 0)
7577
''')
7678

7779
self.cnx.commit()
@@ -90,12 +92,15 @@ def test_caching(self):
9092
'geo_type': 'state',
9193
'min_time': 20200422,
9294
'max_time': 20200422,
93-
'num_locations': 1,
94-
'last_update': 123,
95-
'min_value': 1,
96-
'max_value': 1,
97-
'mean_value': 1,
98-
'stdev_value': 0,
95+
'num_locations': 2,
96+
'last_update': 789,
97+
'min_value': 1.0,
98+
'max_value': 1.0,
99+
'mean_value': '1.0000000',
100+
'stdev_value': '0.0000000',
101+
'max_issue': 20200423,
102+
'min_lag': 0,
103+
'max_lag': 1
99104
}
100105
])
101106
epidata1={'result':1,'message':'success','epidata':epidata1}

integrations/acquisition/covidcast/test_csv_uploading.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Integration tests for covidcast's CSV-to-database uploading."""
22

33
# standard library
4+
from datetime import date
45
import os
56
import unittest
67
from unittest.mock import MagicMock
@@ -95,11 +96,24 @@ def test_uploading(self):
9596
response = Epidata.covidcast(
9697
'src-name', 'test', 'day', 'state', 20200419, '*')
9798

99+
100+
expected_issue_day=date.today()
101+
expected_issue=expected_issue_day.strftime("%Y%m%d")
102+
def apply_lag(expected_epidata):
103+
for dct in expected_epidata:
104+
dct['issue'] = int(expected_issue)
105+
time_value_day = date(year=dct['time_value'] // 10000,
106+
month=dct['time_value'] % 10000 // 100,
107+
day= dct['time_value'] % 100)
108+
expected_lag = (expected_issue_day - time_value_day).days
109+
dct['lag'] = expected_lag
110+
return expected_epidata
111+
98112
# verify data matches the CSV
99113
# NB these are ordered by geo_value
100114
self.assertEqual(response, {
101115
'result': 1,
102-
'epidata': [
116+
'epidata': apply_lag([
103117
{
104118
'time_value': 20200419,
105119
'geo_value': 'ca',
@@ -124,19 +138,20 @@ def test_uploading(self):
124138
'sample_size': 20,
125139
'direction': None,
126140
},
127-
],
141+
]),
128142
'message': 'success',
129143
})
130144

131145
# request CSV data from the API on WIP signal
132146
response = Epidata.covidcast(
133147
'src-name', 'wip_prototype', 'day', 'state', 20200419, '*')
134148

149+
135150
# verify data matches the CSV
136151
# NB these are ordered by geo_value
137152
self.assertEqual(response, {
138153
'result': 1,
139-
'epidata': [
154+
'epidata': apply_lag([
140155
{
141156
'time_value': 20200419,
142157
'geo_value': 'me',
@@ -161,7 +176,7 @@ def test_uploading(self):
161176
'sample_size': 300,
162177
'direction': None,
163178
},
164-
],
179+
]),
165180
'message': 'success',
166181
})
167182

integrations/acquisition/covidcast/test_direction_updating.py

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -61,23 +61,23 @@ def test_uploading(self):
6161
self.cur.execute('''
6262
insert into covidcast values
6363
(0, 'src', 'sig', 'day', 'state', 20200228, 'ca',
64-
123, 2, 0, 0, 0, NULL),
64+
123, 2, 0, 0, 0, NULL, 20200228, 0),
6565
(0, 'src', 'sig', 'day', 'state', 20200229, 'ca',
66-
123, 6, 0, 0, 0, NULL),
66+
123, 6, 0, 0, 0, NULL, 20200229, 0),
6767
(0, 'src', 'sig', 'day', 'state', 20200301, 'ca',
68-
123, 5, 0, 0, 0, NULL),
68+
123, 5, 0, 0, 0, NULL, 20200301, 0),
6969
(0, 'src', 'sig', 'day', 'state', 20200511, 'fl',
70-
123, 1, 0, 0, 0, NULL),
70+
123, 1, 0, 0, 0, NULL, 20200511, 0),
7171
(0, 'src', 'sig', 'day', 'state', 20200512, 'fl',
72-
123, 2, 0, 0, 0, NULL),
72+
123, 2, 0, 0, 0, NULL, 20200512, 0),
7373
(0, 'src', 'sig', 'day', 'state', 20200517, 'fl',
74-
123, 2, 0, 0, 0, NULL),
74+
123, 2, 0, 0, 0, NULL, 20200517, 0),
7575
(0, 'src', 'sig', 'day', 'state', 20200615, 'tx',
76-
123, 9, 0, 0, 456, NULL),
76+
123, 9, 0, 0, 456, NULL, 20200615, 0),
7777
(0, 'src', 'sig', 'day', 'state', 20200616, 'tx',
78-
123, 5, 0, 0, 456, NULL),
78+
123, 5, 0, 0, 456, NULL, 20200616, 0),
7979
(0, 'src', 'sig', 'day', 'state', 20200617, 'tx',
80-
123, 1, 0, 0, 456, 1)
80+
123, 1, 0, 0, 456, 1, 20200617, 0)
8181
''')
8282
self.cnx.commit()
8383

@@ -100,6 +100,8 @@ def test_uploading(self):
100100
'stderr': 0,
101101
'sample_size': 0,
102102
'direction': None,
103+
'issue': 20200228,
104+
'lag': 0
103105
},
104106
{
105107
'time_value': 20200229,
@@ -108,6 +110,8 @@ def test_uploading(self):
108110
'stderr': 0,
109111
'sample_size': 0,
110112
'direction': None,
113+
'issue': 20200229,
114+
'lag': 0
111115
},
112116
{
113117
'time_value': 20200301,
@@ -116,6 +120,8 @@ def test_uploading(self):
116120
'stderr': 0,
117121
'sample_size': 0,
118122
'direction': 1,
123+
'issue': 20200301,
124+
'lag': 0
119125
},
120126
{
121127
'time_value': 20200511,
@@ -124,6 +130,8 @@ def test_uploading(self):
124130
'stderr': 0,
125131
'sample_size': 0,
126132
'direction': None,
133+
'issue': 20200511,
134+
'lag': 0
127135
},
128136
{
129137
'time_value': 20200512,
@@ -132,6 +140,8 @@ def test_uploading(self):
132140
'stderr': 0,
133141
'sample_size': 0,
134142
'direction': None,
143+
'issue': 20200512,
144+
'lag': 0
135145
},
136146
{
137147
'time_value': 20200517,
@@ -140,6 +150,8 @@ def test_uploading(self):
140150
'stderr': 0,
141151
'sample_size': 0,
142152
'direction': 0,
153+
'issue': 20200517,
154+
'lag': 0
143155
},
144156
{
145157
'time_value': 20200615,
@@ -148,6 +160,8 @@ def test_uploading(self):
148160
'stderr': 0,
149161
'sample_size': 0,
150162
'direction': None,
163+
'issue': 20200615,
164+
'lag': 0
151165
},
152166
{
153167
'time_value': 20200616,
@@ -156,6 +170,8 @@ def test_uploading(self):
156170
'stderr': 0,
157171
'sample_size': 0,
158172
'direction': None,
173+
'issue': 20200616,
174+
'lag': 0
159175
},
160176
{
161177
'time_value': 20200617,
@@ -164,6 +180,8 @@ def test_uploading(self):
164180
'stderr': 0,
165181
'sample_size': 0,
166182
'direction': 1,
183+
'issue': 20200617,
184+
'lag': 0
167185
},
168186
],
169187
'message': 'success',

0 commit comments

Comments
 (0)