Skip to content

Commit b8d55e2

Browse files
committed
refactor: replace merge-sort with heapsort
This change replaces the merge sort with a heapsort that uses far fewer compute units (CU). The previous algorithm is very fast on a normal CPU but does not work well in BPF, because all instructions (load, move, ...) have the same cost, there is no cache, and memory-alignment optimizations bring no real benefit. A major benefit of heapsort is that it is non-recursive, which avoids the high stack-frame overhead in BPF, and that it sorts in place, which minimizes the number of copies. Unfortunately there is no way to systematically get the compute usage out of the program test. The `test_benchmark` file contains simple code that helps run benchmarks with various numbers of publishers. In the 32-publisher setup, heapsort reduces the CU from 16.5k to 12k, and in the 64-publisher setup from 37k to 20.5k. These numbers are the worst cases observed on randomized input.
1 parent 9acf172 commit b8d55e2

File tree

13 files changed

+145
-560
lines changed

13 files changed

+145
-560
lines changed

program/c/makefile

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ cpyth-native: features.h
4141
test: features.h
4242
mkdir -p $(OUT_DIR)/test/
4343
gcc -c ./src/oracle/model/test_price_model.c -o $(OUT_DIR)/test/test_price_model.o -fPIC
44-
gcc -c ./src/oracle/sort/test_sort_stable.c -o $(OUT_DIR)/test/test_sort_stable.o -fPIC
4544
gcc -c ./src/oracle/util/test_align.c -o $(OUT_DIR)/test/test_align.o -fPIC
4645
gcc -c ./src/oracle/util/test_avg.c -o $(OUT_DIR)/test/test_avg.o -fPIC
4746
gcc -c ./src/oracle/util/test_hash.c -o $(OUT_DIR)/test/test_hash.o -fPIC
Lines changed: 50 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1,112 +1,81 @@
11
#include "price_model.h"
22
#include "../util/avg.h" /* For avg_2_int64 */
33

4-
#define SORT_NAME int64_sort_ascending
5-
#define SORT_KEY_T int64_t
6-
#include "../sort/tmpl/sort_stable.c"
4+
/*
 * In-place heapsort implementation optimized for minimal compute unit usage
 * in BPF.
 *
 * The array is first turned into a max heap (bottom-up, linear time). Then,
 * to obtain ascending order, the root (current maximum) is repeatedly moved
 * to the end of the shrinking heap and the element it displaced is sifted
 * down from the root.
 *
 * All indices are uint64_t so that they match the type of n: there is no
 * narrowing to int, no signed/unsigned comparison and no signed overflow,
 * whatever the value of n.
 *
 * The sort is not stable and uses no memory besides a few locals.
 *
 * NOTE(review): BSD-derived C libraries declare their own heapsort() in
 * <stdlib.h>; if this file ever includes that header on such a platform the
 * name will clash -- confirm against the supported toolchains.
 */

/*
 * Sifts `value` down the max heap a[0..end), starting from the vacant slot
 * `hole`, and stores it where the heap property holds again. Callers must
 * have saved a[hole] already; it is overwritten.
 */
static inline void heapsort_sift_down(int64_t * a, uint64_t hole, uint64_t end, int64_t value) {
  uint64_t child = hole * 2 + 1;
  while (child < end) {
    /* Pick the larger of the two children (the right one may not exist). */
    if (child + 1 < end && a[child] < a[child + 1]) child++;
    if (value >= a[child]) break;
    /* Move the larger child up instead of swapping; value is written once
       at the end, which saves a store per level. */
    a[hole] = a[child];
    hole = child;
    child = child * 2 + 1;
  }
  a[hole] = value;
}

/*
 * Sorts a[0..n) in ascending order, in place.
 *
 * a: array of n quotes; may be left untouched when n is 0 or 1.
 * n: number of elements in a.
 */
void heapsort(int64_t * a, uint64_t n) {
  /*
   * Bottom-up heapify, linear in time. Only nodes with at least one child
   * (indices n/2-1 down to 0) need sifting; the loop counts i from n/2 to 1
   * and uses i-1 so the unsigned index never wraps below zero.
   */
  for (uint64_t i = n / 2; i > 0; i--) {
    heapsort_sift_down(a, i - 1, n, a[i - 1]);
  }

  /*
   * Repeatedly extract the maximum: the root goes to the last slot of the
   * current heap and the element that lived there is re-inserted from the
   * root into the heap shortened by one.
   */
  for (uint64_t end = n; end > 1; end--) {
    uint64_t last = end - 1;
    int64_t displaced = a[last];
    a[last] = a[0];
    heapsort_sift_down(a, 0, last, displaced);
  }
}
745

846
int64_t *
947
price_model_core( uint64_t cnt,
1048
int64_t * quote,
1149
int64_t * _p25,
1250
int64_t * _p50,
13-
int64_t * _p75,
14-
void * scratch ) {
15-
16-
/* Sort the quotes. The sorting implementation used here is a highly
17-
optimized mergesort (merge with an unrolled insertion sorting
18-
network small n base cases). The best case is ~0.5 n lg n compares
19-
and the average and worst cases are ~n lg n compares.
20-
21-
While not completely data oblivious, this has quite low variance in
22-
operation count practically and this is _better_ than quicksort's
23-
average case and quicksort's worst case is a computational
24-
denial-of-service and timing attack vulnerable O(n^2). Unlike
25-
quicksort, this is also stable (but this stability does not
26-
currently matter ... it might be a factor in future models).
27-
28-
A data oblivious sorting network approach might be viable here with
29-
and would have a completely deterministic operations count. It
30-
currently isn't used as the best known practical approaches for
31-
general n have a worse algorithmic cost (O( n (lg n)^2 )) and,
32-
while the application probably doesn't need perfect obliviousness,
33-
mergesort is still moderately oblivious and the application can
34-
benefit from mergesort's lower operations cost. (The main drawback
35-
of mergesort over quicksort is that it isn't in place, but memory
36-
footprint isn't an issue here.)
37-
38-
Given the operations cost model (e.g. cache friendliness is not
39-
incorporated), a radix sort might be viable here (O(n) in best /
40-
average / worst). It currently isn't used as we expect invocations
41-
with small-ish n to be common and radix sort would be have large
42-
coefficients on the O(n) and additional fixed overheads that would
43-
make it more expensive than mergesort in this regime.
44-
45-
Note: price_model_cnt_valid( cnt ) implies
46-
int64_sort_ascending_cnt_valid( cnt ) currently.
47-
48-
Note: consider filtering out "NaN" quotes (i.e. INT64_MIN)? */
49-
50-
int64_t * sort_quote = int64_sort_ascending_stable( quote, cnt, scratch );
51-
52-
/* Extract the p25
53-
54-
There are many variants with subtle tradeoffs here. One option is
55-
to interpolate when the ideal p25 is bracketed by two samples (akin
56-
to the p50 interpolation above when the number of quotes is even).
57-
That is, for p25, interpolate between quotes floor((cnt-2)/4) and
58-
ceil((cnt-2)/4) with the weights determined by cnt mod 4. The
59-
current preference is to not do that as it is slightly more
60-
complex, doesn't exactly always minimize the current loss function
61-
and is more exposed to the confidence intervals getting skewed by
62-
bum quotes with the number of quotes is small.
63-
64-
Another option is to use the inside quote of the above pair. That
65-
is, for p25, use quote ceil((cnt-2)/4) == floor((cnt+1)/4) ==
66-
(cnt+1)>>2. The current preference is not to do this as, though
67-
this has stronger bum quote robustness, it results in p25==p50==p75
68-
when cnt==3. (In this case, the above wants to do an interpolation
69-
between quotes 0 and 1 to for the p25 and between quotes 1 and 2
70-
for the p75. But limiting to just the inside quote results in
71-
p25/p50/p75 all using the median quote.)
72-
73-
A tweak to this option, for p25, is to use floor(cnt/4) == cnt>>2.
74-
This is simple, has the same asymptotic behavior for large cnt, has
75-
good behavior in the cnt==3 case and practically as good bum quote
76-
rejection in the moderate cnt case. */
51+
int64_t * _p75) {
52+
heapsort(quote, cnt);
7753

54+
/* Extract the p25 */
7855
uint64_t p25_idx = cnt >> 2;
79-
80-
*_p25 = sort_quote[p25_idx];
56+
*_p25 = quote[p25_idx];
8157

8258
/* Extract the p50 */
83-
8459
if( (cnt & (uint64_t)1) ) { /* Odd number of quotes */
85-
8660
uint64_t p50_idx = cnt >> 1; /* ==ceil((cnt-1)/2) */
87-
88-
*_p50 = sort_quote[p50_idx];
89-
61+
*_p50 = quote[p50_idx];
9062
} else { /* Even number of quotes (at least 2) */
91-
9263
uint64_t p50_idx_right = cnt >> 1; /* == ceil((cnt-1)/2)> 0 */
9364
uint64_t p50_idx_left = p50_idx_right - (uint64_t)1; /* ==floor((cnt-1)/2)>=0 (no overflow/underflow) */
9465

95-
int64_t vl = sort_quote[p50_idx_left ];
96-
int64_t vr = sort_quote[p50_idx_right];
66+
int64_t vl = quote[p50_idx_left];
67+
int64_t vr = quote[p50_idx_right];
9768

9869
/* Compute the average of vl and vr (with floor / round toward
9970
negative infinity rounding and without possibility of
10071
intermediate overflow). */
101-
10272
*_p50 = avg_2_int64( vl, vr );
10373
}
10474

10575
/* Extract the p75 (this is the mirror image of the p25 case) */
10676

10777
uint64_t p75_idx = cnt - ((uint64_t)1) - p25_idx;
78+
*_p75 = quote[p75_idx];
10879

109-
*_p75 = sort_quote[p75_idx];
110-
111-
return sort_quote;
80+
return quote;
11281
}

program/c/src/oracle/model/price_model.h

Lines changed: 2 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -8,91 +8,12 @@
88
extern "C" {
99
#endif
1010

11-
/* Returns the minimum and maximum number of quotes the implementation
12-
can handle */
13-
14-
static inline uint64_t
15-
price_model_quote_min( void ) {
16-
return (uint64_t)1;
17-
}
18-
19-
static inline uint64_t
20-
price_model_quote_max( void ) {
21-
return (UINT64_MAX-(uint64_t)alignof(int64_t)+(uint64_t)1) / (uint64_t)sizeof(int64_t);
22-
}
23-
24-
/* price_model_cnt_valid returns non-zero if cnt is a valid value or
25-
zero if not. */
26-
27-
static inline int
28-
price_model_cnt_valid( uint64_t cnt ) {
29-
return price_model_quote_min()<=cnt && cnt<=price_model_quote_max();
30-
}
31-
32-
/* price_model_scratch_footprint returns the number of bytes of scratch
33-
space needed for an arbitrarily aligned scratch region required by
34-
price_model to handle price_model_quote_min() to cnt quotes
35-
inclusive. */
36-
37-
static inline uint64_t
38-
price_model_scratch_footprint( uint64_t cnt ) { /* Assumes price_model_cnt_valid( cnt ) is true */
39-
/* cnt int64_t's plus worst case alignment padding, no overflow
40-
possible as cnt is valid at this point */
41-
return cnt*(uint64_t)sizeof(int64_t)+(uint64_t)alignof(int64_t)-(uint64_t)1;
42-
}
43-
44-
/* price_model_core minimizes (to quote precision in a floor / round
45-
toward negative infinity sense) the loss model of the given quotes.
46-
Assumes valid inputs (e.g. cnt is at least 1 and not unreasonably
47-
large ... typically a multiple of 3 but this is not required,
48-
quote[i] for i in [0,cnt) are the quotes of interest on input, p25,
49-
p50, p75 point to where to write model outputs, scratch points to a
50-
suitable footprint scratch region).
51-
52-
Returns a pointer to the quotes sorted in ascending order. As such,
53-
the min and max and any other rank statistic can be extracted easily
54-
on return. This location will either be quote itself or to a
55-
location in scratch. Use price_model below for a variant that always
56-
replaces quote with the sorted quotes (potentially has extra ops for
57-
copying). Further, on return, *_p25, *_p50, *_p75 will hold the loss
58-
model minimizing values for the input quotes and the scratch region
59-
was clobbered.
60-
61-
Scratch points to a memory region of arbitrary alignment with at
62-
least price_model_scratch_footprint( cnt ) bytes and it will be
63-
clobbered on output. It is sufficient to use a normally aligned /
64-
normally allocated / normally declared array of cnt int64_t's.
65-
66-
The cost of this function is a fast and low variance (but not
67-
completely data oblivious) O(cnt lg cnt) in the best / average /
68-
worst cases. This function uses no heap / dynamic memory allocation.
69-
It is thread safe provided it passed non-conflicting quote, output
70-
and scratch arrays. It has a bounded call depth ~lg cnt <= ~64 (this
71-
could reduce to O(1) by using a non-recursive sort/select
72-
implementation under the hood if desired). */
73-
74-
int64_t * /* Returns pointer to sorted quotes (either quote or ALIGN_UP(scratch,int64_t)) */
11+
int64_t *
7512
price_model_core( uint64_t cnt, /* Assumes price_model_cnt_valid( cnt ) is true */
7613
int64_t * quote, /* Assumes quote[i] for i in [0,cnt) is the i-th quote on input */
7714
int64_t * _p25, /* Assumes *_p25 is safe to write to the p25 model output */
7815
int64_t * _p50, /* Assumes *_p50 " */
79-
int64_t * _p75, /* Assumes *_p75 " */
80-
void * scratch ); /* Assumes a suitable scratch region */
81-
82-
/* Same as the above but always returns quote and quote always holds the
83-
sorted quotes on return. */
84-
85-
static inline int64_t *
86-
price_model( uint64_t cnt,
87-
int64_t * quote,
88-
int64_t * _p25,
89-
int64_t * _p50,
90-
int64_t * _p75,
91-
void * scratch ) {
92-
int64_t * tmp = price_model_core( cnt, quote, _p25, _p50, _p75, scratch );
93-
if( tmp!=quote ) for( uint64_t idx=(uint64_t)0; idx<cnt; idx++ ) quote[ idx ] = tmp[ idx ];
94-
return quote;
95-
}
16+
int64_t * _p75); /* Assumes *_p75 " */
9617

9718
#ifdef __cplusplus
9819
}

program/c/src/oracle/model/test_price_model.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,11 @@ int test_price_model() {
1919
prng_t _prng[1];
2020
prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) );
2121

22-
# define N 96
22+
# define N 192
2323

2424
int64_t quote0 [N];
2525
int64_t quote [N];
2626
int64_t val [3];
27-
int64_t scratch[N];
2827

2928
for( int iter=0; iter<10000000; iter++ ) {
3029

@@ -36,7 +35,7 @@ int test_price_model() {
3635
/* Apply the model */
3736

3837
memcpy( quote, quote0, sizeof(int64_t)*(size_t)cnt );
39-
if( price_model( cnt, quote, val+0, val+1, val+2, scratch )!=quote ) { printf( "FAIL (compose)\n" ); return 1; }
38+
if( price_model_core( cnt, quote, val+0, val+1, val+2)!=quote ) { printf( "FAIL (compose)\n" ); return 1; }
4039

4140
/* Validate the results */
4241

program/c/src/oracle/sort/sort_stable_base_gen.c

Lines changed: 0 additions & 94 deletions
This file was deleted.

0 commit comments

Comments
 (0)