diff --git a/CMakeLists.txt b/CMakeLists.txt index 2a13188f1..291e15b41 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,7 +39,7 @@ set( PC_SRC pc/request.cpp; pc/rpc_client.cpp; pc/user.cpp; - program/src/oracle/sort.c + program/src/oracle/model/price_model.c ) set( PC_HDR @@ -116,9 +116,8 @@ add_test( test_net test_net ) # fuzz testing application # -add_executable( fuzz pctest/fuzz.cpp ) -target_link_libraries( fuzz ${PC_DEP} ) - +#add_executable( fuzz pctest/fuzz.cpp ) +#target_link_libraries( fuzz ${PC_DEP} ) # # bpf static analysis (CLion) diff --git a/program/src/oracle/model/clean b/program/src/oracle/model/clean new file mode 100755 index 000000000..860ef1641 --- /dev/null +++ b/program/src/oracle/model/clean @@ -0,0 +1,3 @@ +#!/bin/sh +rm -rfv bin + diff --git a/program/src/oracle/model/gap_model.h b/program/src/oracle/model/gap_model.h new file mode 100644 index 000000000..99df18f6b --- /dev/null +++ b/program/src/oracle/model/gap_model.h @@ -0,0 +1,236 @@ +#ifndef _pyth_oracle_model_gap_model_h_ +#define _pyth_oracle_model_gap_model_h_ + +/* gap_model computes our apriori belief how comparable + (agg_price,agg_conf) pairs measured at different points in time are + expected to be. + + Let ps / pd be our apriori belief that aggregated values measured at + two different block sequence numbers (separated by gap where gap is + strictly positive) are representative of statistically similar / + distinct price distributions (note that ps+pd=1). Then gap_model + computes: + + r/2^30 ~ ps / pd + + The specific model used below is: + + ps ~ exp( -gap / lambda ) + + where lambda is the timescale (in block sequence numbers) over which + it is believed to be sensible to consider (agg_price,agg_conf) + price distribution parameterizations comparable. This model reduces + to a standard EMA when gap is always 1 but naturally generalizes to + non-uniform sampling (e.g. chain congestion), large gaps (e.g. + EOD-to-next-SOD) and initialization (gap->infinity). For this model, + lambda is known at compile time (see GAP_MODEL_LAMBA) and on return r + will be in [0,lambda 2^30]. Gap==0 should not be passed to this + model but for specificity, the model here returns 0 if it is. + + For robustness and accuracy of the finite precision computation, + lambda should be in 1 << lambda << 294801. The precision of the + calculation is impacted by the EXP2M1_FXP_ORDER (default value is + beyond IEEE single precision accurate). + + Note that by modifying this we can correct for subtle sampling + effects for specific chains and/or symbols (SOD, EOD, impact of + different mechanisms for chain congestion) without needing to modify + anything else. We could even incorporate time invariant processes by + tweaking the API here to take the block sequence numbers of the + samples in question. + + If GAP_MODEL_NEED_REF is defined, this will also declare a high + accuracy floating point reference implementation "gap_model_ref" to + aid in testing / tuning block chain suitable approximations. */ + +#include "../util/exp2m1.h" + +#ifdef GAP_MODEL_NEED_REF +#include +#endif + +/* GAP_MODEL_LAMBDA should an integer in [1,294802) + (such that 720 GAP_MODEL_LAMBDA^3 < 2^64 */ + +#ifndef GAP_MODEL_LAMBDA +#define GAP_MODEL_LAMBDA (3000) +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef GAP_MODEL_NEED_REF + +/* Returns ps/pd ~ 1/(exp(gap/GAP_MODEL_LAMBDA)-1) if gap is positive + and 0 otherwise */ + +static inline long double +gap_model_ref( long double gap ) { + return gap>0.L ? 
(1.L / expm1l( gap / ((long double)GAP_MODEL_LAMBDA) ) ) : 0.L; +} + +#endif + +static inline uint64_t /* In [0,2^30 GAP_MODEL_LAMBDA] */ +gap_model( uint64_t gap ) { /* Non-zero */ + + /* Let x = gap/lambda. Then: + + r/2^30 = ps / pd = exp(-x) / ( 1 - exp(-x) ) + + For x<<1, the series expansion is: + + r/2^30 ~ 1/x - 1/2 + (1/12) x - (1/720) x^3 + (1/30240) x^5 - ... + + Noting that this is a convergent series with alternating terms, the + error in truncated this series at the x^3 term is at most the x^5 + term. Requiring then the x^5 term to be at most 2^-31 guarantees + the series truncation error will be at most 1/2 ulp. This implies + the series can be used for a practically full precision if: + + x <~ (30240 2^-31)^(1/5) ~ 0.10709... + -> gap < 0.10709... lambda + + The truncated series in terms of gap can be written as: + + r/2^30 ~ lambda/gap - 1/2 + gap/(12 lambda) - gap^3/(720 lambda^3) + = (lambda + gap/2 + gap^2 ( 1/(12 lambda) - gap^2 (1/(720 lambda^3)) )) / gap + + Convering the division to an integer division with grade school + rounding: + + r ~ floor( ( 2^(30+s) lambda + gap (2^(29+s)-2^(s-1)) + + gap^2 ( 2^(30+s)/(12 lambda) - gap^2 2^(30+s) / (720 lambda^3) ) ) / (2^s gap) ) + + where s is a positive integer picked to maximize precision while + not causing intermediate overflow. We require 720 lambda^3 < 2^64 + -> lambda <= lambda_max = 294801 (such that the coefficients above + are straightforward to compute at compile time). + + Given gap <= 0.10709 lambda_max, gap^2 can be computed exactly + without overflow. Adding an intermediate scale factor to improve + the accuracy of the correction coefficients then yields: + + r ~ floor( cp + gap c0 + floor((gap^2 (c1 - gap^2 c3) + 2^(t-1)) / 2^t) ) / (2^s gap) ) + + where: + + cp = 2^(30+s) lambda + c0 = 2^(29+s)-2^(s-1) + c1 = 2^(30+s+t)/(12 lambda) + c3 = 2^(30+s+t)/(720 lambda^3) + + To prevent overflow in the c1 and c3 corrections, it is sufficient that: + + s+t < log(12 2^34 / lambda_max ) ~ 22.7 + + So we use s+t==22. Maximizing precision in the correction + calculation then is achieved with s==1 and t==21. (The pole and + constant calculations are exact so long as s is positive.) This + yields the small gap approximation: + + r ~ floor( (cp + gap c0 + (gap^2 ( c1 - gap^2 c3 ) + 2^20)>>21) / 2 gap) ) + + cp = 2^31 lambda + c0 = 2^30 - 1 + c1 = floor( (2^52 + 6 lambda)/(12 lambda) ) + c3 = floor( (2^52 + 360 lambda^3)/(720 lambda^3) ) + + This is ~30-bits accurate when gap < 0.10709... lambda. + + For x>>1, multiplying by 1 = exp(x) / exp(x) yields: + + r/2^30 = 1 / ( exp(x) - 1 ) + = 1 / ( exp2(x log_2 e) - 1 ) + = 1 / ( exp2( gap ((2^30 log_2 e)/lambda) ) - 1 ) + ~ 1 / exp2m1( (gap c + 2^(s-1)) >> s ) + = 2^30 / exp2m1_fxp( (gap c + 2^(s-1)) >> s ) + + where c = round( (2^30+s) log_2 e / lambda ) and s is picked to + maximize precision of computing gap / lambda without causing + overflow. Further evaluating: + + c ~ ((2^(31+s)) log_2 e + lambda) / (lambda<<1) + d = exp2m1_fxp( (gap*c + 2^(s-1)) >> s ) + r ~ (2^61 + d) / (d<<1) + + For the choice of s, we note that if: + + exp(-x) / (1-exp(-x)) <= 2^-31 + (1+2^-31) exp(-x) <= 2^-31 + exp(-x) <= 2^-31/(1+2^-31) + x >= -ln (2^31/(1+2^-31)) ~ 21.488... + + The result will round to zero. Thus, we only care about cases + where: + + gap < thresh + + and: + + thresh ~ 21.5 lambda + + We thus require: + + c (thresh-1) + 2^(s-1) < 2^64 + (2^(30+s)) log_2 e 21.5 < ~2^64 + 2^s <~ 2^34 / (21.5 log_2 e) + s <~ 34 - log_2 (21.5 log_2 e ) ~ 29.045... 
+ + So we use s == 29 when computing compile time constant c. + + (Note: while it looks like we could just use the x>>1 approximation + everywhere, it isn't so accurate for gap ~ 1 as the error in the + c*gap is amplified exponentially into the output.) */ + + /* If we need the behavior that this never returns 0 (i.e. we are + never 100 percent confident that samples separated by a large gap + cannot be compared), tweak big_thresh for the condition + + exp(-x) / (1-exp(-x)) <= 3 2^-31 + + and return 1 here. */ + + static uint64_t const big_thresh = (UINT64_C(43)*(uint64_t)GAP_MODEL_LAMBDA) >> 1; + if( !gap || gap>=big_thresh ) return UINT64_C(0); + + /* round( 0.10709... lambda ) <= 3191 */ +//static uint64_t const small_thresh = (UINT64_C(0x1b69f363043ef)*(uint64_t)GAP_MODEL_LAMBDA + (UINT64_C(1)<<51)) >> 52; + + /* hand tuned value minimizes worst case ulp error in the + EXP2M1_ORDER=7 case */ + static uint64_t const small_thresh = (UINT64_C(0x2f513cc1e098e)*(uint64_t)GAP_MODEL_LAMBDA + (UINT64_C(1)<<51)) >> 52; + + if( gap<=small_thresh ) { + + /* 2^31 lambda, exact */ + static uint64_t const cp = ((uint64_t)GAP_MODEL_LAMBDA)<<31; + + /* 2^30 - 1, exact */ + static uint64_t const c0 = (UINT64_C(1)<<30) - UINT64_C(1); + + /* round( 2^52 / (12 lambda) ) */ + static uint64_t const c1 = ((UINT64_C(1)<<52) + UINT64_C(6)*(uint64_t)GAP_MODEL_LAMBDA) + / (UINT64_C(12)*(uint64_t)GAP_MODEL_LAMBDA); + + /* round( 2^52 / (720 lambda^3) ) */ + static uint64_t const c3 = ((UINT64_C(1)<<52) + UINT64_C(360)*((uint64_t)GAP_MODEL_LAMBDA)*((uint64_t)GAP_MODEL_LAMBDA)*((uint64_t)GAP_MODEL_LAMBDA)) + / (UINT64_C(720)*((uint64_t)GAP_MODEL_LAMBDA)*((uint64_t)GAP_MODEL_LAMBDA)*((uint64_t)GAP_MODEL_LAMBDA)); + + uint64_t gap_sq = gap*gap; /* exact, at most 3191^2 (no overflow) */ + return (cp - c0*gap + (( gap_sq*(c1 - gap_sq*c3) + (UINT64_C(1)<<20))>>21)) / (gap<<1); + } + + /* round( 2^59 log_2 / lambda ) */ + static uint64_t const c = ((UINT64_C(0x171547652b82fe18)) + (uint64_t)GAP_MODEL_LAMBDA) / (((uint64_t)GAP_MODEL_LAMBDA)<<1); + + uint64_t d = exp2m1_fxp( (gap*c + (UINT64_C(1)<<28)) >> 29 ); + return ((UINT64_C(1)<<61) + d) / (d<<1); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_model_gap_model_h_ */ diff --git a/program/src/oracle/model/leaky_integrator.h b/program/src/oracle/model/leaky_integrator.h new file mode 100644 index 000000000..46a8e6171 --- /dev/null +++ b/program/src/oracle/model/leaky_integrator.h @@ -0,0 +1,82 @@ +#ifndef _pyth_oracle_model_leaky_integrator_h_ +#define _pyth_oracle_model_leaky_integrator_h_ + +/* Compute: + + (z/2^30) ~ (y/2^30)(w/2^30) + x + + Specifically returns: + + co 2^128 + = round( w / 2^30 ) + 2^30 x + + where co is the "carry out". (Can be used to detect overflow and/or + automatically rescale as necessary.) Assumes w is in [0,2^30]. + Calculation is essentially full precision O(1) and more accurate than + a long double implementation for a wide range inputs. + + If LEAKY_INTEGRATOR_NEED_REF is defined, this will also declare a + high accuracy floating point reference implementation + "leaky_integrator_ref" to aid in testing / tuning block chain + suitable approximations. (Note: In this case, the limitations of + IEEE long double floating point mean in this case leaky_integator_ref + is actually expected to be _less_ _precise_ than leaky_integrator + when the time scale * price scales involved require more than 64-bit + mantissas.) 
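+
+   As a small sanity-check sketch (hypothetical values; the uwide based
+   implementation below is authoritative): y = 2^30 (i.e. y/2^30==1),
+   w = 2^29 (i.e. w/2^30==0.5) and x = 3 should give z/2^30 ~ 1*0.5 + 3:
+
+     uint64_t zh,zl;
+     uint64_t co = leaky_integrator( &zh,&zl, UINT64_C(0),UINT64_C(1)<<30,
+                                     UINT64_C(1)<<29, UINT64_C(3) );
+
+   yielding co==0 and (zh,zl)==(0,UINT64_C(7)<<29), i.e. 3.5 in the same
+   2^30 scale as y.
+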
*/ + +#include "../util/uwide.h" + +#ifdef LEAKY_INTEGRATOR_NEED_REF +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef LEAKY_INTEGRATOR_NEED_REF + +/* Compute z ~ y w + x */ + +static inline long double +leaky_integrator_ref( long double y, + long double w, + long double x ) { + return fmal(y,w,x); +} + +#endif + +static inline uint64_t +leaky_integrator( uint64_t * _zh, uint64_t * _zl, + uint64_t yh, uint64_t yl, + uint64_t w, /* In [0,2^30] */ + uint64_t x ) { + + /* Start from the exact expression: + + z/2^30 = (y/2^30)*(w/2^30) + x + + This yields: + + z = (y w)/2^30 + x 2^30 + z = (yh w 2^64 + yl w)/2^30 + 2^30 x + = yh w 2^34 + yl w/2^30 + 2^30 x + ~ yh w 2^34 + floor((yl w + 2^29)/2^30) + 2^30 x */ + + uint64_t t0_h,t0_l; uwide_mul( &t0_h,&t0_l, yh, w ); /* At most 2^94-2^30 */ + uwide_sl( &t0_h,&t0_l, t0_h,t0_l, 34 ); /* At most 2^128-2^64 (so cannot overflow) */ + + uint64_t t1_h,t1_l; uwide_mul( &t1_h,&t1_l, yl, w ); /* At most 2^94-2^30 */ + uwide_inc( &t1_h,&t1_l, t1_h,t1_l, UINT64_C(1)<<29 ); /* At most 2^94-2^29 (all ones 30:93) */ + uwide_sr ( &t1_h,&t1_l, t1_h,t1_l, 30 ); /* At most 2^64-1 (all ones 0:63) (t1_h==0) */ + + uint64_t xh,xl; uwide_sl( &xh,&xl, UINT64_C(0),x, 30 ); /* At most 2^94-2^30 */ + return uwide_add( _zh,_zl, t0_h,t0_l, xh,xl, t1_l ); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_model_leaky_integrator_h_ */ + diff --git a/program/src/oracle/model/overlap_model.h b/program/src/oracle/model/overlap_model.h new file mode 100644 index 000000000..2e9432a35 --- /dev/null +++ b/program/src/oracle/model/overlap_model.h @@ -0,0 +1,341 @@ +#ifndef _pyth_oracle_model_overlap_model_h_ +#define _pyth_oracle_model_overlap_model_h_ + +/* overlap_model computes our prior belief whether two + (agg_price,agg_conf) estimates parameterize the same underlying + process. Specifically, let ps / pd be our apriori belief that two + different price distribution estimates came from the same / different + underlying process. Then overlap_model computes: + + r/2^30 ~ ps / pd + + Let Lij be the expected likelihood of IID sample drawn from a + Gaussian distribution with mean/rms mu_i/sigma_i for a Gaussian + distribution with mean/rms mu_j/sigma_j. The implementation below + corresponds to: + + ps / pd = weight sqrt( (L01 L10) / (L00 L11) ) + + After some fun with integrals: + + ps / pd = weight sqrt( 2 sigma_0 sigma_1 / (sigma_0^2 + sigma_1^2) ) + exp( -0.5 (mu_0-mu_1)^2 / (sigma_0^2 + sigma_1^2) ) + + where the estimates to compare are (mu0,sigma0) and (mu1,sigma1) and + weight is the ps / pd prior to use for identical estimates (e.g. + what our prior belief that two similar agg_price,agg_conf estimates + just happen to be parameterizing different processes with similar + distributions). As such the output is in [0,weight 2^30]. This + model is heuristic but well motivated theoretically with good + limiting behaviors and robust numerical implementations. + + The implementation below has hardcoded weight to 1 and also has a + tunable but hardcoded sigma normalization factor of 1. If the user + wants to run with different weights and normalizations, see notes + below ("weight", "alpha" and "beta" will need to be adjusted). + + This implementation has been crafted to be completely price scale + invariant and the result should be accurate to O(1) ulp. 
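+
+   For intuition, in terms of the floating point reference described
+   next (illustrative values only): identical estimates map to the full
+   weight while estimates many sigma apart map to ~0, e.g.
+
+     overlap_model_ref( 100.L, 5.L, 100.L, 5.L ); // == weight == 1
+     overlap_model_ref( 100.L, 5.L, 200.L, 5.L ); // ~ exp(-100) ~ 0
+
+   and overlap_model returns the same quantities scaled by 2^30 (up to
+   its stated accuracy).
+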
If + OVERLAP_MODEL_NEED_REF is defined, this will also declare a high + accuracy floating point reference implementation "overlap_model_ref" + to aid in testing / tuning block chain suitable approximations. */ + +#include "../util/fxp.h" +#include "../util/sar.h" + +#ifdef OVERLAP_MODEL_NEED_REF +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef OVERLAP_MODEL_NEED_REF + +/* Compute ps/pd ~ weight sqrt( 2 sigma_0 sigma_1 / (sigma_0^2 + sigma_1^2) ) + exp( -0.5 norm^2 (mu_0-mu_1)^2 / (sigma_0^2 + sigma_1^2) ) + where weight==1 and norm==1. If sigma_0 is not positive, it will be + treated as zero. Similarly for sigma_1. Assumes inputs are tame + such that there are no risks of nans, infs or intermediate overflows. */ + +static inline long double +overlap_model_ref( long double mu_0, long double sigma_0, + long double mu_1, long double sigma_1 ) { + static long double const weight = 1.L; + static long double const norm = 1.L; + if( !(sigma_0>0.L) ) sigma_0 = 0.L; + if( !(sigma_1>0.L) ) sigma_1 = 0.L; + if( sigma_0==0.L && sigma_1==0.L ) return mu_0==mu_1 ? 1.L : 0.L; + long double kappa_sq = 1.L/(sigma_0*sigma_0 + sigma_1*sigma_1); + long double delta = mu_0-mu_1; + return weight*sqrtl( (2.L*kappa_sq)*(sigma_0*sigma_1) )*expl( -((0.5L*norm*norm)*kappa_sq)*(delta*delta) ); +} + +#endif + +static inline uint64_t +overlap_model( uint64_t mu_0, uint64_t sigma_0, + uint64_t mu_1, uint64_t sigma_1 ) { + uint64_t ovfl; + + /* Methodology: + + If we compute sqrt and exp terms of r separately, the sqrt term + will end up with calculations like sqrt( 1/2^30 ) ~ 1/2^15 with a + loss of the bits after the decimal point. This has the effect of + roughly halving the precision in extreme cases. To reduce the + impact of this and speed up the calculation, we instead absorb the + sqrt into the exp term. Further, since exp2 is faster and more + accurate to compute, we rewrite r as: + + r/2^30 ~ exp2( zeta_0 - zeta_1 ) + + where: + + zeta_0 = log2( weight sqrt( 2 sigma_0 sigma_1 / (sigma_0^2 + sigma_1^2) ) ) + zeta_1 = alpha (mu_0 - mu_1)^2 / (sigma_0^2 + sigma_1^2) + alpha = 0.5 log2(e) norm^2 = 0.721... norm^2 + + and norm is a factor of how to the model should normalize the input + sigma values to a "Gaussian RMS equivalent". + + We don't have any prior price scale restrictions for mu and sigma + so we let: + + sigma_min = min(sigma_0,sigma_1) + sigma_max = max(sigma_0,sigma_1) + + With this we have: + + zeta_1 = delta^2 t_sigma + + where: + + delta = (mu_0-mu_1) / sigma_max + t_sigma = alpha / ( 1 + ratio_sigma^2 ) ... in [alpha/2,alpha] + ratio_sigma = sigma_min / sigma_max ... in [0,1] when sigma_max>0 + + For zeta_0, we note: + + zeta_0 = 0.5 + log2( weight ) + 0.5 log2( ratio_sigma / ( 1 + ratio_sigma^2 ) ) + + To optimize this calculation we note that the log2 term can be + rewritten in terms of t_sigma from above: + + zeta_0 = 0.5 + log2( weight ) + 0.5 log2( ratio_sigma t_sigma / alpha ) + = beta + 0.5 log2( ratio_sigma t_sigma ) + + where: + + beta = 0.5 + log2( weight ) - 0.5 log2( alpha ) + + Like alpha, beta can be tuned to taste to reflect how identical + inputs distributions should be weighted. + + This yields the method (in exact math): + + sigma_max = max( sigma_0, sigma_1 ) + if sigma_max==0, return mu_0==mu_1 ? weight : 0 ... 
two delta functions were input, return weight if same and 0 if not + sigma_min = min( sigma_0, sigma_1 ) + delta = (mu_0-mu_1) / sigma_max + ratio_sigma = sigma_min / sigma_max + t_sigma = alpha / ( 1 + ratio_sigma^2 ) + zeta_0 = beta + 0.5 log2( ratio_sigma t_sigma ) + zeta_1 = delta^2 t_sigma + return exp2( zeta_0 - zeta_1 ) + + This is straightforward to convert to fixed points as described in + the details below. */ + + /* Compute: + + sigma_min = min(sigma_0,sigma_1) + sigma_max = max(sigma_0,sigma_1). + + If both are zero (i.e. sigma_max==0), we have two delta functions + and we return weight if they are the same delta function (i.e. + mu_0==mu_1) or 0 if not. If just one is a delta function (i.e. + sigma_max!=0 and sigma_min==0), we return 0 to save a lot of work. */ + + /* FIXME: HARDCODED TO weight==1 */ + static uint64_t const weight = UINT64_C(1)<<30; /* round( 2^30 weight ) */ + + int sigma_c = sigma_0 <= sigma_1; + uint64_t sigma_max = sigma_c ? sigma_1 : sigma_0; + if( !sigma_max ) return mu_0==mu_1 ? weight : UINT64_C(0); + + uint64_t sigma_min = sigma_c ? sigma_0 : sigma_1; + if( !sigma_min ) return UINT64_C(0); + + /* At this point, sigma_max >= sigma_min > 0. Compute: + + delta/2^30 ~ |mu_0-mu_1| / sigma_max + + with round-nearest-away style rounding. For a fixed delta, the + overlap model result is maximized when sigma_0==sigma_1 (maximizes + simultaneously the sqrt factor and the exp factor). In this limit + we have: + + r = weight exp( -0.25 norm^2 delta^2 ) + + If this is less than 2^-31, the result necessarily round to zero + regardless of sigma_0 or sigma_1. In equations, if: + + weight exp( -0.25 norm^2 (delta/2^30)^2 ) < 2^-31 + -> -0.25 norm^2 (delta/2^30)^2 < ln( 2^-31 / weight ) + -> delta/2^30 > 2^30 sqrt( -4 ln( 2^-31 / weight ) / norm^2 ) + ~ 9.2709 for weight==1 and norm==1 + + the result rounds to zero and do not need to compute further. This + implies that if: + + delta > thresh + + where + + thresh ~ ceil( 2^30 sqrt( -4 ln( 2^-31 / weight ) / norm^2 ) ) + + we know the result rounds to zero and can return immediately. This + restricts the range of delta we need to handle which in turn + simplifies the remaining fixed point conversions. + + We can't use fxp_div_rna_fast because the div might overflow (e.g. + mu_max==UINT64_MAX, mu_min=0, sigma_max=2 such that delta/2^30~2^63 + -> delta~2^93 but this gets handled as part of the thresh check) */ + + /* FIXME: HARDCODED TO weight==1 AND norm==1 */ + static uint64_t const thresh = UINT64_C(0x251570310); /* ~33.3 bits */ + + int mu_c = mu_0 <= mu_1; + uint64_t mu_min = mu_c ? mu_0 : mu_1; + uint64_t mu_max = mu_c ? mu_1 : mu_0; + uint64_t delta = fxp_div_rna( mu_max-mu_min, sigma_max, &ovfl ); + if( ovfl || delta>thresh ) return UINT64_C(0); + + /* Compute: + + ratio_sigma/2^30 ~ sigma_min/sigma_max + = (sigma_min/2^30) / (sigma_max/2^30) + + with round-nearest-away style rounding. We can't use + fxp_div_rna_fast here because 2^30 sigma_min might be >= 2^64. We + know that the result will not overflow though as + sigma_min<=sigma_max at this point. + + FIXME: IDEALLY WE'D FIND A REPRESENTATION OF 1/SIGMA_MAX AND USE + THAT HERE AND ABOVE TO SAVE A SLOW FXP_DIV. THIS IS TRICKY TO MAKE + ROBUST GIVEN LACK OF LIMITS ON DYNAMIC RANGE OF SIGMA. */ + + uint64_t ratio_sigma = fxp_div_rna( sigma_min, sigma_max, &ovfl ); + + /* At this point, ratio_sigma is in [0,2^30] (it might have rounded + to zero if the sigmas are wildly different). 
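+
+     (Illustrative numbers: sigma_min==3 with sigma_max==12 gives
+     ratio_sigma==round( 2^30*3/12 )==UINT64_C(1)<<28, i.e. 0.25 in the
+     2^30 fixed point scale.)
+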
Compute: + + t_sigma/2^30 ~ alpha / (1+(ratio_sigma/2^30)^2) + ~ in [alpha/2,alpha] + + to fixed point precision. Interval is approximate as alpha itself + might be rounded from its ideal value. For typical cases, + 0.5>1)) / d ) + + (or, in code): + + uint64_t d = (UINT64_C(1)<<60) + ratio_sigma*ratio_sigma; + uint64_t nh,nl; uwide_inc( &nh,&nl, UINT64_C(0x2e2a8ec),UINT64_C(0xa5705fc2f0000000), d>>1 ); // ~2^90 alpha (use higher precision) + uwide_div( &nh,&nl, nh,nl, d ); + uint64_t t_sigma = nl; + + does this calculation with precise round nearest away rounding but + it is a lot more expensive due to the use of 128b/64b division and + the extra precision empirically doesn't improve the overall + accuracy to justify the extra work. + + fxp_mul_rna_fast is okay as ratio_sigma <= 2^30 + fxp_add_fast is okay 2^30 + ratio_sigma^2/2^30 <= 2^31 + fxp_div_rna_fast is okay as alpha 2^30 + (2^31>>1) < 2^64 */ + + /* FIXME: HARDCODED TO norm==1 */ + static uint64_t const alpha = UINT64_C(0x2e2a8eca); /* round( 2^30 0.5 log2(e) norm^2 ) */ + + uint64_t t_sigma = fxp_div_rna_fast( alpha, fxp_add_fast( UINT64_C(1)<<30, fxp_mul_rna_fast( ratio_sigma, ratio_sigma ) ) ); + + /* Compute: + + e + (f/2^30) ~ 0.5 log2( ratio_sigma t_sigma ) + ~ 0.5 ( e0 + f0/2^30 ) + = floor(e0/2) + 0.5 (e0 is odd) + f0/2^31 + = floor(e0/2) + (2^30 (e0 is odd) + f0) / 2^31 + = (e0>>1) + (((e0&1)<<30) + f0) / 2^31 + ~ (e0>>1) + ((((e0&1)<<30) + f0 + 1)>>1) / 2^30 + -> e = (e0>>1) with sign extension + f = (((e0&1)<<30)+f0+1)>>1) + + If ratio_sigma t_sigma == 0 (or rounds to zero), the log2 term is + going to -inf such that the result will round to zero. (Note that + this might be a source of precision loss when sigmas are wildly + dissimilar.) + + fxp_mul_rna_fast is okay here is ratio_sigma and t_sigma both fit + in O(30) bits. */ + + uint64_t f0; int e0 = fxp_log2_approx( &f0, fxp_mul_rna_fast( ratio_sigma, t_sigma ) ); + if( e0<-30 ) return UINT64_C(0); + int32_t e = sar_int32( (int32_t)e0, 1 ); /* |e| <= 16 */ + uint64_t f = ((((uint64_t)(e0&1)) << 30) + f0 + UINT64_C(1)) >> 1; + + /* Compute: + + zeta0/2^30 ~ beta + f/2^30 + ifelse(e<0, 0, e) + zeta1/2^30 ~ delta^2 t_sigma + ifelse(e<0,-e, 0) + + such that: + + (zeta0-zeta1)/2^30 ~ -delta^2 t_sigma + beta + 0.5 log2( ratio_sigma t_sigma ) + + The ifelse trick allows handling of signed exponents without + needing to use a signed fixed point representation. + + delta^2 t_sigma isn't generally safe to compute with fast fxp + multiplies (for the current hard coded weight==1 and norm==1, + delta*t_sigma is safe but delta^2 isn't and delta*(delta*t_sigma) + isn't due to potential intermediate overflow risk). We know there + is no risk of overflow of the result though. */ + + /* FIXME: HARDCODED TO WEIGHT==1 and NORM==1 */ + static uint64_t const beta = UINT64_C(0x2f14588b); /* round( 2^30 ( 0.5 + log2(weight) - 0.5*log2(alpha) ) ), fits in 30-bits */ + + uint64_t zeta0 = fxp_add_fast( beta, f ); + uint64_t zeta1 = fxp_mul_rna( fxp_mul_rna( delta, delta, &ovfl ), t_sigma, &ovfl ); + if( e1). So we use the appropriate exp2 variant to avoid a + division. + + fxp_sub_fast is okay because we always subtract the smaller from + the larger. */ + + uint64_t r = (zeta0>=zeta1) ? fxp_exp2_approx( fxp_sub_fast( zeta0, zeta1 ) ) + : fxp_rexp2_approx( fxp_sub_fast( zeta1, zeta0 ) ); + return (r>2. The current preference is not to do this as, though + this has stronger bum quote robustness, it results in p25==p50==p75 + when cnt==3. 
(In this case, the above wants to do an interpolation + between quotes 0 and 1 to for the p25 and between quotes 1 and 2 + for the p75. But limiting to just the inside quote results in + p25/p50/p75 all using the median quote.) + + A tweak to this option, for p25, is to use floor(cnt/4) == cnt>>2. + This is simple, has the same asymptotic behavior for large cnt, has + good behavior in the cnt==3 case and practically as good bum quote + rejection in the moderate cnt case. */ + + uint64_t p25_idx = cnt >> 2; + + *_p25 = sort_quote[p25_idx]; + + /* Extract the p50 */ + + if( (cnt & (uint64_t)1) ) { /* Odd number of quotes */ + + uint64_t p50_idx = cnt >> 1; /* ==ceil((cnt-1)/2) */ + + *_p50 = sort_quote[p50_idx]; + + } else { /* Even number of quotes (at least 2) */ + + uint64_t p50_idx_right = cnt >> 1; /* == ceil((cnt-1)/2)> 0 */ + uint64_t p50_idx_left = p50_idx_right - (uint64_t)1; /* ==floor((cnt-1)/2)>=0 (no overflow/underflow) */ + + int64_t vl = sort_quote[p50_idx_left ]; + int64_t vr = sort_quote[p50_idx_right]; + + /* Compute the average of vl and vr (with floor / round toward + negative infinity rounding and without possibility of + intermediate overflow). */ + + *_p50 = avg_2_int64( vl, vr ); + } + + /* Extract the p75 (this is the mirror image of the p25 case) */ + + uint64_t p75_idx = cnt - ((uint64_t)1) - p25_idx; + + *_p75 = sort_quote[p75_idx]; + + return sort_quote; +} + diff --git a/program/src/oracle/model/price_model.h b/program/src/oracle/model/price_model.h new file mode 100644 index 000000000..0a667c72f --- /dev/null +++ b/program/src/oracle/model/price_model.h @@ -0,0 +1,101 @@ +#ifndef _pyth_oracle_model_price_model_h_ +#define _pyth_oracle_model_price_model_h_ + +#include "../util/compat_stdint.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Returns the minimum and maximum number of quotes the implementation + can handle */ + +static inline uint64_t +price_model_quote_min( void ) { + return (uint64_t)1; +} + +static inline uint64_t +price_model_quote_max( void ) { + return (UINT64_MAX-(uint64_t)alignof(int64_t)+(uint64_t)1) / (uint64_t)sizeof(int64_t); +} + +/* price_model_cnt_valid returns non-zero if cnt is a valid value or + zero if not. */ + +static inline int +price_model_cnt_valid( uint64_t cnt ) { + return price_model_quote_min()<=cnt && cnt<=price_model_quote_max(); +} + +/* price_model_scratch_footprint returns the number of bytes of scratch + space needed for an arbitrarily aligned scratch region required by + price_model to handle price_model_quote_min() to cnt quotes + inclusive. */ + +static inline uint64_t +price_model_scratch_footprint( uint64_t cnt ) { /* Assumes price_model_cnt_valid( cnt ) is true */ + /* cnt int64_t's plus worst case alignment padding, no overflow + possible as cnt is valid at this point */ + return cnt*(uint64_t)sizeof(int64_t)+(uint64_t)alignof(int64_t)-(uint64_t)1; +} + +/* price_model_core minimizes (to quote precision in a floor / round + toward negative infinity sense) the loss model of the given quotes. + Assumes valid inputs (e.g. cnt is at least 1 and not unreasonably + large ... typically a multiple of 3 but this is not required, + quote[i] for i in [0,cnt) are the quotes of interest on input, p25, + p50, p75 point to where to write model outputs, scratch points to a + suitable footprint srcatch region). + + Returns a pointer to the quotes sorted in ascending order. As such, + the min and max and any other rank statistic can be extracted easily + on return. 
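+
+   For example (an illustrative sketch with made up quotes):
+
+     int64_t quote[3] = { INT64_C(12), INT64_C(10), INT64_C(11) };
+     int64_t p25, p50, p75;
+     int64_t scratch[3];
+     int64_t * sorted = price_model_core( UINT64_C(3), quote, &p25, &p50, &p75, scratch );
+
+   should give (p25,p50,p75)==(10,11,12) with the returned location
+   holding the sorted quotes 10,11,12.
+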
This location will either be quote itself or to a + location in scratch. Use price_model below for a variant that always + replaces quote with the sorted quotes (potentially has extra ops for + copying). Further, on return, *_p25, *_p50, *_p75 will hold the loss + model minimizing values for the input quotes and the scratch region + was clobbered. + + Scratch points to a memory region of arbitrary alignment with at + least price_model_scratch_footprint( cnt ) bytes and it will be + clobbered on output. It is sufficient to use a normally aligned / + normally allocated / normally declared array of cnt int64_t's. + + The cost of this function is a fast and low variance (but not + completely data oblivious) O(cnt lg cnt) in the best / average / + worst cases. This function uses no heap / dynamic memory allocation. + It is thread safe provided it passed non-conflicting quote, output + and scratch arrays. It has a bounded call depth ~lg cnt <= ~64 (this + could reducd to O(1) by using a non-recursive sort/select + implementation under the hood if desired). */ + +int64_t * /* Returns pointer to sorted quotes (either quote or ALIGN_UP(scratch,int64_t)) */ +price_model_core( uint64_t cnt, /* Assumes price_model_cnt_valid( cnt ) is true */ + int64_t * quote, /* Assumes quote[i] for i in [0,cnt) is the i-th quote on input */ + int64_t * _p25, /* Assumes *_p25 is safe to write to the p25 model output */ + int64_t * _p50, /* Assumes *_p50 " */ + int64_t * _p75, /* Assumes *_p75 " */ + void * scratch ); /* Assumes a suitable scratch region */ + +/* Same as the above but always returns quote and quote always holds the + sorted quotes on return. */ + +static inline int64_t * +price_model( uint64_t cnt, + int64_t * quote, + int64_t * _p25, + int64_t * _p50, + int64_t * _p75, + void * scratch ) { + int64_t * tmp = price_model_core( cnt, quote, _p25, _p50, _p75, scratch ); + if( tmp!=quote ) for( uint64_t idx=(uint64_t)0; idx +#include +#include "../util/prng.h" + +#define GAP_MODEL_NEED_REF +#define EXP2M1_FXP_ORDER 7 /* EXP2M1_FXP_ORDER 4 yields a better than IEEE single precision accurate approximation */ +#include "gap_model.h" + +static uint64_t const ulp_thresh[8] = { + UINT64_C(0), + UINT64_C(1260649965), /* max_rerr ~ 3.9e-04 */ + UINT64_C(95149836), /* max_rerr ~ 3.0e-05 */ + UINT64_C(1348998), /* max_rerr ~ 4.2e-07 */ + UINT64_C(67273), /* max_rerr ~ 2.1e-08 */ + UINT64_C(3331), /* max_rerr ~ 1.0e-09 */ + UINT64_C(40), /* max_rerr ~ 1.2e-11 */ + UINT64_C(21) /* max_rerr ~ 6.5e-12 */ +}; + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + uint64_t max_ulp = UINT64_C(0); + uint64_t thresh = ulp_thresh[ EXP2M1_FXP_ORDER ]; + + for( int i=0; i<100*GAP_MODEL_LAMBDA; i++ ) { + + uint64_t x; + if( i<(22*GAP_MODEL_LAMBDA) ) x = (uint64_t)i; + else { + int s = ((int)prng_uint32( prng )) & 63; + x = prng_uint64( prng ) >> s; + } + + uint64_t y = gap_model( x ); + uint64_t z = (uint64_t)roundl( gap_model_ref( x )*(long double)(1<<30) ); + + if( !x ) { + if( y!=z ) { printf( "FAIL (iter %i: special x %lu y %lu z %lu)\n", i, x, y, z ); return 1; } + continue; + } + if( y>(((uint64_t)GAP_MODEL_LAMBDA)<<30) ) { printf( "FAIL (iter %i: range x %lu y %lu z %lu)\n", i, x, y, z ); return 1; } + + uint64_t ulp = y>z ? 
(y-z) : (z-y); + if( ulp>thresh ) { printf( "FAIL (iter %i: ulp x %lu y %lu z %lu ulp %lu)\n", i, x, y, z, ulp ); return 1; } + if( ulp>max_ulp ) max_ulp = ulp; + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass (max_rerr %.1e max ulp %lu)\n", (double)max_ulp / (((double)(1<<30))*((double)GAP_MODEL_LAMBDA)), max_ulp ); + + return 0; +} + diff --git a/program/src/oracle/model/test_leaky_integrator.c b/program/src/oracle/model/test_leaky_integrator.c new file mode 100644 index 000000000..8b3568974 --- /dev/null +++ b/program/src/oracle/model/test_leaky_integrator.c @@ -0,0 +1,62 @@ +#include +#include +#include "../util/prng.h" + +#define LEAKY_INTEGRATOR_NEED_REF +#include "leaky_integrator.h" + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + long double max_rerr = 0.L; + + static long double const _2_30 = (long double)(UINT64_C(1)<<30); + static long double const _2_n30 = 1.L/(long double)(UINT64_C(1)<<30); + static long double const _2_64 = 18446744073709551616.L; + static long double const _2_128 = 18446744073709551616.L*18446744073709551616.L; + + int ctr = 0; + for( int iter=0; iter<100000000; iter++ ) { + if( !ctr ) { printf( "Completed %u iterations\n", iter ); ctr = 10000000; } + ctr--; + + uint32_t t = prng_uint32( prng ); + + uint64_t yh = prng_uint64( prng ); + uint64_t yl = prng_uint64( prng ); + uwide_sr( &yh,&yl, yh,yl, (int)(t & UINT32_C(127)) ); t >>= 7; + long double y = (_2_64*(long double)yh) + (long double)yl; + + uint64_t w = (uint64_t)(prng_uint32( prng ) >> (t & UINT32_C(31))); t >>= 5; + if( w>(UINT64_C(1)<<30) ) w = (UINT64_C(1)<<30); + + uint64_t x = prng_uint64( prng ) >> (t & UINT32_C(63)); t >>= 6; + + uint64_t zh,zl; uint64_t co = leaky_integrator( &zh,&zl, yh,yl, w, x ); + long double z = ((long double)co)*_2_128 + ((long double)zh)*_2_64 + ((long double)zl); + + long double z_ref = roundl( leaky_integrator_ref( y, ((long double)w)*_2_n30, _2_30*(long double)x ) ); + + long double aerr = fabsl( z-z_ref ); + long double rerr = aerr / fmaxl( 1.L, z_ref ); + + if( rerr > 2.6e-14 ) { + printf( "FAIL (iter %i: y %Le w %lx x %lx z %Le z_ref %Le aerr %.1Le rerr %.1Le\n", iter, + y, w, x, z, z_ref, aerr, rerr ); + return 1; + } + max_rerr = fmaxl( rerr, max_rerr ); + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass (max_rerr %.1Le)\n", max_rerr ); + + return 0; +} + diff --git a/program/src/oracle/model/test_overlap_model.c b/program/src/oracle/model/test_overlap_model.c new file mode 100644 index 000000000..7a24eecb3 --- /dev/null +++ b/program/src/oracle/model/test_overlap_model.c @@ -0,0 +1,63 @@ +#include +#include +#include "../util/prng.h" + +#define OVERLAP_MODEL_NEED_REF +#include "overlap_model.h" + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + long double max_ulp_fine = 0.L; + long double max_ulp_coarse = 0.L; + + long double const thresh_fine = 2.6L; /* max_rerr 2.4e-09 */ + long double const thresh_coarse = 32768.L; /* max_rerr 3.1e-05 */ + + int ctr = 0; + for( int iter=0; iter<100000000; iter++ ) { + if( !ctr ) { printf( "Completed %u iterations\n", iter ); ctr = 10000000; } + ctr--; + + /* Generate random tests that really stress price invariance assumptions */ + + uint32_t t = prng_uint32( prng ); + uint64_t mu_0 = prng_uint64( prng ) >> (t & (uint32_t)63); t >>= 6; + uint64_t mu_1 = 
prng_uint64( prng ) >> (t & (uint32_t)63); t >>= 6; + uint64_t sigma_0 = prng_uint64( prng ) >> (t & (uint32_t)63); t >>= 6; + uint64_t sigma_1 = prng_uint64( prng ) >> (t & (uint32_t)63); t >>= 6; + + /* When sigmas are really dissimilar, limitations of the fixed point + representation limit accuracy to ~2^15 ulp. So we do separate + states for when the two distributions have similar widths. */ + + uint64_t sigma_min = sigma_0= (sigma_max>>1)); /* sigmas of the two are comparable */ + + long double y = (long double)overlap_model( mu_0,sigma_0, mu_1,sigma_1 ); + long double z = ((long double)(1<<30))*overlap_model_ref( (long double)mu_0,(long double)sigma_0, + (long double)mu_1,(long double)sigma_1 ); + long double ulp = fabsl( y - z ); + if( ulp>=(fine ? thresh_fine : thresh_coarse) ) { + printf( "FAIL (iter %i: i(%lu,%lu) j(%lu,%lu) y %Lf z %Lf ulp %Lf)\n", iter, mu_0, sigma_0, mu_1, sigma_1, y, z, ulp ); + return 1; + } + if( fine && ulp>max_ulp_fine ) max_ulp_fine = ulp; + if( ulp>max_ulp_coarse ) max_ulp_coarse = ulp; + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass (fine: max_rerr %.1Le max ulp %.1Lf, coarse: max_rerr %.1Le max ulp %.1Lf)\n", + max_ulp_fine / ((long double)(UINT64_C(1)<<30)), max_ulp_fine, + max_ulp_coarse / ((long double)(UINT64_C(1)<<30)), max_ulp_coarse ); + + return 0; +} + diff --git a/program/src/oracle/model/test_price_model.c b/program/src/oracle/model/test_price_model.c new file mode 100644 index 000000000..321ff1746 --- /dev/null +++ b/program/src/oracle/model/test_price_model.c @@ -0,0 +1,69 @@ +#include +#include +#include +#include "../util/util.h" +#include "price_model.h" + +int +qcmp( void const * _p, + void const * _q ) { + int64_t p = *(int64_t const *)_p; + int64_t q = *(int64_t const *)_q; + if( p < q ) return -1; + if( p > q ) return 1; + return 0; +} + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + +# define N 96 + + int64_t quote0 [N]; + int64_t quote [N]; + int64_t val [3]; + int64_t scratch[N]; + + int ctr = 0; + for( int iter=0; iter<10000000; iter++ ) { + if( !ctr ) { printf( "Completed %u iterations\n", iter ); ctr = 100000; } + ctr--; + + /* Generate a random test */ + + uint64_t cnt = (uint64_t)1 + (uint64_t)(prng_uint32( prng ) % (uint32_t)N); /* In [1,N], approx uniform IID */ + for( uint64_t idx=(uint64_t)0; idx>2; + uint64_t p50_idx = cnt>>1; + uint64_t p75_idx = cnt - (uint64_t)1 - p25_idx; + uint64_t is_even = (uint64_t)!(cnt & (uint64_t)1); + + if( val[0]!=quote[ p25_idx ] ) { printf( "FAIL (p25)\n" ); return 1; } + if( val[1]!=avg_2_int64( quote[ p50_idx-is_even ], quote[ p50_idx ] ) ) { printf( "FAIL (p50)\n" ); return 1; } + if( val[2]!=quote[ p75_idx ] ) { printf( "FAIL (p75)\n" ); return 1; } + } + +# undef N + + prng_delete( prng_leave( prng ) ); + + printf( "pass\n" ); + return 0; +} + diff --git a/program/src/oracle/model/test_weight_model.c b/program/src/oracle/model/test_weight_model.c new file mode 100644 index 000000000..14a09933c --- /dev/null +++ b/program/src/oracle/model/test_weight_model.c @@ -0,0 +1,47 @@ +#include +#include +#include "../util/prng.h" + +#define WEIGHT_MODEL_NEED_REF +#include "weight_model.h" + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + static long double const _2_30 = (long double)(UINT64_C(1)<<30); + static long double const _2_n30 = 
1.L/(long double)(UINT64_C(1)<<30); + + uint64_t max_ulp = UINT64_C(0); + + int ctr = 0; + for( int iter=0; iter<100000000; iter++ ) { + if( !ctr ) { printf( "Completed %u iterations\n", iter ); ctr = 10000000; } + ctr--; + + uint32_t t = prng_uint32( prng ); + uint64_t ratio_gap = prng_uint64( prng ) >> (t & (uint32_t)63); t >>= 6; + uint64_t ratio_overlap = prng_uint64( prng ) >> (t & (uint32_t)63); t >>= 6; + + uint64_t y = weight_model( ratio_gap, ratio_overlap ); + uint64_t z = (uint64_t)roundl( _2_30*weight_model_ref( _2_n30*(long double)ratio_gap, _2_n30*(long double)ratio_overlap ) ); + + uint64_t ulp = y>z ? y-z : z-y; + if( y>(UINT64_C(1)<<30) || ulp>UINT64_C(1) ) { + printf( "FAIL (iter %i: gap %lx overlap %lx y %lx z %lx ulp %lu)\n", iter, ratio_gap, ratio_overlap, y, z, ulp ); + return 1; + } + if( ulp>max_ulp ) max_ulp = ulp; + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass (max_rerr %.1e max ulp %lu)\n", (double)max_ulp / (double)(1UL<<30), max_ulp ); + + return 0; +} + diff --git a/program/src/oracle/model/twap_model.c b/program/src/oracle/model/twap_model.c new file mode 100644 index 000000000..eaa3ae7c8 --- /dev/null +++ b/program/src/oracle/model/twap_model.c @@ -0,0 +1,73 @@ +#define REXP2_FXP_ORDER 7 +#define EXP2M1_FXP_ORDER 7 + +#include "twap_model.h" + +#include "gap_model.h" +#include "overlap_model.h" +#include "weight_model.h" +#include "leaky_integrator.h" +#include "../util/uwide.h" + +twap_model_t * +twap_model_update( twap_model_t * tm, + uint64_t now, + uint64_t agg_price, + uint64_t agg_conf ) { + + /* If this is the first observation, reset */ + + if( !tm->valid ) goto reset; + + /* If sequence numbers didn't advance since last observation, reset + (this logic also correctly handles sequence number wrapping). */ + + int64_t gap = (int64_t)(now-tm->ts); + if( gap<=INT64_C(0) ) goto reset; + + /* Compute weight/2^30 ~ our Bayesian probability that this + (agg_price,agg_conf) estimate parameterizes a price distribution + similiar to the most recent observed price distribution (event + SIM). */ + + uint64_t weight = weight_model( gap_model( (uint64_t)gap ), overlap_model( tm->twap,tm->twac, agg_price,agg_conf ) ); + + /* The number of samples if SIM is D+1 and prob SIM = weight/2^30 + The number of samples if ~SIM is 1 and prob ~SIM = 1-weight/2^30 + Expected number of recent samples available to average over then: + (D+1)*weight + 1*(1-weight) == D*weight + 1 + Thus we can apply a leaky integrator recursion for D and similarly + for the twap_N and twac_N. While these should never have any + overflow or what not given a GAP_MODEL_LAMBDA and + OVERLAP_MODEL_WEIGHT, we reset if we have even the slightest hint + of an issue to fail safe. */ + + uint64_t tmp_h,tmp_l; + + if( leaky_integrator( &tm->twap_Nh,&tm->twap_Nl, tm->twap_Nh,tm->twap_Nl, weight, agg_price ) ) goto reset; + if( leaky_integrator( &tm->twac_Nh,&tm->twac_Nl, tm->twac_Nh,tm->twac_Nl, weight, agg_conf ) ) goto reset; + if( leaky_integrator( &tmp_h, &tm->D, UINT64_C(0),tm->D, weight, UINT64_C(1) ) || tmp_h ) goto reset; + + /* Compute twap ~ twap_N/D via: + twap = floor( (twap_N+(D/2)) / D ) + and similarly for twac. 
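+
+     (The +D/2 term makes the floor division round to nearest with ties
+     rounding up; as an illustration of the identity, a numerator of 7
+     with D==2 gives floor( (7+1)/2 )==4.)
+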
*/ + + tm->valid = INT8_C(1); + tm->ts = now; + if( uwide_add( &tmp_h,&tmp_l, tm->twap_Nh,tm->twap_Nl, UINT64_C(0),UINT64_C(0), tm->D>>1 ) ) goto reset; + if( uwide_div( &tmp_h,&tm->twap, tmp_h,tmp_l, tm->D ) || tmp_h ) goto reset; + if( uwide_add( &tmp_h,&tmp_l, tm->twac_Nh,tm->twac_Nl, UINT64_C(0),UINT64_C(0), tm->D>>1 ) ) goto reset; + if( uwide_div( &tmp_h,&tm->twac, tmp_h,tmp_l, tm->D ) || tmp_h ) goto reset; + return tm; + +reset: + tm->valid = INT8_C(1); + tm->ts = now; + tm->twap = agg_price; + tm->twac = agg_conf; + uwide_sl( &tm->twap_Nh,&tm->twap_Nl, UINT64_C(0),agg_price, 30 ); + uwide_sl( &tm->twac_Nh,&tm->twac_Nl, UINT64_C(0),agg_conf, 30 ); + tm->D = UINT64_C(1)<<30; + return tm; +} + diff --git a/program/src/oracle/model/twap_model.h b/program/src/oracle/model/twap_model.h new file mode 100644 index 000000000..3b7873b0c --- /dev/null +++ b/program/src/oracle/model/twap_model.h @@ -0,0 +1,102 @@ +#ifndef _pyth_oracle_model_twap_model_h_ +#define _pyth_oracle_model_twap_model_h_ + +#include "../util/compat_stdint.h" +#include + +/* twap_model_t should be treated as an opaque handle of a twap_model. + (It technically isn't here to facilitate inlining of operations.) */ + +struct twap_model { + int8_t valid; /* If 0, all values below should be ignored */ + uint64_t ts; /* Block sequence number corresponding to the below values */ + uint64_t twap; /* round( twap_N/D ) */ + uint64_t twac; /* round( twac_N/D ) */ + uint64_t twap_Nh,twap_Nl; /* twap_N is the weighted sum of recent aggregrated price observations */ + uint64_t twac_Nh,twac_Nl; /* twac_N is the weighted sum of recent aggregrated confidence observations */ + uint64_t D; /* D is the weighted number of recent observations */ +}; + +typedef struct twap_model twap_model_t; + +#ifdef __cplusplus +extern "C" { +#endif + +/* twap_model_footprint and twap_model_align give the needed footprint + and alignment for a memory region to hold a twap_model's state. + malloc / alloca / declaration friendly (e.g. a memory region declared + as "twap_model_t _tm[1];", or created by + "malloc(sizeof(twap_model_t))" or created by + "alloca(sizeof(twap_model_t))" will all automatically have the needed + footprint and alignment). + + twap_model_new takes ownership of the memory region pointed to by + shmem (which is assumed to be non-NULL with the appropriate footprint + and alignment) and formats it as a twap_model. Returns mem (which + will be formatted for use). The caller will not be joined to the + region on return. + + twap_model_join joins the caller to a memory region holding the state + of a twap_model. shtm points to a memory region in the local address + space that holds the twap_model. Returns an opaque handle of the + local join in the local address space to twap_model (which might not + be the same thing as shtm ... the separation of new and join is to + facilitate interprocess shared memory usage patterns while supporting + transparent upgrades users of this to more elaborate models where + address translations under the hood may not be trivial). + + twap_model_leave leaves the current twap_model join. Returns a + pointer in the local address space to the memory region holding the + state of the twap_model. The twap_model join should not be used + afterward. + + twap_model_delete unformats the memory region currently used to hold + the state of a _twap_model and returns ownership of the underlying + memory region to the caller. There should be no joins in the system + on the twap_model. 
Returns a pointer to the underlying memory + region. */ + +static inline uint64_t twap_model_footprint( void ) { return (uint64_t) sizeof(twap_model_t); } +static inline uint64_t twap_model_align ( void ) { return (uint64_t)alignof(twap_model_t); } + +static inline void * +twap_model_new( void * shmem ) { + twap_model_t * tm = (twap_model_t *)shmem; + tm->valid = INT8_C(0); + tm->ts = UINT64_C(0); + tm->twap = UINT64_C(0); + tm->twac = UINT64_C(0); + tm->twap_Nh = UINT64_C(0); tm->twap_Nl = UINT64_C(0); + tm->twac_Nh = UINT64_C(0); tm->twac_Nl = UINT64_C(0); + tm->D = UINT64_C(0); + return tm; +} + +static inline twap_model_t * twap_model_join ( void * shtm ) { return (twap_model_t *)shtm; } +static inline void * twap_model_leave ( twap_model_t * tm ) { return tm; } +static inline void * twap_model_delete( void * shtm ) { return shtm; } + +/* Accessors to the current state of a twap model. */ + +static inline int twap_model_valid( twap_model_t const * tm ) { return !!tm->valid; } +static inline uint64_t twap_model_ts ( twap_model_t const * tm ) { return tm->ts; } +static inline uint64_t twap_model_twap ( twap_model_t const * tm ) { return tm->twap; } +static inline uint64_t twap_model_twac ( twap_model_t const * tm ) { return tm->twac; } + +/* Update the state of the twap_model to incorporate observation + at block chain sequence number now the aggregrate price / confidence + pair (agg_price,agg_conf). Returns tm. */ + +twap_model_t * +twap_model_update( twap_model_t * tm, + uint64_t now, + uint64_t agg_price, + uint64_t agg_conf ); + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_model_twap_model_h_ */ + diff --git a/program/src/oracle/model/weight_model.h b/program/src/oracle/model/weight_model.h new file mode 100644 index 000000000..348413757 --- /dev/null +++ b/program/src/oracle/model/weight_model.h @@ -0,0 +1,124 @@ +#ifndef _pyth_oracle_model_weight_model_h_ +#define _pyth_oracle_model_weight_model_h_ + +/* Given our estimate of the distribution of prices gap blocks ago and + an estimate of the current price distribution ("OBS"), compute our + belief that the two estimates are modeling statistically similar + price distributions ("SIM"). Via Bayes theorem then: + + PROB(SIM;OBS) = PROB(OBS;SIM) PROB(SIM) / PROB(OBS) + = PROB(OBS;SIM) PROB(SIM) / ( PROB(OBS;SIM) PROB(SIM) + PROB(OBS;~SIM) PROB(~SIM) ) + = R / ( 1 + R ) + + where: + + R = ( PROB(OBS;SIM)/PROB(OBS;~SIM) ) ( PROB(SIM)/PROB(~SIM) ) + + The first term in R is exactly what overlap_model estimates. The + second term is exactly what gap_model estimates. Specifically, + weight_model computes: + + weight/2^30 ~ PROB(SIM;OBS) + + given the two ratios + + ratio_overlap/2^30 ~ PROB(OBS;SIM) / PROB(OBS;~SIM) + ratio_gap /2^30 ~ PROB(SIM) / PROB(~SIM) + + This calcuation ~30-bits accurate (the result is +/-1 ulp of the + correctly rounded result). + + If WEIGHT_MODEL_NEED_REF is defined, this will also declare a high + accuracy floating point reference implementation "weight_model_ref" + to aid in testing / tuning block chain suitable approximations. 
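+
+   As a quick worked example (hypothetical inputs): ratio_gap==1<<30
+   (prior odds of 1) and ratio_overlap==3<<30 (likelihood ratio of 3)
+   give R==3 and thus weight/2^30==3/4:
+
+     uint64_t w = weight_model( UINT64_C(1)<<30, UINT64_C(3)<<30 ); // w ~ UINT64_C(3)<<28
+
+   matching 2^30 weight_model_ref( 1.L, 3.L ) to within the stated +/-1
+   ulp.
+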
*/ + +#include "../util/uwide.h" + +#ifdef WEIGHT_MODEL_NEED_REF +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef WEIGHT_MODEL_NEED_REF + +static inline long double +weight_model_ref( long double ratio_gap, + long double ratio_overlap ) { + long double r = ratio_gap*ratio_overlap; + return r / (1.L + r); +} + +#endif + +static inline uint64_t /* In [0,2^30] */ +weight_model( uint64_t ratio_gap, + uint64_t ratio_overlap ) { + uint64_t tmp_h,tmp_l; + + /* Starting from the exact expression: + + weight/2^30 = ((ratio_gap/2^30) (ratio_overlap/2^30)) / ( 1 + ((ratio_gap/2^30) (ratio_overlap/2^30))) + + we have: + + weight = 2^30 ratio_gap ratio_overlap / ( 2^60 + ratio_gap ratio_overlap ) + + When ratio_gap ratio_overlap <= 2^59/2^30 - 0.5 (>= 2^91-2^60) + weight rounds to 0 (2^30). This implies we don't actually need to + do a division with a 128-bit denominator to get a practically full + precision result. Instead we compute: + + ratio_gap ratio_overlap / 2^s + + where s the smallest integer that keep the denominator in 63-bits + for all non-trivial values of ratio_gap ratio_overlap (63 is used + instead of 64 to with the rounding calc). Given the above, s=28 is + sufficient, yielding: + + weight = 2^2 ratio_gap ratio_overlap / ( 2^32 + (ratio_gap ratio_overlap/2^28) ) + + The below approximates this via: + + num = ratio_gap ratio_overlap + den = 2^32 + floor( (num + 2^27)/2^28 ) + weight = floor( (2^3 num + den) / (2 den) ) + + Should have a maximum error of O(1) ulp. */ + + /* Compute num. 128-bit wide needed because these ratios can both + potentially be >=2^32. */ + + uint64_t num_h,num_l; uwide_mul( &num_h,&num_l, ratio_gap, ratio_overlap ); + + /* Compute den. Note that as ratio_gap ratio_overlap is at most + 2^128-2^65+1, the rounding add by 2^27 will not overflow. If the + result here >= (2^91-2^60)/2^28 = 2^63-2^32, we know the result + rounds to 2^30 and return immediately. */ + + static uint64_t const thresh = (UINT64_C(1)<<63) - (UINT64_C(1)<<32); + + uwide_inc( &tmp_h,&tmp_l, num_h,num_l, UINT64_C(1)<<27 ); + uwide_sr ( &tmp_h,&tmp_l, tmp_h,tmp_l, 28 ); + if( tmp_h || tmp_l>=thresh ) return UINT64_C(1)<<30; + uint64_t den = (UINT64_C(1)<<32) + tmp_l; + + /* Compute weight. Note that at this point < 2^91-2^60 + and den < 2^63 such that none of the below can overflow. */ + + uwide_sl ( &tmp_h,&tmp_l, num_h,num_l, 3 ); + uwide_inc( &tmp_h,&tmp_l, tmp_h,tmp_l, den ); + uwide_div( &tmp_h,&tmp_l, tmp_h,tmp_l, den<<1 ); + uint64_t weight = tmp_l; /* tmp_h==0 at this point */ + + return weight; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_model_weight_model_h_ */ + diff --git a/program/src/oracle/pd.h b/program/src/oracle/pd.h deleted file mode 100644 index 413e4ccdb..000000000 --- a/program/src/oracle/pd.h +++ /dev/null @@ -1,191 +0,0 @@ -#pragma once - -#ifdef __cplusplus -extern "C" { -#endif - -#define PD_SCALE9 (1000000000L) -#define PD_EMA_MAX_DIFF 4145 // maximum slots before reset -#define PD_EMA_EXPO (-9) // exponent of temporary storage -#define PD_EMA_DECAY (-117065) // 1e9*-log(2)/5921 -#define PC_FACTOR_SIZE 18 - -#define EXP_BITS 5 -#define EXP_MASK ( ( 1L << EXP_BITS ) - 1 ) - -#define pd_new( n,v,e ) {(n)->v_=v;(n)->e_=e;} -#define pd_set( r,n ) (r)[0] = (n)[0] -#define pd_new_scale(n,v,e) {(n)->v_=v;(n)->e_=e;pd_scale(n);} - -// decimal number format -typedef struct pd -{ - int32_t e_; - int64_t v_; -} pd_t; - -static inline void pd_scale( pd_t *n ) -{ - int const neg = n->v_ < 0L; - int64_t v = neg ? 
-n->v_ : n->v_; // make v positive for loop condition - for( ; v >= ( 1L << 28 ); v /= 10L, ++n->e_ ); - n->v_ = neg ? -v : v; -} - -static inline bool pd_store( int64_t *r, pd_t const *n ) -{ - int64_t v = n->v_; - int32_t e = n->e_; - while ( v < -( 1L << 58 ) ) { - v /= 10; - ++e; - } - while ( v > ( 1L << 58 ) - 1 ) { - v /= 10; - ++e; - } - while ( e < -( 1 << ( EXP_BITS - 1 ) ) ) { - v /= 10; - ++e; - } - while ( e > ( 1 << ( EXP_BITS - 1 ) ) - 1 ) { - v *= 10; - if ( v < -( 1L << 58 ) || v > ( 1L << 58 ) - 1 ) { - return false; - } - --e; - } - *r = ( v << EXP_BITS ) | ( e & EXP_MASK ); - return true; -} - -void pd_load( pd_t *r, int64_t const n ) -{ - pd_new( r, n >> EXP_BITS, ( ( n & EXP_MASK ) << 59 ) >> 59 ); - pd_scale( r ); -} - -static inline void pd_adjust( pd_t *n, int e, const int64_t *p ) -{ - int64_t v = n->v_; - int d = n->e_ - e; - if ( d > 0 ) { - v *= p[ d ]; - } - else if ( d < 0 ) { - v /= p[ -d ]; - } - pd_new( n, v, e ); -} - -static inline void pd_mul( pd_t *r, const pd_t *n1, const pd_t *n2 ) -{ - r->v_ = n1->v_ * n2->v_; - r->e_ = n1->e_ + n2->e_; - pd_scale( r ); -} - -static inline void pd_div( pd_t *r, pd_t *n1, pd_t *n2 ) -{ - if ( n1->v_ == 0 ) { pd_set( r, n1 ); return; } - int64_t v1 = n1->v_, v2 = n2->v_; - int neg1 = v1 < 0L, neg2 = v2 < 0L, m = 0; - if ( neg1 ) v1 = -v1; - if ( neg2 ) v2 = -v2; - for( ; 0UL == ( ( uint64_t )v1 & 0xfffffffff0000000UL ); v1 *= 10L, ++m ); - r->v_ = ( v1 * PD_SCALE9 ) / v2; - if ( neg1 ) r->v_ = -r->v_; - if ( neg2 ) r->v_ = -r->v_; - r->e_ = n1->e_ - n2->e_ - m - 9; - pd_scale( r ); -} - -static inline void pd_add( pd_t *r, const pd_t *n1, const pd_t *n2, const int64_t *p ) -{ - int d = n1->e_ - n2->e_; - if ( d==0 ) { - pd_new( r, n1->v_ + n2->v_, n1->e_ ); - } else if ( d>0 ) { - if ( d<9 ) { - pd_new( r, n1->v_*p[d] + n2->v_, n2->e_ ); - } else if ( d < PC_FACTOR_SIZE+9 ) { - pd_new( r, n1->v_*PD_SCALE9 + n2->v_/p[d-9], n1->e_-9); - } else { - pd_set( r, n1 ); - } - } else { - d = -d; - if ( d<9 ) { - pd_new( r, n1->v_ + n2->v_*p[d], n1->e_ ); - } else if ( d < PC_FACTOR_SIZE+9 ) { - pd_new( r, n1->v_/p[d-9] + n2->v_*PD_SCALE9, n2->e_-9 ); - } else { - pd_set( r, n2 ); - } - } - pd_scale( r ); -} - -static inline void pd_sub( pd_t *r, const pd_t *n1, const pd_t *n2, const int64_t *p ) -{ - int d = n1->e_ - n2->e_; - if ( d==0 ) { - pd_new( r, n1->v_ - n2->v_, n1->e_ ); - } else if ( d>0 ) { - if ( d<9 ) { - pd_new( r, n1->v_*p[d] - n2->v_, n2->e_ ); - } else if ( d < PC_FACTOR_SIZE+9 ) { - pd_new( r, n1->v_*PD_SCALE9 - n2->v_/p[d-9], n1->e_-9); - } else { - pd_set( r, n1 ); - } - } else { - d = -d; - if ( d<9 ) { - pd_new( r, n1->v_ - n2->v_*p[d], n1->e_ ); - } else if ( d < PC_FACTOR_SIZE+9 ) { - pd_new( r, n1->v_/p[d-9] - n2->v_*PD_SCALE9, n2->e_-9 ); - } else { - pd_new( r, -n2->v_, n2->e_ ); - } - } - pd_scale( r ); -} - -static inline int pd_lt( const pd_t *n1, const pd_t *n2, const int64_t *p ) -{ - pd_t r[1]; - pd_sub( r, n1, n2, p ); - return r->v_ < 0L; -} - -static inline int pd_gt( const pd_t *n1, const pd_t *n2, const int64_t *p ) -{ - pd_t r[1]; - pd_sub( r, n1, n2, p ); - return r->v_ > 0L; -} - -static inline void pd_sqrt( pd_t *r, pd_t *val, const int64_t *f ) -{ - pd_t t[1], x[1], hlf[1]; - pd_set( t, val ); - pd_new( r, 1, 0 ); - pd_add( x, t, r, f ); - pd_new( hlf, 5, -1 ); - pd_mul( x, x, hlf ); - for(;;) { - pd_div( r, t, x ); - pd_add( r, r, x, f ); - pd_mul( r, r, hlf ); - if ( x->v_ == r->v_ ) { - break; - } - pd_set( x, r ); - } -} - -#ifdef __cplusplus -} -#endif - diff --git 
a/program/src/oracle/sort.c b/program/src/oracle/sort.c deleted file mode 100644 index 4cb53ebc3..000000000 --- a/program/src/oracle/sort.c +++ /dev/null @@ -1,82 +0,0 @@ -#include "sort.h" - -#ifndef __bpf__ -#include -#endif - -static inline void swap_int64( int64_t *a, int64_t *b ) -{ - int64_t const temp = *a; - *a = *b; - *b = temp; -} - -static inline int partition_int64( int64_t* v, int i, int j ) -{ -#ifndef __bpf__ -#ifndef NDEBUG - int const i0 = i; - int const j0 = j; -#endif -#endif - - int const p = i; - int64_t const pv = v[ p ]; - - while ( i < j ) { - while ( v[ i ] <= pv && i <= j ) { - ++i; - } - while ( v[ j ] > pv ) { - --j; - } - if ( i < j ) { - swap_int64( &v[ i ], &v[ j ] ); - } - } - swap_int64( &v[ p ], &v[ j ] ); - -#ifndef __bpf__ -#ifndef NDEBUG - int k; - for ( k = i0; k < j; ++k ) { - assert( v[ k ] <= pv ); - } - assert( v[ j ] == pv ); - for ( k = j + 1; k <= j0; ++k ) { - assert( v[ k ] > pv ); - } -#endif -#endif - - return j; -} - -static void qsort_help_int64( int64_t* const v, int i, int j ) -{ - if ( i >= j ) { - return; - } - int p = partition_int64( v, i, j ); - qsort_help_int64( v, i, p - 1 ); - qsort_help_int64( v, p + 1, j ); -} - -void qsort_int64( int64_t* const v, unsigned const size ) -{ - qsort_help_int64( v, 0, ( int )size - 1 ); - -#ifndef __bpf__ -#ifndef NDEBUG - if ( size ) { - int64_t x = v[ 0 ]; - unsigned i = 1; - for ( ; i < size; ++i ) { - assert( x <= v[ i ] ); - x = v[ i ]; - } - } -#endif -#endif -} - diff --git a/program/src/oracle/sort.h b/program/src/oracle/sort.h deleted file mode 100644 index ebf67198c..000000000 --- a/program/src/oracle/sort.h +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#ifdef __bpf__ -#include -#else -#include -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -void qsort_int64( int64_t* v, unsigned size ); - -#ifdef __cplusplus -} -#endif - diff --git a/program/src/oracle/sort/clean b/program/src/oracle/sort/clean new file mode 100755 index 000000000..860ef1641 --- /dev/null +++ b/program/src/oracle/sort/clean @@ -0,0 +1,3 @@ +#!/bin/sh +rm -rfv bin + diff --git a/program/src/oracle/sort/run_tests b/program/src/oracle/sort/run_tests new file mode 100755 index 000000000..0eb277605 --- /dev/null +++ b/program/src/oracle/sort/run_tests @@ -0,0 +1,18 @@ +#!/bin/sh + +module purge || exit 1 +module load gcc-9.3.0 || exit 1 + +./clean || exit 1 +mkdir -pv bin || exit 1 + +CC="gcc -g -Wall -Werror -Wextra -Wconversion -Wstrict-aliasing=2 -Wimplicit-fallthrough=2 -pedantic -D_XOPEN_SOURCE=600 -O2 -march=native -std=c17" + +set -x + +$CC test_sort_stable.c -o bin/test_sort_stable || exit 1 + +bin/test_sort_stable || exit 1 + +echo all tests passed + diff --git a/program/src/oracle/sort/sort_stable_base_gen.c b/program/src/oracle/sort/sort_stable_base_gen.c new file mode 100644 index 000000000..70ac61597 --- /dev/null +++ b/program/src/oracle/sort/sort_stable_base_gen.c @@ -0,0 +1,92 @@ +#include +#include + +void +sort_gen( int n ) { + +# if 0 + + /* In register variant (PSEUDO OPS ~ 9+4n+3n(n-1)) + Assumes switch ~ 3 PSEUDO OPS (LDA,LD,JMP) -> 3 switch statements / 9 pseudo ops + Assumes load ~ 2 PSEUDO OPS (LDA,LD) -> n loads / 2n pseudo ops + Assumes store ~ " + Assumes cswap ~ 6 PSEUDO OPS (CMP,MOV,TESTEQ,CMOV,TESTNEQ,CMOV) / 6*0.5*n*(n-1) pseudo ops */ + +//printf( "static inline key_t * /* returns (sorted) x */\n" ); +//printf( "sort_network_stable( key_t * x, /* indexed [0,n) */\n" ); +//printf( " ulong n ) { /* assumes n in [0,%i) */\n", n ); + printf( " int c;\n" ); + printf( " key_t t" ); + for( 
int i=0; i 3 pseudo ops + Assumes cswap ~ 9 PSEUDO OPS (LDA,LDA,LD,LD,CMP,CMOV,CMOV,ST,ST) / 9*0.5*n*(n-1) pseudo ops */ + +//printf( "static inline key_t * /* returns (sorted) x */\n" ); +//printf( "sort_network_stable( key_t * x, /* indexed [0,n) */\n" ); +//printf( " ulong n ) { /* assumes n in [0,%i) */\n", n ); + + printf( " do { /* BEGIN AUTOGENERATED CODE (n=%2i) *****************************/\n", n ); + printf( "# define SORT_STABLE_CE(i,j) u = x[(SORT_IDX_T)i]; v = x[(SORT_IDX_T)j]; c = SORT_BEFORE( v, u ); x[(SORT_IDX_T)i] = c ? v : u; x[(SORT_IDX_T)j] = c ? u : v\n" ); + printf( " int c;\n" ); + printf( " SORT_KEY_T u;\n" ); + printf( " SORT_KEY_T v;\n" ); + printf( " switch( n ) {\n" ); + for( int i=n-1; i>=0; i-- ) { + printf( " case (SORT_IDX_T)%2i:", i+1 ); + for( int j=0; j +#include "../util/util.h" + +#define BEFORE(i,j) (((i)>>16)<((j)>>16)) + +#define SORT_NAME sort +#define SORT_KEY_T int +#define SORT_IDX_T int +#define SORT_BEFORE(i,j) BEFORE(i,j) +#include "tmpl/sort_stable.c" + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + +# define N 96 + int x[N]; + int y[N]; + int w[N]; + + /* Brute force validate small sizes via the 0-1 principle (with + additional information in the keys to validate stability as well). */ + + for( int n=0; n<=24; n++ ) { + printf( "Zero-One: Testing n=%i\n", n ); + for( long b=0L; b<(1L<>i) & 1L))<<16) | i; + for( int i=0; i=n || z[i]!=w[j] ) { printf( "FAIL (corrupt)\n" ); return 1; } + w[j] = -1; /* Mark that this entry has already been confirmed */ + } + for( int i=0; i=n || z[i]!=w[j] ) { printf( "FAIL (corrupt)\n" ); return 1; } + w[j] = -1; /* Mark that this entry has already been confirmed */ + } + for( int i=0; i=0 always true" if idx is an unsigned type or + "n<=UINT64_MAX always true" if key is a byte type). */ + static uint64_t const max = ((UINT64_MAX - (uint64_t)alignof(SORT_KEY_T) + (uint64_t)1) / (uint64_t)sizeof(SORT_KEY_T)); + return !cnt || (((SORT_IDX_T)0)> 1; + SORT_KEY_T * yl = SORT_IMPL(stable_node)( xl,nl, tl ); + + SORT_KEY_T * xr = x + nl; + SORT_KEY_T * tr = t + nl; + SORT_IDX_T nr = n - nl; + SORT_KEY_T * yr = SORT_IMPL(stable_node)( xr,nr, tr ); + + /* If left subsort result ended up in orig array, merge into temp + array. Otherwise, merge into orig array. */ + + if( yl==xl ) x = t; + + /* At this point, note that yl does not overlap with the location for + merge output at this point. yr might overlap (with the right half) + with the location for merge output but this will still work in that + case. */ + + SORT_IDX_T i = (SORT_IDX_T)0; + SORT_IDX_T j = (SORT_IDX_T)0; + SORT_IDX_T k = (SORT_IDX_T)0; + + /* Note that nl and nr are both at least one at this point so at least + one interation of the loop body is necessary. 
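
/* Aside (not part of the diff): a minimal standalone sketch of the
   straggler-appending merge step described above, written for plain
   int64_t keys with "<" in place of the SORT_KEY_T / SORT_BEFORE
   template parameters.  merge_int64 and the requirement that x not
   overlap either input run are assumptions of this sketch; the
   template version above additionally tolerates x aliasing the right
   run.  Ties take the left run first, which is what keeps the sort
   stable. */

#include <stdint.h>

static void
merge_int64( int64_t       * x,                  /* output, nl+nr elements */
             int64_t const * yl, uint64_t nl,    /* sorted left run,  nl>=1 */
             int64_t const * yr, uint64_t nr ) { /* sorted right run, nr>=1 */
  uint64_t i = 0, j = 0, k = 0;
  for(;;) {
    if( yr[k] < yl[j] ) {                        /* strictly before -> take right */
      x[i++] = yr[k++];
      if( k>=nr ) { do x[i++] = yl[j++]; while( j<nl ); break; } /* append left stragglers */
    } else {                                     /* tie or left before -> take left (stable) */
      x[i++] = yl[j++];
      if( j>=nl ) { do x[i++] = yr[k++]; while( k<nr ); break; } /* append right stragglers */
    }
  }
}
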
*/ + + for(;;) { /* Minimal C language operations */ + if( SORT_BEFORE( yr[k], yl[j] ) ) { + x[i++] = yr[k++]; + if( k>=nr ) { /* append left stragglers (at least one) */ do x[i++] = yl[j++]; while( j=nl ) { /* append right stragglers (at least one) */ do x[i++] = yr[k++]; while( k Test(oracle, init_mapping) { diff --git a/program/src/oracle/upd_aggregate.h b/program/src/oracle/upd_aggregate.h index 221af607a..622f1a33a 100644 --- a/program/src/oracle/upd_aggregate.h +++ b/program/src/oracle/upd_aggregate.h @@ -1,406 +1,152 @@ -#pragma once +#ifndef _pyth_oracle_upd_aggregrate_h_ +#define _pyth_oracle_upd_aggregrate_h_ + +#include /* For INT64_MAX */ #include "oracle.h" -#include "pd.h" -#include "sort.h" +#include "model/price_model.h" +#include "model/twap_model.h" -#include +/* FIXME: TEMP HACK TO DEAL WITH LINKING */ +#include "model/price_model.c" +#include "model/twap_model.c" #ifdef __cplusplus extern "C" { #endif -typedef struct pc_qset -{ - pd_t iprice_[PC_COMP_SIZE]; - pd_t uprice_[PC_COMP_SIZE]; - pd_t lprice_[PC_COMP_SIZE]; - pd_t weight_[PC_COMP_SIZE]; - int64_t decay_[1+PC_MAX_SEND_LATENCY]; - int64_t fact_[PC_FACTOR_SIZE]; - int32_t expo_; -} pc_qset_t; - -// initialize quote-set temporary data in heap area -static pc_qset_t *qset_new( int expo ) -{ - // allocate off heap - pc_qset_t *qs = (pc_qset_t*)PC_HEAP_START; - - // sqrt of numbers 1 to 25 for decaying conf. interval based on slot delay - qs->decay_[0] = 1000000000L; - qs->decay_[1] = 1000000000L; - qs->decay_[2] = 1414213562L; - qs->decay_[3] = 1732050807L; - qs->decay_[4] = 2000000000L; - qs->decay_[5] = 2236067977L; - qs->decay_[6] = 2449489742L; - qs->decay_[7] = 2645751311L; - qs->decay_[8] = 2828427124L; - qs->decay_[9] = 3000000000L; - qs->decay_[10] = 3162277660L; - qs->decay_[11] = 3316624790L; - qs->decay_[12] = 3464101615L; - qs->decay_[13] = 3605551275L; - qs->decay_[14] = 3741657386L; - qs->decay_[15] = 3872983346L; - qs->decay_[16] = 4000000000L; - qs->decay_[17] = 4123105625L; - qs->decay_[18] = 4242640687L; - qs->decay_[19] = 4358898943L; - qs->decay_[20] = 4472135954L; - qs->decay_[21] = 4582575694L; - qs->decay_[22] = 4690415759L; - qs->decay_[23] = 4795831523L; - qs->decay_[24] = 4898979485L; - qs->decay_[25] = 5000000000L; - - // powers of 10 for use in decimal arithmetic scaling - qs->fact_[0] = 1L; - qs->fact_[1] = 10L; - qs->fact_[2] = 100L; - qs->fact_[3] = 1000L; - qs->fact_[4] = 10000L; - qs->fact_[5] = 100000L; - qs->fact_[6] = 1000000L; - qs->fact_[7] = 10000000L; - qs->fact_[8] = 100000000L; - qs->fact_[9] = 1000000000L; - qs->fact_[10] = 10000000000L; - qs->fact_[11] = 100000000000L; - qs->fact_[12] = 1000000000000L; - qs->fact_[13] = 10000000000000L; - qs->fact_[14] = 100000000000000L; - qs->fact_[15] = 1000000000000000L; - qs->fact_[16] = 10000000000000000L; - qs->fact_[17] = 100000000000000000L; - - qs->expo_ = expo; - - return qs; -} - -static void upd_ema( - pc_ema_t *ptr, pd_t *val, pd_t *conf, int64_t nslot, pc_qset_t *qs - , pc_price_t *prc_ptr) -{ - pd_t numer[1], denom[1], cwgt[1], wval[1], decay[1], diff[1], one[1]; - pd_new( one, 100000000L, -8 ); - if ( conf->v_ ) { - pd_div( cwgt, one, conf ); - } else { - pd_set( cwgt, one ); - } - if ( nslot > PD_EMA_MAX_DIFF ) { - // initial condition - pd_mul( numer, val, cwgt ); - pd_set( denom, cwgt ); - } else { - // compute decay factor - pd_new( diff, nslot, 0 ); - pd_new( decay, PD_EMA_DECAY, PD_EMA_EXPO ); - pd_mul( decay, decay, diff ); - pd_add( decay, decay, one, qs->fact_ ); - - // compute numer/denom and new value from decay 
factor - if ( prc_ptr->drv1_ ) { - pd_load( numer, ptr->numer_ ); - pd_load( denom, ptr->denom_ ); - } - else { - // temporary upgrade code - pd_new_scale( numer, ptr->numer_, PD_EMA_EXPO ); - pd_new_scale( denom, ptr->denom_, PD_EMA_EXPO ); - } - if ( numer->v_ < 0 || denom->v_ < 0 ) { - // temporary reset twap on negative value - pd_set( numer, val ); - pd_set( denom, one ); - } - else { - pd_mul( numer, numer, decay ); - pd_mul( wval, val, cwgt ); - pd_add( numer, numer, wval, qs->fact_ ); - pd_mul( denom, denom, decay ); - pd_add( denom, denom, cwgt, qs->fact_ ); - pd_div( val, numer, denom ); - } - } - - // adjust and store results - pd_adjust( val, qs->expo_, qs->fact_ ); - ptr->val_ = val->v_; - int64_t numer1, denom1; - if ( pd_store( &numer1, numer ) && pd_store( &denom1, denom ) ) { - ptr->numer_ = numer1; - ptr->denom_ = denom1; - } - prc_ptr->drv1_ = 1; -} - -static inline void upd_twap( - pc_price_t *ptr, int64_t nslots, pc_qset_t *qs ) -{ - pd_t px[1], conf[1]; - pd_new_scale( px, ptr->agg_.price_, ptr->expo_ ); - pd_new_scale( conf, ( int64_t )( ptr->agg_.conf_ ), ptr->expo_ ); - upd_ema( &ptr->twap_, px, conf, nslots, qs, ptr ); - upd_ema( &ptr->twac_, conf, conf, nslots, qs, ptr ); -} - -// compute weighted percentile -static void wgt_ptile( - pd_t * const res - , const pd_t * const prices, const pd_t * const weights, const uint32_t num - , const pd_t * const ptile, pc_qset_t *qs ) -{ - pd_t cumwgt[ PC_COMP_SIZE ]; - - pd_t cumwgta[ 1 ], half[ 1 ]; - pd_new( cumwgta, 0, 0 ); - pd_new( half, 5, -1 ); - for ( uint32_t i = 0; i < num; ++i ) { - const pd_t * const wptr = &weights[ i ]; - pd_t weight[ 1 ]; - pd_mul( weight, wptr, half ); - pd_add( &cumwgt[ i ], cumwgta, weight, qs->fact_ ); - pd_add( cumwgta, cumwgta, wptr, qs->fact_ ); - } +// update aggregate price - uint32_t i = 0; - for( ; i != num && pd_lt( &cumwgt[i], ptile, qs->fact_ ); ++i ); - if ( i == num ) { - pd_set( res, &prices[num-1] ); - } else if ( i == 0 ) { - pd_set( res, &prices[0] ); - } else { - pd_t t1[1], t2[1]; - pd_sub( t1, &prices[i], &prices[i-1], qs->fact_ ); - pd_sub( t2, ptile, &cumwgt[i-1], qs->fact_ ); - pd_mul( t1, t1, t2 ); - pd_sub( t2, &cumwgt[i], &cumwgt[i-1], qs->fact_ ); - pd_div( t1, t1, t2 ); - pd_add( res, &prices[i-1], t1, qs->fact_ ); - } -} +static inline void +upd_aggregate( pc_price_t *ptr, + uint64_t now ) { -// update aggregate price -static inline void upd_aggregate( pc_price_t *ptr, uint64_t slot ) -{ - // only re-compute aggregate in next slot - if ( slot <= ptr->agg_.pub_slot_ ) { - return; - } - pc_qset_t *qs = qset_new( ptr->expo_ ); + // only re-compute aggregate in newer slots + // this logic correctly handles sequence number wrapping - // get number of slots from last published valid price - int64_t agg_diff = ( int64_t )slot - ( int64_t )( ptr->last_slot_ ); + int64_t agg_diff = (int64_t)(now - ptr->last_slot_); + if( agg_diffprev_slot_ = ptr->valid_slot_; ptr->prev_price_ = ptr->agg_.price_; ptr->prev_conf_ = ptr->agg_.conf_; // update aggregate details ready for next slot - ptr->valid_slot_ = ptr->agg_.pub_slot_;// valid slot-time of agg. price - ptr->agg_.pub_slot_ = slot; // publish slot-time of agg. 
price - uint32_t numv = 0; - uint32_t vidx[ PC_COMP_SIZE ]; - // identify valid quotes and order them by price - for ( uint32_t i = 0; i != ptr->num_; ++i ) { - pc_price_comp_t *iptr = &ptr->comp_[i]; - // copy contributing price to aggregate snapshot - iptr->agg_ = iptr->latest_; - // add quote to sorted permutation array if it is valid - int64_t slot_diff = ( int64_t )slot - ( int64_t )( iptr->agg_.pub_slot_ ); - if ( iptr->agg_.status_ == PC_STATUS_TRADING && - iptr->agg_.conf_ != 0UL && - iptr->agg_.price_ > 0L && - slot_diff >= 0 && slot_diff <= PC_MAX_SEND_LATENCY ) { - vidx[numv++] = i; - } - } - - uint32_t nprcs = 0; - int64_t prcs[ PC_COMP_SIZE * 2 ]; - for ( uint32_t i = 0; i < numv; ++i ) { - pc_price_comp_t const *iptr = &ptr->comp_[ vidx[ i ] ]; - int64_t const price = iptr->agg_.price_; - int64_t const conf = ( int64_t )( iptr->agg_.conf_ ); - prcs[ nprcs++ ] = price - conf; - prcs[ nprcs++ ] = price + conf; - } - qsort_int64( prcs, nprcs ); - - uint32_t numa = 0; - uint32_t aidx[PC_COMP_SIZE]; - - if ( nprcs ) { - int64_t const mprc = ( prcs[ numv - 1 ] + prcs[ numv ] ) / 2; - int64_t const lb = mprc / 5; - int64_t const ub = ( mprc > LONG_MAX / 5 ) ? LONG_MAX : mprc * 5; - - for ( uint32_t i = 0; i < numv; ++i ) { - uint32_t const idx = vidx[ i ]; - pc_price_comp_t const *iptr = &ptr->comp_[ idx ]; - int64_t const prc = iptr->agg_.price_; - if ( prc >= lb && prc <= ub ) { - uint32_t j = numa++; - for( ; j > 0 && ptr->comp_[ aidx[ j - 1 ] ].agg_.price_ > prc; --j ) { - aidx[ j ] = aidx[ j - 1 ]; - } - aidx[ j ] = idx; + ptr->valid_slot_ = ptr->agg_.pub_slot_; // valid slot-time of agg. price + ptr->agg_.pub_slot_ = now; // publish slot-time of agg. price + + // identify valid quotes (those that have a trading status, have a + // recent quote (i.e. from a block seq number + // [now-PC_MAX_SEND_LATENCY,now]) and do not have obviously bad values + // for price and conf for that quote (i.e. 
0num_; i++ ) { + pc_price_comp_t *iptr = &ptr->comp_[i]; + // copy contributing price to aggregate snapshot + iptr->agg_ = iptr->latest_; + // add quote values to prcs if valid + int64_t slot_diff = (int64_t)(now - iptr->agg_.pub_slot_); + int64_t price = iptr->agg_.price_; + int64_t conf = ( int64_t )( iptr->agg_.conf_ ); + if( iptr->agg_.status_ == PC_STATUS_TRADING && + INT64_C(0)<=slot_diff && slot_diff<=PC_MAX_SEND_LATENCY && + INT64_C(0)0 + numv++; + prcs[ nprcs++ ] = price - conf; // No overflow as 0 < conf < price + prcs[ nprcs++ ] = price; + prcs[ nprcs++ ] = price + conf; // No overflow as 0 < conf <= INT64_MAX-price } } - } - - // zero quoters - ptr->num_qt_ = numa; - if ( numa == 0 || numa < ptr->min_pub_ || numa * 2 <= numv ) { - ptr->agg_.status_ = PC_STATUS_UNKNOWN; - return; - } - - // update status and publish slot of last trading status price - ptr->agg_.status_ = PC_STATUS_TRADING; - ptr->last_slot_ = slot; - - // single quoter - if ( numa == 1 ) { - pc_price_comp_t *iptr = &ptr->comp_[aidx[0]]; - ptr->agg_.price_ = iptr->agg_.price_; - ptr->agg_.conf_ = iptr->agg_.conf_; - upd_twap( ptr, agg_diff, qs ); - return; - } - - // assign quotes and compute weights - pc_price_comp_t *pptr = 0; - pd_t price[1], conf[1], weight[1], one[1], wsum[1]; - pd_new( one, 100000000L, -8 ); - pd_new( wsum, 0, 0 ); - int64_t ldiff = INT64_MAX; - pd_t *wptr = qs->weight_; - for( uint32_t i=0;i != numa; ++i ) { - pc_price_comp_t *iptr = &ptr->comp_[aidx[i]]; - // scale confidence interval by sqrt of slot age - int64_t slot_diff = ( int64_t )slot - ( int64_t )( iptr->agg_.pub_slot_ ); - pd_t decay[1]; - pd_new( decay, qs->decay_[slot_diff], PC_EXP_DECAY ); - pd_new_scale( conf, ( int64_t )( iptr->agg_.conf_ ), ptr->expo_ ); - pd_mul( conf, conf, decay ); - // assign price and upper/lower bounds of price - pd_new_scale( price, iptr->agg_.price_, ptr->expo_ ); - pd_set( &qs->iprice_[i], price ); - pd_add( &qs->uprice_[i], price, conf, qs->fact_ ); - pd_sub( &qs->lprice_[i], price, conf, qs->fact_ ); - - // compute weight (1/(conf+nearest_neighbor)) - pd_set( &qs->weight_[i], conf ); - if ( i ) { - int64_t idiff = iptr->agg_.price_ - pptr->agg_.price_; - pd_new_scale( weight, idiff < ldiff ? idiff : ldiff, ptr->expo_ ); - pd_add( wptr, wptr, weight, qs->fact_ ); - pd_div( wptr, one, wptr ); - pd_add( wsum, wsum, wptr, qs->fact_ ); - ldiff = idiff; - ++wptr; + // too few valid quotes + if( !numv || numvmin_pub_ ) { + // FIXME: SHOULD NUM_QT_ AND/OR LAST_SLOT_ BE UPDATED? IN EARLIER + // CODE, NUM_QT_ WAS BUT LAST SLOT WASN'T IN CASES LIKE THIS. IT + // SEEMS LIKE IT WAS WRONG TO UPDATE NUM_QT_ THOUGH AS IT SEEMS TO + // CORRESPOND THE POINT IN TIME SPECIFIED BY LAST_SLOT_. 
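
/* Aside (not part of the diff): the quote validity test applied above,
   pulled out into a standalone predicate so it can be unit tested in
   isolation.  quote_t and the SKETCH_* constants are assumptions of
   this sketch standing in for the pc_price_comp_t fields,
   PC_STATUS_TRADING and PC_MAX_SEND_LATENCY used by the real code. */

#include <stdint.h>

#define SKETCH_STATUS_TRADING   (1)
#define SKETCH_MAX_SEND_LATENCY (25)

typedef struct {
  uint32_t status;
  uint64_t pub_slot;
  int64_t  price;
  uint64_t conf;
} quote_t;

static int
quote_is_valid( quote_t const * q,
                uint64_t        now ) {
  int64_t slot_diff = (int64_t)(now - q->pub_slot);  /* wrap-safe, as above */
  int64_t price     = q->price;
  int64_t conf      = (int64_t)q->conf;
  return q->status==SKETCH_STATUS_TRADING
      && INT64_C(0)<=slot_diff && slot_diff<=SKETCH_MAX_SEND_LATENCY
      && INT64_C(0)<conf && conf<price     /* 0 < conf < price */
      && conf<=INT64_MAX-price;            /* price+conf cannot overflow */
}
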
+ ptr->agg_.status_ = PC_STATUS_UNKNOWN; + return; } - pptr = iptr; - } - // compute weight for last quote - pd_new_scale( weight, ldiff, ptr->expo_ ); - pd_add( wptr, wptr, weight, qs->fact_ ); - pd_div( wptr, one, wptr ); - pd_add( wsum, wsum, wptr, qs->fact_ ); - // bound weights at 1/sqrt(Nquotes) and redeistribute the remaining weight - // among the remaining quoters proportional to their weights - pd_t wmax[1], rnumer[1], rdenom[1]; - pd_set( rnumer, one ); - pd_new( rdenom, 0, 0 ); - // wmax = 1 / sqrt( numa ) - pd_new( wmax, numa, 0 ); - pd_sqrt( wmax, wmax, qs->fact_ ); - pd_div( wmax, one, wmax ); - for( uint32_t i=0;i != numa; ++i ) { - wptr = &qs->weight_[i]; - pd_div( wptr, wptr, wsum ); - if ( pd_gt( wptr, wmax, qs->fact_ ) ) { - pd_set( wptr, wmax ); - pd_sub( rnumer, rnumer, wmax, qs->fact_ ); - aidx[i] = 1; - } else { - pd_add( rdenom, rdenom, wptr, qs->fact_ ); - aidx[i] = 0; - } - } - if ( rdenom->v_ ) { - pd_div( rnumer, rnumer, rdenom ); - } - for ( uint32_t i = 0; i != numa; ++i ) { - wptr = &qs->weight_[ i ]; - if ( aidx[ i ] == 0 ) { - pd_mul( wptr, wptr, rnumer ); + // evaluate price_model to get the p25/p50/p75 prices + // note: numv>0 and nprcs = 3*numv at this point + int64_t agg_p25; + int64_t agg_p75; + price_model_core( (uint64_t)nprcs, prcs, &agg_p25, &agg_price, &agg_p75, prcs+nprcs ); + + // get the left and right confidences + // note that as valid quotes have positive prices currently and + // agg_p25, agg_price, agg_p75 are ordered, this calculation can't + // overflow / underflow + int64_t agg_conf_left = agg_price - agg_p25; + int64_t agg_conf_right = agg_p75 - agg_price; + + // use the larger of the left and right confidences + agg_conf = agg_conf_right > agg_conf_left ? agg_conf_right : agg_conf_left; + + // if the aggregated values don't look sane, we abort + // this is paranoia given the current pricing model and + // above quoter sanity checks + if( !(INT64_C(0)agg_.status_ = PC_STATUS_UNKNOWN; + return; } } - const pd_t half = { .e_ = -1, .v_ = 5 }; + // we have an valid (now,agg_price,agg_conf) observation + // update the twap_model - // compute aggregate price as weighted median - pd_t iprice[1], lprice[1], uprice[1], q3price[1], q1price[1], ptile[1]; - pd_new( ptile, 5, -1 ); - wgt_ptile( iprice, qs->iprice_, qs->weight_, numa, ptile, qs ); - pd_adjust( iprice, ptr->expo_, qs->fact_ ); - ptr->agg_.price_ = iprice->v_; + // FIXME: HIDEOUS TEMPORARY HACK TO PACK THE EXISTING PC_PRICE_T DATA + // LAYOUT INTO A TWAP_MODEL_T (PC_PRICE_T SHOULD JUST HAVE THIS AS A + // COMPONENT AND MAYBE ELIMINATE LAST_SLOT_ IN FAVOR OF TM->TS). 
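
/* Aside (not part of the diff): the (price,conf) extraction from the
   p25/p50/p75 quartiles returned by price_model_core, shown on its
   own.  The wrapper name quartiles_to_price_conf is an assumption of
   this sketch; the subtractions cannot overflow for the positive,
   ordered quartiles produced above. */

#include <stdint.h>

static void
quartiles_to_price_conf( int64_t   p25,     /* <= p50 */
                         int64_t   p50,     /* median of the (price-conf,price,price+conf) samples */
                         int64_t   p75,     /* >= p50 */
                         int64_t * price,
                         int64_t * conf ) {
  int64_t left  = p50 - p25;                /* left half-spread  */
  int64_t right = p75 - p50;                /* right half-spread */
  *price = p50;
  *conf  = right>left ? right : left;       /* use the wider half-spread as the confidence */
}
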
+ twap_model_t tm[1]; + tm->valid = ptr->drv2_; // FIXME: MAKE SURE DRV2_ IS INITIALIZED TO ZERO ON ACCOUNT CREATION + tm->ts = ptr->last_slot_; + tm->twap = (uint64_t)ptr->twap_.val_; + tm->twac = (uint64_t)ptr->twac_.val_; + tm->twap_Nh = (uint64_t)ptr->twap_.numer_; tm->twap_Nl = (uint64_t)ptr->twap_.denom_; // FIXME: UGLY REINTERP + tm->twac_Nh = (uint64_t)ptr->twac_.numer_; tm->twac_Nl = (uint64_t)ptr->twac_.denom_; // FIXME: UGLY REINTERP + tm->D = (uint64_t)ptr->drv1_; - // compute diff in weighted median between upper and lower price bounds - pd_t prices[ PC_COMP_SIZE ]; - pd_t weights[ PC_COMP_SIZE ]; - // sort upper prices and weights - for ( uint32_t i = 0; i < numa; ++i ) { - uint32_t j = i; - for ( ; j > 0 && pd_lt( &qs->uprice_[ i ], &prices[ j - 1 ], qs->fact_ ); --j ) { - prices[ j ] = prices[ j - 1 ]; - weights[ j ] = weights[ j - 1 ]; - } - prices[ j ] = qs->uprice_[ i ]; - weights[ j ] = qs->weight_[ i ]; - } - wgt_ptile( uprice, prices, weights, numa, ptile, qs ); - // sort lower prices and weights - for ( uint32_t i = 0; i < numa; ++i ) { - uint32_t j = i; - for ( ; j > 0 && pd_lt( &qs->lprice_[ i ], &prices[ j - 1 ], qs->fact_ ); --j ) { - prices[ j ] = prices[ j - 1 ]; - weights[ j ] = weights[ j - 1 ]; - } - prices[ j ] = qs->lprice_[ i ]; - weights[ j ] = qs->weight_[ i ]; - } - wgt_ptile( lprice, prices, weights, numa, ptile, qs ); - - pd_sub( uprice, uprice, lprice, qs->fact_ ); - pd_mul( uprice, uprice, &half ); + twap_model_update( tm, now, (uint64_t)agg_price, (uint64_t)agg_conf ); - // compute weighted iqr of prices - pd_new( ptile, 75, -2 ); - wgt_ptile( q3price, qs->iprice_, qs->weight_, numa, ptile, qs ); - pd_new( ptile, 25, -2 ); - wgt_ptile( q1price, qs->iprice_, qs->weight_, numa, ptile, qs ); - pd_sub( q3price, q3price, q1price, qs->fact_ ); - pd_mul( q3price, q3price, &half ); - - // take confidence interval as larger - pd_t *cptr = pd_gt( uprice, q3price, qs->fact_ ) ? uprice : q3price; - pd_adjust( cptr, ptr->expo_, qs->fact_ ); - ptr->agg_.conf_ = ( uint64_t )( cptr->v_ ); - upd_twap( ptr, agg_diff, qs ); + // FIXME: HIDEOUS TEMPORARY HACK TO UNPACK. SEE NOTE ABOVE. IF + // ANYTHING EVEN WENT THE SLIGHTEST WEIRD, WE ABORT. + if( !(INT64_C(0)twac && tm->twactwap && tm->twac<=((uint64_t)INT64_MAX-tm->twap)) ) { + // FIXME: SEE NOTE ABOVE ABOUT NUM_QT_ AND LAST_SLOT_ + ptr->agg_.status_ = PC_STATUS_UNKNOWN; + return; + } + ptr->drv2_ = tm->valid; + ptr->twap_.val_ = (int64_t)tm->twap; + ptr->twac_.val_ = (int64_t)tm->twac; + ptr->twap_.numer_ = (int64_t)tm->twap_Nh; ptr->twap_.denom_ = (int64_t)tm->twap_Nl; + ptr->twac_.numer_ = (int64_t)tm->twac_Nh; ptr->twac_.denom_ = (int64_t)tm->twac_Nl; + ptr->drv1_ = (int64_t)tm->D; + + // all good + ptr->num_qt_ = numv; + ptr->last_slot_ = now; + ptr->agg_.price_ = agg_price; + ptr->agg_.conf_ = (uint64_t)agg_conf; + ptr->agg_.status_ = PC_STATUS_TRADING; } #ifdef __cplusplus } #endif +#endif /* _pyth_oracle_upd_aggregrate_h_ */ diff --git a/program/src/oracle/util/align.h b/program/src/oracle/util/align.h new file mode 100644 index 000000000..023e85735 --- /dev/null +++ b/program/src/oracle/util/align.h @@ -0,0 +1,57 @@ +#ifndef _pyth_oracle_util_align_h_ +#define _pyth_oracle_util_align_h_ + +#include "compat_stdint.h" /* For uintptr_t */ +#include /* For alignof */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* align_ispow2 returns non-zero if a is a non-negative integral power + of 2 and zero otherwise. 
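
/* Aside (not part of the diff): the power-of-2 alignment trick the
   helpers below are built on, as a standalone host-side demo.  For a
   power-of-2 alignment a, (p + a-1) & ~(a-1) rounds p up to the next
   multiple of a and p & ~(a-1) rounds it down; round_up / round_down
   here are illustrative names, not part of the header's API. */

#include <stdint.h>
#include <stdio.h>

static uintptr_t round_up  ( uintptr_t p, uintptr_t a ) { return (p + (a-(uintptr_t)1)) & ~(a-(uintptr_t)1); }
static uintptr_t round_down( uintptr_t p, uintptr_t a ) { return  p                     & ~(a-(uintptr_t)1); }

int
main( void ) {
  for( uintptr_t p=(uintptr_t)0; p<(uintptr_t)16; p++ )
    printf( "p %2lu -> dn %2lu up %2lu (a=8)\n",
            (unsigned long)p, (unsigned long)round_down(p,(uintptr_t)8), (unsigned long)round_up(p,(uintptr_t)8) );
  return 0;
}
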
*/ + +static inline int +align_ispow2( uintptr_t a ) { + uintptr_t mask = a - (uintptr_t)1; + return (a>(uintptr_t)0) & !(a & mask); +} + +/* align_isaligned returns non-zero if p has byte alignment a. Assumes + align_ispow2(a) is true. */ + +static inline int +align_isaligned( void * p, + uintptr_t a ) { + uintptr_t mask = a - (uintptr_t)1; + return !(((uintptr_t)p) & mask); +} + +/* align_dn/align_up aligns p to the first byte at or before / after p + with alignment a. Assumes align_ispow2(a) is true. */ + +static inline void * +align_dn( void * p, + uintptr_t a ) { + uintptr_t mask = a - (uintptr_t)1; + return (void *)(((uintptr_t)p) & (~mask)); +} + +static inline void * +align_up( void * p, + uintptr_t a ) { + uintptr_t mask = a - (uintptr_t)1; + return (void *)((((uintptr_t)p) + mask) & (~mask)); +} + +/* Helper macros for aligning pointers up or down to compatible + alignments for type T. FIXME: ADD UNIT TEST COVERAGE */ + +#define ALIGN_DN( p, T ) ((T *)align_dn( p, (uintptr_t)alignof(T) )) +#define ALIGN_UP( p, T ) ((T *)align_up( p, (uintptr_t)alignof(T) )) + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_util_align_h_ */ diff --git a/program/src/oracle/util/avg.h b/program/src/oracle/util/avg.h new file mode 100644 index 000000000..cf4e0fa00 --- /dev/null +++ b/program/src/oracle/util/avg.h @@ -0,0 +1,170 @@ +#ifndef _pyth_oracle_util_avg_h_ +#define _pyth_oracle_util_avg_h_ + +/* Portable robust integer averaging. (No intermediate overflow, no + assumptions about platform type wides, no assumptions about integer + signed right shift, no signed integer division, no implict + conversions, etc.) */ + +#include "sar.h" + +/* FIXME: ADD SUPPORT FOR 64_BIT ARRAY AVERAGES BASED ON UWIDE? */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* uint8_t a = avg_2_uint8 ( x, y ); int8_t a = avg_2_int8 ( x, y ); + uint16_t a = avg_2_uint16( x, y ); int16_t a = avg_2_int16( x, y ); + uint32_t a = avg_2_uint32( x, y ); int32_t a = avg_2_int32( x, y ); + uint64_t a = avg_2_uint64( x, y ); int64_t a = avg_2_int64( x, y ); + + compute the average (a) of x and y of the corresponding type with <1 + ulp error (floor / round toward negative infinity rounding). That + is, these compute this exactly: + floor( (x+y)/2 ) + On systems where signed right shifts are sign extending, this is + identical to: + (x+y) >> 1 + but the below will be safe against intermediate overflow. */ + +static inline uint8_t avg_2_uint8 ( uint8_t x, uint8_t y ) { return (uint8_t )((((uint16_t)x)+((uint16_t)y))>>1); } +static inline uint16_t avg_2_uint16( uint16_t x, uint16_t y ) { return (uint16_t)((((uint32_t)x)+((uint32_t)y))>>1); } +static inline uint32_t avg_2_uint32( uint32_t x, uint32_t y ) { return (uint32_t)((((uint64_t)x)+((uint64_t)y))>>1); } + +static inline uint64_t +avg_2_uint64( uint64_t x, + uint64_t y ) { + + /* x+y might have intermediate overflow and we don't have an efficient + portable wider integer type available. So we can't just do + (x+y)>>1 to compute floor((x+y)/2). */ + +# if 1 /* Fewer ops but less parallel issue */ + /* Emulate an adc (add with carry) to get the 65-bit wide intermediate + and then shift back in as necessary */ + uint64_t z = x + y; + uint64_t c = (uint64_t)(z>1) + (c<<63); +# else /* More ops but more parallel issue */ + /* floor(x/2)+floor(y/2)==(x>>1)+(y>>1) doesn't have intermediate + overflow but it has two rounding errors. 
Noting that + (x+y)/2 == floor(x/2) + floor(y/2) + (x_lsb+y_lsb)/2 + == (x>>1) + (y>>1) + (x_lsb+y_lsb)/2 + implies we should increment when x and y have their lsbs set. */ + return (x>>1) + (y>>1) + (x & y & UINT64_C(1)); +# endif + +} + +static inline int8_t avg_2_int8 ( int8_t x, int8_t y ) { return (int8_t )sar_int16((int16_t)(((int16_t)x)+((int16_t)y)),1); } +static inline int16_t avg_2_int16 ( int16_t x, int16_t y ) { return (int16_t)sar_int32((int32_t)(((int32_t)x)+((int32_t)y)),1); } +static inline int32_t avg_2_int32 ( int32_t x, int32_t y ) { return (int32_t)sar_int64((int64_t)(((int64_t)x)+((int64_t)y)),1); } + +static inline int64_t +avg_2_int64( int64_t x, + int64_t y ) { + + /* Similar considerations as above */ + +# if 1 /* Fewer ops but less parallel issue */ + /* x+y = x+2^63 + y+2^63 - 2^64 ... exact ops + = ux + uy - 2^64 ... exact ops where ux and uy are exactly representable as a 64-bit uint + = uz + 2^64 c - 2^64 ... exact ops where uz is a 64-bit uint and c is in [0,1]. + Thus, as before + uz = ux+uy ... c ops + c = (uz=ux) ... exact ops + Noting that (ux + uy) mod 2^64 = (x+y+2^64) mod 2^64 = (x+y) mod 2^64 + and that signed and added adds are the same operation binary in twos + complement math yields: + uz = (uint64_t)(x+y) ... c ops + b = uz>=(x+2^63) ... exact ops + as we know x+2^63 is in [0,2^64), x+2^63 = x+/-2^63 mod 2^64. And + using the signed and unsigned adds are the same operation binary + in we have: + x+2^63 ... exact ops + ==x+/-2^63 mod 2^64 ... exact ops + ==x+/-2^63 ... c ops */ + uint64_t t = (uint64_t)x; + uint64_t z = t + (uint64_t)y; + uint64_t b = (uint64_t)(z>=(t+(((uint64_t)1)<<63))); + return (int64_t)((z>>1) - (b<<63)); +# else /* More ops but more parallel issue (significant more ops if no platform arithmetic right shift) */ + /* See above for uint64_t for details on this calc. */ + return sar_int64( x, 1 ) + sar_int64( y, 1 ) + (x & y & (int64_t)1); +# endif + +} + +/* uint8_t a = avg_uint8 ( x, n ); int8_t a = avg_int8 ( x, n ); + uint16_t a = avg_uint16( x, n ); int16_t a = avg_int16( x, n ); + uint32_t a = avg_uint32( x, n ); int32_t a = avg_int32( x, n ); + + compute the average (a) of the n element array f the corresponding + type pointed to by x with <1 ulp round error (truncate / round toward + zero rounding). Supports arrays with up to 2^32-1 elements. Returns + 0 for arrays with 0 elements. Elements of the array are unchanged by + this function. No 64-bit equivalents are provided here as that + requires a different and slower implementation (at least if + portability is required). */ + +#define PYTH_ORACLE_UTIL_AVG_DECL(func,T) /* Comments for uint32_t but apply for uint8_t and uint16_t too */ \ +static inline T /* == (1/n) sum_i x[i] with one rounding error and round toward zero rounding */ \ +func( T const * x, /* Indexed [0,n) */ \ + uint32_t n ) { /* Returns 0 on n==0 */ \ + if( !n ) return (T)0; /* Handle n==0 case */ \ + /* At this point n is in [1,2^32-1]. Sum up all the x into a 64-bit */ \ + /* wide signed accumulator. */ \ + uint64_t a = (uint64_t)0; \ + for( uint32_t r=n; r; r-- ) a += (uint64_t)*(x++); \ + /* At this point a is in [ 0, (2^32-1)(2^32-1) ] < 2^64 and thus was */ \ + /* computed without intermediate overflow. Complete the average. 
*/ \ + return (T)( a / (uint64_t)n ); \ +} + +PYTH_ORACLE_UTIL_AVG_DECL( avg_uint8, uint8_t ) +PYTH_ORACLE_UTIL_AVG_DECL( avg_uint16, uint16_t ) +PYTH_ORACLE_UTIL_AVG_DECL( avg_uint32, uint32_t ) +/* See note above about 64-bit variants */ + +#undef PYTH_ORACLE_UTIL_AVG_DECL + +#define PYTH_ORACLE_UTIL_AVG_DECL(func,T) /* Comments for int32_t but apply for int8_t and int16_t too */ \ +static inline T /* == (1/n) sum_i x[i] with one rounding error and round toward zero rounding */ \ +func( T const * x, /* Indexed [0,n) */ \ + uint32_t n ) { /* Returns 0 on n==0 */ \ + if( !n ) return (T)0; /* Handle n==0 case */ \ + /* At this point n is in [1,2^32-1]. Sum up all the x into a 64-bit */ \ + /* wide signed accumulator. */ \ + int64_t a = (int64_t)0; \ + for( uint32_t r=n; r; r-- ) a += (int64_t)*(x++); \ + /* At this point a is in [ -2^31 (2^32-1), (2^31-1)(2^32-1) ] such that */ \ + /* |a| < 2^63. It thus was computed without intermediate overflow and */ \ + /* further |a| can fit with an uint64_t. Compute |a|/n. If a<0, the */ \ + /* result will be at most 2^31-1 (i.e. all x[i]==2^31-1). Otherwise, */ \ + /* the result will be at most 2^31 (i.e. all x[i]==-2^31). */ \ + int s = a<(int64_t)0; \ + uint64_t u = (uint64_t)(s ? -a : a); \ + u /= (uint64_t)n; \ + a = (int64_t)u; \ + return (T)(s ? -a : a); \ +} + +PYTH_ORACLE_UTIL_AVG_DECL( avg_int8, int8_t ) +PYTH_ORACLE_UTIL_AVG_DECL( avg_int16, int16_t ) +PYTH_ORACLE_UTIL_AVG_DECL( avg_int32, int32_t ) +/* See note above about 64-bit variants */ + +#undef PYTH_ORACLE_UTIL_AVG_DECL + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_util_avg_h_ */ + diff --git a/program/src/oracle/util/clean b/program/src/oracle/util/clean new file mode 100755 index 000000000..860ef1641 --- /dev/null +++ b/program/src/oracle/util/clean @@ -0,0 +1,3 @@ +#!/bin/sh +rm -rfv bin + diff --git a/program/src/oracle/util/compat_stdint.h b/program/src/oracle/util/compat_stdint.h new file mode 100644 index 000000000..1842cc52c --- /dev/null +++ b/program/src/oracle/util/compat_stdint.h @@ -0,0 +1,28 @@ +#ifndef _pyth_oracle_util_compat_stdint_h_ +#define _pyth_oracle_util_compat_stdint_h_ + +/* Include functionality from . Define + PYTH_ORACLE_UTIL_COMPAT_STDINT_STYLE to indicate how to do this: + 0 - Use stdint.h directly + 1 - Use solana_sdk.h (solana uses its own definitions for stdint + types and that can conflicts with stdint.h) + Defaults to 0 or 1 depending on if __bpf__ is set. */ + +#ifndef PYTH_ORACLE_UTIL_COMPAT_STDINT_STYLE +#ifndef __bpf__ +#define PYTH_ORACLE_UTIL_COMPAT_STDINT_STYLE 0 +#else +#define PYTH_ORACLE_UTIL_COMPAT_STDINT_STYLE 1 +#endif +#endif + +#if PYTH_ORACLE_UTIL_COMPAT_STDINT_STYLE==0 +#include +#elif PYTH_ORACLE_UTIL_COMPAT_STDINT_STYLE==1 +#include +typedef uint64_t uintptr_t; +#else +#error "Unsupported PYTH_ORACLE_UTIL_COMPAT_STDINT_STYLE" +#endif + +#endif /* _pyth_oracle_util_compat_stdint_h_ */ diff --git a/program/src/oracle/util/exp2m1.h b/program/src/oracle/util/exp2m1.h new file mode 100644 index 000000000..6b47c0611 --- /dev/null +++ b/program/src/oracle/util/exp2m1.h @@ -0,0 +1,165 @@ +#ifndef _pyth_oracle_util_exp2m1_h_ +#define _pyth_oracle_util_exp2m1_h_ + +/* exp2m1_fxp computes: + + y/2^30 ~ exp2m1( x/2^30 ) = 2^( x/2^30 ) - 1 + + That is, it computes a fixed point approximation of 2^x-1 where x is + non-negative and represented with 34 integer bits and 30 fractional + bits (34u.30 fxp). If x/2^30 >= 34 (i.e. x>exp2m1_fxp_max()), + returns UINT64_MAX. 
For x<=exp2m1_fxp_max(), a exp2_fxp can also + trivially constructed by just adding 2^30 to the result. + + The approximation is designed to minimize the RMS error over the + interval [0,2^30) while preserving continuity overall and matching at + at key places exactly (e.g. if x/2^30 = 2^i for i in [-30,33], this + will be exact). + + Define EXP2M1_FXP_ORDER from 1:7 to get the desired cost / accuracy + tradeoff. At EXP2M1_FXP_ORDER 1, this costs virtually nothing (some + fast bit ops); it is a piecewise linear approximation that is + typically accurate to a few percent. + + At EXP2M1_FXP_ORDER 7, this is nearly IEEE style 30 bit precision + (i.e. for x in [0,2^30), the result with 0.6 ulp instead of the + theoretical limit of 0.5 ulp). For x in [0,2^30), the rounding is + approximately round nearest even. For larger x, relative error is + approximately constant. It requires some fast bits ops, 7 64*64->64 + adds and 7 64*64->64 multiplies (similarly for the orders 2:6). + + EXP2M1_FXP_ORDER 5 is the default and has a worst case output + accuracy comparable IEEE single precision. + + In short, for integer x/2^30, it is the exact value correctly rounded + and, for IID random x in [0,2^30), the approximation errors can be + thought of as a random Gaussian perturbation on the output value such + that the approximation for a given order is the one that minimizes + the perturbation variance. */ + +#include "compat_stdint.h" + +#ifndef EXP2M1_FXP_ORDER +#define EXP2M1_FXP_ORDER 5 +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +static inline uint64_t exp2m1_fxp_max( void ) { return UINT64_C(0x87fffffff); } /* == 34*2^30 - 1 (i=33,d=2^30-1) */ + +static inline uint64_t +exp2m1_fxp( uint64_t x ) { + + /* This calculation is based on splitting x into integer and + fractional bits such that: + x = 2^30 i + d + where d is in [0,2^30). Then we have: + y = 2^(30+i) exp2( d/2^30 ) - 2^30 + = (2^64 exp2( d/2^30 )) / 2^(64-30-i) - 2^30 + = (2^64 exp2m1( d/2^30 ) + 2^64) / 2^(34-i) - 2^30 + = (2^64 exp2m1( d/2^30 ))/2^(34-i) + 2^(30+i)- 2^30 + Thus, the main challenge is computing an approximation of: + 2^64 exp2m1( d/2^30 ) / 2^(34-i) + for delta = d/2^30 in [0,1) with good enough precision. As this + smoothly goes from 0 to 1, this is an ideal candidate for a + polynomial approximation (Pade' approximation might be even better + if we are willing to accept the cost of a divide). We aren't too + particular or maxing out precision or precise rounding in the + interest of compactness and speed. + + The polynomial coefficients below were selected as the polynomials + that minimize the RMS error over the interval [0,1) while matching + exactly at 0 and 1 (RMS error was used over minimax as it has more + intuitive error properties for users typically). The coefficients + were scaled such that the intermediates in Horner's rule are all + less than 2^34 to minimize the coefficient quantization error while + not overflowing 64 bits when multiplied by d. The output scale was + fixed at 2^64 so that we have some extra precision when i is + non-zero. 
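
/* Aside (not part of the diff): a host-side long double reference for
   the 34u.30 mapping described above, handy for spot checking
   exp2m1_fxp.  The name exp2m1_fxp_ref and the round-to-nearest
   conversion are assumptions of this sketch (not the test harness used
   elsewhere in this change); link with -lm on typical hosts. */

#include <stdint.h>
#include <math.h>

static uint64_t
exp2m1_fxp_ref( uint64_t x ) {                     /* assumes x/2^30 in [0,34) */
  long double xf = ((long double)x)/1073741824.L;  /* x / 2^30 */
  long double yf = exp2l( xf ) - 1.L;              /* 2^(x/2^30) - 1 */
  return (uint64_t)roundl( yf*1073741824.L );      /* ~ y * 2^30, rounded to nearest */
}
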
*/ + + if( x>exp2m1_fxp_max() ) return UINT64_MAX; + + uint64_t i = x >> 30; /* In [0,34) */ + uint64_t d = x & ((UINT64_C(1)<<30)-UINT64_C(1)); /* In [0,2^30) */ + + /* Estimate y ~ 2^64 exp2m1( d/2^30 ) */ + + uint64_t y; + + /* BEGIN AUTOGENERATED CODE - See KJB for code generator */ +# if EXP2M1_FXP_ORDER==1 + /* bits 4.0 rms_aerr 2.6e-02 rms_rerr 1.8e-02 max_aerr 8.6e-02 max_rerr 6.1e-02 */ + /* As implemented: bits 4.0 max_aerr 8.6e-02 max_rerr 6.1e-02 ulp 92418389.1 */ + y = d<<34; +# elif EXP2M1_FXP_ORDER==2 + /* bits 8.3 rms_aerr 2.7e-03 rms_rerr 2.0e-03 max_aerr 3.9e-03 max_rerr 3.2e-03 */ + /* As implemented: bits 8.3 max_aerr 3.9e-03 max_rerr 3.2e-03 ulp 4173744.7 */ + y = UINT64_C(0x2c029d07d); + y = UINT64_C(0x29feb17c1) + ((y*d)>>31); + y = (y*d); +# elif EXP2M1_FXP_ORDER==3 + /* bits 13.0 rms_aerr 1.1e-04 rms_rerr 7.9e-05 max_aerr 1.7e-04 max_rerr 1.2e-04 */ + /* As implemented: bits 13.0 max_aerr 1.7e-04 max_rerr 1.2e-04 ulp 183713.4 */ + y = UINT64_C(0x288319c3e); + y = UINT64_C(0x1cd1735e6) + ((y*d)>>32); + y = UINT64_C(0x2c86e3185) + ((y*d)>>31); + y = (y*d); +# elif EXP2M1_FXP_ORDER==4 + /* bits 17.6 rms_aerr 3.3e-06 rms_rerr 2.4e-06 max_aerr 5.6e-06 max_rerr 5.0e-06 */ + /* As implemented: bits 17.6 max_aerr 5.6e-06 max_rerr 5.0e-06 ulp 5998.7 */ + y = UINT64_C(0x38100ce15); + y = UINT64_C(0x1a7f168b9) + ((y*d)>>33); + y = UINT64_C(0x1eeba70d4) + ((y*d)>>32); + y = UINT64_C(0x2c5a09747) + ((y*d)>>31); + y = (y*d); +# elif EXP2M1_FXP_ORDER==5 + /* bits 22.8 rms_aerr 9.1e-08 rms_rerr 6.7e-08 max_aerr 1.5e-07 max_rerr 1.4e-07 */ + /* As implemented: bits 22.8 max_aerr 1.5e-07 max_rerr 1.4e-07 ulp 165.8 */ + y = UINT64_C(0x3e1a2fa1b); + y = UINT64_C(0x24a7ddfee) + ((y*d)>>33); + y = UINT64_C(0x1c994ed30) + ((y*d)>>33); + y = UINT64_C(0x1ebd13698) + ((y*d)>>32); + y = UINT64_C(0x2c5c9fe11) + ((y*d)>>31); + y = (y*d); +# elif EXP2M1_FXP_ORDER==6 + /* bits 27.9 rms_aerr 2.1e-09 rms_rerr 1.6e-09 max_aerr 4.2e-09 max_rerr 3.9e-09 */ + /* As implemented: bits 27.8 max_aerr 4.6e-09 max_rerr 4.3e-09 ulp 5.0 */ + y = UINT64_C(0x3959e8bc0); + y = UINT64_C(0x28987867f) + ((y*d)>>33); + y = UINT64_C(0x27aac1b83) + ((y*d)>>33); + y = UINT64_C(0x1c67f6aa0) + ((y*d)>>33); + y = UINT64_C(0x1ebfde70a) + ((y*d)>>32); + y = UINT64_C(0x2c5c8510d) + ((y*d)>>31); + y = (y*d); +# elif EXP2M1_FXP_ORDER==7 + /* bits 33.4 rms_aerr 4.7e-11 rms_rerr 3.5e-11 max_aerr 9.9e-11 max_rerr 9.0e-11 */ + /* As implemented: bits 30.8 max_aerr 5.9e-10 max_rerr 5.4e-10 ulp 0.6 */ + y = UINT64_C(0x2d6e5bd1d); + y = UINT64_C(0x257992e8b) + ((y*d)>>33); + y = UINT64_C(0x2c02265a3) + ((y*d)>>33); + y = UINT64_C(0x27607eb13) + ((y*d)>>33); + y = UINT64_C(0x1c6b30b08) + ((y*d)>>33); + y = UINT64_C(0x1ebfbcc25) + ((y*d)>>32); + y = UINT64_C(0x2c5c8604f) + ((y*d)>>31); + y = (y*d); +# else +# error "Unsupported EXP2M1_FXP_ORDER" +# endif + /* END AUTOGENERATED CODE */ + + /* Note: (y + 2^(s-1)) / 2^s does a divide with grade school rounding + (rounds to nearest with ties away from 0). The add will not + overflow as y is <= 2^64-2^34 at this point and 2^(s-1) is at most + 2^33. 
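
/* Aside (not part of the diff): the "add half then shift" rounding
   used by the return statement below, in isolation.  For s in [1,63],
   (y + 2^(s-1)) >> s divides by 2^s rounding to nearest with ties away
   from zero, provided the add cannot overflow (as argued above);
   rshift_rna is an illustrative name. */

#include <stdint.h>
#include <stdio.h>

static uint64_t rshift_rna( uint64_t y, int s ) { return (y + (UINT64_C(1)<<(s-1))) >> s; }

int
main( void ) {
  /* Divide by 2^2 = 4 with rounding: 5/4 -> 1, 6/4 -> 2 (tie away from zero), 7/4 -> 2 */
  printf( "%lu %lu %lu\n",
          (unsigned long)rshift_rna( UINT64_C(5), 2 ),
          (unsigned long)rshift_rna( UINT64_C(6), 2 ),
          (unsigned long)rshift_rna( UINT64_C(7), 2 ) );
  return 0;
}
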
*/ + + int s = 34-(int)i; /* In [1,34] so no UB in below shifts */ + return ((y+(UINT64_C(1) << (s-1))) >> s) + (UINT64_C(1)<<(64-s)) - (UINT64_C(1)<<30); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_util_exp2m1_h_ */ diff --git a/program/src/oracle/util/fxp.h b/program/src/oracle/util/fxp.h new file mode 100644 index 000000000..793cbfd94 --- /dev/null +++ b/program/src/oracle/util/fxp.h @@ -0,0 +1,935 @@ +#ifndef _pyth_oracle_util_fxp_h_ +#define _pyth_oracle_util_fxp_h_ + +/* Large set of primitives for portable fixed point arithmetic with + correct rounding and/or overflow detection. Strongly targeted at + platforms with reasonable performance C-style 64b unsigned integer + arithmetic under the hood. Likewise, strongly targets unsigned fixed + point representations that fit within 64b and have 30 fractional + bits. */ + +#include /* For INT_MIN */ +#include "uwide.h" +#include "sqrt.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* FIXED POINT ADDITION ***********************************************/ + +/* Compute: + (c 2^64 + z)/2^30 = x/2^30 + y/2^30 + Exact. c will be in [0,1]. Fast variant assumes that the user knows + c is zero or is not needed. */ + +static inline uint64_t fxp_add( uint64_t x, uint64_t y, uint64_t * _c ) { *_c = (uint64_t)(x>(~y)); return x + y; } +static inline uint64_t fxp_add_fast( uint64_t x, uint64_t y ) { return x + y; } + +/* FIXED POINT SUBTRACTION ********************************************/ + +/* Compute: + z/2^30 = (2^64 b + x)/2^30 - y/2^30 + Exact. b will be in [0,1]. Fast variant assumes that the user knows + b is zero (i.e. x>=y) or is not needed. */ + +static inline uint64_t fxp_sub( uint64_t x, uint64_t y, uint64_t * _b ) { *_b = (uint64_t)(x>30; return (xh<<34) | (xl>>30); } + +/* rtz -> Round toward zero (aka truncate rounding) + Fast variant assumes x*y < 2^64 (i.e. exact result < ~2^4). + Based on: + z/2^30 ~ (x/2^30)(y/2^30) + -> z ~ x y / 2^30 + With RTZ rounding: + z = floor( x y / 2^30 ) + = (x*y) >> 30 + As x*y might overflow 64 bits, we need to do a 64b*64b->128b + multiplication (uwide_mul) and a 128b shift right by 30 + (fxp_mul_contract). + + Fastest style of rounding. Rounding error in [0,1) ulp. + (ulp==2^-30). */ + +static inline uint64_t +fxp_mul_rtz( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint64_t zh,zl; uwide_mul( &zh,&zl, x, y ); /* <= 2^128 - 2^65 + 1 so no overflow */ + return fxp_mul_contract( zh,zl, _c ); +} + +static inline uint64_t fxp_mul_rtz_fast( uint64_t x, uint64_t y ) { return (x*y) >> 30; } + +/* raz -> Round away from zero + Fast variant assumes x*y < 2^64-2^30+1 (i.e. exact result < ~2^4) + Based on: + z/2^30 ~ (x/2^30)(y/2^30) + -> z ~ x y / 2^30 + With RAZ rounding: + z = ceil( x y / 2^30 ) + = floor( (x y + 2^30 - 1) / 2^30 ) + = (x*y + 2^30 -1) >> 30 + As x*y might overflow 64 bits, we need to do a 64b*64b->128b + multiplication (uwide_mul), a 64b increment of a 128b (uwide_inc) and + a 128b shift right by 30 (fxp_mul_contract). + + Slightly more expensive (one uwide_inc) than RTZ rounding. Rounding + error in (-1,0] ulp. (ulp==2^-30). 
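
/* Aside (not part of the diff): what the 64-bit representation with 30
   fractional bits means in practice.  A real value v is stored as
   x ~ round(v*2^30); truncating multiplication of two such values is
   then (x*y)>>30 whenever the product fits in 64 bits, exactly the
   fxp_mul_rtz_fast case above.  dbl_to_fxp / fxp_to_dbl are
   illustrative host-side helpers, not part of the header's API. */

#include <stdint.h>
#include <stdio.h>

static uint64_t dbl_to_fxp( double v )   { return (uint64_t)( v*1073741824. + 0.5 ); } /* v*2^30, rounded */
static double   fxp_to_dbl( uint64_t x ) { return ((double)x)/1073741824.; }           /* x/2^30 */

int
main( void ) {
  uint64_t x = dbl_to_fxp( 1.5  );
  uint64_t y = dbl_to_fxp( 2.25 );
  uint64_t z = (x*y) >> 30;             /* fast rtz multiply: valid here since 1.5*2.25 << 2^4 */
  printf( "%.6f\n", fxp_to_dbl( z ) );  /* prints 3.375000 */
  return 0;
}
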
*/ + +static inline uint64_t +fxp_mul_raz( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint64_t zh,zl; uwide_mul( &zh,&zl, x, y ); /* <= 2^128 - 2^65 + 1 so no overflow */ + uwide_inc( &zh,&zl, zh,zl, (UINT64_C(1)<<30)-UINT64_C(1) ); /* <= 2^128 - 2^65 + 2^30 so no overflow */ + return fxp_mul_contract( zh,zl, _c ); +} + +static inline uint64_t fxp_mul_raz_fast( uint64_t x, uint64_t y ) { return (x*y+((UINT64_C(1)<<30)-UINT64_C(1))) >> 30; } + +/* rnz -> Round nearest with ties toward zero + Fast variant assumes x*y < 2^64-2^29+1 (i.e. exact result < ~2^4) + Based on: + z/2^30 ~ (x/2^30)(y/2^30) + -> z ~ x y / 2^30 + Let frac be the least significant 30 bits of x*y. If frac<2^29 + (frac>2^29), we should round down (up). frac==2^29 is a tie and we + should round down. If we add 2^29-1 to frac, the result will be + <2^30 when frac<=2^29 and will be >=2^30. Thus bit 30 of frac+2^29-1 + indicates whether we need to round up or down. This yields: + z = floor( x y + 2^29 - 1 ) / 2^30 ) + = (x*y + 2^29 - 1) >> 30 + As x*y might overflow 64 bits, we need to do a 64b*64b->128b + multiplication (uwide_mul), a 64b increment of a 128b (uwide_inc) and + a 128b shift right by 30 (fxp_mul_contract). + + Slightly more expensive (one uwide_inc) than RTZ rounding. Rounding + error in (-1/2,1/2] ulp. (ulp==2^-30). */ + +static inline uint64_t +fxp_mul_rnz( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint64_t zh,zl; uwide_mul( &zh,&zl, x, y ); /* <= 2^128 - 2^65 + 1 so no overflow */ + uwide_inc( &zh,&zl, zh,zl, (UINT64_C(1)<<29)-UINT64_C(1) ); /* <= 2^128 - 2^65 + 2^29 so no overflow */ + return fxp_mul_contract( zh,zl, _c ); +} + +static inline uint64_t fxp_mul_rnz_fast( uint64_t x, uint64_t y ) { return (x*y+((UINT64_C(1)<<29)-UINT64_C(1))) >> 30; } + +/* rna -> Round nearest with ties away from zero (aka grade school rounding) + Fast variant assumes x*y < 2^64-2^29 (i.e. exact result < ~2^4) + Based on: + z/2^30 ~ (x/2^30)(y/2^30) + -> z ~ x y / 2^30 + Let frac be the least significant 30 bits of x*y. If frac<2^29 + (frac>2^29), we should round down (up). frac==2^29 is a tie and we + should round up. If we add 2^29-1 to frac, the result will be <2^30 + when frac<=2^29 and will be >=2^30. Thus bit 30 of frac+2^29 + indicates whether we need to round up or down. This yields: + z = floor( x y + 2^29 ) / 2^30 ) + = (x*y + 2^29) >> 30 + As x*y might overflow 64 bits, we need to do a 64b*64b->128b + multiplication (uwide_mul), a 64b increment of a 128b (uwide_inc) and + a 128b shift right by 30 (fxp_mul_contract). + + Slightly more expensive (one uwide_inc) than RTZ rounding. Rounding + error in [-1/2,1/2) ulp. (ulp==2^-30). */ + +static inline uint64_t +fxp_mul_rna( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint64_t zh,zl; uwide_mul( &zh,&zl, x, y ); /* <= 2^128 - 2^65 + 1 so no overflow */ + uwide_inc( &zh,&zl, zh,zl, UINT64_C(1)<<29 ); /* <= 2^128 - 2^65 + 2^29 so no overflow */ + return fxp_mul_contract( zh,zl, _c ); +} + +static inline uint64_t fxp_mul_rna_fast( uint64_t x, uint64_t y ) { return (x*y+(UINT64_C(1)<<29)) >> 30; } + +/* rne -> Round toward nearest with ties toward even (aka banker's rounding / IEEE style rounding) + Fast variant assumes x*y < 2^64-2^29 (i.e. exact result < ~2^4) + Based on the observation that rnz / rna rounding should be used when + floor(x*y/2^30) is even/odd. That is, use the rnz / rna increment of + 2^29-1 / 2^29 when bit 30 of x*y is 0 / 1. 
As x*y might overflow 64 + bits, we need to do a 64b*64b->128b multiplication (uwide_mul), a 64b + increment of a 128b (uwide_inc) and a 128b shift right by 30 + (fxp_mul_contract). + + The most accurate style of rounding usually and somewhat more + expensive (some sequentially dependent bit ops and one uwide_inc) + than RTZ rounding. Rounding error in [-1/2,1/2] ulp (unbiased). + (ulp==2^-30). */ + +static inline uint64_t +fxp_mul_rne( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint64_t zh,zl; uwide_mul( &zh,&zl, x, y ); /* <= 2^128 - 2^65 + 1 */ + uint64_t t = ((UINT64_C(1)<<29)-UINT64_C(1)) + ((zl>>30) & UINT64_C(1)); /* t = 2^29-1 / 2^29 when bit 30 of x*y is 0 / 1 */ + uwide_inc( &zh,&zl, zh,zl, t ); /* <= 2^128 - 2^65 + 2^29 */ + return fxp_mul_contract( zh,zl, _c ); +} + +static inline uint64_t +fxp_mul_rne_fast( uint64_t x, + uint64_t y ) { + uint64_t z = x*y; + uint64_t t = ((UINT64_C(1)<<29)-UINT64_C(1)) + ((z>>30) & UINT64_C(1)); /* t = 2^29-1 / 2^29 when bit 30 of x*y is 0 / 1 */ + return (z + t) >> 30; +} + +/* rno -> Round toward nearest with ties toward odd + Fast variant assumes x*y < 2^64-2^29 (i.e. exact result < ~2^4) + Same as rne with the parity flipped for the increment. As x*y might + overflow 64 bits, we need to do a 64b*64b->128b multiplication + (uwide_mul), a 64b increment of a 128b (uwide_inc) and a 128b shift + right by 30 (fxp_mul_contract). + + Somewhat more expensive (some sequentially dependent bit ops and one + uwide_inc) than RTZ rounding. Rounding error in [-1/2,1/2] ulp + (unbiased). (ulp==2^-30). */ + +static inline uint64_t +fxp_mul_rno( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint64_t zh,zl; uwide_mul( &zh,&zl, x, y ); /* <= 2^128 - 2^65 + 1 */ + uint64_t t = (UINT64_C(1)<<29) - ((zl>>30) & UINT64_C(1)); /* t = 2^29 / 2^29-1 when bit 30 of x*y is 0 / 1 */ + uwide_inc( &zh,&zl, zh,zl, t ); /* <= 2^128 - 2^65 + 2^29 */ + return fxp_mul_contract( zh,zl, _c ); +} + +static inline uint64_t +fxp_mul_rno_fast( uint64_t x, + uint64_t y ) { + uint64_t z = x*y; + uint64_t t = (UINT64_C(1)<<29) - ((z>>30) & UINT64_C(1)); /* t = 2^29-1 / 2^29 when bit 30 of x*y is 0 / 1 */ + return (z + t) >> 30; +} + +/* Other rounding modes: + rdn -> Round down / toward floor / toward -inf ... same as rtz for unsigned + rup -> Round up / toward ceil / toward +inf ... same as raz for unsigned + rnd -> Round nearest with ties down / toward floor / toward -inf ... same as rnz for unsigned + rnu -> Round nearest with ties up / toward ceil / toward -inf ... 
same as rna for unsigned */ + +static inline uint64_t fxp_mul_rdn( uint64_t x, uint64_t y, uint64_t * _c ) { return fxp_mul_rtz( x, y, _c ); } +static inline uint64_t fxp_mul_rup( uint64_t x, uint64_t y, uint64_t * _c ) { return fxp_mul_raz( x, y, _c ); } +static inline uint64_t fxp_mul_rnd( uint64_t x, uint64_t y, uint64_t * _c ) { return fxp_mul_rnz( x, y, _c ); } +static inline uint64_t fxp_mul_rnu( uint64_t x, uint64_t y, uint64_t * _c ) { return fxp_mul_rna( x, y, _c ); } + +static inline uint64_t fxp_mul_rdn_fast( uint64_t x, uint64_t y ) { return fxp_mul_rtz_fast( x, y ); } +static inline uint64_t fxp_mul_rup_fast( uint64_t x, uint64_t y ) { return fxp_mul_raz_fast( x, y ); } +static inline uint64_t fxp_mul_rnd_fast( uint64_t x, uint64_t y ) { return fxp_mul_rnz_fast( x, y ); } +static inline uint64_t fxp_mul_rnu_fast( uint64_t x, uint64_t y ) { return fxp_mul_rna_fast( x, y ); } + +/* FIXED POINT DIVISION ***********************************************/ + +/* Compute: + (2^64 c + z)/2^30 ~ (x/2^30)/(y/2^30) + under various rounding modes. c<2^30 if y is non-zero. Returns + c=UINT64_MAX,z=0 if y is zero. */ + +/* Helper used by the below that computes + 2^64 yh + yl = 2^30 x + (i.e. widen a 64b number to 128b and shift it left by 30.) Exact. + yh<2^30. */ + +static inline void fxp_div_expand( uint64_t * _yh, uint64_t * _yl, uint64_t x ) { *_yh = x>>34; *_yl = x<<30; } + +/* rtz -> Round toward zero (aka truncate rounding) + Fast variant assumes y!=0 and x<2^34 (i.e. exact result < ~2^34) + Based on: + z/2^30 ~ (x/2^30) / (y/2^30) + -> z ~ 2^30 x / y + With RTZ rounding: + z = floor( 2^30 x / y ) + As 2^30 x might overflow 64 bits, we need to expand x + (fxp_div_expand) and then use a 128b/64b -> 128b divider. + (uwide_div). + + Fastest style of rounding. Rounding error in [0,1) ulp. + (ulp==2^-30). */ + +static inline uint64_t +fxp_div_rtz( uint64_t x, + uint64_t y, + uint64_t * _c ) { + if( !y ) { *_c = UINT64_MAX; return UINT64_C(0); } /* Handle divide by zero */ + uint64_t zh,zl; fxp_div_expand( &zh,&zl, x ); /* 2^30 x <= 2^94-2^30 so no overflow */ + uwide_div( &zh,&zl, zh,zl, y ); /* <= 2^94-2^30 so no overflow */ + *_c = zh; return zl; +} + +static inline uint64_t fxp_div_rtz_fast( uint64_t x, uint64_t y ) { return (x<<30) / y; } + +/* raz -> Round away from zero + Fast variant assumes y!=0 and 2^30*x+y-1<2^64 (i.e. exact result < ~2^34) + Based on: + z/2^30 ~ (x/2^30) / (y/2^30) + -> z ~ 2^30 x / y + With RAZ rounding: + z = ceil( 2^30 x / y ) + = floor( (2^30 x + y - 1) / y ) + As 2^30 x might overflow 64 bits, we need to expand x + (fxp_div_expand), increment it by the 64b y-1 (uwide_inc) and then + use a 128b/64b->128b divider (uwide_div). + + Slightly more expensive (one uwide_inc) than RTZ rounding. Rounding + error in (-1,0] ulp. (ulp==2^-30). */ + +static inline uint64_t +fxp_div_raz( uint64_t x, + uint64_t y, + uint64_t * _c ) { + if( !y ) { *_c = UINT64_MAX; return UINT64_C(0); } /* Handle divide by zero */ + uint64_t zh,zl; fxp_div_expand( &zh,&zl, x ); /* 2^30 x <= 2^94-2^30 so no overflow */ + uwide_inc( &zh,&zl, zh,zl, y-UINT64_C(1) ); /* <= 2^94+2^64-2^30-2 so no overflow */ + uwide_div( &zh,&zl, zh,zl, y ); /* = ceil( 2^30 x / y ) <= 2^94-2^30 so no overflow */ + *_c = zh; return zl; +} + +static inline uint64_t fxp_div_raz_fast( uint64_t x, uint64_t y ) { return ((x<<30)+(y-UINT64_C(1))) / y; } + +/* rnz -> Round nearest with ties toward zero + Fast variant assumes y!=0 and 2^30*x+floor((y-1)/2)<2^64 (i.e. 
exact result < ~2^34) + + Consider: + z = floor( (2^30 x + floor( (y-1)/2 )) / y ) + where y>0. + + If y is even: odd + z = floor( (2^30 x + (y/2) - 1) / y ) = floor( (2^30 x + (y-1)/2) / y ) + or: + z y + r = 2^30 x + (y/2)-1 = 2^30 x + (y-1)/2 + for some r in [0,y-1]. Or: + z y = 2^30 x + delta = 2^30 x + delta + where: + delta in [-y/2,y/2-1] in [-y/2+1/2,y/2-1/2] + or: + z = 2^30 x / y + epsilon = 2^30 x / y + epsilon + where: + epsilon in [-1/2,1/2-1/y] in [-1/2+1/(2y),1/2-1/(2y)] + Thus we have: + 2^30 x/y - 1/2 <= z < 2^30 x/y + 1/2 2^30 x/y - 1/2 < z < 2^30 x/y + 1/2 + + Combining yields: + + 2^30 x/y - 1/2 <= z < 2^30 x/y + 1/2 + + Thus, the z computed as per the above is the RNZ rounded result. As + 2^30 x might overflow 64 bits, we need to expand x (fxp_div_expand), + increment it by the 64b (y-1)>>1 (uwide_inc) and then use a + 128b/64b->128b divider (uwide_div). + + Slightly more expensive (one uwide_inc) than RTZ rounding. Rounding + error in (-1/2,1/2] ulp. (ulp==2^-30). */ + +static inline uint64_t +fxp_div_rnz( uint64_t x, + uint64_t y, + uint64_t * _c ) { + if( !y ) { *_c = UINT64_MAX; return UINT64_C(0); } /* Handle divide by zero */ + uint64_t zh,zl; fxp_div_expand( &zh,&zl, x ); /* 2^30 x <= 2^94-2^30 so no overflow */ + uwide_inc( &zh,&zl, zh,zl, (y-UINT64_C(1))>>1 ); /* <= 2^94-2^30 + 2^63-1 so no overflow */ + uwide_div( &zh,&zl, zh,zl, y ); /* <= ceil(2^30 x/y) <= 2^94-2^30 so no overflow */ + *_c = zh; return zl; +} + +static inline uint64_t fxp_div_rnz_fast( uint64_t x, uint64_t y ) { return ((x<<30)+((y-UINT64_C(1))>>1)) / y; } + +/* rna -> Round nearest with ties away from zero (aka grade school rounding) + Fast variant assumes y!=0 and 2^30*x+floor(y/2)<2^64 (i.e. exact result < ~2^34) + + Consider: + z = floor( (2^30 x + floor( y/2 )) / y ) + where y>0. + + If y is even: odd + z = floor( (2^30 x + (y/2)) / y ) = floor( (2^30 x + (y-1)/2) / y ) + or: + z y + r = 2^30 x + (y/2) = 2^30 x + (y-1)/2 + for some r in [0,y-1]. Or: + z y = 2^30 x + delta = 2^30 x + delta + where: + delta in [-y/2+1,y/2] in [-y/2+1/2,y/2-1/2] + or: + z = 2^30 x / y + epsilon = 2^30 x / y + epsilon + where: + epsilon in [-1/2+1/y,1/2] in [-1/2+1/(2y),1/2-1/(2y)] + Thus we have: + 2^30 x/y - 1/2 < z <= 2^30 x/y + 1/2 2^30 x/y - 1/2 < z < 2^30 x/y + 1/2 + + Combining yields: + + 2^30 x/y - 1/2 < z <= 2^30 x/y + 1/2 + + Thus, the z computed as per the above is the RNA rounded result. As + 2^30 x might overflow 64 bits, we need to expand x (fxp_div_expand), + increment it by the 64b y>>1 (uwide_inc) and then use a + 128b/64b->128b divider (uwide_div). + + Slightly more expensive (one uwide_inc) than RTZ rounding. Rounding + error in [-1/2,1/2) ulp. (ulp==2^-30). 
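
/* Aside (not part of the diff): the "add floor(y/2) before dividing"
   trick used by the rna variants, shown for plain unsigned division.
   div_rna(n,y) computes n/y rounded to nearest with ties away from
   zero, assuming y!=0 and n + y/2 does not overflow; the name is an
   assumption of this sketch. */

#include <stdint.h>
#include <stdio.h>

static uint64_t div_rna( uint64_t n, uint64_t y ) { return (n + (y>>1)) / y; }

int
main( void ) {
  /* 7/3 = 2.33.. -> 2, 8/3 = 2.66.. -> 3, 5/2 = 2.5 (tie) -> 3 */
  printf( "%lu %lu %lu\n",
          (unsigned long)div_rna( UINT64_C(7), UINT64_C(3) ),
          (unsigned long)div_rna( UINT64_C(8), UINT64_C(3) ),
          (unsigned long)div_rna( UINT64_C(5), UINT64_C(2) ) );
  return 0;
}
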
+ + Probably worth noting that if y has its least significant bit set, + all the rnz/rna/rne/rno modes yield the same result (as ties aren't + possible) and this is the cheapest of the round nearest modes.*/ + +static inline uint64_t +fxp_div_rna( uint64_t x, + uint64_t y, + uint64_t * _c ) { + if( !y ) { *_c = UINT64_MAX; return UINT64_C(0); } /* Handle divide by zero */ + uint64_t zh,zl; fxp_div_expand( &zh,&zl, x ); /* 2^30 x <= 2^94-2^30 so no overflow */ + uwide_inc( &zh,&zl, zh,zl, y>>1 ); /* <= 2^94-2^30 + 2^63-1 so no overflow */ + uwide_div( &zh,&zl, zh,zl, y ); /* <= ceil(2^30 x/y) <= 2^94-2^30 so no overflow */ + *_c = zh; return zl; +} + +static inline uint64_t fxp_div_rna_fast( uint64_t x, uint64_t y ) { return ((x<<30)+(y>>1)) / y; } + +/* rne -> Round nearest with ties toward even (aka banker's rounding / IEEE style rounding) + Fast variant assumes y!=0 and 2^30 x < 2^64 (i.e. exact result < ~2^34) + + Based on computing (when y>0): + + q y + r = 2^30 x + + where q = floor( 2^30 x / y ) and r is in [0,y-1]. + + If r < y/2, the result should round down. And if r > y/2 the result + should round up. If r==y/2 (which is only possible if y is even), + the result should round down / up when q is even / odd. + + Combining yields we need to round up when: + + r>floor(y/2) or (r==floor(y/2) and y is even and q is odd) + + As 2^30 x might overflow 64 bits, we need to expand x + (fxp_div_expand). Since we need both the 128b quotient and the 64b + remainder, we need a 128b/64b->128b,64b divider (uwide_divrem ... if + there was a way to quickly determine if floor( 2^30 x / y ) is even + or odd, we wouldn't need the remainder and could select the + appropriate RNZ/RNA based uwide_inc increment) and then a 128b + conditional increment (uwide_inc). + + The most accurate style of rounding usually and somewhat more + expensive (needs remainder, some sequentially dependent bit ops and + one uwide_inc) than RTZ rounding. Rounding error in [-1/2,1/2] ulp + (unbiased). (ulp==2^-30). */ + +static inline uint64_t +fxp_div_rne( uint64_t x, + uint64_t y, + uint64_t * _c ) { + if( !y ) { *_c = UINT64_MAX; return UINT64_C(0); } /* Handle divide by zero */ + uint64_t zh,zl; fxp_div_expand( &zh,&zl, x ); /* 2^30 x <= 2^94-2^30 so no overflow */ + uint64_t r = uwide_divrem( &zh,&zl, zh,zl, y ); /* *y + r = 2^30 x where r is in [0,y-1] so no overflow */ + uint64_t flhy = y>>1; /* floor(y/2) so no overflow */ + uwide_inc( &zh,&zl, zh,zl, (uint64_t)( (r>flhy) | ((r==flhy) & !!((~y) & zl & UINT64_C(1))) ) ); /* <= ceil( 2^30 x / y ) <= 2^94-2^30 so no overflow */ + *_c = zh; return zl; +} + +static inline uint64_t +fxp_div_rne_fast( uint64_t x, + uint64_t y ) { + uint64_t n = x << 30; + uint64_t q = n / y; + uint64_t r = n - q*y; + uint64_t flhy = y>>1; + return q + (uint64_t)( (r>flhy) | ((r==flhy) & !!((~y) & q & UINT64_C(1))) ); +} + +/* rno -> Round nearest with ties toward odd + Fast variant assumes y!=0 and 2^30 x < 2^64 (i.e. exact result < ~2^34) + + Similar considerations as RNE with the parity for rounding on ties + swapped. + + Somewhat more expensive (needs remainder, some sequentially dependent + bit ops and one uwide_inc) than RTZ rounding. Rounding error in + [-1/2,1/2] ulp (unbiased). (ulp==2^-30). 
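
/* Aside (not part of the diff): the round-to-nearest-even decision
   used by the rne variant, applied to plain unsigned operands so it
   can be checked by hand.  Rounds up when the remainder exceeds
   floor(y/2), or when it equals floor(y/2) (a true tie, only possible
   for even y) and the truncated quotient is odd.  div_rne is an
   illustrative name; it assumes y!=0. */

#include <stdint.h>
#include <stdio.h>

static uint64_t
div_rne( uint64_t n, uint64_t y ) {
  uint64_t q    = n / y;
  uint64_t r    = n - q*y;
  uint64_t flhy = y>>1;
  return q + (uint64_t)( (r>flhy) | ((r==flhy) & !!((~y) & q & UINT64_C(1))) );
}

int
main( void ) {
  /* Ties go to the even quotient: 5/2 = 2.5 -> 2, 7/2 = 3.5 -> 4, 8/3 = 2.66.. -> 3 */
  printf( "%lu %lu %lu\n",
          (unsigned long)div_rne( UINT64_C(5), UINT64_C(2) ),
          (unsigned long)div_rne( UINT64_C(7), UINT64_C(2) ),
          (unsigned long)div_rne( UINT64_C(8), UINT64_C(3) ) );
  return 0;
}
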
*/ + +static inline uint64_t +fxp_div_rno( uint64_t x, + uint64_t y, + uint64_t * _c ) { + if( !y ) { *_c = UINT64_MAX; return UINT64_C(0); } /* Handle divide by zero */ + uint64_t zh,zl; fxp_div_expand( &zh,&zl, x ); /* 2^30 x <= 2^94-2^30 so no overflow */ + uint64_t r = uwide_divrem( &zh,&zl, zh,zl, y ); /* *y + r = 2^30 x where r is in [0,y-1] so no overflow */ + uint64_t flhy = y>>1; /* floor(y/2) so no overflow */ + uwide_inc( &zh,&zl, zh,zl, (uint64_t)( (r>flhy) | ((r==flhy) & !!((~y) & (~zl) & UINT64_C(1))) ) ); /* <= ceil( 2^30 x / y ) <= 2^94-2^30 so no overflow */ + *_c = zh; return zl; +} + +static inline uint64_t +fxp_div_rno_fast( uint64_t x, + uint64_t y ) { + uint64_t n = x << 30; + uint64_t q = n / y; + uint64_t r = n - q*y; + uint64_t flhy = y>>1; + return q + (uint64_t)( (r>flhy) | ((r==flhy) & !!((~y) & (~q) & UINT64_C(1))) ); +} + +/* Other rouding modes: + rdn -> Round down / toward floor / toward -inf ... same as rtz for unsigned + rup -> Round up / toward ceil / toward +inf ... same as raz for unsigned + rnd -> Round nearest with ties down / toward floor / toward -inf ... same as rnz for unsigned + rnu -> Round nearest with ties up / toward ceil / toward -inf ... same as rna for unsigned */ + +static inline uint64_t fxp_div_rdn( uint64_t x, uint64_t y, uint64_t * _c ) { return fxp_div_rtz( x, y, _c ); } +static inline uint64_t fxp_div_rup( uint64_t x, uint64_t y, uint64_t * _c ) { return fxp_div_raz( x, y, _c ); } +static inline uint64_t fxp_div_rnd( uint64_t x, uint64_t y, uint64_t * _c ) { return fxp_div_rnz( x, y, _c ); } +static inline uint64_t fxp_div_rnu( uint64_t x, uint64_t y, uint64_t * _c ) { return fxp_div_rna( x, y, _c ); } + +static inline uint64_t fxp_div_rdn_fast( uint64_t x, uint64_t y ) { return fxp_div_rtz_fast( x, y ); } +static inline uint64_t fxp_div_rup_fast( uint64_t x, uint64_t y ) { return fxp_div_raz_fast( x, y ); } +static inline uint64_t fxp_div_rnd_fast( uint64_t x, uint64_t y ) { return fxp_div_rnz_fast( x, y ); } +static inline uint64_t fxp_div_rnu_fast( uint64_t x, uint64_t y ) { return fxp_div_rna_fast( x, y ); } + +/* FIXED POINT SQRT ***************************************************/ + +/* Compute: + z/2^30 ~ sqrt( x/2^30 ) + under various rounding modes. */ + +/* rtz -> Round toward zero (aka truncate rounding) + Fast variant assumes x<2^34 + Based on: + z/2^30 ~ sqrt( x/2^30) + -> z ~ sqrt( 2^30 x ) + With RTZ rounding: + z = floor( sqrt( 2^30 x ) ) + Fastest style of rounding. Rounding error in [0,1) ulp. + (ulp==2^-30). */ + +static inline uint64_t +fxp_sqrt_rtz( uint64_t x ) { + + /* Initial guess. Want to compute + y = sqrt( x 2^30 ) + but x 2^30 does not fit into 64-bits at this point. So we instead + approximate: + y = sqrt( x 2^(2s) 2^(30-2s) ) + = sqrt( x 2^(2s) ) 2^(15-s) + ~ floor( sqrt( x 2^(2s) ) ) 2^(15-s) + where s is the largest integer such that x 2^(2s) does not + overflow. */ + + int s = (63-log2_uint64( x )) >> 1; /* lg x in [34,63], 63-lg x in [0,29], s in [0,14] when x>=2^34 */ + if( s>15 ) s = 15; /* s==15 when x<2^34 */ + uint64_t y = sqrt_uint64( x << (s<<1) ) << (15-s); /* All shifts well defined */ + if( s==15 ) return y; /* No iteration if x<2^34 */ + + /* Expand x to 2^30 x for the fixed point iteration */ + uint64_t xh,xl; fxp_div_expand( &xh,&xl, x ); + for(;;) { + + /* Iterate y' = floor( (y(y+1) + 2^30 x) / (2y+1) ). 
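
/* Aside (not part of the diff): the y' = floor( (y(y+1) + X) / (2y+1) )
   iteration used above, in a standalone floor(sqrt(X)) for 32-bit X so
   the intermediates comfortably fit in 64 bits without uwide_mul.  The
   crude y=1 starting guess just costs extra iterations; the real
   sqrt_uint64 starts from a much better guess.  isqrt32 is an
   illustrative name. */

#include <stdint.h>
#include <stdio.h>

static uint32_t
isqrt32( uint32_t x ) {
  if( !x ) return 0U;
  uint64_t X = (uint64_t)x;
  uint64_t y = UINT64_C(1);
  for(;;) {
    uint64_t y_next = ( y*(y+UINT64_C(1)) + X ) / ( (y<<1)+UINT64_C(1) );
    if( y_next==y ) break;   /* fixed point == floor(sqrt(X)) */
    y = y_next;
  }
  return (uint32_t)y;
}

int
main( void ) {
  printf( "%u %u %u\n", isqrt32( 99U ), isqrt32( 100U ), isqrt32( 4294967295U ) ); /* 9 10 65535 */
  return 0;
}
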
This is the + same iteration as sqrt_uint{8,16,32,64} (which converges on the + floor(sqrt(x)) but applied to the (wider than 64b) quantity + 2^30*x and then starting from an exceptionally good guess (such + that ~2 iterations should be needed at most). */ + + uint64_t yh,yl; + uwide_mul( &yh,&yl, y,y+UINT64_C(1) ); + uwide_add( &yh,&yl, yh,yl, xh,xl, UINT64_C(0) ); + uwide_div( &yh,&yl, yh,yl, (y<<1)+UINT64_C(1) ); + if( yl==y ) break; + y = yl; + } + + return y; +} + +static inline uint64_t fxp_sqrt_rtz_fast( uint64_t x ) { return sqrt_uint64( x<<30 ); } + +/* raz -> Round away zero + Fast variant assumes x<2^34 + Based on: + z/2^30 ~ sqrt( x/2^30) + -> z ~ sqrt( 2^30 x ) + Let y by the RTZ rounded result: + y = floor( sqrt( 2^30 x ) ) + and consider the residual: + r = 2^30 x - y^2 + which, given the above, will be in [0,2y]. If r==0, the result + is exact and thus already correctly rounded. Otherwise, we need + to round up. We note that the residual of the RTZ iteration is + the same as this residual at convergence: + y = floor( (y^2 + y + 2^30 x) / (2y+1) ) + = (y^2 + y + 2^30 x - r') / (2y+1) + where r' in [0,2y]: + 2y^2 + y = y^2 + y + 2^30 x - r' + y^2 = 2^30 x - r' + r' = 2^30 x - y^2 + -> r' = r + Thus we can use explicitly compute the remainder or use uwide_divrem + in the iteration itself to produce the needed residual. + + Alternatively, the iteration + y = floor( (y^2 + y - 2 + 2^30 x) / (2y-1) ) + = floor( (y(y+1) + (2^30 x-2)) / (2y-1) ) + should converge on the RAZ rounded result as: + y = (y^2 + y - 2 + 2^30 x - r'') / (2y-1) + where r'' in [0,2y-2] + 2y^2 - y = y^2 + y - 2 + 2^30 x - r'' + y^2 - 2 y + 2 + r'' = 2^30 x + Thus at r'' = 0: + (y-1)^2 + 1 = 2^30 x + -> (y-1)^2 < 2^30 x + and at r'' = 2y-2 + y^2 = 2^30 x + such that: + (y-1)^2 < 2^30 x <= y^2 + which means y is the correctly rounded result. + + Slightly more expensive than RTZ rounding. Rounding error in (-1,0] + ulp. (ulp==2^-30). */ + +static inline uint64_t +fxp_sqrt_raz( uint64_t x ) { + uint64_t xh, xl; + + /* Same guess as rtz rounding */ + int s = (63-log2_uint64( x )) >> 1; + if( s>15 ) s = 15; + xl = x << (s<<1); + uint64_t y = sqrt_uint64( xl ) << (15-s); + if( s==15 ) return y + (uint64_t)!!(xl-y*y); /* Explicitly compute residual to round when no iteration needed */ + + /* Use the modified iteration to converge on raz rounded result */ + fxp_div_expand( &xh,&xl, x ); + uwide_dec( &xh,&xl, xh, xl, UINT64_C(2) ); + for(;;) { + uint64_t yh,yl; + uwide_mul( &yh,&yl, y, y+UINT64_C(1) ); + uwide_add( &yh,&yl, yh,yl, xh,xl, UINT64_C(0) ); + uwide_div( &yh,&yl, yh,yl, (y<<1)-UINT64_C(1) ); + if( yl==y ) break; + y = yl; + } + + return y; +} + +static inline uint64_t +fxp_sqrt_raz_fast( uint64_t x ) { + uint64_t xl = x<<30; + uint64_t y = sqrt_uint64( xl ); + uint64_t r = xl - y*y; + return y + (uint64_t)!!r; +} + +/* rnz/rna/rne/rno -> Round nearest with ties toward zero/away zero/toward even/toward odd + Fast variant assumes x<2^34 + Based on: + z/2^30 ~ sqrt( x/2^30) + -> z ~ sqrt( 2^30 x ) + Assuming there are no ties, we want to integer z such that: + (z-1/2)^2 < 2^30 x < (z+1/2)^2 + z^2 - z + 1/4 < 2^30 x < z^2 + z + 1/4 + since z is integral, this is equivalent to finding a z such that: + -> z^2 - z + 1 <= 2^30 x < z^2 + z + 1 + -> r = 2^30 x - (z^2 - z + 1) and is in [0,2z) + This suggests using the iteration: + z = floor( (z^2 + z - 1 + 2^30 x) / (2z) ) + = floor( (z(z+1) + (2^30 x-1)) / (2z) ) + which, at convergence, has: + 2z^2 = z^2 + z - 1 + 2^30 x - r' + where r' is in [0,2z). 
Solving for r', at convergence: + r' = 2^30 x - (z^2 - z + 1) + r' = r + Thus, this iteration converges to the correctly rounded z when there + are no ties. But this also demonstrates that no ties are possible + when z is integral ... the above derivation hold when either endpoint + of the initial inequality is closed because the endpoint values are + fraction and cannot be exactly met for any integral z. As such, + there are no ties and all round nearest styles can use the same + iteration for the sqrt function. + + For computing a faster result for small x, let y by the RTZ rounded + result: + y = floor( sqrt( 2^30 x ) ) + and consider the residual: + r'' = 2^30 x - y^2 + which, given the above, will be in [0,2y]. If r''==0, the result is + exact and thus already correctly rounded. Otherwise, let: + z = y when r''<=y and y+1 when when r''>y + Consider r''' from the above for this z. + r''' = 2^30 x - z^2 + = 2^30 x - y^2 when r''<=y and 2^30 x - (y+1)^2 o.w. + = r''' when r''<=y and r'' - 2y - 1 o.w. + -> r''' in [0,y] when r''<=y and in [y+1,2y]-2y-1 o.w. + -> r''' in [0,y] when r''<=y and in [-y,-1]-2y-1 o.w. + -> r''' in [-y,y] and is negative when r''>y + -> r''' in [-z+1,z] + This implies that we have for + 2^30 x - (z^2-z+1) = r''' + z-1 is in [0,2z) + As such, z computed by this method is also correctly rounded. Thus + we can use explicitly compute the remainder or use uwide_divrem in + the iteration itself to produce the needed residual. + + Very slightly more expensive than RTZ rounding. Rounding error in + (-1/2,1/2) ulp. (ulp==2^-30). */ + +static inline uint64_t +fxp_sqrt_rnz( uint64_t x ) { + uint64_t xh, xl; + + /* Same guess as rtz rounding */ + int s = (63-log2_uint64( x )) >> 1; + if( s>15 ) s = 15; + xl = x << (s<<1); + uint64_t y = sqrt_uint64( xl ) << (15-s); + if( s==15 ) return y + (uint64_t)((xl-y*y)>y); /* Explicitly compute residual to round when no iteration needed */ + + /* Use the modified iteration to converge on rnz rounded result */ + fxp_div_expand( &xh,&xl, x ); /* 2^30 x */ + uwide_dec( &xh,&xl, xh,xl, UINT64_C(1) ); /* 2^30 x - 1 */ + for(;;) { + uint64_t yh,yl; + uwide_mul( &yh,&yl, y,y+UINT64_C(1) ); /* y^2 + y */ + uwide_add( &yh,&yl, yh,yl, xh,xl, UINT64_C(0) ); /* y^2 + y - 1 + 2^30 x */ + uwide_div( &yh,&yl, yh,yl, y<<1 ); + if( yl==y ) break; + y = yl; + } + + return y; +} + +static inline uint64_t +fxp_sqrt_rnz_fast( uint64_t x ) { + uint64_t xl = x<<30; + uint64_t y = sqrt_uint64( xl ); + uint64_t r = xl - y*y; + return y + (uint64_t)(r>y); +} + +static inline uint64_t fxp_sqrt_rna( uint64_t x ) { return fxp_sqrt_rnz( x ); } +static inline uint64_t fxp_sqrt_rne( uint64_t x ) { return fxp_sqrt_rnz( x ); } +static inline uint64_t fxp_sqrt_rno( uint64_t x ) { return fxp_sqrt_rnz( x ); } + +static inline uint64_t fxp_sqrt_rna_fast( uint64_t x ) { return fxp_sqrt_rnz_fast( x ); } +static inline uint64_t fxp_sqrt_rne_fast( uint64_t x ) { return fxp_sqrt_rnz_fast( x ); } +static inline uint64_t fxp_sqrt_rno_fast( uint64_t x ) { return fxp_sqrt_rnz_fast( x ); } + +/* Other rouding modes: + rdn -> Round down / toward floor / toward -inf ... same as rtz for unsigned + rup -> Round up / toward ceil / toward +inf ... same as raz for unsigned + rnd -> Round nearest with ties down / toward floor / toward -inf ... same as rnz for unsigned + rnu -> Round nearest with ties up / toward ceil / toward -inf ... 
same as rna for unsigned */ + +static inline uint64_t fxp_sqrt_rdn( uint64_t x ) { return fxp_sqrt_rtz( x ); } +static inline uint64_t fxp_sqrt_rup( uint64_t x ) { return fxp_sqrt_raz( x ); } +static inline uint64_t fxp_sqrt_rnd( uint64_t x ) { return fxp_sqrt_rnz( x ); } +static inline uint64_t fxp_sqrt_rnu( uint64_t x ) { return fxp_sqrt_rnz( x ); } + +static inline uint64_t fxp_sqrt_rdn_fast( uint64_t x ) { return fxp_sqrt_rtz_fast( x ); } +static inline uint64_t fxp_sqrt_rup_fast( uint64_t x ) { return fxp_sqrt_raz_fast( x ); } +static inline uint64_t fxp_sqrt_rnd_fast( uint64_t x ) { return fxp_sqrt_rnz_fast( x ); } +static inline uint64_t fxp_sqrt_rnu_fast( uint64_t x ) { return fxp_sqrt_rnz_fast( x ); } + +/* FIXED POINT LOG2 ***************************************************/ + +/* Compute: + + e + f/2^30 ~ log2( x/2^30 ) + + If x is non-zero, e will be in [-30,33] and f will be in [0,2^30] + + Note: This is not guaranteed to be exactly rounded and thus this + doesn't have variants for every rounding mode under the sun. The + current implementation has <=2 ulp error with a round-nearest flavor + though. + + Given the round-nearest flavor, it is possible to have a f/2^30=1 + exactly (e.g. log2_approx( UINT64_MAX ) will have e=33 and f/2^30=1 + such that e still is exactly determined by the index of x's most + significant bit but the fractional part is 1 after rounding up. + + It is possible modify this while retaining a round-nearest flavor + such that e is strictly in [-30,34] and f/2^30 is strictly in [0,1) + (e will no longer strictly be determined the index of x's most + significant bit so technically this retains less info than the + above). + + Likewise, it is possible to modify this to use a round-toward-zero + flavor such that e will be in [-30,33] and f/2^30 is in [0,1) always. + The resulting approximation accuracy would be slightly lower and + slightly biased. + + If x is zero, the result is undefined mathematically (the limit + x->zero+ is -inf ... this implementation returns i=INT_MIN<<-30 and + f=0 for specificity. */ + +static inline int /* e, in [-30,33] (==log2_uint64(x)-30 ... see note above) */ +fxp_log2_approx( uint64_t * _f, /* f, in [0,2^30] (yes, closed ... see note above) */ + uint64_t x ) { + + /* Handle bad inputs */ + + if( !x ) { *_f = UINT64_C(0); return INT_MIN; } + + /* Crack x into: + + x = 2^i ( 1 + y/2^63 ) + + where y is in [0,2^63). This can always be done _exactly_ for + non-zero x. That is, i is the index of x's most significant + non-zero bit (in [0,63]) and y is trailing i bits shifted up to be + 63b wide. */ + + int i = log2_uint64( x ); /* In [0,63] */ + uint64_t y = (x << (63-i)) - (UINT64_C(1)<<63); /* In [0,2^63) */ + + /* Convert this to a fixed point approximation of x: + + x ~ 2^i ( 1 + d/2^30 ) + + via: + + d = floor( (y+2^32) / 2^33 ) + + This representation is still exact when i <= 30 and at most 1/2 ulp + error in d when i>30 (rna / round nearest with ties away from zero + rounding ... consider using ties toward even or truncate rounding + here as per note above). Given the use of a round nearest style, d + is in [0,2^30] (closed at both ends). */ + + uint64_t d = (y + (UINT64_C(1)<<32)) >> 33; + + /* Given this, we have: + + e + f/2^30 = log2( x/2^30 ) + = log2( x ) - 30 + ~ log2( 2^i ( 1+ d/2^30 ) ) - 30 + = i-30 + log2( 1 + d/2^30 ) + + From this, we identify: + + e = i-30 (exact) + f/2^30 ~ log2( 1 + d/2^30 ) (approximate) + + Thus, f of a 30b fixed point lg1p calculator with a 30b fixed point + input. 
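+
+      For example (illustrative only): x = 3*2^30 (3.0 in 34.30 fixed
+      point) gives i=31, y=2^62 and d=2^29, so e=1 and f/2^30 should
+      come out ~ log2(1.5) ~ 0.585, recovering log2(3) ~ 1.585.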
The below is automatically generated code for a fixed point + implementation of a minimax Pade(4,3) approximant to log2(1+x) over + the domain [0,1]. If exact math, the approximation has an accuracy + better than 1/2 ulp over the whole domain and is exact at the + endpoints. As implemented the accuracy is O(1) ulp over the whole + domain (with round nearest flavored rounding) style), monotonic and + still exact at the endpoints. */ + + uint64_t f; + uint64_t g; + + /* BEGIN AUTOGENERATED CODE */ + /* bits 31.8 rms_aerr 1.9e-10 rms_rerr 1.3e-10 max_aerr 2.7e-10 max_rerr 2.7e-10 */ + + f = UINT64_C(0x0000000245c36b35); /* scale 41 bout 34 bmul - */ + f = UINT64_C(0x000000029c5b8e15) + ((f*d)>>36); /* scale 35 bout 34 bmul 64 */ + f = UINT64_C(0x0000000303d59639) + ((f*d)>>32); /* scale 33 bout 34 bmul 64 */ + f = UINT64_C(0x00000001715475cc) + ((f*d)>>31); /* scale 32 bout 34 bmul 64 */ + f = (f*d); /* scale 62 bout 64 bmul 64 */ + /* f max 0xd1fb651800000000 */ + + g = UINT64_C(0x000000024357c946); /* scale 37 bout 34 bmul - */ + g = UINT64_C(0x00000002a94e3723) + ((g*d)>>33); /* scale 34 bout 34 bmul 64 */ + g = UINT64_C(0x000000018b7f484d) + ((g*d)>>32); /* scale 32 bout 34 bmul 64 */ + g = UINT64_C(0x0000000100000000) + ((g*d)>>30); /* scale 32 bout 34 bmul 64 */ + /* g max 0x0000000347ed945f */ + + f = (f + (g>>1)) / g; /* RNA style rounding */ + /* END AUTOGENERATED CODE */ + + *_f = f; return i-30; +} + +/* FIXED POINT EXPONENTIALS *******************************************/ + +/* fxp_exp2_approx computes: + + y/2^30 ~ exp2( x/2^30 ) + + with an error of O(1) ulp for x/2^30 < ~1. This works identical to + exp2m1.h but uses a minimax polynomial that is better than 0.5 ulp + accurate in exact arithmetic. As implemented this is +/-1 ulp of the + correctly rounded RNE result when x<=2^30, has the leading ~30 bits + correct for larger x and is exact for input values that yield exactly + representable outputs. Returns UINT64_MAX if output would overflow a + 34.30u input. */ + +static inline uint64_t +fxp_exp2_approx( uint64_t x ) { + uint64_t i = x >> 30; + if( i>=UINT64_C(34) ) return UINT64_MAX; + uint64_t d = x & ((UINT64_C(1)<<30)-UINT64_C(1)); + uint64_t y; + /* BEGIN AUTOGENERATED CODE */ + /* bits 33.8 rms_aerr 4.7e-11 rms_rerr 3.5e-11 max_aerr 6.7e-11 max_rerr 6.6e-11 */ + y = UINT64_C(0x00000002d6e2cc42); /* scale 49 bout 34 bmul - */ + y = UINT64_C(0x0000000257c0894c) + ((y*d)>>33); /* scale 46 bout 34 bmul 64 */ + y = UINT64_C(0x00000002c01421b9) + ((y*d)>>33); /* scale 43 bout 34 bmul 64 */ + y = UINT64_C(0x000000027609e3a4) + ((y*d)>>33); /* scale 40 bout 34 bmul 64 */ + y = UINT64_C(0x00000001c6b2ea70) + ((y*d)>>33); /* scale 37 bout 34 bmul 64 */ + y = UINT64_C(0x00000001ebfbce13) + ((y*d)>>32); /* scale 35 bout 34 bmul 64 */ + y = UINT64_C(0x00000002c5c8603b) + ((y*d)>>31); /* scale 34 bout 34 bmul 64 */ + y = (y*d); /* scale 64 bout 64 bmul 64 */ + /* END AUTOGENERATED CODE */ + int s = 34-(int)i; + return ((y + (UINT64_C(1)<<(s-1))) >> s) + (UINT64_C(1)<<(64-s)); +} + +/* fxp_rexp2_approx computes: + + y/2^30 ~ exp2( -x/2^30 ) + + with an error of O(1) ulp everywhere. This works identical to + rexp2.h but uses a minimax polynomial that is better than 0.5 ulp + accurate in exact arithmetic. As implemented this is +/-1 ulp of the + correctly rounded RNE result everywhere and exact for input values + that have exactly representable outputs. 
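+
+   A few illustrative spot checks (assumed, not a spec): an input of 0
+   should return 2^30 (exp2(-0)==1), an input of 2^30 should return
+   exactly 2^29 (exp2(-1)==1/2), and any input at or above 31*2^30
+   returns 0 since the result would round to zero at 30 fractional
+   bits.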
*/ + +static inline uint64_t +fxp_rexp2_approx( uint64_t x ) { + uint64_t i = x >> 30; + if( i>=UINT64_C(31) ) return UINT64_C(0); + uint64_t d = x & ((UINT64_C(1)<<30)-UINT64_C(1)); + uint64_t y; + /* BEGIN AUTOGENERATED CODE */ + /* bits 35.4 rms_aerr 2.4e-11 rms_rerr 1.4e-11 max_aerr 3.3e-11 max_rerr 2.2e-11 */ + y = UINT64_C(0x00000002d6e2c6a2); /* scale 50 bout 34 bmul - */ + y = UINT64_C(0x0000000269e37ccb) - ((y*d)>>34); /* scale 46 bout 34 bmul 64 */ + y = UINT64_C(0x00000002b83379a8) - ((y*d)>>33); /* scale 43 bout 34 bmul 64 */ + y = UINT64_C(0x00000002762c0cea) - ((y*d)>>33); /* scale 40 bout 34 bmul 64 */ + y = UINT64_C(0x00000001c6af4b81) - ((y*d)>>33); /* scale 37 bout 34 bmul 64 */ + y = UINT64_C(0x00000001ebfbd6aa) - ((y*d)>>32); /* scale 35 bout 34 bmul 64 */ + y = UINT64_C(0x00000002c5c85fb1) - ((y*d)>>31); /* scale 34 bout 34 bmul 64 */ + y = UINT64_C(0x8000000000000000) - ((y*d)>> 1); /* scale 63 bout 64 bmul 64 */ + /* END AUTOGENERATED CODE */ + int s = 33+(int)i; + return (y + (UINT64_C(1)<<(s-1))) >> s; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_util_fxp_h_ */ diff --git a/program/src/oracle/util/hash.h b/program/src/oracle/util/hash.h new file mode 100644 index 000000000..4bdd1023a --- /dev/null +++ b/program/src/oracle/util/hash.h @@ -0,0 +1,74 @@ +#ifndef _pyth_oracle_util_hash_h_ +#define _pyth_oracle_util_hash_h_ + +/* Useful hashing utilities */ + +#include "compat_stdint.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* High quality (full avalanche) high speed integer to integer hashing. + hash_uint32 has the properties that [0,2^32) hashes to a random + looking permutation of [0,2^32) and hash(0)==0. Similarly for + hash_uint64. Based on Google's Murmur3 hash finalizer (public + domain). Not cryptographically secure but passes various strict + tests of randomness when used as a PRNG (see prng.h). */ + +static inline uint32_t +hash_uint32( uint32_t x ) { + x ^= x >> 16; + x *= UINT32_C( 0x85ebca6b ); + x ^= x >> 13; + x *= UINT32_C( 0xc2b2ae35 ); + x ^= x >> 16; + return x; +} + +static inline uint64_t +hash_uint64( uint64_t x ) { + x ^= x >> 33; + x *= UINT64_C( 0xff51afd7ed558ccd ); + x ^= x >> 33; + x *= UINT64_C( 0xc4ceb9fe1a85ec53 ); + x ^= x >> 33; + return x; +} + +/* Inverses of the above. E.g.: + hash_inverse_uint32( hash_uint32( (uint32_t)x ) )==(uint32_t)x + and: + hash_uint32( hash_inverse_uint32( (uint32_t)x ) )==(uint32_t)x + Similarly for hash_inverse_uint64. These by themselves are similar + quality hashes to the above and having the inverses of the above can + be useful. The fact these have (nearly) identical operations / + operation counts concretely demonstrates that none of these are + standalone cryptographically secure. */ + +static inline uint32_t +hash_inverse_uint32( uint32_t x ) { + x ^= x >> 16; + x *= UINT32_C( 0x7ed1b41d ); + x ^= (x >> 13) ^ (x >> 26); + x *= UINT32_C( 0xa5cb9243 ); + x ^= x >> 16; + return x; +} + +static inline uint64_t +hash_inverse_uint64( uint64_t x ) { + x ^= x >> 33; + x *= UINT64_C( 0x9cb4b2f8129337db ); + x ^= x >> 33; + x *= UINT64_C( 0x4f74430c22a54005 ); + x ^= x >> 33; + return x; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_util_hash_h_ */ + diff --git a/program/src/oracle/util/log2.h b/program/src/oracle/util/log2.h new file mode 100644 index 000000000..4093f7d5f --- /dev/null +++ b/program/src/oracle/util/log2.h @@ -0,0 +1,108 @@ +#ifndef _pyth_oracle_util_log2_h_ +#define _pyth_oracle_util_log2_h_ + +/* Portable robust integer log base 2 (i.e. 
find most significant bit). + Define PYTH_ORACLE_UTIL_LOG2_STYLE to indicate the implementation + style: + 0 - portable + 1 - use gcc's compiler builtins assuming uint32_t/uint64_t is an + "unsigned int"/"unsigned long". + Default is 0. */ + +#include "compat_stdint.h" + +#ifndef PYTH_ORACLE_UTIL_LOG2_STYLE +#define PYTH_ORACLE_UTIL_LOG2_STYLE 0 +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* Compute z = floor( log_2( x ) ) for positive x (i.e. return the + index of the most significant bit set in x. Exact. Undefined + behavior if x is not positive (can depend on the implementation + style). For the portable style, the cost is a O(log_2 N) fast + integer ops where N is bit width of x. */ + +#if PYTH_ORACLE_UTIL_LOG2_STYLE==0 + +static inline int /* In [0,7] */ +log2_uint8( uint8_t _x ) { /* Positive */ + uint32_t x = (uint32_t)_x; + int z = 0; + if( x >= (UINT32_C(1)<< 4) ) z += 4, x >>= 4; + if( x >= (UINT32_C(1)<< 2) ) z += 2, x >>= 2; + if( x >= (UINT32_C(1)<< 1) ) z += 1; + return z; +} + +static inline int /* In [0,15] */ +log2_uint16( uint16_t _x ) { /* Positive */ + uint32_t x = (uint32_t)_x; + int z = 0; + if( x >= (UINT32_C(1)<< 8) ) z += 8, x >>= 8; + if( x >= (UINT32_C(1)<< 4) ) z += 4, x >>= 4; + if( x >= (UINT32_C(1)<< 2) ) z += 2, x >>= 2; + if( x >= (UINT32_C(1)<< 1) ) z += 1; + return z; +} + +static inline int /* In [0,31] */ +log2_uint32( uint32_t x ) { /* Positive */ + int z = 0; + if( x >= (UINT32_C(1)<<16) ) z += 16, x >>= 16; + if( x >= (UINT32_C(1)<< 8) ) z += 8, x >>= 8; + if( x >= (UINT32_C(1)<< 4) ) z += 4, x >>= 4; + if( x >= (UINT32_C(1)<< 2) ) z += 2, x >>= 2; + if( x >= (UINT32_C(1)<< 1) ) z += 1; + return z; +} + +static inline int /* In [0,63] */ +log2_uint64( uint64_t x ) { /* Positive */ + int z = 0; + if( x >= (UINT64_C(1)<<32) ) z += 32, x >>= 32; + if( x >= (UINT64_C(1)<<16) ) z += 16, x >>= 16; + if( x >= (UINT64_C(1)<< 8) ) z += 8, x >>= 8; + if( x >= (UINT64_C(1)<< 4) ) z += 4, x >>= 4; + if( x >= (UINT64_C(1)<< 2) ) z += 2, x >>= 2; + if( x >= (UINT64_C(1)<< 1) ) z += 1; + return z; +} + +#elif PYTH_ORACLE_UTIL_LOG2_STYLE==1 + +static inline int log2_uint8 ( uint8_t x ) { return 31-__builtin_clz ( (unsigned int )x ); } +static inline int log2_uint16( uint16_t x ) { return 31-__builtin_clz ( (unsigned int )x ); } +static inline int log2_uint32( uint32_t x ) { return 31-__builtin_clz ( (unsigned int )x ); } +static inline int log2_uint64( uint64_t x ) { return 63-__builtin_clzl( (unsigned long)x ); } + +#else +#error "Unsupported PYTH_ORACLE_UTIL_LOG2_STYLE" +#endif + +static inline int /* In [0, 6] */ log2_int8 ( int8_t x /* positive */ ) { return log2_uint8 ( (uint8_t )x ); } +static inline int /* In [0,14] */ log2_int16( int16_t x /* positive */ ) { return log2_uint16( (uint16_t)x ); } +static inline int /* In [0,30] */ log2_int32( int32_t x /* positive */ ) { return log2_uint32( (uint32_t)x ); } +static inline int /* In [0,62] */ log2_int64( int64_t x /* positive */ ) { return log2_uint64( (uint64_t)x ); } + +/* Same as the above but these variants return default when x is zero + such that they have no undefined behavior */ + +static inline int log2_uint8_def ( uint8_t x, int def ) { return x ? log2_uint8 ( x ) : def; } +static inline int log2_uint16_def( uint16_t x, int def ) { return x ? log2_uint16( x ) : def; } +static inline int log2_uint32_def( uint32_t x, int def ) { return x ? log2_uint32( x ) : def; } +static inline int log2_uint64_def( uint64_t x, int def ) { return x ? 
log2_uint64( x ) : def; } + +static inline int log2_int8_def ( int8_t x, int def ) { return (x>(int8_t )0) ? log2_int8 ( x ) : def; } +static inline int log2_int16_def( int16_t x, int def ) { return (x>(int16_t)0) ? log2_int16( x ) : def; } +static inline int log2_int32_def( int32_t x, int def ) { return (x>(int32_t)0) ? log2_int32( x ) : def; } +static inline int log2_int64_def( int64_t x, int def ) { return (x>(int64_t)0) ? log2_int64( x ) : def; } + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_util_log2_h_ */ + diff --git a/program/src/oracle/util/prng.h b/program/src/oracle/util/prng.h new file mode 100644 index 000000000..08e42f4ff --- /dev/null +++ b/program/src/oracle/util/prng.h @@ -0,0 +1,366 @@ +#ifndef _pyth_oracle_util_prng_h_ +#define _pyth_oracle_util_prng_h_ + +/* Simple fast high quality non-cryptographic pseudo random number + generator. Supports parallel generation, interprocess shared memory + usage, checkpointing, random access, reversible, atomic, etc. Passes + extremely strict tests of randomness. + + Assumes hash.h provides a high quality 64<>64-bit integer hash + functions (i.e. full avalanche) with the property hash_uint64(0)==0, + hash_uint64(i) for i in [0,2^64) yields a permutation of [0,2^64) and + also provides reasonably efficient inverse of this. */ + +#include +#include "hash.h" /* includes stdint.h */ + +/* prng_t should be treated as an opaque handle of a pseudo random + number generator. (It technically isn't here to faciliate inlining + of prng operations.) */ + +struct prng { + uint64_t seq; + uint64_t idx; +}; + +typedef struct prng prng_t; + +#ifdef __cplusplus +extern "C" { +#endif + +/* Private: randomly expands an arbitrary 32-bit value into a unique + 64-bit non-sparse value such that the original 32-bit value can be + recovered and that 0 expands to something non-zero (the non-sparse + expansion helps reduce correlations between different sequences, the + zero to non-zero expansion means the common initialization of seq=0, + idx=0 doesn't yield 0 for the first random value as would happen for + hash functions that have the property hash_uint64(0)==0, the XOR + 64-bit const here is zero in the lower 32-bits and an arbitrary + non-zero in the upper 32-bits). For the current hash_uint64 + implementation and XOR 64-bit constant, the pop count of the expanded + seq is ~32.000 +/- ~4.000 and in [8,56] for all possible values of + seq (i.e. the expanded seq popcount is well approximated as a normal + with mean 32 and rms 4 and the extremes are in line with the expected + extremes for 2^32 samples). */ + +static inline uint64_t +prng_private_expand( uint32_t seq ) { + return hash_uint64( UINT64_C( 0x900df00d00000000 ) ^ (uint64_t)seq ); +} + +/* Private: extract the original 32-bit seq from its expanded value */ + +static inline uint32_t +prng_private_contract( uint64_t seq ) { + return (uint32_t)hash_inverse_uint64( seq ); +} + +/* prng_footprint and prng_align give the needed footprint and alignment + for a memory region to hold a prng's state. malloc / alloca / + declaration friendly (e.g. a memory region declared as "prng_t + _prng[1];", or created by "malloc(sizeof(prng_t))" or created by + "alloca(sizeof(prng_t))" will all automatically have the needed + footprint and alignment). + + prng_new takes ownership of the memory region pointed to by mem + (which is assumed to be non-NULL with the appropriate footprint and + alingment) and formats it as a prng. 
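+
+   A minimal usage sketch (illustrative; mirrors the unit tests in this
+   patch):
+
+     prng_t _prng[1];
+     prng_t * prng = prng_join( prng_new( _prng, seq, idx ) );
+     ... prng_uint32( prng ), prng_double_c0( prng ), etc ...
+     prng_delete( prng_leave( prng ) );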
The pseudo random number + generator stream will initialized to use sequence random sequence + "seq" and will start at index "idx". Returns mem (which will be + formatted for use). The caller will not be joined to the region on + return. + + prng_join joins the caller to a memory region holding the state of a + prng. _prng points to a memory region in the local address space + that holds prng. Returns an opaque handle of the local join in the + local address space to the prng (which might not be the same thing as + _prng ... the separation of new and join is to facilitate + interprocess shared memory usage patterns while supporting + transparent upgrades users of this to more elaborate PRNG algorithms + where address translations under the hood may not be trivial). + + prng_leave leaves the current prng join. Returns a pointer in the + local address space to the memory region holding the state of the + prng. The prng join should not be used afterward. + + prng_delete unformats the memory region currently used to hold the + state of a _prng and returns ownership of the underlying memory + region to the caller. There should be no joins in the system on the + prng. Returns a pointer to the underlying memory region. + + FIXME: CONSIDER FLATTENING ALIGN? */ + +static inline uint64_t prng_footprint( void ) { return (uint64_t)sizeof ( prng_t ); } +static inline uint64_t prng_align ( void ) { return (uint64_t)alignof( prng_t ); } + +static inline void * +prng_new( void * mem, + uint32_t seq, + uint64_t idx ) { + prng_t * prng = (prng_t *)mem; + prng->seq = prng_private_expand( seq ); + prng->idx = idx; + return (void *)prng; +} + +static inline prng_t * prng_join ( void * _prng ) { return (prng_t *)_prng; } +static inline void * prng_leave ( prng_t * prng ) { return (void *) prng; } +static inline void * prng_delete( void * _prng ) { return (void *)_prng; } + +/* prng_seq returns the sequence used by the prng. prng_idx returns the + next index that will be consumed by the prng. */ + +static inline uint32_t prng_seq( prng_t * prng ) { return prng_private_contract( prng->seq ); } +static inline uint64_t prng_idx( prng_t * prng ) { return prng->idx; } + +/* prng_seq_set sets the sequence to be used by the prng and returns the + replaced value. prng_idx_set sets the next index that will be + consumed by the prng and returns the replaced value. */ + +static inline uint32_t +prng_seq_set( prng_t * prng, + uint32_t seq ) { + uint32_t old = prng_seq( prng ); + prng->seq = prng_private_expand( seq ); + return old; +} + +static inline uint64_t +prng_idx_set( prng_t * prng, + uint64_t idx ) { + uint64_t old = prng_idx( prng ); + prng->idx = idx; + return old; +} + +/* prng_uintN returns the next integer in the PRNG sequence in [0,2^N) + with uniform probability with a period of 2^64 (prng_uint64 has a + period of 2^63 as consumes two prng indices). Passes various strict + PRNG tests (e.g. diehard, dieharder, testu01, etc). Assumes prng is + a current join. prng_intN is the same as prng_uintN but returns a + value in [0,2^(N-1)). (A signed generator that can assume all + possible values of of a signed int uniform IID can be obtained by + casting the output of the unsigned generator of the same, assuming + typical twos complement arithmetic platform.) + + The theory for this that hash_uint64(i) for i in [0,2^64) specifies a + random looking permutation of the integers in [0,2^64). 
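+
+   (Operationally, given the seq/idx state above, the value consumed at
+   index idx of sequence seq is just hash_uint64( expanded_seq ^ idx ),
+   which is why checkpointing and random access amount to saving or
+   setting idx.)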
Returning + the low order bits of this random permutation then yields a high + quality non-cryptographic random number stream automatically as it: + + - Has a long period (2^64). This is implied by the permutation + property. + + - Has the expected random properties (as theoretically best possible + for a finite period generator) of a true uniform IID bit source. + For example, the probability of next random number is uniform and + independent of previous N random numbers for N<<2^64). This is + also implied by the full avalanche and permutation property. + + - Is "unpredictable". That is, the internal state of the generator + is difficult to recover from its outputs, e.g. a return from + prng_uint32 could be have been generated from 2^32 internal states + (if the sequence is known), up to 2^32 sequences (if the state is + known) and up to 2^64 (state,seq) pairs neither is known (the state + / sequence is potentially recoverable given a long enough stream of + values though). This is implied by the truncation of hash values. + + To turn this into parameterizable family of generators, note that + hash_uint64( perm_j( i ) ) where j is some parameterized family of + random permutations is still a permutation and would have all the + above properties for free so long as no perm_j is similar to the + inverse of the hash permutation. Practically, xoring i by a + non-sparse 64-bit number will ultra cheaply generate a wide family of + "good enough" permutations to do a parameterized shuffling of the + original hash_uint64 permutation, creating a large number of parallel + sequences. Since users are empirically notoriously bad at seeding + though, we only let the user specify a 32-bit sequence id and then + generate a unique non-sparse 64-bit random looking number from it. */ + +static inline uint8_t prng_uint8 ( prng_t * prng ) { return (uint8_t )hash_uint64( prng->seq ^ (prng->idx++) ); } +static inline uint16_t prng_uint16( prng_t * prng ) { return (uint16_t)hash_uint64( prng->seq ^ (prng->idx++) ); } +static inline uint32_t prng_uint32( prng_t * prng ) { return (uint32_t)hash_uint64( prng->seq ^ (prng->idx++) ); } + +static inline uint64_t +prng_uint64( prng_t * prng ) { + uint64_t hi = (uint64_t)prng_uint32( prng ); + return (hi<<32) | (uint64_t)prng_uint32( prng ); +} + +static inline int8_t prng_int8 ( prng_t * prng ) { return (int8_t )( prng_uint8 ( prng ) >> 1 ); } +static inline int16_t prng_int16( prng_t * prng ) { return (int16_t)( prng_uint16( prng ) >> 1 ); } +static inline int32_t prng_int32( prng_t * prng ) { return (int32_t)( prng_uint32( prng ) >> 1 ); } +static inline int64_t prng_int64( prng_t * prng ) { return (int64_t)( prng_uint64( prng ) >> 1 ); } + +/* These quickly and robustly convert uniform random integers into + uniform random floating point with appropriate rounding. Intervals + are: + + *_o -> (0,1) + *_c0 -> [0,1) + *_c1 -> (0,1] + *_c -> [0,1] + + To provide more specifics, let the real numbers from [0,1) be + partitioned into N uniform disjoint intervals such that interval i + goes from [i/N,(i+1)/N) where i is in [0,N). For single (double) + precision, "float" ("double"), the largest N for which the range of + each interval is _exactly_ representable is N = 2^24 (2^53). + + Given then a uniform IID uint32_t random input, the + prng_uint32_to_float_c0 converter output is as though an continuous + IID uniform random in [0,1) was generated and then rounded down to + the start of the containing interval (2^24 intervals). 
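+
+   (E.g., illustratively: u=0 maps to +0 and u=0xffffffff maps to
+   (2^24-1)/2^24 ~ 0.99999994, one interval below 1.)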
As such, this + generator can never return exactly 1 but it can exactly return +0. + Since floats have higher resolution near 0 than 1, this will not + return all float possible representations in [0,1) but it can return + all possible float representations in [1/2,1). In particular, this + will never return a denorm or -0. + + Similarly for prng_uint32_to_float_c1 converter rounds up to the end + of the containing interval / start of the next interval (2^24 + intervals). As such, this converter can never return exactly +0 but + it can exactly return 1. It will not return all possible float + representations in (0,1] but it can return all possible float + representations in [1/2,1]. This will never return a denorm or -0. + + The prng_uint32_to_float_c converter rounds toward nearest even + toward the start containing interval or start of the next interval + (2^24 intervals). As such, this can return both exactly +0 and + exactly 1 (and the probability of returning exactly +0 == probability + of exactly 1 == (1/2) probability all other possible return values). + It will not return all possible float representations in [0,1] but it + can return all float possible representations in [1/2,1]. This will + never return a denorm or -0. + + The prng_uint32_to_float_o converter rounds toward the middle of + containing internal (2^23 intervals ... note that then in a sense + this converter is 1-bit less accurate than the above). As such, this + can neither return +0 nor 1 and will not return all possible float + representations in (0,1). This will never return a denorm or -0. + + Similarly for the double variants (*_{c0,c1,c} uses 2^53 intervals + and o uses 2^52 intervals). + + Note that it is possible to make converters that will handle exactly + rounding toward all possible floating point representations in [0,1] + but this are significantly more expensive. + + Assumes IEEE-754 style float and doubles. + + FIXME: ADD UNIT TEST COVERAGE */ + +static inline float +prng_uint32_to_float_c0( uint32_t u ) { + return (1.f/(float )(INT32_C(1)<<24))*(float)(int32_t)( u>>(32-24) ); +} + +static inline float +prng_uint32_to_float_c1( uint32_t u ) { + return (1.f/(float )(INT32_C(1)<<24))*(float)(int32_t)((u>>(32-24))+ UINT32_C(1) ); +} + +static inline float +prng_uint32_to_float_c( uint32_t u ) { + return (1.f/(float )(INT32_C(1)<<24))*(float)(int32_t)((u>>(32-24))+(u&UINT32_C(1))); +} + +static inline float +prng_uint32_to_float_o( uint32_t u ) { + return (1.f/(float )(INT32_C(1)<<24))*(float)(int32_t)((u>>(32-24))| UINT32_C(1) ); +} + +static inline double +prng_uint64_to_double_c0( uint64_t u ) { + return (1. /(double)(INT64_C(1)<<53))*(double)(int64_t)( u>>(64-53) ); +} + +static inline double +prng_uint64_to_double_c1( uint64_t u ) { + return (1. /(double)(INT64_C(1)<<53))*(double)(int64_t)((u>>(64-53))+ UINT64_C(1) ); +} + +static inline double +prng_uint64_to_double_c( uint64_t u ) { + return (1. /(double)(INT64_C(1)<<53))*(double)(int64_t)((u>>(64-53))+(u&UINT64_C(1))); +} + +static inline double +prng_uint64_to_double_o( uint64_t u ) { + return (1. 
/(double)(INT64_C(1)<<53))*(double)(int64_t)((u>>(64-53))| UINT64_C(1) ); +} + +static inline float prng_float_c0 ( prng_t * prng ) { return prng_uint32_to_float_c0 ( prng_uint32( prng ) ); } +static inline float prng_float_c1 ( prng_t * prng ) { return prng_uint32_to_float_c1 ( prng_uint32( prng ) ); } +static inline float prng_float_c ( prng_t * prng ) { return prng_uint32_to_float_c ( prng_uint32( prng ) ); } +static inline float prng_float_o ( prng_t * prng ) { return prng_uint32_to_float_o ( prng_uint32( prng ) ); } + +static inline double prng_double_c0( prng_t * prng ) { return prng_uint64_to_double_c0( prng_uint64( prng ) ); } +static inline double prng_double_c1( prng_t * prng ) { return prng_uint64_to_double_c1( prng_uint64( prng ) ); } +static inline double prng_double_c ( prng_t * prng ) { return prng_uint64_to_double_c ( prng_uint64( prng ) ); } +static inline double prng_double_o ( prng_t * prng ) { return prng_uint64_to_double_o ( prng_uint64( prng ) ); } + +/* prng_int32_roll uses the given prng to roll an n-sided die where n + is the number of sides (assumed to be positive). That is returns + uniform IID rand in [0,n) even if n is not an exact power of two. + + Rejection method based. Specifically, the number of prng slots + consumed is typically 1 but theoretically might be higher + occassionally (64-bit wide types consume prng slots twice as fast). + + Deterministic prng slot consumption possible with a slightly more + approximate implementation (bound the number of iterations such that + this always consumes a fixed number of slot and accept the + practically infinitesimal bias when n is not a power of 2). + + FIXME: ADD UNIT TEST COVERAGE. */ + +static inline uint32_t +prng_private_roll32( prng_t * prng, + uint32_t n ) { + uint32_t r = (-n) % n; /* Compute 2^32 mod n = (2^32 - n) mod n = (-n) mod n, compile time for compile time n */ + uint32_t u; do u = prng_uint32( prng ); while( u64 + subtracts and 7 64*64->64 multiplies (similarly for the orders 2:6). + + REXP2_FXP_ORDER 5 is the default and has a worst case output accuracy + comparable IEEE single precision. + + In short, for integer x/2^30, it is the exact value correctly rounded + and, for IID random x in [0,2^30), the approximation errors can be + thought of as a random Gaussian perturbation on the output value such + that the approximation for a given order is the one that minimizes + the perturbation variance. */ + +#include "compat_stdint.h" + +#ifndef REXP2_FXP_ORDER +#define REXP2_FXP_ORDER 5 +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +static inline uint64_t +rexp2_fxp( uint64_t x ) { + + /* This calculation is based on splitting x into integer and + fractional bits such that: + x = 2^30 i + d + where d is in [0,2^30). Then we have: + y = 2^30 2^-i 2^( -d/2^30 ) + = (2^63 2^( -d/2^30 )) / 2^(63-30+i) + = (2^63 2^( -d/2^30 )) / 2^(33+i) + ~ floor( (2^63 2^(-d/2^30) + 2^(32+i)) / 2^(33+i) ) + Thus, the main challenge is computing an approximation of: + 2^63 2^(-d/2^30) + As this smoothly goes from 1 to 1/2, this is an ideal candidate for + a polynomial approximation (Pade' approximation might be even be + better if willing to accept the cost of the divide). We aren't too + particular about maxing out precision or precise rounding in the + interest of compactness and speed. 
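+
+      (Illustrative worked split: x = 5*2^29, i.e. 2.5, has i=2 and
+      d=2^29, so the result is ~ 2^30 * 2^-2 * 2^-0.5 ~ 189812531,
+      while exact integer inputs such as x = 2*2^30 reduce to
+      2^30>>2 = 2^28 exactly.)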
+ + The polynomial coefficients below were selected to minimize the RMS + error over the interval [0,1) while matching exactly at 0 and 1 + (RMS error was used over minimax as it has more intuitive error + properties for users typically). All but the leading coefficent + were scaled to be in the range [2^33,2^34) to minimize the + coefficient quantization error while yielding a value that cannot + not overflow 64-bits when multiplied by d. The intermediate shifts + needed for the Horner's rule application were then computed + accordingly. The leading coefficient scale was fixed at 2^63 to + enable quick-and-dirty implementation of rounding modes. */ + + uint64_t i = x >> 30; + if( i>UINT64_C(30) ) return UINT64_C(0); + + uint64_t d = x & ((UINT64_C(1)<<30)-UINT64_C(1)); + uint64_t y; + + /* BEGIN AUTOGENERATED CODE - See KJB for code generator */ +# if REXP2_FXP_ORDER==1 + /* bits 4.0 rms_aerr 1.3e-02 rms_rerr 1.8e-02 max_aerr 4.3e-02 max_rerr 6.1e-02 */ + /* As implemented: bits 4.0 max_aerr 4.3e-02 max_rerr 6.1e-02 ulp 46209195.0 */ + y = UINT64_C(0x8000000000000000) - (d<<32); +# elif REXP2_FXP_ORDER==2 + /* bits 8.3 rms_aerr 1.4e-03 rms_rerr 2.0e-03 max_aerr 1.9e-03 max_rerr 3.2e-03 */ + /* As implemented: bits 8.3 max_aerr 1.9e-03 max_rerr 3.2e-03 ulp 2086872.6 */ + y = UINT64_C(0x2c029d07d); + y = UINT64_C(0x2b00a741f) - ((y*d)>>32); + y = UINT64_C(0x8000000000000000) - ((y*d)>>1); +# elif REXP2_FXP_ORDER==3 + /* bits 13.0 rms_aerr 5.5e-05 rms_rerr 7.9e-05 max_aerr 8.6e-05 max_rerr 1.2e-04 */ + /* As implemented: bits 13.0 max_aerr 8.6e-05 max_rerr 1.2e-04 ulp 91856.9 */ + y = UINT64_C(0x288319c3e); + y = UINT64_C(0x3b33c6b15) - ((y*d)>>32); + y = UINT64_C(0x2c44c0101) - ((y*d)>>32); + y = UINT64_C(0x8000000000000000) - ((y*d)>>1); +# elif REXP2_FXP_ORDER==4 + /* bits 17.6 rms_aerr 1.6e-06 rms_rerr 2.4e-06 max_aerr 2.8e-06 max_rerr 5.0e-06 */ + /* As implemented: bits 17.6 max_aerr 2.8e-06 max_rerr 5.0e-06 ulp 2999.6 */ + y = UINT64_C(0x38100ce16); + y = UINT64_C(0x36871cfc4) - ((y*d)>>33); + y = UINT64_C(0x3d4dfa602) - ((y*d)>>32); + y = UINT64_C(0x2c5b2ce21) - ((y*d)>>32); + y = UINT64_C(0x8000000000000000) - ((y*d)>>1); +# elif REXP2_FXP_ORDER==5 + /* bits 22.8 rms_aerr 4.6e-08 rms_rerr 6.7e-08 max_aerr 7.7e-08 max_rerr 1.4e-07 */ + /* As implemented: bits 22.8 max_aerr 7.7e-08 max_rerr 1.4e-07 ulp 83.1 */ + y = UINT64_C(0x3e1a2f97e); + y = UINT64_C(0x25bc1de09) - ((y*d)>>34); + y = UINT64_C(0x38a155436) - ((y*d)>>32); + y = UINT64_C(0x3d7c8e03d) - ((y*d)>>32); + y = UINT64_C(0x2c5c78186) - ((y*d)>>32); + y = UINT64_C(0x8000000000000000) - ((y*d)>>1); +# elif REXP2_FXP_ORDER==6 + /* bits 27.9 rms_aerr 1.1e-09 rms_rerr 1.6e-09 max_aerr 2.1e-09 max_rerr 3.9e-09 */ + /* As implemented: bits 27.6 max_aerr 2.5e-09 max_rerr 4.9e-09 ulp 2.7 */ + y = UINT64_C(0x3959e0dfb); + y = UINT64_C(0x29cdf1eff) - ((y*d)>>34); + y = UINT64_C(0x273d8f899) - ((y*d)>>33); + y = UINT64_C(0x38d2ad669) - ((y*d)>>32); + y = UINT64_C(0x3d7f590ad) - ((y*d)>>32); + y = UINT64_C(0x2c5c85808) - ((y*d)>>32); + y = UINT64_C(0x8000000000000000) - ((y*d)>>1); +# elif REXP2_FXP_ORDER==7 + /* bits 33.5 rms_aerr 2.3e-11 rms_rerr 3.4e-11 max_aerr 4.4e-11 max_rerr 8.5e-11 */ + /* As implemented: bits 29.8 max_aerr 5.6e-10 max_rerr 1.1e-09 ulp 0.6 */ + y = UINT64_C(0x2d6cd448b); + y = UINT64_C(0x269cc5254) - ((y*d)>>34); + y = UINT64_C(0x2b82bc124) - ((y*d)>>33); + y = UINT64_C(0x2762b03ae) - ((y*d)>>33); + y = UINT64_C(0x38d5e75bc) - ((y*d)>>32); + y = UINT64_C(0x3d7f7ab76) - ((y*d)>>32); + y = UINT64_C(0x2c5c85fa8) - 
((y*d)>>32); + y = UINT64_C(0x8000000000000000) - ((y*d)>>1); +# else +# error "Unsupported REXP2_FXP_ORDER" +# endif + /* END AUTOGENERATED CODE */ + + /* At this point, y ~ 2^63 2^(-d/2^30) <= 2^63 while i is in [0,30]. + Do a rounding right shift to get the final result. */ + + int s = 33+(int)i; /* s is [33,63], s-1 in [32,62], y+(1<<(s-1)) at most <=2^63+2^62 and will not overflow */ + return (y + (UINT64_C(1)<<(s-1))) >> s; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_util_rexp2_h_ */ diff --git a/program/src/oracle/util/run_tests b/program/src/oracle/util/run_tests new file mode 100755 index 000000000..2d0d052af --- /dev/null +++ b/program/src/oracle/util/run_tests @@ -0,0 +1,69 @@ +#!/bin/sh + +module purge || exit 1 +module load gcc-9.3.0 || exit 1 + +./clean || exit 1 +mkdir -pv bin || exit 1 + +CC="gcc -g -Wall -Werror -Wextra -Wconversion -Wstrict-aliasing=2 -Wimplicit-fallthrough=2 -pedantic -D_XOPEN_SOURCE=600 -O2 -march=native -std=c17 -lm" + +set -x + +$CC test_align.c -o bin/test_align || exit 1 +$CC test_hash.c -o bin/test_hash || exit 1 +$CC test_prng.c -o bin/test_prng || exit 1 +$CC test_sar.c -o bin/test_sar || exit 1 +$CC test_round.c -o bin/test_round || exit 1 +$CC test_log2.c -o bin/test_log2 || exit 1 +$CC test_smag.c -o bin/test_smag || exit 1 +$CC test_rexp2.c -o bin/test_rexp2 || exit 1 +$CC test_exp2m1.c -o bin/test_exp2m1 || exit 1 +$CC test_uwide.c -o bin/test_uwide || exit 1 +$CC test_sqrt.c -o bin/test_sqrt || exit 1 +$CC test_fxp.c -o bin/test_fxp || exit 1 +$CC test_avg.c -o bin/test_avg || exit 1 + +bin/test_align || exit 1 +bin/test_hash || exit 1 +bin/test_prng || exit 1 +bin/test_sar || exit 1 +bin/test_round || exit 1 +bin/test_log2 || exit 1 +bin/test_smag || exit 1 +bin/test_rexp2 || exit 1 +bin/test_exp2m1 || exit 1 +bin/test_uwide || exit 1 +bin/test_sqrt || exit 1 +bin/test_fxp || exit 1 +bin/test_avg || exit 1 + +# PRNG batteries + +# Note that when running an individual test in a TestU01 test battery, +# the probability of that test failing even though the generator is fine +# is roughly 0.2%. Thus, when repeatedly running batteries that +# themselves contain a large number of tests, some test failures are +# expected. So long as the test failures are are sporadic, don't occur +# in the same place when running multiple times with random seq and/or +# idx, and don't have p-values improbably close to 0 +# (p<<<1/overall_num_tests_run) or 1 (1-p<<<1/overall_num_tests_run), it +# is expected and normal. Because of this, TestU01 doesn't just fail if +# a handful of test fails ... need to read the reports produced to see +# how well the generator is actually behaving. + +# Point the TESTU01 variable at where TestU01 is installed. Testing +# thus far has been done with TestU01-1.2.3. 
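+# For example (illustrative), to re-run one of the batteries below with
+# a different sequence and starting index:
+#
+#   bin/test_prng_battery 2 1 12345   # SmallCrush (seq 1, idx 12345)
+#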
+ +#TESTU01=~/nfs/TestU01 +# +#$CC test_prng_battery.c -o bin/test_prng_battery -I$TESTU01/include $TESTU01/lib/libtestu01.a $TESTU01/lib/libprobdist.a $TESTU01/lib/libmylib.a || exit 1 +# +#bin/test_prng_battery 0 0 0 || exit 1 # FIPS-140.2 (seq 0, idx 0) +#bin/test_prng_battery 1 0 0 || exit 1 # PseudoDiehard (seq 0, idx 0) +#bin/test_prng_battery 2 0 0 || exit 1 # SmallCrush (seq 0, idx 0) +#bin/test_prng_battery 3 0 0 || exit 1 # Crush: Takes >~20 min (seq 0, idx 0) +#bin/test_prng_battery 4 0 0 || exit 1 # BigCrush: Takes >~3 hours (seq 0, idx 0) + +echo all tests passed + diff --git a/program/src/oracle/util/sar.h b/program/src/oracle/util/sar.h new file mode 100644 index 000000000..1fbd8b30b --- /dev/null +++ b/program/src/oracle/util/sar.h @@ -0,0 +1,71 @@ +#ifndef _pyth_oracle_util_sar_h_ +#define _pyth_oracle_util_sar_h_ + +/* Portable arithmetic shift right. Define PYTH_ORACLE_UTIL_SAR_STYLE + to indicate the implementation style: + 0 - portable (i.e. platform/compiler does not do arithmetic right + shift for signed integers or is not known to) + 1 - platform/compiler does do arithmetic right shits. + Default is 0. */ + +#include "compat_stdint.h" + +#ifndef PYTH_ORACLE_UTIL_SAR_STYLE +#define PYTH_ORACLE_UTIL_SAR_STYLE 0 +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if PYTH_ORACLE_UTIL_SAR_STYLE==0 + +static inline int8_t +sar_int8( int8_t x, + int n ) { /* Assumed in [0,7] */ + uint8_t ux = (uint8_t)x; + uint8_t s = (uint8_t)-(ux >> 7); /* get the sign, ((int8_t)s)==-1 if x<0 and 0 otherwise */ + return (int8_t)(((ux ^ s) >> n) ^ s); +} + +static inline int16_t +sar_int16( int16_t x, + int n ) { /* Assumed in [0,15] */ + uint16_t ux = (uint16_t)x; + uint16_t s = (uint16_t)-(ux >> 15); /* get the sign, ((int16_t)s)==-1 if x<0 and 0 otherwise */ + return (int16_t)(((ux ^ s) >> n) ^ s); +} + +static inline int32_t +sar_int32( int32_t x, + int n ) { /* Assumed in [0,31] */ + uint32_t ux = (uint32_t)x; + uint32_t s = -(ux >> 31); /* get the sign, ((int32_t)s)==-1 if x<0 and 0 otherwise */ + return (int32_t)(((ux ^ s) >> n) ^ s); +} + +static inline int64_t +sar_int64( int64_t x, + int n ) { /* Assumed in [0,63] */ + uint64_t ux = (uint64_t)x; + uint64_t s = -(ux >> 63); /* get the sign, ((int64_t)s)==-1 if x<0 and 0 otherwise */ + return (int64_t)(((ux ^ s) >> n) ^ s); +} + +#elif PYTH_ORACLE_UTIL_SAR_STYLE==1 + +static inline int8_t sar_int8 ( int8_t x, int n ) { return (int8_t )(x >> n); } +static inline int16_t sar_int16( int16_t x, int n ) { return (int16_t)(x >> n); } +static inline int32_t sar_int32( int32_t x, int n ) { return (int32_t)(x >> n); } +static inline int64_t sar_int64( int64_t x, int n ) { return (int64_t)(x >> n); } + +#else +#error "Unsupported PYTH_ORACLE_UTIL_SAR_STYLE" +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_util_sar_h_ */ + diff --git a/program/src/oracle/util/smag.h b/program/src/oracle/util/smag.h new file mode 100644 index 000000000..daf8a8e10 --- /dev/null +++ b/program/src/oracle/util/smag.h @@ -0,0 +1,54 @@ +#ifndef _pyth_oracle_util_smag_h_ +#define _pyth_oracle_util_smag_h_ + +/* Signed magnitude conversions to/from twos complement + representations. */ + +#include "compat_stdint.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Unpack a twos complement represention into signed magnitude + representation. Returns magnitude, assumes _s is non-NULL and *_s + will be 0 or 1 on return. For the 32-bit variant, return will be in + [0,2^31] and *_s==1 if the return is 2^31. Similarly for the other + widths. 
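+
+   For example (illustrative): unpacking -5 yields magnitude 5 with
+   *_s==1, unpacking +5 yields 5 with *_s==0, and unpacking INT32_MIN
+   yields 2^31 with *_s==1 (the one magnitude that does not fit in a
+   positive int32_t, hence the unsigned return type).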
*/ + +static inline uint8_t smag_unpack_int8 ( int8_t x, int * _s ) { int s = x 2 y^2 = y^2 + x - r + -> y^2 = x - r + + for some r in [0,2 y-1]. We note that if y = floor( sqrt( x ) ) + exactly though: + + y^2 <= x < (y+1)^2 + -> y^2 <= x < y^2 + 2 y + 1 + -> y^2 = x - r' + + for some r' in [0,2 y]. r' is r with the element 2 y added. And it + is possible to have y^2 = x - 2 y. Namely if x+1 = z^2 for integer + z, this becomes y^2 + 2 y + 1 = z^2 -> y = z-1. That is, when + x = z^2 - 1 for integral z, the relationship can never converge. If + we instead used a denominator of 2y+1 in the iteration, r would have + the necessary range: + + y' = floor( (y^2 + x) / (2 y + 1) ) + + At convergence we have: + + y = (y^2 + x - r) / (2 y+1) + -> 2 y^2 + y = (y^2 + x - r) + -> y^2 = x-y-r + + for some r in [0,2 y]. This isn't quite right but we change the + recurrence numerator to compensate: + + y' = floor( (y^2 + y + x) / (2 y + 1) ) + + At convergence we now have: + + y^2 = x-r + + for some r in [0,2 y]. That is, at convergence y = floor( sqrt(x) ) + exactly! The addition of y to the numerator has not made + intermediate overflow much more diffcult to deal with either as y <<< + x for large x. So to compute this without intermediate overflow, we + compute the terms individually and then combine the reminders + appropriately. x/(2y+1) term is trivial. The other term, + (y^2+y)/(2y+1) is asymptotically approximately y/2. Breaking it into + its asymptotic and residual: + + (y^2 + y) / (2y+1) = y/2 + ( y^2 + y - (y/2)(2y+1) ) / (2y+1) + = y/2 + ( y^2 + y - y^2 - y/2 ) / (2y+1) + = y/2 + (y/2) / (2y+1) + + For even y, y/2 = y>>1 = yh and we have the partial quotient yh and + remainder yh. For odd y, we have: + + = yh + (1/2) + (yh+(1/2)) / (2y+1) + = yh + ((1/2)(2y+1)+yh+(1/2)) / (2y+1) + = yh + (y+yh+1) / (2y+1) + + with partial quotent yh and remainder y+yh+1. This yields the + iteration: + + y ~ sqrt(x) // <<< INT_MAX for all x + for(;;) { + d = 2*y + 1; + qx = x / d; rx = x - qx*d; // Compute x /(2y+1), rx in [0,2y] + qy = y>>1; ry = (y&1) ? (y+yh+1) : yh; // Compute y^2/(2y+1), ry in [0,2y] + q = qx+qy; r = rx+ry; // Combine partials, r in [0,4y] + if( r>=d ) q++, r-=d; // Handle carry (at most 1), r in [0,2y] + if( y==q ) break; // At convergence y = floor(sqrt(x)) + y = q; + } + + The better the initial guess, the faster this will converge. Since + convergence is still quadratic though, it will converge even given + very simple guesses. We use: + + y = sqrt(x) = sqrt( 2^n + d ) <~ 2^(n/2) + + where n is the index of the MSB and d is in [0,2^n) (i.e. is n bits + wide). Thus: + + y ~ 2^(n>>1) if n is even and 2^(n>>1) sqrt(2) if n is odd + + and we can do a simple fixed point calculation to compute this. + + For small values of x, we encode a 20 entry 3-bit wide lookup table + in a 64-bit constant and just do a quick lookup. */ + +static inline uint8_t +sqrt_uint8( uint8_t x ) { /* FIXME: CONSIDER 8-bit->4-bit LUT INSTEAD (128bytes)? */ + if( x> (3*(int)x)) & UINT64_C(7)); + int n = log2_uint8( x ); + uint8_t y = (uint8_t)(((n & 1) ? UINT8_C(0xb) /* floor( 2^3 sqrt(2) ) */ : UINT8_C(0x8) /* 2^3 */) >> (3-(n>>1))); + for(;;) { + uint8_t d = (uint8_t)(y<<1); d++; + uint8_t qx = x / d; uint8_t rx = (uint8_t)(x - qx*d); + uint8_t qy = y>>1; uint8_t ry = (y & UINT8_C(1)) ? 
(uint8_t)(y+qy+UINT8_C(1)) : qy; + uint8_t q = (uint8_t)(qx+qy); uint8_t r = (uint8_t)(rx+ry); + if( r>=d ) q++; + if( y==q ) break; + y = q; + } + return y; +} + +static inline uint16_t +sqrt_uint16( uint16_t x ) { + if( x> (3*(int)x)) & UINT64_C(7)); + int n = log2_uint16( x ); + uint16_t y = (uint16_t)(((n & 1) ? UINT16_C(0xb5) /* floor( 2^7 sqrt(2) ) */ : UINT16_C(0x80) /* 2^7 */) >> (7-(n>>1))); + for(;;) { + uint16_t d = (uint16_t)(y<<1); d++; + uint16_t qx = x / d; uint16_t rx = (uint16_t)(x - qx*d); + uint16_t qy = y>>1; uint16_t ry = (y & UINT16_C(1)) ? (uint16_t)(y+qy+UINT16_C(1)) : qy; + uint16_t q = (uint16_t)(qx+qy); uint16_t r = (uint16_t)(rx+ry); + if( r>=d ) q++; + if( y==q ) break; + y = q; + } + return y; +} + +static inline uint32_t +sqrt_uint32( uint32_t x ) { + if( x> (3*(int)x)) & UINT64_C(7)); + if( !x ) return UINT32_C(0); + int n = log2_uint32( x ); + uint32_t y = ((n & 1) ? UINT32_C(0xb504) /* floor( 2^15 sqrt(2) ) */ : UINT32_C(0x8000) /* 2^15 */) >> (15-(n>>1)); + for(;;) { + uint32_t d = (y<<1); d++; + uint32_t qx = x / d; uint32_t rx = x - qx*d; + uint32_t qy = y>>1; uint32_t ry = (y & UINT32_C(1)) ? (y+qy+UINT32_C(1)) : qy; + uint32_t q = qx+qy; uint32_t r = rx+ry; + if( r>=d ) q++; + if( y==q ) break; + y = q; + } + return y; +} + +static inline uint64_t +sqrt_uint64( uint64_t x ) { + if( x> (3*(int)x)) & UINT64_C(7); + int n = log2_uint64( x ); + uint64_t y = ((n & 1) ? UINT64_C(0xb504f333) /* floor( 2^31 sqrt(2) ) */ : UINT64_C(0x80000000) /* 2^31 */) >> (31-(n>>1)); + for(;;) { + uint64_t d = (y<<1); d++; + uint64_t qx = x / d; uint64_t rx = x - qx*d; + uint64_t qy = y>>1; uint64_t ry = (y & UINT64_C(1)) ? (y+qy+UINT64_C(1)) : qy; + uint64_t q = qx+qy; uint64_t r = rx+ry; + if( r>=d ) q++; + if( y==q ) break; + y = q; + } + return y; +} + +/* These return floor( sqrt( x ) ), undefined behavior for negative x. */ + +static inline int8_t sqrt_int8 ( int8_t x ) { return (int8_t )sqrt_uint8 ( (uint8_t )x ); } +static inline int16_t sqrt_int16( int16_t x ) { return (int16_t)sqrt_uint16( (uint16_t)x ); } +static inline int32_t sqrt_int32( int32_t x ) { return (int32_t)sqrt_uint32( (uint32_t)x ); } +static inline int64_t sqrt_int64( int64_t x ) { return (int64_t)sqrt_uint64( (uint64_t)x ); } + +/* These return the floor( re sqrt(x) ) */ + +static inline int8_t sqrt_re_int8 ( int8_t x ) { return x>INT8_C( 0) ? (int8_t )sqrt_uint8 ( (uint8_t )x ) : INT8_C( 0); } +static inline int16_t sqrt_re_int16( int16_t x ) { return x>INT16_C(0) ? (int16_t)sqrt_uint16( (uint16_t)x ) : INT16_C(0); } +static inline int32_t sqrt_re_int32( int32_t x ) { return x>INT32_C(0) ? (int32_t)sqrt_uint32( (uint32_t)x ) : INT32_C(0); } +static inline int64_t sqrt_re_int64( int64_t x ) { return x>INT64_C(0) ? 
(int64_t)sqrt_uint64( (uint64_t)x ) : INT64_C(0); } + +/* These return the floor( sqrt( |x| ) ) */ + +static inline int8_t sqrt_abs_int8 ( int8_t x ) { return (int8_t )sqrt_uint8 ( (uint8_t )(x +#include +#include "util.h" + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + uint32_t shift_mask = (uint32_t)(sizeof(uintptr_t)*(size_t)CHAR_BIT); + if( !align_ispow2( (uintptr_t)shift_mask ) ) { + printf( "FAIL (platform)\n" ); + return 1; + } + shift_mask--; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + int ctr = 0; + for( int i=0; i<1000000000; i++ ) { + if( !ctr ) { printf( "Completed %i iterations\n", i ); ctr = 10000000; } + ctr--; + + /* Test align_ispow2 */ + + do { + uint32_t r = prng_uint32( prng ); + uint32_t s = (r>>8) & shift_mask; /* s is uniform IID valid right shift amount for a uintptr_t */ + r &= (uint32_t)255; /* r is 8-bit uniform IID rand (power of 2 with prob ~8/256=1/32) */ + uintptr_t x = ((uintptr_t)r) << s; /* x is power of 2 with prob ~1/32 */ + + int c = align_ispow2( x ); + int d = (x>(uintptr_t)0) && !(x & (x-(uintptr_t)1)); + if( c!=d ) { + printf( "FAIL (x %lx c %i d %i)\n", (unsigned long)x, c, d ); + return 1; + } + } while(0); + + /* Test align_up/align_dn/align_isaligned */ + + do { + uintptr_t a = ((uintptr_t)1) << (prng_uint32( prng ) & shift_mask); + + uintptr_t x = (uintptr_t)prng_uint64( prng ); + void * u = (void *)x; + void * v = align_dn( u, a ); + void * w = align_up( u, a ); + uintptr_t y = (uintptr_t)v; + uintptr_t z = (uintptr_t)w; + + int already_aligned = (u==v); + int already_aligned_1 = (u==w); + + uintptr_t mask = a-(uintptr_t)1; + if( already_aligned != already_aligned_1 || + align_isaligned( u, a )!=already_aligned || + !align_isaligned( v, a ) || + !align_isaligned( w, a ) || + ( x & ~mask) != y || + ((x+mask) & ~mask) != z ) { + printf( "FAIL (a %lx aa %i aa1 %i x %lx y %lx z %lx)\n", + (unsigned long)a, already_aligned, already_aligned_1, (unsigned long)x, (unsigned long)y, (unsigned long)z ); + return 1; + } + + } while(0); + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass\n" ); + + return 0; +} + diff --git a/program/src/oracle/util/test_avg.c b/program/src/oracle/util/test_avg.c new file mode 100644 index 000000000..ee07de0a9 --- /dev/null +++ b/program/src/oracle/util/test_avg.c @@ -0,0 +1,141 @@ +#include +#include "util.h" + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + int ctr = 0; + for( int i=0; i<1000000000; i++ ) { + if( !ctr ) { printf( "reg: Completed %i iterations\n", i ); ctr = 10000000; } + ctr--; + +# define TEST(w) do { \ + uint##w##_t x = prng_uint##w( prng ); \ + uint##w##_t y = prng_uint##w( prng ); \ + uint##w##_t z = avg_2_uint##w( x, y ); \ + uint##w##_t r = (uint##w##_t)(((uint64_t)x+(uint64_t)y) >> 1); \ + if( z!=r ) { \ + printf( "FAIL (iter %i op avg_2_uint" #w "x %lu y %lu z %lu r %lu\n", \ + i, (long unsigned)x, (long unsigned)y, (long unsigned)z, (long unsigned)r ); \ + return 1; \ + } \ + } while(0) + TEST(8); + TEST(16); + TEST(32); +# undef TEST + + do { + uint64_t x = prng_uint64( prng ); + uint64_t y = prng_uint64( prng ); + uint64_t z = avg_2_uint64( x, y ); + uint64_t r = (x>>1) + (y>>1) + (x & y & (uint64_t)1); + if( z!=r ) { + printf( "FAIL (iter %i op avg_2_uint64 x %lu y %lu z %lu r %lu\n", + i, (long unsigned)x, (long unsigned)y, (long unsigned)z, (long unsigned)r ); + return 1; + } + } 
while(0); + \ +# define TEST(w) do { \ + int##w##_t x = (int##w##_t)prng_uint##w( prng ); \ + int##w##_t y = (int##w##_t)prng_uint##w( prng ); \ + int##w##_t z = avg_2_int##w( x, y ); \ + int##w##_t r = (int##w##_t)(((int64_t)x+(int64_t)y) >> 1); \ + if( z!=r ) { \ + printf( "FAIL (iter %i op avg_2_int" #w "x %li y %li z %li r %li\n", \ + i, (long)x, (long)y, (long)z, (long)r ); \ + return 1; \ + } \ + } while(0) + TEST(8); + TEST(16); + TEST(32); +# undef TEST + + do { + int64_t x = (int64_t)prng_uint64( prng ); + int64_t y = (int64_t)prng_uint64( prng ); + int64_t z = avg_2_int64( x, y ); + int64_t r = (x>>1) + (y>>1) + (x & y & (int64_t)1); + if( z!=r ) { + printf( "FAIL (iter %i op avg_2_int64 x %li y %li z %li r %li\n", + i, (long)x, (long)y, (long)z, (long)r ); + return 1; + } + } while(0); + } + +# define N 512 + + ctr = 0; + for( int i=0; i<10000000; i++ ) { + if( !ctr ) { printf( "mem: Completed %i iterations\n", i ); ctr = 100000; } + ctr--; + +# define TEST(w) do { \ + uint##w##_t x[N]; \ + uint32_t n = prng_uint32( prng ) & (uint32_t)(N-1); \ + uint64_t a = (uint64_t)0; \ + for( uint32_t i=(uint32_t)0; i +#include +#include "prng.h" + +#define EXP2M1_FXP_ORDER 7 +#include "exp2m1.h" + +static inline double exp2m1( double x ) { return expm1(0.69314718055994530943*x); } /* FIXME: USE PROPER METHOD */ + +static double const ulp_thresh[8] = { + 0.50000000000000000000, // For measuring limit + 92418389.07584851980209350586, + 4173744.68851661682128906250, + 183713.37186783552169799805, + 5998.71748709678649902344, + 165.76749026775360107422, + 4.96575236320495605469, + 0.63437640666961669922 +}; + +static double const rel_thresh[8] = { + 4.7e-10, // For measuring limit + 6.2e-02, + 3.3e-03, + 1.3e-04, + 5.1e-06, + 1.5e-07, + 4.4e-09, + 5.5e-10 +}; + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + double const one_ulp = (double)(1<<30); /* exact */ + double const ulp = (1./one_ulp); /* exact */ + + /* Exhaustively test x in [0,2^30) */ + + double max_aerr = 0.; + double max_rerr = 0.; + + double thresh = ulp_thresh[ EXP2M1_FXP_ORDER ]; + + int ctr = 0; + for( int i=0; i<(1<<30); i++ ) { + if( !ctr ) { printf( "Completed %i iterations\n", i ); ctr = 100000000; } + ctr--; + + double x = ulp*(double)i; /* exact */ + double y = ulp*(double)exp2m1_fxp( (uint64_t)i ); /* approx */ + //double y = ulp*round( one_ulp*exp2m1( x ) ); /* limit: bits 31.0 max_aerr 4.7e-10 max_rerr 4.7e-10 ulp 0.5 */ + double z = exp2m1( x ); /* goal */ + double aerr = fabs(y-z); + double rerr = aerr/(1.+z); + if( aerr*one_ulp>thresh ) { + printf( "FAIL (x %.20f y %.20f z %.20f aerr %.20f ulp)\n", x, y, z, aerr*one_ulp ); + return 1; + } + max_aerr = fmax( max_aerr, aerr ); + max_rerr = fmax( max_rerr, rerr ); + } + + /* Randomly test larger x (with coverage of each exponent range and + special values) */ + + thresh = rel_thresh[ EXP2M1_FXP_ORDER ]; + + for( int i=0; i<10000000; i++ ) { + uint64_t _x; + if ( i< 64 ) _x = UINT64_C(1)<> s; + } + uint64_t _y = exp2m1_fxp( _x ); + + if( _x>exp2m1_fxp_max() ) { + if( _y!=UINT64_MAX ) { + printf( "FAIL (iter %i sat _x %lx y %lx)\n", i, _x, _y ); + return 1; + } + continue; + } + + uint64_t xi = _x>>30; + uint64_t xf = (_x<<34)>>34; + if( !(xi & (xi-UINT64_C(1))) && !xf ) { + if( _y!=((UINT64_C(1)<<(xi+UINT64_C(30)))-(UINT64_C(1)<<30)) ) { + printf( "FAIL (iter %i exact_x %lx y %lx)\n", i, _x, _y ); + return 1; + } + } + + double x = ulp*(double)_x; /* 
exact */ + double y = ulp*(double)_y; /* exact */ + double z = exp2m1( x ); /* goal */ + double aerr = fabs(y-z); + double rerr = aerr/(1.+z); + if( rerr>thresh ) { + printf( "FAIL (x %.20f y %.20f z %.20f rerr %.20e)\n", x, y, z, rerr ); + return 1; + } + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass (bits %.1f max_aerr %.1e max_rerr %.1e ulp %.1f)\n", -log2( max_rerr ), max_aerr, max_rerr, max_aerr*one_ulp ); + + return 0; +} + diff --git a/program/src/oracle/util/test_fxp.c b/program/src/oracle/util/test_fxp.c new file mode 100644 index 000000000..ce4dfc40a --- /dev/null +++ b/program/src/oracle/util/test_fxp.c @@ -0,0 +1,376 @@ +#include +#include +#include "prng.h" + +#include "fxp.h" + +/* Create random bit patterns with lots of leading and/or trailing zeros + or ones to really stress limits of implementations. */ + +static inline uint64_t /* Random fxp */ +make_rand_fxp( uint64_t x, /* Random 64-bit */ + uint32_t *_ctl ) { /* Least significant 8 bits random, uses them up */ + uint32_t ctl = *_ctl; + int s = (int)(ctl & UINT32_C(63)); ctl >>= 6; /* Shift, in [0,63] */ + int d = (int)(ctl & UINT32_C( 1)); ctl >>= 1; /* Direction, in [0,1] */ + int i = (int)(ctl & UINT32_C( 1)); ctl >>= 1; /* Invert, in [0,1] */ + *_ctl = ctl; + x = d ? (x<>s); + return i ? (~x) : x; +} + +__extension__ typedef unsigned __int128 uint128_t; + +static inline uint64_t split_hi( uint128_t x ) { return (uint64_t)(x>>64); } +static inline uint64_t split_lo( uint128_t x ) { return (uint64_t) x; } + +static inline uint64_t +fxp_add_ref( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint128_t z = ((uint128_t)x) + ((uint128_t)y); + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_sub_ref( uint64_t x, + uint64_t y, + uint64_t * _b ) { + uint64_t b = (uint64_t)(x>= 30; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_mul_raz_ref( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint128_t z = (((uint128_t)x)*((uint128_t)y) + ((uint128_t)((UINT64_C(1)<<30)-UINT64_C(1)))) >> 30; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_mul_rnz_ref( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint128_t z = (((uint128_t)x)*((uint128_t)y) + ((uint128_t)((UINT64_C(1)<<29)-UINT64_C(1)))) >> 30; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_mul_rna_ref( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint128_t z = (((uint128_t)x)*((uint128_t)y) + ((uint128_t)(UINT64_C(1)<<29))) >> 30; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_mul_rne_ref( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint128_t z = ((uint128_t)x)*((uint128_t)y); + uint64_t f = split_lo( z ) & ((UINT64_C(1)<<30)-UINT64_C(1)); + z >>= 30; + if( (f>(UINT64_C(1)<<29)) || ((f==(UINT64_C(1)<<29)) && (z & UINT64_C(1))) ) z++; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_mul_rno_ref( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint128_t z = ((uint128_t)x)*((uint128_t)y); + uint64_t f = split_lo( z ) & ((UINT64_C(1)<<30)-UINT64_C(1)); + z >>= 30; + if( (f>(UINT64_C(1)<<29)) || ((f==(UINT64_C(1)<<29)) && !(z & UINT64_C(1))) ) z++; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_div_rtz_ref( uint64_t x, + uint64_t _y, + uint64_t * _c ) { + if( !_y ) { *_c = UINT64_MAX; return UINT64_C(0); } + uint128_t ex = ((uint128_t)x) << 30; + uint128_t y = (uint128_t)_y; + uint128_t z = ex / y; + *_c = split_hi( z ); + return 
split_lo( z ); +} + +static inline uint64_t +fxp_div_raz_ref( uint64_t x, + uint64_t _y, + uint64_t * _c ) { + if( !_y ) { *_c = UINT64_MAX; return UINT64_C(0); } + uint128_t ex = ((uint128_t)x) << 30; + uint128_t y = (uint128_t)_y; + uint128_t z = (ex + y - (uint128_t)1) / y; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_div_rnz_ref( uint64_t x, + uint64_t _y, + uint64_t * _c ) { + if( !_y ) { *_c = UINT64_MAX; return UINT64_C(0); } + uint128_t ex = ((uint128_t)x) << 30; + uint128_t y = (uint128_t)_y; + uint128_t z = (ex + ((y-(uint128_t)1)>>1)) / y; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_div_rna_ref( uint64_t x, + uint64_t _y, + uint64_t * _c ) { + if( !_y ) { *_c = UINT64_MAX; return UINT64_C(0); } + uint128_t ex = ((uint128_t)x) << 30; + uint128_t y = (uint128_t)_y; + uint128_t z = (ex + (y>>1)) / y; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_div_rne_ref( uint64_t x, + uint64_t _y, + uint64_t * _c ) { + if( !_y ) { *_c = UINT64_MAX; return UINT64_C(0); } + uint128_t ex = ((uint128_t)x) << 30; + uint128_t y = (uint128_t)_y; + uint128_t z = ex / y; + uint128_t r2 = (ex - z*y) << 1; + if( r2>y || (r2==y && (z & (uint128_t)1)) ) z++; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_div_rno_ref( uint64_t x, + uint64_t _y, + uint64_t * _c ) { + if( !_y ) { *_c = UINT64_MAX; return UINT64_C(0); } + uint128_t ex = ((uint128_t)x) << 30; + uint128_t y = (uint128_t)_y; + uint128_t z = ex / y; + uint128_t r2 = (ex - z*y) << 1; + if( r2>y || (r2==y && !(z & (uint128_t)1)) ) z++; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline int +test_fxp_sqrt_rtz( uint64_t x, + uint64_t y ) { + if( !x ) return !!y; + if( !(((UINT64_C(1)<<15)<=y) && (y<=(UINT64_C(1)<<(32+15)))) ) return 1; + uint64_t xh,xl; uwide_sl ( &xh,&xl, UINT64_C(0),x, 30 ); /* 2^30 x */ + uint64_t rh,rl; uwide_mul( &rh,&rl, y,y ); /* y^2 */ + uint64_t b = uwide_sub( &rh,&rl, xh,xl, rh,rl, UINT64_C(0) ); /* r = 2^30 x - y^2, in [0,2y] if rounded correctly */ + return b || rh || rl>(y<<1); +} + +static inline int +test_fxp_sqrt_raz( uint64_t x, + uint64_t y ) { + if( !x ) return !!y; + if( !(((UINT64_C(1)<<15)<=y) && (y<=(UINT64_C(1)<<(32+15)))) ) return 1; + uint64_t xh,xl; uwide_sl ( &xh,&xl, UINT64_C(0),x, 30 ); /* 2^30 x */ + uint64_t rh,rl; uwide_mul( &rh,&rl, y,y ); /* y^2 */ + uint64_t b = uwide_sub( &rh,&rl, rh,rl, xh,xl, UINT64_C(0) ); /* y^2 - 2^30 x, in [0,2y-2] if rounded correctly */ + return b || rh || rl>((y<<1)-UINT64_C(2)); +} + +static inline int +test_fxp_sqrt_rnz( uint64_t x, + uint64_t y ) { + if( !x ) return !!y; + if( !(((UINT64_C(1)<<15)<=y) && (y<=(UINT64_C(1)<<(32+15)))) ) return 1; + uint64_t xh,xl; uwide_sl ( &xh,&xl, UINT64_C(0),x, 30 ); /* 2^30 x */ + uint64_t rh,rl; uwide_mul( &rh,&rl, y,y-UINT64_C(1) ); /* y^2 - y */ + uwide_inc( &rh,&rl, rh,rl, UINT64_C(1) ); /* y^2 - y + 1 */ + uint64_t b = uwide_sub( &rh,&rl, xh,xl, rh,rl, UINT64_C(0) ); /* r = 2^30 x - (y^2 - y + 1), in [0,2y-1] if rounded correctly */ + return b || rh || rl>=(y<<1); +} + +static inline int +fxp_log2_ref( uint64_t * _f, + uint64_t x ) { + if( !x ) { *_f = UINT64_C(0); return INT_MIN; } + long double ef = log2l( (long double)x ); + int e = log2_uint64( x ); + *_f = (uint64_t)roundl( (ef - (long double)e)*(long double)(UINT64_C(1)<<30) ); + return e-30; +} + +static inline uint64_t +fxp_exp2_ref( uint64_t x ) { + if( x>=UINT64_C(0x880000000) ) return UINT64_MAX; + return 
(uint64_t)roundl( ((long double)(UINT64_C(1)<<30))*exp2l( ((long double)x)*(1.L/(long double)(UINT64_C(1)<<30)) ) ); +} + +static inline uint64_t +fxp_rexp2_ref( uint64_t x ) { + return (uint64_t)roundl( ((long double)(UINT64_C(1)<<30))*exp2l( ((long double)x)*(-1.L/(long double)(UINT64_C(1)<<30)) ) ); +} + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + uint64_t fxp_log2_approx_ulp = UINT64_C(0); + uint64_t fxp_exp2_approx_ulp = UINT64_C(0); + uint64_t fxp_rexp2_approx_ulp = UINT64_C(0); + + int ctr = 0; + for( int i=0; i<100000000; i++ ) { + if( !ctr ) { printf( "Completed %i iterations\n", i ); ctr = 10000000; } + ctr--; + + uint32_t t = prng_uint32( prng ); + uint64_t x = make_rand_fxp( prng_uint64( prng ), &t ); + uint64_t y = make_rand_fxp( prng_uint64( prng ), &t ); + +# define TEST(op) \ + do { \ + uint64_t c0,z0 = fxp_##op##_ref ( x, y, &c0 ); \ + uint64_t c1,z1 = fxp_##op ( x, y, &c1 ); \ + uint64_t z2 = fxp_##op##_fast( x, y ); \ + if( c0!=c1 || z0!=z1 || (!c0 && z0!=z2) ) { \ + printf( "%i: FAIL (fxp_" #op " x %016lx y %016lx cz0 %016lx %016lx cz1 %016lx %016lx z2 %016lx\n", \ + i, x, y, c0,z0, c1,z1, z2 ); \ + return 1; \ + } \ + } while(0) + + TEST(add); + TEST(sub); + +# undef TEST +# define TEST(op) \ + do { \ + uint64_t c0,z0 = fxp_##op##_ref ( x, y, &c0 ); \ + uint64_t c1,z1 = fxp_##op ( x, y, &c1 ); \ + uint64_t z2 = fxp_##op##_fast( x, y ); \ + if( c0!=c1 || z0!=z1 || (!c0 && z0f1 ? f0-f1 : f1-f0; + if( ulp > fxp_log2_approx_ulp ) fxp_log2_approx_ulp = ulp; + if( e0!=e1 || ulp>UINT64_C(2) ) { + printf( "%i: FAIL (fxp_log2_approx x %016lx z0 %3i %016lx z1 %3i %016lx ulp %016lx\n", i, x, e0,f0, e1,f1, ulp ); + return 1; + } + } while(0); + + do { + uint64_t z0 = fxp_exp2_ref ( x ); + uint64_t z1 = fxp_exp2_approx( x ); + uint64_t ulp = z0>z1 ? z0-z1 : z1-z0; + uint64_t ix = x>>30; /* Larger x are only +/-1 ulp accurate in the first 30 bits */ + if( UINT64_C(0)>= ix; + if( ulp > fxp_exp2_approx_ulp ) fxp_exp2_approx_ulp = ulp; + if( ulp>UINT64_C(1) ) { + printf( "%i: FAIL (fxp_exp2_approx x %016lx z0 %016lx z1 %016lx ulp %016lx\n", i, x, z0, z1, ulp ); + return 1; + } + } while(0); + + do { + uint64_t z0 = fxp_rexp2_ref ( x ); + uint64_t z1 = fxp_rexp2_approx( x ); + uint64_t ulp = z0>z1 ? 
z0-z1 : z1-z0; + if( ulp > fxp_rexp2_approx_ulp ) fxp_rexp2_approx_ulp = ulp; + if( ulp>UINT64_C(1) ) { + printf( "%i: FAIL (fxp_rexp2_approx x %016lx z0 %016lx z1 %016lx ulp %016lx\n", i, x, z0, z1, ulp ); + return 1; + } + } while(0); + + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass (ulp fxp_log2_approx ~ %lu fxp_exp2_approx ~ %lu fxp_rexp2_approx ~ %lu)\n", + fxp_log2_approx_ulp, fxp_exp2_approx_ulp, fxp_rexp2_approx_ulp ); + + return 0; +} + diff --git a/program/src/oracle/util/test_hash.c b/program/src/oracle/util/test_hash.c new file mode 100644 index 000000000..e4dcc6cf0 --- /dev/null +++ b/program/src/oracle/util/test_hash.c @@ -0,0 +1,59 @@ +#include +#include "util.h" + +uint32_t ref32[10] = { + UINT32_C( 0x00000000 ), + UINT32_C( 0x514e28b7 ), + UINT32_C( 0x30f4c306 ), + UINT32_C( 0x85f0b427 ), + UINT32_C( 0x249cb285 ), + UINT32_C( 0xcc0d53cd ), + UINT32_C( 0x5ceb4d08 ), + UINT32_C( 0x18c9aec4 ), + UINT32_C( 0x4939650b ), + UINT32_C( 0xc27c2913 ) +}; + + +uint64_t ref64[10] = { + UINT64_C( 0x0000000000000000 ), + UINT64_C( 0xb456bcfc34c2cb2c ), + UINT64_C( 0x3abf2a20650683e7 ), + UINT64_C( 0x0b5181c509f8d8ce ), + UINT64_C( 0x47900468a8f01875 ), + UINT64_C( 0xd66ad737d54c5575 ), + UINT64_C( 0xe8b4b3b1c77c4573 ), + UINT64_C( 0x740729cbe468d1dd ), + UINT64_C( 0x46abcca593a3c687 ), + UINT64_C( 0x91209a1ff7f4f1d5 ) +}; + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + for( int i=0; i<10; i++ ) { + uint32_t x = (uint32_t)i; + uint32_t y = hash_uint32( x ); + uint32_t z = hash_inverse_uint32( y ); + if( y!=ref32[i] ) { printf( "ref32: FAIL\n" ); return 1; } + if( x!=z ) { printf( "inv32: FAIL\n" ); return 1; } + if( hash_uint32( hash_inverse_uint32( x ) )!=x ) { printf( "INV32: FAIL\n" ); return 1; } + } + + for( int i=0; i<10; i++ ) { + uint64_t x = (uint64_t)i; + uint64_t y = hash_uint64( x ); + uint64_t z = hash_inverse_uint64( y ); + if( y!=ref64[i] ) { printf( "ref64: FAIL\n" ); return 1; } + if( x!=z ) { printf( "inv64: FAIL\n" ); return 1; } + if( hash_uint64( hash_inverse_uint64( x ) )!=x ) { printf( "INV64: FAIL\n" ); return 1; } + } + + /* FIXME: MEASURE AVALANCHE PROPERTIES? */ + + printf( "pass\n" ); + return 0; +} + diff --git a/program/src/oracle/util/test_log2.c b/program/src/oracle/util/test_log2.c new file mode 100644 index 000000000..f54ee7b43 --- /dev/null +++ b/program/src/oracle/util/test_log2.c @@ -0,0 +1,61 @@ +#include +#include "util.h" + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + int ctr = 0; + for( int i=0; i<100000000; i++ ) { + if( !ctr ) { printf( "Completed %i iterations\n", i ); ctr = 10000000; } + ctr--; +# define TEST(w) do { \ + int n = ((int)prng_uint32( prng )) & (w-1); \ + uint##w##_t mask = (uint##w##_t)(1ULL << n); \ + uint##w##_t bit = mask--; \ + uint##w##_t x = bit | (prng_uint##w( prng ) & mask); \ + int##w##_t y = (int##w##_t)x; \ + if( log2_uint##w( x )!=n ) { \ + printf( "FAIL (iter %i op log2_uint" #w " x %lx n %i)\n", i, (long unsigned)x, n ); \ + return 1; \ + } \ + if( y>0 && log2_int##w( y )!=n ) { \ + printf( "FAIL (iter %i op log2_int" #w " x %lx n %i)\n", i, (long unsigned)y, n ); \ + return 1; \ + } \ + if( log2_uint##w##_def( x, 1234 )!=n ) { \ + printf( "FAIL (iter %i op log2_uint" #w "_def x %lx n %i)\n", i, (long unsigned)x, n ); \ + return 1; \ + } \ + if( log2_int##w##_def( y, 1234 )!=((y>(int##w##_t)0) ? 
n : 1234) ) { \ + printf( "FAIL (iter %i op log2_int" #w "_def y %lx n %i)\n", i, (long unsigned)x, n ); \ + return 1; \ + } \ + n--; bit>>=1; mask>>=1; x>>=1; y=(int##w##_t)x; \ + if( log2_uint##w##_def( x, 1234 )!=(x ? n : 1234) ) { \ + printf( "FAIL (iter %i op log2_uint" #w "_def x %lx n %i)\n", i, (long unsigned)x, n ); \ + return 1; \ + } \ + if( log2_int##w##_def( y, 1234 )!=(y ? n : 1234) ) { \ + printf( "FAIL (iter %i op log2_int" #w "_def x %lx n %i)\n", i, (long unsigned)x, n ); \ + return 1; \ + } \ + } while(0) + TEST(8); + TEST(16); + TEST(32); + TEST(64); +# undef TEST + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass\n" ); + + return 0; +} + diff --git a/program/src/oracle/util/test_prng.c b/program/src/oracle/util/test_prng.c new file mode 100644 index 000000000..84a428346 --- /dev/null +++ b/program/src/oracle/util/test_prng.c @@ -0,0 +1,180 @@ +#include +#include +#include +#include "util.h" + +/* First 10 uint64 for seq 0 starting from idx 0 */ + +uint64_t x0_0[10] = { + UINT64_C( 0xa036f9b67313c1aa ), + UINT64_C( 0x110a4ea5e65927d2 ), + UINT64_C( 0x9ddf34cf83d17c94 ), + UINT64_C( 0xeb33d1b534e210ec ), + UINT64_C( 0x9b15b94d3e81b76a ), + UINT64_C( 0xee9544dab2bf64bf ), + UINT64_C( 0x5c4b0ccf7c94d274 ), + UINT64_C( 0xf0f83ab44262ad1f ), + UINT64_C( 0xf11b1aa14c7dabd6 ), + UINT64_C( 0x3800dde6d02d6ed7 ) +}; + +/* First 10 uint64 for seq 0 starting from idx 20 */ + +uint64_t x1_20[10] = { + UINT64_C( 0x55369a6a0817cbce ), + UINT64_C( 0xce1baeb695229132 ), + UINT64_C( 0x0e443e81e9e722d2 ), + UINT64_C( 0xc6c065484f76e825 ), + UINT64_C( 0x37dc474806fabc8a ), + UINT64_C( 0x9fa3305df1b56824 ), + UINT64_C( 0xf3b3961a17ed881c ), + UINT64_C( 0x646f40006cef8d6f ), + UINT64_C( 0x4d6955f607a153b2 ), + UINT64_C( 0xf806bccb58d0e60b ) +}; + +void +printf_ref( uint64_t * ref, + uint32_t seq, + uint64_t idx ) { + printf( "uint64_t x%u_%lu[10] = {\n", seq, idx ); + for( int i=0; i<10; i++ ) printf( " UINT64_C( 0x%016lx )%s\n", ref[i], i<9 ? 
"," : "" ); + printf( "};\n" ); +} + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + uint64_t x[10]; + + if( prng_footprint()!=(uint64_t)sizeof (prng_t) ) { printf( "FAIL (footprint)\n" ); return 1; } + if( prng_align ()!=(uint64_t)alignof(prng_t) ) { printf( "FAIL (align)\n" ); return 1; } + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + if( prng_seq( prng )!=(uint32_t)0 ) { printf( "FAIL (seq)\n" ); return 1; } + if( prng_idx( prng )!=(uint64_t)0 ) { printf( "FAIL (idx)\n" ); return 1; } + + for( int i=0; i<10; i++ ) x[i] = prng_uint64( prng ); + for( int i=0; i<10; i++ ) + if( x[i]!=x0_0[i] ) { + printf( "FAIL(0,0)\n" ); + printf_ref( x0_0, (uint32_t)0, (uint64_t)0 ); + printf_ref( x, (uint32_t)0, (uint64_t)0 ); + return 1; + } + + if( prng_seq( prng )!=(uint32_t) 0 ) { printf( "FAIL (seq-1)\n" ); return 1; } + if( prng_idx( prng )!=(uint64_t)20 ) { printf( "FAIL (idx-1)\n" ); return 1; } + + if( prng_seq_set( prng, (uint32_t)1 )!=(uint32_t)0 ) { printf( "FAIL (seq_set)\n" ); return 1; } + + for( int i=0; i<10; i++ ) x[i] = prng_uint64( prng ); + for( int i=0; i<10; i++ ) + if( x[i]!=x1_20[i] ) { + printf( "FAIL(1,20)\n" ); + printf_ref( x1_20, (uint32_t)1, (uint64_t)20 ); + printf_ref( x, (uint32_t)1, (uint64_t)20 ); + return 1; + } + + if( prng_seq( prng )!=(uint32_t) 1 ) { printf( "FAIL (seq-2)\n" ); return 1; } + if( prng_idx( prng )!=(uint64_t)40 ) { printf( "FAIL (idx-2)\n" ); return 1; } + + if( prng_seq_set( prng, (uint32_t)0 )!=(uint32_t) 1 ) { printf( "FAIL (seq_set-1)\n" ); return 1; } + if( prng_idx_set( prng, (uint64_t)0 )!=(uint64_t)40 ) { printf( "FAIL (idx_set-1)\n" ); return 1; } + + for( int i=0; i<10; i++ ) x[i] = prng_uint64( prng ); + for( int i=0; i<10; i++ ) if( x[i]!=x0_0[i] ) { printf( "FAIL(0,0)-1\n" ); return 1; } + + uint64_t domain = (uint64_t)0; + for( int i=0; i<1000; i++ ) { + uint8_t u8 = prng_uint8 ( prng ); + uint16_t u16 = prng_uint16 ( prng ); + uint32_t u32 = prng_uint32 ( prng ); + uint64_t u64 = prng_uint64 ( prng ); + + int8_t i8 = prng_int8 ( prng ); if( i8 <(int8_t )0 ) { printf( "FAIL (int8 )\n" ); return 1; } + int16_t i16 = prng_int16 ( prng ); if( i16<(int16_t)0 ) { printf( "FAIL (int16)\n" ); return 1; } + int32_t i32 = prng_int32 ( prng ); if( i32<(int32_t)0 ) { printf( "FAIL (int32)\n" ); return 1; } + int64_t i64 = prng_int64 ( prng ); if( i64<(int64_t)0 ) { printf( "FAIL (int64)\n" ); return 1; } + + float fc0 = prng_float_c0 ( prng ); if( !( 0.f<=fc0 && fc0< 1.f) ) { printf( "FAIL (float_c0)\n" ); return 1; } + float fc1 = prng_float_c1 ( prng ); if( !( 0.f< fc1 && fc1<=1.f) ) { printf( "FAIL (float_c1)\n" ); return 1; } + float fc = prng_float_c ( prng ); if( !( 0.f<=fc && fc <=1.f) ) { printf( "FAIL (float_c)\n" ); return 1; } + float fnc = prng_float_o ( prng ); if( !( 0.f< fnc && fnc< 1.f) ) { printf( "FAIL (float_o)\n" ); return 1; } + + double dc0 = prng_double_c0( prng ); if( !( 0.f<=dc0 && dc0< 1.f) ) { printf( "FAIL (double_c0)\n" ); return 1; } + double dc1 = prng_double_c1( prng ); if( !( 0.f< dc1 && dc1<=1.f) ) { printf( "FAIL (double_c1)\n" ); return 1; } + double dc = prng_double_c ( prng ); if( !( 0.f<=dc && dc <=1.f) ) { printf( "FAIL (double_c)\n" ); return 1; } + double dnc = prng_double_o ( prng ); if( !( 0.f< dnc && dnc< 1.f) ) { printf( "FAIL (double_o)\n" ); return 1; } + +# define LO(i,w) ((uint64_t)((uint64_t)(i >> (w-4))==(uint64_t)0x0)) +# define HI(i,w) ((uint64_t)((uint64_t)(i >> (w-4))==(uint64_t)0xf)) + + domain |= LO(u8, 8 ) << 0; 
domain |= HI(u8, 8 ) << 4; + domain |= LO(u16,16) << 1; domain |= HI(u16,16) << 5; + domain |= LO(u32,32) << 2; domain |= HI(u32,32) << 6; + domain |= LO(u64,64) << 3; domain |= HI(u64,64) << 7; + + domain |= LO(i8, 7 ) << 8; domain |= HI(i8, 7 ) << 12; + domain |= LO(i16,15) << 9; domain |= HI(i16,15) << 13; + domain |= LO(i32,31) << 10; domain |= HI(i32,31) << 14; + domain |= LO(i64,63) << 11; domain |= HI(i64,63) << 15; + +# undef HI +# undef LO +# define LO(f) ((uint64_t)(f<0.0625f)) +# define HI(f) ((uint64_t)(f>0.9375f)) + + domain |= LO(fc0) << 16; domain |= HI(fc0) << 20; + domain |= LO(fc1) << 17; domain |= HI(fc1) << 21; + domain |= LO(fc ) << 18; domain |= HI(fc ) << 22; + domain |= LO(fnc) << 19; domain |= HI(fnc) << 23; + +# undef HI +# undef LO +# define LO(d) ((uint64_t)(d<0.0625)) +# define HI(d) ((uint64_t)(d>0.9375)) + + domain |= LO(dc0) << 24; domain |= HI(dc0) << 28; + domain |= LO(dc1) << 25; domain |= HI(dc1) << 29; + domain |= LO(dc ) << 26; domain |= HI(dc ) << 30; + domain |= LO(dnc) << 27; domain |= HI(dnc) << 31; + +# undef HI +# undef LO + } + if( domain!=((UINT64_C(1)<<32)-UINT64_C(1)) ) { printf( "FAIL (domain)\n" ); return 1; } + + long sum_pop = 0L; + long sum_pop2 = 0L; + int min_pop = INT_MAX; + int max_pop = INT_MIN; + + int ctr = 0; + for( long i=0; i<(1L<<32); i++ ) { + if( !ctr ) { printf( "Completed %li iterations\n", i ); ctr = 100000000; } + ctr--; + + uint64_t seq = prng_private_expand( (uint32_t)i ); + if( prng_private_contract( seq )!=(uint32_t)i ) { printf( "FAIL (contract)\n" ); return 1; } + int pop = __builtin_popcountl( seq ); + sum_pop += (long) pop; + sum_pop2 += (long)(pop*pop); + min_pop = popmax_pop ? pop : max_pop; + } + double avg_pop = ((double)sum_pop ) / ((double)(1L<<32)); + double rms_pop = sqrt( (((double)sum_pop2) / ((double)(1L<<32))) - avg_pop*avg_pop ); + + prng_delete( prng_leave( prng ) ); + + printf( "pass (expand popcount stats: %.3f +/- %.3f [%i,%i])\n", avg_pop, rms_pop, min_pop, max_pop ); + + return 0; +} + diff --git a/program/src/oracle/util/test_prng_battery.c b/program/src/oracle/util/test_prng_battery.c new file mode 100644 index 000000000..41ffb9ee2 --- /dev/null +++ b/program/src/oracle/util/test_prng_battery.c @@ -0,0 +1,92 @@ +#include +#include +#include /* Assumes TestU01 install include directory is in the include search path */ +#include "util.h" + +static double +test_GetU01( void * param, + void * state ) { + (void)param; + return 2.3283064365386962890625e-10 /* 2^-32, exact */ * (double)prng_uint32( (prng_t *)state ); +} + +static unsigned long +test_GetBits( void * param, + void * state ) { + (void)param; + return (unsigned long)prng_uint32( (prng_t *)state ); +} + +static void +test_Write( void * state ) { + prng_t * prng = (prng_t *)state; + printf( "prng(0x%08lxU,0x%016lxUL)\n", (unsigned long)prng_seq( prng ), (unsigned long)prng_idx( prng ) ); +} + +static void +usage( char const * cmd ) { + fprintf( stderr, + "Usage: %s [bat] [seq] [idx]\n" + "\tbat 0 - FIPS-140.2 (fast)\n" + "\tbat 1 - Pseudo-DIEHARD (fast)\n" + "\tbat 2 - TestU01 SmallCrush (fast)\n" + "\tbat 3 - TestU01 Crush (~20 minutes)\n" + "\tbat 4 - TestU01 BigCrush (several hours)\n" + "Note that when running a test in a battery, the probability of it failing even\n" + "though the generator is fine is roughly 0.2%%. Thus, when repeatedly running\n" + "batteries that themselves contain a large number of tests, some test failures\n" + "are expected. 
So long as the test failures are are sporadic, don't occur in the\n" + "same place when running multiple times with random seq and/or idx, and don't\n" + "have p-values improbably close to 0 (p <<< 1/overall_num_tests_run) or 1\n" + "(1-p <<< 1/overall_num_tests_run), it is expected and normal\n", + cmd ); +} + +int +main( int argc, + char ** argv ) { + + /* Get command line arguments */ + + if( argc!=4 ) { usage( argv[0] ); return 1; } + int bat = atoi( argv[1] ); + uint32_t seq = (uint32_t)strtoul( argv[2], NULL, 0 ); + uint64_t idx = (uint64_t)strtoul( argv[3], NULL, 0 ); + + /* Create the test generator */ + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, seq, idx ) ); + + /* Run the requested test battery */ + + char name[128]; + sprintf( name, "prng(0x%08lxU,0x%016lxUL)", (unsigned long)prng_seq( prng ), (unsigned long)prng_idx( prng ) ); + + unif01_Gen gen[1]; + gen->state = prng; + gen->param = NULL; + gen->name = name; + gen->GetU01 = test_GetU01; + gen->GetBits = test_GetBits; + gen->Write = test_Write; + + switch( bat ) { + case 0: bbattery_FIPS_140_2( gen ); break; + case 1: bbattery_pseudoDIEHARD( gen ); break; + case 2: bbattery_SmallCrush( gen ); break; + case 3: bbattery_Crush( gen ); break; + case 4: bbattery_BigCrush( gen ); break; +//case 5: bbattery_Rabbit( gen ); break; /* FIXME: NB */ +//case 6: bbattery_Alphabit( gen ); break; /* FIXME: NB/R/S */ +//case 7: bbattery_BlockAlphabit( gen ); break; /* FIXME: NB/R/S */ + default: usage( argv[0] ); return 1; + } + + /* Destroy the test generator */ + + prng_delete( prng_leave( prng ) ); + + return 0; +} + diff --git a/program/src/oracle/util/test_rexp2.c b/program/src/oracle/util/test_rexp2.c new file mode 100644 index 000000000..a612b2064 --- /dev/null +++ b/program/src/oracle/util/test_rexp2.c @@ -0,0 +1,78 @@ +#include +#include +#include "prng.h" + +#define REXP2_FXP_ORDER 7 +#include "rexp2.h" + +static double const ulp_thresh[8] = { + 0.50000000000000000000, // For measuring limit + 46209195.03792428970336914062, + 2086872.61177277565002441406, + 91856.93437397480010986328, + 2999.59625971317291259766, + 83.14213037490844726562, + 2.73321378231048583984, + 0.60266053676605224609 +}; + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + double const one_ulp = (double)(1<<30); /* exact */ + double const ulp = (1./one_ulp); /* exact */ + double const thresh = ulp_thresh[ REXP2_FXP_ORDER ]; + + double max_aerr = 0.; + double max_rerr = 0.; + + /* Exhaustively test x in [0,2^30) */ + + int ctr = 0; + for( int i=0; i<(1<<30); i++ ) { + if( !ctr ) { printf( "Completed %i iterations\n", i ); ctr = 100000000; } + ctr--; + + double x = ulp*(double)i; /* exact */ + double y = ulp*(double)rexp2_fxp( (uint64_t)i ); /* approx */ + //double y = ulp*round( one_ulp*exp2( -x ) ); /* limit: bits 30.0 max_aerr 4.7e-10 max_rerr 9.3e-10 ulp 0.5 */ + double z = exp2( -x ); /* goal */ + double aerr = fabs(y-z); + double rerr = aerr/z; + if( aerr*one_ulp>thresh ) { + printf( "FAIL (x %.20f y %.20f z %.20f aerr %.20f ulp)\n", x, y, z, aerr*one_ulp ); + return 1; + } + max_aerr = fmax( max_aerr, aerr ); + max_rerr = fmax( max_rerr, rerr ); + } + + /* Randomly test larger x (with good coverage of each exponent range) */ + /* FIXME: EXPLICITLY TEST EXACT HANDLING AT POWERS OF 2 */ + + for( int i=0; i<10000000; i++ ) { + int s = ((int)prng_uint32( prng )) & 63; + uint64_t _x = prng_uint64( prng ) >> s; + + double x 
= ulp*(double)_x; /* exact when _x is small enough to give a non-zero result */ + double y = ulp*(double)rexp2_fxp( _x ); /* approx */ + double z = exp2( -x ); /* goal */ + double aerr = fabs(y-z); + if( aerr*one_ulp>thresh ) { + printf( "FAIL (x %.20f y %.20f z %.20f aerr %.20f ulp)\n", x, y, z, aerr*one_ulp ); + return 1; + } + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass (bits %.1f max_aerr %.1e max_rerr %.1e ulp %.1f)\n", -log2( max_rerr ), max_aerr, max_rerr, max_aerr*one_ulp ); + + return 0; +} + diff --git a/program/src/oracle/util/test_round.c b/program/src/oracle/util/test_round.c new file mode 100644 index 000000000..aff69822f --- /dev/null +++ b/program/src/oracle/util/test_round.c @@ -0,0 +1,31 @@ +#include + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + unsigned i = (unsigned)0; + + int ctr = 0; + for( int x=-32767; x<=32767; x++ ) { + for( int y=-32767; y<=32767; y++ ) { + if( !ctr ) { printf( "Completed %u iterations\n", i ); ctr = 10000000; } + ctr--; + + int u = (x+y)>>1; + int v = (x>>1)+(y>>1); + int w = v + (x&y&1); + if( u!=w ) { + printf( "FAIL (x %3i y %3i u %3i v %3i w %3i)\n", x, y, u, v, w ); + return 1; + } + + i++; + } + } + printf( "pass\n" ); + + return 0; +} + diff --git a/program/src/oracle/util/test_sar.c b/program/src/oracle/util/test_sar.c new file mode 100644 index 000000000..f0b1ac5ef --- /dev/null +++ b/program/src/oracle/util/test_sar.c @@ -0,0 +1,49 @@ +#include +#include "util.h" + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + /* FIXME: EXPLICT COVERAGE OF EDGE CASES (PROBABLY STATICALLY FULLY + SAMPLED ALREADY THOUGH FOR 8 AND 16 BIT TYPES) */ + + int ctr = 0; + for( int i=0; i<1000000000; i++ ) { + if( !ctr ) { printf( "Completed %i iterations\n", i ); ctr = 10000000; } + ctr--; + + /* These tests assume the unit test platform has arithmetic right + shift for signed integers (and the diagnostic printfs assume that + long is at least a w-bit wide type) */ + +# define TEST_SAR(w) do { \ + int##w##_t x = (int##w##_t)prng_uint##w( prng ); \ + int n = (int)( prng_uint32( prng ) & (uint32_t)(w-1) ); \ + int##w##_t val = sar_int##w( x, n ); \ + int##w##_t ref = (int##w##_t)(x >> n); \ + if( val!=ref ) { \ + printf( "FAIL (iter %i op %i x %li n %i val %li ref %li)", i, w, (long)x, n, (long)val, (long)ref ); \ + return 1; \ + } \ + } while(0) + + TEST_SAR(8); + TEST_SAR(16); + TEST_SAR(32); + TEST_SAR(64); +# undef TEST_SAR + + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass\n" ); + + return 0; +} + diff --git a/program/src/oracle/util/test_smag.c b/program/src/oracle/util/test_smag.c new file mode 100644 index 000000000..c006ea9f7 --- /dev/null +++ b/program/src/oracle/util/test_smag.c @@ -0,0 +1,55 @@ +#include +#include "util.h" + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + int ctr = 0; + for( int i=0; i<100000000; i++ ) { + if( !ctr ) { printf( "Completed %i iterations\n", i ); ctr = 10000000; } + ctr--; +# define TEST(w) do { \ + int##w##_t x = (int##w##_t)prng_uint##w( prng ); \ + int s; uint##w##_t m = smag_unpack_int##w( x, &s ); \ + int##w##_t y = smag_pack_int##w( s, m ); \ + uint##w##_t max = (uint##w##_t)(1ULL<<(w-1)); \ + if( !(s==(x<(int##w##_t)0) && (m +#include "util.h" + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t 
_prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + int ctr = 0; + for( int i=0; i<50000000; i++ ) { + if( !ctr ) { printf( "Completed %i iterations\n", i ); ctr = 1000000; } + ctr--; + +# define TEST(w) do { \ + int n = ((int)prng_uint32( prng )) & (w-1); \ + uint##w##_t x = (uint##w##_t)(prng_uint##w( prng ) >> n); \ + uint##w##_t y = sqrt_uint##w( x ); \ + uint##w##_t r = (uint##w##_t)(x - y*y); \ + if( y>=(UINT##w##_C(1)<<(w/2)) || r>(y<<1) ) { \ + printf( "FAIL (iter %i op sqrt_uint" #w " x %lx y %lx r %lx)\n", \ + i, (long unsigned)x, (long unsigned)y, (long unsigned)r ); \ + return 1; \ + } \ + int##w##_t u = (int##w##_t)x; \ + if( u>=INT##w##_C(0) ) { \ + int##w##_t v = sqrt_int##w( u ); \ + int##w##_t s = (int##w##_t)(u - v*v); \ + if( !(INT##w##_C(0)<=s && s<=(v<<1)) ) { \ + printf( "FAIL (iter %i op sqrt_int" #w " u %li v %li s %li)\n", \ + i, (long)u, (long)v, (long)s ); \ + return 1; \ + } \ + } \ + int##w##_t v = sqrt_re_int##w( u ); \ + int##w##_t s = (int##w##_t)(u - v*v); \ + if( !( (u>=INT##w##_C(0) && INT##w##_C(0)<=s && s<=(v<<1)) || \ + (u< INT##w##_C(0) && v==INT##w##_C(0) ) ) ) { \ + printf( "FAIL (iter %i op sqrt_re_int" #w " u %li v %li s %li)\n", \ + i, (long)u, (long)v, (long)s ); \ + return 1; \ + } \ + v = sqrt_abs_int##w( u ); \ + s = (int##w##_t)sqrt_uint##w( abs_int##w( u ) ); \ + if( v!=s ) { \ + printf( "FAIL (iter %i op sqrt_abs_int" #w " u %li v %li s %li)\n", \ + i, (long)u, (long)v, (long)s ); \ + return 1; \ + } \ + } while(0) + TEST(8); + TEST(16); + TEST(32); + TEST(64); +# undef TEST + + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass\n" ); + + return 0; +} + diff --git a/program/src/oracle/util/test_uwide.c b/program/src/oracle/util/test_uwide.c new file mode 100644 index 000000000..aabb1b7e5 --- /dev/null +++ b/program/src/oracle/util/test_uwide.c @@ -0,0 +1,211 @@ +#include +#include "util.h" + +/* This unit test assumes a uint128_t extensions are available on the + test platform */ + +__extension__ typedef __int128 int128_t; +__extension__ typedef unsigned __int128 uint128_t; + +static inline uint64_t split_hi( uint128_t x ) { return (uint64_t)(x>>64); } +static inline uint64_t split_lo( uint128_t x ) { return (uint64_t) x; } + +static inline uint128_t join( uint64_t xh, uint64_t xl ) { return (((uint128_t)xh)<<64) | ((uint128_t)xl); } + +static inline uint128_t +prng_uint128( prng_t * prng ) { + uint64_t h = prng_uint64( prng ); + uint64_t l = prng_uint64( prng ); + return join( h, l ); +} + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + int ctr = 0; + for( int i=0; i<500000000; i++ ) { + if( !ctr ) { printf( "Completed %i iterations\n", i ); ctr = 10000000; } + ctr--; + + /* Generate a random test vector */ + + uint32_t t = prng_uint32( prng ); + int nx = 1+(((int)t) & 127); t >>= 7; /* Bit width of the x, in [1,128] */ + int ny = 1+(((int)t) & 127); t >>= 7; /* Bit width of the y, in [1,128] */ + int nc = 1+(((int)t) & 63); t >>= 6; /* Bit width of the c, in [1,64] */ + int b0 = ((int)t) & 1; t >>= 1; /* Random bit, in [0,1] */ + int b1 = ((int)t) & 1; t >>= 1; + int s = b1+(((int)t) & 127); t >>= 8; /* Shift magnitude, in [0,128] (0 and 128 at half prob) */ + + uint128_t x = prng_uint128( prng ) >> (128-nx); uint64_t xh = split_hi( x ); uint64_t xl = split_lo( x ); + uint128_t y = prng_uint128( prng ) >> (128-ny); uint64_t yh = split_hi( y ); uint64_t yl = split_lo( y ); + 
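+    /* Illustrative spot check added for exposition (not in the original
+       test): verify uwide_mul against the native 128-bit product of the
+       low words already generated above.  This assumes uwide_mul(
+       &zh,&zl, a, b ) computes zh 2^64 + zl = a*b exactly, as documented
+       in uwide.h; xl, yl, join and the split helpers are reused. */
+
+    do {
+      uint128_t p0 = ((uint128_t)xl)*((uint128_t)yl);
+      uint64_t ph,pl; uwide_mul( &ph,&pl, xl, yl );
+      if( join( ph,pl )!=p0 ) {
+        printf( "FAIL (iter %i op uwide_mul x %016lx y %016lx z %016lx %016lx z0 %016lx %016lx\n",
+                i, xl, yl, ph,pl, split_hi(p0),split_lo(p0) );
+        return 1;
+      }
+    } while(0);
+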
uint64_t c = prng_uint64 ( prng ) >> ( 64-nc); + + /* Add two random uint128_t x and y with a random 64-bit carry in c */ + + do { + uint128_t z0 = (uint128_t)c + x; uint64_t w0 = (uint64_t)(z0>(128-s)); + uint64_t zh,zl; int f = uwide_sl( &zh,&zl, yh,yl, s ); + uint128_t z = join( zh,zl ); + if( z!=z0 || f!=f0 ) { + printf( "FAIL (iter %i op uwide_sl y %016lx %016lx s %i z %016lx %016lx f %i z0 %016lx %016lx f0 %i\n", + i, yh,yl, s, zh,zl, f, split_hi(z0),split_lo(z0), f0 ); + return 1; + } + } while(0); + + /* Right shift a random uint128_t y */ + + do { + uint128_t z0 = s==128 ? (uint128_t)0 : y>>s; + int f0 = (s==0) ? 0 : (s==128) ? !!y : !!(y<<(128-s)); + uint64_t zh,zl; int f = uwide_sr( &zh,&zl, yh,yl, s ); + uint128_t z = join( zh,zl ); + if( z!=z0 || f!=f0 ) { + printf( "FAIL (iter %i op uwide_sr y %016lx %016lx s %i z %016lx %016lx f %i z0 %016lx %016lx f0 %i\n", + i, yh,yl, s, zh,zl, f, split_hi(z0),split_lo(z0), f0 ); + return 1; + } + } while(0); + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass\n" ); + + return 0; +} + diff --git a/program/src/oracle/util/util.h b/program/src/oracle/util/util.h new file mode 100644 index 000000000..c6dfe1bcd --- /dev/null +++ b/program/src/oracle/util/util.h @@ -0,0 +1,18 @@ +#ifndef _pyth_oracle_util_util_h_ +#define _pyth_oracle_util_util_h_ + +#include "align.h" /* includes stdint.h */ +//#include "sar.h" /* includes stdint.h */ +//#include "hash.h" /* includes stdint.h */ +//#include "log2.h" /* includes stdint.h */ +#include "smag.h" /* includes stdint.h */ +#include "rexp2.h" /* includes stdint.h */ +#include "exp2m1.h" /* includes stdint.h */ +//#include "uwide.h" /* includes log2.h */ +//#include "sqrt.h" /* includes log2.h */ +#include "fxp.h" /* includes uwide.h and sqrt.h */ +#include "avg.h" /* includes sar.h */ +#include "prng.h" /* includes stdalign.h and hash.h */ + +#endif /* _pyth_oracle_util_util_h_ */ + diff --git a/program/src/oracle/util/uwide.h b/program/src/oracle/util/uwide.h new file mode 100644 index 000000000..d8ca33862 --- /dev/null +++ b/program/src/oracle/util/uwide.h @@ -0,0 +1,363 @@ +#ifndef _pyth_oracle_util_uwide_h_ +#define _pyth_oracle_util_uwide_h_ + +/* Useful operations for unsigned 128-bit integer operations. A 128-bit + wide number is represented as a pair of uint64_t. In the notation + below == xh 2^64 + xl where xh and xl are uint64_t's. FIXME: + CONSIDER ADDING A STYLE BASED ON GCC'S __int128 EXTENSIONS? */ + +#include "log2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Compute co 2^128 + = + + ci exactly. Returns + the carry out. Note that the carry in / carry operations should be + compile time optimized out if ci is 0 at compile time on input and/or + return value is not used. Assumes _zh and _zl are valid (e.g. + non-NULL and non-overlapping). Ignoring carry in/out related + operations costs 4 u64 adds, 1 u64 compare and 1 u64 conditional + increment. */ + +static inline uint64_t +uwide_add( uint64_t * _zh, uint64_t * _zl, + uint64_t xh, uint64_t xl, + uint64_t yh, uint64_t yl, + uint64_t ci ) { + uint64_t zh = xh; uint64_t zl = xl; + uint64_t ct; uint64_t co; + zl += ci; ct = (uint64_t)(zl = ( + y) mod 2^128 exactly (a common use of + the above) */ + +static inline void +uwide_inc( uint64_t * _zh, uint64_t * _zl, + uint64_t xh, uint64_t xl, + uint64_t y ) { + uint64_t zl = xl + y; + uint64_t zh = xh + (zl = bo 2^128 + - - bi exactly. Returns + the borrow out. 
Note that the borrow in / borrow operations should + be compile time optimized out if b is 0 at compile time on input + and/or return value is not used. Assumes _zh and _zl are valid (e.g. + non-NULL and non-overlapping). Ignoring borrow in/out related + operations costs 4 u64 subs, 1 u64 compare and 1 u64 conditional + decrement. */ + +static inline uint64_t +uwide_sub( uint64_t * _zh, uint64_t * _zl, + uint64_t xh, uint64_t xl, + uint64_t yh, uint64_t yl, + uint64_t bi ) { + uint64_t zh = xh; uint64_t zl = xl; + uint64_t bt; uint64_t bo; + bt = (uint64_t)(zl = ( - y) mod 2^128 exactly (a common use of + the above) */ + +static inline void +uwide_dec( uint64_t * _zh, uint64_t * _zl, + uint64_t xh, uint64_t xl, + uint64_t y ) { + uint64_t zh = xh - (uint64_t)(y>xl); + uint64_t zl = xl - y; + *_zh = zh; *_zl = zl; +} + +/* Compute = x*y exactly, will be in [0,2^128-2^65+1]. Assumes + _zh and _zl are valid (e.g. non-NULL and non-overlapping). Cost is 4 + u32*u32->u64 muls, 4 u64 adds, 2 u64 compares, 2 u64 conditional + increments, 8 u64 u32 word extractions. */ + +static inline void +uwide_mul( uint64_t * _zh, uint64_t * _zl, + uint64_t x, + uint64_t y ) { + uint64_t x1 = x>>32; uint64_t x0 = (uint64_t)(uint32_t)x; /* both 2^32-1 @ worst case (x==y==2^64-1) */ + uint64_t y1 = y>>32; uint64_t y0 = (uint64_t)(uint32_t)y; /* both 2^32-1 @ worst case */ + + uint64_t w0 = x0*y0; uint64_t w1 = x0*y1; /* both 2^64-2^33+1 @ worst case */ + uint64_t w2 = x1*y0; uint64_t w3 = x1*y1; /* both 2^64-2^33+1 @ worst case */ + + uint64_t w1h = w1>>32; uint64_t w1l = (uint64_t)(uint32_t)w1; /* w1h 2^32-2, w1l 1 @ worst case */ + uint64_t w2h = w2>>32; uint64_t w2l = (uint64_t)(uint32_t)w2; /* w2h 2^32-2, w2l 1 @ worst case */ + + uint64_t zh = w1h + w2h + w3; /* 2^64-3 @ worst case */ + uint64_t t0 = w0 + (w1l<<32); zh += (uint64_t)(t0 ) exactly. Assumes is not 0. */ + +static inline int +uwide_log2( uint64_t xh, + uint64_t xl ) { + int off = 0; + if( xh ) { off = 64; xl = xh; } + return off + log2_uint64( xl ); +} + +/* Same as the uwide_log2 but returns def is is 0. */ + +static inline int uwide_log2_def( uint64_t xh, uint64_t xl, int def ) { return (xh|xl) ? uwide_log2(xh,xl) : def; } + +/* Compute = << s. Assumes _zh and _zl are valid (e.g. + non-NULL and non-overlapping) and s is non-negative. Large values of + s are fine (shifts to zero). Returns the inexact flag (will be 0 or + 1) which indicates if any non-zero bits of were lost in the + process. Note that inexact handling and various cases should be + compile time optimized out if if s is known at compile time on input + and/or return value is not used. Ignoring inexact handling and + assuming compile time s, for the worst case s, cost is 3 u64 shifts + and 1 u64 bit or. (FIXME: CONSIDER HAVING AN INVALID FLAG FOR + NEGATIVE S?) */ + +static inline int +uwide_sl( uint64_t * _zh, uint64_t * _zl, + uint64_t xh, uint64_t xl, + int s ) { + if( s>=128 ) { *_zh = UINT64_C(0); *_zl = UINT64_C(0); return !!(xh| xl ); } + if( s> 64 ) { s -= 64; int t = 64-s; *_zh = xl<>t)); } + if( s== 64 ) { *_zh = xl; *_zl = UINT64_C(0); return !! xh; } + if( s> 0 ) { int t = 64-s; *_zh = (xh<>t); *_zl = xl<>t); } + /* s== 0 */ *_zh = xh; *_zl = xl; return 0; +} + +/* Compute = >> s. Assumes _zh and _zl are valid (e.g. + non-NULL and non-overlapping) and s is non-negative. Large values of + s are fine (shifts to zero). Returns the inexact flag (will be 0 or + 1) which indicates if any non-zero bits of were lost in the + process. 
Note that inexact handling and various cases should be + compile time optimized out if if s is known at compile time on input + and/or return value is not used. Ignoring inexact handling and + assuming compile time s, for the worst case s, cost is 3 u64 shifts + and 1 u64 bit or. (FIXME: CONSIDER HAVING AN INVALID FLAG FOR + NEGATIVE S AND/OR MORE DETAILED INEXACT FLAGS TO SIMPLIFY + IMPLEMENTING FIXED AND FLOATING POINT ROUNDING MODES?) */ + +static inline int +uwide_sr( uint64_t * _zh, uint64_t * _zl, + uint64_t xh, uint64_t xl, + int s ) { + if( s>=128 ) { *_zh = UINT64_C(0); *_zl = UINT64_C(0); return !!( xh |xl); } + if( s> 64 ) { s -= 64; int t = 64-s; *_zh = UINT64_C(0); *_zl = xh>>s; return !!((xh< 0 ) { int t = 64-s; *_zh = xh>>s; *_zl = (xl>>s)|(xh<u64 mul, 2 u64 add, 1 u64 u32 extract. + + Theory: Let q d + r = n 2^64 where q = floor( n 2^64 / d ). Note r + is in [0,d). Break d into d = dh 2^32 + dl where dh and dl are + 32-bit words: + q+r/d = n 2^64 / d = n 2^64 / ( dh 2^32 + dl ) + = n 2^64 / ( (dh+1) 2^32 - (2^32-dl) ) ... dh+1 can be computed without overflow, 2^32-dl is positive + > n 2^64 / ( (dh+1) 2^32 ) + >= n floor( 2^64/(dh+1) ) / 2^32 + Note that floor( 2^64/(dh+1) ) is in [2^32,2^33-4]. This suggests + letting: + 2^32 + m = floor( 2^64/(dh+1) ) + where m is in [0,2^32-4] (fits in a uint32_t). Then we have: + q+r/d > n (2^32 + m) / 2^32 + = n + n m / 2^32 + Similarly breaking n into n = nh 2^32 + nl: + q+r/d > n + (nh m) + (nl m/2^32) + >= n + nh m + floor( nl m / 2^32 ) + We get: + q+r/d > n + nh*m + ((nl*m)>>32) + And, as q is integral, r/d is less than 1 and the right hand side is + integral: + q >= qa + where: + qa = n + nh*m + (((nl*m)>>32) + and: + m = floor( 2^64 / (dh+1) ) - 2^32 + + To compute m efficiently, note: + m = floor( 1 + (2^64-(dh+1))/(dh+1) ) - 2^32 + = floor( (2^64-(dh+1))/(dh+1) ) - (2^32-1) + and in "C style" modulo 2^64 arithmetic, 2^64 - x = -x. This yields: + m = (-(dh+1))/(dh+1) - (2^32-1) +*/ + +static inline uint64_t /* In [0,2^32) */ +uwide_div_approx_init( uint64_t d ) { /* d in [2^63,2^64) */ + uint64_t m = (d>>32) + UINT64_C(1); /* m = dh+1 and is in (2^31,2^32] ... exact */ + return ((-m)/m) - (uint64_t)UINT32_MAX; /* m = floor( 2^64/(dh+1) ) - 2^32 ... exact */ +} + +static inline uint64_t /* In [n,2^64) and <= floor(n 2^64/d) */ +uwide_div_approx( uint64_t n, /* In [0,d) */ + uint64_t m ) { /* Output of uwide_div_helper_init for the desired d */ + uint64_t nh = n>>32; + uint64_t nl = (uint64_t)(uint32_t)n; + return n + nh*m + ((nl*m)>>32); +} + +/* Compute z = floor( (xh 2^64 + xl) / y ). Assumes _zh and _zl are + valid (e.g. non-NULL and non-overlapping). Requires y to be + non-zero. Returns the exception flag if y is 0 (=0 in this + case). This is not very cheap and cost is highly variable depending + on properties of both n and d. Worst case is roughly ~3 u64/u64 + divides, ~24 u64*u64->u64 muls plus other minor ops. + + Breakdown in worst case 1 u64 log2, 2 u64/u64 div, 1 u64*u64->u64 + mul, 1 u64 sub, 1 int sub, 1 u128 variable shift, 1 1 u64 variable + shift, 1 u64 div approx init, 4*(1 u64 div approx, 1 u64*u64->u128 + mul, 1 u128 sub) plus various operations to faciliating shortcutting + (e.g. when xh is zero, cost is 1 u64/u64 div). 
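+
+   As an illustrative worked example (added for exposition, not part of
+   the original comment): dividing xh=1, xl=0 (i.e. 2^64) by y=3 gives
+   zh=0 and zl=0x5555555555555555 = floor( 2^64 / 3 ); the discarded
+   remainder is 1 (uwide_divrem below returns it).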
*/ + +static inline int +uwide_div( uint64_t * _zh, uint64_t * _zl, + uint64_t xh, uint64_t xl, + uint64_t y ) { + + /* Simple cases (y==0, x==0 and y==2^n) */ + + if( !y ) { *_zh = UINT64_C(0); *_zl = UINT64_C(0); return 1; } + if( !xh ) { *_zh = UINT64_C(0); *_zl = xl / y; return 0; } + + int n = log2_uint64( y ); + if( !(y & (y-UINT64_C(1))) ) { uwide_sr( _zh,_zl, xh,xl, n ); return 0; } + + /* General case ... compute zh noting that: + = floor( (xh 2^64 + xl) / y ) + = floor( ((qh y + rh) 2^64 + xl) / y ) + where xh = qh y + rh and qh is floor(xh/y), rh = xh%y and rh is in + [0,y). Thus: + = qh 2^64 + floor( (rh 2^64 + xl) / y ) + where the right term is less that 2^64 such that zh is qh and + zl = floor( (rh 2^64 + xl) / y ). */ + + uint64_t qh = xh / y; + uint64_t rh = xh - qh*y; + + /* Simple zl shortcut */ + + if( !rh ) { *_zh = qh; *_zl = xl / y; return 0; } + + /* At this point, zh is qh and zl is floor( (rh 2^64 + xl) / y ) where + rh is in (0,y) and y is a positive non-power of 2. + + Normalize this by noting for: + q=n/d; r=n-q*d=n%d + we have: + -> n = q d + r where q = floor(n/d) and r in [0,d). + -> n w = q d w + r w for some integer w>0 + That is, if we scale up both n and d by w, it doesn't affect q (it + just scales the remainder). We use a scale factor of 2^s on + and y such that y will have its most significant bit set. + This will not cause to overflow as rh is less than y at + this point. */ + + int s = 63-n; + uint64_t nh, nl; uwide_sl( &nh,&nl, rh,xl, s ); + uint64_t d = y << s; + + /* At this point zh = qh and zl is floor( (nh 2^64 + nl) / d ) where + nh is in (0,d) and d is in (2^63,2^64). */ + + uint64_t m = uwide_div_approx_init( d ); + uint64_t eh = nh; uint64_t el = nl; + uint64_t ql = UINT64_C(0); + do { + + /* At this point, n-ql*d has an error of such that: + d < 2^64 <= + (i.e. eh is non-zero). If we increment ql by the correction + floor(/d), we'd be at the solution but computing this is + as hard as the original problem. We do have the ability to + very quickly compute a ~32-bit accurate estimate dqest of + floor(/d) such that: + 1 <= eh <= dqest <= floor(/d) <= floor( 0 */ + uwide_mul( &eh,&el, ql, d ); /* = ql*d */ + uwide_sub( &eh,&el, nh,nl, eh,el, UINT64_C(0) ); /* = - ql d */ + } while( eh ); + + /* At this point, n - ql*d has an error less than 2^64 so we can + directly compute the remaining correction. */ + + ql += el/d; + + *_zh = qh; *_zl = ql; return 0; +} + +/* Same as the above but returns the value of the remainder too. + For non-zero y, remainder will be in [0,y). If y==0, returns + =0 with a remainder of UINT64_MAX (to signal error). */ + +static inline uint64_t +uwide_divrem( uint64_t * _zh, uint64_t * _zl, + uint64_t xh, uint64_t xl, + uint64_t y ) { + + if( !y ) { *_zh = UINT64_C(0); *_zl = UINT64_C(0); return UINT64_MAX; } + if( !xh ) { uint64_t ql = xl / y; uint64_t r = xl - ql*y; *_zh = UINT64_C(0); *_zl = ql; return r; } + + int n = log2_uint64( y ); + if( !(y & (y-UINT64_C(1))) ) { int s = 64-n; uwide_sr( _zh,_zl, xh,xl, n ); return n ? 
((xl << s) >> s) : UINT64_C(0); } + + uint64_t qh = xh / y; + uint64_t rh = xh - qh*y; + + if( !rh ) { uint64_t ql = xl / y; uint64_t r = xl - ql*y; *_zh = qh; *_zl = ql; return r; } + + int s = 63-n; + uint64_t nh, nl; uwide_sl( &nh,&nl, rh,xl, s ); + uint64_t d = y << s; + + uint64_t m = uwide_div_approx_init( d ); + uint64_t eh = nh; uint64_t el = nl; + uint64_t ql = UINT64_C(0); + do { + ql += uwide_div_approx( eh, m ); + uwide_mul( &eh,&el, ql, d ); + uwide_sub( &eh,&el, nh,nl, eh,el, UINT64_C(0) ); + } while( eh ); + + uint64_t dq = el / d; + uint64_t r = (el - dq*d) >> s; + ql += dq; + + *_zh = qh; *_zl = ql; return r; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_util_uwide_h_ */
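A minimal sketch (not part of this patch) of how the uwide_div / uwide_divrem paths could be cross-checked against the compiler's native 128-bit division, in the same style as test_uwide.c. The file name test_uwide_div.c, the iteration count and the zero prng seeding are illustrative assumptions; only the prng_* and uwide_* APIs declared in this patch, plus the same __int128 extension test_uwide.c already relies on, are used.

#include <stdio.h>
#include "util.h"

__extension__ typedef unsigned __int128 uint128_t;

static inline uint64_t  split_hi( uint128_t x ) { return (uint64_t)(x>>64); }
static inline uint64_t  split_lo( uint128_t x ) { return (uint64_t) x;      }
static inline uint128_t join( uint64_t h, uint64_t l ) { return (((uint128_t)h)<<64) | ((uint128_t)l); }

int
main( int argc,
      char ** argv ) {
  (void)argc; (void)argv;

  prng_t _prng[1];
  prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) );

  for( int i=0; i<10000000; i++ ) {

    /* Random numerator and denominator with variable bit widths to
       stress the leading-zero handling in uwide_div */

    uint32_t t  = prng_uint32( prng );
    int      nx = 1+(((int)t) & 127); t >>= 7;                 /* numerator width, in [1,128] */
    int      ny = 1+(((int)t) &  63);                          /* denominator width, in [1,64] */
    uint128_t x = join( prng_uint64( prng ), prng_uint64( prng ) ) >> (128-nx);
    uint64_t  y = prng_uint64( prng ) >> (64-ny);              /* may be zero (exercises the error path) */

    uint64_t xh = split_hi( x ); uint64_t xl = split_lo( x );

    /* uwide_div: quotient and exception flag must match the reference */

    uint64_t zh,zl; int ex = uwide_div( &zh,&zl, xh,xl, y );
    if( y ) {
      uint128_t q0 = x / (uint128_t)y;
      if( ex || join( zh,zl )!=q0 ) { printf( "FAIL (div iter %i)\n", i ); return 1; }
    } else if( !ex || zh || zl ) { printf( "FAIL (div by 0 iter %i)\n", i ); return 1; }

    /* uwide_divrem: additionally check the returned remainder */

    uint64_t r = uwide_divrem( &zh,&zl, xh,xl, y );
    if( y ) {
      uint128_t q0 = x / (uint128_t)y;
      if( join( zh,zl )!=q0 || r!=(uint64_t)(x - q0*(uint128_t)y) ) {
        printf( "FAIL (divrem iter %i)\n", i ); return 1;
      }
    } else if( zh || zl || r!=UINT64_MAX ) { printf( "FAIL (divrem by 0 iter %i)\n", i ); return 1; }
  }

  prng_delete( prng_leave( prng ) );

  printf( "pass\n" );
  return 0;
}

The same harness shape extends naturally to uwide_add / uwide_sub carry and borrow propagation if broader coverage is wanted.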