diff --git a/CMakeLists.txt b/CMakeLists.txt index 2a13188f1..291e15b41 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,7 +39,7 @@ set( PC_SRC pc/request.cpp; pc/rpc_client.cpp; pc/user.cpp; - program/src/oracle/sort.c + program/src/oracle/model/price_model.c ) set( PC_HDR @@ -116,9 +116,8 @@ add_test( test_net test_net ) # fuzz testing application # -add_executable( fuzz pctest/fuzz.cpp ) -target_link_libraries( fuzz ${PC_DEP} ) - +#add_executable( fuzz pctest/fuzz.cpp ) +#target_link_libraries( fuzz ${PC_DEP} ) # # bpf static analysis (CLion) diff --git a/program/src/oracle/model/clean b/program/src/oracle/model/clean new file mode 100755 index 000000000..860ef1641 --- /dev/null +++ b/program/src/oracle/model/clean @@ -0,0 +1,3 @@ +#!/bin/sh +rm -rfv bin + diff --git a/program/src/oracle/model/gap_model.h b/program/src/oracle/model/gap_model.h new file mode 100644 index 000000000..99df18f6b --- /dev/null +++ b/program/src/oracle/model/gap_model.h @@ -0,0 +1,236 @@ +#ifndef _pyth_oracle_model_gap_model_h_ +#define _pyth_oracle_model_gap_model_h_ + +/* gap_model computes our apriori belief how comparable + (agg_price,agg_conf) pairs measured at different points in time are + expected to be. + + Let ps / pd be our apriori belief that aggregated values measured at + two different block sequence numbers (separated by gap where gap is + strictly positive) are representative of statistically similar / + distinct price distributions (note that ps+pd=1). Then gap_model + computes: + + r/2^30 ~ ps / pd + + The specific model used below is: + + ps ~ exp( -gap / lambda ) + + where lambda is the timescale (in block sequence numbers) over which + it is believed to be sensible to consider (agg_price,agg_conf) + price distribution parameterizations comparable. This model reduces + to a standard EMA when gap is always 1 but naturally generalizes to + non-uniform sampling (e.g. chain congestion), large gaps (e.g. + EOD-to-next-SOD) and initialization (gap->infinity). For this model, + lambda is known at compile time (see GAP_MODEL_LAMBA) and on return r + will be in [0,lambda 2^30]. Gap==0 should not be passed to this + model but for specificity, the model here returns 0 if it is. + + For robustness and accuracy of the finite precision computation, + lambda should be in 1 << lambda << 294801. The precision of the + calculation is impacted by the EXP2M1_FXP_ORDER (default value is + beyond IEEE single precision accurate). + + Note that by modifying this we can correct for subtle sampling + effects for specific chains and/or symbols (SOD, EOD, impact of + different mechanisms for chain congestion) without needing to modify + anything else. We could even incorporate time invariant processes by + tweaking the API here to take the block sequence numbers of the + samples in question. + + If GAP_MODEL_NEED_REF is defined, this will also declare a high + accuracy floating point reference implementation "gap_model_ref" to + aid in testing / tuning block chain suitable approximations. */ + +#include "../util/exp2m1.h" + +#ifdef GAP_MODEL_NEED_REF +#include +#endif + +/* GAP_MODEL_LAMBDA should an integer in [1,294802) + (such that 720 GAP_MODEL_LAMBDA^3 < 2^64 */ + +#ifndef GAP_MODEL_LAMBDA +#define GAP_MODEL_LAMBDA (3000) +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef GAP_MODEL_NEED_REF + +/* Returns ps/pd ~ 1/(exp(gap/GAP_MODEL_LAMBDA)-1) if gap is positive + and 0 otherwise */ + +static inline long double +gap_model_ref( long double gap ) { + return gap>0.L ? 
(1.L / expm1l( gap / ((long double)GAP_MODEL_LAMBDA) ) ) : 0.L; +} + +#endif + +static inline uint64_t /* In [0,2^30 GAP_MODEL_LAMBDA] */ +gap_model( uint64_t gap ) { /* Non-zero */ + + /* Let x = gap/lambda. Then: + + r/2^30 = ps / pd = exp(-x) / ( 1 - exp(-x) ) + + For x<<1, the series expansion is: + + r/2^30 ~ 1/x - 1/2 + (1/12) x - (1/720) x^3 + (1/30240) x^5 - ... + + Noting that this is a convergent series with alternating terms, the + error in truncated this series at the x^3 term is at most the x^5 + term. Requiring then the x^5 term to be at most 2^-31 guarantees + the series truncation error will be at most 1/2 ulp. This implies + the series can be used for a practically full precision if: + + x <~ (30240 2^-31)^(1/5) ~ 0.10709... + -> gap < 0.10709... lambda + + The truncated series in terms of gap can be written as: + + r/2^30 ~ lambda/gap - 1/2 + gap/(12 lambda) - gap^3/(720 lambda^3) + = (lambda + gap/2 + gap^2 ( 1/(12 lambda) - gap^2 (1/(720 lambda^3)) )) / gap + + Convering the division to an integer division with grade school + rounding: + + r ~ floor( ( 2^(30+s) lambda + gap (2^(29+s)-2^(s-1)) + + gap^2 ( 2^(30+s)/(12 lambda) - gap^2 2^(30+s) / (720 lambda^3) ) ) / (2^s gap) ) + + where s is a positive integer picked to maximize precision while + not causing intermediate overflow. We require 720 lambda^3 < 2^64 + -> lambda <= lambda_max = 294801 (such that the coefficients above + are straightforward to compute at compile time). + + Given gap <= 0.10709 lambda_max, gap^2 can be computed exactly + without overflow. Adding an intermediate scale factor to improve + the accuracy of the correction coefficients then yields: + + r ~ floor( cp + gap c0 + floor((gap^2 (c1 - gap^2 c3) + 2^(t-1)) / 2^t) ) / (2^s gap) ) + + where: + + cp = 2^(30+s) lambda + c0 = 2^(29+s)-2^(s-1) + c1 = 2^(30+s+t)/(12 lambda) + c3 = 2^(30+s+t)/(720 lambda^3) + + To prevent overflow in the c1 and c3 corrections, it is sufficient that: + + s+t < log(12 2^34 / lambda_max ) ~ 22.7 + + So we use s+t==22. Maximizing precision in the correction + calculation then is achieved with s==1 and t==21. (The pole and + constant calculations are exact so long as s is positive.) This + yields the small gap approximation: + + r ~ floor( (cp + gap c0 + (gap^2 ( c1 - gap^2 c3 ) + 2^20)>>21) / 2 gap) ) + + cp = 2^31 lambda + c0 = 2^30 - 1 + c1 = floor( (2^52 + 6 lambda)/(12 lambda) ) + c3 = floor( (2^52 + 360 lambda^3)/(720 lambda^3) ) + + This is ~30-bits accurate when gap < 0.10709... lambda. + + For x>>1, multiplying by 1 = exp(x) / exp(x) yields: + + r/2^30 = 1 / ( exp(x) - 1 ) + = 1 / ( exp2(x log_2 e) - 1 ) + = 1 / ( exp2( gap ((2^30 log_2 e)/lambda) ) - 1 ) + ~ 1 / exp2m1( (gap c + 2^(s-1)) >> s ) + = 2^30 / exp2m1_fxp( (gap c + 2^(s-1)) >> s ) + + where c = round( (2^30+s) log_2 e / lambda ) and s is picked to + maximize precision of computing gap / lambda without causing + overflow. Further evaluating: + + c ~ ((2^(31+s)) log_2 e + lambda) / (lambda<<1) + d = exp2m1_fxp( (gap*c + 2^(s-1)) >> s ) + r ~ (2^61 + d) / (d<<1) + + For the choice of s, we note that if: + + exp(-x) / (1-exp(-x)) <= 2^-31 + (1+2^-31) exp(-x) <= 2^-31 + exp(-x) <= 2^-31/(1+2^-31) + x >= -ln (2^31/(1+2^-31)) ~ 21.488... + + The result will round to zero. Thus, we only care about cases + where: + + gap < thresh + + and: + + thresh ~ 21.5 lambda + + We thus require: + + c (thresh-1) + 2^(s-1) < 2^64 + (2^(30+s)) log_2 e 21.5 < ~2^64 + 2^s <~ 2^34 / (21.5 log_2 e) + s <~ 34 - log_2 (21.5 log_2 e ) ~ 29.045... 
+ + So we use s == 29 when computing compile time constant c. + + (Note: while it looks like we could just use the x>>1 approximation + everywhere, it isn't so accurate for gap ~ 1 as the error in the + c*gap is amplified exponentially into the output.) */ + + /* If we need the behavior that this never returns 0 (i.e. we are + never 100 percent confident that samples separated by a large gap + cannot be compared), tweak big_thresh for the condition + + exp(-x) / (1-exp(-x)) <= 3 2^-31 + + and return 1 here. */ + + static uint64_t const big_thresh = (UINT64_C(43)*(uint64_t)GAP_MODEL_LAMBDA) >> 1; + if( !gap || gap>=big_thresh ) return UINT64_C(0); + + /* round( 0.10709... lambda ) <= 3191 */ +//static uint64_t const small_thresh = (UINT64_C(0x1b69f363043ef)*(uint64_t)GAP_MODEL_LAMBDA + (UINT64_C(1)<<51)) >> 52; + + /* hand tuned value minimizes worst case ulp error in the + EXP2M1_ORDER=7 case */ + static uint64_t const small_thresh = (UINT64_C(0x2f513cc1e098e)*(uint64_t)GAP_MODEL_LAMBDA + (UINT64_C(1)<<51)) >> 52; + + if( gap<=small_thresh ) { + + /* 2^31 lambda, exact */ + static uint64_t const cp = ((uint64_t)GAP_MODEL_LAMBDA)<<31; + + /* 2^30 - 1, exact */ + static uint64_t const c0 = (UINT64_C(1)<<30) - UINT64_C(1); + + /* round( 2^52 / (12 lambda) ) */ + static uint64_t const c1 = ((UINT64_C(1)<<52) + UINT64_C(6)*(uint64_t)GAP_MODEL_LAMBDA) + / (UINT64_C(12)*(uint64_t)GAP_MODEL_LAMBDA); + + /* round( 2^52 / (720 lambda^3) ) */ + static uint64_t const c3 = ((UINT64_C(1)<<52) + UINT64_C(360)*((uint64_t)GAP_MODEL_LAMBDA)*((uint64_t)GAP_MODEL_LAMBDA)*((uint64_t)GAP_MODEL_LAMBDA)) + / (UINT64_C(720)*((uint64_t)GAP_MODEL_LAMBDA)*((uint64_t)GAP_MODEL_LAMBDA)*((uint64_t)GAP_MODEL_LAMBDA)); + + uint64_t gap_sq = gap*gap; /* exact, at most 3191^2 (no overflow) */ + return (cp - c0*gap + (( gap_sq*(c1 - gap_sq*c3) + (UINT64_C(1)<<20))>>21)) / (gap<<1); + } + + /* round( 2^59 log_2 / lambda ) */ + static uint64_t const c = ((UINT64_C(0x171547652b82fe18)) + (uint64_t)GAP_MODEL_LAMBDA) / (((uint64_t)GAP_MODEL_LAMBDA)<<1); + + uint64_t d = exp2m1_fxp( (gap*c + (UINT64_C(1)<<28)) >> 29 ); + return ((UINT64_C(1)<<61) + d) / (d<<1); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_model_gap_model_h_ */ diff --git a/program/src/oracle/model/leaky_integrator.h b/program/src/oracle/model/leaky_integrator.h new file mode 100644 index 000000000..46a8e6171 --- /dev/null +++ b/program/src/oracle/model/leaky_integrator.h @@ -0,0 +1,82 @@ +#ifndef _pyth_oracle_model_leaky_integrator_h_ +#define _pyth_oracle_model_leaky_integrator_h_ + +/* Compute: + + (z/2^30) ~ (y/2^30)(w/2^30) + x + + Specifically returns: + + co 2^128 + = round( w / 2^30 ) + 2^30 x + + where co is the "carry out". (Can be used to detect overflow and/or + automatically rescale as necessary.) Assumes w is in [0,2^30]. + Calculation is essentially full precision O(1) and more accurate than + a long double implementation for a wide range inputs. + + If LEAKY_INTEGRATOR_NEED_REF is defined, this will also declare a + high accuracy floating point reference implementation + "leaky_integrator_ref" to aid in testing / tuning block chain + suitable approximations. (Note: In this case, the limitations of + IEEE long double floating point mean in this case leaky_integator_ref + is actually expected to be _less_ _precise_ than leaky_integrator + when the time scale * price scales involved require more than 64-bit + mantissas.) 
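+
+   As a small sanity-check sketch (hypothetical values; the uwide based
+   implementation below is authoritative): y = 2^30 (i.e. y/2^30==1),
+   w = 2^29 (i.e. w/2^30==0.5) and x = 3 should give z/2^30 ~ 1*0.5 + 3:
+
+     uint64_t zh,zl;
+     uint64_t co = leaky_integrator( &zh,&zl, UINT64_C(0),UINT64_C(1)<<30,
+                                     UINT64_C(1)<<29, UINT64_C(3) );
+
+   yielding co==0 and (zh,zl)==(0,UINT64_C(7)<<29), i.e. 3.5 in the same
+   2^30 scale as y.
+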
*/ + +#include "../util/uwide.h" + +#ifdef LEAKY_INTEGRATOR_NEED_REF +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef LEAKY_INTEGRATOR_NEED_REF + +/* Compute z ~ y w + x */ + +static inline long double +leaky_integrator_ref( long double y, + long double w, + long double x ) { + return fmal(y,w,x); +} + +#endif + +static inline uint64_t +leaky_integrator( uint64_t * _zh, uint64_t * _zl, + uint64_t yh, uint64_t yl, + uint64_t w, /* In [0,2^30] */ + uint64_t x ) { + + /* Start from the exact expression: + + z/2^30 = (y/2^30)*(w/2^30) + x + + This yields: + + z = (y w)/2^30 + x 2^30 + z = (yh w 2^64 + yl w)/2^30 + 2^30 x + = yh w 2^34 + yl w/2^30 + 2^30 x + ~ yh w 2^34 + floor((yl w + 2^29)/2^30) + 2^30 x */ + + uint64_t t0_h,t0_l; uwide_mul( &t0_h,&t0_l, yh, w ); /* At most 2^94-2^30 */ + uwide_sl( &t0_h,&t0_l, t0_h,t0_l, 34 ); /* At most 2^128-2^64 (so cannot overflow) */ + + uint64_t t1_h,t1_l; uwide_mul( &t1_h,&t1_l, yl, w ); /* At most 2^94-2^30 */ + uwide_inc( &t1_h,&t1_l, t1_h,t1_l, UINT64_C(1)<<29 ); /* At most 2^94-2^29 (all ones 30:93) */ + uwide_sr ( &t1_h,&t1_l, t1_h,t1_l, 30 ); /* At most 2^64-1 (all ones 0:63) (t1_h==0) */ + + uint64_t xh,xl; uwide_sl( &xh,&xl, UINT64_C(0),x, 30 ); /* At most 2^94-2^30 */ + return uwide_add( _zh,_zl, t0_h,t0_l, xh,xl, t1_l ); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_model_leaky_integrator_h_ */ + diff --git a/program/src/oracle/model/overlap_model.h b/program/src/oracle/model/overlap_model.h new file mode 100644 index 000000000..2e9432a35 --- /dev/null +++ b/program/src/oracle/model/overlap_model.h @@ -0,0 +1,341 @@ +#ifndef _pyth_oracle_model_overlap_model_h_ +#define _pyth_oracle_model_overlap_model_h_ + +/* overlap_model computes our prior belief whether two + (agg_price,agg_conf) estimates parameterize the same underlying + process. Specifically, let ps / pd be our apriori belief that two + different price distribution estimates came from the same / different + underlying process. Then overlap_model computes: + + r/2^30 ~ ps / pd + + Let Lij be the expected likelihood of IID sample drawn from a + Gaussian distribution with mean/rms mu_i/sigma_i for a Gaussian + distribution with mean/rms mu_j/sigma_j. The implementation below + corresponds to: + + ps / pd = weight sqrt( (L01 L10) / (L00 L11) ) + + After some fun with integrals: + + ps / pd = weight sqrt( 2 sigma_0 sigma_1 / (sigma_0^2 + sigma_1^2) ) + exp( -0.5 (mu_0-mu_1)^2 / (sigma_0^2 + sigma_1^2) ) + + where the estimates to compare are (mu0,sigma0) and (mu1,sigma1) and + weight is the ps / pd prior to use for identical estimates (e.g. + what our prior belief that two similar agg_price,agg_conf estimates + just happen to be parameterizing different processes with similar + distributions). As such the output is in [0,weight 2^30]. This + model is heuristic but well motivated theoretically with good + limiting behaviors and robust numerical implementations. + + The implementation below has hardcoded weight to 1 and also has a + tunable but hardcoded sigma normalization factor of 1. If the user + wants to run with different weights and normalizations, see notes + below ("weight", "alpha" and "beta" will need to be adjusted). + + This implementation has been crafted to be completely price scale + invariant and the result should be accurate to O(1) ulp. 
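+
+   For intuition, in terms of the floating point reference described
+   next (illustrative values only): identical estimates map to the full
+   weight while estimates many sigma apart map to ~0, e.g.
+
+     overlap_model_ref( 100.L, 5.L, 100.L, 5.L ); // == weight == 1
+     overlap_model_ref( 100.L, 5.L, 200.L, 5.L ); // ~ exp(-100) ~ 0
+
+   and overlap_model returns the same quantities scaled by 2^30 (up to
+   its stated accuracy).
+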
If + OVERLAP_MODEL_NEED_REF is defined, this will also declare a high + accuracy floating point reference implementation "overlap_model_ref" + to aid in testing / tuning block chain suitable approximations. */ + +#include "../util/fxp.h" +#include "../util/sar.h" + +#ifdef OVERLAP_MODEL_NEED_REF +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef OVERLAP_MODEL_NEED_REF + +/* Compute ps/pd ~ weight sqrt( 2 sigma_0 sigma_1 / (sigma_0^2 + sigma_1^2) ) + exp( -0.5 norm^2 (mu_0-mu_1)^2 / (sigma_0^2 + sigma_1^2) ) + where weight==1 and norm==1. If sigma_0 is not positive, it will be + treated as zero. Similarly for sigma_1. Assumes inputs are tame + such that there are no risks of nans, infs or intermediate overflows. */ + +static inline long double +overlap_model_ref( long double mu_0, long double sigma_0, + long double mu_1, long double sigma_1 ) { + static long double const weight = 1.L; + static long double const norm = 1.L; + if( !(sigma_0>0.L) ) sigma_0 = 0.L; + if( !(sigma_1>0.L) ) sigma_1 = 0.L; + if( sigma_0==0.L && sigma_1==0.L ) return mu_0==mu_1 ? 1.L : 0.L; + long double kappa_sq = 1.L/(sigma_0*sigma_0 + sigma_1*sigma_1); + long double delta = mu_0-mu_1; + return weight*sqrtl( (2.L*kappa_sq)*(sigma_0*sigma_1) )*expl( -((0.5L*norm*norm)*kappa_sq)*(delta*delta) ); +} + +#endif + +static inline uint64_t +overlap_model( uint64_t mu_0, uint64_t sigma_0, + uint64_t mu_1, uint64_t sigma_1 ) { + uint64_t ovfl; + + /* Methodology: + + If we compute sqrt and exp terms of r separately, the sqrt term + will end up with calculations like sqrt( 1/2^30 ) ~ 1/2^15 with a + loss of the bits after the decimal point. This has the effect of + roughly halving the precision in extreme cases. To reduce the + impact of this and speed up the calculation, we instead absorb the + sqrt into the exp term. Further, since exp2 is faster and more + accurate to compute, we rewrite r as: + + r/2^30 ~ exp2( zeta_0 - zeta_1 ) + + where: + + zeta_0 = log2( weight sqrt( 2 sigma_0 sigma_1 / (sigma_0^2 + sigma_1^2) ) ) + zeta_1 = alpha (mu_0 - mu_1)^2 / (sigma_0^2 + sigma_1^2) + alpha = 0.5 log2(e) norm^2 = 0.721... norm^2 + + and norm is a factor of how to the model should normalize the input + sigma values to a "Gaussian RMS equivalent". + + We don't have any prior price scale restrictions for mu and sigma + so we let: + + sigma_min = min(sigma_0,sigma_1) + sigma_max = max(sigma_0,sigma_1) + + With this we have: + + zeta_1 = delta^2 t_sigma + + where: + + delta = (mu_0-mu_1) / sigma_max + t_sigma = alpha / ( 1 + ratio_sigma^2 ) ... in [alpha/2,alpha] + ratio_sigma = sigma_min / sigma_max ... in [0,1] when sigma_max>0 + + For zeta_0, we note: + + zeta_0 = 0.5 + log2( weight ) + 0.5 log2( ratio_sigma / ( 1 + ratio_sigma^2 ) ) + + To optimize this calculation we note that the log2 term can be + rewritten in terms of t_sigma from above: + + zeta_0 = 0.5 + log2( weight ) + 0.5 log2( ratio_sigma t_sigma / alpha ) + = beta + 0.5 log2( ratio_sigma t_sigma ) + + where: + + beta = 0.5 + log2( weight ) - 0.5 log2( alpha ) + + Like alpha, beta can be tuned to taste to reflect how identical + inputs distributions should be weighted. + + This yields the method (in exact math): + + sigma_max = max( sigma_0, sigma_1 ) + if sigma_max==0, return mu_0==mu_1 ? weight : 0 ... 
two delta functions were input, return weight if same and 0 if not + sigma_min = min( sigma_0, sigma_1 ) + delta = (mu_0-mu_1) / sigma_max + ratio_sigma = sigma_min / sigma_max + t_sigma = alpha / ( 1 + ratio_sigma^2 ) + zeta_0 = beta + 0.5 log2( ratio_sigma t_sigma ) + zeta_1 = delta^2 t_sigma + return exp2( zeta_0 - zeta_1 ) + + This is straightforward to convert to fixed points as described in + the details below. */ + + /* Compute: + + sigma_min = min(sigma_0,sigma_1) + sigma_max = max(sigma_0,sigma_1). + + If both are zero (i.e. sigma_max==0), we have two delta functions + and we return weight if they are the same delta function (i.e. + mu_0==mu_1) or 0 if not. If just one is a delta function (i.e. + sigma_max!=0 and sigma_min==0), we return 0 to save a lot of work. */ + + /* FIXME: HARDCODED TO weight==1 */ + static uint64_t const weight = UINT64_C(1)<<30; /* round( 2^30 weight ) */ + + int sigma_c = sigma_0 <= sigma_1; + uint64_t sigma_max = sigma_c ? sigma_1 : sigma_0; + if( !sigma_max ) return mu_0==mu_1 ? weight : UINT64_C(0); + + uint64_t sigma_min = sigma_c ? sigma_0 : sigma_1; + if( !sigma_min ) return UINT64_C(0); + + /* At this point, sigma_max >= sigma_min > 0. Compute: + + delta/2^30 ~ |mu_0-mu_1| / sigma_max + + with round-nearest-away style rounding. For a fixed delta, the + overlap model result is maximized when sigma_0==sigma_1 (maximizes + simultaneously the sqrt factor and the exp factor). In this limit + we have: + + r = weight exp( -0.25 norm^2 delta^2 ) + + If this is less than 2^-31, the result necessarily round to zero + regardless of sigma_0 or sigma_1. In equations, if: + + weight exp( -0.25 norm^2 (delta/2^30)^2 ) < 2^-31 + -> -0.25 norm^2 (delta/2^30)^2 < ln( 2^-31 / weight ) + -> delta/2^30 > 2^30 sqrt( -4 ln( 2^-31 / weight ) / norm^2 ) + ~ 9.2709 for weight==1 and norm==1 + + the result rounds to zero and do not need to compute further. This + implies that if: + + delta > thresh + + where + + thresh ~ ceil( 2^30 sqrt( -4 ln( 2^-31 / weight ) / norm^2 ) ) + + we know the result rounds to zero and can return immediately. This + restricts the range of delta we need to handle which in turn + simplifies the remaining fixed point conversions. + + We can't use fxp_div_rna_fast because the div might overflow (e.g. + mu_max==UINT64_MAX, mu_min=0, sigma_max=2 such that delta/2^30~2^63 + -> delta~2^93 but this gets handled as part of the thresh check) */ + + /* FIXME: HARDCODED TO weight==1 AND norm==1 */ + static uint64_t const thresh = UINT64_C(0x251570310); /* ~33.3 bits */ + + int mu_c = mu_0 <= mu_1; + uint64_t mu_min = mu_c ? mu_0 : mu_1; + uint64_t mu_max = mu_c ? mu_1 : mu_0; + uint64_t delta = fxp_div_rna( mu_max-mu_min, sigma_max, &ovfl ); + if( ovfl || delta>thresh ) return UINT64_C(0); + + /* Compute: + + ratio_sigma/2^30 ~ sigma_min/sigma_max + = (sigma_min/2^30) / (sigma_max/2^30) + + with round-nearest-away style rounding. We can't use + fxp_div_rna_fast here because 2^30 sigma_min might be >= 2^64. We + know that the result will not overflow though as + sigma_min<=sigma_max at this point. + + FIXME: IDEALLY WE'D FIND A REPRESENTATION OF 1/SIGMA_MAX AND USE + THAT HERE AND ABOVE TO SAVE A SLOW FXP_DIV. THIS IS TRICKY TO MAKE + ROBUST GIVEN LACK OF LIMITS ON DYNAMIC RANGE OF SIGMA. */ + + uint64_t ratio_sigma = fxp_div_rna( sigma_min, sigma_max, &ovfl ); + + /* At this point, ratio_sigma is in [0,2^30] (it might have rounded + to zero if the sigmas are wildly different). 
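+
+     (Illustrative numbers: sigma_min==3 with sigma_max==12 gives
+     ratio_sigma==round( 2^30*3/12 )==UINT64_C(1)<<28, i.e. 0.25 in the
+     2^30 fixed point scale.)
+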
Compute: + + t_sigma/2^30 ~ alpha / (1+(ratio_sigma/2^30)^2) + ~ in [alpha/2,alpha] + + to fixed point precision. Interval is approximate as alpha itself + might be rounded from its ideal value. For typical cases, + 0.5>1)) / d ) + + (or, in code): + + uint64_t d = (UINT64_C(1)<<60) + ratio_sigma*ratio_sigma; + uint64_t nh,nl; uwide_inc( &nh,&nl, UINT64_C(0x2e2a8ec),UINT64_C(0xa5705fc2f0000000), d>>1 ); // ~2^90 alpha (use higher precision) + uwide_div( &nh,&nl, nh,nl, d ); + uint64_t t_sigma = nl; + + does this calculation with precise round nearest away rounding but + it is a lot more expensive due to the use of 128b/64b division and + the extra precision empirically doesn't improve the overall + accuracy to justify the extra work. + + fxp_mul_rna_fast is okay as ratio_sigma <= 2^30 + fxp_add_fast is okay 2^30 + ratio_sigma^2/2^30 <= 2^31 + fxp_div_rna_fast is okay as alpha 2^30 + (2^31>>1) < 2^64 */ + + /* FIXME: HARDCODED TO norm==1 */ + static uint64_t const alpha = UINT64_C(0x2e2a8eca); /* round( 2^30 0.5 log2(e) norm^2 ) */ + + uint64_t t_sigma = fxp_div_rna_fast( alpha, fxp_add_fast( UINT64_C(1)<<30, fxp_mul_rna_fast( ratio_sigma, ratio_sigma ) ) ); + + /* Compute: + + e + (f/2^30) ~ 0.5 log2( ratio_sigma t_sigma ) + ~ 0.5 ( e0 + f0/2^30 ) + = floor(e0/2) + 0.5 (e0 is odd) + f0/2^31 + = floor(e0/2) + (2^30 (e0 is odd) + f0) / 2^31 + = (e0>>1) + (((e0&1)<<30) + f0) / 2^31 + ~ (e0>>1) + ((((e0&1)<<30) + f0 + 1)>>1) / 2^30 + -> e = (e0>>1) with sign extension + f = (((e0&1)<<30)+f0+1)>>1) + + If ratio_sigma t_sigma == 0 (or rounds to zero), the log2 term is + going to -inf such that the result will round to zero. (Note that + this might be a source of precision loss when sigmas are wildly + dissimilar.) + + fxp_mul_rna_fast is okay here is ratio_sigma and t_sigma both fit + in O(30) bits. */ + + uint64_t f0; int e0 = fxp_log2_approx( &f0, fxp_mul_rna_fast( ratio_sigma, t_sigma ) ); + if( e0<-30 ) return UINT64_C(0); + int32_t e = sar_int32( (int32_t)e0, 1 ); /* |e| <= 16 */ + uint64_t f = ((((uint64_t)(e0&1)) << 30) + f0 + UINT64_C(1)) >> 1; + + /* Compute: + + zeta0/2^30 ~ beta + f/2^30 + ifelse(e<0, 0, e) + zeta1/2^30 ~ delta^2 t_sigma + ifelse(e<0,-e, 0) + + such that: + + (zeta0-zeta1)/2^30 ~ -delta^2 t_sigma + beta + 0.5 log2( ratio_sigma t_sigma ) + + The ifelse trick allows handling of signed exponents without + needing to use a signed fixed point representation. + + delta^2 t_sigma isn't generally safe to compute with fast fxp + multiplies (for the current hard coded weight==1 and norm==1, + delta*t_sigma is safe but delta^2 isn't and delta*(delta*t_sigma) + isn't due to potential intermediate overflow risk). We know there + is no risk of overflow of the result though. */ + + /* FIXME: HARDCODED TO WEIGHT==1 and NORM==1 */ + static uint64_t const beta = UINT64_C(0x2f14588b); /* round( 2^30 ( 0.5 + log2(weight) - 0.5*log2(alpha) ) ), fits in 30-bits */ + + uint64_t zeta0 = fxp_add_fast( beta, f ); + uint64_t zeta1 = fxp_mul_rna( fxp_mul_rna( delta, delta, &ovfl ), t_sigma, &ovfl ); + if( e1). So we use the appropriate exp2 variant to avoid a + division. + + fxp_sub_fast is okay because we always subtract the smaller from + the larger. */ + + uint64_t r = (zeta0>=zeta1) ? fxp_exp2_approx( fxp_sub_fast( zeta0, zeta1 ) ) + : fxp_rexp2_approx( fxp_sub_fast( zeta1, zeta0 ) ); + return (r>2. The current preference is not to do this as, though + this has stronger bum quote robustness, it results in p25==p50==p75 + when cnt==3. 
(In this case, the above wants to do an interpolation + between quotes 0 and 1 to for the p25 and between quotes 1 and 2 + for the p75. But limiting to just the inside quote results in + p25/p50/p75 all using the median quote.) + + A tweak to this option, for p25, is to use floor(cnt/4) == cnt>>2. + This is simple, has the same asymptotic behavior for large cnt, has + good behavior in the cnt==3 case and practically as good bum quote + rejection in the moderate cnt case. */ + + uint64_t p25_idx = cnt >> 2; + + *_p25 = sort_quote[p25_idx]; + + /* Extract the p50 */ + + if( (cnt & (uint64_t)1) ) { /* Odd number of quotes */ + + uint64_t p50_idx = cnt >> 1; /* ==ceil((cnt-1)/2) */ + + *_p50 = sort_quote[p50_idx]; + + } else { /* Even number of quotes (at least 2) */ + + uint64_t p50_idx_right = cnt >> 1; /* == ceil((cnt-1)/2)> 0 */ + uint64_t p50_idx_left = p50_idx_right - (uint64_t)1; /* ==floor((cnt-1)/2)>=0 (no overflow/underflow) */ + + int64_t vl = sort_quote[p50_idx_left ]; + int64_t vr = sort_quote[p50_idx_right]; + + /* Compute the average of vl and vr (with floor / round toward + negative infinity rounding and without possibility of + intermediate overflow). */ + + *_p50 = avg_2_int64( vl, vr ); + } + + /* Extract the p75 (this is the mirror image of the p25 case) */ + + uint64_t p75_idx = cnt - ((uint64_t)1) - p25_idx; + + *_p75 = sort_quote[p75_idx]; + + return sort_quote; +} + diff --git a/program/src/oracle/model/price_model.h b/program/src/oracle/model/price_model.h new file mode 100644 index 000000000..0a667c72f --- /dev/null +++ b/program/src/oracle/model/price_model.h @@ -0,0 +1,101 @@ +#ifndef _pyth_oracle_model_price_model_h_ +#define _pyth_oracle_model_price_model_h_ + +#include "../util/compat_stdint.h" +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Returns the minimum and maximum number of quotes the implementation + can handle */ + +static inline uint64_t +price_model_quote_min( void ) { + return (uint64_t)1; +} + +static inline uint64_t +price_model_quote_max( void ) { + return (UINT64_MAX-(uint64_t)alignof(int64_t)+(uint64_t)1) / (uint64_t)sizeof(int64_t); +} + +/* price_model_cnt_valid returns non-zero if cnt is a valid value or + zero if not. */ + +static inline int +price_model_cnt_valid( uint64_t cnt ) { + return price_model_quote_min()<=cnt && cnt<=price_model_quote_max(); +} + +/* price_model_scratch_footprint returns the number of bytes of scratch + space needed for an arbitrarily aligned scratch region required by + price_model to handle price_model_quote_min() to cnt quotes + inclusive. */ + +static inline uint64_t +price_model_scratch_footprint( uint64_t cnt ) { /* Assumes price_model_cnt_valid( cnt ) is true */ + /* cnt int64_t's plus worst case alignment padding, no overflow + possible as cnt is valid at this point */ + return cnt*(uint64_t)sizeof(int64_t)+(uint64_t)alignof(int64_t)-(uint64_t)1; +} + +/* price_model_core minimizes (to quote precision in a floor / round + toward negative infinity sense) the loss model of the given quotes. + Assumes valid inputs (e.g. cnt is at least 1 and not unreasonably + large ... typically a multiple of 3 but this is not required, + quote[i] for i in [0,cnt) are the quotes of interest on input, p25, + p50, p75 point to where to write model outputs, scratch points to a + suitable footprint srcatch region). + + Returns a pointer to the quotes sorted in ascending order. As such, + the min and max and any other rank statistic can be extracted easily + on return. 
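+
+   For example (an illustrative sketch with made up quotes):
+
+     int64_t quote[3] = { INT64_C(12), INT64_C(10), INT64_C(11) };
+     int64_t p25, p50, p75;
+     int64_t scratch[3];
+     int64_t * sorted = price_model_core( UINT64_C(3), quote, &p25, &p50, &p75, scratch );
+
+   should give (p25,p50,p75)==(10,11,12) with the returned location
+   holding the sorted quotes 10,11,12.
+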
This location will either be quote itself or to a + location in scratch. Use price_model below for a variant that always + replaces quote with the sorted quotes (potentially has extra ops for + copying). Further, on return, *_p25, *_p50, *_p75 will hold the loss + model minimizing values for the input quotes and the scratch region + was clobbered. + + Scratch points to a memory region of arbitrary alignment with at + least price_model_scratch_footprint( cnt ) bytes and it will be + clobbered on output. It is sufficient to use a normally aligned / + normally allocated / normally declared array of cnt int64_t's. + + The cost of this function is a fast and low variance (but not + completely data oblivious) O(cnt lg cnt) in the best / average / + worst cases. This function uses no heap / dynamic memory allocation. + It is thread safe provided it passed non-conflicting quote, output + and scratch arrays. It has a bounded call depth ~lg cnt <= ~64 (this + could reducd to O(1) by using a non-recursive sort/select + implementation under the hood if desired). */ + +int64_t * /* Returns pointer to sorted quotes (either quote or ALIGN_UP(scratch,int64_t)) */ +price_model_core( uint64_t cnt, /* Assumes price_model_cnt_valid( cnt ) is true */ + int64_t * quote, /* Assumes quote[i] for i in [0,cnt) is the i-th quote on input */ + int64_t * _p25, /* Assumes *_p25 is safe to write to the p25 model output */ + int64_t * _p50, /* Assumes *_p50 " */ + int64_t * _p75, /* Assumes *_p75 " */ + void * scratch ); /* Assumes a suitable scratch region */ + +/* Same as the above but always returns quote and quote always holds the + sorted quotes on return. */ + +static inline int64_t * +price_model( uint64_t cnt, + int64_t * quote, + int64_t * _p25, + int64_t * _p50, + int64_t * _p75, + void * scratch ) { + int64_t * tmp = price_model_core( cnt, quote, _p25, _p50, _p75, scratch ); + if( tmp!=quote ) for( uint64_t idx=(uint64_t)0; idx +#include +#include "../util/prng.h" + +#define GAP_MODEL_NEED_REF +#define EXP2M1_FXP_ORDER 7 /* EXP2M1_FXP_ORDER 4 yields a better than IEEE single precision accurate approximation */ +#include "gap_model.h" + +static uint64_t const ulp_thresh[8] = { + UINT64_C(0), + UINT64_C(1260649965), /* max_rerr ~ 3.9e-04 */ + UINT64_C(95149836), /* max_rerr ~ 3.0e-05 */ + UINT64_C(1348998), /* max_rerr ~ 4.2e-07 */ + UINT64_C(67273), /* max_rerr ~ 2.1e-08 */ + UINT64_C(3331), /* max_rerr ~ 1.0e-09 */ + UINT64_C(40), /* max_rerr ~ 1.2e-11 */ + UINT64_C(21) /* max_rerr ~ 6.5e-12 */ +}; + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + uint64_t max_ulp = UINT64_C(0); + uint64_t thresh = ulp_thresh[ EXP2M1_FXP_ORDER ]; + + for( int i=0; i<100*GAP_MODEL_LAMBDA; i++ ) { + + uint64_t x; + if( i<(22*GAP_MODEL_LAMBDA) ) x = (uint64_t)i; + else { + int s = ((int)prng_uint32( prng )) & 63; + x = prng_uint64( prng ) >> s; + } + + uint64_t y = gap_model( x ); + uint64_t z = (uint64_t)roundl( gap_model_ref( x )*(long double)(1<<30) ); + + if( !x ) { + if( y!=z ) { printf( "FAIL (iter %i: special x %lu y %lu z %lu)\n", i, x, y, z ); return 1; } + continue; + } + if( y>(((uint64_t)GAP_MODEL_LAMBDA)<<30) ) { printf( "FAIL (iter %i: range x %lu y %lu z %lu)\n", i, x, y, z ); return 1; } + + uint64_t ulp = y>z ? 
(y-z) : (z-y); + if( ulp>thresh ) { printf( "FAIL (iter %i: ulp x %lu y %lu z %lu ulp %lu)\n", i, x, y, z, ulp ); return 1; } + if( ulp>max_ulp ) max_ulp = ulp; + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass (max_rerr %.1e max ulp %lu)\n", (double)max_ulp / (((double)(1<<30))*((double)GAP_MODEL_LAMBDA)), max_ulp ); + + return 0; +} + diff --git a/program/src/oracle/model/test_leaky_integrator.c b/program/src/oracle/model/test_leaky_integrator.c new file mode 100644 index 000000000..8b3568974 --- /dev/null +++ b/program/src/oracle/model/test_leaky_integrator.c @@ -0,0 +1,62 @@ +#include +#include +#include "../util/prng.h" + +#define LEAKY_INTEGRATOR_NEED_REF +#include "leaky_integrator.h" + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + long double max_rerr = 0.L; + + static long double const _2_30 = (long double)(UINT64_C(1)<<30); + static long double const _2_n30 = 1.L/(long double)(UINT64_C(1)<<30); + static long double const _2_64 = 18446744073709551616.L; + static long double const _2_128 = 18446744073709551616.L*18446744073709551616.L; + + int ctr = 0; + for( int iter=0; iter<100000000; iter++ ) { + if( !ctr ) { printf( "Completed %u iterations\n", iter ); ctr = 10000000; } + ctr--; + + uint32_t t = prng_uint32( prng ); + + uint64_t yh = prng_uint64( prng ); + uint64_t yl = prng_uint64( prng ); + uwide_sr( &yh,&yl, yh,yl, (int)(t & UINT32_C(127)) ); t >>= 7; + long double y = (_2_64*(long double)yh) + (long double)yl; + + uint64_t w = (uint64_t)(prng_uint32( prng ) >> (t & UINT32_C(31))); t >>= 5; + if( w>(UINT64_C(1)<<30) ) w = (UINT64_C(1)<<30); + + uint64_t x = prng_uint64( prng ) >> (t & UINT32_C(63)); t >>= 6; + + uint64_t zh,zl; uint64_t co = leaky_integrator( &zh,&zl, yh,yl, w, x ); + long double z = ((long double)co)*_2_128 + ((long double)zh)*_2_64 + ((long double)zl); + + long double z_ref = roundl( leaky_integrator_ref( y, ((long double)w)*_2_n30, _2_30*(long double)x ) ); + + long double aerr = fabsl( z-z_ref ); + long double rerr = aerr / fmaxl( 1.L, z_ref ); + + if( rerr > 2.6e-14 ) { + printf( "FAIL (iter %i: y %Le w %lx x %lx z %Le z_ref %Le aerr %.1Le rerr %.1Le\n", iter, + y, w, x, z, z_ref, aerr, rerr ); + return 1; + } + max_rerr = fmaxl( rerr, max_rerr ); + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass (max_rerr %.1Le)\n", max_rerr ); + + return 0; +} + diff --git a/program/src/oracle/model/test_overlap_model.c b/program/src/oracle/model/test_overlap_model.c new file mode 100644 index 000000000..7a24eecb3 --- /dev/null +++ b/program/src/oracle/model/test_overlap_model.c @@ -0,0 +1,63 @@ +#include +#include +#include "../util/prng.h" + +#define OVERLAP_MODEL_NEED_REF +#include "overlap_model.h" + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + long double max_ulp_fine = 0.L; + long double max_ulp_coarse = 0.L; + + long double const thresh_fine = 2.6L; /* max_rerr 2.4e-09 */ + long double const thresh_coarse = 32768.L; /* max_rerr 3.1e-05 */ + + int ctr = 0; + for( int iter=0; iter<100000000; iter++ ) { + if( !ctr ) { printf( "Completed %u iterations\n", iter ); ctr = 10000000; } + ctr--; + + /* Generate random tests that really stress price invariance assumptions */ + + uint32_t t = prng_uint32( prng ); + uint64_t mu_0 = prng_uint64( prng ) >> (t & (uint32_t)63); t >>= 6; + uint64_t mu_1 = 
prng_uint64( prng ) >> (t & (uint32_t)63); t >>= 6; + uint64_t sigma_0 = prng_uint64( prng ) >> (t & (uint32_t)63); t >>= 6; + uint64_t sigma_1 = prng_uint64( prng ) >> (t & (uint32_t)63); t >>= 6; + + /* When sigmas are really dissimilar, limitations of the fixed point + representation limit accuracy to ~2^15 ulp. So we do separate + states for when the two distributions have similar widths. */ + + uint64_t sigma_min = sigma_0= (sigma_max>>1)); /* sigmas of the two are comparable */ + + long double y = (long double)overlap_model( mu_0,sigma_0, mu_1,sigma_1 ); + long double z = ((long double)(1<<30))*overlap_model_ref( (long double)mu_0,(long double)sigma_0, + (long double)mu_1,(long double)sigma_1 ); + long double ulp = fabsl( y - z ); + if( ulp>=(fine ? thresh_fine : thresh_coarse) ) { + printf( "FAIL (iter %i: i(%lu,%lu) j(%lu,%lu) y %Lf z %Lf ulp %Lf)\n", iter, mu_0, sigma_0, mu_1, sigma_1, y, z, ulp ); + return 1; + } + if( fine && ulp>max_ulp_fine ) max_ulp_fine = ulp; + if( ulp>max_ulp_coarse ) max_ulp_coarse = ulp; + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass (fine: max_rerr %.1Le max ulp %.1Lf, coarse: max_rerr %.1Le max ulp %.1Lf)\n", + max_ulp_fine / ((long double)(UINT64_C(1)<<30)), max_ulp_fine, + max_ulp_coarse / ((long double)(UINT64_C(1)<<30)), max_ulp_coarse ); + + return 0; +} + diff --git a/program/src/oracle/model/test_price_model.c b/program/src/oracle/model/test_price_model.c new file mode 100644 index 000000000..321ff1746 --- /dev/null +++ b/program/src/oracle/model/test_price_model.c @@ -0,0 +1,69 @@ +#include +#include +#include +#include "../util/util.h" +#include "price_model.h" + +int +qcmp( void const * _p, + void const * _q ) { + int64_t p = *(int64_t const *)_p; + int64_t q = *(int64_t const *)_q; + if( p < q ) return -1; + if( p > q ) return 1; + return 0; +} + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + +# define N 96 + + int64_t quote0 [N]; + int64_t quote [N]; + int64_t val [3]; + int64_t scratch[N]; + + int ctr = 0; + for( int iter=0; iter<10000000; iter++ ) { + if( !ctr ) { printf( "Completed %u iterations\n", iter ); ctr = 100000; } + ctr--; + + /* Generate a random test */ + + uint64_t cnt = (uint64_t)1 + (uint64_t)(prng_uint32( prng ) % (uint32_t)N); /* In [1,N], approx uniform IID */ + for( uint64_t idx=(uint64_t)0; idx>2; + uint64_t p50_idx = cnt>>1; + uint64_t p75_idx = cnt - (uint64_t)1 - p25_idx; + uint64_t is_even = (uint64_t)!(cnt & (uint64_t)1); + + if( val[0]!=quote[ p25_idx ] ) { printf( "FAIL (p25)\n" ); return 1; } + if( val[1]!=avg_2_int64( quote[ p50_idx-is_even ], quote[ p50_idx ] ) ) { printf( "FAIL (p50)\n" ); return 1; } + if( val[2]!=quote[ p75_idx ] ) { printf( "FAIL (p75)\n" ); return 1; } + } + +# undef N + + prng_delete( prng_leave( prng ) ); + + printf( "pass\n" ); + return 0; +} + diff --git a/program/src/oracle/model/test_weight_model.c b/program/src/oracle/model/test_weight_model.c new file mode 100644 index 000000000..14a09933c --- /dev/null +++ b/program/src/oracle/model/test_weight_model.c @@ -0,0 +1,47 @@ +#include +#include +#include "../util/prng.h" + +#define WEIGHT_MODEL_NEED_REF +#include "weight_model.h" + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + static long double const _2_30 = (long double)(UINT64_C(1)<<30); + static long double const _2_n30 = 
1.L/(long double)(UINT64_C(1)<<30); + + uint64_t max_ulp = UINT64_C(0); + + int ctr = 0; + for( int iter=0; iter<100000000; iter++ ) { + if( !ctr ) { printf( "Completed %u iterations\n", iter ); ctr = 10000000; } + ctr--; + + uint32_t t = prng_uint32( prng ); + uint64_t ratio_gap = prng_uint64( prng ) >> (t & (uint32_t)63); t >>= 6; + uint64_t ratio_overlap = prng_uint64( prng ) >> (t & (uint32_t)63); t >>= 6; + + uint64_t y = weight_model( ratio_gap, ratio_overlap ); + uint64_t z = (uint64_t)roundl( _2_30*weight_model_ref( _2_n30*(long double)ratio_gap, _2_n30*(long double)ratio_overlap ) ); + + uint64_t ulp = y>z ? y-z : z-y; + if( y>(UINT64_C(1)<<30) || ulp>UINT64_C(1) ) { + printf( "FAIL (iter %i: gap %lx overlap %lx y %lx z %lx ulp %lu)\n", iter, ratio_gap, ratio_overlap, y, z, ulp ); + return 1; + } + if( ulp>max_ulp ) max_ulp = ulp; + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass (max_rerr %.1e max ulp %lu)\n", (double)max_ulp / (double)(1UL<<30), max_ulp ); + + return 0; +} + diff --git a/program/src/oracle/model/twap_model.c b/program/src/oracle/model/twap_model.c new file mode 100644 index 000000000..eaa3ae7c8 --- /dev/null +++ b/program/src/oracle/model/twap_model.c @@ -0,0 +1,73 @@ +#define REXP2_FXP_ORDER 7 +#define EXP2M1_FXP_ORDER 7 + +#include "twap_model.h" + +#include "gap_model.h" +#include "overlap_model.h" +#include "weight_model.h" +#include "leaky_integrator.h" +#include "../util/uwide.h" + +twap_model_t * +twap_model_update( twap_model_t * tm, + uint64_t now, + uint64_t agg_price, + uint64_t agg_conf ) { + + /* If this is the first observation, reset */ + + if( !tm->valid ) goto reset; + + /* If sequence numbers didn't advance since last observation, reset + (this logic also correctly handles sequence number wrapping). */ + + int64_t gap = (int64_t)(now-tm->ts); + if( gap<=INT64_C(0) ) goto reset; + + /* Compute weight/2^30 ~ our Bayesian probability that this + (agg_price,agg_conf) estimate parameterizes a price distribution + similiar to the most recent observed price distribution (event + SIM). */ + + uint64_t weight = weight_model( gap_model( (uint64_t)gap ), overlap_model( tm->twap,tm->twac, agg_price,agg_conf ) ); + + /* The number of samples if SIM is D+1 and prob SIM = weight/2^30 + The number of samples if ~SIM is 1 and prob ~SIM = 1-weight/2^30 + Expected number of recent samples available to average over then: + (D+1)*weight + 1*(1-weight) == D*weight + 1 + Thus we can apply a leaky integrator recursion for D and similarly + for the twap_N and twac_N. While these should never have any + overflow or what not given a GAP_MODEL_LAMBDA and + OVERLAP_MODEL_WEIGHT, we reset if we have even the slightest hint + of an issue to fail safe. */ + + uint64_t tmp_h,tmp_l; + + if( leaky_integrator( &tm->twap_Nh,&tm->twap_Nl, tm->twap_Nh,tm->twap_Nl, weight, agg_price ) ) goto reset; + if( leaky_integrator( &tm->twac_Nh,&tm->twac_Nl, tm->twac_Nh,tm->twac_Nl, weight, agg_conf ) ) goto reset; + if( leaky_integrator( &tmp_h, &tm->D, UINT64_C(0),tm->D, weight, UINT64_C(1) ) || tmp_h ) goto reset; + + /* Compute twap ~ twap_N/D via: + twap = floor( (twap_N+(D/2)) / D ) + and similarly for twac. 
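+
+     (The +D/2 term makes the floor division round to nearest with ties
+     rounding up; as an illustration of the identity, a numerator of 7
+     with D==2 gives floor( (7+1)/2 )==4.)
+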
*/ + + tm->valid = INT8_C(1); + tm->ts = now; + if( uwide_add( &tmp_h,&tmp_l, tm->twap_Nh,tm->twap_Nl, UINT64_C(0),UINT64_C(0), tm->D>>1 ) ) goto reset; + if( uwide_div( &tmp_h,&tm->twap, tmp_h,tmp_l, tm->D ) || tmp_h ) goto reset; + if( uwide_add( &tmp_h,&tmp_l, tm->twac_Nh,tm->twac_Nl, UINT64_C(0),UINT64_C(0), tm->D>>1 ) ) goto reset; + if( uwide_div( &tmp_h,&tm->twac, tmp_h,tmp_l, tm->D ) || tmp_h ) goto reset; + return tm; + +reset: + tm->valid = INT8_C(1); + tm->ts = now; + tm->twap = agg_price; + tm->twac = agg_conf; + uwide_sl( &tm->twap_Nh,&tm->twap_Nl, UINT64_C(0),agg_price, 30 ); + uwide_sl( &tm->twac_Nh,&tm->twac_Nl, UINT64_C(0),agg_conf, 30 ); + tm->D = UINT64_C(1)<<30; + return tm; +} + diff --git a/program/src/oracle/model/twap_model.h b/program/src/oracle/model/twap_model.h new file mode 100644 index 000000000..3b7873b0c --- /dev/null +++ b/program/src/oracle/model/twap_model.h @@ -0,0 +1,102 @@ +#ifndef _pyth_oracle_model_twap_model_h_ +#define _pyth_oracle_model_twap_model_h_ + +#include "../util/compat_stdint.h" +#include + +/* twap_model_t should be treated as an opaque handle of a twap_model. + (It technically isn't here to facilitate inlining of operations.) */ + +struct twap_model { + int8_t valid; /* If 0, all values below should be ignored */ + uint64_t ts; /* Block sequence number corresponding to the below values */ + uint64_t twap; /* round( twap_N/D ) */ + uint64_t twac; /* round( twac_N/D ) */ + uint64_t twap_Nh,twap_Nl; /* twap_N is the weighted sum of recent aggregrated price observations */ + uint64_t twac_Nh,twac_Nl; /* twac_N is the weighted sum of recent aggregrated confidence observations */ + uint64_t D; /* D is the weighted number of recent observations */ +}; + +typedef struct twap_model twap_model_t; + +#ifdef __cplusplus +extern "C" { +#endif + +/* twap_model_footprint and twap_model_align give the needed footprint + and alignment for a memory region to hold a twap_model's state. + malloc / alloca / declaration friendly (e.g. a memory region declared + as "twap_model_t _tm[1];", or created by + "malloc(sizeof(twap_model_t))" or created by + "alloca(sizeof(twap_model_t))" will all automatically have the needed + footprint and alignment). + + twap_model_new takes ownership of the memory region pointed to by + shmem (which is assumed to be non-NULL with the appropriate footprint + and alignment) and formats it as a twap_model. Returns mem (which + will be formatted for use). The caller will not be joined to the + region on return. + + twap_model_join joins the caller to a memory region holding the state + of a twap_model. shtm points to a memory region in the local address + space that holds the twap_model. Returns an opaque handle of the + local join in the local address space to twap_model (which might not + be the same thing as shtm ... the separation of new and join is to + facilitate interprocess shared memory usage patterns while supporting + transparent upgrades users of this to more elaborate models where + address translations under the hood may not be trivial). + + twap_model_leave leaves the current twap_model join. Returns a + pointer in the local address space to the memory region holding the + state of the twap_model. The twap_model join should not be used + afterward. + + twap_model_delete unformats the memory region currently used to hold + the state of a _twap_model and returns ownership of the underlying + memory region to the caller. There should be no joins in the system + on the twap_model. 
Returns a pointer to the underlying memory + region. */ + +static inline uint64_t twap_model_footprint( void ) { return (uint64_t) sizeof(twap_model_t); } +static inline uint64_t twap_model_align ( void ) { return (uint64_t)alignof(twap_model_t); } + +static inline void * +twap_model_new( void * shmem ) { + twap_model_t * tm = (twap_model_t *)shmem; + tm->valid = INT8_C(0); + tm->ts = UINT64_C(0); + tm->twap = UINT64_C(0); + tm->twac = UINT64_C(0); + tm->twap_Nh = UINT64_C(0); tm->twap_Nl = UINT64_C(0); + tm->twac_Nh = UINT64_C(0); tm->twac_Nl = UINT64_C(0); + tm->D = UINT64_C(0); + return tm; +} + +static inline twap_model_t * twap_model_join ( void * shtm ) { return (twap_model_t *)shtm; } +static inline void * twap_model_leave ( twap_model_t * tm ) { return tm; } +static inline void * twap_model_delete( void * shtm ) { return shtm; } + +/* Accessors to the current state of a twap model. */ + +static inline int twap_model_valid( twap_model_t const * tm ) { return !!tm->valid; } +static inline uint64_t twap_model_ts ( twap_model_t const * tm ) { return tm->ts; } +static inline uint64_t twap_model_twap ( twap_model_t const * tm ) { return tm->twap; } +static inline uint64_t twap_model_twac ( twap_model_t const * tm ) { return tm->twac; } + +/* Update the state of the twap_model to incorporate observation + at block chain sequence number now the aggregrate price / confidence + pair (agg_price,agg_conf). Returns tm. */ + +twap_model_t * +twap_model_update( twap_model_t * tm, + uint64_t now, + uint64_t agg_price, + uint64_t agg_conf ); + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_model_twap_model_h_ */ + diff --git a/program/src/oracle/model/weight_model.h b/program/src/oracle/model/weight_model.h new file mode 100644 index 000000000..348413757 --- /dev/null +++ b/program/src/oracle/model/weight_model.h @@ -0,0 +1,124 @@ +#ifndef _pyth_oracle_model_weight_model_h_ +#define _pyth_oracle_model_weight_model_h_ + +/* Given our estimate of the distribution of prices gap blocks ago and + an estimate of the current price distribution ("OBS"), compute our + belief that the two estimates are modeling statistically similar + price distributions ("SIM"). Via Bayes theorem then: + + PROB(SIM;OBS) = PROB(OBS;SIM) PROB(SIM) / PROB(OBS) + = PROB(OBS;SIM) PROB(SIM) / ( PROB(OBS;SIM) PROB(SIM) + PROB(OBS;~SIM) PROB(~SIM) ) + = R / ( 1 + R ) + + where: + + R = ( PROB(OBS;SIM)/PROB(OBS;~SIM) ) ( PROB(SIM)/PROB(~SIM) ) + + The first term in R is exactly what overlap_model estimates. The + second term is exactly what gap_model estimates. Specifically, + weight_model computes: + + weight/2^30 ~ PROB(SIM;OBS) + + given the two ratios + + ratio_overlap/2^30 ~ PROB(OBS;SIM) / PROB(OBS;~SIM) + ratio_gap /2^30 ~ PROB(SIM) / PROB(~SIM) + + This calcuation ~30-bits accurate (the result is +/-1 ulp of the + correctly rounded result). + + If WEIGHT_MODEL_NEED_REF is defined, this will also declare a high + accuracy floating point reference implementation "weight_model_ref" + to aid in testing / tuning block chain suitable approximations. 
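+
+   As a quick worked example (hypothetical inputs): ratio_gap==1<<30
+   (prior odds of 1) and ratio_overlap==3<<30 (likelihood ratio of 3)
+   give R==3 and thus weight/2^30==3/4:
+
+     uint64_t w = weight_model( UINT64_C(1)<<30, UINT64_C(3)<<30 ); // w ~ UINT64_C(3)<<28
+
+   matching 2^30 weight_model_ref( 1.L, 3.L ) to within the stated +/-1
+   ulp.
+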
*/ + +#include "../util/uwide.h" + +#ifdef WEIGHT_MODEL_NEED_REF +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef WEIGHT_MODEL_NEED_REF + +static inline long double +weight_model_ref( long double ratio_gap, + long double ratio_overlap ) { + long double r = ratio_gap*ratio_overlap; + return r / (1.L + r); +} + +#endif + +static inline uint64_t /* In [0,2^30] */ +weight_model( uint64_t ratio_gap, + uint64_t ratio_overlap ) { + uint64_t tmp_h,tmp_l; + + /* Starting from the exact expression: + + weight/2^30 = ((ratio_gap/2^30) (ratio_overlap/2^30)) / ( 1 + ((ratio_gap/2^30) (ratio_overlap/2^30))) + + we have: + + weight = 2^30 ratio_gap ratio_overlap / ( 2^60 + ratio_gap ratio_overlap ) + + When ratio_gap ratio_overlap <= 2^59/2^30 - 0.5 (>= 2^91-2^60) + weight rounds to 0 (2^30). This implies we don't actually need to + do a division with a 128-bit denominator to get a practically full + precision result. Instead we compute: + + ratio_gap ratio_overlap / 2^s + + where s the smallest integer that keep the denominator in 63-bits + for all non-trivial values of ratio_gap ratio_overlap (63 is used + instead of 64 to with the rounding calc). Given the above, s=28 is + sufficient, yielding: + + weight = 2^2 ratio_gap ratio_overlap / ( 2^32 + (ratio_gap ratio_overlap/2^28) ) + + The below approximates this via: + + num = ratio_gap ratio_overlap + den = 2^32 + floor( (num + 2^27)/2^28 ) + weight = floor( (2^3 num + den) / (2 den) ) + + Should have a maximum error of O(1) ulp. */ + + /* Compute num. 128-bit wide needed because these ratios can both + potentially be >=2^32. */ + + uint64_t num_h,num_l; uwide_mul( &num_h,&num_l, ratio_gap, ratio_overlap ); + + /* Compute den. Note that as ratio_gap ratio_overlap is at most + 2^128-2^65+1, the rounding add by 2^27 will not overflow. If the + result here >= (2^91-2^60)/2^28 = 2^63-2^32, we know the result + rounds to 2^30 and return immediately. */ + + static uint64_t const thresh = (UINT64_C(1)<<63) - (UINT64_C(1)<<32); + + uwide_inc( &tmp_h,&tmp_l, num_h,num_l, UINT64_C(1)<<27 ); + uwide_sr ( &tmp_h,&tmp_l, tmp_h,tmp_l, 28 ); + if( tmp_h || tmp_l>=thresh ) return UINT64_C(1)<<30; + uint64_t den = (UINT64_C(1)<<32) + tmp_l; + + /* Compute weight. Note that at this point < 2^91-2^60 + and den < 2^63 such that none of the below can overflow. */ + + uwide_sl ( &tmp_h,&tmp_l, num_h,num_l, 3 ); + uwide_inc( &tmp_h,&tmp_l, tmp_h,tmp_l, den ); + uwide_div( &tmp_h,&tmp_l, tmp_h,tmp_l, den<<1 ); + uint64_t weight = tmp_l; /* tmp_h==0 at this point */ + + return weight; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_model_weight_model_h_ */ + diff --git a/program/src/oracle/pd.h b/program/src/oracle/pd.h deleted file mode 100644 index 413e4ccdb..000000000 --- a/program/src/oracle/pd.h +++ /dev/null @@ -1,191 +0,0 @@ -#pragma once - -#ifdef __cplusplus -extern "C" { -#endif - -#define PD_SCALE9 (1000000000L) -#define PD_EMA_MAX_DIFF 4145 // maximum slots before reset -#define PD_EMA_EXPO (-9) // exponent of temporary storage -#define PD_EMA_DECAY (-117065) // 1e9*-log(2)/5921 -#define PC_FACTOR_SIZE 18 - -#define EXP_BITS 5 -#define EXP_MASK ( ( 1L << EXP_BITS ) - 1 ) - -#define pd_new( n,v,e ) {(n)->v_=v;(n)->e_=e;} -#define pd_set( r,n ) (r)[0] = (n)[0] -#define pd_new_scale(n,v,e) {(n)->v_=v;(n)->e_=e;pd_scale(n);} - -// decimal number format -typedef struct pd -{ - int32_t e_; - int64_t v_; -} pd_t; - -static inline void pd_scale( pd_t *n ) -{ - int const neg = n->v_ < 0L; - int64_t v = neg ? 
-n->v_ : n->v_; // make v positive for loop condition - for( ; v >= ( 1L << 28 ); v /= 10L, ++n->e_ ); - n->v_ = neg ? -v : v; -} - -static inline bool pd_store( int64_t *r, pd_t const *n ) -{ - int64_t v = n->v_; - int32_t e = n->e_; - while ( v < -( 1L << 58 ) ) { - v /= 10; - ++e; - } - while ( v > ( 1L << 58 ) - 1 ) { - v /= 10; - ++e; - } - while ( e < -( 1 << ( EXP_BITS - 1 ) ) ) { - v /= 10; - ++e; - } - while ( e > ( 1 << ( EXP_BITS - 1 ) ) - 1 ) { - v *= 10; - if ( v < -( 1L << 58 ) || v > ( 1L << 58 ) - 1 ) { - return false; - } - --e; - } - *r = ( v << EXP_BITS ) | ( e & EXP_MASK ); - return true; -} - -void pd_load( pd_t *r, int64_t const n ) -{ - pd_new( r, n >> EXP_BITS, ( ( n & EXP_MASK ) << 59 ) >> 59 ); - pd_scale( r ); -} - -static inline void pd_adjust( pd_t *n, int e, const int64_t *p ) -{ - int64_t v = n->v_; - int d = n->e_ - e; - if ( d > 0 ) { - v *= p[ d ]; - } - else if ( d < 0 ) { - v /= p[ -d ]; - } - pd_new( n, v, e ); -} - -static inline void pd_mul( pd_t *r, const pd_t *n1, const pd_t *n2 ) -{ - r->v_ = n1->v_ * n2->v_; - r->e_ = n1->e_ + n2->e_; - pd_scale( r ); -} - -static inline void pd_div( pd_t *r, pd_t *n1, pd_t *n2 ) -{ - if ( n1->v_ == 0 ) { pd_set( r, n1 ); return; } - int64_t v1 = n1->v_, v2 = n2->v_; - int neg1 = v1 < 0L, neg2 = v2 < 0L, m = 0; - if ( neg1 ) v1 = -v1; - if ( neg2 ) v2 = -v2; - for( ; 0UL == ( ( uint64_t )v1 & 0xfffffffff0000000UL ); v1 *= 10L, ++m ); - r->v_ = ( v1 * PD_SCALE9 ) / v2; - if ( neg1 ) r->v_ = -r->v_; - if ( neg2 ) r->v_ = -r->v_; - r->e_ = n1->e_ - n2->e_ - m - 9; - pd_scale( r ); -} - -static inline void pd_add( pd_t *r, const pd_t *n1, const pd_t *n2, const int64_t *p ) -{ - int d = n1->e_ - n2->e_; - if ( d==0 ) { - pd_new( r, n1->v_ + n2->v_, n1->e_ ); - } else if ( d>0 ) { - if ( d<9 ) { - pd_new( r, n1->v_*p[d] + n2->v_, n2->e_ ); - } else if ( d < PC_FACTOR_SIZE+9 ) { - pd_new( r, n1->v_*PD_SCALE9 + n2->v_/p[d-9], n1->e_-9); - } else { - pd_set( r, n1 ); - } - } else { - d = -d; - if ( d<9 ) { - pd_new( r, n1->v_ + n2->v_*p[d], n1->e_ ); - } else if ( d < PC_FACTOR_SIZE+9 ) { - pd_new( r, n1->v_/p[d-9] + n2->v_*PD_SCALE9, n2->e_-9 ); - } else { - pd_set( r, n2 ); - } - } - pd_scale( r ); -} - -static inline void pd_sub( pd_t *r, const pd_t *n1, const pd_t *n2, const int64_t *p ) -{ - int d = n1->e_ - n2->e_; - if ( d==0 ) { - pd_new( r, n1->v_ - n2->v_, n1->e_ ); - } else if ( d>0 ) { - if ( d<9 ) { - pd_new( r, n1->v_*p[d] - n2->v_, n2->e_ ); - } else if ( d < PC_FACTOR_SIZE+9 ) { - pd_new( r, n1->v_*PD_SCALE9 - n2->v_/p[d-9], n1->e_-9); - } else { - pd_set( r, n1 ); - } - } else { - d = -d; - if ( d<9 ) { - pd_new( r, n1->v_ - n2->v_*p[d], n1->e_ ); - } else if ( d < PC_FACTOR_SIZE+9 ) { - pd_new( r, n1->v_/p[d-9] - n2->v_*PD_SCALE9, n2->e_-9 ); - } else { - pd_new( r, -n2->v_, n2->e_ ); - } - } - pd_scale( r ); -} - -static inline int pd_lt( const pd_t *n1, const pd_t *n2, const int64_t *p ) -{ - pd_t r[1]; - pd_sub( r, n1, n2, p ); - return r->v_ < 0L; -} - -static inline int pd_gt( const pd_t *n1, const pd_t *n2, const int64_t *p ) -{ - pd_t r[1]; - pd_sub( r, n1, n2, p ); - return r->v_ > 0L; -} - -static inline void pd_sqrt( pd_t *r, pd_t *val, const int64_t *f ) -{ - pd_t t[1], x[1], hlf[1]; - pd_set( t, val ); - pd_new( r, 1, 0 ); - pd_add( x, t, r, f ); - pd_new( hlf, 5, -1 ); - pd_mul( x, x, hlf ); - for(;;) { - pd_div( r, t, x ); - pd_add( r, r, x, f ); - pd_mul( r, r, hlf ); - if ( x->v_ == r->v_ ) { - break; - } - pd_set( x, r ); - } -} - -#ifdef __cplusplus -} -#endif - diff --git 
a/program/src/oracle/sort.c b/program/src/oracle/sort.c deleted file mode 100644 index 4cb53ebc3..000000000 --- a/program/src/oracle/sort.c +++ /dev/null @@ -1,82 +0,0 @@ -#include "sort.h" - -#ifndef __bpf__ -#include -#endif - -static inline void swap_int64( int64_t *a, int64_t *b ) -{ - int64_t const temp = *a; - *a = *b; - *b = temp; -} - -static inline int partition_int64( int64_t* v, int i, int j ) -{ -#ifndef __bpf__ -#ifndef NDEBUG - int const i0 = i; - int const j0 = j; -#endif -#endif - - int const p = i; - int64_t const pv = v[ p ]; - - while ( i < j ) { - while ( v[ i ] <= pv && i <= j ) { - ++i; - } - while ( v[ j ] > pv ) { - --j; - } - if ( i < j ) { - swap_int64( &v[ i ], &v[ j ] ); - } - } - swap_int64( &v[ p ], &v[ j ] ); - -#ifndef __bpf__ -#ifndef NDEBUG - int k; - for ( k = i0; k < j; ++k ) { - assert( v[ k ] <= pv ); - } - assert( v[ j ] == pv ); - for ( k = j + 1; k <= j0; ++k ) { - assert( v[ k ] > pv ); - } -#endif -#endif - - return j; -} - -static void qsort_help_int64( int64_t* const v, int i, int j ) -{ - if ( i >= j ) { - return; - } - int p = partition_int64( v, i, j ); - qsort_help_int64( v, i, p - 1 ); - qsort_help_int64( v, p + 1, j ); -} - -void qsort_int64( int64_t* const v, unsigned const size ) -{ - qsort_help_int64( v, 0, ( int )size - 1 ); - -#ifndef __bpf__ -#ifndef NDEBUG - if ( size ) { - int64_t x = v[ 0 ]; - unsigned i = 1; - for ( ; i < size; ++i ) { - assert( x <= v[ i ] ); - x = v[ i ]; - } - } -#endif -#endif -} - diff --git a/program/src/oracle/sort.h b/program/src/oracle/sort.h deleted file mode 100644 index ebf67198c..000000000 --- a/program/src/oracle/sort.h +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once - -#ifdef __bpf__ -#include -#else -#include -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -void qsort_int64( int64_t* v, unsigned size ); - -#ifdef __cplusplus -} -#endif - diff --git a/program/src/oracle/sort/clean b/program/src/oracle/sort/clean new file mode 100755 index 000000000..860ef1641 --- /dev/null +++ b/program/src/oracle/sort/clean @@ -0,0 +1,3 @@ +#!/bin/sh +rm -rfv bin + diff --git a/program/src/oracle/sort/run_tests b/program/src/oracle/sort/run_tests new file mode 100755 index 000000000..0eb277605 --- /dev/null +++ b/program/src/oracle/sort/run_tests @@ -0,0 +1,18 @@ +#!/bin/sh + +module purge || exit 1 +module load gcc-9.3.0 || exit 1 + +./clean || exit 1 +mkdir -pv bin || exit 1 + +CC="gcc -g -Wall -Werror -Wextra -Wconversion -Wstrict-aliasing=2 -Wimplicit-fallthrough=2 -pedantic -D_XOPEN_SOURCE=600 -O2 -march=native -std=c17" + +set -x + +$CC test_sort_stable.c -o bin/test_sort_stable || exit 1 + +bin/test_sort_stable || exit 1 + +echo all tests passed + diff --git a/program/src/oracle/sort/sort_stable_base_gen.c b/program/src/oracle/sort/sort_stable_base_gen.c new file mode 100644 index 000000000..70ac61597 --- /dev/null +++ b/program/src/oracle/sort/sort_stable_base_gen.c @@ -0,0 +1,92 @@ +#include +#include + +void +sort_gen( int n ) { + +# if 0 + + /* In register variant (PSEUDO OPS ~ 9+4n+3n(n-1)) + Assumes switch ~ 3 PSEUDO OPS (LDA,LD,JMP) -> 3 switch statements / 9 pseudo ops + Assumes load ~ 2 PSEUDO OPS (LDA,LD) -> n loads / 2n pseudo ops + Assumes store ~ " + Assumes cswap ~ 6 PSEUDO OPS (CMP,MOV,TESTEQ,CMOV,TESTNEQ,CMOV) / 6*0.5*n*(n-1) pseudo ops */ + +//printf( "static inline key_t * /* returns (sorted) x */\n" ); +//printf( "sort_network_stable( key_t * x, /* indexed [0,n) */\n" ); +//printf( " ulong n ) { /* assumes n in [0,%i) */\n", n ); + printf( " int c;\n" ); + printf( " key_t t" ); + for( 
int i=0; i 3 pseudo ops + Assumes cswap ~ 9 PSEUDO OPS (LDA,LDA,LD,LD,CMP,CMOV,CMOV,ST,ST) / 9*0.5*n*(n-1) pseudo ops */ + +//printf( "static inline key_t * /* returns (sorted) x */\n" ); +//printf( "sort_network_stable( key_t * x, /* indexed [0,n) */\n" ); +//printf( " ulong n ) { /* assumes n in [0,%i) */\n", n ); + + printf( " do { /* BEGIN AUTOGENERATED CODE (n=%2i) *****************************/\n", n ); + printf( "# define SORT_STABLE_CE(i,j) u = x[(SORT_IDX_T)i]; v = x[(SORT_IDX_T)j]; c = SORT_BEFORE( v, u ); x[(SORT_IDX_T)i] = c ? v : u; x[(SORT_IDX_T)j] = c ? u : v\n" ); + printf( " int c;\n" ); + printf( " SORT_KEY_T u;\n" ); + printf( " SORT_KEY_T v;\n" ); + printf( " switch( n ) {\n" ); + for( int i=n-1; i>=0; i-- ) { + printf( " case (SORT_IDX_T)%2i:", i+1 ); + for( int j=0; j +#include "../util/util.h" + +#define BEFORE(i,j) (((i)>>16)<((j)>>16)) + +#define SORT_NAME sort +#define SORT_KEY_T int +#define SORT_IDX_T int +#define SORT_BEFORE(i,j) BEFORE(i,j) +#include "tmpl/sort_stable.c" + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + +# define N 96 + int x[N]; + int y[N]; + int w[N]; + + /* Brute force validate small sizes via the 0-1 principle (with + additional information in the keys to validate stability as well). */ + + for( int n=0; n<=24; n++ ) { + printf( "Zero-One: Testing n=%i\n", n ); + for( long b=0L; b<(1L<>i) & 1L))<<16) | i; + for( int i=0; i=n || z[i]!=w[j] ) { printf( "FAIL (corrupt)\n" ); return 1; } + w[j] = -1; /* Mark that this entry has already been confirmed */ + } + for( int i=0; i=n || z[i]!=w[j] ) { printf( "FAIL (corrupt)\n" ); return 1; } + w[j] = -1; /* Mark that this entry has already been confirmed */ + } + for( int i=0; i=0 always true" if idx is an unsigned type or + "n<=UINT64_MAX always true" if key is a byte type). */ + static uint64_t const max = ((UINT64_MAX - (uint64_t)alignof(SORT_KEY_T) + (uint64_t)1) / (uint64_t)sizeof(SORT_KEY_T)); + return !cnt || (((SORT_IDX_T)0)> 1; + SORT_KEY_T * yl = SORT_IMPL(stable_node)( xl,nl, tl ); + + SORT_KEY_T * xr = x + nl; + SORT_KEY_T * tr = t + nl; + SORT_IDX_T nr = n - nl; + SORT_KEY_T * yr = SORT_IMPL(stable_node)( xr,nr, tr ); + + /* If left subsort result ended up in orig array, merge into temp + array. Otherwise, merge into orig array. */ + + if( yl==xl ) x = t; + + /* At this point, note that yl does not overlap with the location for + merge output at this point. yr might overlap (with the right half) + with the location for merge output but this will still work in that + case. */ + + SORT_IDX_T i = (SORT_IDX_T)0; + SORT_IDX_T j = (SORT_IDX_T)0; + SORT_IDX_T k = (SORT_IDX_T)0; + + /* Note that nl and nr are both at least one at this point so at least + one interation of the loop body is necessary. 
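
/* Aside (not part of the diff): a minimal standalone sketch of the
   straggler-appending merge step described above, written for plain
   int64_t keys with "<" in place of the SORT_KEY_T / SORT_BEFORE
   template parameters.  merge_int64 and the requirement that x not
   overlap either input run are assumptions of this sketch; the
   template version above additionally tolerates x aliasing the right
   run.  Ties take the left run first, which is what keeps the sort
   stable. */

#include <stdint.h>

static void
merge_int64( int64_t       * x,                  /* output, nl+nr elements */
             int64_t const * yl, uint64_t nl,    /* sorted left run,  nl>=1 */
             int64_t const * yr, uint64_t nr ) { /* sorted right run, nr>=1 */
  uint64_t i = 0, j = 0, k = 0;
  for(;;) {
    if( yr[k] < yl[j] ) {                        /* strictly before -> take right */
      x[i++] = yr[k++];
      if( k>=nr ) { do x[i++] = yl[j++]; while( j<nl ); break; } /* append left stragglers */
    } else {                                     /* tie or left before -> take left (stable) */
      x[i++] = yl[j++];
      if( j>=nl ) { do x[i++] = yr[k++]; while( k<nr ); break; } /* append right stragglers */
    }
  }
}
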
*/ + + for(;;) { /* Minimal C language operations */ + if( SORT_BEFORE( yr[k], yl[j] ) ) { + x[i++] = yr[k++]; + if( k>=nr ) { /* append left stragglers (at least one) */ do x[i++] = yl[j++]; while( j=nl ) { /* append right stragglers (at least one) */ do x[i++] = yr[k++]; while( k Test(oracle, init_mapping) { diff --git a/program/src/oracle/upd_aggregate.h b/program/src/oracle/upd_aggregate.h index 221af607a..622f1a33a 100644 --- a/program/src/oracle/upd_aggregate.h +++ b/program/src/oracle/upd_aggregate.h @@ -1,406 +1,152 @@ -#pragma once +#ifndef _pyth_oracle_upd_aggregrate_h_ +#define _pyth_oracle_upd_aggregrate_h_ + +#include /* For INT64_MAX */ #include "oracle.h" -#include "pd.h" -#include "sort.h" +#include "model/price_model.h" +#include "model/twap_model.h" -#include +/* FIXME: TEMP HACK TO DEAL WITH LINKING */ +#include "model/price_model.c" +#include "model/twap_model.c" #ifdef __cplusplus extern "C" { #endif -typedef struct pc_qset -{ - pd_t iprice_[PC_COMP_SIZE]; - pd_t uprice_[PC_COMP_SIZE]; - pd_t lprice_[PC_COMP_SIZE]; - pd_t weight_[PC_COMP_SIZE]; - int64_t decay_[1+PC_MAX_SEND_LATENCY]; - int64_t fact_[PC_FACTOR_SIZE]; - int32_t expo_; -} pc_qset_t; - -// initialize quote-set temporary data in heap area -static pc_qset_t *qset_new( int expo ) -{ - // allocate off heap - pc_qset_t *qs = (pc_qset_t*)PC_HEAP_START; - - // sqrt of numbers 1 to 25 for decaying conf. interval based on slot delay - qs->decay_[0] = 1000000000L; - qs->decay_[1] = 1000000000L; - qs->decay_[2] = 1414213562L; - qs->decay_[3] = 1732050807L; - qs->decay_[4] = 2000000000L; - qs->decay_[5] = 2236067977L; - qs->decay_[6] = 2449489742L; - qs->decay_[7] = 2645751311L; - qs->decay_[8] = 2828427124L; - qs->decay_[9] = 3000000000L; - qs->decay_[10] = 3162277660L; - qs->decay_[11] = 3316624790L; - qs->decay_[12] = 3464101615L; - qs->decay_[13] = 3605551275L; - qs->decay_[14] = 3741657386L; - qs->decay_[15] = 3872983346L; - qs->decay_[16] = 4000000000L; - qs->decay_[17] = 4123105625L; - qs->decay_[18] = 4242640687L; - qs->decay_[19] = 4358898943L; - qs->decay_[20] = 4472135954L; - qs->decay_[21] = 4582575694L; - qs->decay_[22] = 4690415759L; - qs->decay_[23] = 4795831523L; - qs->decay_[24] = 4898979485L; - qs->decay_[25] = 5000000000L; - - // powers of 10 for use in decimal arithmetic scaling - qs->fact_[0] = 1L; - qs->fact_[1] = 10L; - qs->fact_[2] = 100L; - qs->fact_[3] = 1000L; - qs->fact_[4] = 10000L; - qs->fact_[5] = 100000L; - qs->fact_[6] = 1000000L; - qs->fact_[7] = 10000000L; - qs->fact_[8] = 100000000L; - qs->fact_[9] = 1000000000L; - qs->fact_[10] = 10000000000L; - qs->fact_[11] = 100000000000L; - qs->fact_[12] = 1000000000000L; - qs->fact_[13] = 10000000000000L; - qs->fact_[14] = 100000000000000L; - qs->fact_[15] = 1000000000000000L; - qs->fact_[16] = 10000000000000000L; - qs->fact_[17] = 100000000000000000L; - - qs->expo_ = expo; - - return qs; -} - -static void upd_ema( - pc_ema_t *ptr, pd_t *val, pd_t *conf, int64_t nslot, pc_qset_t *qs - , pc_price_t *prc_ptr) -{ - pd_t numer[1], denom[1], cwgt[1], wval[1], decay[1], diff[1], one[1]; - pd_new( one, 100000000L, -8 ); - if ( conf->v_ ) { - pd_div( cwgt, one, conf ); - } else { - pd_set( cwgt, one ); - } - if ( nslot > PD_EMA_MAX_DIFF ) { - // initial condition - pd_mul( numer, val, cwgt ); - pd_set( denom, cwgt ); - } else { - // compute decay factor - pd_new( diff, nslot, 0 ); - pd_new( decay, PD_EMA_DECAY, PD_EMA_EXPO ); - pd_mul( decay, decay, diff ); - pd_add( decay, decay, one, qs->fact_ ); - - // compute numer/denom and new value from decay 
factor - if ( prc_ptr->drv1_ ) { - pd_load( numer, ptr->numer_ ); - pd_load( denom, ptr->denom_ ); - } - else { - // temporary upgrade code - pd_new_scale( numer, ptr->numer_, PD_EMA_EXPO ); - pd_new_scale( denom, ptr->denom_, PD_EMA_EXPO ); - } - if ( numer->v_ < 0 || denom->v_ < 0 ) { - // temporary reset twap on negative value - pd_set( numer, val ); - pd_set( denom, one ); - } - else { - pd_mul( numer, numer, decay ); - pd_mul( wval, val, cwgt ); - pd_add( numer, numer, wval, qs->fact_ ); - pd_mul( denom, denom, decay ); - pd_add( denom, denom, cwgt, qs->fact_ ); - pd_div( val, numer, denom ); - } - } - - // adjust and store results - pd_adjust( val, qs->expo_, qs->fact_ ); - ptr->val_ = val->v_; - int64_t numer1, denom1; - if ( pd_store( &numer1, numer ) && pd_store( &denom1, denom ) ) { - ptr->numer_ = numer1; - ptr->denom_ = denom1; - } - prc_ptr->drv1_ = 1; -} - -static inline void upd_twap( - pc_price_t *ptr, int64_t nslots, pc_qset_t *qs ) -{ - pd_t px[1], conf[1]; - pd_new_scale( px, ptr->agg_.price_, ptr->expo_ ); - pd_new_scale( conf, ( int64_t )( ptr->agg_.conf_ ), ptr->expo_ ); - upd_ema( &ptr->twap_, px, conf, nslots, qs, ptr ); - upd_ema( &ptr->twac_, conf, conf, nslots, qs, ptr ); -} - -// compute weighted percentile -static void wgt_ptile( - pd_t * const res - , const pd_t * const prices, const pd_t * const weights, const uint32_t num - , const pd_t * const ptile, pc_qset_t *qs ) -{ - pd_t cumwgt[ PC_COMP_SIZE ]; - - pd_t cumwgta[ 1 ], half[ 1 ]; - pd_new( cumwgta, 0, 0 ); - pd_new( half, 5, -1 ); - for ( uint32_t i = 0; i < num; ++i ) { - const pd_t * const wptr = &weights[ i ]; - pd_t weight[ 1 ]; - pd_mul( weight, wptr, half ); - pd_add( &cumwgt[ i ], cumwgta, weight, qs->fact_ ); - pd_add( cumwgta, cumwgta, wptr, qs->fact_ ); - } +// update aggregate price - uint32_t i = 0; - for( ; i != num && pd_lt( &cumwgt[i], ptile, qs->fact_ ); ++i ); - if ( i == num ) { - pd_set( res, &prices[num-1] ); - } else if ( i == 0 ) { - pd_set( res, &prices[0] ); - } else { - pd_t t1[1], t2[1]; - pd_sub( t1, &prices[i], &prices[i-1], qs->fact_ ); - pd_sub( t2, ptile, &cumwgt[i-1], qs->fact_ ); - pd_mul( t1, t1, t2 ); - pd_sub( t2, &cumwgt[i], &cumwgt[i-1], qs->fact_ ); - pd_div( t1, t1, t2 ); - pd_add( res, &prices[i-1], t1, qs->fact_ ); - } -} +static inline void +upd_aggregate( pc_price_t *ptr, + uint64_t now ) { -// update aggregate price -static inline void upd_aggregate( pc_price_t *ptr, uint64_t slot ) -{ - // only re-compute aggregate in next slot - if ( slot <= ptr->agg_.pub_slot_ ) { - return; - } - pc_qset_t *qs = qset_new( ptr->expo_ ); + // only re-compute aggregate in newer slots + // this logic correctly handles sequence number wrapping - // get number of slots from last published valid price - int64_t agg_diff = ( int64_t )slot - ( int64_t )( ptr->last_slot_ ); + int64_t agg_diff = (int64_t)(now - ptr->last_slot_); + if( agg_diffprev_slot_ = ptr->valid_slot_; ptr->prev_price_ = ptr->agg_.price_; ptr->prev_conf_ = ptr->agg_.conf_; // update aggregate details ready for next slot - ptr->valid_slot_ = ptr->agg_.pub_slot_;// valid slot-time of agg. price - ptr->agg_.pub_slot_ = slot; // publish slot-time of agg. 
price - uint32_t numv = 0; - uint32_t vidx[ PC_COMP_SIZE ]; - // identify valid quotes and order them by price - for ( uint32_t i = 0; i != ptr->num_; ++i ) { - pc_price_comp_t *iptr = &ptr->comp_[i]; - // copy contributing price to aggregate snapshot - iptr->agg_ = iptr->latest_; - // add quote to sorted permutation array if it is valid - int64_t slot_diff = ( int64_t )slot - ( int64_t )( iptr->agg_.pub_slot_ ); - if ( iptr->agg_.status_ == PC_STATUS_TRADING && - iptr->agg_.conf_ != 0UL && - iptr->agg_.price_ > 0L && - slot_diff >= 0 && slot_diff <= PC_MAX_SEND_LATENCY ) { - vidx[numv++] = i; - } - } - - uint32_t nprcs = 0; - int64_t prcs[ PC_COMP_SIZE * 2 ]; - for ( uint32_t i = 0; i < numv; ++i ) { - pc_price_comp_t const *iptr = &ptr->comp_[ vidx[ i ] ]; - int64_t const price = iptr->agg_.price_; - int64_t const conf = ( int64_t )( iptr->agg_.conf_ ); - prcs[ nprcs++ ] = price - conf; - prcs[ nprcs++ ] = price + conf; - } - qsort_int64( prcs, nprcs ); - - uint32_t numa = 0; - uint32_t aidx[PC_COMP_SIZE]; - - if ( nprcs ) { - int64_t const mprc = ( prcs[ numv - 1 ] + prcs[ numv ] ) / 2; - int64_t const lb = mprc / 5; - int64_t const ub = ( mprc > LONG_MAX / 5 ) ? LONG_MAX : mprc * 5; - - for ( uint32_t i = 0; i < numv; ++i ) { - uint32_t const idx = vidx[ i ]; - pc_price_comp_t const *iptr = &ptr->comp_[ idx ]; - int64_t const prc = iptr->agg_.price_; - if ( prc >= lb && prc <= ub ) { - uint32_t j = numa++; - for( ; j > 0 && ptr->comp_[ aidx[ j - 1 ] ].agg_.price_ > prc; --j ) { - aidx[ j ] = aidx[ j - 1 ]; - } - aidx[ j ] = idx; + ptr->valid_slot_ = ptr->agg_.pub_slot_; // valid slot-time of agg. price + ptr->agg_.pub_slot_ = now; // publish slot-time of agg. price + + // identify valid quotes (those that have a trading status, have a + // recent quote (i.e. from a block seq number + // [now-PC_MAX_SEND_LATENCY,now]) and do not have obviously bad values + // for price and conf for that quote (i.e. 
0num_; i++ ) { + pc_price_comp_t *iptr = &ptr->comp_[i]; + // copy contributing price to aggregate snapshot + iptr->agg_ = iptr->latest_; + // add quote values to prcs if valid + int64_t slot_diff = (int64_t)(now - iptr->agg_.pub_slot_); + int64_t price = iptr->agg_.price_; + int64_t conf = ( int64_t )( iptr->agg_.conf_ ); + if( iptr->agg_.status_ == PC_STATUS_TRADING && + INT64_C(0)<=slot_diff && slot_diff<=PC_MAX_SEND_LATENCY && + INT64_C(0)0 + numv++; + prcs[ nprcs++ ] = price - conf; // No overflow as 0 < conf < price + prcs[ nprcs++ ] = price; + prcs[ nprcs++ ] = price + conf; // No overflow as 0 < conf <= INT64_MAX-price } } - } - - // zero quoters - ptr->num_qt_ = numa; - if ( numa == 0 || numa < ptr->min_pub_ || numa * 2 <= numv ) { - ptr->agg_.status_ = PC_STATUS_UNKNOWN; - return; - } - - // update status and publish slot of last trading status price - ptr->agg_.status_ = PC_STATUS_TRADING; - ptr->last_slot_ = slot; - - // single quoter - if ( numa == 1 ) { - pc_price_comp_t *iptr = &ptr->comp_[aidx[0]]; - ptr->agg_.price_ = iptr->agg_.price_; - ptr->agg_.conf_ = iptr->agg_.conf_; - upd_twap( ptr, agg_diff, qs ); - return; - } - - // assign quotes and compute weights - pc_price_comp_t *pptr = 0; - pd_t price[1], conf[1], weight[1], one[1], wsum[1]; - pd_new( one, 100000000L, -8 ); - pd_new( wsum, 0, 0 ); - int64_t ldiff = INT64_MAX; - pd_t *wptr = qs->weight_; - for( uint32_t i=0;i != numa; ++i ) { - pc_price_comp_t *iptr = &ptr->comp_[aidx[i]]; - // scale confidence interval by sqrt of slot age - int64_t slot_diff = ( int64_t )slot - ( int64_t )( iptr->agg_.pub_slot_ ); - pd_t decay[1]; - pd_new( decay, qs->decay_[slot_diff], PC_EXP_DECAY ); - pd_new_scale( conf, ( int64_t )( iptr->agg_.conf_ ), ptr->expo_ ); - pd_mul( conf, conf, decay ); - // assign price and upper/lower bounds of price - pd_new_scale( price, iptr->agg_.price_, ptr->expo_ ); - pd_set( &qs->iprice_[i], price ); - pd_add( &qs->uprice_[i], price, conf, qs->fact_ ); - pd_sub( &qs->lprice_[i], price, conf, qs->fact_ ); - - // compute weight (1/(conf+nearest_neighbor)) - pd_set( &qs->weight_[i], conf ); - if ( i ) { - int64_t idiff = iptr->agg_.price_ - pptr->agg_.price_; - pd_new_scale( weight, idiff < ldiff ? idiff : ldiff, ptr->expo_ ); - pd_add( wptr, wptr, weight, qs->fact_ ); - pd_div( wptr, one, wptr ); - pd_add( wsum, wsum, wptr, qs->fact_ ); - ldiff = idiff; - ++wptr; + // too few valid quotes + if( !numv || numvmin_pub_ ) { + // FIXME: SHOULD NUM_QT_ AND/OR LAST_SLOT_ BE UPDATED? IN EARLIER + // CODE, NUM_QT_ WAS BUT LAST SLOT WASN'T IN CASES LIKE THIS. IT + // SEEMS LIKE IT WAS WRONG TO UPDATE NUM_QT_ THOUGH AS IT SEEMS TO + // CORRESPOND THE POINT IN TIME SPECIFIED BY LAST_SLOT_. 
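
/* Aside (not part of the diff): the quote validity test applied above,
   pulled out into a standalone predicate so it can be unit tested in
   isolation.  quote_t and the SKETCH_* constants are assumptions of
   this sketch standing in for the pc_price_comp_t fields,
   PC_STATUS_TRADING and PC_MAX_SEND_LATENCY used by the real code. */

#include <stdint.h>

#define SKETCH_STATUS_TRADING   (1)
#define SKETCH_MAX_SEND_LATENCY (25)

typedef struct {
  uint32_t status;
  uint64_t pub_slot;
  int64_t  price;
  uint64_t conf;
} quote_t;

static int
quote_is_valid( quote_t const * q,
                uint64_t        now ) {
  int64_t slot_diff = (int64_t)(now - q->pub_slot);  /* wrap-safe, as above */
  int64_t price     = q->price;
  int64_t conf      = (int64_t)q->conf;
  return q->status==SKETCH_STATUS_TRADING
      && INT64_C(0)<=slot_diff && slot_diff<=SKETCH_MAX_SEND_LATENCY
      && INT64_C(0)<conf && conf<price     /* 0 < conf < price */
      && conf<=INT64_MAX-price;            /* price+conf cannot overflow */
}
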
+ ptr->agg_.status_ = PC_STATUS_UNKNOWN; + return; } - pptr = iptr; - } - // compute weight for last quote - pd_new_scale( weight, ldiff, ptr->expo_ ); - pd_add( wptr, wptr, weight, qs->fact_ ); - pd_div( wptr, one, wptr ); - pd_add( wsum, wsum, wptr, qs->fact_ ); - // bound weights at 1/sqrt(Nquotes) and redeistribute the remaining weight - // among the remaining quoters proportional to their weights - pd_t wmax[1], rnumer[1], rdenom[1]; - pd_set( rnumer, one ); - pd_new( rdenom, 0, 0 ); - // wmax = 1 / sqrt( numa ) - pd_new( wmax, numa, 0 ); - pd_sqrt( wmax, wmax, qs->fact_ ); - pd_div( wmax, one, wmax ); - for( uint32_t i=0;i != numa; ++i ) { - wptr = &qs->weight_[i]; - pd_div( wptr, wptr, wsum ); - if ( pd_gt( wptr, wmax, qs->fact_ ) ) { - pd_set( wptr, wmax ); - pd_sub( rnumer, rnumer, wmax, qs->fact_ ); - aidx[i] = 1; - } else { - pd_add( rdenom, rdenom, wptr, qs->fact_ ); - aidx[i] = 0; - } - } - if ( rdenom->v_ ) { - pd_div( rnumer, rnumer, rdenom ); - } - for ( uint32_t i = 0; i != numa; ++i ) { - wptr = &qs->weight_[ i ]; - if ( aidx[ i ] == 0 ) { - pd_mul( wptr, wptr, rnumer ); + // evaluate price_model to get the p25/p50/p75 prices + // note: numv>0 and nprcs = 3*numv at this point + int64_t agg_p25; + int64_t agg_p75; + price_model_core( (uint64_t)nprcs, prcs, &agg_p25, &agg_price, &agg_p75, prcs+nprcs ); + + // get the left and right confidences + // note that as valid quotes have positive prices currently and + // agg_p25, agg_price, agg_p75 are ordered, this calculation can't + // overflow / underflow + int64_t agg_conf_left = agg_price - agg_p25; + int64_t agg_conf_right = agg_p75 - agg_price; + + // use the larger of the left and right confidences + agg_conf = agg_conf_right > agg_conf_left ? agg_conf_right : agg_conf_left; + + // if the aggregated values don't look sane, we abort + // this is paranoia given the current pricing model and + // above quoter sanity checks + if( !(INT64_C(0)agg_.status_ = PC_STATUS_UNKNOWN; + return; } } - const pd_t half = { .e_ = -1, .v_ = 5 }; + // we have an valid (now,agg_price,agg_conf) observation + // update the twap_model - // compute aggregate price as weighted median - pd_t iprice[1], lprice[1], uprice[1], q3price[1], q1price[1], ptile[1]; - pd_new( ptile, 5, -1 ); - wgt_ptile( iprice, qs->iprice_, qs->weight_, numa, ptile, qs ); - pd_adjust( iprice, ptr->expo_, qs->fact_ ); - ptr->agg_.price_ = iprice->v_; + // FIXME: HIDEOUS TEMPORARY HACK TO PACK THE EXISTING PC_PRICE_T DATA + // LAYOUT INTO A TWAP_MODEL_T (PC_PRICE_T SHOULD JUST HAVE THIS AS A + // COMPONENT AND MAYBE ELIMINATE LAST_SLOT_ IN FAVOR OF TM->TS). 
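
/* Aside (not part of the diff): the (price,conf) extraction from the
   p25/p50/p75 quartiles returned by price_model_core, shown on its
   own.  The wrapper name quartiles_to_price_conf is an assumption of
   this sketch; the subtractions cannot overflow for the positive,
   ordered quartiles produced above. */

#include <stdint.h>

static void
quartiles_to_price_conf( int64_t   p25,     /* <= p50 */
                         int64_t   p50,     /* median of the (price-conf,price,price+conf) samples */
                         int64_t   p75,     /* >= p50 */
                         int64_t * price,
                         int64_t * conf ) {
  int64_t left  = p50 - p25;                /* left half-spread  */
  int64_t right = p75 - p50;                /* right half-spread */
  *price = p50;
  *conf  = right>left ? right : left;       /* use the wider half-spread as the confidence */
}
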
+ twap_model_t tm[1]; + tm->valid = ptr->drv2_; // FIXME: MAKE SURE DRV2_ IS INITIALIZED TO ZERO ON ACCOUNT CREATION + tm->ts = ptr->last_slot_; + tm->twap = (uint64_t)ptr->twap_.val_; + tm->twac = (uint64_t)ptr->twac_.val_; + tm->twap_Nh = (uint64_t)ptr->twap_.numer_; tm->twap_Nl = (uint64_t)ptr->twap_.denom_; // FIXME: UGLY REINTERP + tm->twac_Nh = (uint64_t)ptr->twac_.numer_; tm->twac_Nl = (uint64_t)ptr->twac_.denom_; // FIXME: UGLY REINTERP + tm->D = (uint64_t)ptr->drv1_; - // compute diff in weighted median between upper and lower price bounds - pd_t prices[ PC_COMP_SIZE ]; - pd_t weights[ PC_COMP_SIZE ]; - // sort upper prices and weights - for ( uint32_t i = 0; i < numa; ++i ) { - uint32_t j = i; - for ( ; j > 0 && pd_lt( &qs->uprice_[ i ], &prices[ j - 1 ], qs->fact_ ); --j ) { - prices[ j ] = prices[ j - 1 ]; - weights[ j ] = weights[ j - 1 ]; - } - prices[ j ] = qs->uprice_[ i ]; - weights[ j ] = qs->weight_[ i ]; - } - wgt_ptile( uprice, prices, weights, numa, ptile, qs ); - // sort lower prices and weights - for ( uint32_t i = 0; i < numa; ++i ) { - uint32_t j = i; - for ( ; j > 0 && pd_lt( &qs->lprice_[ i ], &prices[ j - 1 ], qs->fact_ ); --j ) { - prices[ j ] = prices[ j - 1 ]; - weights[ j ] = weights[ j - 1 ]; - } - prices[ j ] = qs->lprice_[ i ]; - weights[ j ] = qs->weight_[ i ]; - } - wgt_ptile( lprice, prices, weights, numa, ptile, qs ); - - pd_sub( uprice, uprice, lprice, qs->fact_ ); - pd_mul( uprice, uprice, &half ); + twap_model_update( tm, now, (uint64_t)agg_price, (uint64_t)agg_conf ); - // compute weighted iqr of prices - pd_new( ptile, 75, -2 ); - wgt_ptile( q3price, qs->iprice_, qs->weight_, numa, ptile, qs ); - pd_new( ptile, 25, -2 ); - wgt_ptile( q1price, qs->iprice_, qs->weight_, numa, ptile, qs ); - pd_sub( q3price, q3price, q1price, qs->fact_ ); - pd_mul( q3price, q3price, &half ); - - // take confidence interval as larger - pd_t *cptr = pd_gt( uprice, q3price, qs->fact_ ) ? uprice : q3price; - pd_adjust( cptr, ptr->expo_, qs->fact_ ); - ptr->agg_.conf_ = ( uint64_t )( cptr->v_ ); - upd_twap( ptr, agg_diff, qs ); + // FIXME: HIDEOUS TEMPORARY HACK TO UNPACK. SEE NOTE ABOVE. IF + // ANYTHING EVEN WENT THE SLIGHTEST WEIRD, WE ABORT. + if( !(INT64_C(0)twac && tm->twactwap && tm->twac<=((uint64_t)INT64_MAX-tm->twap)) ) { + // FIXME: SEE NOTE ABOVE ABOUT NUM_QT_ AND LAST_SLOT_ + ptr->agg_.status_ = PC_STATUS_UNKNOWN; + return; + } + ptr->drv2_ = tm->valid; + ptr->twap_.val_ = (int64_t)tm->twap; + ptr->twac_.val_ = (int64_t)tm->twac; + ptr->twap_.numer_ = (int64_t)tm->twap_Nh; ptr->twap_.denom_ = (int64_t)tm->twap_Nl; + ptr->twac_.numer_ = (int64_t)tm->twac_Nh; ptr->twac_.denom_ = (int64_t)tm->twac_Nl; + ptr->drv1_ = (int64_t)tm->D; + + // all good + ptr->num_qt_ = numv; + ptr->last_slot_ = now; + ptr->agg_.price_ = agg_price; + ptr->agg_.conf_ = (uint64_t)agg_conf; + ptr->agg_.status_ = PC_STATUS_TRADING; } #ifdef __cplusplus } #endif +#endif /* _pyth_oracle_upd_aggregrate_h_ */ diff --git a/program/src/oracle/util/align.h b/program/src/oracle/util/align.h new file mode 100644 index 000000000..023e85735 --- /dev/null +++ b/program/src/oracle/util/align.h @@ -0,0 +1,57 @@ +#ifndef _pyth_oracle_util_align_h_ +#define _pyth_oracle_util_align_h_ + +#include "compat_stdint.h" /* For uintptr_t */ +#include /* For alignof */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* align_ispow2 returns non-zero if a is a non-negative integral power + of 2 and zero otherwise. 
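
/* Aside (not part of the diff): the power-of-2 alignment trick the
   helpers below are built on, as a standalone host-side demo.  For a
   power-of-2 alignment a, (p + a-1) & ~(a-1) rounds p up to the next
   multiple of a and p & ~(a-1) rounds it down; round_up / round_down
   here are illustrative names, not part of the header's API. */

#include <stdint.h>
#include <stdio.h>

static uintptr_t round_up  ( uintptr_t p, uintptr_t a ) { return (p + (a-(uintptr_t)1)) & ~(a-(uintptr_t)1); }
static uintptr_t round_down( uintptr_t p, uintptr_t a ) { return  p                     & ~(a-(uintptr_t)1); }

int
main( void ) {
  for( uintptr_t p=(uintptr_t)0; p<(uintptr_t)16; p++ )
    printf( "p %2lu -> dn %2lu up %2lu (a=8)\n",
            (unsigned long)p, (unsigned long)round_down(p,(uintptr_t)8), (unsigned long)round_up(p,(uintptr_t)8) );
  return 0;
}
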
*/ + +static inline int +align_ispow2( uintptr_t a ) { + uintptr_t mask = a - (uintptr_t)1; + return (a>(uintptr_t)0) & !(a & mask); +} + +/* align_isaligned returns non-zero if p has byte alignment a. Assumes + align_ispow2(a) is true. */ + +static inline int +align_isaligned( void * p, + uintptr_t a ) { + uintptr_t mask = a - (uintptr_t)1; + return !(((uintptr_t)p) & mask); +} + +/* align_dn/align_up aligns p to the first byte at or before / after p + with alignment a. Assumes align_ispow2(a) is true. */ + +static inline void * +align_dn( void * p, + uintptr_t a ) { + uintptr_t mask = a - (uintptr_t)1; + return (void *)(((uintptr_t)p) & (~mask)); +} + +static inline void * +align_up( void * p, + uintptr_t a ) { + uintptr_t mask = a - (uintptr_t)1; + return (void *)((((uintptr_t)p) + mask) & (~mask)); +} + +/* Helper macros for aligning pointers up or down to compatible + alignments for type T. FIXME: ADD UNIT TEST COVERAGE */ + +#define ALIGN_DN( p, T ) ((T *)align_dn( p, (uintptr_t)alignof(T) )) +#define ALIGN_UP( p, T ) ((T *)align_up( p, (uintptr_t)alignof(T) )) + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_util_align_h_ */ diff --git a/program/src/oracle/util/avg.h b/program/src/oracle/util/avg.h new file mode 100644 index 000000000..cf4e0fa00 --- /dev/null +++ b/program/src/oracle/util/avg.h @@ -0,0 +1,170 @@ +#ifndef _pyth_oracle_util_avg_h_ +#define _pyth_oracle_util_avg_h_ + +/* Portable robust integer averaging. (No intermediate overflow, no + assumptions about platform type wides, no assumptions about integer + signed right shift, no signed integer division, no implict + conversions, etc.) */ + +#include "sar.h" + +/* FIXME: ADD SUPPORT FOR 64_BIT ARRAY AVERAGES BASED ON UWIDE? */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* uint8_t a = avg_2_uint8 ( x, y ); int8_t a = avg_2_int8 ( x, y ); + uint16_t a = avg_2_uint16( x, y ); int16_t a = avg_2_int16( x, y ); + uint32_t a = avg_2_uint32( x, y ); int32_t a = avg_2_int32( x, y ); + uint64_t a = avg_2_uint64( x, y ); int64_t a = avg_2_int64( x, y ); + + compute the average (a) of x and y of the corresponding type with <1 + ulp error (floor / round toward negative infinity rounding). That + is, these compute this exactly: + floor( (x+y)/2 ) + On systems where signed right shifts are sign extending, this is + identical to: + (x+y) >> 1 + but the below will be safe against intermediate overflow. */ + +static inline uint8_t avg_2_uint8 ( uint8_t x, uint8_t y ) { return (uint8_t )((((uint16_t)x)+((uint16_t)y))>>1); } +static inline uint16_t avg_2_uint16( uint16_t x, uint16_t y ) { return (uint16_t)((((uint32_t)x)+((uint32_t)y))>>1); } +static inline uint32_t avg_2_uint32( uint32_t x, uint32_t y ) { return (uint32_t)((((uint64_t)x)+((uint64_t)y))>>1); } + +static inline uint64_t +avg_2_uint64( uint64_t x, + uint64_t y ) { + + /* x+y might have intermediate overflow and we don't have an efficient + portable wider integer type available. So we can't just do + (x+y)>>1 to compute floor((x+y)/2). */ + +# if 1 /* Fewer ops but less parallel issue */ + /* Emulate an adc (add with carry) to get the 65-bit wide intermediate + and then shift back in as necessary */ + uint64_t z = x + y; + uint64_t c = (uint64_t)(z>1) + (c<<63); +# else /* More ops but more parallel issue */ + /* floor(x/2)+floor(y/2)==(x>>1)+(y>>1) doesn't have intermediate + overflow but it has two rounding errors. 
Noting that + (x+y)/2 == floor(x/2) + floor(y/2) + (x_lsb+y_lsb)/2 + == (x>>1) + (y>>1) + (x_lsb+y_lsb)/2 + implies we should increment when x and y have their lsbs set. */ + return (x>>1) + (y>>1) + (x & y & UINT64_C(1)); +# endif + +} + +static inline int8_t avg_2_int8 ( int8_t x, int8_t y ) { return (int8_t )sar_int16((int16_t)(((int16_t)x)+((int16_t)y)),1); } +static inline int16_t avg_2_int16 ( int16_t x, int16_t y ) { return (int16_t)sar_int32((int32_t)(((int32_t)x)+((int32_t)y)),1); } +static inline int32_t avg_2_int32 ( int32_t x, int32_t y ) { return (int32_t)sar_int64((int64_t)(((int64_t)x)+((int64_t)y)),1); } + +static inline int64_t +avg_2_int64( int64_t x, + int64_t y ) { + + /* Similar considerations as above */ + +# if 1 /* Fewer ops but less parallel issue */ + /* x+y = x+2^63 + y+2^63 - 2^64 ... exact ops + = ux + uy - 2^64 ... exact ops where ux and uy are exactly representable as a 64-bit uint + = uz + 2^64 c - 2^64 ... exact ops where uz is a 64-bit uint and c is in [0,1]. + Thus, as before + uz = ux+uy ... c ops + c = (uz=ux) ... exact ops + Noting that (ux + uy) mod 2^64 = (x+y+2^64) mod 2^64 = (x+y) mod 2^64 + and that signed and added adds are the same operation binary in twos + complement math yields: + uz = (uint64_t)(x+y) ... c ops + b = uz>=(x+2^63) ... exact ops + as we know x+2^63 is in [0,2^64), x+2^63 = x+/-2^63 mod 2^64. And + using the signed and unsigned adds are the same operation binary + in we have: + x+2^63 ... exact ops + ==x+/-2^63 mod 2^64 ... exact ops + ==x+/-2^63 ... c ops */ + uint64_t t = (uint64_t)x; + uint64_t z = t + (uint64_t)y; + uint64_t b = (uint64_t)(z>=(t+(((uint64_t)1)<<63))); + return (int64_t)((z>>1) - (b<<63)); +# else /* More ops but more parallel issue (significant more ops if no platform arithmetic right shift) */ + /* See above for uint64_t for details on this calc. */ + return sar_int64( x, 1 ) + sar_int64( y, 1 ) + (x & y & (int64_t)1); +# endif + +} + +/* uint8_t a = avg_uint8 ( x, n ); int8_t a = avg_int8 ( x, n ); + uint16_t a = avg_uint16( x, n ); int16_t a = avg_int16( x, n ); + uint32_t a = avg_uint32( x, n ); int32_t a = avg_int32( x, n ); + + compute the average (a) of the n element array f the corresponding + type pointed to by x with <1 ulp round error (truncate / round toward + zero rounding). Supports arrays with up to 2^32-1 elements. Returns + 0 for arrays with 0 elements. Elements of the array are unchanged by + this function. No 64-bit equivalents are provided here as that + requires a different and slower implementation (at least if + portability is required). */ + +#define PYTH_ORACLE_UTIL_AVG_DECL(func,T) /* Comments for uint32_t but apply for uint8_t and uint16_t too */ \ +static inline T /* == (1/n) sum_i x[i] with one rounding error and round toward zero rounding */ \ +func( T const * x, /* Indexed [0,n) */ \ + uint32_t n ) { /* Returns 0 on n==0 */ \ + if( !n ) return (T)0; /* Handle n==0 case */ \ + /* At this point n is in [1,2^32-1]. Sum up all the x into a 64-bit */ \ + /* wide signed accumulator. */ \ + uint64_t a = (uint64_t)0; \ + for( uint32_t r=n; r; r-- ) a += (uint64_t)*(x++); \ + /* At this point a is in [ 0, (2^32-1)(2^32-1) ] < 2^64 and thus was */ \ + /* computed without intermediate overflow. Complete the average. 
*/ \ + return (T)( a / (uint64_t)n ); \ +} + +PYTH_ORACLE_UTIL_AVG_DECL( avg_uint8, uint8_t ) +PYTH_ORACLE_UTIL_AVG_DECL( avg_uint16, uint16_t ) +PYTH_ORACLE_UTIL_AVG_DECL( avg_uint32, uint32_t ) +/* See note above about 64-bit variants */ + +#undef PYTH_ORACLE_UTIL_AVG_DECL + +#define PYTH_ORACLE_UTIL_AVG_DECL(func,T) /* Comments for int32_t but apply for int8_t and int16_t too */ \ +static inline T /* == (1/n) sum_i x[i] with one rounding error and round toward zero rounding */ \ +func( T const * x, /* Indexed [0,n) */ \ + uint32_t n ) { /* Returns 0 on n==0 */ \ + if( !n ) return (T)0; /* Handle n==0 case */ \ + /* At this point n is in [1,2^32-1]. Sum up all the x into a 64-bit */ \ + /* wide signed accumulator. */ \ + int64_t a = (int64_t)0; \ + for( uint32_t r=n; r; r-- ) a += (int64_t)*(x++); \ + /* At this point a is in [ -2^31 (2^32-1), (2^31-1)(2^32-1) ] such that */ \ + /* |a| < 2^63. It thus was computed without intermediate overflow and */ \ + /* further |a| can fit with an uint64_t. Compute |a|/n. If a<0, the */ \ + /* result will be at most 2^31-1 (i.e. all x[i]==2^31-1). Otherwise, */ \ + /* the result will be at most 2^31 (i.e. all x[i]==-2^31). */ \ + int s = a<(int64_t)0; \ + uint64_t u = (uint64_t)(s ? -a : a); \ + u /= (uint64_t)n; \ + a = (int64_t)u; \ + return (T)(s ? -a : a); \ +} + +PYTH_ORACLE_UTIL_AVG_DECL( avg_int8, int8_t ) +PYTH_ORACLE_UTIL_AVG_DECL( avg_int16, int16_t ) +PYTH_ORACLE_UTIL_AVG_DECL( avg_int32, int32_t ) +/* See note above about 64-bit variants */ + +#undef PYTH_ORACLE_UTIL_AVG_DECL + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_util_avg_h_ */ + diff --git a/program/src/oracle/util/clean b/program/src/oracle/util/clean new file mode 100755 index 000000000..860ef1641 --- /dev/null +++ b/program/src/oracle/util/clean @@ -0,0 +1,3 @@ +#!/bin/sh +rm -rfv bin + diff --git a/program/src/oracle/util/compat_stdint.h b/program/src/oracle/util/compat_stdint.h new file mode 100644 index 000000000..1842cc52c --- /dev/null +++ b/program/src/oracle/util/compat_stdint.h @@ -0,0 +1,28 @@ +#ifndef _pyth_oracle_util_compat_stdint_h_ +#define _pyth_oracle_util_compat_stdint_h_ + +/* Include functionality from . Define + PYTH_ORACLE_UTIL_COMPAT_STDINT_STYLE to indicate how to do this: + 0 - Use stdint.h directly + 1 - Use solana_sdk.h (solana uses its own definitions for stdint + types and that can conflicts with stdint.h) + Defaults to 0 or 1 depending on if __bpf__ is set. */ + +#ifndef PYTH_ORACLE_UTIL_COMPAT_STDINT_STYLE +#ifndef __bpf__ +#define PYTH_ORACLE_UTIL_COMPAT_STDINT_STYLE 0 +#else +#define PYTH_ORACLE_UTIL_COMPAT_STDINT_STYLE 1 +#endif +#endif + +#if PYTH_ORACLE_UTIL_COMPAT_STDINT_STYLE==0 +#include +#elif PYTH_ORACLE_UTIL_COMPAT_STDINT_STYLE==1 +#include +typedef uint64_t uintptr_t; +#else +#error "Unsupported PYTH_ORACLE_UTIL_COMPAT_STDINT_STYLE" +#endif + +#endif /* _pyth_oracle_util_compat_stdint_h_ */ diff --git a/program/src/oracle/util/exp2m1.h b/program/src/oracle/util/exp2m1.h new file mode 100644 index 000000000..6b47c0611 --- /dev/null +++ b/program/src/oracle/util/exp2m1.h @@ -0,0 +1,165 @@ +#ifndef _pyth_oracle_util_exp2m1_h_ +#define _pyth_oracle_util_exp2m1_h_ + +/* exp2m1_fxp computes: + + y/2^30 ~ exp2m1( x/2^30 ) = 2^( x/2^30 ) - 1 + + That is, it computes a fixed point approximation of 2^x-1 where x is + non-negative and represented with 34 integer bits and 30 fractional + bits (34u.30 fxp). If x/2^30 >= 34 (i.e. x>exp2m1_fxp_max()), + returns UINT64_MAX. 
For x<=exp2m1_fxp_max(), a exp2_fxp can also + trivially constructed by just adding 2^30 to the result. + + The approximation is designed to minimize the RMS error over the + interval [0,2^30) while preserving continuity overall and matching at + at key places exactly (e.g. if x/2^30 = 2^i for i in [-30,33], this + will be exact). + + Define EXP2M1_FXP_ORDER from 1:7 to get the desired cost / accuracy + tradeoff. At EXP2M1_FXP_ORDER 1, this costs virtually nothing (some + fast bit ops); it is a piecewise linear approximation that is + typically accurate to a few percent. + + At EXP2M1_FXP_ORDER 7, this is nearly IEEE style 30 bit precision + (i.e. for x in [0,2^30), the result with 0.6 ulp instead of the + theoretical limit of 0.5 ulp). For x in [0,2^30), the rounding is + approximately round nearest even. For larger x, relative error is + approximately constant. It requires some fast bits ops, 7 64*64->64 + adds and 7 64*64->64 multiplies (similarly for the orders 2:6). + + EXP2M1_FXP_ORDER 5 is the default and has a worst case output + accuracy comparable IEEE single precision. + + In short, for integer x/2^30, it is the exact value correctly rounded + and, for IID random x in [0,2^30), the approximation errors can be + thought of as a random Gaussian perturbation on the output value such + that the approximation for a given order is the one that minimizes + the perturbation variance. */ + +#include "compat_stdint.h" + +#ifndef EXP2M1_FXP_ORDER +#define EXP2M1_FXP_ORDER 5 +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +static inline uint64_t exp2m1_fxp_max( void ) { return UINT64_C(0x87fffffff); } /* == 34*2^30 - 1 (i=33,d=2^30-1) */ + +static inline uint64_t +exp2m1_fxp( uint64_t x ) { + + /* This calculation is based on splitting x into integer and + fractional bits such that: + x = 2^30 i + d + where d is in [0,2^30). Then we have: + y = 2^(30+i) exp2( d/2^30 ) - 2^30 + = (2^64 exp2( d/2^30 )) / 2^(64-30-i) - 2^30 + = (2^64 exp2m1( d/2^30 ) + 2^64) / 2^(34-i) - 2^30 + = (2^64 exp2m1( d/2^30 ))/2^(34-i) + 2^(30+i)- 2^30 + Thus, the main challenge is computing an approximation of: + 2^64 exp2m1( d/2^30 ) / 2^(34-i) + for delta = d/2^30 in [0,1) with good enough precision. As this + smoothly goes from 0 to 1, this is an ideal candidate for a + polynomial approximation (Pade' approximation might be even better + if we are willing to accept the cost of a divide). We aren't too + particular or maxing out precision or precise rounding in the + interest of compactness and speed. + + The polynomial coefficients below were selected as the polynomials + that minimize the RMS error over the interval [0,1) while matching + exactly at 0 and 1 (RMS error was used over minimax as it has more + intuitive error properties for users typically). The coefficients + were scaled such that the intermediates in Horner's rule are all + less than 2^34 to minimize the coefficient quantization error while + not overflowing 64 bits when multiplied by d. The output scale was + fixed at 2^64 so that we have some extra precision when i is + non-zero. 
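
/* Aside (not part of the diff): a host-side long double reference for
   the 34u.30 mapping described above, handy for spot checking
   exp2m1_fxp.  The name exp2m1_fxp_ref and the round-to-nearest
   conversion are assumptions of this sketch (not the test harness used
   elsewhere in this change); link with -lm on typical hosts. */

#include <stdint.h>
#include <math.h>

static uint64_t
exp2m1_fxp_ref( uint64_t x ) {                     /* assumes x/2^30 in [0,34) */
  long double xf = ((long double)x)/1073741824.L;  /* x / 2^30 */
  long double yf = exp2l( xf ) - 1.L;              /* 2^(x/2^30) - 1 */
  return (uint64_t)roundl( yf*1073741824.L );      /* ~ y * 2^30, rounded to nearest */
}
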
*/ + + if( x>exp2m1_fxp_max() ) return UINT64_MAX; + + uint64_t i = x >> 30; /* In [0,34) */ + uint64_t d = x & ((UINT64_C(1)<<30)-UINT64_C(1)); /* In [0,2^30) */ + + /* Estimate y ~ 2^64 exp2m1( d/2^30 ) */ + + uint64_t y; + + /* BEGIN AUTOGENERATED CODE - See KJB for code generator */ +# if EXP2M1_FXP_ORDER==1 + /* bits 4.0 rms_aerr 2.6e-02 rms_rerr 1.8e-02 max_aerr 8.6e-02 max_rerr 6.1e-02 */ + /* As implemented: bits 4.0 max_aerr 8.6e-02 max_rerr 6.1e-02 ulp 92418389.1 */ + y = d<<34; +# elif EXP2M1_FXP_ORDER==2 + /* bits 8.3 rms_aerr 2.7e-03 rms_rerr 2.0e-03 max_aerr 3.9e-03 max_rerr 3.2e-03 */ + /* As implemented: bits 8.3 max_aerr 3.9e-03 max_rerr 3.2e-03 ulp 4173744.7 */ + y = UINT64_C(0x2c029d07d); + y = UINT64_C(0x29feb17c1) + ((y*d)>>31); + y = (y*d); +# elif EXP2M1_FXP_ORDER==3 + /* bits 13.0 rms_aerr 1.1e-04 rms_rerr 7.9e-05 max_aerr 1.7e-04 max_rerr 1.2e-04 */ + /* As implemented: bits 13.0 max_aerr 1.7e-04 max_rerr 1.2e-04 ulp 183713.4 */ + y = UINT64_C(0x288319c3e); + y = UINT64_C(0x1cd1735e6) + ((y*d)>>32); + y = UINT64_C(0x2c86e3185) + ((y*d)>>31); + y = (y*d); +# elif EXP2M1_FXP_ORDER==4 + /* bits 17.6 rms_aerr 3.3e-06 rms_rerr 2.4e-06 max_aerr 5.6e-06 max_rerr 5.0e-06 */ + /* As implemented: bits 17.6 max_aerr 5.6e-06 max_rerr 5.0e-06 ulp 5998.7 */ + y = UINT64_C(0x38100ce15); + y = UINT64_C(0x1a7f168b9) + ((y*d)>>33); + y = UINT64_C(0x1eeba70d4) + ((y*d)>>32); + y = UINT64_C(0x2c5a09747) + ((y*d)>>31); + y = (y*d); +# elif EXP2M1_FXP_ORDER==5 + /* bits 22.8 rms_aerr 9.1e-08 rms_rerr 6.7e-08 max_aerr 1.5e-07 max_rerr 1.4e-07 */ + /* As implemented: bits 22.8 max_aerr 1.5e-07 max_rerr 1.4e-07 ulp 165.8 */ + y = UINT64_C(0x3e1a2fa1b); + y = UINT64_C(0x24a7ddfee) + ((y*d)>>33); + y = UINT64_C(0x1c994ed30) + ((y*d)>>33); + y = UINT64_C(0x1ebd13698) + ((y*d)>>32); + y = UINT64_C(0x2c5c9fe11) + ((y*d)>>31); + y = (y*d); +# elif EXP2M1_FXP_ORDER==6 + /* bits 27.9 rms_aerr 2.1e-09 rms_rerr 1.6e-09 max_aerr 4.2e-09 max_rerr 3.9e-09 */ + /* As implemented: bits 27.8 max_aerr 4.6e-09 max_rerr 4.3e-09 ulp 5.0 */ + y = UINT64_C(0x3959e8bc0); + y = UINT64_C(0x28987867f) + ((y*d)>>33); + y = UINT64_C(0x27aac1b83) + ((y*d)>>33); + y = UINT64_C(0x1c67f6aa0) + ((y*d)>>33); + y = UINT64_C(0x1ebfde70a) + ((y*d)>>32); + y = UINT64_C(0x2c5c8510d) + ((y*d)>>31); + y = (y*d); +# elif EXP2M1_FXP_ORDER==7 + /* bits 33.4 rms_aerr 4.7e-11 rms_rerr 3.5e-11 max_aerr 9.9e-11 max_rerr 9.0e-11 */ + /* As implemented: bits 30.8 max_aerr 5.9e-10 max_rerr 5.4e-10 ulp 0.6 */ + y = UINT64_C(0x2d6e5bd1d); + y = UINT64_C(0x257992e8b) + ((y*d)>>33); + y = UINT64_C(0x2c02265a3) + ((y*d)>>33); + y = UINT64_C(0x27607eb13) + ((y*d)>>33); + y = UINT64_C(0x1c6b30b08) + ((y*d)>>33); + y = UINT64_C(0x1ebfbcc25) + ((y*d)>>32); + y = UINT64_C(0x2c5c8604f) + ((y*d)>>31); + y = (y*d); +# else +# error "Unsupported EXP2M1_FXP_ORDER" +# endif + /* END AUTOGENERATED CODE */ + + /* Note: (y + 2^(s-1)) / 2^s does a divide with grade school rounding + (rounds to nearest with ties away from 0). The add will not + overflow as y is <= 2^64-2^34 at this point and 2^(s-1) is at most + 2^33. 
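
/* Aside (not part of the diff): the "add half then shift" rounding
   used by the return statement below, in isolation.  For s in [1,63],
   (y + 2^(s-1)) >> s divides by 2^s rounding to nearest with ties away
   from zero, provided the add cannot overflow (as argued above);
   rshift_rna is an illustrative name. */

#include <stdint.h>
#include <stdio.h>

static uint64_t rshift_rna( uint64_t y, int s ) { return (y + (UINT64_C(1)<<(s-1))) >> s; }

int
main( void ) {
  /* Divide by 2^2 = 4 with rounding: 5/4 -> 1, 6/4 -> 2 (tie away from zero), 7/4 -> 2 */
  printf( "%lu %lu %lu\n",
          (unsigned long)rshift_rna( UINT64_C(5), 2 ),
          (unsigned long)rshift_rna( UINT64_C(6), 2 ),
          (unsigned long)rshift_rna( UINT64_C(7), 2 ) );
  return 0;
}
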
*/ + + int s = 34-(int)i; /* In [1,34] so no UB in below shifts */ + return ((y+(UINT64_C(1) << (s-1))) >> s) + (UINT64_C(1)<<(64-s)) - (UINT64_C(1)<<30); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_util_exp2m1_h_ */ diff --git a/program/src/oracle/util/fxp.h b/program/src/oracle/util/fxp.h new file mode 100644 index 000000000..793cbfd94 --- /dev/null +++ b/program/src/oracle/util/fxp.h @@ -0,0 +1,935 @@ +#ifndef _pyth_oracle_util_fxp_h_ +#define _pyth_oracle_util_fxp_h_ + +/* Large set of primitives for portable fixed point arithmetic with + correct rounding and/or overflow detection. Strongly targeted at + platforms with reasonable performance C-style 64b unsigned integer + arithmetic under the hood. Likewise, strongly targets unsigned fixed + point representations that fit within 64b and have 30 fractional + bits. */ + +#include /* For INT_MIN */ +#include "uwide.h" +#include "sqrt.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* FIXED POINT ADDITION ***********************************************/ + +/* Compute: + (c 2^64 + z)/2^30 = x/2^30 + y/2^30 + Exact. c will be in [0,1]. Fast variant assumes that the user knows + c is zero or is not needed. */ + +static inline uint64_t fxp_add( uint64_t x, uint64_t y, uint64_t * _c ) { *_c = (uint64_t)(x>(~y)); return x + y; } +static inline uint64_t fxp_add_fast( uint64_t x, uint64_t y ) { return x + y; } + +/* FIXED POINT SUBTRACTION ********************************************/ + +/* Compute: + z/2^30 = (2^64 b + x)/2^30 - y/2^30 + Exact. b will be in [0,1]. Fast variant assumes that the user knows + b is zero (i.e. x>=y) or is not needed. */ + +static inline uint64_t fxp_sub( uint64_t x, uint64_t y, uint64_t * _b ) { *_b = (uint64_t)(x>30; return (xh<<34) | (xl>>30); } + +/* rtz -> Round toward zero (aka truncate rounding) + Fast variant assumes x*y < 2^64 (i.e. exact result < ~2^4). + Based on: + z/2^30 ~ (x/2^30)(y/2^30) + -> z ~ x y / 2^30 + With RTZ rounding: + z = floor( x y / 2^30 ) + = (x*y) >> 30 + As x*y might overflow 64 bits, we need to do a 64b*64b->128b + multiplication (uwide_mul) and a 128b shift right by 30 + (fxp_mul_contract). + + Fastest style of rounding. Rounding error in [0,1) ulp. + (ulp==2^-30). */ + +static inline uint64_t +fxp_mul_rtz( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint64_t zh,zl; uwide_mul( &zh,&zl, x, y ); /* <= 2^128 - 2^65 + 1 so no overflow */ + return fxp_mul_contract( zh,zl, _c ); +} + +static inline uint64_t fxp_mul_rtz_fast( uint64_t x, uint64_t y ) { return (x*y) >> 30; } + +/* raz -> Round away from zero + Fast variant assumes x*y < 2^64-2^30+1 (i.e. exact result < ~2^4) + Based on: + z/2^30 ~ (x/2^30)(y/2^30) + -> z ~ x y / 2^30 + With RAZ rounding: + z = ceil( x y / 2^30 ) + = floor( (x y + 2^30 - 1) / 2^30 ) + = (x*y + 2^30 -1) >> 30 + As x*y might overflow 64 bits, we need to do a 64b*64b->128b + multiplication (uwide_mul), a 64b increment of a 128b (uwide_inc) and + a 128b shift right by 30 (fxp_mul_contract). + + Slightly more expensive (one uwide_inc) than RTZ rounding. Rounding + error in (-1,0] ulp. (ulp==2^-30). 
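
/* Aside (not part of the diff): what the 64-bit representation with 30
   fractional bits means in practice.  A real value v is stored as
   x ~ round(v*2^30); truncating multiplication of two such values is
   then (x*y)>>30 whenever the product fits in 64 bits, exactly the
   fxp_mul_rtz_fast case above.  dbl_to_fxp / fxp_to_dbl are
   illustrative host-side helpers, not part of the header's API. */

#include <stdint.h>
#include <stdio.h>

static uint64_t dbl_to_fxp( double v )   { return (uint64_t)( v*1073741824. + 0.5 ); } /* v*2^30, rounded */
static double   fxp_to_dbl( uint64_t x ) { return ((double)x)/1073741824.; }           /* x/2^30 */

int
main( void ) {
  uint64_t x = dbl_to_fxp( 1.5  );
  uint64_t y = dbl_to_fxp( 2.25 );
  uint64_t z = (x*y) >> 30;             /* fast rtz multiply: valid here since 1.5*2.25 << 2^4 */
  printf( "%.6f\n", fxp_to_dbl( z ) );  /* prints 3.375000 */
  return 0;
}
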
*/ + +static inline uint64_t +fxp_mul_raz( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint64_t zh,zl; uwide_mul( &zh,&zl, x, y ); /* <= 2^128 - 2^65 + 1 so no overflow */ + uwide_inc( &zh,&zl, zh,zl, (UINT64_C(1)<<30)-UINT64_C(1) ); /* <= 2^128 - 2^65 + 2^30 so no overflow */ + return fxp_mul_contract( zh,zl, _c ); +} + +static inline uint64_t fxp_mul_raz_fast( uint64_t x, uint64_t y ) { return (x*y+((UINT64_C(1)<<30)-UINT64_C(1))) >> 30; } + +/* rnz -> Round nearest with ties toward zero + Fast variant assumes x*y < 2^64-2^29+1 (i.e. exact result < ~2^4) + Based on: + z/2^30 ~ (x/2^30)(y/2^30) + -> z ~ x y / 2^30 + Let frac be the least significant 30 bits of x*y. If frac<2^29 + (frac>2^29), we should round down (up). frac==2^29 is a tie and we + should round down. If we add 2^29-1 to frac, the result will be + <2^30 when frac<=2^29 and will be >=2^30. Thus bit 30 of frac+2^29-1 + indicates whether we need to round up or down. This yields: + z = floor( x y + 2^29 - 1 ) / 2^30 ) + = (x*y + 2^29 - 1) >> 30 + As x*y might overflow 64 bits, we need to do a 64b*64b->128b + multiplication (uwide_mul), a 64b increment of a 128b (uwide_inc) and + a 128b shift right by 30 (fxp_mul_contract). + + Slightly more expensive (one uwide_inc) than RTZ rounding. Rounding + error in (-1/2,1/2] ulp. (ulp==2^-30). */ + +static inline uint64_t +fxp_mul_rnz( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint64_t zh,zl; uwide_mul( &zh,&zl, x, y ); /* <= 2^128 - 2^65 + 1 so no overflow */ + uwide_inc( &zh,&zl, zh,zl, (UINT64_C(1)<<29)-UINT64_C(1) ); /* <= 2^128 - 2^65 + 2^29 so no overflow */ + return fxp_mul_contract( zh,zl, _c ); +} + +static inline uint64_t fxp_mul_rnz_fast( uint64_t x, uint64_t y ) { return (x*y+((UINT64_C(1)<<29)-UINT64_C(1))) >> 30; } + +/* rna -> Round nearest with ties away from zero (aka grade school rounding) + Fast variant assumes x*y < 2^64-2^29 (i.e. exact result < ~2^4) + Based on: + z/2^30 ~ (x/2^30)(y/2^30) + -> z ~ x y / 2^30 + Let frac be the least significant 30 bits of x*y. If frac<2^29 + (frac>2^29), we should round down (up). frac==2^29 is a tie and we + should round up. If we add 2^29-1 to frac, the result will be <2^30 + when frac<=2^29 and will be >=2^30. Thus bit 30 of frac+2^29 + indicates whether we need to round up or down. This yields: + z = floor( x y + 2^29 ) / 2^30 ) + = (x*y + 2^29) >> 30 + As x*y might overflow 64 bits, we need to do a 64b*64b->128b + multiplication (uwide_mul), a 64b increment of a 128b (uwide_inc) and + a 128b shift right by 30 (fxp_mul_contract). + + Slightly more expensive (one uwide_inc) than RTZ rounding. Rounding + error in [-1/2,1/2) ulp. (ulp==2^-30). */ + +static inline uint64_t +fxp_mul_rna( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint64_t zh,zl; uwide_mul( &zh,&zl, x, y ); /* <= 2^128 - 2^65 + 1 so no overflow */ + uwide_inc( &zh,&zl, zh,zl, UINT64_C(1)<<29 ); /* <= 2^128 - 2^65 + 2^29 so no overflow */ + return fxp_mul_contract( zh,zl, _c ); +} + +static inline uint64_t fxp_mul_rna_fast( uint64_t x, uint64_t y ) { return (x*y+(UINT64_C(1)<<29)) >> 30; } + +/* rne -> Round toward nearest with ties toward even (aka banker's rounding / IEEE style rounding) + Fast variant assumes x*y < 2^64-2^29 (i.e. exact result < ~2^4) + Based on the observation that rnz / rna rounding should be used when + floor(x*y/2^30) is even/odd. That is, use the rnz / rna increment of + 2^29-1 / 2^29 when bit 30 of x*y is 0 / 1. 
As x*y might overflow 64 + bits, we need to do a 64b*64b->128b multiplication (uwide_mul), a 64b + increment of a 128b (uwide_inc) and a 128b shift right by 30 + (fxp_mul_contract). + + The most accurate style of rounding usually and somewhat more + expensive (some sequentially dependent bit ops and one uwide_inc) + than RTZ rounding. Rounding error in [-1/2,1/2] ulp (unbiased). + (ulp==2^-30). */ + +static inline uint64_t +fxp_mul_rne( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint64_t zh,zl; uwide_mul( &zh,&zl, x, y ); /* <= 2^128 - 2^65 + 1 */ + uint64_t t = ((UINT64_C(1)<<29)-UINT64_C(1)) + ((zl>>30) & UINT64_C(1)); /* t = 2^29-1 / 2^29 when bit 30 of x*y is 0 / 1 */ + uwide_inc( &zh,&zl, zh,zl, t ); /* <= 2^128 - 2^65 + 2^29 */ + return fxp_mul_contract( zh,zl, _c ); +} + +static inline uint64_t +fxp_mul_rne_fast( uint64_t x, + uint64_t y ) { + uint64_t z = x*y; + uint64_t t = ((UINT64_C(1)<<29)-UINT64_C(1)) + ((z>>30) & UINT64_C(1)); /* t = 2^29-1 / 2^29 when bit 30 of x*y is 0 / 1 */ + return (z + t) >> 30; +} + +/* rno -> Round toward nearest with ties toward odd + Fast variant assumes x*y < 2^64-2^29 (i.e. exact result < ~2^4) + Same as rne with the parity flipped for the increment. As x*y might + overflow 64 bits, we need to do a 64b*64b->128b multiplication + (uwide_mul), a 64b increment of a 128b (uwide_inc) and a 128b shift + right by 30 (fxp_mul_contract). + + Somewhat more expensive (some sequentially dependent bit ops and one + uwide_inc) than RTZ rounding. Rounding error in [-1/2,1/2] ulp + (unbiased). (ulp==2^-30). */ + +static inline uint64_t +fxp_mul_rno( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint64_t zh,zl; uwide_mul( &zh,&zl, x, y ); /* <= 2^128 - 2^65 + 1 */ + uint64_t t = (UINT64_C(1)<<29) - ((zl>>30) & UINT64_C(1)); /* t = 2^29 / 2^29-1 when bit 30 of x*y is 0 / 1 */ + uwide_inc( &zh,&zl, zh,zl, t ); /* <= 2^128 - 2^65 + 2^29 */ + return fxp_mul_contract( zh,zl, _c ); +} + +static inline uint64_t +fxp_mul_rno_fast( uint64_t x, + uint64_t y ) { + uint64_t z = x*y; + uint64_t t = (UINT64_C(1)<<29) - ((z>>30) & UINT64_C(1)); /* t = 2^29-1 / 2^29 when bit 30 of x*y is 0 / 1 */ + return (z + t) >> 30; +} + +/* Other rounding modes: + rdn -> Round down / toward floor / toward -inf ... same as rtz for unsigned + rup -> Round up / toward ceil / toward +inf ... same as raz for unsigned + rnd -> Round nearest with ties down / toward floor / toward -inf ... same as rnz for unsigned + rnu -> Round nearest with ties up / toward ceil / toward -inf ... 
same as rna for unsigned */ + +static inline uint64_t fxp_mul_rdn( uint64_t x, uint64_t y, uint64_t * _c ) { return fxp_mul_rtz( x, y, _c ); } +static inline uint64_t fxp_mul_rup( uint64_t x, uint64_t y, uint64_t * _c ) { return fxp_mul_raz( x, y, _c ); } +static inline uint64_t fxp_mul_rnd( uint64_t x, uint64_t y, uint64_t * _c ) { return fxp_mul_rnz( x, y, _c ); } +static inline uint64_t fxp_mul_rnu( uint64_t x, uint64_t y, uint64_t * _c ) { return fxp_mul_rna( x, y, _c ); } + +static inline uint64_t fxp_mul_rdn_fast( uint64_t x, uint64_t y ) { return fxp_mul_rtz_fast( x, y ); } +static inline uint64_t fxp_mul_rup_fast( uint64_t x, uint64_t y ) { return fxp_mul_raz_fast( x, y ); } +static inline uint64_t fxp_mul_rnd_fast( uint64_t x, uint64_t y ) { return fxp_mul_rnz_fast( x, y ); } +static inline uint64_t fxp_mul_rnu_fast( uint64_t x, uint64_t y ) { return fxp_mul_rna_fast( x, y ); } + +/* FIXED POINT DIVISION ***********************************************/ + +/* Compute: + (2^64 c + z)/2^30 ~ (x/2^30)/(y/2^30) + under various rounding modes. c<2^30 if y is non-zero. Returns + c=UINT64_MAX,z=0 if y is zero. */ + +/* Helper used by the below that computes + 2^64 yh + yl = 2^30 x + (i.e. widen a 64b number to 128b and shift it left by 30.) Exact. + yh<2^30. */ + +static inline void fxp_div_expand( uint64_t * _yh, uint64_t * _yl, uint64_t x ) { *_yh = x>>34; *_yl = x<<30; } + +/* rtz -> Round toward zero (aka truncate rounding) + Fast variant assumes y!=0 and x<2^34 (i.e. exact result < ~2^34) + Based on: + z/2^30 ~ (x/2^30) / (y/2^30) + -> z ~ 2^30 x / y + With RTZ rounding: + z = floor( 2^30 x / y ) + As 2^30 x might overflow 64 bits, we need to expand x + (fxp_div_expand) and then use a 128b/64b -> 128b divider. + (uwide_div). + + Fastest style of rounding. Rounding error in [0,1) ulp. + (ulp==2^-30). */ + +static inline uint64_t +fxp_div_rtz( uint64_t x, + uint64_t y, + uint64_t * _c ) { + if( !y ) { *_c = UINT64_MAX; return UINT64_C(0); } /* Handle divide by zero */ + uint64_t zh,zl; fxp_div_expand( &zh,&zl, x ); /* 2^30 x <= 2^94-2^30 so no overflow */ + uwide_div( &zh,&zl, zh,zl, y ); /* <= 2^94-2^30 so no overflow */ + *_c = zh; return zl; +} + +static inline uint64_t fxp_div_rtz_fast( uint64_t x, uint64_t y ) { return (x<<30) / y; } + +/* raz -> Round away from zero + Fast variant assumes y!=0 and 2^30*x+y-1<2^64 (i.e. exact result < ~2^34) + Based on: + z/2^30 ~ (x/2^30) / (y/2^30) + -> z ~ 2^30 x / y + With RAZ rounding: + z = ceil( 2^30 x / y ) + = floor( (2^30 x + y - 1) / y ) + As 2^30 x might overflow 64 bits, we need to expand x + (fxp_div_expand), increment it by the 64b y-1 (uwide_inc) and then + use a 128b/64b->128b divider (uwide_div). + + Slightly more expensive (one uwide_inc) than RTZ rounding. Rounding + error in (-1,0] ulp. (ulp==2^-30). */ + +static inline uint64_t +fxp_div_raz( uint64_t x, + uint64_t y, + uint64_t * _c ) { + if( !y ) { *_c = UINT64_MAX; return UINT64_C(0); } /* Handle divide by zero */ + uint64_t zh,zl; fxp_div_expand( &zh,&zl, x ); /* 2^30 x <= 2^94-2^30 so no overflow */ + uwide_inc( &zh,&zl, zh,zl, y-UINT64_C(1) ); /* <= 2^94+2^64-2^30-2 so no overflow */ + uwide_div( &zh,&zl, zh,zl, y ); /* = ceil( 2^30 x / y ) <= 2^94-2^30 so no overflow */ + *_c = zh; return zl; +} + +static inline uint64_t fxp_div_raz_fast( uint64_t x, uint64_t y ) { return ((x<<30)+(y-UINT64_C(1))) / y; } + +/* rnz -> Round nearest with ties toward zero + Fast variant assumes y!=0 and 2^30*x+floor((y-1)/2)<2^64 (i.e. 
exact result < ~2^34) + + Consider: + z = floor( (2^30 x + floor( (y-1)/2 )) / y ) + where y>0. + + If y is even: odd + z = floor( (2^30 x + (y/2) - 1) / y ) = floor( (2^30 x + (y-1)/2) / y ) + or: + z y + r = 2^30 x + (y/2)-1 = 2^30 x + (y-1)/2 + for some r in [0,y-1]. Or: + z y = 2^30 x + delta = 2^30 x + delta + where: + delta in [-y/2,y/2-1] in [-y/2+1/2,y/2-1/2] + or: + z = 2^30 x / y + epsilon = 2^30 x / y + epsilon + where: + epsilon in [-1/2,1/2-1/y] in [-1/2+1/(2y),1/2-1/(2y)] + Thus we have: + 2^30 x/y - 1/2 <= z < 2^30 x/y + 1/2 2^30 x/y - 1/2 < z < 2^30 x/y + 1/2 + + Combining yields: + + 2^30 x/y - 1/2 <= z < 2^30 x/y + 1/2 + + Thus, the z computed as per the above is the RNZ rounded result. As + 2^30 x might overflow 64 bits, we need to expand x (fxp_div_expand), + increment it by the 64b (y-1)>>1 (uwide_inc) and then use a + 128b/64b->128b divider (uwide_div). + + Slightly more expensive (one uwide_inc) than RTZ rounding. Rounding + error in (-1/2,1/2] ulp. (ulp==2^-30). */ + +static inline uint64_t +fxp_div_rnz( uint64_t x, + uint64_t y, + uint64_t * _c ) { + if( !y ) { *_c = UINT64_MAX; return UINT64_C(0); } /* Handle divide by zero */ + uint64_t zh,zl; fxp_div_expand( &zh,&zl, x ); /* 2^30 x <= 2^94-2^30 so no overflow */ + uwide_inc( &zh,&zl, zh,zl, (y-UINT64_C(1))>>1 ); /* <= 2^94-2^30 + 2^63-1 so no overflow */ + uwide_div( &zh,&zl, zh,zl, y ); /* <= ceil(2^30 x/y) <= 2^94-2^30 so no overflow */ + *_c = zh; return zl; +} + +static inline uint64_t fxp_div_rnz_fast( uint64_t x, uint64_t y ) { return ((x<<30)+((y-UINT64_C(1))>>1)) / y; } + +/* rna -> Round nearest with ties away from zero (aka grade school rounding) + Fast variant assumes y!=0 and 2^30*x+floor(y/2)<2^64 (i.e. exact result < ~2^34) + + Consider: + z = floor( (2^30 x + floor( y/2 )) / y ) + where y>0. + + If y is even: odd + z = floor( (2^30 x + (y/2)) / y ) = floor( (2^30 x + (y-1)/2) / y ) + or: + z y + r = 2^30 x + (y/2) = 2^30 x + (y-1)/2 + for some r in [0,y-1]. Or: + z y = 2^30 x + delta = 2^30 x + delta + where: + delta in [-y/2+1,y/2] in [-y/2+1/2,y/2-1/2] + or: + z = 2^30 x / y + epsilon = 2^30 x / y + epsilon + where: + epsilon in [-1/2+1/y,1/2] in [-1/2+1/(2y),1/2-1/(2y)] + Thus we have: + 2^30 x/y - 1/2 < z <= 2^30 x/y + 1/2 2^30 x/y - 1/2 < z < 2^30 x/y + 1/2 + + Combining yields: + + 2^30 x/y - 1/2 < z <= 2^30 x/y + 1/2 + + Thus, the z computed as per the above is the RNA rounded result. As + 2^30 x might overflow 64 bits, we need to expand x (fxp_div_expand), + increment it by the 64b y>>1 (uwide_inc) and then use a + 128b/64b->128b divider (uwide_div). + + Slightly more expensive (one uwide_inc) than RTZ rounding. Rounding + error in [-1/2,1/2) ulp. (ulp==2^-30). 
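
/* Aside (not part of the diff): the "add floor(y/2) before dividing"
   trick used by the rna variants, shown for plain unsigned division.
   div_rna(n,y) computes n/y rounded to nearest with ties away from
   zero, assuming y!=0 and n + y/2 does not overflow; the name is an
   assumption of this sketch. */

#include <stdint.h>
#include <stdio.h>

static uint64_t div_rna( uint64_t n, uint64_t y ) { return (n + (y>>1)) / y; }

int
main( void ) {
  /* 7/3 = 2.33.. -> 2, 8/3 = 2.66.. -> 3, 5/2 = 2.5 (tie) -> 3 */
  printf( "%lu %lu %lu\n",
          (unsigned long)div_rna( UINT64_C(7), UINT64_C(3) ),
          (unsigned long)div_rna( UINT64_C(8), UINT64_C(3) ),
          (unsigned long)div_rna( UINT64_C(5), UINT64_C(2) ) );
  return 0;
}
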
+ + Probably worth noting that if y has its least significant bit set, + all the rnz/rna/rne/rno modes yield the same result (as ties aren't + possible) and this is the cheapest of the round nearest modes.*/ + +static inline uint64_t +fxp_div_rna( uint64_t x, + uint64_t y, + uint64_t * _c ) { + if( !y ) { *_c = UINT64_MAX; return UINT64_C(0); } /* Handle divide by zero */ + uint64_t zh,zl; fxp_div_expand( &zh,&zl, x ); /* 2^30 x <= 2^94-2^30 so no overflow */ + uwide_inc( &zh,&zl, zh,zl, y>>1 ); /* <= 2^94-2^30 + 2^63-1 so no overflow */ + uwide_div( &zh,&zl, zh,zl, y ); /* <= ceil(2^30 x/y) <= 2^94-2^30 so no overflow */ + *_c = zh; return zl; +} + +static inline uint64_t fxp_div_rna_fast( uint64_t x, uint64_t y ) { return ((x<<30)+(y>>1)) / y; } + +/* rne -> Round nearest with ties toward even (aka banker's rounding / IEEE style rounding) + Fast variant assumes y!=0 and 2^30 x < 2^64 (i.e. exact result < ~2^34) + + Based on computing (when y>0): + + q y + r = 2^30 x + + where q = floor( 2^30 x / y ) and r is in [0,y-1]. + + If r < y/2, the result should round down. And if r > y/2 the result + should round up. If r==y/2 (which is only possible if y is even), + the result should round down / up when q is even / odd. + + Combining yields we need to round up when: + + r>floor(y/2) or (r==floor(y/2) and y is even and q is odd) + + As 2^30 x might overflow 64 bits, we need to expand x + (fxp_div_expand). Since we need both the 128b quotient and the 64b + remainder, we need a 128b/64b->128b,64b divider (uwide_divrem ... if + there was a way to quickly determine if floor( 2^30 x / y ) is even + or odd, we wouldn't need the remainder and could select the + appropriate RNZ/RNA based uwide_inc increment) and then a 128b + conditional increment (uwide_inc). + + The most accurate style of rounding usually and somewhat more + expensive (needs remainder, some sequentially dependent bit ops and + one uwide_inc) than RTZ rounding. Rounding error in [-1/2,1/2] ulp + (unbiased). (ulp==2^-30). */ + +static inline uint64_t +fxp_div_rne( uint64_t x, + uint64_t y, + uint64_t * _c ) { + if( !y ) { *_c = UINT64_MAX; return UINT64_C(0); } /* Handle divide by zero */ + uint64_t zh,zl; fxp_div_expand( &zh,&zl, x ); /* 2^30 x <= 2^94-2^30 so no overflow */ + uint64_t r = uwide_divrem( &zh,&zl, zh,zl, y ); /* *y + r = 2^30 x where r is in [0,y-1] so no overflow */ + uint64_t flhy = y>>1; /* floor(y/2) so no overflow */ + uwide_inc( &zh,&zl, zh,zl, (uint64_t)( (r>flhy) | ((r==flhy) & !!((~y) & zl & UINT64_C(1))) ) ); /* <= ceil( 2^30 x / y ) <= 2^94-2^30 so no overflow */ + *_c = zh; return zl; +} + +static inline uint64_t +fxp_div_rne_fast( uint64_t x, + uint64_t y ) { + uint64_t n = x << 30; + uint64_t q = n / y; + uint64_t r = n - q*y; + uint64_t flhy = y>>1; + return q + (uint64_t)( (r>flhy) | ((r==flhy) & !!((~y) & q & UINT64_C(1))) ); +} + +/* rno -> Round nearest with ties toward odd + Fast variant assumes y!=0 and 2^30 x < 2^64 (i.e. exact result < ~2^34) + + Similar considerations as RNE with the parity for rounding on ties + swapped. + + Somewhat more expensive (needs remainder, some sequentially dependent + bit ops and one uwide_inc) than RTZ rounding. Rounding error in + [-1/2,1/2] ulp (unbiased). (ulp==2^-30). 
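
/* Aside (not part of the diff): the round-to-nearest-even decision
   used by the rne variant, applied to plain unsigned operands so it
   can be checked by hand.  Rounds up when the remainder exceeds
   floor(y/2), or when it equals floor(y/2) (a true tie, only possible
   for even y) and the truncated quotient is odd.  div_rne is an
   illustrative name; it assumes y!=0. */

#include <stdint.h>
#include <stdio.h>

static uint64_t
div_rne( uint64_t n, uint64_t y ) {
  uint64_t q    = n / y;
  uint64_t r    = n - q*y;
  uint64_t flhy = y>>1;
  return q + (uint64_t)( (r>flhy) | ((r==flhy) & !!((~y) & q & UINT64_C(1))) );
}

int
main( void ) {
  /* Ties go to the even quotient: 5/2 = 2.5 -> 2, 7/2 = 3.5 -> 4, 8/3 = 2.66.. -> 3 */
  printf( "%lu %lu %lu\n",
          (unsigned long)div_rne( UINT64_C(5), UINT64_C(2) ),
          (unsigned long)div_rne( UINT64_C(7), UINT64_C(2) ),
          (unsigned long)div_rne( UINT64_C(8), UINT64_C(3) ) );
  return 0;
}
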
*/ + +static inline uint64_t +fxp_div_rno( uint64_t x, + uint64_t y, + uint64_t * _c ) { + if( !y ) { *_c = UINT64_MAX; return UINT64_C(0); } /* Handle divide by zero */ + uint64_t zh,zl; fxp_div_expand( &zh,&zl, x ); /* 2^30 x <= 2^94-2^30 so no overflow */ + uint64_t r = uwide_divrem( &zh,&zl, zh,zl, y ); /* *y + r = 2^30 x where r is in [0,y-1] so no overflow */ + uint64_t flhy = y>>1; /* floor(y/2) so no overflow */ + uwide_inc( &zh,&zl, zh,zl, (uint64_t)( (r>flhy) | ((r==flhy) & !!((~y) & (~zl) & UINT64_C(1))) ) ); /* <= ceil( 2^30 x / y ) <= 2^94-2^30 so no overflow */ + *_c = zh; return zl; +} + +static inline uint64_t +fxp_div_rno_fast( uint64_t x, + uint64_t y ) { + uint64_t n = x << 30; + uint64_t q = n / y; + uint64_t r = n - q*y; + uint64_t flhy = y>>1; + return q + (uint64_t)( (r>flhy) | ((r==flhy) & !!((~y) & (~q) & UINT64_C(1))) ); +} + +/* Other rouding modes: + rdn -> Round down / toward floor / toward -inf ... same as rtz for unsigned + rup -> Round up / toward ceil / toward +inf ... same as raz for unsigned + rnd -> Round nearest with ties down / toward floor / toward -inf ... same as rnz for unsigned + rnu -> Round nearest with ties up / toward ceil / toward -inf ... same as rna for unsigned */ + +static inline uint64_t fxp_div_rdn( uint64_t x, uint64_t y, uint64_t * _c ) { return fxp_div_rtz( x, y, _c ); } +static inline uint64_t fxp_div_rup( uint64_t x, uint64_t y, uint64_t * _c ) { return fxp_div_raz( x, y, _c ); } +static inline uint64_t fxp_div_rnd( uint64_t x, uint64_t y, uint64_t * _c ) { return fxp_div_rnz( x, y, _c ); } +static inline uint64_t fxp_div_rnu( uint64_t x, uint64_t y, uint64_t * _c ) { return fxp_div_rna( x, y, _c ); } + +static inline uint64_t fxp_div_rdn_fast( uint64_t x, uint64_t y ) { return fxp_div_rtz_fast( x, y ); } +static inline uint64_t fxp_div_rup_fast( uint64_t x, uint64_t y ) { return fxp_div_raz_fast( x, y ); } +static inline uint64_t fxp_div_rnd_fast( uint64_t x, uint64_t y ) { return fxp_div_rnz_fast( x, y ); } +static inline uint64_t fxp_div_rnu_fast( uint64_t x, uint64_t y ) { return fxp_div_rna_fast( x, y ); } + +/* FIXED POINT SQRT ***************************************************/ + +/* Compute: + z/2^30 ~ sqrt( x/2^30 ) + under various rounding modes. */ + +/* rtz -> Round toward zero (aka truncate rounding) + Fast variant assumes x<2^34 + Based on: + z/2^30 ~ sqrt( x/2^30) + -> z ~ sqrt( 2^30 x ) + With RTZ rounding: + z = floor( sqrt( 2^30 x ) ) + Fastest style of rounding. Rounding error in [0,1) ulp. + (ulp==2^-30). */ + +static inline uint64_t +fxp_sqrt_rtz( uint64_t x ) { + + /* Initial guess. Want to compute + y = sqrt( x 2^30 ) + but x 2^30 does not fit into 64-bits at this point. So we instead + approximate: + y = sqrt( x 2^(2s) 2^(30-2s) ) + = sqrt( x 2^(2s) ) 2^(15-s) + ~ floor( sqrt( x 2^(2s) ) ) 2^(15-s) + where s is the largest integer such that x 2^(2s) does not + overflow. */ + + int s = (63-log2_uint64( x )) >> 1; /* lg x in [34,63], 63-lg x in [0,29], s in [0,14] when x>=2^34 */ + if( s>15 ) s = 15; /* s==15 when x<2^34 */ + uint64_t y = sqrt_uint64( x << (s<<1) ) << (15-s); /* All shifts well defined */ + if( s==15 ) return y; /* No iteration if x<2^34 */ + + /* Expand x to 2^30 x for the fixed point iteration */ + uint64_t xh,xl; fxp_div_expand( &xh,&xl, x ); + for(;;) { + + /* Iterate y' = floor( (y(y+1) + 2^30 x) / (2y+1) ). 
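
/* Aside (not part of the diff): the y' = floor( (y(y+1) + X) / (2y+1) )
   iteration used above, in a standalone floor(sqrt(X)) for 32-bit X so
   the intermediates comfortably fit in 64 bits without uwide_mul.  The
   crude y=1 starting guess just costs extra iterations; the real
   sqrt_uint64 starts from a much better guess.  isqrt32 is an
   illustrative name. */

#include <stdint.h>
#include <stdio.h>

static uint32_t
isqrt32( uint32_t x ) {
  if( !x ) return 0U;
  uint64_t X = (uint64_t)x;
  uint64_t y = UINT64_C(1);
  for(;;) {
    uint64_t y_next = ( y*(y+UINT64_C(1)) + X ) / ( (y<<1)+UINT64_C(1) );
    if( y_next==y ) break;   /* fixed point == floor(sqrt(X)) */
    y = y_next;
  }
  return (uint32_t)y;
}

int
main( void ) {
  printf( "%u %u %u\n", isqrt32( 99U ), isqrt32( 100U ), isqrt32( 4294967295U ) ); /* 9 10 65535 */
  return 0;
}
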
This is the + same iteration as sqrt_uint{8,16,32,64} (which converges on the + floor(sqrt(x)) but applied to the (wider than 64b) quantity + 2^30*x and then starting from an exceptionally good guess (such + that ~2 iterations should be needed at most). */ + + uint64_t yh,yl; + uwide_mul( &yh,&yl, y,y+UINT64_C(1) ); + uwide_add( &yh,&yl, yh,yl, xh,xl, UINT64_C(0) ); + uwide_div( &yh,&yl, yh,yl, (y<<1)+UINT64_C(1) ); + if( yl==y ) break; + y = yl; + } + + return y; +} + +static inline uint64_t fxp_sqrt_rtz_fast( uint64_t x ) { return sqrt_uint64( x<<30 ); } + +/* raz -> Round away zero + Fast variant assumes x<2^34 + Based on: + z/2^30 ~ sqrt( x/2^30) + -> z ~ sqrt( 2^30 x ) + Let y by the RTZ rounded result: + y = floor( sqrt( 2^30 x ) ) + and consider the residual: + r = 2^30 x - y^2 + which, given the above, will be in [0,2y]. If r==0, the result + is exact and thus already correctly rounded. Otherwise, we need + to round up. We note that the residual of the RTZ iteration is + the same as this residual at convergence: + y = floor( (y^2 + y + 2^30 x) / (2y+1) ) + = (y^2 + y + 2^30 x - r') / (2y+1) + where r' in [0,2y]: + 2y^2 + y = y^2 + y + 2^30 x - r' + y^2 = 2^30 x - r' + r' = 2^30 x - y^2 + -> r' = r + Thus we can use explicitly compute the remainder or use uwide_divrem + in the iteration itself to produce the needed residual. + + Alternatively, the iteration + y = floor( (y^2 + y - 2 + 2^30 x) / (2y-1) ) + = floor( (y(y+1) + (2^30 x-2)) / (2y-1) ) + should converge on the RAZ rounded result as: + y = (y^2 + y - 2 + 2^30 x - r'') / (2y-1) + where r'' in [0,2y-2] + 2y^2 - y = y^2 + y - 2 + 2^30 x - r'' + y^2 - 2 y + 2 + r'' = 2^30 x + Thus at r'' = 0: + (y-1)^2 + 1 = 2^30 x + -> (y-1)^2 < 2^30 x + and at r'' = 2y-2 + y^2 = 2^30 x + such that: + (y-1)^2 < 2^30 x <= y^2 + which means y is the correctly rounded result. + + Slightly more expensive than RTZ rounding. Rounding error in (-1,0] + ulp. (ulp==2^-30). */ + +static inline uint64_t +fxp_sqrt_raz( uint64_t x ) { + uint64_t xh, xl; + + /* Same guess as rtz rounding */ + int s = (63-log2_uint64( x )) >> 1; + if( s>15 ) s = 15; + xl = x << (s<<1); + uint64_t y = sqrt_uint64( xl ) << (15-s); + if( s==15 ) return y + (uint64_t)!!(xl-y*y); /* Explicitly compute residual to round when no iteration needed */ + + /* Use the modified iteration to converge on raz rounded result */ + fxp_div_expand( &xh,&xl, x ); + uwide_dec( &xh,&xl, xh, xl, UINT64_C(2) ); + for(;;) { + uint64_t yh,yl; + uwide_mul( &yh,&yl, y, y+UINT64_C(1) ); + uwide_add( &yh,&yl, yh,yl, xh,xl, UINT64_C(0) ); + uwide_div( &yh,&yl, yh,yl, (y<<1)-UINT64_C(1) ); + if( yl==y ) break; + y = yl; + } + + return y; +} + +static inline uint64_t +fxp_sqrt_raz_fast( uint64_t x ) { + uint64_t xl = x<<30; + uint64_t y = sqrt_uint64( xl ); + uint64_t r = xl - y*y; + return y + (uint64_t)!!r; +} + +/* rnz/rna/rne/rno -> Round nearest with ties toward zero/away zero/toward even/toward odd + Fast variant assumes x<2^34 + Based on: + z/2^30 ~ sqrt( x/2^30) + -> z ~ sqrt( 2^30 x ) + Assuming there are no ties, we want to integer z such that: + (z-1/2)^2 < 2^30 x < (z+1/2)^2 + z^2 - z + 1/4 < 2^30 x < z^2 + z + 1/4 + since z is integral, this is equivalent to finding a z such that: + -> z^2 - z + 1 <= 2^30 x < z^2 + z + 1 + -> r = 2^30 x - (z^2 - z + 1) and is in [0,2z) + This suggests using the iteration: + z = floor( (z^2 + z - 1 + 2^30 x) / (2z) ) + = floor( (z(z+1) + (2^30 x-1)) / (2z) ) + which, at convergence, has: + 2z^2 = z^2 + z - 1 + 2^30 x - r' + where r' is in [0,2z). 
Solving for r', at convergence: + r' = 2^30 x - (z^2 - z + 1) + r' = r + Thus, this iteration converges to the correctly rounded z when there + are no ties. But this also demonstrates that no ties are possible + when z is integral ... the above derivation hold when either endpoint + of the initial inequality is closed because the endpoint values are + fraction and cannot be exactly met for any integral z. As such, + there are no ties and all round nearest styles can use the same + iteration for the sqrt function. + + For computing a faster result for small x, let y by the RTZ rounded + result: + y = floor( sqrt( 2^30 x ) ) + and consider the residual: + r'' = 2^30 x - y^2 + which, given the above, will be in [0,2y]. If r''==0, the result is + exact and thus already correctly rounded. Otherwise, let: + z = y when r''<=y and y+1 when when r''>y + Consider r''' from the above for this z. + r''' = 2^30 x - z^2 + = 2^30 x - y^2 when r''<=y and 2^30 x - (y+1)^2 o.w. + = r''' when r''<=y and r'' - 2y - 1 o.w. + -> r''' in [0,y] when r''<=y and in [y+1,2y]-2y-1 o.w. + -> r''' in [0,y] when r''<=y and in [-y,-1]-2y-1 o.w. + -> r''' in [-y,y] and is negative when r''>y + -> r''' in [-z+1,z] + This implies that we have for + 2^30 x - (z^2-z+1) = r''' + z-1 is in [0,2z) + As such, z computed by this method is also correctly rounded. Thus + we can use explicitly compute the remainder or use uwide_divrem in + the iteration itself to produce the needed residual. + + Very slightly more expensive than RTZ rounding. Rounding error in + (-1/2,1/2) ulp. (ulp==2^-30). */ + +static inline uint64_t +fxp_sqrt_rnz( uint64_t x ) { + uint64_t xh, xl; + + /* Same guess as rtz rounding */ + int s = (63-log2_uint64( x )) >> 1; + if( s>15 ) s = 15; + xl = x << (s<<1); + uint64_t y = sqrt_uint64( xl ) << (15-s); + if( s==15 ) return y + (uint64_t)((xl-y*y)>y); /* Explicitly compute residual to round when no iteration needed */ + + /* Use the modified iteration to converge on rnz rounded result */ + fxp_div_expand( &xh,&xl, x ); /* 2^30 x */ + uwide_dec( &xh,&xl, xh,xl, UINT64_C(1) ); /* 2^30 x - 1 */ + for(;;) { + uint64_t yh,yl; + uwide_mul( &yh,&yl, y,y+UINT64_C(1) ); /* y^2 + y */ + uwide_add( &yh,&yl, yh,yl, xh,xl, UINT64_C(0) ); /* y^2 + y - 1 + 2^30 x */ + uwide_div( &yh,&yl, yh,yl, y<<1 ); + if( yl==y ) break; + y = yl; + } + + return y; +} + +static inline uint64_t +fxp_sqrt_rnz_fast( uint64_t x ) { + uint64_t xl = x<<30; + uint64_t y = sqrt_uint64( xl ); + uint64_t r = xl - y*y; + return y + (uint64_t)(r>y); +} + +static inline uint64_t fxp_sqrt_rna( uint64_t x ) { return fxp_sqrt_rnz( x ); } +static inline uint64_t fxp_sqrt_rne( uint64_t x ) { return fxp_sqrt_rnz( x ); } +static inline uint64_t fxp_sqrt_rno( uint64_t x ) { return fxp_sqrt_rnz( x ); } + +static inline uint64_t fxp_sqrt_rna_fast( uint64_t x ) { return fxp_sqrt_rnz_fast( x ); } +static inline uint64_t fxp_sqrt_rne_fast( uint64_t x ) { return fxp_sqrt_rnz_fast( x ); } +static inline uint64_t fxp_sqrt_rno_fast( uint64_t x ) { return fxp_sqrt_rnz_fast( x ); } + +/* Other rouding modes: + rdn -> Round down / toward floor / toward -inf ... same as rtz for unsigned + rup -> Round up / toward ceil / toward +inf ... same as raz for unsigned + rnd -> Round nearest with ties down / toward floor / toward -inf ... same as rnz for unsigned + rnu -> Round nearest with ties up / toward ceil / toward -inf ... 
same as rna for unsigned */ + +static inline uint64_t fxp_sqrt_rdn( uint64_t x ) { return fxp_sqrt_rtz( x ); } +static inline uint64_t fxp_sqrt_rup( uint64_t x ) { return fxp_sqrt_raz( x ); } +static inline uint64_t fxp_sqrt_rnd( uint64_t x ) { return fxp_sqrt_rnz( x ); } +static inline uint64_t fxp_sqrt_rnu( uint64_t x ) { return fxp_sqrt_rnz( x ); } + +static inline uint64_t fxp_sqrt_rdn_fast( uint64_t x ) { return fxp_sqrt_rtz_fast( x ); } +static inline uint64_t fxp_sqrt_rup_fast( uint64_t x ) { return fxp_sqrt_raz_fast( x ); } +static inline uint64_t fxp_sqrt_rnd_fast( uint64_t x ) { return fxp_sqrt_rnz_fast( x ); } +static inline uint64_t fxp_sqrt_rnu_fast( uint64_t x ) { return fxp_sqrt_rnz_fast( x ); } + +/* FIXED POINT LOG2 ***************************************************/ + +/* Compute: + + e + f/2^30 ~ log2( x/2^30 ) + + If x is non-zero, e will be in [-30,33] and f will be in [0,2^30] + + Note: This is not guaranteed to be exactly rounded and thus this + doesn't have variants for every rounding mode under the sun. The + current implementation has <=2 ulp error with a round-nearest flavor + though. + + Given the round-nearest flavor, it is possible to have a f/2^30=1 + exactly (e.g. log2_approx( UINT64_MAX ) will have e=33 and f/2^30=1 + such that e still is exactly determined by the index of x's most + significant bit but the fractional part is 1 after rounding up. + + It is possible modify this while retaining a round-nearest flavor + such that e is strictly in [-30,34] and f/2^30 is strictly in [0,1) + (e will no longer strictly be determined the index of x's most + significant bit so technically this retains less info than the + above). + + Likewise, it is possible to modify this to use a round-toward-zero + flavor such that e will be in [-30,33] and f/2^30 is in [0,1) always. + The resulting approximation accuracy would be slightly lower and + slightly biased. + + If x is zero, the result is undefined mathematically (the limit + x->zero+ is -inf ... this implementation returns i=INT_MIN<<-30 and + f=0 for specificity. */ + +static inline int /* e, in [-30,33] (==log2_uint64(x)-30 ... see note above) */ +fxp_log2_approx( uint64_t * _f, /* f, in [0,2^30] (yes, closed ... see note above) */ + uint64_t x ) { + + /* Handle bad inputs */ + + if( !x ) { *_f = UINT64_C(0); return INT_MIN; } + + /* Crack x into: + + x = 2^i ( 1 + y/2^63 ) + + where y is in [0,2^63). This can always be done _exactly_ for + non-zero x. That is, i is the index of x's most significant + non-zero bit (in [0,63]) and y is trailing i bits shifted up to be + 63b wide. */ + + int i = log2_uint64( x ); /* In [0,63] */ + uint64_t y = (x << (63-i)) - (UINT64_C(1)<<63); /* In [0,2^63) */ + + /* Convert this to a fixed point approximation of x: + + x ~ 2^i ( 1 + d/2^30 ) + + via: + + d = floor( (y+2^32) / 2^33 ) + + This representation is still exact when i <= 30 and at most 1/2 ulp + error in d when i>30 (rna / round nearest with ties away from zero + rounding ... consider using ties toward even or truncate rounding + here as per note above). Given the use of a round nearest style, d + is in [0,2^30] (closed at both ends). */ + + uint64_t d = (y + (UINT64_C(1)<<32)) >> 33; + + /* Given this, we have: + + e + f/2^30 = log2( x/2^30 ) + = log2( x ) - 30 + ~ log2( 2^i ( 1+ d/2^30 ) ) - 30 + = i-30 + log2( 1 + d/2^30 ) + + From this, we identify: + + e = i-30 (exact) + f/2^30 ~ log2( 1 + d/2^30 ) (approximate) + + Thus, f of a 30b fixed point lg1p calculator with a 30b fixed point + input. 
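+
+      For example (illustrative only): x = 3*2^30 (3.0 in 34.30 fixed
+      point) gives i=31, y=2^62 and d=2^29, so e=1 and f/2^30 should
+      come out ~ log2(1.5) ~ 0.585, recovering log2(3) ~ 1.585.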
The below is automatically generated code for a fixed point + implementation of a minimax Pade(4,3) approximant to log2(1+x) over + the domain [0,1]. If exact math, the approximation has an accuracy + better than 1/2 ulp over the whole domain and is exact at the + endpoints. As implemented the accuracy is O(1) ulp over the whole + domain (with round nearest flavored rounding) style), monotonic and + still exact at the endpoints. */ + + uint64_t f; + uint64_t g; + + /* BEGIN AUTOGENERATED CODE */ + /* bits 31.8 rms_aerr 1.9e-10 rms_rerr 1.3e-10 max_aerr 2.7e-10 max_rerr 2.7e-10 */ + + f = UINT64_C(0x0000000245c36b35); /* scale 41 bout 34 bmul - */ + f = UINT64_C(0x000000029c5b8e15) + ((f*d)>>36); /* scale 35 bout 34 bmul 64 */ + f = UINT64_C(0x0000000303d59639) + ((f*d)>>32); /* scale 33 bout 34 bmul 64 */ + f = UINT64_C(0x00000001715475cc) + ((f*d)>>31); /* scale 32 bout 34 bmul 64 */ + f = (f*d); /* scale 62 bout 64 bmul 64 */ + /* f max 0xd1fb651800000000 */ + + g = UINT64_C(0x000000024357c946); /* scale 37 bout 34 bmul - */ + g = UINT64_C(0x00000002a94e3723) + ((g*d)>>33); /* scale 34 bout 34 bmul 64 */ + g = UINT64_C(0x000000018b7f484d) + ((g*d)>>32); /* scale 32 bout 34 bmul 64 */ + g = UINT64_C(0x0000000100000000) + ((g*d)>>30); /* scale 32 bout 34 bmul 64 */ + /* g max 0x0000000347ed945f */ + + f = (f + (g>>1)) / g; /* RNA style rounding */ + /* END AUTOGENERATED CODE */ + + *_f = f; return i-30; +} + +/* FIXED POINT EXPONENTIALS *******************************************/ + +/* fxp_exp2_approx computes: + + y/2^30 ~ exp2( x/2^30 ) + + with an error of O(1) ulp for x/2^30 < ~1. This works identical to + exp2m1.h but uses a minimax polynomial that is better than 0.5 ulp + accurate in exact arithmetic. As implemented this is +/-1 ulp of the + correctly rounded RNE result when x<=2^30, has the leading ~30 bits + correct for larger x and is exact for input values that yield exactly + representable outputs. Returns UINT64_MAX if output would overflow a + 34.30u input. */ + +static inline uint64_t +fxp_exp2_approx( uint64_t x ) { + uint64_t i = x >> 30; + if( i>=UINT64_C(34) ) return UINT64_MAX; + uint64_t d = x & ((UINT64_C(1)<<30)-UINT64_C(1)); + uint64_t y; + /* BEGIN AUTOGENERATED CODE */ + /* bits 33.8 rms_aerr 4.7e-11 rms_rerr 3.5e-11 max_aerr 6.7e-11 max_rerr 6.6e-11 */ + y = UINT64_C(0x00000002d6e2cc42); /* scale 49 bout 34 bmul - */ + y = UINT64_C(0x0000000257c0894c) + ((y*d)>>33); /* scale 46 bout 34 bmul 64 */ + y = UINT64_C(0x00000002c01421b9) + ((y*d)>>33); /* scale 43 bout 34 bmul 64 */ + y = UINT64_C(0x000000027609e3a4) + ((y*d)>>33); /* scale 40 bout 34 bmul 64 */ + y = UINT64_C(0x00000001c6b2ea70) + ((y*d)>>33); /* scale 37 bout 34 bmul 64 */ + y = UINT64_C(0x00000001ebfbce13) + ((y*d)>>32); /* scale 35 bout 34 bmul 64 */ + y = UINT64_C(0x00000002c5c8603b) + ((y*d)>>31); /* scale 34 bout 34 bmul 64 */ + y = (y*d); /* scale 64 bout 64 bmul 64 */ + /* END AUTOGENERATED CODE */ + int s = 34-(int)i; + return ((y + (UINT64_C(1)<<(s-1))) >> s) + (UINT64_C(1)<<(64-s)); +} + +/* fxp_rexp2_approx computes: + + y/2^30 ~ exp2( -x/2^30 ) + + with an error of O(1) ulp everywhere. This works identical to + rexp2.h but uses a minimax polynomial that is better than 0.5 ulp + accurate in exact arithmetic. As implemented this is +/-1 ulp of the + correctly rounded RNE result everywhere and exact for input values + that have exactly representable outputs. 
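+
+   A few illustrative spot checks (assumed, not a spec): an input of 0
+   should return 2^30 (exp2(-0)==1), an input of 2^30 should return
+   exactly 2^29 (exp2(-1)==1/2), and any input at or above 31*2^30
+   returns 0 since the result would round to zero at 30 fractional
+   bits.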
*/ + +static inline uint64_t +fxp_rexp2_approx( uint64_t x ) { + uint64_t i = x >> 30; + if( i>=UINT64_C(31) ) return UINT64_C(0); + uint64_t d = x & ((UINT64_C(1)<<30)-UINT64_C(1)); + uint64_t y; + /* BEGIN AUTOGENERATED CODE */ + /* bits 35.4 rms_aerr 2.4e-11 rms_rerr 1.4e-11 max_aerr 3.3e-11 max_rerr 2.2e-11 */ + y = UINT64_C(0x00000002d6e2c6a2); /* scale 50 bout 34 bmul - */ + y = UINT64_C(0x0000000269e37ccb) - ((y*d)>>34); /* scale 46 bout 34 bmul 64 */ + y = UINT64_C(0x00000002b83379a8) - ((y*d)>>33); /* scale 43 bout 34 bmul 64 */ + y = UINT64_C(0x00000002762c0cea) - ((y*d)>>33); /* scale 40 bout 34 bmul 64 */ + y = UINT64_C(0x00000001c6af4b81) - ((y*d)>>33); /* scale 37 bout 34 bmul 64 */ + y = UINT64_C(0x00000001ebfbd6aa) - ((y*d)>>32); /* scale 35 bout 34 bmul 64 */ + y = UINT64_C(0x00000002c5c85fb1) - ((y*d)>>31); /* scale 34 bout 34 bmul 64 */ + y = UINT64_C(0x8000000000000000) - ((y*d)>> 1); /* scale 63 bout 64 bmul 64 */ + /* END AUTOGENERATED CODE */ + int s = 33+(int)i; + return (y + (UINT64_C(1)<<(s-1))) >> s; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_util_fxp_h_ */ diff --git a/program/src/oracle/util/hash.h b/program/src/oracle/util/hash.h new file mode 100644 index 000000000..4bdd1023a --- /dev/null +++ b/program/src/oracle/util/hash.h @@ -0,0 +1,74 @@ +#ifndef _pyth_oracle_util_hash_h_ +#define _pyth_oracle_util_hash_h_ + +/* Useful hashing utilities */ + +#include "compat_stdint.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* High quality (full avalanche) high speed integer to integer hashing. + hash_uint32 has the properties that [0,2^32) hashes to a random + looking permutation of [0,2^32) and hash(0)==0. Similarly for + hash_uint64. Based on Google's Murmur3 hash finalizer (public + domain). Not cryptographically secure but passes various strict + tests of randomness when used as a PRNG (see prng.h). */ + +static inline uint32_t +hash_uint32( uint32_t x ) { + x ^= x >> 16; + x *= UINT32_C( 0x85ebca6b ); + x ^= x >> 13; + x *= UINT32_C( 0xc2b2ae35 ); + x ^= x >> 16; + return x; +} + +static inline uint64_t +hash_uint64( uint64_t x ) { + x ^= x >> 33; + x *= UINT64_C( 0xff51afd7ed558ccd ); + x ^= x >> 33; + x *= UINT64_C( 0xc4ceb9fe1a85ec53 ); + x ^= x >> 33; + return x; +} + +/* Inverses of the above. E.g.: + hash_inverse_uint32( hash_uint32( (uint32_t)x ) )==(uint32_t)x + and: + hash_uint32( hash_inverse_uint32( (uint32_t)x ) )==(uint32_t)x + Similarly for hash_inverse_uint64. These by themselves are similar + quality hashes to the above and having the inverses of the above can + be useful. The fact these have (nearly) identical operations / + operation counts concretely demonstrates that none of these are + standalone cryptographically secure. */ + +static inline uint32_t +hash_inverse_uint32( uint32_t x ) { + x ^= x >> 16; + x *= UINT32_C( 0x7ed1b41d ); + x ^= (x >> 13) ^ (x >> 26); + x *= UINT32_C( 0xa5cb9243 ); + x ^= x >> 16; + return x; +} + +static inline uint64_t +hash_inverse_uint64( uint64_t x ) { + x ^= x >> 33; + x *= UINT64_C( 0x9cb4b2f8129337db ); + x ^= x >> 33; + x *= UINT64_C( 0x4f74430c22a54005 ); + x ^= x >> 33; + return x; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_util_hash_h_ */ + diff --git a/program/src/oracle/util/log2.h b/program/src/oracle/util/log2.h new file mode 100644 index 000000000..4093f7d5f --- /dev/null +++ b/program/src/oracle/util/log2.h @@ -0,0 +1,108 @@ +#ifndef _pyth_oracle_util_log2_h_ +#define _pyth_oracle_util_log2_h_ + +/* Portable robust integer log base 2 (i.e. 
find most significant bit). + Define PYTH_ORACLE_UTIL_LOG2_STYLE to indicate the implementation + style: + 0 - portable + 1 - use gcc's compiler builtins assuming uint32_t/uint64_t is an + "unsigned int"/"unsigned long". + Default is 0. */ + +#include "compat_stdint.h" + +#ifndef PYTH_ORACLE_UTIL_LOG2_STYLE +#define PYTH_ORACLE_UTIL_LOG2_STYLE 0 +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* Compute z = floor( log_2( x ) ) for positive x (i.e. return the + index of the most significant bit set in x. Exact. Undefined + behavior if x is not positive (can depend on the implementation + style). For the portable style, the cost is a O(log_2 N) fast + integer ops where N is bit width of x. */ + +#if PYTH_ORACLE_UTIL_LOG2_STYLE==0 + +static inline int /* In [0,7] */ +log2_uint8( uint8_t _x ) { /* Positive */ + uint32_t x = (uint32_t)_x; + int z = 0; + if( x >= (UINT32_C(1)<< 4) ) z += 4, x >>= 4; + if( x >= (UINT32_C(1)<< 2) ) z += 2, x >>= 2; + if( x >= (UINT32_C(1)<< 1) ) z += 1; + return z; +} + +static inline int /* In [0,15] */ +log2_uint16( uint16_t _x ) { /* Positive */ + uint32_t x = (uint32_t)_x; + int z = 0; + if( x >= (UINT32_C(1)<< 8) ) z += 8, x >>= 8; + if( x >= (UINT32_C(1)<< 4) ) z += 4, x >>= 4; + if( x >= (UINT32_C(1)<< 2) ) z += 2, x >>= 2; + if( x >= (UINT32_C(1)<< 1) ) z += 1; + return z; +} + +static inline int /* In [0,31] */ +log2_uint32( uint32_t x ) { /* Positive */ + int z = 0; + if( x >= (UINT32_C(1)<<16) ) z += 16, x >>= 16; + if( x >= (UINT32_C(1)<< 8) ) z += 8, x >>= 8; + if( x >= (UINT32_C(1)<< 4) ) z += 4, x >>= 4; + if( x >= (UINT32_C(1)<< 2) ) z += 2, x >>= 2; + if( x >= (UINT32_C(1)<< 1) ) z += 1; + return z; +} + +static inline int /* In [0,63] */ +log2_uint64( uint64_t x ) { /* Positive */ + int z = 0; + if( x >= (UINT64_C(1)<<32) ) z += 32, x >>= 32; + if( x >= (UINT64_C(1)<<16) ) z += 16, x >>= 16; + if( x >= (UINT64_C(1)<< 8) ) z += 8, x >>= 8; + if( x >= (UINT64_C(1)<< 4) ) z += 4, x >>= 4; + if( x >= (UINT64_C(1)<< 2) ) z += 2, x >>= 2; + if( x >= (UINT64_C(1)<< 1) ) z += 1; + return z; +} + +#elif PYTH_ORACLE_UTIL_LOG2_STYLE==1 + +static inline int log2_uint8 ( uint8_t x ) { return 31-__builtin_clz ( (unsigned int )x ); } +static inline int log2_uint16( uint16_t x ) { return 31-__builtin_clz ( (unsigned int )x ); } +static inline int log2_uint32( uint32_t x ) { return 31-__builtin_clz ( (unsigned int )x ); } +static inline int log2_uint64( uint64_t x ) { return 63-__builtin_clzl( (unsigned long)x ); } + +#else +#error "Unsupported PYTH_ORACLE_UTIL_LOG2_STYLE" +#endif + +static inline int /* In [0, 6] */ log2_int8 ( int8_t x /* positive */ ) { return log2_uint8 ( (uint8_t )x ); } +static inline int /* In [0,14] */ log2_int16( int16_t x /* positive */ ) { return log2_uint16( (uint16_t)x ); } +static inline int /* In [0,30] */ log2_int32( int32_t x /* positive */ ) { return log2_uint32( (uint32_t)x ); } +static inline int /* In [0,62] */ log2_int64( int64_t x /* positive */ ) { return log2_uint64( (uint64_t)x ); } + +/* Same as the above but these variants return default when x is zero + such that they have no undefined behavior */ + +static inline int log2_uint8_def ( uint8_t x, int def ) { return x ? log2_uint8 ( x ) : def; } +static inline int log2_uint16_def( uint16_t x, int def ) { return x ? log2_uint16( x ) : def; } +static inline int log2_uint32_def( uint32_t x, int def ) { return x ? log2_uint32( x ) : def; } +static inline int log2_uint64_def( uint64_t x, int def ) { return x ? 
log2_uint64( x ) : def; } + +static inline int log2_int8_def ( int8_t x, int def ) { return (x>(int8_t )0) ? log2_int8 ( x ) : def; } +static inline int log2_int16_def( int16_t x, int def ) { return (x>(int16_t)0) ? log2_int16( x ) : def; } +static inline int log2_int32_def( int32_t x, int def ) { return (x>(int32_t)0) ? log2_int32( x ) : def; } +static inline int log2_int64_def( int64_t x, int def ) { return (x>(int64_t)0) ? log2_int64( x ) : def; } + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_util_log2_h_ */ + diff --git a/program/src/oracle/util/prng.h b/program/src/oracle/util/prng.h new file mode 100644 index 000000000..08e42f4ff --- /dev/null +++ b/program/src/oracle/util/prng.h @@ -0,0 +1,366 @@ +#ifndef _pyth_oracle_util_prng_h_ +#define _pyth_oracle_util_prng_h_ + +/* Simple fast high quality non-cryptographic pseudo random number + generator. Supports parallel generation, interprocess shared memory + usage, checkpointing, random access, reversible, atomic, etc. Passes + extremely strict tests of randomness. + + Assumes hash.h provides a high quality 64<>64-bit integer hash + functions (i.e. full avalanche) with the property hash_uint64(0)==0, + hash_uint64(i) for i in [0,2^64) yields a permutation of [0,2^64) and + also provides reasonably efficient inverse of this. */ + +#include +#include "hash.h" /* includes stdint.h */ + +/* prng_t should be treated as an opaque handle of a pseudo random + number generator. (It technically isn't here to faciliate inlining + of prng operations.) */ + +struct prng { + uint64_t seq; + uint64_t idx; +}; + +typedef struct prng prng_t; + +#ifdef __cplusplus +extern "C" { +#endif + +/* Private: randomly expands an arbitrary 32-bit value into a unique + 64-bit non-sparse value such that the original 32-bit value can be + recovered and that 0 expands to something non-zero (the non-sparse + expansion helps reduce correlations between different sequences, the + zero to non-zero expansion means the common initialization of seq=0, + idx=0 doesn't yield 0 for the first random value as would happen for + hash functions that have the property hash_uint64(0)==0, the XOR + 64-bit const here is zero in the lower 32-bits and an arbitrary + non-zero in the upper 32-bits). For the current hash_uint64 + implementation and XOR 64-bit constant, the pop count of the expanded + seq is ~32.000 +/- ~4.000 and in [8,56] for all possible values of + seq (i.e. the expanded seq popcount is well approximated as a normal + with mean 32 and rms 4 and the extremes are in line with the expected + extremes for 2^32 samples). */ + +static inline uint64_t +prng_private_expand( uint32_t seq ) { + return hash_uint64( UINT64_C( 0x900df00d00000000 ) ^ (uint64_t)seq ); +} + +/* Private: extract the original 32-bit seq from its expanded value */ + +static inline uint32_t +prng_private_contract( uint64_t seq ) { + return (uint32_t)hash_inverse_uint64( seq ); +} + +/* prng_footprint and prng_align give the needed footprint and alignment + for a memory region to hold a prng's state. malloc / alloca / + declaration friendly (e.g. a memory region declared as "prng_t + _prng[1];", or created by "malloc(sizeof(prng_t))" or created by + "alloca(sizeof(prng_t))" will all automatically have the needed + footprint and alignment). + + prng_new takes ownership of the memory region pointed to by mem + (which is assumed to be non-NULL with the appropriate footprint and + alingment) and formats it as a prng. 
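+
+   A minimal usage sketch (illustrative; mirrors the unit tests in this
+   patch):
+
+     prng_t _prng[1];
+     prng_t * prng = prng_join( prng_new( _prng, seq, idx ) );
+     ... prng_uint32( prng ), prng_double_c0( prng ), etc ...
+     prng_delete( prng_leave( prng ) );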
The pseudo random number + generator stream will initialized to use sequence random sequence + "seq" and will start at index "idx". Returns mem (which will be + formatted for use). The caller will not be joined to the region on + return. + + prng_join joins the caller to a memory region holding the state of a + prng. _prng points to a memory region in the local address space + that holds prng. Returns an opaque handle of the local join in the + local address space to the prng (which might not be the same thing as + _prng ... the separation of new and join is to facilitate + interprocess shared memory usage patterns while supporting + transparent upgrades users of this to more elaborate PRNG algorithms + where address translations under the hood may not be trivial). + + prng_leave leaves the current prng join. Returns a pointer in the + local address space to the memory region holding the state of the + prng. The prng join should not be used afterward. + + prng_delete unformats the memory region currently used to hold the + state of a _prng and returns ownership of the underlying memory + region to the caller. There should be no joins in the system on the + prng. Returns a pointer to the underlying memory region. + + FIXME: CONSIDER FLATTENING ALIGN? */ + +static inline uint64_t prng_footprint( void ) { return (uint64_t)sizeof ( prng_t ); } +static inline uint64_t prng_align ( void ) { return (uint64_t)alignof( prng_t ); } + +static inline void * +prng_new( void * mem, + uint32_t seq, + uint64_t idx ) { + prng_t * prng = (prng_t *)mem; + prng->seq = prng_private_expand( seq ); + prng->idx = idx; + return (void *)prng; +} + +static inline prng_t * prng_join ( void * _prng ) { return (prng_t *)_prng; } +static inline void * prng_leave ( prng_t * prng ) { return (void *) prng; } +static inline void * prng_delete( void * _prng ) { return (void *)_prng; } + +/* prng_seq returns the sequence used by the prng. prng_idx returns the + next index that will be consumed by the prng. */ + +static inline uint32_t prng_seq( prng_t * prng ) { return prng_private_contract( prng->seq ); } +static inline uint64_t prng_idx( prng_t * prng ) { return prng->idx; } + +/* prng_seq_set sets the sequence to be used by the prng and returns the + replaced value. prng_idx_set sets the next index that will be + consumed by the prng and returns the replaced value. */ + +static inline uint32_t +prng_seq_set( prng_t * prng, + uint32_t seq ) { + uint32_t old = prng_seq( prng ); + prng->seq = prng_private_expand( seq ); + return old; +} + +static inline uint64_t +prng_idx_set( prng_t * prng, + uint64_t idx ) { + uint64_t old = prng_idx( prng ); + prng->idx = idx; + return old; +} + +/* prng_uintN returns the next integer in the PRNG sequence in [0,2^N) + with uniform probability with a period of 2^64 (prng_uint64 has a + period of 2^63 as consumes two prng indices). Passes various strict + PRNG tests (e.g. diehard, dieharder, testu01, etc). Assumes prng is + a current join. prng_intN is the same as prng_uintN but returns a + value in [0,2^(N-1)). (A signed generator that can assume all + possible values of of a signed int uniform IID can be obtained by + casting the output of the unsigned generator of the same, assuming + typical twos complement arithmetic platform.) + + The theory for this that hash_uint64(i) for i in [0,2^64) specifies a + random looking permutation of the integers in [0,2^64). 
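+
+   (Operationally, given the seq/idx state above, the value consumed at
+   index idx of sequence seq is just hash_uint64( expanded_seq ^ idx ),
+   which is why checkpointing and random access amount to saving or
+   setting idx.)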
Returning + the low order bits of this random permutation then yields a high + quality non-cryptographic random number stream automatically as it: + + - Has a long period (2^64). This is implied by the permutation + property. + + - Has the expected random properties (as theoretically best possible + for a finite period generator) of a true uniform IID bit source. + For example, the probability of next random number is uniform and + independent of previous N random numbers for N<<2^64). This is + also implied by the full avalanche and permutation property. + + - Is "unpredictable". That is, the internal state of the generator + is difficult to recover from its outputs, e.g. a return from + prng_uint32 could be have been generated from 2^32 internal states + (if the sequence is known), up to 2^32 sequences (if the state is + known) and up to 2^64 (state,seq) pairs neither is known (the state + / sequence is potentially recoverable given a long enough stream of + values though). This is implied by the truncation of hash values. + + To turn this into parameterizable family of generators, note that + hash_uint64( perm_j( i ) ) where j is some parameterized family of + random permutations is still a permutation and would have all the + above properties for free so long as no perm_j is similar to the + inverse of the hash permutation. Practically, xoring i by a + non-sparse 64-bit number will ultra cheaply generate a wide family of + "good enough" permutations to do a parameterized shuffling of the + original hash_uint64 permutation, creating a large number of parallel + sequences. Since users are empirically notoriously bad at seeding + though, we only let the user specify a 32-bit sequence id and then + generate a unique non-sparse 64-bit random looking number from it. */ + +static inline uint8_t prng_uint8 ( prng_t * prng ) { return (uint8_t )hash_uint64( prng->seq ^ (prng->idx++) ); } +static inline uint16_t prng_uint16( prng_t * prng ) { return (uint16_t)hash_uint64( prng->seq ^ (prng->idx++) ); } +static inline uint32_t prng_uint32( prng_t * prng ) { return (uint32_t)hash_uint64( prng->seq ^ (prng->idx++) ); } + +static inline uint64_t +prng_uint64( prng_t * prng ) { + uint64_t hi = (uint64_t)prng_uint32( prng ); + return (hi<<32) | (uint64_t)prng_uint32( prng ); +} + +static inline int8_t prng_int8 ( prng_t * prng ) { return (int8_t )( prng_uint8 ( prng ) >> 1 ); } +static inline int16_t prng_int16( prng_t * prng ) { return (int16_t)( prng_uint16( prng ) >> 1 ); } +static inline int32_t prng_int32( prng_t * prng ) { return (int32_t)( prng_uint32( prng ) >> 1 ); } +static inline int64_t prng_int64( prng_t * prng ) { return (int64_t)( prng_uint64( prng ) >> 1 ); } + +/* These quickly and robustly convert uniform random integers into + uniform random floating point with appropriate rounding. Intervals + are: + + *_o -> (0,1) + *_c0 -> [0,1) + *_c1 -> (0,1] + *_c -> [0,1] + + To provide more specifics, let the real numbers from [0,1) be + partitioned into N uniform disjoint intervals such that interval i + goes from [i/N,(i+1)/N) where i is in [0,N). For single (double) + precision, "float" ("double"), the largest N for which the range of + each interval is _exactly_ representable is N = 2^24 (2^53). + + Given then a uniform IID uint32_t random input, the + prng_uint32_to_float_c0 converter output is as though an continuous + IID uniform random in [0,1) was generated and then rounded down to + the start of the containing interval (2^24 intervals). 
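+
+   (E.g., illustratively: u=0 maps to +0 and u=0xffffffff maps to
+   (2^24-1)/2^24 ~ 0.99999994, one interval below 1.)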
As such, this + generator can never return exactly 1 but it can exactly return +0. + Since floats have higher resolution near 0 than 1, this will not + return all float possible representations in [0,1) but it can return + all possible float representations in [1/2,1). In particular, this + will never return a denorm or -0. + + Similarly for prng_uint32_to_float_c1 converter rounds up to the end + of the containing interval / start of the next interval (2^24 + intervals). As such, this converter can never return exactly +0 but + it can exactly return 1. It will not return all possible float + representations in (0,1] but it can return all possible float + representations in [1/2,1]. This will never return a denorm or -0. + + The prng_uint32_to_float_c converter rounds toward nearest even + toward the start containing interval or start of the next interval + (2^24 intervals). As such, this can return both exactly +0 and + exactly 1 (and the probability of returning exactly +0 == probability + of exactly 1 == (1/2) probability all other possible return values). + It will not return all possible float representations in [0,1] but it + can return all float possible representations in [1/2,1]. This will + never return a denorm or -0. + + The prng_uint32_to_float_o converter rounds toward the middle of + containing internal (2^23 intervals ... note that then in a sense + this converter is 1-bit less accurate than the above). As such, this + can neither return +0 nor 1 and will not return all possible float + representations in (0,1). This will never return a denorm or -0. + + Similarly for the double variants (*_{c0,c1,c} uses 2^53 intervals + and o uses 2^52 intervals). + + Note that it is possible to make converters that will handle exactly + rounding toward all possible floating point representations in [0,1] + but this are significantly more expensive. + + Assumes IEEE-754 style float and doubles. + + FIXME: ADD UNIT TEST COVERAGE */ + +static inline float +prng_uint32_to_float_c0( uint32_t u ) { + return (1.f/(float )(INT32_C(1)<<24))*(float)(int32_t)( u>>(32-24) ); +} + +static inline float +prng_uint32_to_float_c1( uint32_t u ) { + return (1.f/(float )(INT32_C(1)<<24))*(float)(int32_t)((u>>(32-24))+ UINT32_C(1) ); +} + +static inline float +prng_uint32_to_float_c( uint32_t u ) { + return (1.f/(float )(INT32_C(1)<<24))*(float)(int32_t)((u>>(32-24))+(u&UINT32_C(1))); +} + +static inline float +prng_uint32_to_float_o( uint32_t u ) { + return (1.f/(float )(INT32_C(1)<<24))*(float)(int32_t)((u>>(32-24))| UINT32_C(1) ); +} + +static inline double +prng_uint64_to_double_c0( uint64_t u ) { + return (1. /(double)(INT64_C(1)<<53))*(double)(int64_t)( u>>(64-53) ); +} + +static inline double +prng_uint64_to_double_c1( uint64_t u ) { + return (1. /(double)(INT64_C(1)<<53))*(double)(int64_t)((u>>(64-53))+ UINT64_C(1) ); +} + +static inline double +prng_uint64_to_double_c( uint64_t u ) { + return (1. /(double)(INT64_C(1)<<53))*(double)(int64_t)((u>>(64-53))+(u&UINT64_C(1))); +} + +static inline double +prng_uint64_to_double_o( uint64_t u ) { + return (1. 
/(double)(INT64_C(1)<<53))*(double)(int64_t)((u>>(64-53))| UINT64_C(1) ); +} + +static inline float prng_float_c0 ( prng_t * prng ) { return prng_uint32_to_float_c0 ( prng_uint32( prng ) ); } +static inline float prng_float_c1 ( prng_t * prng ) { return prng_uint32_to_float_c1 ( prng_uint32( prng ) ); } +static inline float prng_float_c ( prng_t * prng ) { return prng_uint32_to_float_c ( prng_uint32( prng ) ); } +static inline float prng_float_o ( prng_t * prng ) { return prng_uint32_to_float_o ( prng_uint32( prng ) ); } + +static inline double prng_double_c0( prng_t * prng ) { return prng_uint64_to_double_c0( prng_uint64( prng ) ); } +static inline double prng_double_c1( prng_t * prng ) { return prng_uint64_to_double_c1( prng_uint64( prng ) ); } +static inline double prng_double_c ( prng_t * prng ) { return prng_uint64_to_double_c ( prng_uint64( prng ) ); } +static inline double prng_double_o ( prng_t * prng ) { return prng_uint64_to_double_o ( prng_uint64( prng ) ); } + +/* prng_int32_roll uses the given prng to roll an n-sided die where n + is the number of sides (assumed to be positive). That is returns + uniform IID rand in [0,n) even if n is not an exact power of two. + + Rejection method based. Specifically, the number of prng slots + consumed is typically 1 but theoretically might be higher + occassionally (64-bit wide types consume prng slots twice as fast). + + Deterministic prng slot consumption possible with a slightly more + approximate implementation (bound the number of iterations such that + this always consumes a fixed number of slot and accept the + practically infinitesimal bias when n is not a power of 2). + + FIXME: ADD UNIT TEST COVERAGE. */ + +static inline uint32_t +prng_private_roll32( prng_t * prng, + uint32_t n ) { + uint32_t r = (-n) % n; /* Compute 2^32 mod n = (2^32 - n) mod n = (-n) mod n, compile time for compile time n */ + uint32_t u; do u = prng_uint32( prng ); while( u64 + subtracts and 7 64*64->64 multiplies (similarly for the orders 2:6). + + REXP2_FXP_ORDER 5 is the default and has a worst case output accuracy + comparable IEEE single precision. + + In short, for integer x/2^30, it is the exact value correctly rounded + and, for IID random x in [0,2^30), the approximation errors can be + thought of as a random Gaussian perturbation on the output value such + that the approximation for a given order is the one that minimizes + the perturbation variance. */ + +#include "compat_stdint.h" + +#ifndef REXP2_FXP_ORDER +#define REXP2_FXP_ORDER 5 +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +static inline uint64_t +rexp2_fxp( uint64_t x ) { + + /* This calculation is based on splitting x into integer and + fractional bits such that: + x = 2^30 i + d + where d is in [0,2^30). Then we have: + y = 2^30 2^-i 2^( -d/2^30 ) + = (2^63 2^( -d/2^30 )) / 2^(63-30+i) + = (2^63 2^( -d/2^30 )) / 2^(33+i) + ~ floor( (2^63 2^(-d/2^30) + 2^(32+i)) / 2^(33+i) ) + Thus, the main challenge is computing an approximation of: + 2^63 2^(-d/2^30) + As this smoothly goes from 1 to 1/2, this is an ideal candidate for + a polynomial approximation (Pade' approximation might be even be + better if willing to accept the cost of the divide). We aren't too + particular about maxing out precision or precise rounding in the + interest of compactness and speed. 
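+
+      (Illustrative worked split: x = 5*2^29, i.e. 2.5, has i=2 and
+      d=2^29, so the result is ~ 2^30 * 2^-2 * 2^-0.5 ~ 189812531,
+      while exact integer inputs such as x = 2*2^30 reduce to
+      2^30>>2 = 2^28 exactly.)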
+ + The polynomial coefficients below were selected to minimize the RMS + error over the interval [0,1) while matching exactly at 0 and 1 + (RMS error was used over minimax as it has more intuitive error + properties for users typically). All but the leading coefficent + were scaled to be in the range [2^33,2^34) to minimize the + coefficient quantization error while yielding a value that cannot + not overflow 64-bits when multiplied by d. The intermediate shifts + needed for the Horner's rule application were then computed + accordingly. The leading coefficient scale was fixed at 2^63 to + enable quick-and-dirty implementation of rounding modes. */ + + uint64_t i = x >> 30; + if( i>UINT64_C(30) ) return UINT64_C(0); + + uint64_t d = x & ((UINT64_C(1)<<30)-UINT64_C(1)); + uint64_t y; + + /* BEGIN AUTOGENERATED CODE - See KJB for code generator */ +# if REXP2_FXP_ORDER==1 + /* bits 4.0 rms_aerr 1.3e-02 rms_rerr 1.8e-02 max_aerr 4.3e-02 max_rerr 6.1e-02 */ + /* As implemented: bits 4.0 max_aerr 4.3e-02 max_rerr 6.1e-02 ulp 46209195.0 */ + y = UINT64_C(0x8000000000000000) - (d<<32); +# elif REXP2_FXP_ORDER==2 + /* bits 8.3 rms_aerr 1.4e-03 rms_rerr 2.0e-03 max_aerr 1.9e-03 max_rerr 3.2e-03 */ + /* As implemented: bits 8.3 max_aerr 1.9e-03 max_rerr 3.2e-03 ulp 2086872.6 */ + y = UINT64_C(0x2c029d07d); + y = UINT64_C(0x2b00a741f) - ((y*d)>>32); + y = UINT64_C(0x8000000000000000) - ((y*d)>>1); +# elif REXP2_FXP_ORDER==3 + /* bits 13.0 rms_aerr 5.5e-05 rms_rerr 7.9e-05 max_aerr 8.6e-05 max_rerr 1.2e-04 */ + /* As implemented: bits 13.0 max_aerr 8.6e-05 max_rerr 1.2e-04 ulp 91856.9 */ + y = UINT64_C(0x288319c3e); + y = UINT64_C(0x3b33c6b15) - ((y*d)>>32); + y = UINT64_C(0x2c44c0101) - ((y*d)>>32); + y = UINT64_C(0x8000000000000000) - ((y*d)>>1); +# elif REXP2_FXP_ORDER==4 + /* bits 17.6 rms_aerr 1.6e-06 rms_rerr 2.4e-06 max_aerr 2.8e-06 max_rerr 5.0e-06 */ + /* As implemented: bits 17.6 max_aerr 2.8e-06 max_rerr 5.0e-06 ulp 2999.6 */ + y = UINT64_C(0x38100ce16); + y = UINT64_C(0x36871cfc4) - ((y*d)>>33); + y = UINT64_C(0x3d4dfa602) - ((y*d)>>32); + y = UINT64_C(0x2c5b2ce21) - ((y*d)>>32); + y = UINT64_C(0x8000000000000000) - ((y*d)>>1); +# elif REXP2_FXP_ORDER==5 + /* bits 22.8 rms_aerr 4.6e-08 rms_rerr 6.7e-08 max_aerr 7.7e-08 max_rerr 1.4e-07 */ + /* As implemented: bits 22.8 max_aerr 7.7e-08 max_rerr 1.4e-07 ulp 83.1 */ + y = UINT64_C(0x3e1a2f97e); + y = UINT64_C(0x25bc1de09) - ((y*d)>>34); + y = UINT64_C(0x38a155436) - ((y*d)>>32); + y = UINT64_C(0x3d7c8e03d) - ((y*d)>>32); + y = UINT64_C(0x2c5c78186) - ((y*d)>>32); + y = UINT64_C(0x8000000000000000) - ((y*d)>>1); +# elif REXP2_FXP_ORDER==6 + /* bits 27.9 rms_aerr 1.1e-09 rms_rerr 1.6e-09 max_aerr 2.1e-09 max_rerr 3.9e-09 */ + /* As implemented: bits 27.6 max_aerr 2.5e-09 max_rerr 4.9e-09 ulp 2.7 */ + y = UINT64_C(0x3959e0dfb); + y = UINT64_C(0x29cdf1eff) - ((y*d)>>34); + y = UINT64_C(0x273d8f899) - ((y*d)>>33); + y = UINT64_C(0x38d2ad669) - ((y*d)>>32); + y = UINT64_C(0x3d7f590ad) - ((y*d)>>32); + y = UINT64_C(0x2c5c85808) - ((y*d)>>32); + y = UINT64_C(0x8000000000000000) - ((y*d)>>1); +# elif REXP2_FXP_ORDER==7 + /* bits 33.5 rms_aerr 2.3e-11 rms_rerr 3.4e-11 max_aerr 4.4e-11 max_rerr 8.5e-11 */ + /* As implemented: bits 29.8 max_aerr 5.6e-10 max_rerr 1.1e-09 ulp 0.6 */ + y = UINT64_C(0x2d6cd448b); + y = UINT64_C(0x269cc5254) - ((y*d)>>34); + y = UINT64_C(0x2b82bc124) - ((y*d)>>33); + y = UINT64_C(0x2762b03ae) - ((y*d)>>33); + y = UINT64_C(0x38d5e75bc) - ((y*d)>>32); + y = UINT64_C(0x3d7f7ab76) - ((y*d)>>32); + y = UINT64_C(0x2c5c85fa8) - 
((y*d)>>32); + y = UINT64_C(0x8000000000000000) - ((y*d)>>1); +# else +# error "Unsupported REXP2_FXP_ORDER" +# endif + /* END AUTOGENERATED CODE */ + + /* At this point, y ~ 2^63 2^(-d/2^30) <= 2^63 while i is in [0,30]. + Do a rounding right shift to get the final result. */ + + int s = 33+(int)i; /* s is [33,63], s-1 in [32,62], y+(1<<(s-1)) at most <=2^63+2^62 and will not overflow */ + return (y + (UINT64_C(1)<<(s-1))) >> s; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_util_rexp2_h_ */ diff --git a/program/src/oracle/util/run_tests b/program/src/oracle/util/run_tests new file mode 100755 index 000000000..2d0d052af --- /dev/null +++ b/program/src/oracle/util/run_tests @@ -0,0 +1,69 @@ +#!/bin/sh + +module purge || exit 1 +module load gcc-9.3.0 || exit 1 + +./clean || exit 1 +mkdir -pv bin || exit 1 + +CC="gcc -g -Wall -Werror -Wextra -Wconversion -Wstrict-aliasing=2 -Wimplicit-fallthrough=2 -pedantic -D_XOPEN_SOURCE=600 -O2 -march=native -std=c17 -lm" + +set -x + +$CC test_align.c -o bin/test_align || exit 1 +$CC test_hash.c -o bin/test_hash || exit 1 +$CC test_prng.c -o bin/test_prng || exit 1 +$CC test_sar.c -o bin/test_sar || exit 1 +$CC test_round.c -o bin/test_round || exit 1 +$CC test_log2.c -o bin/test_log2 || exit 1 +$CC test_smag.c -o bin/test_smag || exit 1 +$CC test_rexp2.c -o bin/test_rexp2 || exit 1 +$CC test_exp2m1.c -o bin/test_exp2m1 || exit 1 +$CC test_uwide.c -o bin/test_uwide || exit 1 +$CC test_sqrt.c -o bin/test_sqrt || exit 1 +$CC test_fxp.c -o bin/test_fxp || exit 1 +$CC test_avg.c -o bin/test_avg || exit 1 + +bin/test_align || exit 1 +bin/test_hash || exit 1 +bin/test_prng || exit 1 +bin/test_sar || exit 1 +bin/test_round || exit 1 +bin/test_log2 || exit 1 +bin/test_smag || exit 1 +bin/test_rexp2 || exit 1 +bin/test_exp2m1 || exit 1 +bin/test_uwide || exit 1 +bin/test_sqrt || exit 1 +bin/test_fxp || exit 1 +bin/test_avg || exit 1 + +# PRNG batteries + +# Note that when running an individual test in a TestU01 test battery, +# the probability of that test failing even though the generator is fine +# is roughly 0.2%. Thus, when repeatedly running batteries that +# themselves contain a large number of tests, some test failures are +# expected. So long as the test failures are are sporadic, don't occur +# in the same place when running multiple times with random seq and/or +# idx, and don't have p-values improbably close to 0 +# (p<<<1/overall_num_tests_run) or 1 (1-p<<<1/overall_num_tests_run), it +# is expected and normal. Because of this, TestU01 doesn't just fail if +# a handful of test fails ... need to read the reports produced to see +# how well the generator is actually behaving. + +# Point the TESTU01 variable at where TestU01 is installed. Testing +# thus far has been done with TestU01-1.2.3. 
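+# For example (illustrative), to re-run one of the batteries below with
+# a different sequence and starting index:
+#
+#   bin/test_prng_battery 2 1 12345   # SmallCrush (seq 1, idx 12345)
+#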
+ +#TESTU01=~/nfs/TestU01 +# +#$CC test_prng_battery.c -o bin/test_prng_battery -I$TESTU01/include $TESTU01/lib/libtestu01.a $TESTU01/lib/libprobdist.a $TESTU01/lib/libmylib.a || exit 1 +# +#bin/test_prng_battery 0 0 0 || exit 1 # FIPS-140.2 (seq 0, idx 0) +#bin/test_prng_battery 1 0 0 || exit 1 # PseudoDiehard (seq 0, idx 0) +#bin/test_prng_battery 2 0 0 || exit 1 # SmallCrush (seq 0, idx 0) +#bin/test_prng_battery 3 0 0 || exit 1 # Crush: Takes >~20 min (seq 0, idx 0) +#bin/test_prng_battery 4 0 0 || exit 1 # BigCrush: Takes >~3 hours (seq 0, idx 0) + +echo all tests passed + diff --git a/program/src/oracle/util/sar.h b/program/src/oracle/util/sar.h new file mode 100644 index 000000000..1fbd8b30b --- /dev/null +++ b/program/src/oracle/util/sar.h @@ -0,0 +1,71 @@ +#ifndef _pyth_oracle_util_sar_h_ +#define _pyth_oracle_util_sar_h_ + +/* Portable arithmetic shift right. Define PYTH_ORACLE_UTIL_SAR_STYLE + to indicate the implementation style: + 0 - portable (i.e. platform/compiler does not do arithmetic right + shift for signed integers or is not known to) + 1 - platform/compiler does do arithmetic right shits. + Default is 0. */ + +#include "compat_stdint.h" + +#ifndef PYTH_ORACLE_UTIL_SAR_STYLE +#define PYTH_ORACLE_UTIL_SAR_STYLE 0 +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if PYTH_ORACLE_UTIL_SAR_STYLE==0 + +static inline int8_t +sar_int8( int8_t x, + int n ) { /* Assumed in [0,7] */ + uint8_t ux = (uint8_t)x; + uint8_t s = (uint8_t)-(ux >> 7); /* get the sign, ((int8_t)s)==-1 if x<0 and 0 otherwise */ + return (int8_t)(((ux ^ s) >> n) ^ s); +} + +static inline int16_t +sar_int16( int16_t x, + int n ) { /* Assumed in [0,15] */ + uint16_t ux = (uint16_t)x; + uint16_t s = (uint16_t)-(ux >> 15); /* get the sign, ((int16_t)s)==-1 if x<0 and 0 otherwise */ + return (int16_t)(((ux ^ s) >> n) ^ s); +} + +static inline int32_t +sar_int32( int32_t x, + int n ) { /* Assumed in [0,31] */ + uint32_t ux = (uint32_t)x; + uint32_t s = -(ux >> 31); /* get the sign, ((int32_t)s)==-1 if x<0 and 0 otherwise */ + return (int32_t)(((ux ^ s) >> n) ^ s); +} + +static inline int64_t +sar_int64( int64_t x, + int n ) { /* Assumed in [0,63] */ + uint64_t ux = (uint64_t)x; + uint64_t s = -(ux >> 63); /* get the sign, ((int64_t)s)==-1 if x<0 and 0 otherwise */ + return (int64_t)(((ux ^ s) >> n) ^ s); +} + +#elif PYTH_ORACLE_UTIL_SAR_STYLE==1 + +static inline int8_t sar_int8 ( int8_t x, int n ) { return (int8_t )(x >> n); } +static inline int16_t sar_int16( int16_t x, int n ) { return (int16_t)(x >> n); } +static inline int32_t sar_int32( int32_t x, int n ) { return (int32_t)(x >> n); } +static inline int64_t sar_int64( int64_t x, int n ) { return (int64_t)(x >> n); } + +#else +#error "Unsupported PYTH_ORACLE_UTIL_SAR_STYLE" +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_util_sar_h_ */ + diff --git a/program/src/oracle/util/smag.h b/program/src/oracle/util/smag.h new file mode 100644 index 000000000..daf8a8e10 --- /dev/null +++ b/program/src/oracle/util/smag.h @@ -0,0 +1,54 @@ +#ifndef _pyth_oracle_util_smag_h_ +#define _pyth_oracle_util_smag_h_ + +/* Signed magnitude conversions to/from twos complement + representations. */ + +#include "compat_stdint.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Unpack a twos complement represention into signed magnitude + representation. Returns magnitude, assumes _s is non-NULL and *_s + will be 0 or 1 on return. For the 32-bit variant, return will be in + [0,2^31] and *_s==1 if the return is 2^31. Similarly for the other + widths. 
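+
+   For example (illustrative): unpacking -5 yields magnitude 5 with
+   *_s==1, unpacking +5 yields 5 with *_s==0, and unpacking INT32_MIN
+   yields 2^31 with *_s==1 (the one magnitude that does not fit in a
+   positive int32_t, hence the unsigned return type).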
*/ + +static inline uint8_t smag_unpack_int8 ( int8_t x, int * _s ) { int s = x 2 y^2 = y^2 + x - r + -> y^2 = x - r + + for some r in [0,2 y-1]. We note that if y = floor( sqrt( x ) ) + exactly though: + + y^2 <= x < (y+1)^2 + -> y^2 <= x < y^2 + 2 y + 1 + -> y^2 = x - r' + + for some r' in [0,2 y]. r' is r with the element 2 y added. And it + is possible to have y^2 = x - 2 y. Namely if x+1 = z^2 for integer + z, this becomes y^2 + 2 y + 1 = z^2 -> y = z-1. That is, when + x = z^2 - 1 for integral z, the relationship can never converge. If + we instead used a denominator of 2y+1 in the iteration, r would have + the necessary range: + + y' = floor( (y^2 + x) / (2 y + 1) ) + + At convergence we have: + + y = (y^2 + x - r) / (2 y+1) + -> 2 y^2 + y = (y^2 + x - r) + -> y^2 = x-y-r + + for some r in [0,2 y]. This isn't quite right but we change the + recurrence numerator to compensate: + + y' = floor( (y^2 + y + x) / (2 y + 1) ) + + At convergence we now have: + + y^2 = x-r + + for some r in [0,2 y]. That is, at convergence y = floor( sqrt(x) ) + exactly! The addition of y to the numerator has not made + intermediate overflow much more diffcult to deal with either as y <<< + x for large x. So to compute this without intermediate overflow, we + compute the terms individually and then combine the reminders + appropriately. x/(2y+1) term is trivial. The other term, + (y^2+y)/(2y+1) is asymptotically approximately y/2. Breaking it into + its asymptotic and residual: + + (y^2 + y) / (2y+1) = y/2 + ( y^2 + y - (y/2)(2y+1) ) / (2y+1) + = y/2 + ( y^2 + y - y^2 - y/2 ) / (2y+1) + = y/2 + (y/2) / (2y+1) + + For even y, y/2 = y>>1 = yh and we have the partial quotient yh and + remainder yh. For odd y, we have: + + = yh + (1/2) + (yh+(1/2)) / (2y+1) + = yh + ((1/2)(2y+1)+yh+(1/2)) / (2y+1) + = yh + (y+yh+1) / (2y+1) + + with partial quotent yh and remainder y+yh+1. This yields the + iteration: + + y ~ sqrt(x) // <<< INT_MAX for all x + for(;;) { + d = 2*y + 1; + qx = x / d; rx = x - qx*d; // Compute x /(2y+1), rx in [0,2y] + qy = y>>1; ry = (y&1) ? (y+yh+1) : yh; // Compute y^2/(2y+1), ry in [0,2y] + q = qx+qy; r = rx+ry; // Combine partials, r in [0,4y] + if( r>=d ) q++, r-=d; // Handle carry (at most 1), r in [0,2y] + if( y==q ) break; // At convergence y = floor(sqrt(x)) + y = q; + } + + The better the initial guess, the faster this will converge. Since + convergence is still quadratic though, it will converge even given + very simple guesses. We use: + + y = sqrt(x) = sqrt( 2^n + d ) <~ 2^(n/2) + + where n is the index of the MSB and d is in [0,2^n) (i.e. is n bits + wide). Thus: + + y ~ 2^(n>>1) if n is even and 2^(n>>1) sqrt(2) if n is odd + + and we can do a simple fixed point calculation to compute this. + + For small values of x, we encode a 20 entry 3-bit wide lookup table + in a 64-bit constant and just do a quick lookup. */ + +static inline uint8_t +sqrt_uint8( uint8_t x ) { /* FIXME: CONSIDER 8-bit->4-bit LUT INSTEAD (128bytes)? */ + if( x> (3*(int)x)) & UINT64_C(7)); + int n = log2_uint8( x ); + uint8_t y = (uint8_t)(((n & 1) ? UINT8_C(0xb) /* floor( 2^3 sqrt(2) ) */ : UINT8_C(0x8) /* 2^3 */) >> (3-(n>>1))); + for(;;) { + uint8_t d = (uint8_t)(y<<1); d++; + uint8_t qx = x / d; uint8_t rx = (uint8_t)(x - qx*d); + uint8_t qy = y>>1; uint8_t ry = (y & UINT8_C(1)) ? 
(uint8_t)(y+qy+UINT8_C(1)) : qy; + uint8_t q = (uint8_t)(qx+qy); uint8_t r = (uint8_t)(rx+ry); + if( r>=d ) q++; + if( y==q ) break; + y = q; + } + return y; +} + +static inline uint16_t +sqrt_uint16( uint16_t x ) { + if( x> (3*(int)x)) & UINT64_C(7)); + int n = log2_uint16( x ); + uint16_t y = (uint16_t)(((n & 1) ? UINT16_C(0xb5) /* floor( 2^7 sqrt(2) ) */ : UINT16_C(0x80) /* 2^7 */) >> (7-(n>>1))); + for(;;) { + uint16_t d = (uint16_t)(y<<1); d++; + uint16_t qx = x / d; uint16_t rx = (uint16_t)(x - qx*d); + uint16_t qy = y>>1; uint16_t ry = (y & UINT16_C(1)) ? (uint16_t)(y+qy+UINT16_C(1)) : qy; + uint16_t q = (uint16_t)(qx+qy); uint16_t r = (uint16_t)(rx+ry); + if( r>=d ) q++; + if( y==q ) break; + y = q; + } + return y; +} + +static inline uint32_t +sqrt_uint32( uint32_t x ) { + if( x> (3*(int)x)) & UINT64_C(7)); + if( !x ) return UINT32_C(0); + int n = log2_uint32( x ); + uint32_t y = ((n & 1) ? UINT32_C(0xb504) /* floor( 2^15 sqrt(2) ) */ : UINT32_C(0x8000) /* 2^15 */) >> (15-(n>>1)); + for(;;) { + uint32_t d = (y<<1); d++; + uint32_t qx = x / d; uint32_t rx = x - qx*d; + uint32_t qy = y>>1; uint32_t ry = (y & UINT32_C(1)) ? (y+qy+UINT32_C(1)) : qy; + uint32_t q = qx+qy; uint32_t r = rx+ry; + if( r>=d ) q++; + if( y==q ) break; + y = q; + } + return y; +} + +static inline uint64_t +sqrt_uint64( uint64_t x ) { + if( x> (3*(int)x)) & UINT64_C(7); + int n = log2_uint64( x ); + uint64_t y = ((n & 1) ? UINT64_C(0xb504f333) /* floor( 2^31 sqrt(2) ) */ : UINT64_C(0x80000000) /* 2^31 */) >> (31-(n>>1)); + for(;;) { + uint64_t d = (y<<1); d++; + uint64_t qx = x / d; uint64_t rx = x - qx*d; + uint64_t qy = y>>1; uint64_t ry = (y & UINT64_C(1)) ? (y+qy+UINT64_C(1)) : qy; + uint64_t q = qx+qy; uint64_t r = rx+ry; + if( r>=d ) q++; + if( y==q ) break; + y = q; + } + return y; +} + +/* These return floor( sqrt( x ) ), undefined behavior for negative x. */ + +static inline int8_t sqrt_int8 ( int8_t x ) { return (int8_t )sqrt_uint8 ( (uint8_t )x ); } +static inline int16_t sqrt_int16( int16_t x ) { return (int16_t)sqrt_uint16( (uint16_t)x ); } +static inline int32_t sqrt_int32( int32_t x ) { return (int32_t)sqrt_uint32( (uint32_t)x ); } +static inline int64_t sqrt_int64( int64_t x ) { return (int64_t)sqrt_uint64( (uint64_t)x ); } + +/* These return the floor( re sqrt(x) ) */ + +static inline int8_t sqrt_re_int8 ( int8_t x ) { return x>INT8_C( 0) ? (int8_t )sqrt_uint8 ( (uint8_t )x ) : INT8_C( 0); } +static inline int16_t sqrt_re_int16( int16_t x ) { return x>INT16_C(0) ? (int16_t)sqrt_uint16( (uint16_t)x ) : INT16_C(0); } +static inline int32_t sqrt_re_int32( int32_t x ) { return x>INT32_C(0) ? (int32_t)sqrt_uint32( (uint32_t)x ) : INT32_C(0); } +static inline int64_t sqrt_re_int64( int64_t x ) { return x>INT64_C(0) ? 
(int64_t)sqrt_uint64( (uint64_t)x ) : INT64_C(0); } + +/* These return the floor( sqrt( |x| ) ) */ + +static inline int8_t sqrt_abs_int8 ( int8_t x ) { return (int8_t )sqrt_uint8 ( (uint8_t )(x +#include +#include "util.h" + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + uint32_t shift_mask = (uint32_t)(sizeof(uintptr_t)*(size_t)CHAR_BIT); + if( !align_ispow2( (uintptr_t)shift_mask ) ) { + printf( "FAIL (platform)\n" ); + return 1; + } + shift_mask--; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + int ctr = 0; + for( int i=0; i<1000000000; i++ ) { + if( !ctr ) { printf( "Completed %i iterations\n", i ); ctr = 10000000; } + ctr--; + + /* Test align_ispow2 */ + + do { + uint32_t r = prng_uint32( prng ); + uint32_t s = (r>>8) & shift_mask; /* s is uniform IID valid right shift amount for a uintptr_t */ + r &= (uint32_t)255; /* r is 8-bit uniform IID rand (power of 2 with prob ~8/256=1/32) */ + uintptr_t x = ((uintptr_t)r) << s; /* x is power of 2 with prob ~1/32 */ + + int c = align_ispow2( x ); + int d = (x>(uintptr_t)0) && !(x & (x-(uintptr_t)1)); + if( c!=d ) { + printf( "FAIL (x %lx c %i d %i)\n", (unsigned long)x, c, d ); + return 1; + } + } while(0); + + /* Test align_up/align_dn/align_isaligned */ + + do { + uintptr_t a = ((uintptr_t)1) << (prng_uint32( prng ) & shift_mask); + + uintptr_t x = (uintptr_t)prng_uint64( prng ); + void * u = (void *)x; + void * v = align_dn( u, a ); + void * w = align_up( u, a ); + uintptr_t y = (uintptr_t)v; + uintptr_t z = (uintptr_t)w; + + int already_aligned = (u==v); + int already_aligned_1 = (u==w); + + uintptr_t mask = a-(uintptr_t)1; + if( already_aligned != already_aligned_1 || + align_isaligned( u, a )!=already_aligned || + !align_isaligned( v, a ) || + !align_isaligned( w, a ) || + ( x & ~mask) != y || + ((x+mask) & ~mask) != z ) { + printf( "FAIL (a %lx aa %i aa1 %i x %lx y %lx z %lx)\n", + (unsigned long)a, already_aligned, already_aligned_1, (unsigned long)x, (unsigned long)y, (unsigned long)z ); + return 1; + } + + } while(0); + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass\n" ); + + return 0; +} + diff --git a/program/src/oracle/util/test_avg.c b/program/src/oracle/util/test_avg.c new file mode 100644 index 000000000..ee07de0a9 --- /dev/null +++ b/program/src/oracle/util/test_avg.c @@ -0,0 +1,141 @@ +#include +#include "util.h" + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + int ctr = 0; + for( int i=0; i<1000000000; i++ ) { + if( !ctr ) { printf( "reg: Completed %i iterations\n", i ); ctr = 10000000; } + ctr--; + +# define TEST(w) do { \ + uint##w##_t x = prng_uint##w( prng ); \ + uint##w##_t y = prng_uint##w( prng ); \ + uint##w##_t z = avg_2_uint##w( x, y ); \ + uint##w##_t r = (uint##w##_t)(((uint64_t)x+(uint64_t)y) >> 1); \ + if( z!=r ) { \ + printf( "FAIL (iter %i op avg_2_uint" #w "x %lu y %lu z %lu r %lu\n", \ + i, (long unsigned)x, (long unsigned)y, (long unsigned)z, (long unsigned)r ); \ + return 1; \ + } \ + } while(0) + TEST(8); + TEST(16); + TEST(32); +# undef TEST + + do { + uint64_t x = prng_uint64( prng ); + uint64_t y = prng_uint64( prng ); + uint64_t z = avg_2_uint64( x, y ); + uint64_t r = (x>>1) + (y>>1) + (x & y & (uint64_t)1); + if( z!=r ) { + printf( "FAIL (iter %i op avg_2_uint64 x %lu y %lu z %lu r %lu\n", + i, (long unsigned)x, (long unsigned)y, (long unsigned)z, (long unsigned)r ); + return 1; + } + } 
while(0); + \ +# define TEST(w) do { \ + int##w##_t x = (int##w##_t)prng_uint##w( prng ); \ + int##w##_t y = (int##w##_t)prng_uint##w( prng ); \ + int##w##_t z = avg_2_int##w( x, y ); \ + int##w##_t r = (int##w##_t)(((int64_t)x+(int64_t)y) >> 1); \ + if( z!=r ) { \ + printf( "FAIL (iter %i op avg_2_int" #w "x %li y %li z %li r %li\n", \ + i, (long)x, (long)y, (long)z, (long)r ); \ + return 1; \ + } \ + } while(0) + TEST(8); + TEST(16); + TEST(32); +# undef TEST + + do { + int64_t x = (int64_t)prng_uint64( prng ); + int64_t y = (int64_t)prng_uint64( prng ); + int64_t z = avg_2_int64( x, y ); + int64_t r = (x>>1) + (y>>1) + (x & y & (int64_t)1); + if( z!=r ) { + printf( "FAIL (iter %i op avg_2_int64 x %li y %li z %li r %li\n", + i, (long)x, (long)y, (long)z, (long)r ); + return 1; + } + } while(0); + } + +# define N 512 + + ctr = 0; + for( int i=0; i<10000000; i++ ) { + if( !ctr ) { printf( "mem: Completed %i iterations\n", i ); ctr = 100000; } + ctr--; + +# define TEST(w) do { \ + uint##w##_t x[N]; \ + uint32_t n = prng_uint32( prng ) & (uint32_t)(N-1); \ + uint64_t a = (uint64_t)0; \ + for( uint32_t i=(uint32_t)0; i +#include +#include "prng.h" + +#define EXP2M1_FXP_ORDER 7 +#include "exp2m1.h" + +static inline double exp2m1( double x ) { return expm1(0.69314718055994530943*x); } /* FIXME: USE PROPER METHOD */ + +static double const ulp_thresh[8] = { + 0.50000000000000000000, // For measuring limit + 92418389.07584851980209350586, + 4173744.68851661682128906250, + 183713.37186783552169799805, + 5998.71748709678649902344, + 165.76749026775360107422, + 4.96575236320495605469, + 0.63437640666961669922 +}; + +static double const rel_thresh[8] = { + 4.7e-10, // For measuring limit + 6.2e-02, + 3.3e-03, + 1.3e-04, + 5.1e-06, + 1.5e-07, + 4.4e-09, + 5.5e-10 +}; + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + double const one_ulp = (double)(1<<30); /* exact */ + double const ulp = (1./one_ulp); /* exact */ + + /* Exhaustively test x in [0,2^30) */ + + double max_aerr = 0.; + double max_rerr = 0.; + + double thresh = ulp_thresh[ EXP2M1_FXP_ORDER ]; + + int ctr = 0; + for( int i=0; i<(1<<30); i++ ) { + if( !ctr ) { printf( "Completed %i iterations\n", i ); ctr = 100000000; } + ctr--; + + double x = ulp*(double)i; /* exact */ + double y = ulp*(double)exp2m1_fxp( (uint64_t)i ); /* approx */ + //double y = ulp*round( one_ulp*exp2m1( x ) ); /* limit: bits 31.0 max_aerr 4.7e-10 max_rerr 4.7e-10 ulp 0.5 */ + double z = exp2m1( x ); /* goal */ + double aerr = fabs(y-z); + double rerr = aerr/(1.+z); + if( aerr*one_ulp>thresh ) { + printf( "FAIL (x %.20f y %.20f z %.20f aerr %.20f ulp)\n", x, y, z, aerr*one_ulp ); + return 1; + } + max_aerr = fmax( max_aerr, aerr ); + max_rerr = fmax( max_rerr, rerr ); + } + + /* Randomly test larger x (with coverage of each exponent range and + special values) */ + + thresh = rel_thresh[ EXP2M1_FXP_ORDER ]; + + for( int i=0; i<10000000; i++ ) { + uint64_t _x; + if ( i< 64 ) _x = UINT64_C(1)<> s; + } + uint64_t _y = exp2m1_fxp( _x ); + + if( _x>exp2m1_fxp_max() ) { + if( _y!=UINT64_MAX ) { + printf( "FAIL (iter %i sat _x %lx y %lx)\n", i, _x, _y ); + return 1; + } + continue; + } + + uint64_t xi = _x>>30; + uint64_t xf = (_x<<34)>>34; + if( !(xi & (xi-UINT64_C(1))) && !xf ) { + if( _y!=((UINT64_C(1)<<(xi+UINT64_C(30)))-(UINT64_C(1)<<30)) ) { + printf( "FAIL (iter %i exact_x %lx y %lx)\n", i, _x, _y ); + return 1; + } + } + + double x = ulp*(double)_x; /* 
exact */ + double y = ulp*(double)_y; /* exact */ + double z = exp2m1( x ); /* goal */ + double aerr = fabs(y-z); + double rerr = aerr/(1.+z); + if( rerr>thresh ) { + printf( "FAIL (x %.20f y %.20f z %.20f rerr %.20e)\n", x, y, z, rerr ); + return 1; + } + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass (bits %.1f max_aerr %.1e max_rerr %.1e ulp %.1f)\n", -log2( max_rerr ), max_aerr, max_rerr, max_aerr*one_ulp ); + + return 0; +} + diff --git a/program/src/oracle/util/test_fxp.c b/program/src/oracle/util/test_fxp.c new file mode 100644 index 000000000..ce4dfc40a --- /dev/null +++ b/program/src/oracle/util/test_fxp.c @@ -0,0 +1,376 @@ +#include +#include +#include "prng.h" + +#include "fxp.h" + +/* Create random bit patterns with lots of leading and/or trailing zeros + or ones to really stress limits of implementations. */ + +static inline uint64_t /* Random fxp */ +make_rand_fxp( uint64_t x, /* Random 64-bit */ + uint32_t *_ctl ) { /* Least significant 8 bits random, uses them up */ + uint32_t ctl = *_ctl; + int s = (int)(ctl & UINT32_C(63)); ctl >>= 6; /* Shift, in [0,63] */ + int d = (int)(ctl & UINT32_C( 1)); ctl >>= 1; /* Direction, in [0,1] */ + int i = (int)(ctl & UINT32_C( 1)); ctl >>= 1; /* Invert, in [0,1] */ + *_ctl = ctl; + x = d ? (x<>s); + return i ? (~x) : x; +} + +__extension__ typedef unsigned __int128 uint128_t; + +static inline uint64_t split_hi( uint128_t x ) { return (uint64_t)(x>>64); } +static inline uint64_t split_lo( uint128_t x ) { return (uint64_t) x; } + +static inline uint64_t +fxp_add_ref( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint128_t z = ((uint128_t)x) + ((uint128_t)y); + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_sub_ref( uint64_t x, + uint64_t y, + uint64_t * _b ) { + uint64_t b = (uint64_t)(x>= 30; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_mul_raz_ref( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint128_t z = (((uint128_t)x)*((uint128_t)y) + ((uint128_t)((UINT64_C(1)<<30)-UINT64_C(1)))) >> 30; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_mul_rnz_ref( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint128_t z = (((uint128_t)x)*((uint128_t)y) + ((uint128_t)((UINT64_C(1)<<29)-UINT64_C(1)))) >> 30; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_mul_rna_ref( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint128_t z = (((uint128_t)x)*((uint128_t)y) + ((uint128_t)(UINT64_C(1)<<29))) >> 30; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_mul_rne_ref( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint128_t z = ((uint128_t)x)*((uint128_t)y); + uint64_t f = split_lo( z ) & ((UINT64_C(1)<<30)-UINT64_C(1)); + z >>= 30; + if( (f>(UINT64_C(1)<<29)) || ((f==(UINT64_C(1)<<29)) && (z & UINT64_C(1))) ) z++; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_mul_rno_ref( uint64_t x, + uint64_t y, + uint64_t * _c ) { + uint128_t z = ((uint128_t)x)*((uint128_t)y); + uint64_t f = split_lo( z ) & ((UINT64_C(1)<<30)-UINT64_C(1)); + z >>= 30; + if( (f>(UINT64_C(1)<<29)) || ((f==(UINT64_C(1)<<29)) && !(z & UINT64_C(1))) ) z++; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_div_rtz_ref( uint64_t x, + uint64_t _y, + uint64_t * _c ) { + if( !_y ) { *_c = UINT64_MAX; return UINT64_C(0); } + uint128_t ex = ((uint128_t)x) << 30; + uint128_t y = (uint128_t)_y; + uint128_t z = ex / y; + *_c = split_hi( z ); + return 
split_lo( z ); +} + +static inline uint64_t +fxp_div_raz_ref( uint64_t x, + uint64_t _y, + uint64_t * _c ) { + if( !_y ) { *_c = UINT64_MAX; return UINT64_C(0); } + uint128_t ex = ((uint128_t)x) << 30; + uint128_t y = (uint128_t)_y; + uint128_t z = (ex + y - (uint128_t)1) / y; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_div_rnz_ref( uint64_t x, + uint64_t _y, + uint64_t * _c ) { + if( !_y ) { *_c = UINT64_MAX; return UINT64_C(0); } + uint128_t ex = ((uint128_t)x) << 30; + uint128_t y = (uint128_t)_y; + uint128_t z = (ex + ((y-(uint128_t)1)>>1)) / y; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_div_rna_ref( uint64_t x, + uint64_t _y, + uint64_t * _c ) { + if( !_y ) { *_c = UINT64_MAX; return UINT64_C(0); } + uint128_t ex = ((uint128_t)x) << 30; + uint128_t y = (uint128_t)_y; + uint128_t z = (ex + (y>>1)) / y; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_div_rne_ref( uint64_t x, + uint64_t _y, + uint64_t * _c ) { + if( !_y ) { *_c = UINT64_MAX; return UINT64_C(0); } + uint128_t ex = ((uint128_t)x) << 30; + uint128_t y = (uint128_t)_y; + uint128_t z = ex / y; + uint128_t r2 = (ex - z*y) << 1; + if( r2>y || (r2==y && (z & (uint128_t)1)) ) z++; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline uint64_t +fxp_div_rno_ref( uint64_t x, + uint64_t _y, + uint64_t * _c ) { + if( !_y ) { *_c = UINT64_MAX; return UINT64_C(0); } + uint128_t ex = ((uint128_t)x) << 30; + uint128_t y = (uint128_t)_y; + uint128_t z = ex / y; + uint128_t r2 = (ex - z*y) << 1; + if( r2>y || (r2==y && !(z & (uint128_t)1)) ) z++; + *_c = split_hi( z ); + return split_lo( z ); +} + +static inline int +test_fxp_sqrt_rtz( uint64_t x, + uint64_t y ) { + if( !x ) return !!y; + if( !(((UINT64_C(1)<<15)<=y) && (y<=(UINT64_C(1)<<(32+15)))) ) return 1; + uint64_t xh,xl; uwide_sl ( &xh,&xl, UINT64_C(0),x, 30 ); /* 2^30 x */ + uint64_t rh,rl; uwide_mul( &rh,&rl, y,y ); /* y^2 */ + uint64_t b = uwide_sub( &rh,&rl, xh,xl, rh,rl, UINT64_C(0) ); /* r = 2^30 x - y^2, in [0,2y] if rounded correctly */ + return b || rh || rl>(y<<1); +} + +static inline int +test_fxp_sqrt_raz( uint64_t x, + uint64_t y ) { + if( !x ) return !!y; + if( !(((UINT64_C(1)<<15)<=y) && (y<=(UINT64_C(1)<<(32+15)))) ) return 1; + uint64_t xh,xl; uwide_sl ( &xh,&xl, UINT64_C(0),x, 30 ); /* 2^30 x */ + uint64_t rh,rl; uwide_mul( &rh,&rl, y,y ); /* y^2 */ + uint64_t b = uwide_sub( &rh,&rl, rh,rl, xh,xl, UINT64_C(0) ); /* y^2 - 2^30 x, in [0,2y-2] if rounded correctly */ + return b || rh || rl>((y<<1)-UINT64_C(2)); +} + +static inline int +test_fxp_sqrt_rnz( uint64_t x, + uint64_t y ) { + if( !x ) return !!y; + if( !(((UINT64_C(1)<<15)<=y) && (y<=(UINT64_C(1)<<(32+15)))) ) return 1; + uint64_t xh,xl; uwide_sl ( &xh,&xl, UINT64_C(0),x, 30 ); /* 2^30 x */ + uint64_t rh,rl; uwide_mul( &rh,&rl, y,y-UINT64_C(1) ); /* y^2 - y */ + uwide_inc( &rh,&rl, rh,rl, UINT64_C(1) ); /* y^2 - y + 1 */ + uint64_t b = uwide_sub( &rh,&rl, xh,xl, rh,rl, UINT64_C(0) ); /* r = 2^30 x - (y^2 - y + 1), in [0,2y-1] if rounded correctly */ + return b || rh || rl>=(y<<1); +} + +static inline int +fxp_log2_ref( uint64_t * _f, + uint64_t x ) { + if( !x ) { *_f = UINT64_C(0); return INT_MIN; } + long double ef = log2l( (long double)x ); + int e = log2_uint64( x ); + *_f = (uint64_t)roundl( (ef - (long double)e)*(long double)(UINT64_C(1)<<30) ); + return e-30; +} + +static inline uint64_t +fxp_exp2_ref( uint64_t x ) { + if( x>=UINT64_C(0x880000000) ) return UINT64_MAX; + return 
(uint64_t)roundl( ((long double)(UINT64_C(1)<<30))*exp2l( ((long double)x)*(1.L/(long double)(UINT64_C(1)<<30)) ) ); +} + +static inline uint64_t +fxp_rexp2_ref( uint64_t x ) { + return (uint64_t)roundl( ((long double)(UINT64_C(1)<<30))*exp2l( ((long double)x)*(-1.L/(long double)(UINT64_C(1)<<30)) ) ); +} + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + uint64_t fxp_log2_approx_ulp = UINT64_C(0); + uint64_t fxp_exp2_approx_ulp = UINT64_C(0); + uint64_t fxp_rexp2_approx_ulp = UINT64_C(0); + + int ctr = 0; + for( int i=0; i<100000000; i++ ) { + if( !ctr ) { printf( "Completed %i iterations\n", i ); ctr = 10000000; } + ctr--; + + uint32_t t = prng_uint32( prng ); + uint64_t x = make_rand_fxp( prng_uint64( prng ), &t ); + uint64_t y = make_rand_fxp( prng_uint64( prng ), &t ); + +# define TEST(op) \ + do { \ + uint64_t c0,z0 = fxp_##op##_ref ( x, y, &c0 ); \ + uint64_t c1,z1 = fxp_##op ( x, y, &c1 ); \ + uint64_t z2 = fxp_##op##_fast( x, y ); \ + if( c0!=c1 || z0!=z1 || (!c0 && z0!=z2) ) { \ + printf( "%i: FAIL (fxp_" #op " x %016lx y %016lx cz0 %016lx %016lx cz1 %016lx %016lx z2 %016lx\n", \ + i, x, y, c0,z0, c1,z1, z2 ); \ + return 1; \ + } \ + } while(0) + + TEST(add); + TEST(sub); + +# undef TEST +# define TEST(op) \ + do { \ + uint64_t c0,z0 = fxp_##op##_ref ( x, y, &c0 ); \ + uint64_t c1,z1 = fxp_##op ( x, y, &c1 ); \ + uint64_t z2 = fxp_##op##_fast( x, y ); \ + if( c0!=c1 || z0!=z1 || (!c0 && z0f1 ? f0-f1 : f1-f0; + if( ulp > fxp_log2_approx_ulp ) fxp_log2_approx_ulp = ulp; + if( e0!=e1 || ulp>UINT64_C(2) ) { + printf( "%i: FAIL (fxp_log2_approx x %016lx z0 %3i %016lx z1 %3i %016lx ulp %016lx\n", i, x, e0,f0, e1,f1, ulp ); + return 1; + } + } while(0); + + do { + uint64_t z0 = fxp_exp2_ref ( x ); + uint64_t z1 = fxp_exp2_approx( x ); + uint64_t ulp = z0>z1 ? z0-z1 : z1-z0; + uint64_t ix = x>>30; /* Larger x are only +/-1 ulp accurate in the first 30 bits */ + if( UINT64_C(0)>= ix; + if( ulp > fxp_exp2_approx_ulp ) fxp_exp2_approx_ulp = ulp; + if( ulp>UINT64_C(1) ) { + printf( "%i: FAIL (fxp_exp2_approx x %016lx z0 %016lx z1 %016lx ulp %016lx\n", i, x, z0, z1, ulp ); + return 1; + } + } while(0); + + do { + uint64_t z0 = fxp_rexp2_ref ( x ); + uint64_t z1 = fxp_rexp2_approx( x ); + uint64_t ulp = z0>z1 ? 
z0-z1 : z1-z0; + if( ulp > fxp_rexp2_approx_ulp ) fxp_rexp2_approx_ulp = ulp; + if( ulp>UINT64_C(1) ) { + printf( "%i: FAIL (fxp_rexp2_approx x %016lx z0 %016lx z1 %016lx ulp %016lx\n", i, x, z0, z1, ulp ); + return 1; + } + } while(0); + + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass (ulp fxp_log2_approx ~ %lu fxp_exp2_approx ~ %lu fxp_rexp2_approx ~ %lu)\n", + fxp_log2_approx_ulp, fxp_exp2_approx_ulp, fxp_rexp2_approx_ulp ); + + return 0; +} + diff --git a/program/src/oracle/util/test_hash.c b/program/src/oracle/util/test_hash.c new file mode 100644 index 000000000..e4dcc6cf0 --- /dev/null +++ b/program/src/oracle/util/test_hash.c @@ -0,0 +1,59 @@ +#include +#include "util.h" + +uint32_t ref32[10] = { + UINT32_C( 0x00000000 ), + UINT32_C( 0x514e28b7 ), + UINT32_C( 0x30f4c306 ), + UINT32_C( 0x85f0b427 ), + UINT32_C( 0x249cb285 ), + UINT32_C( 0xcc0d53cd ), + UINT32_C( 0x5ceb4d08 ), + UINT32_C( 0x18c9aec4 ), + UINT32_C( 0x4939650b ), + UINT32_C( 0xc27c2913 ) +}; + + +uint64_t ref64[10] = { + UINT64_C( 0x0000000000000000 ), + UINT64_C( 0xb456bcfc34c2cb2c ), + UINT64_C( 0x3abf2a20650683e7 ), + UINT64_C( 0x0b5181c509f8d8ce ), + UINT64_C( 0x47900468a8f01875 ), + UINT64_C( 0xd66ad737d54c5575 ), + UINT64_C( 0xe8b4b3b1c77c4573 ), + UINT64_C( 0x740729cbe468d1dd ), + UINT64_C( 0x46abcca593a3c687 ), + UINT64_C( 0x91209a1ff7f4f1d5 ) +}; + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + for( int i=0; i<10; i++ ) { + uint32_t x = (uint32_t)i; + uint32_t y = hash_uint32( x ); + uint32_t z = hash_inverse_uint32( y ); + if( y!=ref32[i] ) { printf( "ref32: FAIL\n" ); return 1; } + if( x!=z ) { printf( "inv32: FAIL\n" ); return 1; } + if( hash_uint32( hash_inverse_uint32( x ) )!=x ) { printf( "INV32: FAIL\n" ); return 1; } + } + + for( int i=0; i<10; i++ ) { + uint64_t x = (uint64_t)i; + uint64_t y = hash_uint64( x ); + uint64_t z = hash_inverse_uint64( y ); + if( y!=ref64[i] ) { printf( "ref64: FAIL\n" ); return 1; } + if( x!=z ) { printf( "inv64: FAIL\n" ); return 1; } + if( hash_uint64( hash_inverse_uint64( x ) )!=x ) { printf( "INV64: FAIL\n" ); return 1; } + } + + /* FIXME: MEASURE AVALANCHE PROPERTIES? */ + + printf( "pass\n" ); + return 0; +} + diff --git a/program/src/oracle/util/test_log2.c b/program/src/oracle/util/test_log2.c new file mode 100644 index 000000000..f54ee7b43 --- /dev/null +++ b/program/src/oracle/util/test_log2.c @@ -0,0 +1,61 @@ +#include +#include "util.h" + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + int ctr = 0; + for( int i=0; i<100000000; i++ ) { + if( !ctr ) { printf( "Completed %i iterations\n", i ); ctr = 10000000; } + ctr--; +# define TEST(w) do { \ + int n = ((int)prng_uint32( prng )) & (w-1); \ + uint##w##_t mask = (uint##w##_t)(1ULL << n); \ + uint##w##_t bit = mask--; \ + uint##w##_t x = bit | (prng_uint##w( prng ) & mask); \ + int##w##_t y = (int##w##_t)x; \ + if( log2_uint##w( x )!=n ) { \ + printf( "FAIL (iter %i op log2_uint" #w " x %lx n %i)\n", i, (long unsigned)x, n ); \ + return 1; \ + } \ + if( y>0 && log2_int##w( y )!=n ) { \ + printf( "FAIL (iter %i op log2_int" #w " x %lx n %i)\n", i, (long unsigned)y, n ); \ + return 1; \ + } \ + if( log2_uint##w##_def( x, 1234 )!=n ) { \ + printf( "FAIL (iter %i op log2_uint" #w "_def x %lx n %i)\n", i, (long unsigned)x, n ); \ + return 1; \ + } \ + if( log2_int##w##_def( y, 1234 )!=((y>(int##w##_t)0) ? 
n : 1234) ) { \ + printf( "FAIL (iter %i op log2_int" #w "_def y %lx n %i)\n", i, (long unsigned)x, n ); \ + return 1; \ + } \ + n--; bit>>=1; mask>>=1; x>>=1; y=(int##w##_t)x; \ + if( log2_uint##w##_def( x, 1234 )!=(x ? n : 1234) ) { \ + printf( "FAIL (iter %i op log2_uint" #w "_def x %lx n %i)\n", i, (long unsigned)x, n ); \ + return 1; \ + } \ + if( log2_int##w##_def( y, 1234 )!=(y ? n : 1234) ) { \ + printf( "FAIL (iter %i op log2_int" #w "_def x %lx n %i)\n", i, (long unsigned)x, n ); \ + return 1; \ + } \ + } while(0) + TEST(8); + TEST(16); + TEST(32); + TEST(64); +# undef TEST + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass\n" ); + + return 0; +} + diff --git a/program/src/oracle/util/test_prng.c b/program/src/oracle/util/test_prng.c new file mode 100644 index 000000000..84a428346 --- /dev/null +++ b/program/src/oracle/util/test_prng.c @@ -0,0 +1,180 @@ +#include +#include +#include +#include "util.h" + +/* First 10 uint64 for seq 0 starting from idx 0 */ + +uint64_t x0_0[10] = { + UINT64_C( 0xa036f9b67313c1aa ), + UINT64_C( 0x110a4ea5e65927d2 ), + UINT64_C( 0x9ddf34cf83d17c94 ), + UINT64_C( 0xeb33d1b534e210ec ), + UINT64_C( 0x9b15b94d3e81b76a ), + UINT64_C( 0xee9544dab2bf64bf ), + UINT64_C( 0x5c4b0ccf7c94d274 ), + UINT64_C( 0xf0f83ab44262ad1f ), + UINT64_C( 0xf11b1aa14c7dabd6 ), + UINT64_C( 0x3800dde6d02d6ed7 ) +}; + +/* First 10 uint64 for seq 0 starting from idx 20 */ + +uint64_t x1_20[10] = { + UINT64_C( 0x55369a6a0817cbce ), + UINT64_C( 0xce1baeb695229132 ), + UINT64_C( 0x0e443e81e9e722d2 ), + UINT64_C( 0xc6c065484f76e825 ), + UINT64_C( 0x37dc474806fabc8a ), + UINT64_C( 0x9fa3305df1b56824 ), + UINT64_C( 0xf3b3961a17ed881c ), + UINT64_C( 0x646f40006cef8d6f ), + UINT64_C( 0x4d6955f607a153b2 ), + UINT64_C( 0xf806bccb58d0e60b ) +}; + +void +printf_ref( uint64_t * ref, + uint32_t seq, + uint64_t idx ) { + printf( "uint64_t x%u_%lu[10] = {\n", seq, idx ); + for( int i=0; i<10; i++ ) printf( " UINT64_C( 0x%016lx )%s\n", ref[i], i<9 ? 
"," : "" ); + printf( "};\n" ); +} + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + uint64_t x[10]; + + if( prng_footprint()!=(uint64_t)sizeof (prng_t) ) { printf( "FAIL (footprint)\n" ); return 1; } + if( prng_align ()!=(uint64_t)alignof(prng_t) ) { printf( "FAIL (align)\n" ); return 1; } + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + if( prng_seq( prng )!=(uint32_t)0 ) { printf( "FAIL (seq)\n" ); return 1; } + if( prng_idx( prng )!=(uint64_t)0 ) { printf( "FAIL (idx)\n" ); return 1; } + + for( int i=0; i<10; i++ ) x[i] = prng_uint64( prng ); + for( int i=0; i<10; i++ ) + if( x[i]!=x0_0[i] ) { + printf( "FAIL(0,0)\n" ); + printf_ref( x0_0, (uint32_t)0, (uint64_t)0 ); + printf_ref( x, (uint32_t)0, (uint64_t)0 ); + return 1; + } + + if( prng_seq( prng )!=(uint32_t) 0 ) { printf( "FAIL (seq-1)\n" ); return 1; } + if( prng_idx( prng )!=(uint64_t)20 ) { printf( "FAIL (idx-1)\n" ); return 1; } + + if( prng_seq_set( prng, (uint32_t)1 )!=(uint32_t)0 ) { printf( "FAIL (seq_set)\n" ); return 1; } + + for( int i=0; i<10; i++ ) x[i] = prng_uint64( prng ); + for( int i=0; i<10; i++ ) + if( x[i]!=x1_20[i] ) { + printf( "FAIL(1,20)\n" ); + printf_ref( x1_20, (uint32_t)1, (uint64_t)20 ); + printf_ref( x, (uint32_t)1, (uint64_t)20 ); + return 1; + } + + if( prng_seq( prng )!=(uint32_t) 1 ) { printf( "FAIL (seq-2)\n" ); return 1; } + if( prng_idx( prng )!=(uint64_t)40 ) { printf( "FAIL (idx-2)\n" ); return 1; } + + if( prng_seq_set( prng, (uint32_t)0 )!=(uint32_t) 1 ) { printf( "FAIL (seq_set-1)\n" ); return 1; } + if( prng_idx_set( prng, (uint64_t)0 )!=(uint64_t)40 ) { printf( "FAIL (idx_set-1)\n" ); return 1; } + + for( int i=0; i<10; i++ ) x[i] = prng_uint64( prng ); + for( int i=0; i<10; i++ ) if( x[i]!=x0_0[i] ) { printf( "FAIL(0,0)-1\n" ); return 1; } + + uint64_t domain = (uint64_t)0; + for( int i=0; i<1000; i++ ) { + uint8_t u8 = prng_uint8 ( prng ); + uint16_t u16 = prng_uint16 ( prng ); + uint32_t u32 = prng_uint32 ( prng ); + uint64_t u64 = prng_uint64 ( prng ); + + int8_t i8 = prng_int8 ( prng ); if( i8 <(int8_t )0 ) { printf( "FAIL (int8 )\n" ); return 1; } + int16_t i16 = prng_int16 ( prng ); if( i16<(int16_t)0 ) { printf( "FAIL (int16)\n" ); return 1; } + int32_t i32 = prng_int32 ( prng ); if( i32<(int32_t)0 ) { printf( "FAIL (int32)\n" ); return 1; } + int64_t i64 = prng_int64 ( prng ); if( i64<(int64_t)0 ) { printf( "FAIL (int64)\n" ); return 1; } + + float fc0 = prng_float_c0 ( prng ); if( !( 0.f<=fc0 && fc0< 1.f) ) { printf( "FAIL (float_c0)\n" ); return 1; } + float fc1 = prng_float_c1 ( prng ); if( !( 0.f< fc1 && fc1<=1.f) ) { printf( "FAIL (float_c1)\n" ); return 1; } + float fc = prng_float_c ( prng ); if( !( 0.f<=fc && fc <=1.f) ) { printf( "FAIL (float_c)\n" ); return 1; } + float fnc = prng_float_o ( prng ); if( !( 0.f< fnc && fnc< 1.f) ) { printf( "FAIL (float_o)\n" ); return 1; } + + double dc0 = prng_double_c0( prng ); if( !( 0.f<=dc0 && dc0< 1.f) ) { printf( "FAIL (double_c0)\n" ); return 1; } + double dc1 = prng_double_c1( prng ); if( !( 0.f< dc1 && dc1<=1.f) ) { printf( "FAIL (double_c1)\n" ); return 1; } + double dc = prng_double_c ( prng ); if( !( 0.f<=dc && dc <=1.f) ) { printf( "FAIL (double_c)\n" ); return 1; } + double dnc = prng_double_o ( prng ); if( !( 0.f< dnc && dnc< 1.f) ) { printf( "FAIL (double_o)\n" ); return 1; } + +# define LO(i,w) ((uint64_t)((uint64_t)(i >> (w-4))==(uint64_t)0x0)) +# define HI(i,w) ((uint64_t)((uint64_t)(i >> (w-4))==(uint64_t)0xf)) + + domain |= LO(u8, 8 ) << 0; 
domain |= HI(u8, 8 ) << 4; + domain |= LO(u16,16) << 1; domain |= HI(u16,16) << 5; + domain |= LO(u32,32) << 2; domain |= HI(u32,32) << 6; + domain |= LO(u64,64) << 3; domain |= HI(u64,64) << 7; + + domain |= LO(i8, 7 ) << 8; domain |= HI(i8, 7 ) << 12; + domain |= LO(i16,15) << 9; domain |= HI(i16,15) << 13; + domain |= LO(i32,31) << 10; domain |= HI(i32,31) << 14; + domain |= LO(i64,63) << 11; domain |= HI(i64,63) << 15; + +# undef HI +# undef LO +# define LO(f) ((uint64_t)(f<0.0625f)) +# define HI(f) ((uint64_t)(f>0.9375f)) + + domain |= LO(fc0) << 16; domain |= HI(fc0) << 20; + domain |= LO(fc1) << 17; domain |= HI(fc1) << 21; + domain |= LO(fc ) << 18; domain |= HI(fc ) << 22; + domain |= LO(fnc) << 19; domain |= HI(fnc) << 23; + +# undef HI +# undef LO +# define LO(d) ((uint64_t)(d<0.0625)) +# define HI(d) ((uint64_t)(d>0.9375)) + + domain |= LO(dc0) << 24; domain |= HI(dc0) << 28; + domain |= LO(dc1) << 25; domain |= HI(dc1) << 29; + domain |= LO(dc ) << 26; domain |= HI(dc ) << 30; + domain |= LO(dnc) << 27; domain |= HI(dnc) << 31; + +# undef HI +# undef LO + } + if( domain!=((UINT64_C(1)<<32)-UINT64_C(1)) ) { printf( "FAIL (domain)\n" ); return 1; } + + long sum_pop = 0L; + long sum_pop2 = 0L; + int min_pop = INT_MAX; + int max_pop = INT_MIN; + + int ctr = 0; + for( long i=0; i<(1L<<32); i++ ) { + if( !ctr ) { printf( "Completed %li iterations\n", i ); ctr = 100000000; } + ctr--; + + uint64_t seq = prng_private_expand( (uint32_t)i ); + if( prng_private_contract( seq )!=(uint32_t)i ) { printf( "FAIL (contract)\n" ); return 1; } + int pop = __builtin_popcountl( seq ); + sum_pop += (long) pop; + sum_pop2 += (long)(pop*pop); + min_pop = popmax_pop ? pop : max_pop; + } + double avg_pop = ((double)sum_pop ) / ((double)(1L<<32)); + double rms_pop = sqrt( (((double)sum_pop2) / ((double)(1L<<32))) - avg_pop*avg_pop ); + + prng_delete( prng_leave( prng ) ); + + printf( "pass (expand popcount stats: %.3f +/- %.3f [%i,%i])\n", avg_pop, rms_pop, min_pop, max_pop ); + + return 0; +} + diff --git a/program/src/oracle/util/test_prng_battery.c b/program/src/oracle/util/test_prng_battery.c new file mode 100644 index 000000000..41ffb9ee2 --- /dev/null +++ b/program/src/oracle/util/test_prng_battery.c @@ -0,0 +1,92 @@ +#include +#include +#include /* Assumes TestU01 install include directory is in the include search path */ +#include "util.h" + +static double +test_GetU01( void * param, + void * state ) { + (void)param; + return 2.3283064365386962890625e-10 /* 2^-32, exact */ * (double)prng_uint32( (prng_t *)state ); +} + +static unsigned long +test_GetBits( void * param, + void * state ) { + (void)param; + return (unsigned long)prng_uint32( (prng_t *)state ); +} + +static void +test_Write( void * state ) { + prng_t * prng = (prng_t *)state; + printf( "prng(0x%08lxU,0x%016lxUL)\n", (unsigned long)prng_seq( prng ), (unsigned long)prng_idx( prng ) ); +} + +static void +usage( char const * cmd ) { + fprintf( stderr, + "Usage: %s [bat] [seq] [idx]\n" + "\tbat 0 - FIPS-140.2 (fast)\n" + "\tbat 1 - Pseudo-DIEHARD (fast)\n" + "\tbat 2 - TestU01 SmallCrush (fast)\n" + "\tbat 3 - TestU01 Crush (~20 minutes)\n" + "\tbat 4 - TestU01 BigCrush (several hours)\n" + "Note that when running a test in a battery, the probability of it failing even\n" + "though the generator is fine is roughly 0.2%%. Thus, when repeatedly running\n" + "batteries that themselves contain a large number of tests, some test failures\n" + "are expected. 
So long as the test failures are are sporadic, don't occur in the\n" + "same place when running multiple times with random seq and/or idx, and don't\n" + "have p-values improbably close to 0 (p <<< 1/overall_num_tests_run) or 1\n" + "(1-p <<< 1/overall_num_tests_run), it is expected and normal\n", + cmd ); +} + +int +main( int argc, + char ** argv ) { + + /* Get command line arguments */ + + if( argc!=4 ) { usage( argv[0] ); return 1; } + int bat = atoi( argv[1] ); + uint32_t seq = (uint32_t)strtoul( argv[2], NULL, 0 ); + uint64_t idx = (uint64_t)strtoul( argv[3], NULL, 0 ); + + /* Create the test generator */ + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, seq, idx ) ); + + /* Run the requested test battery */ + + char name[128]; + sprintf( name, "prng(0x%08lxU,0x%016lxUL)", (unsigned long)prng_seq( prng ), (unsigned long)prng_idx( prng ) ); + + unif01_Gen gen[1]; + gen->state = prng; + gen->param = NULL; + gen->name = name; + gen->GetU01 = test_GetU01; + gen->GetBits = test_GetBits; + gen->Write = test_Write; + + switch( bat ) { + case 0: bbattery_FIPS_140_2( gen ); break; + case 1: bbattery_pseudoDIEHARD( gen ); break; + case 2: bbattery_SmallCrush( gen ); break; + case 3: bbattery_Crush( gen ); break; + case 4: bbattery_BigCrush( gen ); break; +//case 5: bbattery_Rabbit( gen ); break; /* FIXME: NB */ +//case 6: bbattery_Alphabit( gen ); break; /* FIXME: NB/R/S */ +//case 7: bbattery_BlockAlphabit( gen ); break; /* FIXME: NB/R/S */ + default: usage( argv[0] ); return 1; + } + + /* Destroy the test generator */ + + prng_delete( prng_leave( prng ) ); + + return 0; +} + diff --git a/program/src/oracle/util/test_rexp2.c b/program/src/oracle/util/test_rexp2.c new file mode 100644 index 000000000..a612b2064 --- /dev/null +++ b/program/src/oracle/util/test_rexp2.c @@ -0,0 +1,78 @@ +#include +#include +#include "prng.h" + +#define REXP2_FXP_ORDER 7 +#include "rexp2.h" + +static double const ulp_thresh[8] = { + 0.50000000000000000000, // For measuring limit + 46209195.03792428970336914062, + 2086872.61177277565002441406, + 91856.93437397480010986328, + 2999.59625971317291259766, + 83.14213037490844726562, + 2.73321378231048583984, + 0.60266053676605224609 +}; + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + double const one_ulp = (double)(1<<30); /* exact */ + double const ulp = (1./one_ulp); /* exact */ + double const thresh = ulp_thresh[ REXP2_FXP_ORDER ]; + + double max_aerr = 0.; + double max_rerr = 0.; + + /* Exhaustively test x in [0,2^30) */ + + int ctr = 0; + for( int i=0; i<(1<<30); i++ ) { + if( !ctr ) { printf( "Completed %i iterations\n", i ); ctr = 100000000; } + ctr--; + + double x = ulp*(double)i; /* exact */ + double y = ulp*(double)rexp2_fxp( (uint64_t)i ); /* approx */ + //double y = ulp*round( one_ulp*exp2( -x ) ); /* limit: bits 30.0 max_aerr 4.7e-10 max_rerr 9.3e-10 ulp 0.5 */ + double z = exp2( -x ); /* goal */ + double aerr = fabs(y-z); + double rerr = aerr/z; + if( aerr*one_ulp>thresh ) { + printf( "FAIL (x %.20f y %.20f z %.20f aerr %.20f ulp)\n", x, y, z, aerr*one_ulp ); + return 1; + } + max_aerr = fmax( max_aerr, aerr ); + max_rerr = fmax( max_rerr, rerr ); + } + + /* Randomly test larger x (with good coverage of each exponent range) */ + /* FIXME: EXPLICITLY TEST EXACT HANDLING AT POWERS OF 2 */ + + for( int i=0; i<10000000; i++ ) { + int s = ((int)prng_uint32( prng )) & 63; + uint64_t _x = prng_uint64( prng ) >> s; + + double x 
= ulp*(double)_x; /* exact when _x is small enough to give a non-zero result */ + double y = ulp*(double)rexp2_fxp( _x ); /* approx */ + double z = exp2( -x ); /* goal */ + double aerr = fabs(y-z); + if( aerr*one_ulp>thresh ) { + printf( "FAIL (x %.20f y %.20f z %.20f aerr %.20f ulp)\n", x, y, z, aerr*one_ulp ); + return 1; + } + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass (bits %.1f max_aerr %.1e max_rerr %.1e ulp %.1f)\n", -log2( max_rerr ), max_aerr, max_rerr, max_aerr*one_ulp ); + + return 0; +} + diff --git a/program/src/oracle/util/test_round.c b/program/src/oracle/util/test_round.c new file mode 100644 index 000000000..aff69822f --- /dev/null +++ b/program/src/oracle/util/test_round.c @@ -0,0 +1,31 @@ +#include + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + unsigned i = (unsigned)0; + + int ctr = 0; + for( int x=-32767; x<=32767; x++ ) { + for( int y=-32767; y<=32767; y++ ) { + if( !ctr ) { printf( "Completed %u iterations\n", i ); ctr = 10000000; } + ctr--; + + int u = (x+y)>>1; + int v = (x>>1)+(y>>1); + int w = v + (x&y&1); + if( u!=w ) { + printf( "FAIL (x %3i y %3i u %3i v %3i w %3i)\n", x, y, u, v, w ); + return 1; + } + + i++; + } + } + printf( "pass\n" ); + + return 0; +} + diff --git a/program/src/oracle/util/test_sar.c b/program/src/oracle/util/test_sar.c new file mode 100644 index 000000000..f0b1ac5ef --- /dev/null +++ b/program/src/oracle/util/test_sar.c @@ -0,0 +1,49 @@ +#include +#include "util.h" + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + /* FIXME: EXPLICT COVERAGE OF EDGE CASES (PROBABLY STATICALLY FULLY + SAMPLED ALREADY THOUGH FOR 8 AND 16 BIT TYPES) */ + + int ctr = 0; + for( int i=0; i<1000000000; i++ ) { + if( !ctr ) { printf( "Completed %i iterations\n", i ); ctr = 10000000; } + ctr--; + + /* These tests assume the unit test platform has arithmetic right + shift for signed integers (and the diagnostic printfs assume that + long is at least a w-bit wide type) */ + +# define TEST_SAR(w) do { \ + int##w##_t x = (int##w##_t)prng_uint##w( prng ); \ + int n = (int)( prng_uint32( prng ) & (uint32_t)(w-1) ); \ + int##w##_t val = sar_int##w( x, n ); \ + int##w##_t ref = (int##w##_t)(x >> n); \ + if( val!=ref ) { \ + printf( "FAIL (iter %i op %i x %li n %i val %li ref %li)", i, w, (long)x, n, (long)val, (long)ref ); \ + return 1; \ + } \ + } while(0) + + TEST_SAR(8); + TEST_SAR(16); + TEST_SAR(32); + TEST_SAR(64); +# undef TEST_SAR + + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass\n" ); + + return 0; +} + diff --git a/program/src/oracle/util/test_smag.c b/program/src/oracle/util/test_smag.c new file mode 100644 index 000000000..c006ea9f7 --- /dev/null +++ b/program/src/oracle/util/test_smag.c @@ -0,0 +1,55 @@ +#include +#include "util.h" + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + int ctr = 0; + for( int i=0; i<100000000; i++ ) { + if( !ctr ) { printf( "Completed %i iterations\n", i ); ctr = 10000000; } + ctr--; +# define TEST(w) do { \ + int##w##_t x = (int##w##_t)prng_uint##w( prng ); \ + int s; uint##w##_t m = smag_unpack_int##w( x, &s ); \ + int##w##_t y = smag_pack_int##w( s, m ); \ + uint##w##_t max = (uint##w##_t)(1ULL<<(w-1)); \ + if( !(s==(x<(int##w##_t)0) && (m +#include "util.h" + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t 
_prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + int ctr = 0; + for( int i=0; i<50000000; i++ ) { + if( !ctr ) { printf( "Completed %i iterations\n", i ); ctr = 1000000; } + ctr--; + +# define TEST(w) do { \ + int n = ((int)prng_uint32( prng )) & (w-1); \ + uint##w##_t x = (uint##w##_t)(prng_uint##w( prng ) >> n); \ + uint##w##_t y = sqrt_uint##w( x ); \ + uint##w##_t r = (uint##w##_t)(x - y*y); \ + if( y>=(UINT##w##_C(1)<<(w/2)) || r>(y<<1) ) { \ + printf( "FAIL (iter %i op sqrt_uint" #w " x %lx y %lx r %lx)\n", \ + i, (long unsigned)x, (long unsigned)y, (long unsigned)r ); \ + return 1; \ + } \ + int##w##_t u = (int##w##_t)x; \ + if( u>=INT##w##_C(0) ) { \ + int##w##_t v = sqrt_int##w( u ); \ + int##w##_t s = (int##w##_t)(u - v*v); \ + if( !(INT##w##_C(0)<=s && s<=(v<<1)) ) { \ + printf( "FAIL (iter %i op sqrt_int" #w " u %li v %li s %li)\n", \ + i, (long)u, (long)v, (long)s ); \ + return 1; \ + } \ + } \ + int##w##_t v = sqrt_re_int##w( u ); \ + int##w##_t s = (int##w##_t)(u - v*v); \ + if( !( (u>=INT##w##_C(0) && INT##w##_C(0)<=s && s<=(v<<1)) || \ + (u< INT##w##_C(0) && v==INT##w##_C(0) ) ) ) { \ + printf( "FAIL (iter %i op sqrt_re_int" #w " u %li v %li s %li)\n", \ + i, (long)u, (long)v, (long)s ); \ + return 1; \ + } \ + v = sqrt_abs_int##w( u ); \ + s = (int##w##_t)sqrt_uint##w( abs_int##w( u ) ); \ + if( v!=s ) { \ + printf( "FAIL (iter %i op sqrt_abs_int" #w " u %li v %li s %li)\n", \ + i, (long)u, (long)v, (long)s ); \ + return 1; \ + } \ + } while(0) + TEST(8); + TEST(16); + TEST(32); + TEST(64); +# undef TEST + + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass\n" ); + + return 0; +} + diff --git a/program/src/oracle/util/test_uwide.c b/program/src/oracle/util/test_uwide.c new file mode 100644 index 000000000..aabb1b7e5 --- /dev/null +++ b/program/src/oracle/util/test_uwide.c @@ -0,0 +1,211 @@ +#include +#include "util.h" + +/* This unit test assumes a uint128_t extensions are available on the + test platform */ + +__extension__ typedef __int128 int128_t; +__extension__ typedef unsigned __int128 uint128_t; + +static inline uint64_t split_hi( uint128_t x ) { return (uint64_t)(x>>64); } +static inline uint64_t split_lo( uint128_t x ) { return (uint64_t) x; } + +static inline uint128_t join( uint64_t xh, uint64_t xl ) { return (((uint128_t)xh)<<64) | ((uint128_t)xl); } + +static inline uint128_t +prng_uint128( prng_t * prng ) { + uint64_t h = prng_uint64( prng ); + uint64_t l = prng_uint64( prng ); + return join( h, l ); +} + +int +main( int argc, + char ** argv ) { + (void)argc; (void)argv; + + prng_t _prng[1]; + prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) ); + + int ctr = 0; + for( int i=0; i<500000000; i++ ) { + if( !ctr ) { printf( "Completed %i iterations\n", i ); ctr = 10000000; } + ctr--; + + /* Generate a random test vector */ + + uint32_t t = prng_uint32( prng ); + int nx = 1+(((int)t) & 127); t >>= 7; /* Bit width of the x, in [1,128] */ + int ny = 1+(((int)t) & 127); t >>= 7; /* Bit width of the y, in [1,128] */ + int nc = 1+(((int)t) & 63); t >>= 6; /* Bit width of the c, in [1,64] */ + int b0 = ((int)t) & 1; t >>= 1; /* Random bit, in [0,1] */ + int b1 = ((int)t) & 1; t >>= 1; + int s = b1+(((int)t) & 127); t >>= 8; /* Shift magnitude, in [0,128] (0 and 128 at half prob) */ + + uint128_t x = prng_uint128( prng ) >> (128-nx); uint64_t xh = split_hi( x ); uint64_t xl = split_lo( x ); + uint128_t y = prng_uint128( prng ) >> (128-ny); uint64_t yh = split_hi( y ); uint64_t yl = split_lo( y ); + 
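+    /* Illustrative spot check added for exposition (not in the original
+       test): verify uwide_mul against the native 128-bit product of the
+       low words already generated above.  This assumes uwide_mul(
+       &zh,&zl, a, b ) computes zh 2^64 + zl = a*b exactly, as documented
+       in uwide.h; xl, yl, join and the split helpers are reused. */
+
+    do {
+      uint128_t p0 = ((uint128_t)xl)*((uint128_t)yl);
+      uint64_t ph,pl; uwide_mul( &ph,&pl, xl, yl );
+      if( join( ph,pl )!=p0 ) {
+        printf( "FAIL (iter %i op uwide_mul x %016lx y %016lx z %016lx %016lx z0 %016lx %016lx\n",
+                i, xl, yl, ph,pl, split_hi(p0),split_lo(p0) );
+        return 1;
+      }
+    } while(0);
+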
uint64_t c = prng_uint64 ( prng ) >> ( 64-nc); + + /* Add two random uint128_t x and y with a random 64-bit carry in c */ + + do { + uint128_t z0 = (uint128_t)c + x; uint64_t w0 = (uint64_t)(z0>(128-s)); + uint64_t zh,zl; int f = uwide_sl( &zh,&zl, yh,yl, s ); + uint128_t z = join( zh,zl ); + if( z!=z0 || f!=f0 ) { + printf( "FAIL (iter %i op uwide_sl y %016lx %016lx s %i z %016lx %016lx f %i z0 %016lx %016lx f0 %i\n", + i, yh,yl, s, zh,zl, f, split_hi(z0),split_lo(z0), f0 ); + return 1; + } + } while(0); + + /* Right shift a random uint128_t y */ + + do { + uint128_t z0 = s==128 ? (uint128_t)0 : y>>s; + int f0 = (s==0) ? 0 : (s==128) ? !!y : !!(y<<(128-s)); + uint64_t zh,zl; int f = uwide_sr( &zh,&zl, yh,yl, s ); + uint128_t z = join( zh,zl ); + if( z!=z0 || f!=f0 ) { + printf( "FAIL (iter %i op uwide_sr y %016lx %016lx s %i z %016lx %016lx f %i z0 %016lx %016lx f0 %i\n", + i, yh,yl, s, zh,zl, f, split_hi(z0),split_lo(z0), f0 ); + return 1; + } + } while(0); + } + + prng_delete( prng_leave( prng ) ); + + printf( "pass\n" ); + + return 0; +} + diff --git a/program/src/oracle/util/util.h b/program/src/oracle/util/util.h new file mode 100644 index 000000000..c6dfe1bcd --- /dev/null +++ b/program/src/oracle/util/util.h @@ -0,0 +1,18 @@ +#ifndef _pyth_oracle_util_util_h_ +#define _pyth_oracle_util_util_h_ + +#include "align.h" /* includes stdint.h */ +//#include "sar.h" /* includes stdint.h */ +//#include "hash.h" /* includes stdint.h */ +//#include "log2.h" /* includes stdint.h */ +#include "smag.h" /* includes stdint.h */ +#include "rexp2.h" /* includes stdint.h */ +#include "exp2m1.h" /* includes stdint.h */ +//#include "uwide.h" /* includes log2.h */ +//#include "sqrt.h" /* includes log2.h */ +#include "fxp.h" /* includes uwide.h and sqrt.h */ +#include "avg.h" /* includes sar.h */ +#include "prng.h" /* includes stdalign.h and hash.h */ + +#endif /* _pyth_oracle_util_util_h_ */ + diff --git a/program/src/oracle/util/uwide.h b/program/src/oracle/util/uwide.h new file mode 100644 index 000000000..d8ca33862 --- /dev/null +++ b/program/src/oracle/util/uwide.h @@ -0,0 +1,363 @@ +#ifndef _pyth_oracle_util_uwide_h_ +#define _pyth_oracle_util_uwide_h_ + +/* Useful operations for unsigned 128-bit integer operations. A 128-bit + wide number is represented as a pair of uint64_t. In the notation + below == xh 2^64 + xl where xh and xl are uint64_t's. FIXME: + CONSIDER ADDING A STYLE BASED ON GCC'S __int128 EXTENSIONS? */ + +#include "log2.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Compute co 2^128 + = + + ci exactly. Returns + the carry out. Note that the carry in / carry operations should be + compile time optimized out if ci is 0 at compile time on input and/or + return value is not used. Assumes _zh and _zl are valid (e.g. + non-NULL and non-overlapping). Ignoring carry in/out related + operations costs 4 u64 adds, 1 u64 compare and 1 u64 conditional + increment. */ + +static inline uint64_t +uwide_add( uint64_t * _zh, uint64_t * _zl, + uint64_t xh, uint64_t xl, + uint64_t yh, uint64_t yl, + uint64_t ci ) { + uint64_t zh = xh; uint64_t zl = xl; + uint64_t ct; uint64_t co; + zl += ci; ct = (uint64_t)(zl = ( + y) mod 2^128 exactly (a common use of + the above) */ + +static inline void +uwide_inc( uint64_t * _zh, uint64_t * _zl, + uint64_t xh, uint64_t xl, + uint64_t y ) { + uint64_t zl = xl + y; + uint64_t zh = xh + (zl = bo 2^128 + - - bi exactly. Returns + the borrow out. 
Note that the borrow in / borrow operations should + be compile time optimized out if b is 0 at compile time on input + and/or return value is not used. Assumes _zh and _zl are valid (e.g. + non-NULL and non-overlapping). Ignoring borrow in/out related + operations costs 4 u64 subs, 1 u64 compare and 1 u64 conditional + decrement. */ + +static inline uint64_t +uwide_sub( uint64_t * _zh, uint64_t * _zl, + uint64_t xh, uint64_t xl, + uint64_t yh, uint64_t yl, + uint64_t bi ) { + uint64_t zh = xh; uint64_t zl = xl; + uint64_t bt; uint64_t bo; + bt = (uint64_t)(zl = ( - y) mod 2^128 exactly (a common use of + the above) */ + +static inline void +uwide_dec( uint64_t * _zh, uint64_t * _zl, + uint64_t xh, uint64_t xl, + uint64_t y ) { + uint64_t zh = xh - (uint64_t)(y>xl); + uint64_t zl = xl - y; + *_zh = zh; *_zl = zl; +} + +/* Compute = x*y exactly, will be in [0,2^128-2^65+1]. Assumes + _zh and _zl are valid (e.g. non-NULL and non-overlapping). Cost is 4 + u32*u32->u64 muls, 4 u64 adds, 2 u64 compares, 2 u64 conditional + increments, 8 u64 u32 word extractions. */ + +static inline void +uwide_mul( uint64_t * _zh, uint64_t * _zl, + uint64_t x, + uint64_t y ) { + uint64_t x1 = x>>32; uint64_t x0 = (uint64_t)(uint32_t)x; /* both 2^32-1 @ worst case (x==y==2^64-1) */ + uint64_t y1 = y>>32; uint64_t y0 = (uint64_t)(uint32_t)y; /* both 2^32-1 @ worst case */ + + uint64_t w0 = x0*y0; uint64_t w1 = x0*y1; /* both 2^64-2^33+1 @ worst case */ + uint64_t w2 = x1*y0; uint64_t w3 = x1*y1; /* both 2^64-2^33+1 @ worst case */ + + uint64_t w1h = w1>>32; uint64_t w1l = (uint64_t)(uint32_t)w1; /* w1h 2^32-2, w1l 1 @ worst case */ + uint64_t w2h = w2>>32; uint64_t w2l = (uint64_t)(uint32_t)w2; /* w2h 2^32-2, w2l 1 @ worst case */ + + uint64_t zh = w1h + w2h + w3; /* 2^64-3 @ worst case */ + uint64_t t0 = w0 + (w1l<<32); zh += (uint64_t)(t0 ) exactly. Assumes is not 0. */ + +static inline int +uwide_log2( uint64_t xh, + uint64_t xl ) { + int off = 0; + if( xh ) { off = 64; xl = xh; } + return off + log2_uint64( xl ); +} + +/* Same as the uwide_log2 but returns def is is 0. */ + +static inline int uwide_log2_def( uint64_t xh, uint64_t xl, int def ) { return (xh|xl) ? uwide_log2(xh,xl) : def; } + +/* Compute = << s. Assumes _zh and _zl are valid (e.g. + non-NULL and non-overlapping) and s is non-negative. Large values of + s are fine (shifts to zero). Returns the inexact flag (will be 0 or + 1) which indicates if any non-zero bits of were lost in the + process. Note that inexact handling and various cases should be + compile time optimized out if if s is known at compile time on input + and/or return value is not used. Ignoring inexact handling and + assuming compile time s, for the worst case s, cost is 3 u64 shifts + and 1 u64 bit or. (FIXME: CONSIDER HAVING AN INVALID FLAG FOR + NEGATIVE S?) */ + +static inline int +uwide_sl( uint64_t * _zh, uint64_t * _zl, + uint64_t xh, uint64_t xl, + int s ) { + if( s>=128 ) { *_zh = UINT64_C(0); *_zl = UINT64_C(0); return !!(xh| xl ); } + if( s> 64 ) { s -= 64; int t = 64-s; *_zh = xl<>t)); } + if( s== 64 ) { *_zh = xl; *_zl = UINT64_C(0); return !! xh; } + if( s> 0 ) { int t = 64-s; *_zh = (xh<>t); *_zl = xl<>t); } + /* s== 0 */ *_zh = xh; *_zl = xl; return 0; +} + +/* Compute = >> s. Assumes _zh and _zl are valid (e.g. + non-NULL and non-overlapping) and s is non-negative. Large values of + s are fine (shifts to zero). Returns the inexact flag (will be 0 or + 1) which indicates if any non-zero bits of were lost in the + process. 
Note that inexact handling and various cases should be + compile time optimized out if if s is known at compile time on input + and/or return value is not used. Ignoring inexact handling and + assuming compile time s, for the worst case s, cost is 3 u64 shifts + and 1 u64 bit or. (FIXME: CONSIDER HAVING AN INVALID FLAG FOR + NEGATIVE S AND/OR MORE DETAILED INEXACT FLAGS TO SIMPLIFY + IMPLEMENTING FIXED AND FLOATING POINT ROUNDING MODES?) */ + +static inline int +uwide_sr( uint64_t * _zh, uint64_t * _zl, + uint64_t xh, uint64_t xl, + int s ) { + if( s>=128 ) { *_zh = UINT64_C(0); *_zl = UINT64_C(0); return !!( xh |xl); } + if( s> 64 ) { s -= 64; int t = 64-s; *_zh = UINT64_C(0); *_zl = xh>>s; return !!((xh< 0 ) { int t = 64-s; *_zh = xh>>s; *_zl = (xl>>s)|(xh<u64 mul, 2 u64 add, 1 u64 u32 extract. + + Theory: Let q d + r = n 2^64 where q = floor( n 2^64 / d ). Note r + is in [0,d). Break d into d = dh 2^32 + dl where dh and dl are + 32-bit words: + q+r/d = n 2^64 / d = n 2^64 / ( dh 2^32 + dl ) + = n 2^64 / ( (dh+1) 2^32 - (2^32-dl) ) ... dh+1 can be computed without overflow, 2^32-dl is positive + > n 2^64 / ( (dh+1) 2^32 ) + >= n floor( 2^64/(dh+1) ) / 2^32 + Note that floor( 2^64/(dh+1) ) is in [2^32,2^33-4]. This suggests + letting: + 2^32 + m = floor( 2^64/(dh+1) ) + where m is in [0,2^32-4] (fits in a uint32_t). Then we have: + q+r/d > n (2^32 + m) / 2^32 + = n + n m / 2^32 + Similarly breaking n into n = nh 2^32 + nl: + q+r/d > n + (nh m) + (nl m/2^32) + >= n + nh m + floor( nl m / 2^32 ) + We get: + q+r/d > n + nh*m + ((nl*m)>>32) + And, as q is integral, r/d is less than 1 and the right hand side is + integral: + q >= qa + where: + qa = n + nh*m + (((nl*m)>>32) + and: + m = floor( 2^64 / (dh+1) ) - 2^32 + + To compute m efficiently, note: + m = floor( 1 + (2^64-(dh+1))/(dh+1) ) - 2^32 + = floor( (2^64-(dh+1))/(dh+1) ) - (2^32-1) + and in "C style" modulo 2^64 arithmetic, 2^64 - x = -x. This yields: + m = (-(dh+1))/(dh+1) - (2^32-1) +*/ + +static inline uint64_t /* In [0,2^32) */ +uwide_div_approx_init( uint64_t d ) { /* d in [2^63,2^64) */ + uint64_t m = (d>>32) + UINT64_C(1); /* m = dh+1 and is in (2^31,2^32] ... exact */ + return ((-m)/m) - (uint64_t)UINT32_MAX; /* m = floor( 2^64/(dh+1) ) - 2^32 ... exact */ +} + +static inline uint64_t /* In [n,2^64) and <= floor(n 2^64/d) */ +uwide_div_approx( uint64_t n, /* In [0,d) */ + uint64_t m ) { /* Output of uwide_div_helper_init for the desired d */ + uint64_t nh = n>>32; + uint64_t nl = (uint64_t)(uint32_t)n; + return n + nh*m + ((nl*m)>>32); +} + +/* Compute z = floor( (xh 2^64 + xl) / y ). Assumes _zh and _zl are + valid (e.g. non-NULL and non-overlapping). Requires y to be + non-zero. Returns the exception flag if y is 0 (=0 in this + case). This is not very cheap and cost is highly variable depending + on properties of both n and d. Worst case is roughly ~3 u64/u64 + divides, ~24 u64*u64->u64 muls plus other minor ops. + + Breakdown in worst case 1 u64 log2, 2 u64/u64 div, 1 u64*u64->u64 + mul, 1 u64 sub, 1 int sub, 1 u128 variable shift, 1 1 u64 variable + shift, 1 u64 div approx init, 4*(1 u64 div approx, 1 u64*u64->u128 + mul, 1 u128 sub) plus various operations to faciliating shortcutting + (e.g. when xh is zero, cost is 1 u64/u64 div). 
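+
+   As an illustrative worked example (added for exposition, not part of
+   the original comment): dividing xh=1, xl=0 (i.e. 2^64) by y=3 gives
+   zh=0 and zl=0x5555555555555555 = floor( 2^64 / 3 ); the discarded
+   remainder is 1 (uwide_divrem below returns it).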
*/ + +static inline int +uwide_div( uint64_t * _zh, uint64_t * _zl, + uint64_t xh, uint64_t xl, + uint64_t y ) { + + /* Simple cases (y==0, x==0 and y==2^n) */ + + if( !y ) { *_zh = UINT64_C(0); *_zl = UINT64_C(0); return 1; } + if( !xh ) { *_zh = UINT64_C(0); *_zl = xl / y; return 0; } + + int n = log2_uint64( y ); + if( !(y & (y-UINT64_C(1))) ) { uwide_sr( _zh,_zl, xh,xl, n ); return 0; } + + /* General case ... compute zh noting that: + = floor( (xh 2^64 + xl) / y ) + = floor( ((qh y + rh) 2^64 + xl) / y ) + where xh = qh y + rh and qh is floor(xh/y), rh = xh%y and rh is in + [0,y). Thus: + = qh 2^64 + floor( (rh 2^64 + xl) / y ) + where the right term is less that 2^64 such that zh is qh and + zl = floor( (rh 2^64 + xl) / y ). */ + + uint64_t qh = xh / y; + uint64_t rh = xh - qh*y; + + /* Simple zl shortcut */ + + if( !rh ) { *_zh = qh; *_zl = xl / y; return 0; } + + /* At this point, zh is qh and zl is floor( (rh 2^64 + xl) / y ) where + rh is in (0,y) and y is a positive non-power of 2. + + Normalize this by noting for: + q=n/d; r=n-q*d=n%d + we have: + -> n = q d + r where q = floor(n/d) and r in [0,d). + -> n w = q d w + r w for some integer w>0 + That is, if we scale up both n and d by w, it doesn't affect q (it + just scales the remainder). We use a scale factor of 2^s on + and y such that y will have its most significant bit set. + This will not cause to overflow as rh is less than y at + this point. */ + + int s = 63-n; + uint64_t nh, nl; uwide_sl( &nh,&nl, rh,xl, s ); + uint64_t d = y << s; + + /* At this point zh = qh and zl is floor( (nh 2^64 + nl) / d ) where + nh is in (0,d) and d is in (2^63,2^64). */ + + uint64_t m = uwide_div_approx_init( d ); + uint64_t eh = nh; uint64_t el = nl; + uint64_t ql = UINT64_C(0); + do { + + /* At this point, n-ql*d has an error of such that: + d < 2^64 <= + (i.e. eh is non-zero). If we increment ql by the correction + floor(/d), we'd be at the solution but computing this is + as hard as the original problem. We do have the ability to + very quickly compute a ~32-bit accurate estimate dqest of + floor(/d) such that: + 1 <= eh <= dqest <= floor(/d) <= floor( 0 */ + uwide_mul( &eh,&el, ql, d ); /* = ql*d */ + uwide_sub( &eh,&el, nh,nl, eh,el, UINT64_C(0) ); /* = - ql d */ + } while( eh ); + + /* At this point, n - ql*d has an error less than 2^64 so we can + directly compute the remaining correction. */ + + ql += el/d; + + *_zh = qh; *_zl = ql; return 0; +} + +/* Same as the above but returns the value of the remainder too. + For non-zero y, remainder will be in [0,y). If y==0, returns + =0 with a remainder of UINT64_MAX (to signal error). */ + +static inline uint64_t +uwide_divrem( uint64_t * _zh, uint64_t * _zl, + uint64_t xh, uint64_t xl, + uint64_t y ) { + + if( !y ) { *_zh = UINT64_C(0); *_zl = UINT64_C(0); return UINT64_MAX; } + if( !xh ) { uint64_t ql = xl / y; uint64_t r = xl - ql*y; *_zh = UINT64_C(0); *_zl = ql; return r; } + + int n = log2_uint64( y ); + if( !(y & (y-UINT64_C(1))) ) { int s = 64-n; uwide_sr( _zh,_zl, xh,xl, n ); return n ? 
((xl << s) >> s) : UINT64_C(0); } + + uint64_t qh = xh / y; + uint64_t rh = xh - qh*y; + + if( !rh ) { uint64_t ql = xl / y; uint64_t r = xl - ql*y; *_zh = qh; *_zl = ql; return r; } + + int s = 63-n; + uint64_t nh, nl; uwide_sl( &nh,&nl, rh,xl, s ); + uint64_t d = y << s; + + uint64_t m = uwide_div_approx_init( d ); + uint64_t eh = nh; uint64_t el = nl; + uint64_t ql = UINT64_C(0); + do { + ql += uwide_div_approx( eh, m ); + uwide_mul( &eh,&el, ql, d ); + uwide_sub( &eh,&el, nh,nl, eh,el, UINT64_C(0) ); + } while( eh ); + + uint64_t dq = el / d; + uint64_t r = (el - dq*d) >> s; + ql += dq; + + *_zh = qh; *_zl = ql; return r; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _pyth_oracle_util_uwide_h_ */
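A minimal sketch (not part of this patch) of how the uwide_div / uwide_divrem paths could be cross-checked against the compiler's native 128-bit division, in the same style as test_uwide.c. The file name test_uwide_div.c, the iteration count and the zero prng seeding are illustrative assumptions; only the prng_* and uwide_* APIs declared in this patch, plus the same __int128 extension test_uwide.c already relies on, are used.

#include <stdio.h>
#include "util.h"

__extension__ typedef unsigned __int128 uint128_t;

static inline uint64_t  split_hi( uint128_t x ) { return (uint64_t)(x>>64); }
static inline uint64_t  split_lo( uint128_t x ) { return (uint64_t) x;      }
static inline uint128_t join( uint64_t h, uint64_t l ) { return (((uint128_t)h)<<64) | ((uint128_t)l); }

int
main( int argc,
      char ** argv ) {
  (void)argc; (void)argv;

  prng_t _prng[1];
  prng_t * prng = prng_join( prng_new( _prng, (uint32_t)0, (uint64_t)0 ) );

  for( int i=0; i<10000000; i++ ) {

    /* Random numerator and denominator with variable bit widths to
       stress the leading-zero handling in uwide_div */

    uint32_t t  = prng_uint32( prng );
    int      nx = 1+(((int)t) & 127); t >>= 7;                 /* numerator width, in [1,128] */
    int      ny = 1+(((int)t) &  63);                          /* denominator width, in [1,64] */
    uint128_t x = join( prng_uint64( prng ), prng_uint64( prng ) ) >> (128-nx);
    uint64_t  y = prng_uint64( prng ) >> (64-ny);              /* may be zero (exercises the error path) */

    uint64_t xh = split_hi( x ); uint64_t xl = split_lo( x );

    /* uwide_div: quotient and exception flag must match the reference */

    uint64_t zh,zl; int ex = uwide_div( &zh,&zl, xh,xl, y );
    if( y ) {
      uint128_t q0 = x / (uint128_t)y;
      if( ex || join( zh,zl )!=q0 ) { printf( "FAIL (div iter %i)\n", i ); return 1; }
    } else if( !ex || zh || zl ) { printf( "FAIL (div by 0 iter %i)\n", i ); return 1; }

    /* uwide_divrem: additionally check the returned remainder */

    uint64_t r = uwide_divrem( &zh,&zl, xh,xl, y );
    if( y ) {
      uint128_t q0 = x / (uint128_t)y;
      if( join( zh,zl )!=q0 || r!=(uint64_t)(x - q0*(uint128_t)y) ) {
        printf( "FAIL (divrem iter %i)\n", i ); return 1;
      }
    } else if( zh || zl || r!=UINT64_MAX ) { printf( "FAIL (divrem by 0 iter %i)\n", i ); return 1; }
  }

  prng_delete( prng_leave( prng ) );

  printf( "pass\n" );
  return 0;
}

The same harness shape extends naturally to uwide_add / uwide_sub carry and borrow propagation if broader coverage is wanted.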