diff --git a/README.md b/README.md index 7b737c8..191fe73 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,7 @@ For CPU: ```bash $ conda env create -f environment-cpu.yml $ conda activate pyhpc-bench-cpu +$ sudo python -m pyperf system tune ``` GPU: diff --git a/backends.py b/backends.py index 37b4919..3e18e9a 100644 --- a/backends.py +++ b/backends.py @@ -123,6 +123,52 @@ def setup_numba(device="cpu"): yield numba +@setup_function +def setup_transonic_pythran(device="cpu"): + if device == "gpu": + raise RuntimeError("Pythran on GPU requires adding OpenMP annotations") + else: + os.environ.update( + OMP_NUM_THREADS="1", + CFLAGS="-O3 -ffast-math", + ) + + import logging + import transonic as ts + from transonic.log import logger + import pythran + + yield pythran + + # For JIT this is good to have to wait in the first run. + # FIXME: The "setup" function is called multiple times by the benchmark script run.py + logger.setLevel(logging.ERROR) # To avoid INFO logs "Wait for all extensions" multiple times. + ts.wait_for_all_extensions() + + +@setup_function +def setup_transonic_cython(device="cpu"): + if device == "gpu": + raise RuntimeError("Cython on GPU in not implemented in Transonic.") + else: + os.environ.update( + OMP_NUM_THREADS="1", + CFLAGS="-O3 -ffast-math", + ) + + import logging + import transonic as ts + from transonic.log import logger + import cython + + yield cython + + # For JIT this is good to have to wait in the first run. + # FIXME: The "setup" function is called multiple times by the benchmark script run.py + logger.setLevel(logging.ERROR) # To avoid INFO logs "Wait for all extensions" multiple times. + ts.wait_for_all_extensions() + + @setup_function def setup_cupy(device="cpu"): if device != "gpu": @@ -203,4 +249,6 @@ def setup_tensorflow(device="cpu"): "numba": setup_numba, "pytorch": setup_pytorch, "tensorflow": setup_tensorflow, + "transonic_pythran": setup_transonic_pythran, + "transonic_cython": setup_transonic_cython, } diff --git a/benchmarks/equation_of_state/__init__.py b/benchmarks/equation_of_state/__init__.py index 3c0063a..4f6230a 100644 --- a/benchmarks/equation_of_state/__init__.py +++ b/benchmarks/equation_of_state/__init__.py @@ -43,4 +43,6 @@ def get_callable(backend, size, device="cpu"): "numpy", "pytorch", "tensorflow", + "transonic_pythran", + "transonic_cython", ) diff --git a/benchmarks/equation_of_state/eos_transonic_cython.py b/benchmarks/equation_of_state/eos_transonic_cython.py new file mode 100644 index 0000000..1ac08c1 --- /dev/null +++ b/benchmarks/equation_of_state/eos_transonic_cython.py @@ -0,0 +1,270 @@ +""" +========================================================================== + in-situ density, dynamic enthalpy and derivatives + from Absolute Salinity and Conservative + Temperature, using the computationally-efficient 48-term expression for + density in terms of SA, CT and p (IOC et al., 2010). +========================================================================== +""" + +import numpy as np +from transonic import jit +from transonic.typing import Tuple + + +v01 = 9.998420897506056e2 +v02 = 2.839940833161907e0 +v03 = -3.147759265588511e-2 +v04 = 1.181805545074306e-3 +v05 = -6.698001071123802e0 +v06 = -2.986498947203215e-2 +v07 = 2.327859407479162e-4 +v08 = -3.988822378968490e-2 +v09 = 5.095422573880500e-4 +v10 = -1.426984671633621e-5 +v11 = 1.645039373682922e-7 +v12 = -2.233269627352527e-2 +v13 = -3.436090079851880e-4 +v14 = 3.726050720345733e-6 +v15 = -1.806789763745328e-4 +v16 = 6.876837219536232e-7 +v17 = -3.087032500374211e-7 +v18 = -1.988366587925593e-8 +v19 = -1.061519070296458e-11 +v20 = 1.550932729220080e-10 +v21 = 1.0e0 +v22 = 2.775927747785646e-3 +v23 = -2.349607444135925e-5 +v24 = 1.119513357486743e-6 +v25 = 6.743689325042773e-10 +v26 = -7.521448093615448e-3 +v27 = -2.764306979894411e-5 +v28 = 1.262937315098546e-7 +v29 = 9.527875081696435e-10 +v30 = -1.811147201949891e-11 +v31 = -3.303308871386421e-5 +v32 = 3.801564588876298e-7 +v33 = -7.672876869259043e-9 +v34 = -4.634182341116144e-11 +v35 = 2.681097235569143e-12 +v36 = 5.419326551148740e-6 +v37 = -2.742185394906099e-5 +v38 = -3.212746477974189e-7 +v39 = 3.191413910561627e-9 +v40 = -1.931012931541776e-12 +v41 = -1.105097577149576e-7 +v42 = 6.211426728363857e-10 +v43 = -1.119011592875110e-10 +v44 = -1.941660213148725e-11 +v45 = -1.864826425365600e-14 +v46 = 1.119522344879478e-14 +v47 = -1.200507748551599e-15 +v48 = 6.057902487546866e-17 +rho0 = 1024.0 + + +# NOTE: no type annotations are required with the `jit` decorator. +def gsw_dHdT(sa, ct, p): + """ + d/dT of dynamic enthalpy, analytical derivative + sa : Absolute Salinity [g/kg] + ct : Conservative Temperature [deg C] + p : sea pressure [dbar] + """ + t1 = v45 * ct + t2 = 0.2e1 * t1 + t3 = v46 * sa + t4 = 0.5 * v12 + t5 = v14 * ct + t7 = ct * (v13 + t5) + t8 = 0.5 * t7 + t11 = sa * (v15 + v16 * ct) + t12 = 0.5 * t11 + t13 = t4 + t8 + t12 + t15 = v19 * ct + t19 = v17 + ct * (v18 + t15) + v20 * sa + t20 = 1.0 / t19 + t24 = v47 + v48 * ct + t25 = 0.5 * v13 + t26 = 1.0 * t5 + t27 = sa * v16 + t28 = 0.5 * t27 + t29 = t25 + t26 + t28 + t33 = t24 * t13 + t34 = t19 ** 2 + t35 = 1.0 / t34 + t37 = v18 + 2.0 * t15 + t38 = t35 * t37 + t48 = ct * (v44 + t1 + t3) + t57 = v40 * ct + t59 = ct * (v39 + t57) + t64 = t13 ** 2 + t68 = t20 * t29 + t71 = t24 * t64 + t74 = v04 * ct + t76 = ct * (v03 + t74) + t79 = v07 * ct + t82 = np.sqrt(sa) + t83 = v11 * ct + t85 = ct * (v10 + t83) + t92 = ( + v01 + + ct * (v02 + t76) + + sa * (v05 + ct * (v06 + t79) + t82 * (v08 + ct * (v09 + t85))) + ) + t93 = v48 * t92 + t105 = ( + v02 + + t76 + + ct * (v03 + 2.0 * t74) + + sa * (v06 + 2.0 * t79 + t82 * (v09 + t85 + ct * (v10 + 2.0 * t83))) + ) + t106 = t24 * t105 + t107 = v44 + t2 + t3 + t110 = v43 + t48 + t117 = t24 * t92 + t120 = 4.0 * t71 * t20 - t117 - 2.0 * t110 * t13 + t123 = ( + v38 + + t59 + + ct * (v39 + 2.0 * t57) + + sa * v42 + + ( + 4.0 * v48 * t64 * t20 + + 8.0 * t33 * t68 + - 4.0 * t71 * t38 + - t93 + - t106 + - 2.0 * t107 * t13 + - 2.0 * t110 * t29 + ) + * t20 + - t120 * t35 * t37 + ) + t128 = t19 * p + t130 = p * (1.0 * v12 + 1.0 * t7 + 1.0 * t11 + t128) + t131 = 1.0 / t92 + t133 = 1.0 + t130 * t131 + t134 = np.log(t133) + t143 = v37 + ct * (v38 + t59) + sa * (v41 + v42 * ct) + t120 * t20 + t152 = t37 * p + t156 = t92 ** 2 + t165 = v25 * ct + t167 = ct * (v24 + t165) + t169 = ct * (v23 + t167) + t175 = v30 * ct + t177 = ct * (v29 + t175) + t179 = ct * (v28 + t177) + t185 = v35 * ct + t187 = ct * (v34 + t185) + t189 = ct * (v33 + t187) + t199 = t13 * t20 + t217 = 2.0 * t117 * t199 - t110 * t92 + t234 = ( + v21 + + ct * (v22 + t169) + + sa * (v26 + ct * (v27 + t179) + v36 * sa + t82 * (v31 + ct * (v32 + t189))) + + t217 * t20 + ) + t241 = t64 - t92 * t19 + t242 = np.sqrt(t241) + t243 = 1.0 / t242 + t244 = t4 + t8 + t12 - t242 + t245 = 1.0 / t244 + t247 = t4 + t8 + t12 + t242 + t128 + t248 = 1.0 / t247 + t249 = t242 * t245 * t248 + t252 = 1.0 + 2.0 * t128 * t249 + t253 = np.log(t252) + t254 = t243 * t253 + t259 = t234 * t19 - t143 * t13 + t264 = t259 * t20 + t272 = 2.0 * t13 * t29 - t105 * t19 - t92 * t37 + t282 = t128 * t242 + t283 = t244 ** 2 + t287 = t243 * t272 / 2.0 + t292 = t247 ** 2 + t305 = ( + 0.1e5 + * p + * ( + v44 + + t2 + + t3 + - 2.0 * v48 * t13 * t20 + - 2.0 * t24 * t29 * t20 + + 2.0 * t33 * t38 + + 0.5 * v48 * p + ) + * t20 + - 0.1e5 * p * (v43 + t48 - 2.0 * t33 * t20 + 0.5 * t24 * p) * t38 + + 0.5e4 * t123 * t20 * t134 + - 0.5e4 * t143 * t35 * t134 * t37 + + 0.5e4 + * t143 + * t20 + * (p * (1.0 * v13 + 2.0 * t5 + 1.0 * t27 + t152) * t131 - t130 / t156 * t105) + / t133 + + 0.5e4 + * ( + ( + v22 + + t169 + + ct * (v23 + t167 + ct * (v24 + 2.0 * t165)) + + sa + * ( + v27 + + t179 + + ct * (v28 + t177 + ct * (v29 + 2.0 * t175)) + + t82 * (v32 + t189 + ct * (v33 + t187 + ct * (v34 + 2.0 * t185))) + ) + + ( + 2.0 * t93 * t199 + + 2.0 * t106 * t199 + + 2.0 * t117 * t68 + - 2.0 * t117 * t13 * t35 * t37 + - t107 * t92 + - t110 * t105 + ) + * t20 + - t217 * t35 * t37 + ) + * t19 + + t234 * t37 + - t123 * t13 + - t143 * t29 + ) + * t20 + * t254 + - 0.5e4 * t259 * t35 * t254 * t37 + - 0.25e4 * t264 / t242 / t241 * t253 * t272 + + 0.5e4 + * t264 + * t243 + * ( + 2.0 * t152 * t249 + + t128 * t243 * t245 * t248 * t272 + - 2.0 * t282 / t283 * t248 * (t25 + t26 + t28 - t287) + - 2.0 * t282 * t245 / t292 * (t25 + t26 + t28 + t287 + t152) + ) + / t252 + ) + + return t305 + + +@jit(backend="cython", native=True) +def gsw_dHdT_vec(sa, ct, p, out): + ix: int = sa.shape[0] + iy: int = sa.shape[1] + iz: int = sa.shape[2] + for i in range(ix): + for j in range(iy): + for k in range(iz): + out[i, j, k] = gsw_dHdT(sa[i, j, k], ct[i, j, k], p[0, 0, k]) + + +def run(sa, ct, p, device="cpu"): + out = np.empty_like(sa) + gsw_dHdT_vec(sa, ct, p, out) + return out diff --git a/benchmarks/equation_of_state/eos_transonic_pythran.py b/benchmarks/equation_of_state/eos_transonic_pythran.py new file mode 100644 index 0000000..73054e1 --- /dev/null +++ b/benchmarks/equation_of_state/eos_transonic_pythran.py @@ -0,0 +1,274 @@ +""" +========================================================================== + in-situ density, dynamic enthalpy and derivatives + from Absolute Salinity and Conservative + Temperature, using the computationally-efficient 48-term expression for + density in terms of SA, CT and p (IOC et al., 2010). +========================================================================== +""" + +import numpy as np +from transonic import jit + + +v01 = 9.998420897506056e2 +v02 = 2.839940833161907e0 +v03 = -3.147759265588511e-2 +v04 = 1.181805545074306e-3 +v05 = -6.698001071123802e0 +v06 = -2.986498947203215e-2 +v07 = 2.327859407479162e-4 +v08 = -3.988822378968490e-2 +v09 = 5.095422573880500e-4 +v10 = -1.426984671633621e-5 +v11 = 1.645039373682922e-7 +v12 = -2.233269627352527e-2 +v13 = -3.436090079851880e-4 +v14 = 3.726050720345733e-6 +v15 = -1.806789763745328e-4 +v16 = 6.876837219536232e-7 +v17 = -3.087032500374211e-7 +v18 = -1.988366587925593e-8 +v19 = -1.061519070296458e-11 +v20 = 1.550932729220080e-10 +v21 = 1.0e0 +v22 = 2.775927747785646e-3 +v23 = -2.349607444135925e-5 +v24 = 1.119513357486743e-6 +v25 = 6.743689325042773e-10 +v26 = -7.521448093615448e-3 +v27 = -2.764306979894411e-5 +v28 = 1.262937315098546e-7 +v29 = 9.527875081696435e-10 +v30 = -1.811147201949891e-11 +v31 = -3.303308871386421e-5 +v32 = 3.801564588876298e-7 +v33 = -7.672876869259043e-9 +v34 = -4.634182341116144e-11 +v35 = 2.681097235569143e-12 +v36 = 5.419326551148740e-6 +v37 = -2.742185394906099e-5 +v38 = -3.212746477974189e-7 +v39 = 3.191413910561627e-9 +v40 = -1.931012931541776e-12 +v41 = -1.105097577149576e-7 +v42 = 6.211426728363857e-10 +v43 = -1.119011592875110e-10 +v44 = -1.941660213148725e-11 +v45 = -1.864826425365600e-14 +v46 = 1.119522344879478e-14 +v47 = -1.200507748551599e-15 +v48 = 6.057902487546866e-17 +rho0 = 1024.0 + + +# NOTE: no type annotations are required with the `jit` decorator. +@jit(native=True, xsimd=True) +def gsw_dHdT(sa, ct, p): + """ + d/dT of dynamic enthalpy, analytical derivative + sa : Absolute Salinity [g/kg] + ct : Conservative Temperature [deg C] + p : sea pressure [dbar] + """ + t1 = v45 * ct + t2 = 0.2e1 * t1 + t3 = v46 * sa + t4 = 0.5 * v12 + t5 = v14 * ct + t7 = ct * (v13 + t5) + t8 = 0.5 * t7 + t11 = sa * (v15 + v16 * ct) + t12 = 0.5 * t11 + t13 = t4 + t8 + t12 + t15 = v19 * ct + t19 = v17 + ct * (v18 + t15) + v20 * sa + t20 = 1.0 / t19 + t24 = v47 + v48 * ct + t25 = 0.5 * v13 + t26 = 1.0 * t5 + t27 = sa * v16 + t28 = 0.5 * t27 + t29 = t25 + t26 + t28 + t33 = t24 * t13 + t34 = t19 ** 2 + t35 = 1.0 / t34 + t37 = v18 + 2.0 * t15 + t38 = t35 * t37 + t48 = ct * (v44 + t1 + t3) + t57 = v40 * ct + t59 = ct * (v39 + t57) + t64 = t13 ** 2 + t68 = t20 * t29 + t71 = t24 * t64 + t74 = v04 * ct + t76 = ct * (v03 + t74) + t79 = v07 * ct + t82 = np.sqrt(sa) + t83 = v11 * ct + t85 = ct * (v10 + t83) + t92 = ( + v01 + + ct * (v02 + t76) + + sa * (v05 + ct * (v06 + t79) + t82 * (v08 + ct * (v09 + t85))) + ) + t93 = v48 * t92 + t105 = ( + v02 + + t76 + + ct * (v03 + 2.0 * t74) + + sa * (v06 + 2.0 * t79 + t82 * (v09 + t85 + ct * (v10 + 2.0 * t83))) + ) + t106 = t24 * t105 + t107 = v44 + t2 + t3 + t110 = v43 + t48 + t117 = t24 * t92 + t120 = 4.0 * t71 * t20 - t117 - 2.0 * t110 * t13 + t123 = ( + v38 + + t59 + + ct * (v39 + 2.0 * t57) + + sa * v42 + + ( + 4.0 * v48 * t64 * t20 + + 8.0 * t33 * t68 + - 4.0 * t71 * t38 + - t93 + - t106 + - 2.0 * t107 * t13 + - 2.0 * t110 * t29 + ) + * t20 + - t120 * t35 * t37 + ) + t128 = t19 * p + t130 = p * (1.0 * v12 + 1.0 * t7 + 1.0 * t11 + t128) + t131 = 1.0 / t92 + t133 = 1.0 + t130 * t131 + t134 = np.log(t133) + t143 = v37 + ct * (v38 + t59) + sa * (v41 + v42 * ct) + t120 * t20 + t152 = t37 * p + t156 = t92 ** 2 + t165 = v25 * ct + t167 = ct * (v24 + t165) + t169 = ct * (v23 + t167) + t175 = v30 * ct + t177 = ct * (v29 + t175) + t179 = ct * (v28 + t177) + t185 = v35 * ct + t187 = ct * (v34 + t185) + t189 = ct * (v33 + t187) + t199 = t13 * t20 + t217 = 2.0 * t117 * t199 - t110 * t92 + t234 = ( + v21 + + ct * (v22 + t169) + + sa * (v26 + ct * (v27 + t179) + v36 * sa + t82 * (v31 + ct * (v32 + t189))) + + t217 * t20 + ) + t241 = t64 - t92 * t19 + t242 = np.sqrt(t241) + t243 = 1.0 / t242 + t244 = t4 + t8 + t12 - t242 + t245 = 1.0 / t244 + t247 = t4 + t8 + t12 + t242 + t128 + t248 = 1.0 / t247 + t249 = t242 * t245 * t248 + t252 = 1.0 + 2.0 * t128 * t249 + t253 = np.log(t252) + t254 = t243 * t253 + t259 = t234 * t19 - t143 * t13 + t264 = t259 * t20 + t272 = 2.0 * t13 * t29 - t105 * t19 - t92 * t37 + t282 = t128 * t242 + t283 = t244 ** 2 + t287 = t243 * t272 / 2.0 + t292 = t247 ** 2 + t305 = ( + 0.1e5 + * p + * ( + v44 + + t2 + + t3 + - 2.0 * v48 * t13 * t20 + - 2.0 * t24 * t29 * t20 + + 2.0 * t33 * t38 + + 0.5 * v48 * p + ) + * t20 + - 0.1e5 * p * (v43 + t48 - 2.0 * t33 * t20 + 0.5 * t24 * p) * t38 + + 0.5e4 * t123 * t20 * t134 + - 0.5e4 * t143 * t35 * t134 * t37 + + 0.5e4 + * t143 + * t20 + * (p * (1.0 * v13 + 2.0 * t5 + 1.0 * t27 + t152) * t131 - t130 / t156 * t105) + / t133 + + 0.5e4 + * ( + ( + v22 + + t169 + + ct * (v23 + t167 + ct * (v24 + 2.0 * t165)) + + sa + * ( + v27 + + t179 + + ct * (v28 + t177 + ct * (v29 + 2.0 * t175)) + + t82 * (v32 + t189 + ct * (v33 + t187 + ct * (v34 + 2.0 * t185))) + ) + + ( + 2.0 * t93 * t199 + + 2.0 * t106 * t199 + + 2.0 * t117 * t68 + - 2.0 * t117 * t13 * t35 * t37 + - t107 * t92 + - t110 * t105 + ) + * t20 + - t217 * t35 * t37 + ) + * t19 + + t234 * t37 + - t123 * t13 + - t143 * t29 + ) + * t20 + * t254 + - 0.5e4 * t259 * t35 * t254 * t37 + - 0.25e4 * t264 / t242 / t241 * t253 * t272 + + 0.5e4 + * t264 + * t243 + * ( + 2.0 * t152 * t249 + + t128 * t243 * t245 * t248 * t272 + - 2.0 * t282 / t283 * t248 * (t25 + t26 + t28 - t287) + - 2.0 * t282 * t245 / t292 * (t25 + t26 + t28 + t287 + t152) + ) + / t252 + ) + + return t305 + + +def run(sa, ct, p, device="cpu"): + return gsw_dHdT(sa, ct, p) + + +# One could also use loops and in-place computation as done with Numba: + +# @jit(native=True, xsimd=True) +# def gsw_dHdT_vec(sa, ct, p, out): +# ix, iy, iz = sa.shape +# for i in range(ix): +# for j in range(iy): +# for k in range(iz): +# out[i, j, k] = gsw_dHdT(sa[i, j, k], ct[i, j, k], p[0, 0, k]) +# +# +# def run(sa, ct, p, device="cpu"): +# out = np.empty_like(sa) +# gsw_dHdT_vec(sa, ct, p, out) +# return out diff --git a/benchmarks/isoneutral_mixing/__init__.py b/benchmarks/isoneutral_mixing/__init__.py index 952563b..985424d 100644 --- a/benchmarks/isoneutral_mixing/__init__.py +++ b/benchmarks/isoneutral_mixing/__init__.py @@ -83,4 +83,6 @@ def get_callable(backend, size, device="cpu"): "numpy", "jax", "pytorch", + "transonic_pythran", + "transonic_cython", ) diff --git a/benchmarks/isoneutral_mixing/isoneutral_transonic_cython.py b/benchmarks/isoneutral_mixing/isoneutral_transonic_cython.py new file mode 100644 index 0000000..9316887 --- /dev/null +++ b/benchmarks/isoneutral_mixing/isoneutral_transonic_cython.py @@ -0,0 +1,290 @@ +import numpy as np +from transonic import jit + + +def get_drhodT(salt, temp, p): + rho0 = 1024.0 + z0 = 0.0 + theta0 = 283.0 - 273.15 + grav = 9.81 + betaT = 1.67e-4 + betaTs = 1e-5 + gammas = 1.1e-8 + + zz = -p - z0 + thetas = temp - theta0 + return -(betaTs * thetas + betaT * (1 - gammas * grav * zz * rho0)) * rho0 + + +def get_drhodS(salt, temp, p): + betaS = 0.78e-3 + rho0 = 1024.0 + return betaS * rho0 + + +def dm_taper(sx): + """ + tapering function for isopycnal slopes + """ + iso_slopec = 1e-3 + iso_dslope = 1e-3 + return 0.5 * (1.0 + np.tanh((-np.abs(sx) + iso_slopec) / iso_dslope)) + + +@jit(backend="cython", native=True) +def isoneutral_diffusion_pre( + maskT, + maskU, + maskV, + maskW, + dxt, + dxu, + dyt, + dyu, + dzt, + dzw, + cost, + cosu, + salt, + temp, + zt, + K_iso, + K_11, + K_22, + K_33, + Ai_ez, + Ai_nz, + Ai_bx, + Ai_by, +): + """ + Isopycnal diffusion for tracer + following functional formulation by Griffies et al + Code adopted from MOM2.1 + """ + nx: int = maskT.shape[0] + ny: int = maskT.shape[1] + nz: int = maskT.shape[2] + + + epsln: float = 1e-20 + K_iso_steep: float = 50.0 + tau: int = 0 + + drdT = np.empty_like(K_11) + drdS = np.empty_like(K_11) + dTdx = np.empty_like(K_11) + dSdx = np.empty_like(K_11) + dTdy = np.empty_like(K_11) + dSdy = np.empty_like(K_11) + dTdz = np.empty_like(K_11) + dSdz = np.empty_like(K_11) + + """ + drho_dt and drho_ds at centers of T cells + """ + for i in range(nx): + for j in range(ny): + for k in range(nz): + drdT[i, j, k] = maskT[i, j, k] * get_drhodT( + salt[i, j, k, tau], temp[i, j, k, tau], np.abs(zt[k]) + ) + drdS[i, j, k] = maskT[i, j, k] * get_drhodS( + salt[i, j, k, tau], temp[i, j, k, tau], np.abs(zt[k]) + ) + + """ + gradients at top face of T cells + """ + for i in range(nx): + for j in range(ny): + for k in range(nz - 1): + dTdz[i, j, k] = ( + maskW[i, j, k] + * (temp[i, j, k + 1, tau] - temp[i, j, k, tau]) + / dzw[k] + ) + dSdz[i, j, k] = ( + maskW[i, j, k] + * (salt[i, j, k + 1, tau] - salt[i, j, k, tau]) + / dzw[k] + ) + dTdz[i, j, -1] = 0.0 + dSdz[i, j, -1] = 0.0 + + """ + gradients at eastern face of T cells + """ + for i in range(nx - 1): + for j in range(ny): + for k in range(nz): + dTdx[i, j, k] = ( + maskU[i, j, k] + * (temp[i + 1, j, k, tau] - temp[i, j, k, tau]) + / (dxu[i] * cost[j]) + ) + dSdx[i, j, k] = ( + maskU[i, j, k] + * (salt[i + 1, j, k, tau] - salt[i, j, k, tau]) + / (dxu[i] * cost[j]) + ) + dTdx[-1, :, :] = 0.0 + dSdx[-1, :, :] = 0.0 + + """ + gradients at northern face of T cells + """ + for i in range(nx): + for j in range(ny - 1): + for k in range(nz): + dTdy[i, j, k] = ( + maskV[i, j, k] + * (temp[i, j + 1, k, tau] - temp[i, j, k, tau]) + / dyu[j] + ) + dSdy[i, j, k] = ( + maskV[i, j, k] + * (salt[i, j + 1, k, tau] - salt[i, j, k, tau]) + / dyu[j] + ) + dTdy[:, -1, :] = 0.0 + dSdy[:, -1, :] = 0.0 + + """ + Compute Ai_ez and K11 on center of east face of T cell. + """ + for i in range(1, nx - 2): + for j in range(2, ny - 2): + for k in range(0, nz): + if k == 0: + diffloc = 0.5 * (K_iso[i, j, k] + K_iso[i + 1, j, k]) + else: + diffloc = 0.25 * ( + K_iso[i, j, k] + + K_iso[i, j, k - 1] + + K_iso[i + 1, j, k] + + K_iso[i + 1, j, k - 1] + ) + + sumz = 0.0 + + for kr in (0, 1): + if k == 0 and kr == 0: + continue + + for ip in (0, 1): + drodxe = ( + drdT[i + ip, j, k] * dTdx[i, j, k] + + drdS[i + ip, j, k] * dSdx[i, j, k] + ) + drodze = ( + drdT[i + ip, j, k] * dTdz[i + ip, j, k + kr - 1] + + drdS[i + ip, j, k] * dSdz[i + ip, j, k + kr - 1] + ) + sxe = -drodxe / (min(0.0, drodze) - epsln) + taper = dm_taper(sxe) + sumz += ( + dzw[k + kr - 1] + * maskU[i, j, k] + * max(K_iso_steep, diffloc * taper) + ) + Ai_ez[i, j, k, ip, kr] = taper * sxe * maskU[i, j, k] + + K_11[i, j, k] = sumz / (4.0 * dzt[k]) + + """ + Compute Ai_nz and K_22 on center of north face of T cell. + """ + for i in range(2, nx - 2): + for j in range(1, ny - 2): + for k in range(0, nz): + if k == 0: + diffloc = 0.5 * (K_iso[i, j, k] + K_iso[i, j + 1, k]) + else: + diffloc = 0.25 * ( + K_iso[i, j, k] + + K_iso[i, j, k - 1] + + K_iso[i, j + 1, k] + + K_iso[i, j + 1, k - 1] + ) + + sumz = 0.0 + + for kr in (0, 1): + if k == 0 and kr == 0: + continue + + for jp in (0, 1): + drodyn = ( + drdT[i, j + jp, k] * dTdy[i, j, k] + + drdS[i, j + jp, k] * dSdy[i, j, k] + ) + drodzn = ( + drdT[i, j + jp, k] * dTdz[i, j + jp, k + kr - 1] + + drdS[i, j + jp, k] * dSdz[i, j + jp, k + kr - 1] + ) + syn = -drodyn / (min(0.0, drodzn) - epsln) + taper = dm_taper(syn) + sumz += ( + dzw[k + kr - 1] + * maskV[i, j, k] + * max(K_iso_steep, diffloc * taper) + ) + Ai_nz[i, j, k, jp, kr] = taper * syn * maskV[i, j, k] + + K_22[i, j, k] = sumz / (4.0 * dzt[k]) + + """ + compute Ai_bx, Ai_by and K33 on top face of T cell. + """ + for i in range(2, nx - 2): + for j in range(2, ny - 2): + for k in range(nz - 1): + sumx = 0.0 + sumy = 0.0 + + for kr in (0, 1): + drodzb = ( + drdT[i, j, k + kr] * dTdz[i, j, k] + + drdS[i, j, k + kr] * dSdz[i, j, k] + ) + + # eastward slopes at the top of T cells + for ip in (0, 1): + drodxb = ( + drdT[i, j, k + kr] * dTdx[i - 1 + ip, j, k + kr] + + drdS[i, j, k + kr] * dSdx[i - 1 + ip, j, k + kr] + ) + sxb = -drodxb / (min(0.0, drodzb) - epsln) + taper = dm_taper(sxb) + sumx += ( + dxu[i - 1 + ip] + * K_iso[i, j, k] + * taper + * sxb ** 2 + * maskW[i, j, k] + ) + Ai_bx[i, j, k, ip, kr] = taper * sxb * maskW[i, j, k] + + # northward slopes at the top of T cells + for jp in (0, 1): + facty = cosu[j - 1 + jp] * dyu[j - 1 + jp] + drodyb = ( + drdT[i, j, k + kr] * dTdy[i, j + jp - 1, k + kr] + + drdS[i, j, k + kr] * dSdy[i, j + jp - 1, k + kr] + ) + syb = -drodyb / (min(0.0, drodzb) - epsln) + taper = dm_taper(syb) + sumy += ( + facty * K_iso[i, j, k] * taper * syb ** 2 * maskW[i, j, k] + ) + Ai_by[i, j, k, jp, kr] = taper * syb * maskW[i, j, k] + + K_33[i, j, k] = sumx / (4 * dxt[i]) + sumy / (4 * dyt[j] * cost[j]) + + K_33[i, j, -1] = 0.0 + + +def run(*inputs, device="cpu"): + isoneutral_diffusion_pre(*inputs) + return inputs[-7:] diff --git a/benchmarks/isoneutral_mixing/isoneutral_transonic_pythran.py b/benchmarks/isoneutral_mixing/isoneutral_transonic_pythran.py new file mode 100644 index 0000000..aadc76a --- /dev/null +++ b/benchmarks/isoneutral_mixing/isoneutral_transonic_pythran.py @@ -0,0 +1,254 @@ +import numpy as np +from transonic import jit + + +def get_drhodT(salt, temp, p): + rho0 = 1024.0 + z0 = 0.0 + theta0 = 283.0 - 273.15 + grav = 9.81 + betaT = 1.67e-4 + betaTs = 1e-5 + gammas = 1.1e-8 + + zz = -p - z0 + thetas = temp - theta0 + return -(betaTs * thetas + betaT * (1 - gammas * grav * zz * rho0)) * rho0 + + +def get_drhodS(salt, temp, p): + betaS = 0.78e-3 + rho0 = 1024.0 + return betaS * rho0 * np.ones_like(temp) + + +@jit(native=True, xsimd=True) +def isoneutral_diffusion_pre( + maskT, + maskU, + maskV, + maskW, + dxt, + dxu, + dyt, + dyu, + dzt, + dzw, + cost, + cosu, + salt, + temp, + zt, + K_iso, + K_11, + K_22, + K_33, + Ai_ez, + Ai_nz, + Ai_bx, + Ai_by, +): + """ + Isopycnal diffusion for tracer + following functional formulation by Griffies et al + Code adopted from MOM2.1 + """ + epsln = 1e-20 + iso_slopec = 1e-3 + iso_dslope = 1e-3 + K_iso_steep = 50.0 + tau = 0 + + dTdx = np.zeros_like(K_11) + dSdx = np.zeros_like(K_11) + dTdy = np.zeros_like(K_11) + dSdy = np.zeros_like(K_11) + dTdz = np.zeros_like(K_11) + dSdz = np.zeros_like(K_11) + + """ + drho_dt and drho_ds at centers of T cells + """ + drdT = maskT * get_drhodT(salt[:, :, :, tau], temp[:, :, :, tau], np.abs(zt)) + drdS = maskT * get_drhodS(salt[:, :, :, tau], temp[:, :, :, tau], np.abs(zt)) + + """ + gradients at top face of T cells + """ + dTdz[:, :, :-1] = ( + maskW[:, :, :-1] + * (temp[:, :, 1:, tau] - temp[:, :, :-1, tau]) + / dzw[np.newaxis, np.newaxis, :-1] + ) + dSdz[:, :, :-1] = ( + maskW[:, :, :-1] + * (salt[:, :, 1:, tau] - salt[:, :, :-1, tau]) + / dzw[np.newaxis, np.newaxis, :-1] + ) + + """ + gradients at eastern face of T cells + """ + dTdx[:-1, :, :] = ( + maskU[:-1, :, :] + * (temp[1:, :, :, tau] - temp[:-1, :, :, tau]) + / (dxu[:-1, np.newaxis, np.newaxis] * cost[np.newaxis, :, np.newaxis]) + ) + dSdx[:-1, :, :] = ( + maskU[:-1, :, :] + * (salt[1:, :, :, tau] - salt[:-1, :, :, tau]) + / (dxu[:-1, np.newaxis, np.newaxis] * cost[np.newaxis, :, np.newaxis]) + ) + + """ + gradients at northern face of T cells + """ + dTdy[:, :-1, :] = ( + maskV[:, :-1, :] + * (temp[:, 1:, :, tau] - temp[:, :-1, :, tau]) + / dyu[np.newaxis, :-1, np.newaxis] + ) + dSdy[:, :-1, :] = ( + maskV[:, :-1, :] + * (salt[:, 1:, :, tau] - salt[:, :-1, :, tau]) + / dyu[np.newaxis, :-1, np.newaxis] + ) + + def dm_taper(sx): + """ + tapering function for isopycnal slopes + """ + return 0.5 * (1.0 + np.tanh((-np.abs(sx) + iso_slopec) / iso_dslope)) + + """ + Compute Ai_ez and K11 on center of east face of T cell. + """ + diffloc = np.zeros_like(K_11) + diffloc[1:-2, 2:-2, 1:] = 0.25 * ( + K_iso[1:-2, 2:-2, 1:] + + K_iso[1:-2, 2:-2, :-1] + + K_iso[2:-1, 2:-2, 1:] + + K_iso[2:-1, 2:-2, :-1] + ) + diffloc[1:-2, 2:-2, 0] = 0.5 * (K_iso[1:-2, 2:-2, 0] + K_iso[2:-1, 2:-2, 0]) + + sumz = np.zeros_like(K_11)[1:-2, 2:-2] + for kr in range(2): + ki = 0 if kr == 1 else 1 + for ip in range(2): + drodxe = ( + drdT[1 + ip : -2 + ip, 2:-2, ki:] * dTdx[1:-2, 2:-2, ki:] + + drdS[1 + ip : -2 + ip, 2:-2, ki:] * dSdx[1:-2, 2:-2, ki:] + ) + drodze = ( + drdT[1 + ip : -2 + ip, 2:-2, ki:] + * dTdz[1 + ip : -2 + ip, 2:-2, : -1 + kr or None] + + drdS[1 + ip : -2 + ip, 2:-2, ki:] + * dSdz[1 + ip : -2 + ip, 2:-2, : -1 + kr or None] + ) + sxe = -drodxe / (np.minimum(0.0, drodze) - epsln) + taper = dm_taper(sxe) + sumz[:, :, ki:] += ( + dzw[np.newaxis, np.newaxis, : -1 + kr or None] + * maskU[1:-2, 2:-2, ki:] + * np.maximum(K_iso_steep, diffloc[1:-2, 2:-2, ki:] * taper) + ) + Ai_ez[1:-2, 2:-2, ki:, ip, kr] = taper * sxe * maskU[1:-2, 2:-2, ki:] + K_11[1:-2, 2:-2, :] = sumz / (4.0 * dzt[np.newaxis, np.newaxis, :]) + + """ + Compute Ai_nz and K_22 on center of north face of T cell. + """ + diffloc[:, :, :] = 0 + diffloc[2:-2, 1:-2, 1:] = 0.25 * ( + K_iso[2:-2, 1:-2, 1:] + + K_iso[2:-2, 1:-2, :-1] + + K_iso[2:-2, 2:-1, 1:] + + K_iso[2:-2, 2:-1, :-1] + ) + diffloc[2:-2, 1:-2, 0] = 0.5 * (K_iso[2:-2, 1:-2, 0] + K_iso[2:-2, 2:-1, 0]) + + sumz = np.zeros_like(K_11)[2:-2, 1:-2] + for kr in range(2): + ki = 0 if kr == 1 else 1 + for jp in range(2): + drodyn = ( + drdT[2:-2, 1 + jp : -2 + jp, ki:] * dTdy[2:-2, 1:-2, ki:] + + drdS[2:-2, 1 + jp : -2 + jp, ki:] * dSdy[2:-2, 1:-2, ki:] + ) + drodzn = ( + drdT[2:-2, 1 + jp : -2 + jp, ki:] + * dTdz[2:-2, 1 + jp : -2 + jp, : -1 + kr or None] + + drdS[2:-2, 1 + jp : -2 + jp, ki:] + * dSdz[2:-2, 1 + jp : -2 + jp, : -1 + kr or None] + ) + syn = -drodyn / (np.minimum(0.0, drodzn) - epsln) + taper = dm_taper(syn) + sumz[:, :, ki:] += ( + dzw[np.newaxis, np.newaxis, : -1 + kr or None] + * maskV[2:-2, 1:-2, ki:] + * np.maximum(K_iso_steep, diffloc[2:-2, 1:-2, ki:] * taper) + ) + Ai_nz[2:-2, 1:-2, ki:, jp, kr] = taper * syn * maskV[2:-2, 1:-2, ki:] + K_22[2:-2, 1:-2, :] = sumz / (4.0 * dzt[np.newaxis, np.newaxis, :]) + + """ + compute Ai_bx, Ai_by and K33 on top face of T cell. + """ + sumx = np.zeros_like(K_11)[2:-2, 2:-2, :-1] + sumy = np.zeros_like(K_11)[2:-2, 2:-2, :-1] + + for kr in range(2): + drodzb = ( + drdT[2:-2, 2:-2, kr : -1 + kr or None] * dTdz[2:-2, 2:-2, :-1] + + drdS[2:-2, 2:-2, kr : -1 + kr or None] * dSdz[2:-2, 2:-2, :-1] + ) + + # eastward slopes at the top of T cells + for ip in range(2): + drodxb = ( + drdT[2:-2, 2:-2, kr : -1 + kr or None] + * dTdx[1 + ip : -3 + ip, 2:-2, kr : -1 + kr or None] + + drdS[2:-2, 2:-2, kr : -1 + kr or None] + * dSdx[1 + ip : -3 + ip, 2:-2, kr : -1 + kr or None] + ) + sxb = -drodxb / (np.minimum(0.0, drodzb) - epsln) + taper = dm_taper(sxb) + sumx += ( + dxu[1 + ip : -3 + ip, np.newaxis, np.newaxis] + * K_iso[2:-2, 2:-2, :-1] + * taper + * sxb ** 2 + * maskW[2:-2, 2:-2, :-1] + ) + Ai_bx[2:-2, 2:-2, :-1, ip, kr] = taper * sxb * maskW[2:-2, 2:-2, :-1] + + # northward slopes at the top of T cells + for jp in range(2): + facty = cosu[1 + jp : -3 + jp] * dyu[1 + jp : -3 + jp] + drodyb = ( + drdT[2:-2, 2:-2, kr : -1 + kr or None] + * dTdy[2:-2, 1 + jp : -3 + jp, kr : -1 + kr or None] + + drdS[2:-2, 2:-2, kr : -1 + kr or None] + * dSdy[2:-2, 1 + jp : -3 + jp, kr : -1 + kr or None] + ) + syb = -drodyb / (np.minimum(0.0, drodzb) - epsln) + taper = dm_taper(syb) + sumy += ( + facty[np.newaxis, :, np.newaxis] + * K_iso[2:-2, 2:-2, :-1] + * taper + * syb ** 2 + * maskW[2:-2, 2:-2, :-1] + ) + Ai_by[2:-2, 2:-2, :-1, jp, kr] = taper * syb * maskW[2:-2, 2:-2, :-1] + + K_33[2:-2, 2:-2, :-1] = sumx / (4 * dxt[2:-2, np.newaxis, np.newaxis]) + sumy / ( + 4 * dyt[np.newaxis, 2:-2, np.newaxis] * cost[np.newaxis, 2:-2, np.newaxis] + ) + K_33[2:-2, 2:-2, -1] = 0.0 + + +def run(*inputs, device="cpu"): + isoneutral_diffusion_pre(*inputs) + return inputs[-7:] diff --git a/benchmarks/turbulent_kinetic_energy/__init__.py b/benchmarks/turbulent_kinetic_energy/__init__.py index 9e0baf7..514d03a 100644 --- a/benchmarks/turbulent_kinetic_energy/__init__.py +++ b/benchmarks/turbulent_kinetic_energy/__init__.py @@ -80,4 +80,6 @@ def get_callable(backend, size, device="cpu"): "numba", "numpy", "pytorch", + "transonic_pythran", + "transonic_cython", ) diff --git a/benchmarks/turbulent_kinetic_energy/tke_transonic_cython.py b/benchmarks/turbulent_kinetic_energy/tke_transonic_cython.py new file mode 100644 index 0000000..2aac95e --- /dev/null +++ b/benchmarks/turbulent_kinetic_energy/tke_transonic_cython.py @@ -0,0 +1,329 @@ +import numpy as np +from transonic.typing import Tuple +from transonic import jit + + +def solve_tridiag(a, b, c, d): + """ + Solves a tridiagonal matrix system with diagonals a, b, c and RHS vector d. + """ + assert a.shape == b.shape and a.shape == c.shape and a.shape == d.shape + + n = a.shape[0] + + for i in range(1, n): + w = a[i] / b[i - 1] + b[i] += -w * c[i - 1] + d[i] += -w * d[i - 1] + + out = np.empty_like(a) + out[-1] = d[-1] / b[-1] + + for i in range(n - 2, -1, -1): + out[i] = (d[i] - c[i] * out[i + 1]) / b[i] + + return out + + +def _calc_cr(r_jp, r_j, r_jm, vel): + """ + Calculates cr value used in superbee advection scheme + """ + eps = 1e-20 # prevent division by 0 + if abs(r_j) < eps: + fac = eps + else: + fac = r_j + + if vel > 0: + return r_jm / fac + else: + return r_jp / fac + + +def limiter(cr): + return max(0.0, max(min(1.0, 2.0 * cr), min(2.0, cr))) + + +def adv_flux_superbee_wgrid( + adv_fe, + adv_fn, + adv_ft, + var, + u_wgrid, + v_wgrid, + w_wgrid, + maskW, + dxt, + dyt, + dzw, + cost, + cosu, + dt_tracer, +): + """ + Calculates advection of a tracer defined on Wgrid + """ + nx, ny, nz = var.shape + + maskUtr = np.zeros_like(maskW) + maskUtr[:-1, :, :] = maskW[1:, :, :] * maskW[:-1, :, :] + + adv_fe[...] = 0.0 + for i in range(1, nx - 2): + for j in range(2, ny - 2): + for k in range(nz): + vel = u_wgrid[i, j, k] + u_cfl = abs(vel * dt_tracer / (cost[j] * dxt[i])) + r_jp = (var[i + 2, j, k] - var[i + 1, j, k]) * maskUtr[i + 1, j, k] + r_j = (var[i + 1, j, k] - var[i, j, k]) * maskUtr[i, j, k] + r_jm = (var[i, j, k] - var[i - 1, j, k]) * maskUtr[i - 1, j, k] + cr = limiter(_calc_cr(r_jp, r_j, r_jm, vel)) + adv_fe[i, j, k] = ( + vel * (var[i + 1, j, k] + var[i, j, k]) * 0.5 + - abs(vel) * ((1.0 - cr) + u_cfl * cr) * r_j * 0.5 + ) + + maskVtr = np.zeros_like(maskW) + maskVtr[:, :-1, :] = maskW[:, 1:, :] * maskW[:, :-1, :] + + adv_fn[...] = 0.0 + for i in range(2, nx - 2): + for j in range(1, ny - 2): + for k in range(nz): + vel = cosu[j] * v_wgrid[i, j, k] + u_cfl = abs(vel * dt_tracer / (cost[j] * dyt[j])) + r_jp = (var[i, j + 2, k] - var[i, j + 1, k]) * maskVtr[i, j + 1, k] + r_j = (var[i, j + 1, k] - var[i, j, k]) * maskVtr[i, j, k] + r_jm = (var[i, j, k] - var[i, j - 1, k]) * maskVtr[i, j - 1, k] + cr = limiter(_calc_cr(r_jp, r_j, r_jm, v_wgrid[i, j, k])) + adv_fn[i, j, k] = ( + vel * (var[i, j + 1, k] + var[i, j, k]) * 0.5 + - abs(vel) * ((1.0 - cr) + u_cfl * cr) * r_j * 0.5 + ) + + maskWtr = np.zeros_like(maskW) + maskWtr[:, :, :-1] = maskW[:, :, 1:] * maskW[:, :, :-1] + + adv_ft[...] = 0.0 + for i in range(2, nx - 2): + for j in range(2, ny - 2): + for k in range(nz - 1): + kp1 = min(nz - 2, k + 1) + kp2 = min(nz - 1, k + 2) + km1 = max(0, k - 1) + + vel = w_wgrid[i, j, k] + u_cfl = abs(vel * dt_tracer / dzw[k]) + r_jp = (var[i, j, kp2] - var[i, j, k + 1]) * maskWtr[i, j, kp1] + r_j = (var[i, j, k + 1] - var[i, j, k]) * maskWtr[i, j, k] + r_jm = (var[i, j, k] - var[i, j, km1]) * maskWtr[i, j, km1] + cr = limiter(_calc_cr(r_jp, r_j, r_jm, vel)) + adv_ft[i, j, k] = ( + vel * (var[i, j, k + 1] + var[i, j, k]) * 0.5 + - abs(vel) * ((1.0 - cr) + u_cfl * cr) * r_j * 0.5 + ) + + +@jit(backend="cython", native=True) +def integrate_tke( + u, + v, + w, + maskU, + maskV, + maskW, + dxt, + dxu, + dyt, + dyu, + dzt, + dzw, + cost, + cosu, + kbot, + kappaM, + mxl, + forc, + forc_tke_surface, + tke, + dtke, +): + i: int + j: int + k: int + nx: int = maskU.shape[0] + ny: int = maskU.shape[1] + nz: int = maskU.shape[2] + + tau: int = 0 + taup1: int = 1 + taum1: int = 2 + + dt_tracer: float = 1 + dt_mom: float = 1 + AB_eps: float = 0.1 + alpha_tke: float = 1.0 + c_eps: float = 0.7 + K_h_tke: float = 2000.0 + + flux_east = np.zeros_like(maskU) + flux_north = np.zeros_like(maskU) + flux_top = np.zeros_like(maskU) + + sqrttke = np.sqrt(np.maximum(0.0, tke[:, :, :, tau])) + + """ + integrate Tke equation on W grid with surface flux boundary condition + """ + dt_tke = dt_mom # use momentum time step to prevent spurious oscillations + + """ + vertical mixing and dissipation of TKE + """ + a_tri = np.empty(nz) + b_tri = np.empty(nz) + c_tri = np.empty(nz) + d_tri = np.empty(nz) + delta = np.empty(nz) + + ke = nz - 1 + for i in range(2, nx - 2): + for j in range(2, ny - 2): + ks = kbot[i, j] - 1 + if ks < 0: + continue + + for k in range(ks, ke): + delta[k] = ( + dt_tke + / dzt[k + 1] + * alpha_tke + * 0.5 + * (kappaM[i, j, k] + kappaM[i, j, k + 1]) + ) + delta[ke] = 0.0 + + for k in range(ks + 1, ke): + a_tri[k] = -delta[k - 1] / dzw[k] + a_tri[ks] = 0.0 + a_tri[ke] = -delta[ke - 1] / (0.5 * dzw[ke]) + + for k in range(ks + 1, ke): + b_tri[k] = ( + 1 + + delta[k] / dzw[k] + + delta[k - 1] / dzw[k] + + dt_tke * c_eps * sqrttke[i, j, k] / mxl[i, j, k] + ) + b_tri[ke] = ( + 1 + + delta[ke - 1] / (0.5 * dzw[ke]) + + dt_tke * c_eps * sqrttke[i, j, ke] / mxl[i, j, ke] + ) + b_tri[ks] = ( + 1 + + delta[ks] / dzw[ks] + + dt_tke * c_eps * sqrttke[i, j, ks] / mxl[i, j, ks] + ) + + for k in range(ks, ke): + c_tri[k] = -delta[k] / dzw[k] + c_tri[ke] = 0.0 + + d_tri[ks:] = tke[i, j, ks:, tau] + dt_tke * forc[i, j, ks:] + d_tri[ke] += dt_tke * forc_tke_surface[i, j] / (0.5 * dzw[ke]) + + tke[i, j, ks:, taup1] = solve_tridiag( + a_tri[ks:], b_tri[ks:], c_tri[ks:], d_tri[ks:] + ) + + """ + Add TKE if surface density flux drains TKE in uppermost box + """ + tke_surf_corr = np.zeros((nx, ny)) + for i in range(2, nx - 2): + for j in range(2, ny - 2): + if tke[i, j, -1, taup1] >= 0.0: + continue + tke_surf_corr[i, j] = -tke[i, j, -1, taup1] * (0.5 * dzw[-1]) / dt_tke + tke[i, j, -1, taup1] = 0.0 + + """ + add tendency due to lateral diffusion + """ + for i in range(nx - 1): + for j in range(ny): + flux_east[i, j, :] = ( + K_h_tke + * (tke[i + 1, j, :, tau] - tke[i, j, :, tau]) + / (cost[j] * dxu[i]) + * maskU[i, j, :] + ) + flux_east[-1, :, :] = 0.0 + + for j in range(ny - 1): + flux_north[:, j, :] = ( + K_h_tke + * (tke[:, j + 1, :, tau] - tke[:, j, :, tau]) + / dyu[j] + * maskV[:, j, :] + * cosu[j] + ) + flux_north[:, -1, :] = 0.0 + + for i in range(2, nx - 2): + for j in range(2, ny - 2): + tke[i, j, :, taup1] += ( + dt_tke + * maskW[i, j, :] + * ( + (flux_east[i, j, :] - flux_east[i - 1, j, :]) / (cost[j] * dxt[i]) + + (flux_north[i, j, :] - flux_north[i, j - 1, :]) + / (cost[j] * dyt[j]) + ) + ) + + """ + add tendency due to advection + """ + adv_flux_superbee_wgrid( + flux_east, + flux_north, + flux_top, + tke[:, :, :, tau], + u[..., tau], + v[..., tau], + w[..., tau], + maskW, + dxt, + dyt, + dzw, + cost, + cosu, + dt_tracer, + ) + + for i in range(2, nx - 2): + for j in range(2, ny - 2): + dtke[i, j, :, tau] = maskW[i, j, :] * ( + -(flux_east[i, j, :] - flux_east[i - 1, j, :]) / (cost[j] * dxt[i]) + - (flux_north[i, j, :] - flux_north[i, j - 1, :]) / (cost[j] * dyt[j]) + ) + dtke[:, :, 0, tau] += -flux_top[:, :, 0] / dzw[0] + dtke[:, :, 1:-1, tau] += -(flux_top[:, :, 1:-1] - flux_top[:, :, :-2]) / dzw[1:-1] + dtke[:, :, -1, tau] += -(flux_top[:, :, -1] - flux_top[:, :, -2]) / (0.5 * dzw[-1]) + + """ + Adam Bashforth time stepping + """ + tke[:, :, :, taup1] += dt_tracer * ( + (1.5 + AB_eps) * dtke[:, :, :, tau] - (0.5 + AB_eps) * dtke[:, :, :, taum1] + ) + + return tke, dtke, tke_surf_corr + + +def run(*inputs, device="cpu"): + outputs = integrate_tke(*inputs) + return outputs diff --git a/benchmarks/turbulent_kinetic_energy/tke_transonic_pythran.py b/benchmarks/turbulent_kinetic_energy/tke_transonic_pythran.py new file mode 100644 index 0000000..3923033 --- /dev/null +++ b/benchmarks/turbulent_kinetic_energy/tke_transonic_pythran.py @@ -0,0 +1,324 @@ +import numpy as np +# LAPACK is not supported by Pythran, so we borrow solve_tridiag from the Numba +# version (https://github.com/serge-sans-paille/pythran/issues/1654) +# from scipy.linalg import lapack +from transonic import jit + + +where = np.where + + +def solve_implicit(ks, a, b, c, d, b_edge=None, d_edge=None): + land_mask = (ks >= 0)[:, :, np.newaxis] + nz = a.shape[2] + z_index = np.arange(nz).reshape(1, 1, nz) + edge_mask = land_mask & ( + z_index == ks[:, :, np.newaxis] + ) + water_mask = land_mask & ( + z_index >= ks[:, :, np.newaxis] + ) + + a_tri = water_mask * a * np.logical_not(edge_mask) + b_tri = where(water_mask, b, 1.0) + if b_edge is not None: + b_tri = where(edge_mask, b_edge, b_tri) + c_tri = water_mask * c + d_tri = water_mask * d + if d_edge is not None: + d_tri = where(edge_mask, d_edge, d_tri) + + return solve_tridiag(a_tri, b_tri, c_tri, d_tri), water_mask + + +def solve_tridiag(a, b, c, d): + """ + Solves a tridiagonal matrix system with diagonals a, b, c and RHS vector d. + """ + assert a.shape == b.shape and a.shape == c.shape and a.shape == d.shape + + n = a.shape[0] + + for i in range(1, n): + w = a[i] / b[i - 1] + b[i] += -w * c[i - 1] + d[i] += -w * d[i - 1] + + out = np.empty_like(a) + out[-1] = d[-1] / b[-1] + + for i in range(n - 2, -1, -1): + out[i] = (d[i] - c[i] * out[i + 1]) / b[i] + + return out + + +def _calc_cr(rjp, rj, rjm, vel): + """ + Calculates cr value used in superbee advection scheme + """ + eps = 1e-20 # prevent division by 0 + return where(vel > 0.0, rjm, rjp) / where(np.abs(rj) < eps, eps, rj) + + +def pad_z_edges(arr): + shape = arr.shape + out = np.zeros((shape[0], shape[1], shape[2] + 2), arr.dtype) + out[:, :, 1:-1] = arr + return out + + +def limiter(cr): + return np.maximum(0.0, np.maximum(np.minimum(1.0, 2 * cr), np.minimum(2.0, cr))) + + +def _adv_superbee(vel, var, mask, dx, axis, cost, cosu, dt_tracer): + velfac = 1 + if axis == 0: + sm1, s, sp1, sp2 = ( + (slice(1 + n, -2 + n or None), slice(2, -2), slice(None)) + for n in range(-1, 3) + ) + dx = cost[np.newaxis, 2:-2, np.newaxis] * dx[1:-2, np.newaxis, np.newaxis] + elif axis == 1: + sm1, s, sp1, sp2 = ( + (slice(2, -2), slice(1 + n, -2 + n or None), slice(None)) + for n in range(-1, 3) + ) + dx = (cost * dx)[np.newaxis, 1:-2, np.newaxis] + velfac = cosu[np.newaxis, 1:-2, np.newaxis] + elif axis == 2: + vel, var, mask = (pad_z_edges(a) for a in (vel, var, mask)) + sm1, s, sp1, sp2 = ( + (slice(2, -2), slice(2, -2), slice(1 + n, -2 + n or None)) + for n in range(-1, 3) + ) + dx = dx[np.newaxis, np.newaxis, :-1] + else: + raise ValueError("axis must be 0, 1, or 2") + uCFL = np.abs(velfac * vel[s] * dt_tracer / dx) + rjp = (var[sp2] - var[sp1]) * mask[sp1] + rj = (var[sp1] - var[s]) * mask[s] + rjm = (var[s] - var[sm1]) * mask[sm1] + cr = limiter(_calc_cr(rjp, rj, rjm, vel[s])) + return ( + velfac * vel[s] * (var[sp1] + var[s]) * 0.5 + - np.abs(velfac * vel[s]) * ((1.0 - cr) + uCFL * cr) * rj * 0.5 + ) + + +def adv_flux_superbee_wgrid( + adv_fe, + adv_fn, + adv_ft, + var, + u_wgrid, + v_wgrid, + w_wgrid, + maskW, + dxt, + dyt, + dzw, + cost, + cosu, + dt_tracer, +): + """ + Calculates advection of a tracer defined on Wgrid + """ + maskUtr = np.zeros_like(maskW) + maskUtr[:-1, :, :] = maskW[1:, :, :] * maskW[:-1, :, :] + adv_fe[:, :, :] = 0.0 + adv_fe[1:-2, 2:-2, :] = _adv_superbee( + u_wgrid, var, maskUtr, dxt, 0, cost, cosu, dt_tracer + ) + + maskVtr = np.zeros_like(maskW) + maskVtr[:, :-1, :] = maskW[:, 1:, :] * maskW[:, :-1, :] + adv_fn[:, :, :] = 0.0 + adv_fn[2:-2, 1:-2, :] = _adv_superbee( + v_wgrid, var, maskVtr, dyt, 1, cost, cosu, dt_tracer + ) + + maskWtr = np.zeros_like(maskW) + maskWtr[:, :, :-1] = maskW[:, :, 1:] * maskW[:, :, :-1] + adv_ft[:, :, :] = 0.0 + adv_ft[2:-2, 2:-2, :-1] = _adv_superbee( + w_wgrid, var, maskWtr, dzw, 2, cost, cosu, dt_tracer + ) + + +@jit(native=True, xsimd=True) +def integrate_tke( + u, + v, + w, + maskU, + maskV, + maskW, + dxt, + dxu, + dyt, + dyu, + dzt, + dzw, + cost, + cosu, + kbot, + kappaM, + mxl, + forc, + forc_tke_surface, + tke, + dtke, +): + tau = 0 + taup1 = 1 + taum1 = 2 + + dt_tracer = 1 + dt_mom = 1 + AB_eps = 0.1 + alpha_tke = 1.0 + c_eps = 0.7 + K_h_tke = 2000.0 + + flux_east = np.zeros_like(maskU) + flux_north = np.zeros_like(maskU) + flux_top = np.zeros_like(maskU) + + sqrttke = np.sqrt(np.maximum(0.0, tke[:, :, :, tau])) + + """ + integrate Tke equation on W grid with surface flux boundary condition + """ + dt_tke = dt_mom # use momentum time step to prevent spurious oscillations + + """ + vertical mixing and dissipation of TKE + """ + ks = kbot[2:-2, 2:-2] - 1 + + a_tri = np.zeros_like(maskU[2:-2, 2:-2]) + b_tri = np.zeros_like(maskU[2:-2, 2:-2]) + c_tri = np.zeros_like(maskU[2:-2, 2:-2]) + d_tri = np.zeros_like(maskU[2:-2, 2:-2]) + delta = np.zeros_like(maskU[2:-2, 2:-2]) + + delta[:, :, :-1] = ( + dt_tke + / dzt[np.newaxis, np.newaxis, 1:] + * alpha_tke + * 0.5 + * (kappaM[2:-2, 2:-2, :-1] + kappaM[2:-2, 2:-2, 1:]) + ) + + a_tri[:, :, 1:-1] = -delta[:, :, :-2] / dzw[np.newaxis, np.newaxis, 1:-1] + a_tri[:, :, -1] = -delta[:, :, -2] / (0.5 * dzw[-1]) + + b_tri[:, :, 1:-1] = ( + 1 + + (delta[:, :, 1:-1] + delta[:, :, :-2]) / dzw[np.newaxis, np.newaxis, 1:-1] + + dt_tke * c_eps * sqrttke[2:-2, 2:-2, 1:-1] / mxl[2:-2, 2:-2, 1:-1] + ) + b_tri[:, :, -1] = ( + 1 + + delta[:, :, -2] / (0.5 * dzw[-1]) + + dt_tke * c_eps / mxl[2:-2, 2:-2, -1] * sqrttke[2:-2, 2:-2, -1] + ) + b_tri_edge = ( + 1 + + delta / dzw[np.newaxis, np.newaxis, :] + + dt_tke * c_eps / mxl[2:-2, 2:-2, :] * sqrttke[2:-2, 2:-2, :] + ) + + c_tri[:, :, :-1] = -delta[:, :, :-1] / dzw[np.newaxis, np.newaxis, :-1] + + d_tri[:, :, :] = tke[2:-2, 2:-2, :, tau] + dt_tke * forc[2:-2, 2:-2, :] + d_tri[:, :, -1] += dt_tke * forc_tke_surface[2:-2, 2:-2] / (0.5 * dzw[-1]) + + sol, water_mask = solve_implicit(ks, a_tri, b_tri, c_tri, d_tri, b_edge=b_tri_edge) + tke[2:-2, 2:-2, :, taup1] = where(water_mask, sol, tke[2:-2, 2:-2, :, taup1]) + + """ + Add TKE if surface density flux drains TKE in uppermost box + """ + tke_surf_corr = np.zeros(maskU.shape[:2]) + mask = tke[2:-2, 2:-2, -1, taup1] < 0.0 + tke_surf_corr[2:-2, 2:-2] = where( + mask, -tke[2:-2, 2:-2, -1, taup1] * 0.5 * dzw[-1] / dt_tke, 0.0 + ) + tke[2:-2, 2:-2, -1, taup1] = np.maximum(0.0, tke[2:-2, 2:-2, -1, taup1]) + + """ + add tendency due to lateral diffusion + """ + flux_east[:-1, :, :] = ( + K_h_tke + * (tke[1:, :, :, tau] - tke[:-1, :, :, tau]) + / (cost[np.newaxis, :, np.newaxis] * dxu[:-1, np.newaxis, np.newaxis]) + * maskU[:-1, :, :] + ) + flux_east[-1, :, :] = 0.0 + flux_north[:, :-1, :] = ( + K_h_tke + * (tke[:, 1:, :, tau] - tke[:, :-1, :, tau]) + / dyu[np.newaxis, :-1, np.newaxis] + * maskV[:, :-1, :] + * cosu[np.newaxis, :-1, np.newaxis] + ) + flux_north[:, -1, :] = 0.0 + tke[2:-2, 2:-2, :, taup1] += ( + dt_tke + * maskW[2:-2, 2:-2, :] + * ( + (flux_east[2:-2, 2:-2, :] - flux_east[1:-3, 2:-2, :]) + / (cost[np.newaxis, 2:-2, np.newaxis] * dxt[2:-2, np.newaxis, np.newaxis]) + + (flux_north[2:-2, 2:-2, :] - flux_north[2:-2, 1:-3, :]) + / (cost[np.newaxis, 2:-2, np.newaxis] * dyt[np.newaxis, 2:-2, np.newaxis]) + ) + ) + + """ + add tendency due to advection + """ + adv_flux_superbee_wgrid( + flux_east, + flux_north, + flux_top, + tke[:, :, :, tau], + u[:, :, :, tau], + v[:, :, :, tau], + w[:, :, :, tau], + maskW, + dxt, + dyt, + dzw, + cost, + cosu, + dt_tracer, + ) + + dtke[2:-2, 2:-2, :, tau] = maskW[2:-2, 2:-2, :] * ( + -(flux_east[2:-2, 2:-2, :] - flux_east[1:-3, 2:-2, :]) + / (cost[np.newaxis, 2:-2, np.newaxis] * dxt[2:-2, np.newaxis, np.newaxis]) + - (flux_north[2:-2, 2:-2, :] - flux_north[2:-2, 1:-3, :]) + / (cost[np.newaxis, 2:-2, np.newaxis] * dyt[np.newaxis, 2:-2, np.newaxis]) + ) + dtke[:, :, 0, tau] += -flux_top[:, :, 0] / dzw[0] + dtke[:, :, 1:-1, tau] += -(flux_top[:, :, 1:-1] - flux_top[:, :, :-2]) / dzw[1:-1] + dtke[:, :, -1, tau] += -(flux_top[:, :, -1] - flux_top[:, :, -2]) / (0.5 * dzw[-1]) + + """ + Adam Bashforth time stepping + """ + tke[:, :, :, taup1] += dt_tracer * ( + (1.5 + AB_eps) * dtke[:, :, :, tau] - (0.5 + AB_eps) * dtke[:, :, :, taum1] + ) + + return tke, dtke, tke_surf_corr + + +def run(*inputs, device="cpu"): + outputs = integrate_tke(*inputs) + return outputs diff --git a/environment-cpu.yml b/environment-cpu.yml index 50522e7..2f3878f 100644 --- a/environment-cpu.yml +++ b/environment-cpu.yml @@ -13,3 +13,7 @@ dependencies: - tensorflow>=2.0 - -f https://storage.googleapis.com/jax-releases/jax_releases.html - jax[cpu] + - transonic + - cython + - pythran + - pyperf diff --git a/utilities.py b/utilities.py index 7ebc64c..39dde11 100644 --- a/utilities.py +++ b/utilities.py @@ -94,7 +94,7 @@ def format_output(stats, benchmark_title, device="cpu"): header = stats.dtype.names col_widths = collections.defaultdict(lambda: 8) - col_widths.update(size=12, backend=10) + col_widths.update(size=12, backend=20) def format_col(col_name, value, is_time=False): col_width = col_widths[col_name]