From 297afe208a3cf8643b98f922ab2c96f859340533 Mon Sep 17 00:00:00 2001
From: pgorlani <paolo.gorlani@codeplay.com>
Date: Mon, 7 Mar 2022 09:53:33 +0000
Subject: [PATCH 1/4] [SYCL][libclc][CUDA] Add native math extension

This patch extends the native math definitions in
order to include builtins out of the current SYCL
specification.

In particular, this patch adds a tanh builtin for
floats/halfs and a exp2 builtin for halfs which
are mapped to instructions introduced for sm_75
and above.
---
 clang/include/clang/Basic/BuiltinsNVPTX.def   |  8 ++
 libclc/generic/include/clcmacro.h             | 44 +++++----
 libclc/generic/include/spirv/spirv_builtins.h | 43 +++++++++
 libclc/generic/libspirv/float16.cl            | 60 ++++++++++++
 libclc/ptx-nvidiacl/libspirv/SOURCES          |  1 +
 .../ptx-nvidiacl/libspirv/math/native_exp2.cl | 37 ++++++++
 .../ptx-nvidiacl/libspirv/math/native_tanh.cl | 41 ++++++++
 llvm/include/llvm/IR/IntrinsicsNVVM.td        | 11 +++
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td      | 11 +++
 .../sycl_ext_oneapi_native_math.asciidoc      | 93 +++++++++++++++++++
 sycl/include/CL/__spirv/spirv_ops.hpp         | 38 ++++++++
 sycl/include/CL/sycl/feature_test.hpp.in      |  1 +
 .../sycl/ext/oneapi/experimental/builtins.hpp | 45 +++++++++
 13 files changed, 416 insertions(+), 17 deletions(-)
 create mode 100644 libclc/ptx-nvidiacl/libspirv/math/native_tanh.cl
 create mode 100644 sycl/doc/extensions/experimental/sycl_ext_oneapi_native_math.asciidoc

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 449e4d1256944..cabfb4aa2090b 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -205,6 +205,8 @@ BUILTIN(__nvvm_saturate_d, "dd", "")
 BUILTIN(__nvvm_ex2_approx_ftz_f, "ff", "")
 BUILTIN(__nvvm_ex2_approx_f, "ff", "")
 BUILTIN(__nvvm_ex2_approx_d, "dd", "")
+TARGET_BUILTIN(__nvvm_ex2_approx_f16, "hh", "", AND(SM_75, PTX70))
+TARGET_BUILTIN(__nvvm_ex2_approx_f16x2, "V2hV2h", "", AND(SM_75, PTX70))
 
 BUILTIN(__nvvm_lg2_approx_ftz_f, "ff", "")
 BUILTIN(__nvvm_lg2_approx_f, "ff", "")
@@ -218,6 +220,12 @@ BUILTIN(__nvvm_sin_approx_f, "ff", "")
 BUILTIN(__nvvm_cos_approx_ftz_f, "ff", "")
 BUILTIN(__nvvm_cos_approx_f, "ff", "")
 
+// Tanh
+
+TARGET_BUILTIN(__nvvm_tanh_approx_f, "ff", "", AND(SM_75,PTX70))
+TARGET_BUILTIN(__nvvm_tanh_approx_f16, "hh", "", AND(SM_75, PTX70))
+TARGET_BUILTIN(__nvvm_tanh_approx_f16x2, "V2hV2h", "", AND(SM_75, PTX70))
+
 // Fma
 
 BUILTIN(__nvvm_fma_rn_ftz_f, "ffff", "")
diff --git a/libclc/generic/include/clcmacro.h b/libclc/generic/include/clcmacro.h
index d4167a8e4529e..addb461aa047d 100644
--- a/libclc/generic/include/clcmacro.h
+++ b/libclc/generic/include/clcmacro.h
@@ -9,11 +9,7 @@
 #ifndef __CLC_MACRO_H
 #define __CLC_MACRO_H
 
-#define _CLC_UNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE)          \
-  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) {                              \
-    return (RET_TYPE##2)(FUNCTION(x.x), FUNCTION(x.y));                        \
-  }                                                                            \
-                                                                               \
+#define _CLC_UNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE)    \
   DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x) {                              \
     return (RET_TYPE##3)(FUNCTION(x.x), FUNCTION(x.y), FUNCTION(x.z));         \
   }                                                                            \
@@ -30,12 +26,14 @@
     return (RET_TYPE##16)(FUNCTION(x.lo), FUNCTION(x.hi));                     \
   }
 
-#define _CLC_BINARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,         \
-                              ARG2_TYPE)                                       \
-  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y) {              \
-    return (RET_TYPE##2)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y));              \
+#define _CLC_UNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE)          \
+  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) {                              \
+    return (RET_TYPE##2)(FUNCTION(x.x), FUNCTION(x.y));                        \
   }                                                                            \
-                                                                               \
+  _CLC_UNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE)
+
+#define _CLC_BINARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,   \
+                                    ARG2_TYPE)                                 \
   DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y) {              \
     return (RET_TYPE##3)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y),               \
                          FUNCTION(x.z, y.z));                                  \
@@ -53,6 +51,14 @@
     return (RET_TYPE##16)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi));         \
   }
 
+#define _CLC_BINARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,         \
+                              ARG2_TYPE)                                       \
+  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y) {              \
+    return (RET_TYPE##2)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y));              \
+  }                                                                            \
+  _CLC_BINARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,         \
+                              ARG2_TYPE)
+
 #define _CLC_V_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,          \
                              ARG2_TYPE)                                        \
   DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE##2 y) {                 \
@@ -76,13 +82,8 @@
     return (RET_TYPE##16)(FUNCTION(x, y.lo), FUNCTION(x, y.hi));               \
   }
 
-#define _CLC_TERNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,        \
-                               ARG2_TYPE, ARG3_TYPE)                           \
-  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y,                \
-                                ARG3_TYPE##2 z) {                              \
-    return (RET_TYPE##2)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y));    \
-  }                                                                            \
-                                                                               \
+#define _CLC_TERNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,  \
+                                     ARG2_TYPE, ARG3_TYPE)                     \
   DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y,                \
                                 ARG3_TYPE##3 z) {                              \
     return (RET_TYPE##3)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y),     \
@@ -107,6 +108,15 @@
                           FUNCTION(x.hi, y.hi, z.hi));                         \
   }
 
+#define _CLC_TERNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,        \
+                               ARG2_TYPE, ARG3_TYPE)                           \
+  DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y,                \
+                                ARG3_TYPE##2 z) {                              \
+    return (RET_TYPE##2)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y));    \
+  }                                                                            \
+  _CLC_TERNARY_VECTORIZE_HAVE2(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,        \
+                               ARG2_TYPE, ARG3_TYPE)
+
 #define _CLC_V_S_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE,        \
                                ARG2_TYPE, ARG3_TYPE)                           \
   DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE y, ARG3_TYPE##2 z) {    \
diff --git a/libclc/generic/include/spirv/spirv_builtins.h b/libclc/generic/include/spirv/spirv_builtins.h
index adeb3a63460d6..ca15fdf3c6547 100644
--- a/libclc/generic/include/spirv/spirv_builtins.h
+++ b/libclc/generic/include/spirv/spirv_builtins.h
@@ -15776,6 +15776,21 @@ _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec8_fp32_t
 _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec16_fp32_t
     __spirv_ocl_native_exp2(__clc_vec16_fp32_t);
 
+#ifdef cl_khr_fp16
+_CLC_OVERLOAD
+_CLC_DECL _CLC_CONSTFN __clc_fp16_t __clc_native_exp2(__clc_fp16_t);
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec2_fp16_t
+    __clc_native_exp2(__clc_vec2_fp16_t);
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec3_fp16_t
+    __clc_native_exp2(__clc_vec3_fp16_t);
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec4_fp16_t
+    __clc_native_exp2(__clc_vec4_fp16_t);
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec8_fp16_t
+    __clc_native_exp2(__clc_vec8_fp16_t);
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec16_fp16_t
+    __clc_native_exp2(__clc_vec16_fp16_t);
+#endif
+
 _CLC_OVERLOAD
 _CLC_DECL _CLC_CONSTFN __clc_fp32_t __spirv_ocl_native_log(__clc_fp32_t);
 _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec2_fp32_t
@@ -19077,6 +19092,34 @@ _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec16_fp16_t
     __spirv_ocl_tanh(__clc_vec16_fp16_t);
 #endif
 
+_CLC_OVERLOAD
+_CLC_DECL _CLC_CONSTFN __clc_fp32_t __clc_native_tanh(__clc_fp32_t);
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec2_fp32_t
+    __clc_native_tanh(__clc_vec2_fp32_t);
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec3_fp32_t
+    __clc_native_tanh(__clc_vec3_fp32_t);
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec4_fp32_t
+    __clc_native_tanh(__clc_vec4_fp32_t);
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec8_fp32_t
+    __clc_native_tanh(__clc_vec8_fp32_t);
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec16_fp32_t
+    __clc_native_tanh(__clc_vec16_fp32_t);
+
+#ifdef cl_khr_fp16
+_CLC_OVERLOAD
+_CLC_DECL _CLC_CONSTFN __clc_fp16_t __clc_native_tanh(__clc_fp16_t);
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec2_fp16_t
+    __clc_native_tanh(__clc_vec2_fp16_t);
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec3_fp16_t
+    __clc_native_tanh(__clc_vec3_fp16_t);
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec4_fp16_t
+    __clc_native_tanh(__clc_vec4_fp16_t);
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec8_fp16_t
+    __clc_native_tanh(__clc_vec8_fp16_t);
+_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec16_fp16_t
+    __clc_native_tanh(__clc_vec16_fp16_t);
+#endif
+
 _CLC_OVERLOAD
 _CLC_DECL _CLC_CONSTFN __clc_fp32_t __spirv_ocl_tanpi(__clc_fp32_t);
 _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec2_fp32_t
diff --git a/libclc/generic/libspirv/float16.cl b/libclc/generic/libspirv/float16.cl
index b2cd14e8c63f4..28f5b65ac80e2 100644
--- a/libclc/generic/libspirv/float16.cl
+++ b/libclc/generic/libspirv/float16.cl
@@ -4344,6 +4344,36 @@ __spirv_ocl_exp2(__clc_vec16_float16_t args_0) {
   return __spirv_ocl_exp2(as_half16(args_0));
 }
 
+_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_fp16_t
+__clc_native_exp2(__clc_float16_t args_0) {
+  return __clc_native_exp2(as_half(args_0));
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec2_fp16_t
+__clc_native_exp2(__clc_vec2_float16_t args_0) {
+  return __clc_native_exp2(as_half2(args_0));
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec3_fp16_t
+__clc_native_exp2(__clc_vec3_float16_t args_0) {
+  return __clc_native_exp2(as_half3(args_0));
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec4_fp16_t
+__clc_native_exp2(__clc_vec4_float16_t args_0) {
+  return __clc_native_exp2(as_half4(args_0));
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec8_fp16_t
+__clc_native_exp2(__clc_vec8_float16_t args_0) {
+  return __clc_native_exp2(as_half8(args_0));
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec16_fp16_t
+__clc_native_exp2(__clc_vec16_float16_t args_0) {
+  return __clc_native_exp2(as_half16(args_0));
+}
+
 _CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_fp16_t
 __spirv_ocl_expm1(__clc_float16_t args_0) {
   return __spirv_ocl_expm1(as_half(args_0));
@@ -6613,6 +6643,36 @@ __spirv_ocl_tanh(__clc_vec16_float16_t args_0) {
   return __spirv_ocl_tanh(as_half16(args_0));
 }
 
+_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_fp16_t
+__clc_native_tanh(__clc_float16_t args_0) {
+  return __clc_native_tanh(as_half(args_0));
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec2_fp16_t
+__clc_native_tanh(__clc_vec2_float16_t args_0) {
+  return __clc_native_tanh(as_half2(args_0));
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec3_fp16_t
+__clc_native_tanh(__clc_vec3_float16_t args_0) {
+  return __clc_native_tanh(as_half3(args_0));
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec4_fp16_t
+__clc_native_tanh(__clc_vec4_float16_t args_0) {
+  return __clc_native_tanh(as_half4(args_0));
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec8_fp16_t
+__clc_native_tanh(__clc_vec8_float16_t args_0) {
+  return __clc_native_tanh(as_half8(args_0));
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec16_fp16_t
+__clc_native_tanh(__clc_vec16_float16_t args_0) {
+  return __clc_native_tanh(as_half16(args_0));
+}
+
 _CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_fp16_t
 __spirv_ocl_tanpi(__clc_float16_t args_0) {
   return __spirv_ocl_tanpi(as_half(args_0));
diff --git a/libclc/ptx-nvidiacl/libspirv/SOURCES b/libclc/ptx-nvidiacl/libspirv/SOURCES
index 5ec1dea1afc30..9f105b1556ed9 100644
--- a/libclc/ptx-nvidiacl/libspirv/SOURCES
+++ b/libclc/ptx-nvidiacl/libspirv/SOURCES
@@ -53,6 +53,7 @@ math/native_rsqrt.cl
 math/native_sin.cl
 math/native_sqrt.cl
 math/native_tan.cl
+math/native_tanh.cl
 math/nextafter.cl
 math/pow.cl
 math/remainder.cl
diff --git a/libclc/ptx-nvidiacl/libspirv/math/native_exp2.cl b/libclc/ptx-nvidiacl/libspirv/math/native_exp2.cl
index 93c12c0aeb067..bcd32a1718102 100644
--- a/libclc/ptx-nvidiacl/libspirv/math/native_exp2.cl
+++ b/libclc/ptx-nvidiacl/libspirv/math/native_exp2.cl
@@ -14,4 +14,41 @@
 #define __CLC_FUNCTION __spirv_ocl_native_exp2
 #define __CLC_BUILTIN __nv_exp2
 #define __CLC_BUILTIN_F __CLC_XCONCAT(__CLC_BUILTIN, f)
+
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+int __clc_nvvm_reflect_arch();
+
+_CLC_DEF _CLC_OVERLOAD half __clc_native_exp2(half x) {
+  if (__clc_nvvm_reflect_arch() >= 750) {
+    return __nvvm_ex2_approx_f16(x);
+  } else {
+    float upcast = x;
+    return __spirv_ocl_native_exp2(upcast);
+  }
+}
+
+_CLC_DEF _CLC_OVERLOAD half2 __clc_native_exp2(half2 x) {
+  if (__clc_nvvm_reflect_arch() >= 750) {
+    return __nvvm_ex2_approx_f16x2(x);
+  } else {
+    float upcast0 = x[0];
+    float upcast1 = x[1];
+    half2 res;
+    res.s0 = __spirv_ocl_native_exp2(upcast0);
+    res.s1 = __spirv_ocl_native_exp2(upcast1);
+    return res;
+  }
+}
+
+_CLC_UNARY_VECTORIZE_HAVE2(_CLC_OVERLOAD _CLC_DEF, half,
+                           __clc_native_exp2, half)
+
+#endif // cl_khr_fp16
+
+// Undef halfs before uncluding unary builtins, as they are handled above.
+#ifdef cl_khr_fp16
+#undef cl_khr_fp16
+#endif // cl_khr_fp16
 #include <math/unary_builtin.inc>
diff --git a/libclc/ptx-nvidiacl/libspirv/math/native_tanh.cl b/libclc/ptx-nvidiacl/libspirv/math/native_tanh.cl
new file mode 100644
index 0000000000000..3216059c5c7ce
--- /dev/null
+++ b/libclc/ptx-nvidiacl/libspirv/math/native_tanh.cl
@@ -0,0 +1,41 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include <spirv/spirv.h>
+
+#include "../../include/libdevice.h"
+#include <clcmacro.h>
+
+extern int __clc_nvvm_reflect_arch();
+
+#define __USE_TANH_APPROX (__clc_nvvm_reflect_arch() >= 750)
+
+_CLC_DEF _CLC_OVERLOAD float __clc_native_tanh(float x) {
+  return (__USE_TANH_APPROX) ? __nvvm_tanh_approx_f(x) : __nv_tanhf(x);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __clc_native_tanh, float)
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEF _CLC_OVERLOAD half __clc_native_tanh(half x) {
+  return (__USE_TANH_APPROX) ? __nvvm_tanh_approx_f16(x) : __nv_tanhf(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD half2 __clc_native_tanh(half2 x) {
+  return (__USE_TANH_APPROX) ? __nvvm_tanh_approx_f16x2(x)
+                             : (half2)(__nv_tanhf(x.x), __nv_tanhf(x.y));
+}
+
+_CLC_UNARY_VECTORIZE_HAVE2(_CLC_OVERLOAD _CLC_DEF, half, __clc_native_tanh, half)
+
+#endif
+
+#undef __USE_TANH_APPROX
+
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index f026033782184..3321534a3218c 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -854,6 +854,17 @@ let TargetPrefix = "nvvm" in {
   def int_nvvm_cos_approx_f : GCCBuiltin<"__nvvm_cos_approx_f">,
       DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
 
+//
+// Tanh
+//
+
+   def int_nvvm_tanh_approx_f : GCCBuiltin<"__nvvm_tanh_approx_f">,
+        DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+   def int_nvvm_tanh_approx_f16 : GCCBuiltin<"__nvvm_tanh_approx_f16">,
+       DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty], [IntrNoMem]>;
+   def int_nvvm_tanh_approx_f16x2 : GCCBuiltin<"__nvvm_tanh_approx_f16x2">,
+       DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty], [IntrNoMem]>;
+
 //
 // Fma
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index c980f6ed4bdc2..377dce99578cc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -933,6 +933,17 @@ def INT_NVVM_COS_APPROX_FTZ_F : F_MATH_1<"cos.approx.ftz.f32 \t$dst, $src0;",
 def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32 \t$dst, $src0;",
   Float32Regs, Float32Regs, int_nvvm_cos_approx_f>;
 
+//
+// Tanh
+//
+
+ def INT_NVVM_TANH_APPROX_F : F_MATH_1<"tanh.approx.f32 \t$dst, $src0;",
+    Float32Regs, Float32Regs, int_nvvm_tanh_approx_f>;
+ def INT_NVVM_TANH_APPROX_F16 : F_MATH_1<"tanh.approx.f16 \t$dst, $src0;",
+   Float16Regs, Float16Regs, int_nvvm_tanh_approx_f16>;
+ def INT_NVVM_TANH_APPROX_F16X2 : F_MATH_1<"tanh.approx.f16x2 \t$dst, $src0;",
+   Float16x2Regs, Float16x2Regs, int_nvvm_tanh_approx_f16x2>;
+
 //
 // Fma
 //
diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_native_math.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_native_math.asciidoc
new file mode 100644
index 0000000000000..adc5ab2d2e87e
--- /dev/null
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_native_math.asciidoc
@@ -0,0 +1,93 @@
+# Native math functions extension for DPC++: SYCL_ONEAPI_NATIVE_MATH
+:source-highlighter: coderay
+:coderay-linenums-mode: table
+:dpcpp: pass:[DPC++]
+
+// This section needs to be after the document title.
+:doctype: book
+:toc2:
+:toc: left
+:encoding: utf-8
+:lang: en
+
+:blank: pass:[ +]
+
+// Set the default source code type in this document to C++,
+// for syntax highlighting purposes.  This is needed because
+// docbook uses c++ and html5 uses cpp.
+:language: {basebackend@docbook:c++:cpp}
+
+
+== Notice
+
+Copyright (c) 2021-2021 Intel Corporation.  All rights reserved.
+
+IMPORTANT: This specification is a draft.
+
+NOTE: The APIs described in this specification are experimental. Future versions of this extension may change these APIs in ways that are incompatible with the versions described here.
+
+NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are
+trademarks of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc.
+used by permission by Khronos.
+
+This extension is written against the SYCL 2020 revision 4 specification.  All
+references below to the "core SYCL specification" or to section numbers in the
+SYCL specification refer to that revision.
+
+## Contributors
+
+* Jack Kirk
+* Paolo Gorlani
+
+## Introduction
+
+This document proposes extending the `sycl::native::exp2` function (that supports `genfloatf` types) to support `genfloath` types in the `sycl::ext::oneapi::experimental` namespace. It also proposes introducing a new native `tanh` function supporting `genfloath` and `genfloatf` types in the `sycl::ext::oneapi::experimental` namespace.
+
+NOTE: This document does not propose `genfloatd` support for either `sycl::ext::oneapi::experimental::exp2` or `sycl::ext::oneapi::experimental::tanh`. 
+
+## Feature test macro
+
+This extension provides a feature-test macro as described in the core SYCL
+specification section 6.3.3 "Feature test macros". Therefore, an implementation
+supporting this extension must predefine the macro
+SYCL_EXT_ONEAPI_NATIVE_MATH to one of the values defined in the table
+below. Applications can test for the existence of this macro to determine if the
+implementation supports this feature, or applications can test the macro’s value
+to determine which of the extension’s APIs the implementation supports.
+
+[%header,cols="1,5"]
+|===
+|Value |Description
+|1     |Initial extension implementation.
+|===
+
+## Motivation
+
+The CUDA backend has introduced fast math "approx" versions of the exp2 and tanh functions for `half` and `float` types. It is considered likely that other backends will eventually introduce similar functionality, motivated particularly by deep learning use cases of these functions. We propose that the appropriate place to call such functionality in SYCL applications would be from the newly proposed native functions.
+
+## New function declarations
+
+```c++
+namespace sycl::ext::oneapi::experimental {
+
+// Available only when "T" is one of the genfloath types.
+template <typename T>
+T exp2(T x);
+
+// Available only when "T" is one of the genfloatf or genfloath types.
+template <typename T>
+T tanh(T x);
+
+} // namespace sycl::ext::oneapi::experimental
+```
+
+## Issues for future discussion
+
+
+## Revision History
+
+[frame="none",options="header"]
+|======================
+|Rev |Date       |Author        |Changes
+|1   |2022-02-23 |Jack Kirk and ... |Initial working draft.
+|======================
diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp
index 4878cc4dd5db8..e7b660f9d29e7 100644
--- a/sycl/include/CL/__spirv/spirv_ops.hpp
+++ b/sycl/include/CL/__spirv/spirv_ops.hpp
@@ -755,6 +755,44 @@ __spirv_ocl_printf(const __attribute__((opencl_constant)) char *Format, ...);
 extern SYCL_EXTERNAL int __spirv_ocl_printf(const char *Format, ...);
 #endif
 
+// Native builtin extension
+
+extern SYCL_EXTERNAL float __clc_native_tanh(float);
+extern SYCL_EXTERNAL __ocl_vec_t<float, 2>
+    __clc_native_tanh(__ocl_vec_t<float, 2>);
+extern SYCL_EXTERNAL __ocl_vec_t<float, 3>
+    __clc_native_tanh(__ocl_vec_t<float, 3>);
+extern SYCL_EXTERNAL __ocl_vec_t<float, 4>
+    __clc_native_tanh(__ocl_vec_t<float, 4>);
+extern SYCL_EXTERNAL __ocl_vec_t<float, 8>
+    __clc_native_tanh(__ocl_vec_t<float, 8>);
+extern SYCL_EXTERNAL __ocl_vec_t<float, 16>
+    __clc_native_tanh(__ocl_vec_t<float, 16>);
+
+extern SYCL_EXTERNAL _Float16 __clc_native_tanh(_Float16);
+extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 2>
+    __clc_native_tanh(__ocl_vec_t<_Float16, 2>);
+extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 3>
+    __clc_native_tanh(__ocl_vec_t<_Float16, 3>);
+extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 4>
+    __clc_native_tanh(__ocl_vec_t<_Float16, 4>);
+extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 8>
+    __clc_native_tanh(__ocl_vec_t<_Float16, 8>);
+extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 16>
+    __clc_native_tanh(__ocl_vec_t<_Float16, 16>);
+
+extern SYCL_EXTERNAL _Float16 __clc_native_exp2(_Float16);
+extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 2>
+    __clc_native_exp2(__ocl_vec_t<_Float16, 2>);
+extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 3>
+    __clc_native_exp2(__ocl_vec_t<_Float16, 3>);
+extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 4>
+    __clc_native_exp2(__ocl_vec_t<_Float16, 4>);
+extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 8>
+    __clc_native_exp2(__ocl_vec_t<_Float16, 8>);
+extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 16>
+    __clc_native_exp2(__ocl_vec_t<_Float16, 16>);
+
 #else // if !__SYCL_DEVICE_ONLY__
 
 template <typename dataT>
diff --git a/sycl/include/CL/sycl/feature_test.hpp.in b/sycl/include/CL/sycl/feature_test.hpp.in
index a7f0ca071ee46..75986f5b3f9ff 100644
--- a/sycl/include/CL/sycl/feature_test.hpp.in
+++ b/sycl/include/CL/sycl/feature_test.hpp.in
@@ -54,6 +54,7 @@ namespace sycl {
 #define SYCL_EXT_ONEAPI_SRGB 1
 #define SYCL_EXT_ONEAPI_SUB_GROUP 1
 #define SYCL_EXT_ONEAPI_PROPERTIES 1
+#define SYCL_EXT_ONEAPI_NATIVE_MATH 1
 #define SYCL_EXT_INTEL_BF16_CONVERSION 1
 #define SYCL_EXT_INTEL_DATAFLOW_PIPES 1
 #ifdef __clang__
diff --git a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
index e32e1c70a5a97..c8fa033d8c79e 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
@@ -8,8 +8,17 @@
 
 #pragma once
 
+#include <CL/sycl/builtins.hpp>
+#include <CL/sycl/detail/builtins.hpp>
+#include <CL/sycl/detail/generic_type_lists.hpp>
+#include <CL/sycl/detail/generic_type_traits.hpp>
+#include <CL/sycl/detail/type_traits.hpp>
+
 #include <CL/__spirv/spirv_ops.hpp>
 
+// TODO Decide whether to mark functions with this attribute.
+#define __NOEXC /*noexcept*/
+
 #ifdef __SYCL_DEVICE_ONLY__
 #define __SYCL_CONSTANT_AS __attribute__((opencl_constant))
 #else
@@ -72,6 +81,42 @@ int printf(const FormatT *__format, Args... args) {
 #endif // defined(__SYCL_DEVICE_ONLY__) && defined(__SPIR__)
 }
 
+namespace native {
+
+// genfloatfh tanh (genfloatfh x)
+template <typename T>
+inline __SYCL_ALWAYS_INLINE
+    sycl::detail::enable_if_t<sycl::detail::is_genfloatf<T>::value ||
+                                  sycl::detail::is_genfloath<T>::value,
+                              T>
+    tanh(T x) __NOEXC {
+#if defined(__NVPTX__)
+  using _ocl_T = cl::sycl::detail::ConvertToOpenCLType_t<T>;
+  _ocl_T arg1 = cl::sycl::detail::convertDataToType<T, _ocl_T>(x);
+  return cl::sycl::detail::convertDataToType<_ocl_T, T>(
+      __clc_native_tanh(arg1));
+#else
+  return __sycl_std::__invoke_tanh<T>(x);
+#endif
+}
+
+// genfloath exp2 (genfloath x)
+template <typename T>
+inline __SYCL_ALWAYS_INLINE
+    sycl::detail::enable_if_t<sycl::detail::is_genfloath<T>::value, T>
+    exp2(T x) __NOEXC {
+#if defined(__NVPTX__)
+  using _ocl_T = cl::sycl::detail::ConvertToOpenCLType_t<T>;
+  _ocl_T arg1 = cl::sycl::detail::convertDataToType<T, _ocl_T>(x);
+  return cl::sycl::detail::convertDataToType<_ocl_T, T>(
+      __clc_native_exp2(arg1));
+#else
+  return __sycl_std::__invoke_exp2<T>(x);
+#endif
+}
+
+} // namespace native
+
 } // namespace experimental
 } // namespace oneapi
 } // namespace ext

From 3e907ca59ce42bad61a66268eeb8ab4822632dee Mon Sep 17 00:00:00 2001
From: pgorlani <paolo.gorlani@codeplay.com>
Date: Mon, 7 Mar 2022 14:43:15 +0000
Subject: [PATCH 2/4] update extension template addressing reviewer comments

---
 .../sycl_ext_oneapi_native_math.asciidoc      | 117 ++++++++++--------
 1 file changed, 68 insertions(+), 49 deletions(-)

diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_native_math.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_native_math.asciidoc
index adc5ab2d2e87e..239c111516e25 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_native_math.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_native_math.asciidoc
@@ -1,7 +1,7 @@
-# Native math functions extension for DPC++: SYCL_ONEAPI_NATIVE_MATH
+= sycl_ext_oneapi_native_math
+
 :source-highlighter: coderay
 :coderay-linenums-mode: table
-:dpcpp: pass:[DPC++]
 
 // This section needs to be after the document title.
 :doctype: book
@@ -9,8 +9,7 @@
 :toc: left
 :encoding: utf-8
 :lang: en
-
-:blank: pass:[ +]
+:dpcpp: pass:[DPC++]
 
 // Set the default source code type in this document to C++,
 // for syntax highlighting purposes.  This is needed because
@@ -20,74 +19,94 @@
 
 == Notice
 
-Copyright (c) 2021-2021 Intel Corporation.  All rights reserved.
+[%hardbreaks]
+Copyright (C) 2022-2022 Intel Corporation.  All rights reserved.
+
+Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks
+of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc. used by
+permission by Khronos.
+
+
+== Contact
 
-IMPORTANT: This specification is a draft.
+To report problems with this extension, please open a new issue at:
 
-NOTE: The APIs described in this specification are experimental. Future versions of this extension may change these APIs in ways that are incompatible with the versions described here.
+https://github.com/intel/llvm/issues
 
-NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are
-trademarks of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc.
-used by permission by Khronos.
+
+== Dependencies
 
 This extension is written against the SYCL 2020 revision 4 specification.  All
 references below to the "core SYCL specification" or to section numbers in the
 SYCL specification refer to that revision.
 
-## Contributors
+== Status
 
-* Jack Kirk
-* Paolo Gorlani
+This is an experimental extension specification, intended to provide early
+access to features and gather community feedback.  Interfaces defined in this
+specification are implemented in {dpcpp}, but they are not finalized and may
+change incompatibly in future versions of {dpcpp} without prior notice.
+*Shipping software products should not rely on APIs defined in this
+specification.*
 
-## Introduction
+== Overview
 
-This document proposes extending the `sycl::native::exp2` function (that supports `genfloatf` types) to support `genfloath` types in the `sycl::ext::oneapi::experimental` namespace. It also proposes introducing a new native `tanh` function supporting `genfloath` and `genfloatf` types in the `sycl::ext::oneapi::experimental` namespace.
+The CUDA backend has introduced fast math "approx" versions of the `exp2` and
+`tanh` functions for `half` and `float` types. It is considered likely that
+other backends will eventually introduce similar functionality, motivated
+particularly by deep learning use cases of these functions. We propose that the
+appropriate place to call such functionality in SYCL applications would be from
+the newly proposed native functions.
 
-NOTE: This document does not propose `genfloatd` support for either `sycl::ext::oneapi::experimental::exp2` or `sycl::ext::oneapi::experimental::tanh`. 
+== Specification
 
-## Feature test macro
+=== Feature test macro
 
 This extension provides a feature-test macro as described in the core SYCL
-specification section 6.3.3 "Feature test macros". Therefore, an implementation
-supporting this extension must predefine the macro
-SYCL_EXT_ONEAPI_NATIVE_MATH to one of the values defined in the table
-below. Applications can test for the existence of this macro to determine if the
-implementation supports this feature, or applications can test the macro’s value
-to determine which of the extension’s APIs the implementation supports.
+specification.  An implementation supporting this extension must predefine the
+macro `SYCL_EXT_ONEAPI_NATIVE_MATH` to one of the values defined in the table
+below.  Applications can test for the existence of this macro to determine if
+the implementation supports this feature, or applications can test the macro's
+value to determine which of the extension's features the implementation
+supports.
 
 [%header,cols="1,5"]
 |===
-|Value |Description
-|1     |Initial extension implementation.
-|===
-
-## Motivation
+|Value
+|Description
 
-The CUDA backend has introduced fast math "approx" versions of the exp2 and tanh functions for `half` and `float` types. It is considered likely that other backends will eventually introduce similar functionality, motivated particularly by deep learning use cases of these functions. We propose that the appropriate place to call such functionality in SYCL applications would be from the newly proposed native functions.
-
-## New function declarations
-
-```c++
-namespace sycl::ext::oneapi::experimental {
+|1
+|The APIs of this experimental extension are not versioned, so the
+ feature-test macro always has this value.
+|===
 
-// Available only when "T" is one of the genfloath types.
-template <typename T>
-T exp2(T x);
+=== New member function declarations
 
-// Available only when "T" is one of the genfloatf or genfloath types.
-template <typename T>
-T tanh(T x);
+This extension allows the `sycl::native::exp2` function (that originally
+supports `genfloatf` types) to support `genfloath` types in the
+`sycl::ext::oneapi::experimental::native` namespace. It also introduces a new
+native `tanh` function supporting `genfloath` and `genfloatf` types in the
+`sycl::ext::oneapi::experimental::native` namespace.
 
-} // namespace sycl::ext::oneapi::experimental
-```
+NOTE: This document does not propose `genfloatd` support for either
+`sycl::ext::oneapi::experimental::native::exp2` or
+`sycl::ext::oneapi::experimental::native::tanh`. 
 
-## Issues for future discussion
 
+> This extension adds the following new native builtin functions to SYCL:
+>
+> ```
+> namespace sycl::ext::oneapi::experimental::native {
+> 
+> // Available only when "T" is one of the genfloath types.
+> template <typename T>
+> T exp2(T x);
+> 
+> // Available only when "T" is one of the genfloatf or genfloath types.
+> template <typename T>
+> T tanh(T x);
+> 
+> } // namespace sycl::ext::oneapi::experimental::native
+> ```
 
-## Revision History
 
-[frame="none",options="header"]
-|======================
-|Rev |Date       |Author        |Changes
-|1   |2022-02-23 |Jack Kirk and ... |Initial working draft.
-|======================

From 6daff55636473ee14f06999a09ba7c0f2a2f0392 Mon Sep 17 00:00:00 2001
From: pgorlani <92453485+pgorlani@users.noreply.github.com>
Date: Mon, 7 Mar 2022 15:19:42 +0000
Subject: [PATCH 3/4] Change subsection name

---
 .../experimental/sycl_ext_oneapi_native_math.asciidoc           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_native_math.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_native_math.asciidoc
index 239c111516e25..9b8daaf0ab6d6 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_native_math.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_native_math.asciidoc
@@ -80,7 +80,7 @@ supports.
  feature-test macro always has this value.
 |===
 
-=== New member function declarations
+=== New functions
 
 This extension allows the `sycl::native::exp2` function (that originally
 supports `genfloatf` types) to support `genfloath` types in the

From 07aa4e52aa087cb27d6aeba4a4dd789100a51b94 Mon Sep 17 00:00:00 2001
From: pgorlani <paolo.gorlani@codeplay.com>
Date: Wed, 9 Mar 2022 03:32:22 -0800
Subject: [PATCH 4/4] Remove return after else

---
 .../ptx-nvidiacl/libspirv/math/native_exp2.cl | 29 +++++++------------
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/libclc/ptx-nvidiacl/libspirv/math/native_exp2.cl b/libclc/ptx-nvidiacl/libspirv/math/native_exp2.cl
index bcd32a1718102..8c06a7ed9ea9c 100644
--- a/libclc/ptx-nvidiacl/libspirv/math/native_exp2.cl
+++ b/libclc/ptx-nvidiacl/libspirv/math/native_exp2.cl
@@ -19,31 +19,24 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 int __clc_nvvm_reflect_arch();
+#define __USE_HALF_EXP2_APPROX (__clc_nvvm_reflect_arch() >= 750)
 
 _CLC_DEF _CLC_OVERLOAD half __clc_native_exp2(half x) {
-  if (__clc_nvvm_reflect_arch() >= 750) {
-    return __nvvm_ex2_approx_f16(x);
-  } else {
-    float upcast = x;
-    return __spirv_ocl_native_exp2(upcast);
-  }
+  return (__USE_HALF_EXP2_APPROX) ? __nvvm_ex2_approx_f16(x)
+                                  : __spirv_ocl_native_exp2((float)x);
 }
 
 _CLC_DEF _CLC_OVERLOAD half2 __clc_native_exp2(half2 x) {
-  if (__clc_nvvm_reflect_arch() >= 750) {
-    return __nvvm_ex2_approx_f16x2(x);
-  } else {
-    float upcast0 = x[0];
-    float upcast1 = x[1];
-    half2 res;
-    res.s0 = __spirv_ocl_native_exp2(upcast0);
-    res.s1 = __spirv_ocl_native_exp2(upcast1);
-    return res;
-  }
+  return (__USE_HALF_EXP2_APPROX)
+             ? __nvvm_ex2_approx_f16x2(x)
+             : (half2)(__spirv_ocl_native_exp2((float)x.x),
+                       __spirv_ocl_native_exp2((float)x.y));
 }
 
-_CLC_UNARY_VECTORIZE_HAVE2(_CLC_OVERLOAD _CLC_DEF, half,
-                           __clc_native_exp2, half)
+_CLC_UNARY_VECTORIZE_HAVE2(_CLC_OVERLOAD _CLC_DEF, half, __clc_native_exp2,
+                           half)
+
+#undef __USE_HALF_EXP2_APPROX
 
 #endif // cl_khr_fp16