
Commit f3ff788

Array4: __cuda_array_interface__ v2
Start implementing the `__cuda_array_interface__` for zero-copy data exchange on Nvidia CUDA GPUs.
1 parent b8eae7f commit f3ff788
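
For orientation, the protocol is a dict-valued attribute that CUDA-aware Python libraries read in order to wrap device memory without copying. A minimal sketch (not from this commit; the pointer and shape values are illustrative) of what version 2 of the dict looks like for a C-ordered, double-precision array laid out as (ncomp, nz, ny, nx):

# Illustrative only: what a consumer sees when reading obj.__cuda_array_interface__
# for a C-ordered float64 array of shape (ncomp, nz, ny, nx) = (1, 4, 3, 2).
cai = {
    "shape": (1, 4, 3, 2),
    "typestr": "<f8",                 # little-endian float64
    "data": (0x7F0000000000, False),  # (device pointer, read_only); placeholder pointer
    "strides": (192, 48, 16, 8),      # in bytes; None would mean C-contiguous
    "version": 2,
}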

File tree: 3 files changed, +214 −71 lines

src/Base/Array4.cpp

Lines changed: 81 additions & 38 deletions
@@ -19,6 +19,61 @@ namespace py = pybind11;
 using namespace amrex;
 
 
+namespace
+{
+    /** CPU: __array_interface__ v3
+     *
+     * https://numpy.org/doc/stable/reference/arrays.interface.html
+     */
+    template<typename T>
+    py::dict
+    array_interface(Array4<T> const & a4)
+    {
+        auto d = py::dict();
+        auto const len = length(a4);
+        // F->C index conversion here
+        // p[(i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride+n*nstride];
+        // Buffer dimensions: zero-size shall not skip dimension
+        auto shape = py::make_tuple(
+            a4.ncomp,
+            len.z <= 0 ? 1 : len.z,
+            len.y <= 0 ? 1 : len.y,
+            len.x <= 0 ? 1 : len.x  // fastest varying index
+        );
+        // buffer protocol strides are in bytes, AMReX strides are elements
+        auto const strides = py::make_tuple(
+            sizeof(T) * a4.nstride,
+            sizeof(T) * a4.kstride,
+            sizeof(T) * a4.jstride,
+            sizeof(T)  // fastest varying index
+        );
+        bool const read_only = false;
+        d["data"] = py::make_tuple(std::intptr_t(a4.dataPtr()), read_only);
+        // note: if we want to keep the same global indexing with non-zero
+        //       box small_end as in AMReX, then we can explore playing with
+        //       this offset as well
+        //d["offset"] = 0;         // default
+        //d["mask"] = py::none();  // default
+
+        d["shape"] = shape;
+        // we could also set this after checking the strides are C-style contiguous:
+        //if (is_contiguous<T>(shape, strides))
+        //    d["strides"] = py::none();  // C-style contiguous
+        //else
+        d["strides"] = strides;
+
+        // type description
+        // for more complicated types, e.g., tuples/structs
+        //d["descr"] = ...;
+        // we currently only need this
+        d["typestr"] = py::format_descriptor<T>::format();
+
+        d["version"] = 3;
+        return d;
+    }
+}
+
+
 template< typename T >
 void make_Array4(py::module &m, std::string typestr)
 {
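
On the CPU side, the dict built by this helper is what NumPy reads through `__array_interface__` to create a zero-copy view. A rough sketch of the intended usage, assuming the pybind11 class is exposed as `amrex.Array4_double` (as in the tests below) and `a4` is such an instance with one component over a 2x3x4 box:

# Hedged sketch: NumPy consumes __array_interface__ v3 to build a zero-copy view.
# Assumes `a4` is an amrex.Array4_double over a (2, 3, 4) box with ncomp == 1.
import numpy as np

view = np.array(a4, copy=False)    # no host copy; wraps the Array4 data pointer
assert view.shape == (1, 4, 3, 2)  # (ncomp, nz, ny, nx), x fastest varying
assert view.dtype == np.float64    # from "typestr"
view[0, 0, 0, 0] = 42.0            # writes through to the AMReX data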
@@ -85,56 +140,44 @@ void make_Array4(py::module &m, std::string typestr)
             return a4;
         }))
 
+
+        // CPU: __array_interface__ v3
+        // https://numpy.org/doc/stable/reference/arrays.interface.html
         .def_property_readonly("__array_interface__", [](Array4<T> const & a4) {
-            auto d = py::dict();
-            auto const len = length(a4);
-            // F->C index conversion here
-            // p[(i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride+n*nstride];
-            // Buffer dimensions: zero-size shall not skip dimension
-            auto shape = py::make_tuple(
-                a4.ncomp,
-                len.z <= 0 ? 1 : len.z,
-                len.y <= 0 ? 1 : len.y,
-                len.x <= 0 ? 1 : len.x  // fastest varying index
-            );
-            // buffer protocol strides are in bytes, AMReX strides are elements
-            auto const strides = py::make_tuple(
-                sizeof(T) * a4.nstride,
-                sizeof(T) * a4.kstride,
-                sizeof(T) * a4.jstride,
-                sizeof(T)  // fastest varying index
-            );
-            bool const read_only = false;
-            d["data"] = py::make_tuple(std::intptr_t(a4.dataPtr()), read_only);
-            // note: if we want to keep the same global indexing with non-zero
-            //       box small_end as in AMReX, then we can explore playing with
-            //       this offset as well
-            //d["offset"] = 0;         // default
-            //d["mask"] = py::none();  // default
-
-            d["shape"] = shape;
-            // we could also set this after checking the strides are C-style contiguous:
-            //if (is_contiguous<T>(shape, strides))
-            //    d["strides"] = py::none();  // C-style contiguous
-            //else
-            d["strides"] = strides;
-
-            d["typestr"] = py::format_descriptor<T>::format();
-            d["version"] = 3;
-            return d;
+            return array_interface(a4);
         })
 
+        // CPU: __array_function__ interface (TODO)
+        //
+        // NEP 18 — A dispatch mechanism for NumPy's high level array functions.
+        // https://numpy.org/neps/nep-0018-array-function-protocol.html
+        // This enables code using NumPy to be directly operated on Array4 arrays.
+        // __array_function__ feature requires NumPy 1.16 or later.
+
 
-        // TODO: __cuda_array_interface__
+        // Nvidia GPUs: __cuda_array_interface__ v2
         // https://numba.readthedocs.io/en/latest/cuda/cuda_array_interface.html
+        .def_property_readonly("__cuda_array_interface__", [](Array4<T> const & a4) {
+            auto d = array_interface(a4);
+
+            // data:
+            // Because the user of the interface may or may not be in the same context, the most common case is to use cuPointerGetAttribute with CU_POINTER_ATTRIBUTE_DEVICE_POINTER in the CUDA driver API (or the equivalent CUDA Runtime API) to retrieve a device pointer that is usable in the currently active context.
+            // TODO For zero-size arrays, use 0 here.
+
+            // ... TODO: wasn't there some stream or device info?
+
+            d["version"] = 2;
+            return d;
+        })
 
 
-        // TODO: __dlpack__
+        // TODO: __dlpack__ __dlpack_device__
         // DLPack protocol (CPU, NVIDIA GPU, AMD GPU, Intel GPU, etc.)
         // https://dmlc.github.io/dlpack/latest/
         // https://data-apis.org/array-api/latest/design_topics/data_interchange.html
         // https://github.com/data-apis/consortium-feedback/issues/1
         // https://github.com/dmlc/dlpack/blob/master/include/dlpack/dlpack.h
+        // https://docs.cupy.dev/en/stable/user_guide/interoperability.html#dlpack-data-exchange-protocol
 
 
         .def("contains", &Array4<T>::contains)

tests/test_array4.py

Lines changed: 80 additions & 33 deletions
@@ -69,36 +69,83 @@ def test_array4():
     x[1, 1, 1] = 44
     assert v_carr2np[0, 1, 1, 1] == 44
 
-    # from cupy
-
-    # to numpy
-
-    # to cupy
-
-    return
-
-    # Check indexing
-    assert obj[0] == 1
-    assert obj[1] == 2
-    assert obj[2] == 3
-    assert obj[-1] == 3
-    assert obj[-2] == 2
-    assert obj[-3] == 1
-    with pytest.raises(IndexError):
-        obj[-4]
-    with pytest.raises(IndexError):
-        obj[3]
-
-    # Check assignment
-    obj[0] = 2
-    obj[1] = 3
-    obj[2] = 4
-    assert obj[0] == 2
-    assert obj[1] == 3
-    assert obj[2] == 4
-
-
-# def test_iv_conversions():
-#     obj = amrex.IntVect.max_vector().numpy()
-#     assert(isinstance(obj, np.ndarray))
-#     assert(obj.dtype == np.int32)
+
+
+@pytest.mark.skipif(
+    amrex.Config.gpu_backend != "CUDA", reason="Requires AMReX_GPU_BACKEND=CUDA"
+)
+def test_array4_numba():
+    # https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html
+    import numba
+
+    # numba -> AMReX Array4
+    x = np.ones(
+        (
+            2,
+            3,
+            4,
+        )
+    )  # type: numpy.ndarray
+
+    # host-to-device copy
+    x_numba = numba.cuda.to_device(
+        x
+    )  # type: numba.cuda.cudadrv.devicearray.DeviceNDArray
+    # x_cupy = cupy.asarray(x_numba)  # type: cupy.ndarray
+    x_arr = amrex.Array4_double(x_numba)  # type: amrex.Array4_double
+
+    assert (
+        x_arr.__cuda_array_interface__["data"][0]
+        == x_numba.__cuda_array_interface__["data"][0]
+    )
+
+    # AMReX -> numba
+    # arr_numba = cuda.as_cuda_array(arr4)
+    # ... or as MultiFab test
+    # TODO
+
+
+@pytest.mark.skipif(
+    amrex.Config.gpu_backend != "CUDA", reason="Requires AMReX_GPU_BACKEND=CUDA"
+)
+def test_array4_cupy():
+    # https://docs.cupy.dev/en/stable/user_guide/interoperability.html
+    import cupy as cp
+
+    # cupy -> AMReX Array4
+    x = np.ones(
+        (
+            2,
+            3,
+            4,
+        )
+    )  # TODO: merge into next line and create on device?
+    x_cupy = cp.asarray(x)  # type: cupy.ndarray
+    print(f"x_cupy={x_cupy}")
+    print(x_cupy.__cuda_array_interface__)
+
+    # cupy -> AMReX Array4
+    x_arr = amrex.Array4_double(x_cupy)  # type: amrex.Array4_double
+    print(f"x_arr={x_arr}")
+    print(x_arr.__cuda_array_interface__)
+
+    assert (
+        x_arr.__cuda_array_interface__["data"][0]
+        == x_cupy.__cuda_array_interface__["data"][0]
+    )
+
+    # AMReX -> cupy
+    # arr_numba = cuda.as_cuda_array(arr4)
+    # ... or as MultiFab test
+    # TODO
+
+
+@pytest.mark.skipif(
+    amrex.Config.gpu_backend != "CUDA", reason="Requires AMReX_GPU_BACKEND=CUDA"
+)
+def test_array4_pytorch():
+    # https://docs.cupy.dev/en/stable/user_guide/interoperability.html#pytorch
+    # arr_torch = torch.as_tensor(arr, device='cuda')
+    # assert(arr_torch.__cuda_array_interface__['data'][0] == arr.__cuda_array_interface__['data'][0])
+    # TODO
+
+    pass
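
The reverse direction left as a TODO above ("AMReX -> cupy") could look roughly like the sketch below, since `cupy.asarray` also consumes `__cuda_array_interface__`; `x_arr` and `x_cupy` refer to the objects created in `test_array4_cupy`, and the snippet is an assumption about how the TODO might be filled in, not part of this commit:

# Hedged sketch for the "AMReX -> cupy" TODO: cupy.asarray consumes
# __cuda_array_interface__ and returns a zero-copy device view.
import cupy as cp

arr_cupy = cp.asarray(x_arr)  # type: cupy.ndarray, no device-to-device copy
assert (
    arr_cupy.__cuda_array_interface__["data"][0]
    == x_cupy.__cuda_array_interface__["data"][0]
)
arr_cupy += 1  # operates in place on the same device buffer the Array4 points to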

tests/test_multifab.py

Lines changed: 53 additions & 0 deletions
@@ -152,3 +152,56 @@ def test_mfab_mfiter(mfab, nghost):
         cnt += 1
 
     assert iter(mfab).length == cnt
+
+
+@pytest.mark.skipif(
+    amrex.Config.gpu_backend != "CUDA", reason="Requires AMReX_GPU_BACKEND=CUDA"
+)
+def test_mfab_ops_cuda_numba():
+    # https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html
+    import numba
+
+    # AMReX -> numba
+    # arr_numba = cuda.as_cuda_array(arr4)
+    # TODO
+
+
+@pytest.mark.skipif(
+    amrex.Config.gpu_backend != "CUDA", reason="Requires AMReX_GPU_BACKEND=CUDA"
+)
+def test_mfab_ops_cuda_cupy():
+    # https://docs.cupy.dev/en/stable/user_guide/interoperability.html
+    import cupy as cp
+
+    # AMReX -> cupy
+    # arr_numba = cuda.as_cuda_array(arr4)
+    # TODO
+
+
+@pytest.mark.skipif(
+    amrex.Config.gpu_backend != "CUDA", reason="Requires AMReX_GPU_BACKEND=CUDA"
+)
+def test_mfab_ops_cuda_pytorch():
+    # https://docs.cupy.dev/en/stable/user_guide/interoperability.html#pytorch
+    import torch
+
+    # AMReX -> pytorch
+    # arr_torch = torch.as_tensor(arr, device='cuda')
+    # assert(arr_torch.__cuda_array_interface__['data'][0] == arr.__cuda_array_interface__['data'][0])
+    # TODO
+
+
+@pytest.mark.skipif(
+    amrex.Config.gpu_backend != "CUDA", reason="Requires AMReX_GPU_BACKEND=CUDA"
+)
+def test_mfab_ops_cuda_cuml():
+    # https://github.com/rapidsai/cuml
+    # https://github.com/rapidsai/cudf
+    # maybe better for particles as a dataframe test
+    import cudf
+    import cuml
+
+    # AMReX -> RAPIDSAI cuML
+    # arr_cuml = ...
+    # assert(arr_cuml.__cuda_array_interface__['data'][0] == arr.__cuda_array_interface__['data'][0])
+    # TODO
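
For these MultiFab TODOs, the per-box pattern would presumably mirror the Array4 tests: loop over local boxes, view each box's Array4 on the device, and operate on it with a CUDA-aware library. A rough sketch, assuming a CUDA build, the `mfab` fixture from this test file, and an `mfab.array(mfi)` accessor returning an Array4 (both assumptions, not established by this commit):

# Hedged sketch for the MultiFab TODOs: per-box, zero-copy device views via cupy.
# Assumes a CUDA build, the `mfab` fixture, and an `mfab.array(mfi)` accessor.
import cupy as cp

def scale_mfab_on_device(mfab, factor=2.0):
    for mfi in mfab:            # loop over local boxes, as in test_mfab_mfiter
        arr4 = mfab.array(mfi)  # Array4 view of this box (assumed accessor)
        x = cp.asarray(arr4)    # zero-copy device view via __cuda_array_interface__
        x *= factor             # runs on the GPU, in place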
