Skip to content

Commit d2937d0

Browse files
committed
Array4: __cuda_array_interface__ v2
Start implementing the `__cuda_array_interface__` for zero-copy data exchange on Nvidia CUDA GPUs.
1 parent 88743cf commit d2937d0

File tree

3 files changed

+188
-70
lines changed

3 files changed

+188
-70
lines changed

src/Base/Array4.cpp

Lines changed: 81 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,61 @@ namespace py = pybind11;
1919
using namespace amrex;
2020

2121

22+
namespace
23+
{
24+
/** CPU: __array_interface__ v3
25+
*
26+
* https://numpy.org/doc/stable/reference/arrays.interface.html
27+
*/
28+
template<typename T>
29+
py::dict
30+
array_interface(Array4<T> const & a4)
31+
{
32+
auto d = py::dict();
33+
auto const len = length(a4);
34+
// F->C index conversion here
35+
// p[(i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride+n*nstride];
36+
// Buffer dimensions: zero-size shall not skip dimension
37+
auto shape = py::make_tuple(
38+
a4.ncomp,
39+
len.z <= 0 ? 1 : len.z,
40+
len.y <= 0 ? 1 : len.y,
41+
len.x <= 0 ? 1 : len.x // fastest varying index
42+
);
43+
// buffer protocol strides are in bytes, AMReX strides are elements
44+
auto const strides = py::make_tuple(
45+
sizeof(T) * a4.nstride,
46+
sizeof(T) * a4.kstride,
47+
sizeof(T) * a4.jstride,
48+
sizeof(T) // fastest varying index
49+
);
50+
bool const read_only = false;
51+
d["data"] = py::make_tuple(std::intptr_t(a4.dataPtr()), read_only);
52+
// note: if we want to keep the same global indexing with non-zero
53+
// box small_end as in AMReX, then we can explore playing with
54+
// this offset as well
55+
//d["offset"] = 0; // default
56+
//d["mask"] = py::none(); // default
57+
58+
d["shape"] = shape;
59+
// we could also set this after checking the strides are C-style contiguous:
60+
//if (is_contiguous<T>(shape, strides))
61+
// d["strides"] = py::none(); // C-style contiguous
62+
//else
63+
d["strides"] = strides;
64+
65+
// type description
66+
// for more complicated types, e.g., tuples/structs
67+
//d["descr"] = ...;
68+
// we currently only need this
69+
d["typestr"] = py::format_descriptor<T>::format();
70+
71+
d["version"] = 3;
72+
return d;
73+
}
74+
}
75+
76+
2277
template< typename T >
2378
void make_Array4(py::module &m, std::string typestr)
2479
{
@@ -85,56 +140,44 @@ void make_Array4(py::module &m, std::string typestr)
85140
return a4;
86141
}))
87142

143+
144+
// CPU: __array_interface__ v3
145+
// https://numpy.org/doc/stable/reference/arrays.interface.html
88146
.def_property_readonly("__array_interface__", [](Array4<T> const & a4) {
89-
auto d = py::dict();
90-
auto const len = length(a4);
91-
// F->C index conversion here
92-
// p[(i-begin.x)+(j-begin.y)*jstride+(k-begin.z)*kstride+n*nstride];
93-
// Buffer dimensions: zero-size shall not skip dimension
94-
auto shape = py::make_tuple(
95-
a4.ncomp,
96-
len.z <= 0 ? 1 : len.z,
97-
len.y <= 0 ? 1 : len.y,
98-
len.x <= 0 ? 1 : len.x // fastest varying index
99-
);
100-
// buffer protocol strides are in bytes, AMReX strides are elements
101-
auto const strides = py::make_tuple(
102-
sizeof(T) * a4.nstride,
103-
sizeof(T) * a4.kstride,
104-
sizeof(T) * a4.jstride,
105-
sizeof(T) // fastest varying index
106-
);
107-
bool const read_only = false;
108-
d["data"] = py::make_tuple(std::intptr_t(a4.dataPtr()), read_only);
109-
// note: if we want to keep the same global indexing with non-zero
110-
// box small_end as in AMReX, then we can explore playing with
111-
// this offset as well
112-
//d["offset"] = 0; // default
113-
//d["mask"] = py::none(); // default
114-
115-
d["shape"] = shape;
116-
// we could also set this after checking the strides are C-style contiguous:
117-
//if (is_contiguous<T>(shape, strides))
118-
// d["strides"] = py::none(); // C-style contiguous
119-
//else
120-
d["strides"] = strides;
121-
122-
d["typestr"] = py::format_descriptor<T>::format();
123-
d["version"] = 3;
124-
return d;
147+
return array_interface(a4);
125148
})
126149

150+
// CPU: __array_function__ interface (TODO)
151+
//
152+
// NEP 18 — A dispatch mechanism for NumPy's high level array functions.
153+
// https://numpy.org/neps/nep-0018-array-function-protocol.html
154+
// This enables code using NumPy to be directly operated on Array4 arrays.
155+
// __array_function__ feature requires NumPy 1.16 or later.
156+
127157

128-
// TODO: __cuda_array_interface__
158+
// Nvidia GPUs: __cuda_array_interface__ v2
129159
// https://numba.readthedocs.io/en/latest/cuda/cuda_array_interface.html
160+
.def_property_readonly("__cuda_array_interface__", [](Array4<T> const & a4) {
161+
auto d = array_interface(a4);
162+
163+
// data:
164+
// Because the user of the interface may or may not be in the same context, the most common case is to use cuPointerGetAttribute with CU_POINTER_ATTRIBUTE_DEVICE_POINTER in the CUDA driver API (or the equivalent CUDA Runtime API) to retrieve a device pointer that is usable in the currently active context.
165+
// TODO For zero-size arrays, use 0 here.
166+
167+
// ... TODO: wasn't there some stream or device info?
168+
169+
d["version"] = 2;
170+
return d;
171+
})
130172

131173

132-
// TODO: __dlpack__
174+
// TODO: __dlpack__ __dlpack_device__
133175
// DLPack protocol (CPU, NVIDIA GPU, AMD GPU, Intel GPU, etc.)
134176
// https://dmlc.github.io/dlpack/latest/
135177
// https://data-apis.org/array-api/latest/design_topics/data_interchange.html
136178
// https://github.com/data-apis/consortium-feedback/issues/1
137179
// https://github.com/dmlc/dlpack/blob/master/include/dlpack/dlpack.h
180+
// https://docs.cupy.dev/en/stable/user_guide/interoperability.html#dlpack-data-exchange-protocol
138181

139182

140183
.def("contains", &Array4<T>::contains)

tests/test_array4.py

Lines changed: 59 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55
import amrex
66

7+
78
def test_array4_empty():
89
empty = amrex.Array4_double()
910

@@ -17,6 +18,7 @@ def test_array4_empty():
1718
assert(emptyc.size == 0)
1819
assert(emptyc.nComp == 0)
1920

21+
2022
def test_array4():
2123
# from numpy (also a non-owning view)
2224
x = np.ones((2, 3, 4,))
@@ -60,35 +62,60 @@ def test_array4():
6062
x[1, 1, 1] = 44
6163
assert(v_carr2np[0, 1, 1, 1] == 44)
6264

63-
# from cupy
64-
65-
# to numpy
66-
67-
# to cupy
68-
69-
return
70-
71-
# Check indexing
72-
assert(obj[0] == 1)
73-
assert(obj[1] == 2)
74-
assert(obj[2] == 3)
75-
assert(obj[-1] == 3)
76-
assert(obj[-2] == 2)
77-
assert(obj[-3] == 1)
78-
with pytest.raises(IndexError):
79-
obj[-4]
80-
with pytest.raises(IndexError):
81-
obj[3]
82-
83-
# Check assignment
84-
obj[0] = 2
85-
obj[1] = 3
86-
obj[2] = 4
87-
assert(obj[0] == 2)
88-
assert(obj[1] == 3)
89-
assert(obj[2] == 4)
90-
91-
#def test_iv_conversions():
92-
# obj = amrex.IntVect.max_vector().numpy()
93-
# assert(isinstance(obj, np.ndarray))
94-
# assert(obj.dtype == np.int32)
65+
66+
@pytest.mark.skipif(amrex.Config.gpu_backend != "CUDA",
67+
reason="Requires AMReX_GPU_BACKEND=CUDA")
68+
def test_array4_numba():
69+
# https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html
70+
import numba
71+
72+
# numba -> AMReX Array4
73+
x = np.ones((2, 3, 4,)) # type: numpy.ndarray
74+
75+
# host-to-device copy
76+
x_numba = numba.cuda.to_device(x) # type: numba.cuda.cudadrv.devicearray.DeviceNDArray
77+
#x_cupy = cupy.asarray(x_numba) # type: cupy.ndarray
78+
x_arr = amrex.Array4_double(x_numba) # type: amrex.Array4_double
79+
80+
assert(x_arr.__cuda_array_interface__['data'][0] == x_numba.__cuda_array_interface__['data'][0])
81+
82+
# AMReX -> numba
83+
#arr_numba = cuda.as_cuda_array(arr4)
84+
# ... or as MultiFab test
85+
# TODO
86+
87+
88+
@pytest.mark.skipif(amrex.Config.gpu_backend != "CUDA",
89+
reason="Requires AMReX_GPU_BACKEND=CUDA")
90+
def test_array4_cupy():
91+
# https://docs.cupy.dev/en/stable/user_guide/interoperability.html
92+
import cupy as cp
93+
94+
# cupy -> AMReX Array4
95+
x = np.ones((2, 3, 4,)) # TODO: merge into next line and create on device?
96+
x_cupy = cp.asarray(x) # type: cupy.ndarray
97+
print(f"x_cupy={x_cupy}")
98+
print(x_cupy.__cuda_array_interface__)
99+
100+
# cupy -> AMReX array4
101+
x_arr = amrex.Array4_double(x_cupy) # type: amrex.Array4_double
102+
print(f"x_arr={x_arr}")
103+
print(x_arr.__cuda_array_interface__)
104+
105+
assert(x_arr.__cuda_array_interface__['data'][0] == x_cupy.__cuda_array_interface__['data'][0])
106+
107+
# AMReX -> cupy
108+
#arr_numba = cuda.as_cuda_array(arr4)
109+
# ... or as MultiFab test
110+
# TODO
111+
112+
113+
@pytest.mark.skipif(amrex.Config.gpu_backend != "CUDA",
114+
reason="Requires AMReX_GPU_BACKEND=CUDA")
115+
def test_array4_pytorch():
116+
# https://docs.cupy.dev/en/stable/user_guide/interoperability.html#pytorch
117+
#arr_torch = torch.as_tensor(arr, device='cuda')
118+
#assert(arr_torch.__cuda_array_interface__['data'][0] == arr.__cuda_array_interface__['data'][0])
119+
# TODO
120+
121+
pass

tests/test_multifab.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,3 +142,51 @@ def test_mfab_ops(boxarr, distmap, nghost):
142142
np.testing.assert_allclose(dst.min(0), 150.0)
143143
np.testing.assert_allclose(dst.max(0), 150.0)
144144

145+
146+
@pytest.mark.skipif(amrex.Config.gpu_backend != "CUDA",
147+
reason="Requires AMReX_GPU_BACKEND=CUDA")
148+
def test_mfab_ops_cuda_numba():
149+
# https://numba.pydata.org/numba-doc/dev/cuda/cuda_array_interface.html
150+
import numba
151+
152+
# AMReX -> numba
153+
#arr_numba = cuda.as_cuda_array(arr4)
154+
# TODO
155+
156+
157+
@pytest.mark.skipif(amrex.Config.gpu_backend != "CUDA",
158+
reason="Requires AMReX_GPU_BACKEND=CUDA")
159+
def test_mfab_ops_cuda_cupy():
160+
# https://docs.cupy.dev/en/stable/user_guide/interoperability.html
161+
import cupy as cp
162+
163+
# AMReX -> cupy
164+
#arr_numba = cuda.as_cuda_array(arr4)
165+
# TODO
166+
167+
168+
@pytest.mark.skipif(amrex.Config.gpu_backend != "CUDA",
169+
reason="Requires AMReX_GPU_BACKEND=CUDA")
170+
def test_mfab_ops_cuda_pytorch():
171+
# https://docs.cupy.dev/en/stable/user_guide/interoperability.html#pytorch
172+
import torch
173+
174+
# AMReX -> pytorch
175+
#arr_torch = torch.as_tensor(arr, device='cuda')
176+
#assert(arr_torch.__cuda_array_interface__['data'][0] == arr.__cuda_array_interface__['data'][0])
177+
# TODO
178+
179+
180+
@pytest.mark.skipif(amrex.Config.gpu_backend != "CUDA",
181+
reason="Requires AMReX_GPU_BACKEND=CUDA")
182+
def test_mfab_ops_cuda_cuml():
183+
# https://github.com/rapidsai/cuml
184+
# https://github.com/rapidsai/cudf
185+
# maybe better for particles as a dataframe test
186+
import cudf
187+
import cuml
188+
189+
# AMReX -> RAPIDSAI cuML
190+
#arr_cuml = ...
191+
#assert(arr_cuml.__cuda_array_interface__['data'][0] == arr.__cuda_array_interface__['data'][0])
192+
# TODO

0 commit comments

Comments
 (0)