-
Notifications
You must be signed in to change notification settings - Fork 33
Closed
Description
Let's say we want to mutliply two matrices (of dimensions |a| = X*Y and |b| = Y*Z, i.e. a x b = c, |c| = X*Z) using this SYCL code:
#include <iostream>
#include <vector>
#include <string>
#include <CL/sycl.hpp>
int matmul(std::vector<int> &a,
std::vector<int> &b,
size_t row_a, size_t col_a,
size_t row_b, size_t col_b) {
size_t X = row_a, Y = col_a, Z = col_b;
std::vector<int> c(X * Z, 0);
sycl::queue queue;
std::cout << "Selected device: "
<< queue.get_device().get_info<sycl::info::device::name>()
<< "\n";
sycl::buffer Ab{a};
sycl::buffer Bb{b};
sycl::buffer Cb{c};
queue.submit([&](sycl::handler& h) {
sycl::accessor Ad{Ab,h,sycl::read_only};
sycl::accessor Bd{Bb,h,sycl::read_only};
sycl::accessor Cd{Cb,h,sycl::read_write};
sycl::range global{X, Z};
sycl::stream out(1024, 256, h);
h.parallel_for(sycl::range{global}, [=](sycl::id<2> index) {
auto i = index[0];
auto j = index[1];
out << "i = " << i << ", j = " << j << sycl::endl;
for (auto k = 0; k < Y; ++k) {
Cd[i*Z+j] += Ad[i*Y+k] * Bd[k*Z+j];
}
});
});
sycl::host_accessor Ch{Cb, sycl::read_only};
for(auto i = 0 ; i < X ; i++) {
for(auto j = 0 ; j < Z ; j++)
std::cout << Ch[i*Z+j] << " ";
std::cout << std::endl;
}
return 0;
}
int main () {
std::vector<int> a(20,1);
std::vector<int> b(24,1);
matmul(a, b, 5, 4, 4, 6);
return 0;
}What we are essentially doing is parallalizing this loop:
for (auto i = 0 ; i < X ; i++)
for(auto j = 0; j < Z; j++)
for(auto k = 0 ; k < Y; k++)
c[i,j] += a[i,k] * b[k,j]The parallel_for gets the two outer-most loop indices through get_global_id() in the kernel. Now if we print the indices, we get an expected iteration:
i = 0, j = 0
i = 0, j = 1
i = 0, j = 2
i = 0, j = 3
i = 0, j = 4
i = 0, j = 5
i = 1, j = 0
i = 1, j = 1
i = 1, j = 2
i = 1, j = 3
i = 1, j = 4
i = 1, j = 5
i = 2, j = 0
i = 2, j = 1
i = 2, j = 2
i = 2, j = 3
i = 2, j = 4
i = 2, j = 5
i = 3, j = 0
i = 3, j = 1
i = 3, j = 2
i = 3, j = 3
i = 3, j = 4
i = 3, j = 5
i = 4, j = 0
i = 4, j = 1
i = 4, j = 2
i = 4, j = 3
i = 4, j = 4
i = 4, j = 5Now, if we look at the similar matrix multiplication code in numba_dpex:
import dpctl
import dpctl.tensor as dpt
import numba_dpex as dpex
# Array dimensions
X = 5
Y = 4
Z = 6
global_size = X, Z
@dpex.kernel
def matmul(a, b, c):
i = dpex.get_global_id(0)
j = dpex.get_global_id(1)
print("i =", i, "j =", j)
for k in range(Y):
c[i, j] += a[i, k] * b[k, j]
def driver(a, b, c):
# Invoke the kernel
matmul[global_size, dpex.DEFAULT_LOCAL_SIZE](a,b,c)
if __name__ == "__main__":
device = dpctl.select_default_device()
a = dpt.ones(X * Y, dtype=dpt.int32, device=device)
a = dpt.reshape(a, (X, Y))
b = dpt.ones(Y * Z, dtype=dpt.int32, device=device)
b = dpt.reshape(b, (Y, Z))
c = dpt.zeros(X * Z, dtype=dpt.int32, device=device)
c = dpt.reshape(c, (X, Z))
print("Using device ...")
device.print_device_info()
driver(a, b, c)The iteration order is flipped:
i = 0 j = 0
i = 1 j = 0
i = 2 j = 0
i = 3 j = 0
i = 4 j = 0
i = 0 j = 1
i = 1 j = 1
i = 2 j = 1
i = 3 j = 1
i = 4 j = 1
i = 0 j = 2
i = 1 j = 2
i = 2 j = 2
i = 3 j = 2
i = 4 j = 2
i = 0 j = 3
i = 1 j = 3
i = 2 j = 3
i = 3 j = 3
i = 4 j = 3
i = 0 j = 4
i = 1 j = 4
i = 2 j = 4
i = 3 j = 4
i = 4 j = 4
i = 0 j = 5
i = 1 j = 5
i = 2 j = 5
i = 3 j = 5
i = 4 j = 5Why is that? Given that all the parameter settings in the python code is identical to the c++ code.
fcharras
Metadata
Metadata
Assignees
Labels
No labels