Skip to content

get_global_id() indexing order doesn't follow that of SYCL indexing  #889

@chudur-budur

Description

@chudur-budur

Let's say we want to mutliply two matrices (of dimensions |a| = X*Y and |b| = Y*Z, i.e. a x b = c, |c| = X*Z) using this SYCL code:

#include <iostream>
#include <vector>
#include <string>
#include <CL/sycl.hpp>

int matmul(std::vector<int> &a,
            std::vector<int> &b,
            size_t row_a, size_t col_a,
            size_t row_b, size_t col_b) {

    size_t X = row_a, Y = col_a, Z = col_b;

    std::vector<int> c(X * Z, 0);

    sycl::queue queue;
    std::cout << "Selected device: "
            << queue.get_device().get_info<sycl::info::device::name>()
            << "\n";

    sycl::buffer Ab{a};
    sycl::buffer Bb{b};
    sycl::buffer Cb{c};

    queue.submit([&](sycl::handler& h) {

        sycl::accessor Ad{Ab,h,sycl::read_only};
        sycl::accessor Bd{Bb,h,sycl::read_only};
        sycl::accessor Cd{Cb,h,sycl::read_write};

        sycl::range global{X, Z};

        sycl::stream out(1024, 256, h);
        h.parallel_for(sycl::range{global}, [=](sycl::id<2> index) {
            auto i = index[0];
            auto j = index[1];
            out << "i = " << i << ", j = " << j << sycl::endl;
            for (auto k = 0; k < Y; ++k) {
              Cd[i*Z+j] += Ad[i*Y+k] * Bd[k*Z+j];
            }
        });
    });

    sycl::host_accessor Ch{Cb, sycl::read_only};
    for(auto i = 0 ; i < X ; i++) {
        for(auto j = 0 ; j < Z ; j++)
            std::cout << Ch[i*Z+j] << " ";
        std::cout << std::endl;
    }
    return 0;
}

int main () {
    std::vector<int> a(20,1);
    std::vector<int> b(24,1);

    matmul(a, b, 5, 4, 4, 6);
    return 0;
}

What we are essentially doing is parallalizing this loop:

for (auto i = 0 ; i < X ; i++)
    for(auto j = 0; j < Z; j++) 
        for(auto k = 0 ; k < Y; k++)
            c[i,j] += a[i,k] * b[k,j]

The parallel_for gets the two outer-most loop indices through get_global_id() in the kernel. Now if we print the indices, we get an expected iteration:

i = 0, j = 0
i = 0, j = 1
i = 0, j = 2
i = 0, j = 3
i = 0, j = 4
i = 0, j = 5
i = 1, j = 0
i = 1, j = 1
i = 1, j = 2
i = 1, j = 3
i = 1, j = 4
i = 1, j = 5
i = 2, j = 0
i = 2, j = 1
i = 2, j = 2
i = 2, j = 3
i = 2, j = 4
i = 2, j = 5
i = 3, j = 0
i = 3, j = 1
i = 3, j = 2
i = 3, j = 3
i = 3, j = 4
i = 3, j = 5
i = 4, j = 0
i = 4, j = 1
i = 4, j = 2
i = 4, j = 3
i = 4, j = 4
i = 4, j = 5

Now, if we look at the similar matrix multiplication code in numba_dpex:

import dpctl
import dpctl.tensor as dpt
import numba_dpex as dpex

# Array dimensions
X = 5
Y = 4
Z = 6
global_size = X, Z

@dpex.kernel
def matmul(a, b, c):
    i = dpex.get_global_id(0)
    j = dpex.get_global_id(1)
    print("i =", i, "j =", j)
    for k in range(Y):
        c[i, j] += a[i, k] * b[k, j]


def driver(a, b, c):
    # Invoke the kernel
    matmul[global_size, dpex.DEFAULT_LOCAL_SIZE](a,b,c)


if __name__ == "__main__":
    device = dpctl.select_default_device()
    a = dpt.ones(X * Y, dtype=dpt.int32, device=device)
    a = dpt.reshape(a, (X, Y))
    
    b = dpt.ones(Y * Z, dtype=dpt.int32, device=device)
    b = dpt.reshape(b, (Y, Z))
    
    c = dpt.zeros(X * Z, dtype=dpt.int32, device=device)
    c = dpt.reshape(c, (X, Z))

    print("Using device ...")
    device.print_device_info()

    driver(a, b, c)

The iteration order is flipped:

i = 0 j = 0
i = 1 j = 0
i = 2 j = 0
i = 3 j = 0
i = 4 j = 0
i = 0 j = 1
i = 1 j = 1
i = 2 j = 1
i = 3 j = 1
i = 4 j = 1
i = 0 j = 2
i = 1 j = 2
i = 2 j = 2
i = 3 j = 2
i = 4 j = 2
i = 0 j = 3
i = 1 j = 3
i = 2 j = 3
i = 3 j = 3
i = 4 j = 3
i = 0 j = 4
i = 1 j = 4
i = 2 j = 4
i = 3 j = 4
i = 4 j = 4
i = 0 j = 5
i = 1 j = 5
i = 2 j = 5
i = 3 j = 5
i = 4 j = 5

Why is that? Given that all the parameter settings in the python code is identical to the c++ code.

Metadata

Metadata

Assignees

Labels

No labels
No labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions