Skip to content

Commit ae5348a

Browse files
[openmp][amdgpu] Make plugin robust to presence of explicit implicit arguments
OpenMP (compiler) does not currently request any implicit kernel arguments. OpenMP (runtime) allocates and initialises a reasonable guess at the implicit kernel arguments anyway. This change makes the plugin check the number of explicit arguments, instead of the number of all arguments, and puts the pointer to the hostcall buffer both in its current location and at the offset expected once implicit arguments are added to the metadata by D113538. This is intended to keep things running while the oversight in the compiler is fixed (in D113538). Once that patch lands, and a following one marks OpenMP kernels that use printf such that the backend emits an args element with the right type (instead of hidden_none), the over-allocation can be removed and the hardcoded 8*(e+3) offset (where e is the explicit argument count) replaced with one read from the .offset of the corresponding metadata element.

Reviewed By: estewart08
Differential Revision: https://reviews.llvm.org/D114274
1 parent 7aafe46 commit ae5348a

File tree

3 files changed

+25
-7
lines changed

3 files changed

+25
-7
lines changed

openmp/libomptarget/plugins/amdgpu/impl/internal.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,8 @@ typedef struct atl_kernel_info_s {
5454
uint32_t sgpr_spill_count;
5555
uint32_t vgpr_spill_count;
5656
uint32_t kernel_segment_size;
57-
uint32_t num_args;
57+
uint32_t explicit_argument_count;
58+
uint32_t implicit_argument_count;
5859
} atl_kernel_info_t;
5960

6061
typedef struct atl_symbol_info_s {

openmp/libomptarget/plugins/amdgpu/impl/system.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -381,7 +381,7 @@ static hsa_status_t get_code_object_custom_metadata(
381381
return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
382382
}
383383

384-
atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0};
384+
atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
385385

386386
uint64_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count;
387387
msgpack_errors += map_lookup_uint64_t(element, ".sgpr_count", &sgpr_count);
@@ -446,8 +446,6 @@ static hsa_status_t get_code_object_custom_metadata(
446446
return HSA_STATUS_ERROR_INVALID_CODE_OBJECT;
447447
}
448448

449-
info.num_args = argsSize;
450-
451449
for (size_t i = 0; i < argsSize; ++i) {
452450
KernelArgMD lcArg;
453451

@@ -476,8 +474,10 @@ static hsa_status_t get_code_object_custom_metadata(
476474
// check if the arg is a hidden/implicit arg
477475
// this logic assumes that all hidden args are 8-byte aligned
478476
if (!isImplicit(lcArg.valueKind_)) {
477+
info.explicit_argument_count++;
479478
kernel_explicit_args_size += lcArg.size_;
480479
} else {
480+
info.implicit_argument_count++;
481481
hasHiddenArgs = true;
482482
}
483483
kernel_explicit_args_size += padding;

openmp/libomptarget/plugins/amdgpu/src/rtl.cpp

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2071,7 +2071,7 @@ int32_t __tgt_rtl_run_target_team_region_locked(
20712071
const uint32_t sgpr_spill_count = KernelInfoEntry.sgpr_spill_count;
20722072
const uint32_t vgpr_spill_count = KernelInfoEntry.vgpr_spill_count;
20732073

2074-
assert(arg_num == (int)KernelInfoEntry.num_args);
2074+
assert(arg_num == (int)KernelInfoEntry.explicit_argument_count);
20752075

20762076
/*
20772077
* Set limit based on ThreadsPerGroup and GroupsPerDevice
@@ -2173,14 +2173,31 @@ int32_t __tgt_rtl_run_target_team_region_locked(
21732173
// under a multiple reader lock, not a writer lock.
21742174
static pthread_mutex_t hostcall_init_lock = PTHREAD_MUTEX_INITIALIZER;
21752175
pthread_mutex_lock(&hostcall_init_lock);
2176-
impl_args->hostcall_ptr = hostrpc_assign_buffer(
2176+
unsigned long buffer = hostrpc_assign_buffer(
21772177
DeviceInfo.HSAAgents[device_id], queue, device_id);
21782178
pthread_mutex_unlock(&hostcall_init_lock);
2179-
if (!impl_args->hostcall_ptr) {
2179+
if (!buffer) {
21802180
DP("hostrpc_assign_buffer failed, gpu would dereference null and "
21812181
"error\n");
21822182
return OFFLOAD_FAIL;
21832183
}
2184+
2185+
if (KernelInfoEntry.implicit_argument_count >= 4) {
2186+
// Initialise pointer for implicit_argument_count != 0 ABI
2187+
// Guess that the right implicit argument is at offset 24 after
2188+
// the explicit arguments. In the future, should be able to read
2189+
// the offset from msgpack. Clang is not annotating it at present.
2190+
uint64_t Offset =
2191+
sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3);
2192+
if ((Offset + 8) > (ArgPool->kernarg_segment_size)) {
2193+
DP("Bad offset of hostcall, exceeds kernarg segment size\n");
2194+
} else {
2195+
memcpy(static_cast<char *>(kernarg) + Offset, &buffer, 8);
2196+
}
2197+
}
2198+
2199+
// initialise pointer for implicit_argument_count == 0 ABI
2200+
impl_args->hostcall_ptr = buffer;
21842201
}
21852202

21862203
packet->kernarg_address = kernarg;

0 commit comments

Comments
 (0)