Skip to content

Commit 0d27d54

Browse files
committed
add timing utils for amdgpu
1 parent 1abe22c commit 0d27d54

File tree

3 files changed

+81
-1
lines changed

3 files changed

+81
-1
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# Header-only library exposing the AMDGPU timing helpers in timing.h.
# DEPENDS mirrors the project headers that timing.h includes directly.
add_header_library(
  amdgpu_timing
  HDRS
    timing.h
  DEPENDS
    libc.src.__support.common
    libc.src.__support.GPU.utils
    libc.src.__support.macros.attributes
    libc.src.__support.macros.config
)
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
//===------------- AMDGPU implementation of timing utils --------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
10+
#define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
11+
12+
#include "src/__support/GPU/utils.h"
13+
#include "src/__support/common.h"
14+
#include "src/__support/macros/attributes.h"
15+
#include "src/__support/macros/config.h"
16+
17+
#include <stdint.h>
18+
19+
namespace LIBC_NAMESPACE {
20+
21+
// Returns the overhead associated with calling the profiling region. This
22+
// allows us to substract the constant-time overhead from the latency to
23+
// obtain a true result. This can vary with system load.
24+
[[gnu::noinline]] static LIBC_INLINE uint64_t overhead() {
25+
gpu::memory_fence();
26+
uint64_t start = gpu::processor_clock();
27+
uint32_t result = 0.0;
28+
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
29+
asm("" ::"s"(start));
30+
uint64_t stop = gpu::processor_clock();
31+
return stop - start;
32+
}
33+
34+
// Profile a simple function and obtain its latency in clock cycles on the
35+
// system. This function cannot be inlined or else it will disturb the very
36+
// delicate balance of hard-coded dependencies.
37+
template <typename F, typename T>
38+
[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) {
39+
// We need to store the input somewhere to guarantee that the compiler will
40+
// not constant propagate it and remove the profiling region.
41+
volatile uint32_t storage = t;
42+
float arg = storage;
43+
asm("" ::"s"(arg));
44+
45+
// The AMDGPU architecture needs to wait on pending results.
46+
gpu::memory_fence();
47+
// Get the current timestamp from the clock.
48+
uint64_t start = gpu::processor_clock();
49+
50+
// This forces the compiler to load the input argument and run the clock cycle
51+
// counter before the profiling region.
52+
asm("" ::"s"(arg), "s"(start));
53+
54+
// Run the function under test and return its value.
55+
auto result = f(arg);
56+
57+
// This inline assembly performs a no-op which forces the result to both be
58+
// used and prevents us from exiting this region before it's complete.
59+
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result) :);
60+
61+
// Obtain the current timestamp after running the calculation and force
62+
// ordering.
63+
uint64_t stop = gpu::processor_clock();
64+
asm("" ::"s"(stop));
65+
gpu::memory_fence();
66+
67+
// Return the time elapsed.
68+
return stop - start;
69+
}
70+
71+
} // namespace LIBC_NAMESPACE
72+
73+
#endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU

libc/benchmarks/gpu/timing/timing.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
#include "src/__support/macros/properties/architectures.h"
1313

1414
#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
15-
#error "amdgpu not yet supported"
15+
#include "amdgpu/timing.h"
1616
#elif defined(LIBC_TARGET_ARCH_IS_NVPTX)
1717
#include "nvptx/timing.h"
1818
#else

0 commit comments

Comments
 (0)