Skip to content

Commit d2c22aa

Browse files
committed
[OpenMP] Fix the OpenMPOpt pass incorrectly optimizing if definition was missing
Summary: This code is intended to block transformations if the call isn't present, however the way it's coded it silently lets it pass if the definition doesn't exist at all. This previously was always valid since we included the runtime as one giant blob so everything was always there, but now that we want to move towards separate ones, it's not quite correct.
1 parent 7811c20 commit d2c22aa

File tree

11 files changed

+93
-479
lines changed

11 files changed

+93
-479
lines changed

llvm/lib/Passes/PassBuilderPipelines.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -969,7 +969,7 @@ PassBuilder::buildInlinerPipeline(OptimizationLevel Level,
969969
// Try to perform OpenMP specific optimizations. This is a (quick!) no-op if
970970
// there are no OpenMP runtime calls present in the module.
971971
if (Level == OptimizationLevel::O2 || Level == OptimizationLevel::O3)
972-
MainCGPipeline.addPass(OpenMPOptCGSCCPass());
972+
MainCGPipeline.addPass(OpenMPOptCGSCCPass(Phase));
973973

974974
invokeCGSCCOptimizerLateEPCallbacks(MainCGPipeline, Level);
975975

@@ -1137,7 +1137,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
11371137

11381138
// Try to perform OpenMP specific optimizations on the module. This is a
11391139
// (quick!) no-op if there are no OpenMP runtime calls present in the module.
1140-
MPM.addPass(OpenMPOptPass());
1140+
MPM.addPass(OpenMPOptPass(Phase));
11411141

11421142
if (AttributorRun & AttributorRunOption::MODULE)
11431143
MPM.addPass(AttributorPass());

llvm/lib/Transforms/IPO/OpenMPOpt.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -569,7 +569,7 @@ struct OMPInformationCache : public InformationCache {
569569
for (RuntimeFunction Fn : Fns) {
570570
RuntimeFunctionInfo &RFI = RFIs[Fn];
571571

572-
if (RFI.Declaration && RFI.Declaration->isDeclaration())
572+
if (!RFI.Declaration || RFI.Declaration->isDeclaration())
573573
return false;
574574
}
575575
return true;
@@ -5792,6 +5792,7 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
57925792
CallGraphUpdater CGUpdater;
57935793

57945794
bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
5795+
LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink ||
57955796
LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink;
57965797
OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ nullptr, PostLink);
57975798

@@ -5871,6 +5872,7 @@ PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,
58715872
CGUpdater.initialize(CG, C, AM, UR);
58725873

58735874
bool PostLink = LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink ||
5875+
LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink ||
58745876
LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink;
58755877
SetVector<Function *> Functions(SCC.begin(), SCC.end());
58765878
OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,

llvm/test/Transforms/OpenMP/spmdization.ll

Lines changed: 45 additions & 432 deletions
Large diffs are not rendered by default.

offload/DeviceRTL/include/DeviceTypes.h

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,15 @@
1212
#ifndef OMPTARGET_TYPES_H
1313
#define OMPTARGET_TYPES_H
1414

15+
#include <gpuintrin.h>
1516
#include <stddef.h>
1617
#include <stdint.h>
1718

19+
template <typename T> using Private = __gpu_private T;
20+
template <typename T> using Constant = __gpu_constant T;
21+
template <typename T> using Local = __gpu_local T;
22+
template <typename T> using Global = __gpu_local T;
23+
1824
enum omp_proc_bind_t {
1925
omp_proc_bind_false = 0,
2026
omp_proc_bind_true = 1,
@@ -155,19 +161,6 @@ typedef enum omp_allocator_handle_t {
155161
#define __PRAGMA(STR) _Pragma(#STR)
156162
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
157163

158-
#define SHARED(NAME) \
159-
[[clang::address_space(3)]] NAME [[clang::loader_uninitialized]];
160-
161-
// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
162-
// now that's not the case.
163-
#define THREAD_LOCAL(NAME) \
164-
[[clang::address_space(5)]] NAME [[clang::loader_uninitialized]]
165-
166-
// TODO: clang should use address space 4 for omp_const_mem_alloc, maybe it
167-
// does?
168-
#define CONSTANT(NAME) \
169-
[[clang::address_space(4)]] NAME [[clang::loader_uninitialized]]
170-
171164
///}
172165

173166
#endif

offload/DeviceRTL/include/State.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ struct TeamStateTy {
8686
ParallelRegionFnTy ParallelRegionFnVar;
8787
};
8888

89-
extern TeamStateTy [[clang::address_space(3)]] TeamState;
89+
extern Local<TeamStateTy> TeamState;
9090

9191
struct ThreadStateTy {
9292

@@ -112,7 +112,7 @@ struct ThreadStateTy {
112112
}
113113
};
114114

115-
extern ThreadStateTy **[[clang::address_space(3)]] ThreadStates;
115+
extern Local<ThreadStateTy **> ThreadStates;
116116

117117
/// Initialize the state machinery. Must be called by all threads.
118118
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,

offload/DeviceRTL/src/Configuration.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ using namespace ompx;
2828
// This variable should be visible to the plugin so we override the default
2929
// hidden visibility.
3030
[[gnu::used, gnu::retain, gnu::weak,
31-
gnu::visibility("protected")]] DeviceEnvironmentTy
32-
CONSTANT(__omp_rtl_device_environment);
31+
gnu::visibility(
32+
"protected")]] Constant<DeviceEnvironmentTy> __omp_rtl_device_environment;
3333

3434
uint32_t config::getAssumeTeamsOversubscription() {
3535
return __omp_rtl_assume_teams_oversubscription;

offload/DeviceRTL/src/Mapping.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -308,7 +308,7 @@ uint32_t mapping::getNumberOfProcessorElements() {
308308

309309
// TODO: This is a workaround for initialization coming from kernels outside of
310310
// the TU. We will need to solve this more correctly in the future.
311-
[[gnu::weak]] int SHARED(IsSPMDMode);
311+
[[gnu::weak, clang::loader_uninitialized]] Local<int> IsSPMDMode;
312312

313313
void mapping::init(bool IsSPMD) {
314314
if (mapping::isInitialThreadInLevel0(IsSPMD))

offload/DeviceRTL/src/Reduction.cpp

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -71,16 +71,16 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
7171
if (NumThreads == 1)
7272
return 1;
7373

74-
//
75-
// This reduce function handles reduction within a team. It handles
76-
// parallel regions in both L1 and L2 parallelism levels. It also
77-
// supports Generic, SPMD, and NoOMP modes.
78-
//
79-
// 1. Reduce within a warp.
80-
// 2. Warp master copies value to warp 0 via shared memory.
81-
// 3. Warp 0 reduces to a single value.
82-
// 4. The reduced value is available in the thread that returns 1.
83-
//
74+
//
75+
// This reduce function handles reduction within a team. It handles
76+
// parallel regions in both L1 and L2 parallelism levels. It also
77+
// supports Generic, SPMD, and NoOMP modes.
78+
//
79+
// 1. Reduce within a warp.
80+
// 2. Warp master copies value to warp 0 via shared memory.
81+
// 3. Warp 0 reduces to a single value.
82+
// 4. The reduced value is available in the thread that returns 1.
83+
//
8484

8585
#if __has_builtin(__nvvm_reflect)
8686
if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
@@ -196,8 +196,8 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
196196
uint32_t NumThreads = omp_get_num_threads();
197197
uint32_t TeamId = omp_get_team_num();
198198
uint32_t NumTeams = omp_get_num_teams();
199-
static unsigned SHARED(Bound);
200-
static unsigned SHARED(ChunkTeamCount);
199+
[[clang::loader_uninitialized]] static Local<unsigned> Bound;
200+
[[clang::loader_uninitialized]] static Local<unsigned> ChunkTeamCount;
201201

202202
// Block progress for teams greater than the current upper
203203
// limit. We always only allow a number of teams less or equal

offload/DeviceRTL/src/State.cpp

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,17 @@ using namespace ompx;
2828
///{
2929

3030
/// External symbol to access dynamic shared memory.
31-
[[gnu::aligned(allocator::ALIGNMENT)]] extern unsigned char
32-
[[clang::address_space(3)]] DynamicSharedBuffer[];
31+
[[gnu::aligned(
32+
allocator::ALIGNMENT)]] extern Local<unsigned char> DynamicSharedBuffer[];
3333

3434
/// The kernel environment passed to the init method by the compiler.
35-
static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
35+
[[clang::loader_uninitialized]] static Local<KernelEnvironmentTy *>
36+
KernelEnvironmentPtr;
3637

3738
/// The kernel launch environment passed as argument to the kernel by the
3839
/// runtime.
39-
static KernelLaunchEnvironmentTy *SHARED(KernelLaunchEnvironmentPtr);
40+
[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *>
41+
KernelLaunchEnvironmentPtr;
4042

4143
///}
4244

@@ -108,7 +110,8 @@ static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
108110
"Shared scratchpad of this size not supported yet.");
109111

110112
/// The allocation of a single shared memory scratchpad.
111-
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);
113+
[[clang::loader_uninitialized]] static Local<SharedMemorySmartStackTy>
114+
SharedMemorySmartStack;
112115

113116
void SharedMemorySmartStackTy::init(bool IsSPMD) {
114117
Usage[mapping::getThreadIdInBlock()] = 0;
@@ -220,8 +223,10 @@ void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
220223
ASSERT(HasThreadState == Other.HasThreadState, nullptr);
221224
}
222225

223-
state::TeamStateTy SHARED(ompx::state::TeamState);
224-
state::ThreadStateTy **SHARED(ompx::state::ThreadStates);
226+
[[clang::loader_uninitialized]] Local<state::TeamStateTy>
227+
ompx::state::TeamState;
228+
[[clang::loader_uninitialized]] Local<state::ThreadStateTy **>
229+
ompx::state::ThreadStates;
225230

226231
namespace {
227232

@@ -449,10 +454,10 @@ void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
449454
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
450455
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;
451456

452-
[[clang::loader_uninitialized]] static void *[[clang::address_space(
453-
3)]] SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
454-
[[clang::loader_uninitialized]] static void **[[clang::address_space(
455-
3)]] SharedMemVariableSharingSpacePtr;
457+
[[clang::loader_uninitialized]] static Local<void *>
458+
SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
459+
[[clang::loader_uninitialized]] static Local<void **>
460+
SharedMemVariableSharingSpacePtr;
456461

457462
void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
458463
if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {

offload/DeviceRTL/src/Synchronization.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
6969
}
7070
}
7171

72-
uint32_t SHARED(namedBarrierTracker);
72+
[[clang::loader_uninitialized]] Local<uint32_t> namedBarrierTracker;
7373

7474
void namedBarrierInit() {
7575
// Don't have global ctors, and shared memory is not zero init

0 commit comments

Comments
 (0)