From a05af192052de8503fb4945bfb853b3f2c14e4c9 Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Mon, 7 Apr 2025 13:58:25 -0500 Subject: [PATCH 01/29] Codegen for Reduction over private variables with reduction clause --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 147 +++++++++++ clang/lib/CodeGen/CGOpenMPRuntime.h | 14 ++ clang/lib/CodeGen/CGStmtOpenMP.cpp | 12 +- .../OpenMP/for_private_reduction_codegen.cpp | 236 ++++++++++++++++++ 4 files changed, 406 insertions(+), 3 deletions(-) create mode 100644 clang/test/OpenMP/for_private_reduction_codegen.cpp diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 06a652c146fb9..3424227e5da79 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4899,6 +4899,150 @@ void CGOpenMPRuntime::emitSingleReductionCombiner(CodeGenFunction &CGF, } } +void CGOpenMPRuntime::emitPrivateReduction( + CodeGenFunction &CGF, SourceLocation Loc, ArrayRef Privates, + ArrayRef LHSExprs, ArrayRef RHSExprs, + ArrayRef ReductionOps) { + + if (LHSExprs.empty() || Privates.empty() || ReductionOps.empty()) + return; + + if (LHSExprs.size() != Privates.size() || + LHSExprs.size() != ReductionOps.size()) + return; + + QualType PrivateType = Privates[0]->getType(); + llvm::Type *LLVMType = CGF.ConvertTypeForMem(PrivateType); + + BinaryOperatorKind MainBO = BO_Comma; + if (const auto *BinOp = dyn_cast(ReductionOps[0])) { + if (const auto *RHSExpr = BinOp->getRHS()) { + if (const auto *BORHS = + dyn_cast(RHSExpr->IgnoreParenImpCasts())) { + MainBO = BORHS->getOpcode(); + } + } + } + + llvm::Constant *InitVal = llvm::Constant::getNullValue(LLVMType); + const Expr *Private = Privates[0]; + + if (const auto *DRE = dyn_cast(Private)) { + if (const auto *VD = dyn_cast(DRE->getDecl())) { + if (const Expr *Init = VD->getInit()) { + if (Init->isConstantInitializer(CGF.getContext(), false)) { + Expr::EvalResult Result; + if (Init->EvaluateAsRValue(Result, CGF.getContext())) { + APValue &InitValue = Result.Val; + if (InitValue.isInt()) { + InitVal = llvm::ConstantInt::get(LLVMType, InitValue.getInt()); + } + } + } + } + } + } + + // Create an internal shared variable + std::string SharedName = getName({"internal_private_var"}); + llvm::GlobalVariable *SharedVar = new llvm::GlobalVariable( + CGM.getModule(), LLVMType, false, llvm::GlobalValue::CommonLinkage, + InitVal, ".omp.reduction." + SharedName, nullptr, + llvm::GlobalVariable::NotThreadLocal); + + SharedVar->setAlignment( + llvm::MaybeAlign(CGF.getContext().getTypeAlign(PrivateType) / 8)); + + Address SharedResult(SharedVar, SharedVar->getValueType(), + CGF.getContext().getTypeAlignInChars(PrivateType)); + + llvm::Value *ThreadId = getThreadID(CGF, Loc); + llvm::Value *BarrierLoc = emitUpdateLocation(CGF, Loc, OMP_ATOMIC_REDUCE); + llvm::Value *BarrierArgs[] = {BarrierLoc, ThreadId}; + + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_barrier), + BarrierArgs); + + llvm::BasicBlock *InitBB = CGF.createBasicBlock("init"); + llvm::BasicBlock *InitEndBB = CGF.createBasicBlock("init.end"); + + llvm::Value *IsWorker = CGF.Builder.CreateICmpEQ( + ThreadId, llvm::ConstantInt::get(ThreadId->getType(), 0)); + CGF.Builder.CreateCondBr(IsWorker, InitBB, InitEndBB); + + CGF.EmitBlock(InitBB); + CGF.Builder.CreateStore(InitVal, SharedResult); + CGF.Builder.CreateBr(InitEndBB); + + CGF.EmitBlock(InitEndBB); + + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_barrier), + BarrierArgs); + + for (unsigned I = 0; I < ReductionOps.size(); ++I) { + if (I >= LHSExprs.size()) { + break; + } + + const auto *BinOp = dyn_cast(ReductionOps[I]); + if (!BinOp || BinOp->getOpcode() != BO_Assign) + continue; + + const Expr *RHSExpr = BinOp->getRHS(); + if (!RHSExpr) + continue; + + BinaryOperatorKind BO = BO_Comma; + if (const auto *BORHS = + dyn_cast(RHSExpr->IgnoreParenImpCasts())) { + BO = BORHS->getOpcode(); + } + + LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType); + LValue LHSLV = CGF.EmitLValue(LHSExprs[I]); + RValue PrivateRV = CGF.EmitLoadOfLValue(LHSLV, Loc); + auto &&UpdateOp = [&CGF, PrivateRV, BinOp, BO](RValue OldVal) { + if (BO == BO_Mul) { + llvm::Value *OldScalar = OldVal.getScalarVal(); + llvm::Value *PrivateScalar = PrivateRV.getScalarVal(); + llvm::Value *Result = CGF.Builder.CreateMul(OldScalar, PrivateScalar); + return RValue::get(Result); + } else { + OpaqueValueExpr OVE(BinOp->getLHS()->getExprLoc(), + BinOp->getLHS()->getType(), + ExprValueKind::VK_PRValue); + CodeGenFunction::OpaqueValueMapping OldValMapping(CGF, &OVE, OldVal); + return CGF.EmitAnyExpr(BinOp->getRHS()); + } + }; + + (void)CGF.EmitOMPAtomicSimpleUpdateExpr( + SharedLV, PrivateRV, BO, true, + llvm::AtomicOrdering::SequentiallyConsistent, Loc, UpdateOp); + } + + // Final barrier + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_barrier), + BarrierArgs); + + // Broadcast final result + llvm::Value *FinalResult = CGF.Builder.CreateLoad(SharedResult); + + // Update private variables with final result + for (unsigned I = 0; I < Privates.size(); ++I) { + LValue LHSLV = CGF.EmitLValue(LHSExprs[I]); + CGF.Builder.CreateStore(FinalResult, LHSLV.getAddress()); + } + + // Final synchronization + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_barrier), + BarrierArgs); +} + void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef Privates, ArrayRef LHSExprs, @@ -5201,6 +5345,9 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, CGF.EmitBranch(DefaultBB); CGF.EmitBlock(DefaultBB, /*IsFinished=*/true); + if (Options.IsPrivateVarReduction) { + emitPrivateReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, ReductionOps); + } } /// Generates unique name for artificial threadprivate variables. diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index 4321712e1521d..50ba28b565b6d 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -1201,8 +1201,22 @@ class CGOpenMPRuntime { struct ReductionOptionsTy { bool WithNowait; bool SimpleReduction; + bool IsPrivateVarReduction; OpenMPDirectiveKind ReductionKind; }; + + /// Emits code for private variable reduction + /// \param Privates List of private copies for original reduction arguments. + /// \param LHSExprs List of LHS in \a ReductionOps reduction operations. + /// \param RHSExprs List of RHS in \a ReductionOps reduction operations. + /// \param ReductionOps List of reduction operations in form 'LHS binop RHS' + /// or 'operator binop(LHS, RHS)'. + void emitPrivateReduction(CodeGenFunction &CGF, SourceLocation Loc, + ArrayRef Privates, + ArrayRef LHSExprs, + ArrayRef RHSExprs, + ArrayRef ReductionOps); + /// Emit a code for reduction clause. Next code should be emitted for /// reduction: /// \code diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index e4d1db264aac9..720a88e075ddd 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -1470,6 +1470,7 @@ void CodeGenFunction::EmitOMPReductionClauseFinal( llvm::SmallVector LHSExprs; llvm::SmallVector RHSExprs; llvm::SmallVector ReductionOps; + llvm::SmallVector IsPrivate; bool HasAtLeastOneReduction = false; bool IsReductionWithTaskMod = false; for (const auto *C : D.getClausesOfKind()) { @@ -1480,6 +1481,8 @@ void CodeGenFunction::EmitOMPReductionClauseFinal( Privates.append(C->privates().begin(), C->privates().end()); LHSExprs.append(C->lhs_exprs().begin(), C->lhs_exprs().end()); RHSExprs.append(C->rhs_exprs().begin(), C->rhs_exprs().end()); + IsPrivate.append(C->private_var_reduction_flags().begin(), + C->private_var_reduction_flags().end()); ReductionOps.append(C->reduction_ops().begin(), C->reduction_ops().end()); IsReductionWithTaskMod = IsReductionWithTaskMod || C->getModifier() == OMPC_REDUCTION_task; @@ -1499,9 +1502,11 @@ void CodeGenFunction::EmitOMPReductionClauseFinal( bool SimpleReduction = ReductionKind == OMPD_simd; // Emit nowait reduction if nowait clause is present or directive is a // parallel directive (it always has implicit barrier). + bool IsPrivateVarReduction = + llvm::any_of(IsPrivate, [](bool IsPriv) { return IsPriv; }); CGM.getOpenMPRuntime().emitReduction( *this, D.getEndLoc(), Privates, LHSExprs, RHSExprs, ReductionOps, - {WithNowait, SimpleReduction, ReductionKind}); + {WithNowait, SimpleReduction, IsPrivateVarReduction, ReductionKind}); } } @@ -3943,7 +3948,8 @@ static void emitScanBasedDirective( PrivScope.Privatize(); CGF.CGM.getOpenMPRuntime().emitReduction( CGF, S.getEndLoc(), Privates, LHSs, RHSs, ReductionOps, - {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_unknown}); + {/*WithNowait=*/true, /*SimpleReduction=*/true, + /*IsPrivateVarReduction */ false, OMPD_unknown}); } llvm::Value *NextIVal = CGF.Builder.CreateNUWSub(IVal, llvm::ConstantInt::get(CGF.SizeTy, 1)); @@ -5747,7 +5753,7 @@ void CodeGenFunction::EmitOMPScanDirective(const OMPScanDirective &S) { } CGM.getOpenMPRuntime().emitReduction( *this, ParentDir.getEndLoc(), Privates, LHSs, RHSs, ReductionOps, - {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_simd}); + {/*WithNowait=*/true, /*SimpleReduction=*/true, false, OMPD_simd}); for (unsigned I = 0, E = CopyArrayElems.size(); I < E; ++I) { const Expr *PrivateExpr = Privates[I]; LValue DestLVal; diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp new file mode 100644 index 0000000000000..be50991ca193e --- /dev/null +++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp @@ -0,0 +1,236 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --prefix-filecheck-ir-name _ --version 5 +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -fopenmp-version=60 -x c++ -std=c++17 -emit-llvm %s -o - | FileCheck %s +// expected-no-diagnostics +#define N 10 +void do_red(int n, int *v, int &sum_v) + { + sum_v = 0; + #pragma omp for reduction(original(private),+: sum_v) + for (int i = 0; i < n; i++) + { + sum_v += v[i]; + } + } + int main(void) + { + int v[N]; + for (int i = 0; i < N; i++) + v[i] = i; + #pragma omp parallel num_threads(4) + { + int s_v; + do_red(N, v, s_v); + } + return 0; + } +//. +// CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" +// CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 +// CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 +// CHECK: @.gomp_critical_user_.reduction.var = common global [8 x i32] zeroinitializer, align 8 +// CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 18, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 +// CHECK: @.omp.reduction..internal_private_var = common global i32 0, align 4 +// CHECK: @[[GLOB4:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 +//. +// CHECK-LABEL: define dso_local void @_Z6do_rediPiRi( +// CHECK-SAME: i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[SUM_V_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[SUM_V4:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP5:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) +// CHECK-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8 +// CHECK-NEXT: store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK-NEXT: store i32 [[SUB3]], ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK-NEXT: br i1 [[CMP]], label %[[OMP_PRECOND_THEN:.*]], label %[[OMP_PRECOND_END:.*]] +// CHECK: [[OMP_PRECOND_THEN]]: +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK-NEXT: store i32 0, ptr [[SUM_V4]], align 4 +// CHECK-NEXT: store ptr [[SUM_V4]], ptr [[_TMP5]], align 8 +// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] +// CHECK-NEXT: br i1 [[CMP7]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CHECK: [[COND_TRUE]]: +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: br label %[[COND_END:.*]] +// CHECK: [[COND_FALSE]]: +// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: br label %[[COND_END]] +// CHECK: [[COND_END]]: +// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], %[[COND_TRUE]] ], [ [[TMP11]], %[[COND_FALSE]] ] +// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] +// CHECK: [[OMP_INNER_FOR_COND]]: +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[CMP8:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] +// CHECK-NEXT: br i1 [[CMP8]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]] +// CHECK: [[OMP_INNER_FOR_BODY]]: +// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I6]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[V_ADDR]], align 8 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[I6]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP5]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 +// CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP20]], [[TMP18]] +// CHECK-NEXT: store i32 [[ADD9]], ptr [[TMP19]], align 4 +// CHECK-NEXT: br label %[[OMP_BODY_CONTINUE:.*]] +// CHECK: [[OMP_BODY_CONTINUE]]: +// CHECK-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] +// CHECK: [[OMP_INNER_FOR_INC]]: +// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK-NEXT: store i32 [[ADD10]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND]] +// CHECK: [[OMP_INNER_FOR_END]]: +// CHECK-NEXT: br label %[[OMP_LOOP_EXIT:.*]] +// CHECK: [[OMP_LOOP_EXIT]]: +// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK-NEXT: store ptr [[SUM_V4]], ptr [[TMP22]], align 8 +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3]], i32 [[TMP0]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z6do_rediPiRi.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: switch i32 [[TMP23]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [ +// CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]] +// CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]] +// CHECK-NEXT: ] +// CHECK: [[_OMP_REDUCTION_CASE1:.*:]] +// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP7]], align 4 +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SUM_V4]], align 4 +// CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK-NEXT: store i32 [[ADD11]], ptr [[TMP7]], align 4 +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: [[_OMP_REDUCTION_CASE2:.*:]] +// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[SUM_V4]], align 4 +// CHECK-NEXT: [[TMP27:%.*]] = atomicrmw add ptr [[TMP7]], i32 [[TMP26]] monotonic, align 4 +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]] +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[TMP28]], label %[[INIT:.*]], label %[[INIT_END:.*]] +// CHECK: [[INIT]]: +// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_private_var, align 4 +// CHECK-NEXT: br label %[[INIT_END]] +// CHECK: [[INIT_END]]: +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP7]], align 4 +// CHECK-NEXT: [[TMP30:%.*]] = atomicrmw add ptr @.omp.reduction..internal_private_var, i32 [[TMP29]] seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr @.omp.reduction..internal_private_var, align 4 +// CHECK-NEXT: store i32 [[TMP31]], ptr [[TMP7]], align 4 +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK-NEXT: br label %[[OMP_PRECOND_END]] +// CHECK: [[OMP_PRECOND_END]]: +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define internal void @_Z6do_rediPiRi.omp.reduction.reduction_func( +// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define dso_local noundef i32 @main( +// CHECK-SAME: ) #[[ATTR4:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[V:%.*]] = alloca [10 x i32], align 16 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) +// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK-NEXT: br label %[[FOR_COND:.*]] +// CHECK: [[FOR_COND]]: +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10 +// CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +// CHECK: [[FOR_BODY]]: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[V]], i64 0, i64 [[IDXPROM]] +// CHECK-NEXT: store i32 [[TMP2]], ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: br label %[[FOR_INC:.*]] +// CHECK: [[FOR_INC]]: +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4 +// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] +// CHECK: [[FOR_END]]: +// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB2]], i32 [[TMP0]], i32 4) +// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @main.omp_outlined, ptr [[V]]) +// CHECK-NEXT: ret i32 0 +// +// +// CHECK-LABEL: define internal void @main.omp_outlined( +// CHECK-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[V:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[S_V:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[V_ADDR]], align 8 +// CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 0 +// CHECK-NEXT: call void @_Z6do_rediPiRi(i32 noundef 10, ptr noundef [[ARRAYDECAY]], ptr noundef nonnull align 4 dereferenceable(4) [[S_V]]) +// CHECK-NEXT: ret void From 4e6eea6ff066320a584d606ab258c375b1d887be Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Tue, 8 Apr 2025 11:26:53 -0500 Subject: [PATCH 02/29] review comment changes incorporated --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 3424227e5da79..13b070f898a1c 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4981,7 +4981,8 @@ void CGOpenMPRuntime::emitPrivateReduction( CGM.getModule(), OMPRTL___kmpc_barrier), BarrierArgs); - for (unsigned I = 0; I < ReductionOps.size(); ++I) { + for (unsigned I : + llvm::seq(std::min(ReductionOps.size(), LHSExprs.size()))) { if (I >= LHSExprs.size()) { break; } @@ -5003,7 +5004,7 @@ void CGOpenMPRuntime::emitPrivateReduction( LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType); LValue LHSLV = CGF.EmitLValue(LHSExprs[I]); RValue PrivateRV = CGF.EmitLoadOfLValue(LHSLV, Loc); - auto &&UpdateOp = [&CGF, PrivateRV, BinOp, BO](RValue OldVal) { + auto UpdateOp = [&](RValue OldVal) { if (BO == BO_Mul) { llvm::Value *OldScalar = OldVal.getScalarVal(); llvm::Value *PrivateScalar = PrivateRV.getScalarVal(); @@ -5032,7 +5033,7 @@ void CGOpenMPRuntime::emitPrivateReduction( llvm::Value *FinalResult = CGF.Builder.CreateLoad(SharedResult); // Update private variables with final result - for (unsigned I = 0; I < Privates.size(); ++I) { + for (unsigned I : llvm::seq(Privates.size())) { LValue LHSLV = CGF.EmitLValue(LHSExprs[I]); CGF.Builder.CreateStore(FinalResult, LHSLV.getAddress()); } @@ -5345,9 +5346,8 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, CGF.EmitBranch(DefaultBB); CGF.EmitBlock(DefaultBB, /*IsFinished=*/true); - if (Options.IsPrivateVarReduction) { + if (Options.IsPrivateVarReduction) emitPrivateReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, ReductionOps); - } } /// Generates unique name for artificial threadprivate variables. From 18e1708275ffbeec5a68b264f8c584b3c9a72704 Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Wed, 9 Apr 2025 11:28:59 -0500 Subject: [PATCH 03/29] review comment , removing redundant code --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 13b070f898a1c..93d7280408002 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4983,9 +4983,6 @@ void CGOpenMPRuntime::emitPrivateReduction( for (unsigned I : llvm::seq(std::min(ReductionOps.size(), LHSExprs.size()))) { - if (I >= LHSExprs.size()) { - break; - } const auto *BinOp = dyn_cast(ReductionOps[I]); if (!BinOp || BinOp->getOpcode() != BO_Assign) From 59ab4be637f31f04e684fe759d287841c9a11746 Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Thu, 10 Apr 2025 11:44:44 -0500 Subject: [PATCH 04/29] fix for user-defined reduction op --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 93d7280408002..3fd0a0489f38e 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4993,9 +4993,11 @@ void CGOpenMPRuntime::emitPrivateReduction( continue; BinaryOperatorKind BO = BO_Comma; - if (const auto *BORHS = - dyn_cast(RHSExpr->IgnoreParenImpCasts())) { + const Expr *StripRHS = RHSExpr->IgnoreParenImpCasts(); + if (const auto *BORHS = dyn_cast(StripRHS)) { BO = BORHS->getOpcode(); + } else if (const auto *OpCall = dyn_cast(StripRHS)) { + BO = BinaryOperator::getOverloadedOpcode(OpCall->getOperator()); } LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType); From e45c30a3e6489d2c40f4267c4444d557683be6a7 Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Thu, 1 May 2025 06:10:20 -0500 Subject: [PATCH 05/29] Handle user-defined reduction and updated lit test --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 134 ++++--- .../OpenMP/for_private_reduction_codegen.cpp | 344 ++++++++++++++++-- 2 files changed, 393 insertions(+), 85 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 3fd0a0489f38e..bea9f6af080dd 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4914,30 +4914,22 @@ void CGOpenMPRuntime::emitPrivateReduction( QualType PrivateType = Privates[0]->getType(); llvm::Type *LLVMType = CGF.ConvertTypeForMem(PrivateType); - BinaryOperatorKind MainBO = BO_Comma; - if (const auto *BinOp = dyn_cast(ReductionOps[0])) { - if (const auto *RHSExpr = BinOp->getRHS()) { - if (const auto *BORHS = - dyn_cast(RHSExpr->IgnoreParenImpCasts())) { - MainBO = BORHS->getOpcode(); - } - } - } - llvm::Constant *InitVal = llvm::Constant::getNullValue(LLVMType); - const Expr *Private = Privates[0]; - - if (const auto *DRE = dyn_cast(Private)) { + const Expr *InitExpr = nullptr; + if (const auto *DRE = dyn_cast(Privates[0])) { if (const auto *VD = dyn_cast(DRE->getDecl())) { - if (const Expr *Init = VD->getInit()) { - if (Init->isConstantInitializer(CGF.getContext(), false)) { + InitExpr = VD->getInit(); + if (InitExpr && !PrivateType->isAggregateType()) { + if (InitExpr->isConstantInitializer(CGF.getContext(), false)) { Expr::EvalResult Result; - if (Init->EvaluateAsRValue(Result, CGF.getContext())) { + if (InitExpr->EvaluateAsRValue(Result, CGF.getContext())) { APValue &InitValue = Result.Val; if (InitValue.isInt()) { InitVal = llvm::ConstantInt::get(LLVMType, InitValue.getInt()); } } + } else { + InitVal = llvm::Constant::getNullValue(LLVMType); } } } @@ -4972,7 +4964,25 @@ void CGOpenMPRuntime::emitPrivateReduction( CGF.Builder.CreateCondBr(IsWorker, InitBB, InitEndBB); CGF.EmitBlock(InitBB); - CGF.Builder.CreateStore(InitVal, SharedResult); + if (InitExpr) { + RValue RV = CGF.EmitAnyExpr(InitExpr); + if (RV.isAggregate()) { + CGF.Builder.CreateMemCpy(SharedResult, RV.getAggregateAddress(), + llvm::ConstantInt::get(CGF.IntPtrTy, 4), + /*IsVolatile=*/false); + } else { + CGF.Builder.CreateStore(RV.getScalarVal(), SharedResult); + } + } else { + if (PrivateType->isAggregateType()) { + CGF.Builder.CreateMemSet(SharedResult, + llvm::ConstantInt::get(CGM.Int8Ty, 0), + llvm::ConstantInt::get(CGF.IntPtrTy, 4), + /*IsVolatile=*/false); + } else { + CGF.Builder.CreateStore(InitVal, SharedResult); + } + } CGF.Builder.CreateBr(InitEndBB); CGF.EmitBlock(InitEndBB); @@ -4983,46 +4993,75 @@ void CGOpenMPRuntime::emitPrivateReduction( for (unsigned I : llvm::seq(std::min(ReductionOps.size(), LHSExprs.size()))) { + const Expr *ReductionClauseExpr = ReductionOps[I]->IgnoreParenCasts(); + if (const auto *Cleanup = dyn_cast(ReductionClauseExpr)) + ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts(); + const Expr *AssignRHS = nullptr; + const Expr *AssignLHS = nullptr; + + if (const auto *BinOp = dyn_cast(ReductionClauseExpr)) { + if (BinOp->getOpcode() == BO_Assign) { + AssignLHS = BinOp->getLHS(); + AssignRHS = BinOp->getRHS(); + } + } else if (const auto *OpCall = + dyn_cast(ReductionClauseExpr)) { + if (OpCall->getOperator() == OO_Equal) { + AssignLHS = OpCall->getArg(0); + AssignRHS = OpCall->getArg(1); + } + } - const auto *BinOp = dyn_cast(ReductionOps[I]); - if (!BinOp || BinOp->getOpcode() != BO_Assign) - continue; - - const Expr *RHSExpr = BinOp->getRHS(); - if (!RHSExpr) + if (!AssignRHS || !AssignLHS) { continue; + } - BinaryOperatorKind BO = BO_Comma; - const Expr *StripRHS = RHSExpr->IgnoreParenImpCasts(); - if (const auto *BORHS = dyn_cast(StripRHS)) { - BO = BORHS->getOpcode(); - } else if (const auto *OpCall = dyn_cast(StripRHS)) { - BO = BinaryOperator::getOverloadedOpcode(OpCall->getOperator()); + const Expr *ReductionCombinerExpr = AssignRHS->IgnoreParenImpCasts(); + if (const auto *MTE = + dyn_cast(ReductionCombinerExpr)) { + ReductionCombinerExpr = MTE->getSubExpr()->IgnoreParenImpCasts(); } + BinaryOperatorKind BO = BO_Assign; LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType); LValue LHSLV = CGF.EmitLValue(LHSExprs[I]); RValue PrivateRV = CGF.EmitLoadOfLValue(LHSLV, Loc); - auto UpdateOp = [&](RValue OldVal) { - if (BO == BO_Mul) { - llvm::Value *OldScalar = OldVal.getScalarVal(); - llvm::Value *PrivateScalar = PrivateRV.getScalarVal(); - llvm::Value *Result = CGF.Builder.CreateMul(OldScalar, PrivateScalar); - return RValue::get(Result); - } else { - OpaqueValueExpr OVE(BinOp->getLHS()->getExprLoc(), - BinOp->getLHS()->getType(), - ExprValueKind::VK_PRValue); - CodeGenFunction::OpaqueValueMapping OldValMapping(CGF, &OVE, OldVal); - return CGF.EmitAnyExpr(BinOp->getRHS()); - } - }; + if (const auto *BinOp = dyn_cast(ReductionCombinerExpr)) { + BO = BinOp->getOpcode(); + auto UpdateOp = [&](RValue OldVal) { + if (BO == BO_Mul) { + llvm::Value *OldScalar = OldVal.getScalarVal(); + llvm::Value *PrivateScalar = PrivateRV.getScalarVal(); + llvm::Value *Result = CGF.Builder.CreateMul(OldScalar, PrivateScalar); + return RValue::get(Result); + } else { + OpaqueValueExpr OVE(BinOp->getLHS()->getExprLoc(), + BinOp->getLHS()->getType(), + ExprValueKind::VK_PRValue); + CodeGenFunction::OpaqueValueMapping OldValMapping(CGF, &OVE, OldVal); + return CGF.EmitAnyExpr(BinOp->getRHS()); + } + }; - (void)CGF.EmitOMPAtomicSimpleUpdateExpr( - SharedLV, PrivateRV, BO, true, - llvm::AtomicOrdering::SequentiallyConsistent, Loc, UpdateOp); + (void)CGF.EmitOMPAtomicSimpleUpdateExpr( + SharedLV, PrivateRV, BO, true, + llvm::AtomicOrdering::SequentiallyConsistent, Loc, UpdateOp); + } else if (const auto *OpCall = dyn_cast(ReductionClauseExpr)) { + auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) { + Action.Enter(CGF); + CharUnits Alignment = CGF.getContext().getTypeAlignInChars(PrivateType); + Address TempResult = + CGF.CreateMemTemp(PrivateType, "reduction.temp.result"); + ReturnValueSlot RVS(TempResult, /*IsVolatile=*/false); + RValue ResultRV = CGF.EmitCallExpr(OpCall, RVS, nullptr); + CGF.Builder.CreateMemCpy(SharedResult, ResultRV.getAggregateAddress(), + llvm::ConstantInt::get(CGF.IntPtrTy, 4), + Alignment.getQuantity()); + }; + std::string CriticalName = getName({"reduction_critical"}); + emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc); + } } - // Final barrier CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( CGM.getModule(), OMPRTL___kmpc_barrier), @@ -5042,7 +5081,6 @@ void CGOpenMPRuntime::emitPrivateReduction( CGM.getModule(), OMPRTL___kmpc_barrier), BarrierArgs); } - void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef Privates, ArrayRef LHSExprs, diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp index be50991ca193e..dcacc4140bbdb 100644 --- a/clang/test/OpenMP/for_private_reduction_codegen.cpp +++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp @@ -2,38 +2,307 @@ // RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -fopenmp-version=60 -x c++ -std=c++17 -emit-llvm %s -o - | FileCheck %s // expected-no-diagnostics #define N 10 +class Sum { + int val; +public: + Sum(int v = 0) : val(v) {} + Sum operator+(const Sum& rhs) const { + return Sum(val + rhs.val); + } +}; + +void func_red(){ + Sum result(0); + Sum array[N]; + + for(int i = 0; i < 10; i++) { + array[i] = Sum(i); + } + + #pragma omp parallel private(result) num_threads(4) + { + #pragma omp for reduction(+:result) + for(int i = 0; i < 10; i++) { + result = result + array[i]; + } + } +} + void do_red(int n, int *v, int &sum_v) { - sum_v = 0; - #pragma omp for reduction(original(private),+: sum_v) - for (int i = 0; i < n; i++) - { - sum_v += v[i]; - } + sum_v = 0; + #pragma omp for reduction(original(private),+: sum_v) + for (int i = 0; i < n; i++) + { + sum_v += v[i]; + } } int main(void) { - int v[N]; - for (int i = 0; i < N; i++) - v[i] = i; - #pragma omp parallel num_threads(4) - { - int s_v; - do_red(N, v, s_v); - } - return 0; + int v[N]; + for (int i = 0; i < N; i++) + v[i] = i; + #pragma omp parallel num_threads(4) + { + int s_v; + do_red(N, v, s_v); + } + return 0; } //. // CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" // CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 -// CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 // CHECK: @.gomp_critical_user_.reduction.var = common global [8 x i32] zeroinitializer, align 8 -// CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 18, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 -// CHECK: @.omp.reduction..internal_private_var = common global i32 0, align 4 +// CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 18, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 +// CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 +// CHECK: @.gomp_critical_user_.atomic_reduction.var = common global [8 x i32] zeroinitializer, align 8 +// CHECK: @.omp.reduction..internal_private_var = common global %class.Sum zeroinitializer, align 4 +// CHECK: @.gomp_critical_user_.reduction_critical.var = common global [8 x i32] zeroinitializer, align 8 // CHECK: @[[GLOB4:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 +// CHECK: @.omp.reduction..internal_private_var.1 = common global i32 0, align 4 //. +// CHECK-LABEL: define dso_local void @_Z8func_redv( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*]]: +// CHECK-NEXT: [[RESULT:%.*]] = alloca [[CLASS_SUM:%.*]], align 4 +// CHECK-NEXT: [[ARRAY:%.*]] = alloca [10 x %class.Sum], align 16 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], i32 noundef 0) +// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[ARRAY]], i32 0, i32 0 +// CHECK-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[CLASS_SUM]], ptr [[ARRAY_BEGIN]], i64 10 +// CHECK-NEXT: br label %[[ARRAYCTOR_LOOP:.*]] +// CHECK: [[ARRAYCTOR_LOOP]]: +// CHECK-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], %[[ENTRY]] ], [ [[ARRAYCTOR_NEXT:%.*]], %[[ARRAYCTOR_LOOP]] ] +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]], i32 noundef 0) +// CHECK-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[CLASS_SUM]], ptr [[ARRAYCTOR_CUR]], i64 1 +// CHECK-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]] +// CHECK-NEXT: br i1 [[ARRAYCTOR_DONE]], label %[[ARRAYCTOR_CONT:.*]], label %[[ARRAYCTOR_LOOP]] +// CHECK: [[ARRAYCTOR_CONT]]: +// CHECK-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK-NEXT: br label %[[FOR_COND:.*]] +// CHECK: [[FOR_COND]]: +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10 +// CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] +// CHECK: [[FOR_BODY]]: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[REF_TMP]], i32 noundef [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ARRAYIDX]], ptr align 4 [[REF_TMP]], i64 4, i1 false) +// CHECK-NEXT: br label %[[FOR_INC:.*]] +// CHECK: [[FOR_INC]]: +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4 +// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] +// CHECK: [[FOR_END]]: +// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4) +// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @_Z8func_redv.omp_outlined, ptr [[ARRAY]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define linkonce_odr void @_ZN3SumC1Ei( +// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store i32 [[V]], ptr [[V_ADDR]], align 4 +// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[V_ADDR]], align 4 +// CHECK-NEXT: call void @_ZN3SumC2Ei(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define internal void @_Z8func_redv.omp_outlined( +// CHECK-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[ARRAY:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[ARRAY_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[RESULT:%.*]] = alloca [[CLASS_SUM:%.*]], align 4 +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[RESULT1:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[REF_TMP4:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: [[REF_TMP7:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: [[AGG_TEMP:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: [[REDUCTION_TEMP_RESULT:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: [[REF_TMP10:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: store ptr [[ARRAY]], ptr [[ARRAY_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAY_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], i32 noundef 0) +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]], i32 noundef 0) +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP2]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CHECK: [[COND_TRUE]]: +// CHECK-NEXT: br label %[[COND_END:.*]] +// CHECK: [[COND_FALSE]]: +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: br label %[[COND_END]] +// CHECK: [[COND_END]]: +// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ] +// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] +// CHECK: [[OMP_INNER_FOR_COND]]: +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]] +// CHECK-NEXT: br i1 [[CMP2]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]] +// CHECK: [[OMP_INNER_FOR_BODY]]: +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK-NEXT: [[CALL:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]], ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYIDX]]) +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP]], i32 0, i32 0 +// CHECK-NEXT: store i32 [[CALL]], ptr [[COERCE_DIVE]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT1]], ptr align 4 [[REF_TMP]], i64 4, i1 false) +// CHECK-NEXT: br label %[[OMP_BODY_CONTINUE:.*]] +// CHECK: [[OMP_BODY_CONTINUE]]: +// CHECK-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] +// CHECK: [[OMP_INNER_FOR_INC]]: +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND]] +// CHECK: [[OMP_INNER_FOR_END]]: +// CHECK-NEXT: br label %[[OMP_LOOP_EXIT:.*]] +// CHECK: [[OMP_LOOP_EXIT]]: +// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK-NEXT: store ptr [[RESULT1]], ptr [[TMP11]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z8func_redv.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: switch i32 [[TMP12]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [ +// CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]] +// CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]] +// CHECK-NEXT: ] +// CHECK: [[_OMP_REDUCTION_CASE1:.*:]] +// CHECK-NEXT: [[CALL5:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]]) +// CHECK-NEXT: [[COERCE_DIVE6:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP4]], i32 0, i32 0 +// CHECK-NEXT: store i32 [[CALL5]], ptr [[COERCE_DIVE6]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT]], ptr align 4 [[REF_TMP4]], i64 4, i1 false) +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: [[_OMP_REDUCTION_CASE2:.*:]] +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.atomic_reduction.var) +// CHECK-NEXT: [[CALL8:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]]) +// CHECK-NEXT: [[COERCE_DIVE9:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP7]], i32 0, i32 0 +// CHECK-NEXT: store i32 [[CALL8]], ptr [[COERCE_DIVE9]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT]], ptr align 4 [[REF_TMP7]], i64 4, i1 false) +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.atomic_reduction.var) +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]] +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[TMP13]], label %[[INIT:.*]], label %[[INIT_END:.*]] +// CHECK: [[INIT]]: +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[AGG_TEMP]], i32 noundef 0) +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 @.omp.reduction..internal_private_var, ptr align 4 [[AGG_TEMP]], i64 4, i1 false) +// CHECK-NEXT: br label %[[INIT_END]] +// CHECK: [[INIT_END]]: +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP14:%.*]] = load [[CLASS_SUM]], ptr [[RESULT]], align 4 +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: [[CALL11:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]]) +// CHECK-NEXT: [[COERCE_DIVE12:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP10]], i32 0, i32 0 +// CHECK-NEXT: store i32 [[CALL11]], ptr [[COERCE_DIVE12]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT]], ptr align 4 [[REF_TMP10]], i64 4, i1 false) +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 @.omp.reduction..internal_private_var, ptr align 16 [[RESULT]], i64 4, i1 true) +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP15:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_private_var, align 4 +// CHECK-NEXT: store [[CLASS_SUM]] [[TMP15]], ptr [[RESULT]], align 4 +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP2]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define linkonce_odr i32 @_ZNK3SumplERKS_( +// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[CLASS_SUM:%.*]], align 4 +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[RHS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[RHS]], ptr [[RHS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[VAL]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[RHS_ADDR]], align 8 +// CHECK-NEXT: [[VAL2:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[TMP1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[VAL2]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP2]] +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RETVAL]], i32 noundef [[ADD]]) +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[COERCE_DIVE]], align 4 +// CHECK-NEXT: ret i32 [[TMP3]] +// +// +// CHECK-LABEL: define internal void @_Z8func_redv.omp_outlined.omp.reduction.reduction_func( +// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM:%.*]], align 4 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 +// CHECK-NEXT: [[CALL:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[TMP7]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP5]]) +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP]], i32 0, i32 0 +// CHECK-NEXT: store i32 [[CALL]], ptr [[COERCE_DIVE]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP7]], ptr align 4 [[REF_TMP]], i64 4, i1 false) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define linkonce_odr void @_ZN3SumC2Ei( +// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store i32 [[V]], ptr [[V_ADDR]], align 4 +// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[V_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[VAL]], align 4 +// CHECK-NEXT: ret void +// +// // CHECK-LABEL: define dso_local void @_Z6do_rediPiRi( -// CHECK-SAME: i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-SAME: i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8 @@ -52,7 +321,7 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: [[_TMP5:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[I6:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) // CHECK-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 // CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8 // CHECK-NEXT: store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8 @@ -130,7 +399,7 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 // CHECK-NEXT: store ptr [[SUM_V4]], ptr [[TMP22]], align 8 -// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3]], i32 [[TMP0]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z6do_rediPiRi.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z6do_rediPiRi.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) // CHECK-NEXT: switch i32 [[TMP23]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [ // CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]] // CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]] @@ -140,28 +409,28 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SUM_V4]], align 4 // CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] // CHECK-NEXT: store i32 [[ADD11]], ptr [[TMP7]], align 4 -// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) // CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] // CHECK: [[_OMP_REDUCTION_CASE2:.*:]] // CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[SUM_V4]], align 4 // CHECK-NEXT: [[TMP27:%.*]] = atomicrmw add ptr [[TMP7]], i32 [[TMP26]] monotonic, align 4 -// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) // CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] // CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]] -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP0]], 0 // CHECK-NEXT: br i1 [[TMP28]], label %[[INIT:.*]], label %[[INIT_END:.*]] // CHECK: [[INIT]]: -// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_private_var, align 4 +// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_private_var.1, align 4 // CHECK-NEXT: br label %[[INIT_END]] // CHECK: [[INIT_END]]: -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP7]], align 4 -// CHECK-NEXT: [[TMP30:%.*]] = atomicrmw add ptr @.omp.reduction..internal_private_var, i32 [[TMP29]] seq_cst, align 4 -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr @.omp.reduction..internal_private_var, align 4 +// CHECK-NEXT: [[TMP30:%.*]] = atomicrmw add ptr @.omp.reduction..internal_private_var.1, i32 [[TMP29]] seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.1, align 4 // CHECK-NEXT: store i32 [[TMP31]], ptr [[TMP7]], align 4 -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: br label %[[OMP_PRECOND_END]] // CHECK: [[OMP_PRECOND_END]]: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]]) @@ -169,7 +438,7 @@ void do_red(int n, int *v, int &sum_v) // // // CHECK-LABEL: define internal void @_Z6do_rediPiRi.omp.reduction.reduction_func( -// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -189,12 +458,12 @@ void do_red(int n, int *v, int &sum_v) // // // CHECK-LABEL: define dso_local noundef i32 @main( -// CHECK-SAME: ) #[[ATTR4:[0-9]+]] { +// CHECK-SAME: ) #[[ATTR6:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[V:%.*]] = alloca [10 x i32], align 16 // CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) // CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK-NEXT: store i32 0, ptr [[I]], align 4 // CHECK-NEXT: br label %[[FOR_COND:.*]] @@ -213,15 +482,15 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 // CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4 -// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] +// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]] // CHECK: [[FOR_END]]: -// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB2]], i32 [[TMP0]], i32 4) -// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @main.omp_outlined, ptr [[V]]) +// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4) +// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @main.omp_outlined, ptr [[V]]) // CHECK-NEXT: ret i32 0 // // // CHECK-LABEL: define internal void @main.omp_outlined( -// CHECK-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[V:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[V:%.*]]) #[[ATTR2]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 @@ -234,3 +503,4 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 0 // CHECK-NEXT: call void @_Z6do_rediPiRi(i32 noundef 10, ptr noundef [[ARRAYDECAY]], ptr noundef nonnull align 4 dereferenceable(4) [[S_V]]) // CHECK-NEXT: ret void + From 980bc06bd8bb1a733f3df4a4b33629105058fd59 Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Thu, 1 May 2025 12:54:30 -0500 Subject: [PATCH 06/29] conditional checks --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index bea9f6af080dd..3627ba30cf8d7 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -5054,9 +5054,15 @@ void CGOpenMPRuntime::emitPrivateReduction( CGF.CreateMemTemp(PrivateType, "reduction.temp.result"); ReturnValueSlot RVS(TempResult, /*IsVolatile=*/false); RValue ResultRV = CGF.EmitCallExpr(OpCall, RVS, nullptr); - CGF.Builder.CreateMemCpy(SharedResult, ResultRV.getAggregateAddress(), - llvm::ConstantInt::get(CGF.IntPtrTy, 4), - Alignment.getQuantity()); + if (ResultRV.isAggregate()) { + CGF.Builder.CreateMemCpy(SharedResult, ResultRV.getAggregateAddress(), + llvm::ConstantInt::get(CGF.IntPtrTy, 4), + Alignment.getQuantity()); + } else { + CGF.Builder.CreateStore(ResultRV.getScalarVal(), + SharedLV.getAddress(), + /*IsVolatile=*/false); + } }; std::string CriticalName = getName({"reduction_critical"}); emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc); From a103dfa2cb7f9176fea411e15801e6b45d4271ac Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Thu, 1 May 2025 13:01:00 -0500 Subject: [PATCH 07/29] lit update --- clang/test/OpenMP/for_private_reduction_codegen.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp index dcacc4140bbdb..4995ac42a2242 100644 --- a/clang/test/OpenMP/for_private_reduction_codegen.cpp +++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp @@ -49,6 +49,7 @@ void do_red(int n, int *v, int &sum_v) } return 0; } + //. // CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" // CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 @@ -235,7 +236,7 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: [[COERCE_DIVE12:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP10]], i32 0, i32 0 // CHECK-NEXT: store i32 [[CALL11]], ptr [[COERCE_DIVE12]], align 4 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT]], ptr align 4 [[REF_TMP10]], i64 4, i1 false) -// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 @.omp.reduction..internal_private_var, ptr align 16 [[RESULT]], i64 4, i1 true) +// CHECK-NEXT: store ptr [[RESULT]], ptr @.omp.reduction..internal_private_var, align 4 // CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-NEXT: [[TMP15:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_private_var, align 4 From 526314c4210b7c09ccba882850e209ed756d32b5 Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Mon, 5 May 2025 10:48:25 -0500 Subject: [PATCH 08/29] Support for UDR for private variables --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 221 ++++++++++-------- .../OpenMP/for_private_reduction_codegen.cpp | 101 +++++--- 2 files changed, 190 insertions(+), 132 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 3627ba30cf8d7..71742d24a33ba 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4903,7 +4903,6 @@ void CGOpenMPRuntime::emitPrivateReduction( CodeGenFunction &CGF, SourceLocation Loc, ArrayRef Privates, ArrayRef LHSExprs, ArrayRef RHSExprs, ArrayRef ReductionOps) { - if (LHSExprs.empty() || Privates.empty() || ReductionOps.empty()) return; @@ -4914,13 +4913,16 @@ void CGOpenMPRuntime::emitPrivateReduction( QualType PrivateType = Privates[0]->getType(); llvm::Type *LLVMType = CGF.ConvertTypeForMem(PrivateType); - llvm::Constant *InitVal = llvm::Constant::getNullValue(LLVMType); - const Expr *InitExpr = nullptr; - if (const auto *DRE = dyn_cast(Privates[0])) { - if (const auto *VD = dyn_cast(DRE->getDecl())) { - InitExpr = VD->getInit(); - if (InitExpr && !PrivateType->isAggregateType()) { - if (InitExpr->isConstantInitializer(CGF.getContext(), false)) { + llvm::Constant *InitVal = nullptr; + const OMPDeclareReductionDecl *UDR = getReductionInit(ReductionOps[0]); + + if (!UDR) { + InitVal = llvm::Constant::getNullValue(LLVMType); + if (const auto *DRE = dyn_cast(Privates[0])) { + if (const auto *VD = dyn_cast(DRE->getDecl())) { + const Expr *InitExpr = VD->getInit(); + if (InitExpr && !PrivateType->isAggregateType() && + !PrivateType->isAnyComplexType()) { Expr::EvalResult Result; if (InitExpr->EvaluateAsRValue(Result, CGF.getContext())) { APValue &InitValue = Result.Val; @@ -4928,11 +4930,11 @@ void CGOpenMPRuntime::emitPrivateReduction( InitVal = llvm::ConstantInt::get(LLVMType, InitValue.getInt()); } } - } else { - InitVal = llvm::Constant::getNullValue(LLVMType); } } } + } else { + InitVal = llvm::Constant::getNullValue(LLVMType); } // Create an internal shared variable @@ -4964,108 +4966,121 @@ void CGOpenMPRuntime::emitPrivateReduction( CGF.Builder.CreateCondBr(IsWorker, InitBB, InitEndBB); CGF.EmitBlock(InitBB); - if (InitExpr) { - RValue RV = CGF.EmitAnyExpr(InitExpr); - if (RV.isAggregate()) { - CGF.Builder.CreateMemCpy(SharedResult, RV.getAggregateAddress(), - llvm::ConstantInt::get(CGF.IntPtrTy, 4), - /*IsVolatile=*/false); - } else { - CGF.Builder.CreateStore(RV.getScalarVal(), SharedResult); - } + if (UDR) { + Address OrigAddr = Address::invalid(); + emitInitWithReductionInitializer(CGF, UDR, ReductionOps[0], SharedResult, + OrigAddr, PrivateType); + } else { - if (PrivateType->isAggregateType()) { - CGF.Builder.CreateMemSet(SharedResult, - llvm::ConstantInt::get(CGM.Int8Ty, 0), - llvm::ConstantInt::get(CGF.IntPtrTy, 4), - /*IsVolatile=*/false); + if (const auto *DRE = dyn_cast(Privates[0])) { + if (const auto *VD = dyn_cast(DRE->getDecl())) { + const Expr *InitExpr = VD->getInit(); + if (InitExpr && (PrivateType->isAggregateType() || + PrivateType->isAnyComplexType())) { + CGF.EmitAnyExprToMem(InitExpr, SharedResult, + PrivateType.getQualifiers(), + /*IsInitializer=*/true); + } else if (!InitVal->isNullValue()) { + CGF.EmitStoreOfScalar(InitVal, + CGF.MakeAddrLValue(SharedResult, PrivateType)); + } else { + CGF.EmitNullInitialization(SharedResult, PrivateType); + } + } else { + CGF.EmitNullInitialization(SharedResult, PrivateType); + } } else { - CGF.Builder.CreateStore(InitVal, SharedResult); + CGF.EmitNullInitialization(SharedResult, PrivateType); } } - CGF.Builder.CreateBr(InitEndBB); + CGF.Builder.CreateBr(InitEndBB); CGF.EmitBlock(InitEndBB); - CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( CGM.getModule(), OMPRTL___kmpc_barrier), BarrierArgs); - for (unsigned I : llvm::seq(std::min(ReductionOps.size(), LHSExprs.size()))) { - const Expr *ReductionClauseExpr = ReductionOps[I]->IgnoreParenCasts(); - if (const auto *Cleanup = dyn_cast(ReductionClauseExpr)) - ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts(); - const Expr *AssignRHS = nullptr; - const Expr *AssignLHS = nullptr; - - if (const auto *BinOp = dyn_cast(ReductionClauseExpr)) { - if (BinOp->getOpcode() == BO_Assign) { - AssignLHS = BinOp->getLHS(); - AssignRHS = BinOp->getRHS(); - } - } else if (const auto *OpCall = - dyn_cast(ReductionClauseExpr)) { - if (OpCall->getOperator() == OO_Equal) { - AssignLHS = OpCall->getArg(0); - AssignRHS = OpCall->getArg(1); - } - } - - if (!AssignRHS || !AssignLHS) { - continue; - } - const Expr *ReductionCombinerExpr = AssignRHS->IgnoreParenImpCasts(); - if (const auto *MTE = - dyn_cast(ReductionCombinerExpr)) { - ReductionCombinerExpr = MTE->getSubExpr()->IgnoreParenImpCasts(); - } - - BinaryOperatorKind BO = BO_Assign; + const Expr *ReductionOp = ReductionOps[I]; LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType); LValue LHSLV = CGF.EmitLValue(LHSExprs[I]); - RValue PrivateRV = CGF.EmitLoadOfLValue(LHSLV, Loc); - if (const auto *BinOp = dyn_cast(ReductionCombinerExpr)) { - BO = BinOp->getOpcode(); - auto UpdateOp = [&](RValue OldVal) { - if (BO == BO_Mul) { - llvm::Value *OldScalar = OldVal.getScalarVal(); - llvm::Value *PrivateScalar = PrivateRV.getScalarVal(); - llvm::Value *Result = CGF.Builder.CreateMul(OldScalar, PrivateScalar); - return RValue::get(Result); - } else { - OpaqueValueExpr OVE(BinOp->getLHS()->getExprLoc(), - BinOp->getLHS()->getType(), - ExprValueKind::VK_PRValue); - CodeGenFunction::OpaqueValueMapping OldValMapping(CGF, &OVE, OldVal); - return CGF.EmitAnyExpr(BinOp->getRHS()); - } - }; - - (void)CGF.EmitOMPAtomicSimpleUpdateExpr( - SharedLV, PrivateRV, BO, true, - llvm::AtomicOrdering::SequentiallyConsistent, Loc, UpdateOp); - } else if (const auto *OpCall = dyn_cast(ReductionClauseExpr)) { + // If UDR + const OMPDeclareReductionDecl *CurrentUDR = + getReductionInit(ReductionOps[I]); + if (CurrentUDR) { auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) { Action.Enter(CGF); - CharUnits Alignment = CGF.getContext().getTypeAlignInChars(PrivateType); - Address TempResult = - CGF.CreateMemTemp(PrivateType, "reduction.temp.result"); - ReturnValueSlot RVS(TempResult, /*IsVolatile=*/false); - RValue ResultRV = CGF.EmitCallExpr(OpCall, RVS, nullptr); - if (ResultRV.isAggregate()) { - CGF.Builder.CreateMemCpy(SharedResult, ResultRV.getAggregateAddress(), - llvm::ConstantInt::get(CGF.IntPtrTy, 4), - Alignment.getQuantity()); - } else { - CGF.Builder.CreateStore(ResultRV.getScalarVal(), - SharedLV.getAddress(), - /*IsVolatile=*/false); + std::pair ReductionFnPair = + getUserDefinedReduction(CurrentUDR); + llvm::Function *CombinerFn = ReductionFnPair.first; + if (const auto *CE = dyn_cast(ReductionOp)) { + if (CE && CombinerFn) { + const auto *CE = cast(ReductionOps[I]); + const auto *OutDRE = cast( + cast(CE->getArg(0)->IgnoreParenImpCasts()) + ->getSubExpr()); + const auto *InDRE = cast( + cast(CE->getArg(1)->IgnoreParenImpCasts()) + ->getSubExpr()); + CodeGenFunction::OMPPrivateScope LocalScope(CGF); + LocalScope.addPrivate(cast(OutDRE->getDecl()), + SharedLV.getAddress()); + LocalScope.addPrivate(cast(InDRE->getDecl()), + LHSLV.getAddress()); + (void)LocalScope.Privatize(); + emitReductionCombiner(CGF, ReductionOp); + } } }; std::string CriticalName = getName({"reduction_critical"}); emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc); + } else { + // Built-in Operator Combination + const Expr *ReductionClauseExpr = ReductionOp->IgnoreParenCasts(); + if (const auto *Cleanup = dyn_cast(ReductionClauseExpr)) + ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts(); + const Expr *AssignRHS = nullptr; + if (const auto *BinOp = dyn_cast(ReductionClauseExpr)) { + if (BinOp->getOpcode() == BO_Assign) + AssignRHS = BinOp->getRHS(); + } else if (const auto *OpCall = + dyn_cast(ReductionClauseExpr)) { + if (OpCall->getOperator() == OO_Equal) + AssignRHS = OpCall->getArg(1); + } + if (!AssignRHS) + continue; + const Expr *ReductionCombinerExpr = AssignRHS->IgnoreParenImpCasts(); + if (const auto *MTE = + dyn_cast(ReductionCombinerExpr)) + ReductionCombinerExpr = MTE->getSubExpr()->IgnoreParenImpCasts(); + + BinaryOperatorKind BO = BO_Assign; + RValue PrivateRV = CGF.EmitLoadOfLValue(LHSLV, Loc); + if (const auto *BinOp = dyn_cast(ReductionCombinerExpr)) { + BO = BinOp->getOpcode(); + auto UpdateOp = [&](RValue OldVal) { + if (BO == BO_Mul) { + llvm::Value *OldScalar = OldVal.getScalarVal(); + llvm::Value *PrivateScalar = PrivateRV.getScalarVal(); + llvm::Value *Result = + CGF.Builder.CreateMul(OldScalar, PrivateScalar); + return RValue::get(Result); + } else { + OpaqueValueExpr OVE(BinOp->getLHS()->getExprLoc(), + BinOp->getLHS()->getType(), + ExprValueKind::VK_PRValue); + CodeGenFunction::OpaqueValueMapping OldValMapping(CGF, &OVE, + OldVal); + return CGF.EmitAnyExpr(BinOp->getRHS()); + } + }; + + (void)CGF.EmitOMPAtomicSimpleUpdateExpr( + SharedLV, PrivateRV, BO, true, + llvm::AtomicOrdering::SequentiallyConsistent, Loc, UpdateOp); + } } } // Final barrier @@ -5074,12 +5089,25 @@ void CGOpenMPRuntime::emitPrivateReduction( BarrierArgs); // Broadcast final result - llvm::Value *FinalResult = CGF.Builder.CreateLoad(SharedResult); + bool IsAggregate = PrivateType->isAggregateType(); + llvm::Value *FinalResultVal = nullptr; + LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType); + Address FinalResultAddr = Address::invalid(); + if (IsAggregate) { + FinalResultAddr = SharedResult; + } else { + FinalResultVal = CGF.EmitLoadOfScalar(SharedLV, Loc); + } - // Update private variables with final result for (unsigned I : llvm::seq(Privates.size())) { - LValue LHSLV = CGF.EmitLValue(LHSExprs[I]); - CGF.Builder.CreateStore(FinalResult, LHSLV.getAddress()); + LValue TargetLHSLV = CGF.EmitLValue(LHSExprs[I]); + if (IsAggregate) { + CGF.EmitAggregateCopy(TargetLHSLV, + CGF.MakeAddrLValue(FinalResultAddr, PrivateType), + PrivateType, AggValueSlot::DoesNotOverlap, false); + } else { + CGF.EmitStoreOfScalar(FinalResultVal, TargetLHSLV); + } } // Final synchronization @@ -5087,6 +5115,7 @@ void CGOpenMPRuntime::emitPrivateReduction( CGM.getModule(), OMPRTL___kmpc_barrier), BarrierArgs); } + void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef Privates, ArrayRef LHSExprs, diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp index 4995ac42a2242..eeba6c29afb54 100644 --- a/clang/test/OpenMP/for_private_reduction_codegen.cpp +++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp @@ -9,20 +9,25 @@ class Sum { Sum operator+(const Sum& rhs) const { return Sum(val + rhs.val); } + Sum& operator+=(const Sum& rhs) { + val += rhs.val; + return *this; + } }; +#pragma omp declare reduction(sum_reduction : Sum : omp_out += omp_in) initializer(omp_priv = Sum(0)) void func_red(){ Sum result(0); Sum array[N]; - for(int i = 0; i < 10; i++) { + for (int i = 0; i < N; i++) { array[i] = Sum(i); } #pragma omp parallel private(result) num_threads(4) { - #pragma omp for reduction(+:result) - for(int i = 0; i < 10; i++) { + #pragma omp for reduction(sum_reduction:result) + for (int i = 0; i < N; i++) { result = result + array[i]; } } @@ -58,6 +63,7 @@ void do_red(int n, int *v, int &sum_v) // CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 // CHECK: @.gomp_critical_user_.atomic_reduction.var = common global [8 x i32] zeroinitializer, align 8 // CHECK: @.omp.reduction..internal_private_var = common global %class.Sum zeroinitializer, align 4 +// CHECK: @_ZZ8func_redvE6result = internal global %class.Sum zeroinitializer, align 4 // CHECK: @.gomp_critical_user_.reduction_critical.var = common global [8 x i32] zeroinitializer, align 8 // CHECK: @[[GLOB4:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 // CHECK: @.omp.reduction..internal_private_var.1 = common global i32 0, align 4 @@ -136,11 +142,6 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4 // CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8 -// CHECK-NEXT: [[REF_TMP4:%.*]] = alloca [[CLASS_SUM]], align 4 -// CHECK-NEXT: [[REF_TMP7:%.*]] = alloca [[CLASS_SUM]], align 4 -// CHECK-NEXT: [[AGG_TEMP:%.*]] = alloca [[CLASS_SUM]], align 4 -// CHECK-NEXT: [[REDUCTION_TEMP_RESULT:%.*]] = alloca [[CLASS_SUM]], align 4 -// CHECK-NEXT: [[REF_TMP10:%.*]] = alloca [[CLASS_SUM]], align 4 // CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK-NEXT: store ptr [[ARRAY]], ptr [[ARRAY_ADDR]], align 8 @@ -150,7 +151,7 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4 // CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 // CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 -// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]], i32 noundef 0) +// CHECK-NEXT: call void @.omp_initializer.(ptr noundef [[RESULT1]], ptr noundef [[RESULT]]) // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 // CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP2]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) @@ -205,18 +206,12 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]] // CHECK-NEXT: ] // CHECK: [[_OMP_REDUCTION_CASE1:.*:]] -// CHECK-NEXT: [[CALL5:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]]) -// CHECK-NEXT: [[COERCE_DIVE6:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP4]], i32 0, i32 0 -// CHECK-NEXT: store i32 [[CALL5]], ptr [[COERCE_DIVE6]], align 4 -// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT]], ptr align 4 [[REF_TMP4]], i64 4, i1 false) +// CHECK-NEXT: call void @.omp_combiner.(ptr noundef [[RESULT]], ptr noundef [[RESULT1]]) // CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var) // CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] // CHECK: [[_OMP_REDUCTION_CASE2:.*:]] // CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.atomic_reduction.var) -// CHECK-NEXT: [[CALL8:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]]) -// CHECK-NEXT: [[COERCE_DIVE9:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP7]], i32 0, i32 0 -// CHECK-NEXT: store i32 [[CALL8]], ptr [[COERCE_DIVE9]], align 4 -// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT]], ptr align 4 [[REF_TMP7]], i64 4, i1 false) +// CHECK-NEXT: call void @.omp_combiner.(ptr noundef [[RESULT]], ptr noundef [[RESULT1]]) // CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.atomic_reduction.var) // CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var) // CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] @@ -225,27 +220,65 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP2]], 0 // CHECK-NEXT: br i1 [[TMP13]], label %[[INIT:.*]], label %[[INIT_END:.*]] // CHECK: [[INIT]]: -// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[AGG_TEMP]], i32 noundef 0) -// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 @.omp.reduction..internal_private_var, ptr align 4 [[AGG_TEMP]], i64 4, i1 false) +// CHECK-NEXT: call void @.omp_initializer.(ptr noundef @.omp.reduction..internal_private_var, ptr noundef @_ZZ8func_redvE6result) // CHECK-NEXT: br label %[[INIT_END]] // CHECK: [[INIT_END]]: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) -// CHECK-NEXT: [[TMP14:%.*]] = load [[CLASS_SUM]], ptr [[RESULT]], align 4 // CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) -// CHECK-NEXT: [[CALL11:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]]) -// CHECK-NEXT: [[COERCE_DIVE12:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP10]], i32 0, i32 0 -// CHECK-NEXT: store i32 [[CALL11]], ptr [[COERCE_DIVE12]], align 4 -// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT]], ptr align 4 [[REF_TMP10]], i64 4, i1 false) -// CHECK-NEXT: store ptr [[RESULT]], ptr @.omp.reduction..internal_private_var, align 4 +// CHECK-NEXT: call void @.omp_combiner.(ptr noundef @.omp.reduction..internal_private_var, ptr noundef [[RESULT]]) // CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) -// CHECK-NEXT: [[TMP15:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_private_var, align 4 -// CHECK-NEXT: store [[CLASS_SUM]] [[TMP15]], ptr [[RESULT]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_private_var, align 4 +// CHECK-NEXT: store [[CLASS_SUM]] [[TMP14]], ptr [[RESULT]], align 4 // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP2]]) // CHECK-NEXT: ret void // // +// CHECK-LABEL: define internal void @.omp_combiner.( +// CHECK-SAME: ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK-NEXT: [[CALL:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN3SumpLERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[TMP3]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP2]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define linkonce_odr noundef nonnull align 4 dereferenceable(4) ptr @_ZN3SumpLERKS_( +// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[RHS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[RHS]], ptr [[RHS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[RHS_ADDR]], align 8 +// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[VAL]], align 4 +// CHECK-NEXT: [[VAL2:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[VAL2]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[VAL2]], align 4 +// CHECK-NEXT: ret ptr [[THIS1]] +// +// +// CHECK-LABEL: define internal void @.omp_initializer.( +// CHECK-SAME: ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[TMP3]], i32 noundef 0) +// CHECK-NEXT: ret void +// +// // CHECK-LABEL: define linkonce_odr i32 @_ZNK3SumplERKS_( // CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 { // CHECK-NEXT: [[ENTRY:.*:]] @@ -268,11 +301,10 @@ void do_red(int n, int *v, int &sum_v) // // // CHECK-LABEL: define internal void @_Z8func_redv.omp_outlined.omp.reduction.reduction_func( -// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM:%.*]], align 4 // CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 // CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8 @@ -281,10 +313,7 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 // CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0 // CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 -// CHECK-NEXT: [[CALL:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[TMP7]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP5]]) -// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP]], i32 0, i32 0 -// CHECK-NEXT: store i32 [[CALL]], ptr [[COERCE_DIVE]], align 4 -// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP7]], ptr align 4 [[REF_TMP]], i64 4, i1 false) +// CHECK-NEXT: call void @.omp_combiner.(ptr noundef [[TMP7]], ptr noundef [[TMP5]]) // CHECK-NEXT: ret void // // @@ -422,7 +451,7 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP0]], 0 // CHECK-NEXT: br i1 [[TMP28]], label %[[INIT:.*]], label %[[INIT_END:.*]] // CHECK: [[INIT]]: -// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_private_var.1, align 4 +// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 @.omp.reduction..internal_private_var.1, i8 0, i64 4, i1 false) // CHECK-NEXT: br label %[[INIT_END]] // CHECK: [[INIT_END]]: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) @@ -439,7 +468,7 @@ void do_red(int n, int *v, int &sum_v) // // // CHECK-LABEL: define internal void @_Z6do_rediPiRi.omp.reduction.reduction_func( -// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] { +// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 @@ -459,7 +488,7 @@ void do_red(int n, int *v, int &sum_v) // // // CHECK-LABEL: define dso_local noundef i32 @main( -// CHECK-SAME: ) #[[ATTR6:[0-9]+]] { +// CHECK-SAME: ) #[[ATTR8:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[V:%.*]] = alloca [10 x i32], align 16 From c77fb0e1911428ee9441f2f3c7a0bf02157a49e4 Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Mon, 5 May 2025 12:44:45 -0500 Subject: [PATCH 09/29] Implicit reduction identifier fix --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 53 ++++++++++++------- .../OpenMP/for_private_reduction_codegen.cpp | 25 +++++++-- 2 files changed, 56 insertions(+), 22 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 71742d24a33ba..fb865ac9f453c 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4966,32 +4966,25 @@ void CGOpenMPRuntime::emitPrivateReduction( CGF.Builder.CreateCondBr(IsWorker, InitBB, InitEndBB); CGF.EmitBlock(InitBB); - if (UDR) { - Address OrigAddr = Address::invalid(); - emitInitWithReductionInitializer(CGF, UDR, ReductionOps[0], SharedResult, - OrigAddr, PrivateType); - } else { - if (const auto *DRE = dyn_cast(Privates[0])) { - if (const auto *VD = dyn_cast(DRE->getDecl())) { - const Expr *InitExpr = VD->getInit(); - if (InitExpr && (PrivateType->isAggregateType() || - PrivateType->isAnyComplexType())) { - CGF.EmitAnyExprToMem(InitExpr, SharedResult, - PrivateType.getQualifiers(), - /*IsInitializer=*/true); - } else if (!InitVal->isNullValue()) { - CGF.EmitStoreOfScalar(InitVal, - CGF.MakeAddrLValue(SharedResult, PrivateType)); - } else { - CGF.EmitNullInitialization(SharedResult, PrivateType); - } + if (const auto *DRE = dyn_cast(Privates[0])) { + if (const auto *VD = dyn_cast(DRE->getDecl())) { + const Expr *InitExpr = VD->getInit(); + if (InitExpr && + (PrivateType->isAggregateType() || PrivateType->isAnyComplexType())) { + CGF.EmitAnyExprToMem(InitExpr, SharedResult, + PrivateType.getQualifiers(), true); + } else if (!InitVal->isNullValue()) { + CGF.EmitStoreOfScalar(InitVal, + CGF.MakeAddrLValue(SharedResult, PrivateType)); } else { CGF.EmitNullInitialization(SharedResult, PrivateType); } } else { CGF.EmitNullInitialization(SharedResult, PrivateType); } + } else { + CGF.EmitNullInitialization(SharedResult, PrivateType); } CGF.Builder.CreateBr(InitEndBB); @@ -5080,6 +5073,28 @@ void CGOpenMPRuntime::emitPrivateReduction( (void)CGF.EmitOMPAtomicSimpleUpdateExpr( SharedLV, PrivateRV, BO, true, llvm::AtomicOrdering::SequentiallyConsistent, Loc, UpdateOp); + } else if (dyn_cast(ReductionClauseExpr)) { + // Implicit Reduction Identifiers ( openmp 6.0 sec 7.6.5 ) + auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) { + Action.Enter(CGF); + const auto *OmpOutDRE = + dyn_cast(LHSExprs[I]->IgnoreParenImpCasts()); + const auto *OmpInDRE = + dyn_cast(RHSExprs[I]->IgnoreParenImpCasts()); + + if (!OmpOutDRE || !OmpInDRE) { + return; + } + const VarDecl *OmpOutVD = cast(OmpOutDRE->getDecl()); + const VarDecl *OmpInVD = cast(OmpInDRE->getDecl()); + CodeGenFunction::OMPPrivateScope LocalScope(CGF); + LocalScope.addPrivate(OmpOutVD, SharedLV.getAddress()); + LocalScope.addPrivate(OmpInVD, LHSLV.getAddress()); + (void)LocalScope.Privatize(); + CGF.EmitIgnoredExpr(ReductionOp); + }; + std::string CriticalName = getName({"reduction_critical"}); + emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc); } } } diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp index eeba6c29afb54..313d0997f18e3 100644 --- a/clang/test/OpenMP/for_private_reduction_codegen.cpp +++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp @@ -63,7 +63,6 @@ void do_red(int n, int *v, int &sum_v) // CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 // CHECK: @.gomp_critical_user_.atomic_reduction.var = common global [8 x i32] zeroinitializer, align 8 // CHECK: @.omp.reduction..internal_private_var = common global %class.Sum zeroinitializer, align 4 -// CHECK: @_ZZ8func_redvE6result = internal global %class.Sum zeroinitializer, align 4 // CHECK: @.gomp_critical_user_.reduction_critical.var = common global [8 x i32] zeroinitializer, align 8 // CHECK: @[[GLOB4:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 // CHECK: @.omp.reduction..internal_private_var.1 = common global i32 0, align 4 @@ -220,7 +219,7 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP2]], 0 // CHECK-NEXT: br i1 [[TMP13]], label %[[INIT:.*]], label %[[INIT_END:.*]] // CHECK: [[INIT]]: -// CHECK-NEXT: call void @.omp_initializer.(ptr noundef @.omp.reduction..internal_private_var, ptr noundef @_ZZ8func_redvE6result) +// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 @.omp.reduction..internal_private_var, i8 0, i64 4, i1 false) // CHECK-NEXT: br label %[[INIT_END]] // CHECK: [[INIT_END]]: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) @@ -533,4 +532,24 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 0 // CHECK-NEXT: call void @_Z6do_rediPiRi(i32 noundef 10, ptr noundef [[ARRAYDECAY]], ptr noundef nonnull align 4 dereferenceable(4) [[S_V]]) // CHECK-NEXT: ret void - +// +//. +// CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } +// CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +// CHECK: attributes #[[ATTR2]] = { noinline norecurse nounwind optnone "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } +// CHECK: attributes #[[ATTR3]] = { noinline nounwind "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } +// CHECK: attributes #[[ATTR4:[0-9]+]] = { nounwind } +// CHECK: attributes #[[ATTR5]] = { noinline norecurse nounwind "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } +// CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nounwind } +// CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +// CHECK: attributes #[[ATTR8]] = { mustprogress noinline norecurse nounwind optnone "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } +//. +// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 60} +// CHECK: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +// CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]} +// CHECK: [[META4]] = !{!"llvm.loop.mustprogress"} +// CHECK: [[META5:![0-9]+]] = !{[[META6:![0-9]+]]} +// CHECK: [[META6]] = !{i64 2, i64 -1, i64 -1, i1 true} +// CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]} +//. From f202eaaefe3d4c905a946b1ead636defe6e0d7cf Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Wed, 7 May 2025 07:03:01 -0500 Subject: [PATCH 10/29] updated with comments, unified logic and docs --- clang/docs/OpenMPSupport.rst | 4 +- clang/docs/ReleaseNotes.rst | 1 + clang/lib/CodeGen/CGOpenMPRuntime.cpp | 206 +++++++++--------- .../OpenMP/for_private_reduction_codegen.cpp | 31 +-- 4 files changed, 112 insertions(+), 130 deletions(-) diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index 83d90ffef6bc7..3c5011a8809d5 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -406,7 +406,9 @@ implementation. +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | Extensions to atomic construct | :none:`unclaimed` | :none:`unclaimed` | | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Private reductions | :part:`partial` | :none:`unclaimed` | Parse/Sema:https://github.com/llvm/llvm-project/pull/129938 | +| Private reductions | :good:`mostly` | :none:`unclaimed` | Parse/Sema:https://github.com/llvm/llvm-project/pull/129938 +| | | | Codegen: https://github.com/llvm/llvm-project/pull/134709 +| +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | Self maps | :part:`partial` | :none:`unclaimed` | parsing/sema done: https://github.com/llvm/llvm-project/pull/129888 | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 159991e8db981..67d56b9cb5ec9 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -530,6 +530,7 @@ OpenMP Support - Added support 'no_openmp_constructs' assumption clause. - Added support for 'self_maps' in map and requirement clause. - Added support for 'omp stripe' directive. +- Added support for reduction over private variable with 'reduction' clause. Improvements ^^^^^^^^^^^^ diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index fb865ac9f453c..982501f7214a9 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4910,12 +4910,33 @@ void CGOpenMPRuntime::emitPrivateReduction( LHSExprs.size() != ReductionOps.size()) return; + // Create a shared global variable (__shared_reduction_var) to accumulate the + // final result. + // + // Call __kmpc_barrier to synchronize threads before initialization. + // + // The master thread (thread_id == 0) initializes __shared_reduction_var + // with the identity value or initializer. + // + // Call __kmpc_barrier to synchronize before combining. + // For each i: + // - Thread enters critical section. + // - Reads its private value from LHSExprs[i]. + // - Updates __shared_reduction_var[i] = RedOp_i(__shared_reduction_var[i], + // LHSExprs[i]). + // - Exits critical section. + // + // Call __kmpc_barrier after combining. + // + // Each thread copies __shared_reduction_var[i] back to LHSExprs[i]. + // + // Final __kmpc_barrier to synchronize after broadcasting QualType PrivateType = Privates[0]->getType(); llvm::Type *LLVMType = CGF.ConvertTypeForMem(PrivateType); llvm::Constant *InitVal = nullptr; const OMPDeclareReductionDecl *UDR = getReductionInit(ReductionOps[0]); - + // Determine the initial value for the shared reduction variable if (!UDR) { InitVal = llvm::Constant::getNullValue(LLVMType); if (const auto *DRE = dyn_cast(Privates[0])) { @@ -4926,9 +4947,8 @@ void CGOpenMPRuntime::emitPrivateReduction( Expr::EvalResult Result; if (InitExpr->EvaluateAsRValue(Result, CGF.getContext())) { APValue &InitValue = Result.Val; - if (InitValue.isInt()) { + if (InitValue.isInt()) InitVal = llvm::ConstantInt::get(LLVMType, InitValue.getInt()); - } } } } @@ -4954,10 +4974,11 @@ void CGOpenMPRuntime::emitPrivateReduction( llvm::Value *BarrierLoc = emitUpdateLocation(CGF, Loc, OMP_ATOMIC_REDUCE); llvm::Value *BarrierArgs[] = {BarrierLoc, ThreadId}; + // First barrier to ensure all threads are ready. CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( CGM.getModule(), OMPRTL___kmpc_barrier), BarrierArgs); - + // Initialize the shared variable by the master thread. llvm::BasicBlock *InitBB = CGF.createBasicBlock("init"); llvm::BasicBlock *InitEndBB = CGF.createBasicBlock("init.end"); @@ -4967,28 +4988,29 @@ void CGOpenMPRuntime::emitPrivateReduction( CGF.EmitBlock(InitBB); - if (const auto *DRE = dyn_cast(Privates[0])) { - if (const auto *VD = dyn_cast(DRE->getDecl())) { - const Expr *InitExpr = VD->getInit(); - if (InitExpr && - (PrivateType->isAggregateType() || PrivateType->isAnyComplexType())) { - CGF.EmitAnyExprToMem(InitExpr, SharedResult, - PrivateType.getQualifiers(), true); - } else if (!InitVal->isNullValue()) { - CGF.EmitStoreOfScalar(InitVal, - CGF.MakeAddrLValue(SharedResult, PrivateType)); - } else { - CGF.EmitNullInitialization(SharedResult, PrivateType); + auto EmitSharedInit = [&]() { + if (const auto *DRE = dyn_cast(Privates[0])) { + if (const auto *VD = dyn_cast(DRE->getDecl())) { + const Expr *InitExpr = VD->getInit(); + if (InitExpr && (PrivateType->isAggregateType() || + PrivateType->isAnyComplexType())) { + CGF.EmitAnyExprToMem(InitExpr, SharedResult, + PrivateType.getQualifiers(), true); + return; + } + if (!InitVal->isNullValue()) { + CGF.EmitStoreOfScalar(InitVal, + CGF.MakeAddrLValue(SharedResult, PrivateType)); + return; + } } - } else { - CGF.EmitNullInitialization(SharedResult, PrivateType); } - } else { CGF.EmitNullInitialization(SharedResult, PrivateType); - } - + }; + EmitSharedInit(); CGF.Builder.CreateBr(InitEndBB); CGF.EmitBlock(InitEndBB); + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( CGM.getModule(), OMPRTL___kmpc_barrier), BarrierArgs); @@ -4996,20 +5018,23 @@ void CGOpenMPRuntime::emitPrivateReduction( llvm::seq(std::min(ReductionOps.size(), LHSExprs.size()))) { const Expr *ReductionOp = ReductionOps[I]; + const OMPDeclareReductionDecl *CurrentUDR = getReductionInit(ReductionOp); LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType); LValue LHSLV = CGF.EmitLValue(LHSExprs[I]); - // If UDR - const OMPDeclareReductionDecl *CurrentUDR = - getReductionInit(ReductionOps[I]); + + auto EmitCriticalReduction = [&](auto ReductionGen) { + std::string CriticalName = getName({"reduction_critical"}); + emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc); + }; + if (CurrentUDR) { + // Handle user-defined reduction. auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) { Action.Enter(CGF); - std::pair ReductionFnPair = + std::pair FnPair = getUserDefinedReduction(CurrentUDR); - llvm::Function *CombinerFn = ReductionFnPair.first; - if (const auto *CE = dyn_cast(ReductionOp)) { - if (CE && CombinerFn) { - const auto *CE = cast(ReductionOps[I]); + if (FnPair.first) { + if (const auto *CE = dyn_cast(ReductionOp)) { const auto *OutDRE = cast( cast(CE->getArg(0)->IgnoreParenImpCasts()) ->getSubExpr()); @@ -5026,93 +5051,65 @@ void CGOpenMPRuntime::emitPrivateReduction( } } }; - std::string CriticalName = getName({"reduction_critical"}); - emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc); - } else { - // Built-in Operator Combination - const Expr *ReductionClauseExpr = ReductionOp->IgnoreParenCasts(); - if (const auto *Cleanup = dyn_cast(ReductionClauseExpr)) - ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts(); - const Expr *AssignRHS = nullptr; - if (const auto *BinOp = dyn_cast(ReductionClauseExpr)) { - if (BinOp->getOpcode() == BO_Assign) - AssignRHS = BinOp->getRHS(); - } else if (const auto *OpCall = - dyn_cast(ReductionClauseExpr)) { - if (OpCall->getOperator() == OO_Equal) - AssignRHS = OpCall->getArg(1); - } - if (!AssignRHS) - continue; - const Expr *ReductionCombinerExpr = AssignRHS->IgnoreParenImpCasts(); - if (const auto *MTE = - dyn_cast(ReductionCombinerExpr)) - ReductionCombinerExpr = MTE->getSubExpr()->IgnoreParenImpCasts(); - - BinaryOperatorKind BO = BO_Assign; - RValue PrivateRV = CGF.EmitLoadOfLValue(LHSLV, Loc); - if (const auto *BinOp = dyn_cast(ReductionCombinerExpr)) { - BO = BinOp->getOpcode(); - auto UpdateOp = [&](RValue OldVal) { - if (BO == BO_Mul) { - llvm::Value *OldScalar = OldVal.getScalarVal(); - llvm::Value *PrivateScalar = PrivateRV.getScalarVal(); - llvm::Value *Result = - CGF.Builder.CreateMul(OldScalar, PrivateScalar); - return RValue::get(Result); - } else { - OpaqueValueExpr OVE(BinOp->getLHS()->getExprLoc(), - BinOp->getLHS()->getType(), - ExprValueKind::VK_PRValue); - CodeGenFunction::OpaqueValueMapping OldValMapping(CGF, &OVE, - OldVal); - return CGF.EmitAnyExpr(BinOp->getRHS()); - } - }; + EmitCriticalReduction(ReductionGen); + continue; + } + // Handle built-in reduction operations. + const Expr *ReductionClauseExpr = ReductionOp->IgnoreParenCasts(); + if (const auto *Cleanup = dyn_cast(ReductionClauseExpr)) + ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts(); - (void)CGF.EmitOMPAtomicSimpleUpdateExpr( - SharedLV, PrivateRV, BO, true, - llvm::AtomicOrdering::SequentiallyConsistent, Loc, UpdateOp); - } else if (dyn_cast(ReductionClauseExpr)) { - // Implicit Reduction Identifiers ( openmp 6.0 sec 7.6.5 ) - auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) { - Action.Enter(CGF); - const auto *OmpOutDRE = - dyn_cast(LHSExprs[I]->IgnoreParenImpCasts()); - const auto *OmpInDRE = - dyn_cast(RHSExprs[I]->IgnoreParenImpCasts()); - - if (!OmpOutDRE || !OmpInDRE) { - return; - } - const VarDecl *OmpOutVD = cast(OmpOutDRE->getDecl()); - const VarDecl *OmpInVD = cast(OmpInDRE->getDecl()); - CodeGenFunction::OMPPrivateScope LocalScope(CGF); - LocalScope.addPrivate(OmpOutVD, SharedLV.getAddress()); - LocalScope.addPrivate(OmpInVD, LHSLV.getAddress()); - (void)LocalScope.Privatize(); - CGF.EmitIgnoredExpr(ReductionOp); - }; - std::string CriticalName = getName({"reduction_critical"}); - emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc); - } + const Expr *AssignRHS = nullptr; + if (const auto *BinOp = dyn_cast(ReductionClauseExpr)) { + if (BinOp->getOpcode() == BO_Assign) + AssignRHS = BinOp->getRHS(); + } else if (const auto *OpCall = + dyn_cast(ReductionClauseExpr)) { + if (OpCall->getOperator() == OO_Equal) + AssignRHS = OpCall->getArg(1); } + + if (!AssignRHS) + continue; + + const Expr *CombinerExpr = AssignRHS->IgnoreParenImpCasts(); + if (const auto *MTE = dyn_cast(CombinerExpr)) + CombinerExpr = MTE->getSubExpr()->IgnoreParenImpCasts(); + + auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) { + Action.Enter(CGF); + const auto *OmpOutDRE = + dyn_cast(LHSExprs[I]->IgnoreParenImpCasts()); + const auto *OmpInDRE = + dyn_cast(RHSExprs[I]->IgnoreParenImpCasts()); + if (!OmpOutDRE || !OmpInDRE) + return; + const VarDecl *OmpOutVD = cast(OmpOutDRE->getDecl()); + const VarDecl *OmpInVD = cast(OmpInDRE->getDecl()); + CodeGenFunction::OMPPrivateScope LocalScope(CGF); + LocalScope.addPrivate(OmpOutVD, SharedLV.getAddress()); + LocalScope.addPrivate(OmpInVD, LHSLV.getAddress()); + (void)LocalScope.Privatize(); + // Emit the actual reduction operation + CGF.EmitIgnoredExpr(ReductionOp); + }; + EmitCriticalReduction(ReductionGen); } - // Final barrier + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( CGM.getModule(), OMPRTL___kmpc_barrier), BarrierArgs); // Broadcast final result bool IsAggregate = PrivateType->isAggregateType(); - llvm::Value *FinalResultVal = nullptr; LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType); + llvm::Value *FinalResultVal = nullptr; Address FinalResultAddr = Address::invalid(); - if (IsAggregate) { + + if (IsAggregate) FinalResultAddr = SharedResult; - } else { + else FinalResultVal = CGF.EmitLoadOfScalar(SharedLV, Loc); - } for (unsigned I : llvm::seq(Privates.size())) { LValue TargetLHSLV = CGF.EmitLValue(LHSExprs[I]); @@ -5124,8 +5121,7 @@ void CGOpenMPRuntime::emitPrivateReduction( CGF.EmitStoreOfScalar(FinalResultVal, TargetLHSLV); } } - - // Final synchronization + // Final synchronization barrier CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( CGM.getModule(), OMPRTL___kmpc_barrier), BarrierArgs); diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp index 313d0997f18e3..817aff37d6c0d 100644 --- a/clang/test/OpenMP/for_private_reduction_codegen.cpp +++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp @@ -54,7 +54,6 @@ void do_red(int n, int *v, int &sum_v) } return 0; } - //. // CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" // CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 @@ -454,8 +453,12 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: br label %[[INIT_END]] // CHECK: [[INIT_END]]: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP7]], align 4 -// CHECK-NEXT: [[TMP30:%.*]] = atomicrmw add ptr @.omp.reduction..internal_private_var.1, i32 [[TMP29]] seq_cst, align 4 +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.1, align 4 +// CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP7]], align 4 +// CHECK-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] +// CHECK-NEXT: store i32 [[ADD12]], ptr @.omp.reduction..internal_private_var.1, align 4 +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.1, align 4 // CHECK-NEXT: store i32 [[TMP31]], ptr [[TMP7]], align 4 @@ -532,24 +535,4 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 0 // CHECK-NEXT: call void @_Z6do_rediPiRi(i32 noundef 10, ptr noundef [[ARRAYDECAY]], ptr noundef nonnull align 4 dereferenceable(4) [[S_V]]) // CHECK-NEXT: ret void -// -//. -// CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } -// CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } -// CHECK: attributes #[[ATTR2]] = { noinline norecurse nounwind optnone "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } -// CHECK: attributes #[[ATTR3]] = { noinline nounwind "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } -// CHECK: attributes #[[ATTR4:[0-9]+]] = { nounwind } -// CHECK: attributes #[[ATTR5]] = { noinline norecurse nounwind "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } -// CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nounwind } -// CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } -// CHECK: attributes #[[ATTR8]] = { mustprogress noinline norecurse nounwind optnone "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" } -//. -// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -// CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 60} -// CHECK: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -// CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]} -// CHECK: [[META4]] = !{!"llvm.loop.mustprogress"} -// CHECK: [[META5:![0-9]+]] = !{[[META6:![0-9]+]]} -// CHECK: [[META6]] = !{i64 2, i64 -1, i64 -1, i1 true} -// CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]} -//. + From 9d2370ba2958d204a7de9cef499a2e567764f685 Mon Sep 17 00:00:00 2001 From: CHANDRA GHALE Date: Wed, 7 May 2025 17:44:09 +0530 Subject: [PATCH 11/29] Update OpenMPSupport.rst --- clang/docs/OpenMPSupport.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index 3c5011a8809d5..904b26cdb0b21 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -406,9 +406,8 @@ implementation. +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | Extensions to atomic construct | :none:`unclaimed` | :none:`unclaimed` | | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Private reductions | :good:`mostly` | :none:`unclaimed` | Parse/Sema:https://github.com/llvm/llvm-project/pull/129938 -| | | | Codegen: https://github.com/llvm/llvm-project/pull/134709 -| +| Private reductions | :good:`mostly` | :none:`unclaimed` | Parse/Sema:https://github.com/llvm/llvm-project/pull/129938 | +| | | | Codegen: https://github.com/llvm/llvm-project/pull/134709 | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | Self maps | :part:`partial` | :none:`unclaimed` | parsing/sema done: https://github.com/llvm/llvm-project/pull/129888 | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ From 0ca2f86613bd16fbd08b88bc1da5a5104b48e6d7 Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Wed, 7 May 2025 11:46:16 -0500 Subject: [PATCH 12/29] Handle UDR init and updated lit --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 17 +- .../OpenMP/for_private_reduction_codegen.cpp | 306 ++++++++++++++++-- 2 files changed, 284 insertions(+), 39 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 982501f7214a9..c7a010b642c41 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4974,11 +4974,6 @@ void CGOpenMPRuntime::emitPrivateReduction( llvm::Value *BarrierLoc = emitUpdateLocation(CGF, Loc, OMP_ATOMIC_REDUCE); llvm::Value *BarrierArgs[] = {BarrierLoc, ThreadId}; - // First barrier to ensure all threads are ready. - CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_barrier), - BarrierArgs); - // Initialize the shared variable by the master thread. llvm::BasicBlock *InitBB = CGF.createBasicBlock("init"); llvm::BasicBlock *InitEndBB = CGF.createBasicBlock("init.end"); @@ -4989,6 +4984,18 @@ void CGOpenMPRuntime::emitPrivateReduction( CGF.EmitBlock(InitBB); auto EmitSharedInit = [&]() { + if (UDR) { // Check if it's a User-Defined Reduction + if (const Expr *UDRInitExpr = UDR->getInitializer()) { + // Use the initializer from the OMPDeclareReductionDecl + CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult, + PrivateType.getQualifiers(), true); + } else { + // EmitNullInitialization handles default construction for C++ classes + // and zeroing for scalars, which is a reasonable default. + CGF.EmitNullInitialization(SharedResult, PrivateType); + } + return; // UDR initialization handled + } if (const auto *DRE = dyn_cast(Privates[0])) { if (const auto *VD = dyn_cast(DRE->getDecl())) { const Expr *InitExpr = VD->getInit(); diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp index 817aff37d6c0d..31a52b611068c 100644 --- a/clang/test/OpenMP/for_private_reduction_codegen.cpp +++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp @@ -4,19 +4,19 @@ #define N 10 class Sum { int val; + public: Sum(int v = 0) : val(v) {} - Sum operator+(const Sum& rhs) const { - return Sum(val + rhs.val); - } - Sum& operator+=(const Sum& rhs) { + Sum operator+(const Sum &rhs) const { return Sum(val + rhs.val); } + Sum &operator+=(const Sum &rhs) { val += rhs.val; return *this; } }; -#pragma omp declare reduction(sum_reduction : Sum : omp_out += omp_in) initializer(omp_priv = Sum(0)) +#pragma omp declare reduction(sum_reduction:Sum : omp_out += omp_in) \ + initializer(omp_priv = Sum(0)) -void func_red(){ +void func_red() { Sum result(0); Sum array[N]; @@ -24,36 +24,50 @@ void func_red(){ array[i] = Sum(i); } - #pragma omp parallel private(result) num_threads(4) +#pragma omp parallel private(result) num_threads(4) { - #pragma omp for reduction(sum_reduction:result) - for (int i = 0; i < N; i++) { - result = result + array[i]; +#pragma omp for reduction(sum_reduction : result) + for (int i = 0; i < N; i++) { + result = result + array[i]; + } } +} + +void do_red(int n, int *v, int &sum_v) { + sum_v = 0; +#pragma omp for reduction(original(private), + : sum_v) + for (int i = 0; i < n; i++) { + sum_v += v[i]; } } +void do_red_extended(int n, int *v, int &sum_v, int &prod_v) { + sum_v = 0; + prod_v = 1; -void do_red(int n, int *v, int &sum_v) - { - sum_v = 0; - #pragma omp for reduction(original(private),+: sum_v) - for (int i = 0; i < n; i++) - { - sum_v += v[i]; - } - } - int main(void) - { - int v[N]; - for (int i = 0; i < N; i++) - v[i] = i; - #pragma omp parallel num_threads(4) - { - int s_v; - do_red(N, v, s_v); - } - return 0; - } +#pragma omp for reduction(original(private), + : sum_v) \ + reduction(original(private), * : prod_v) + for (int i = 0; i < n; i++) { + sum_v += v[i]; + prod_v *= v[i]; + } +} +int main(void) { + int v[N]; + for (int i = 0; i < N; i++) + v[i] = i; +#pragma omp parallel num_threads(4) + { + int s_v; + do_red(N, v, s_v); + } + + int sum_v_ext = 0, prod_v_ext = 1; +#pragma omp parallel num_threads(4) + { + do_red_extended(N, v, sum_v_ext, prod_v_ext); + } + return 0; +} //. // CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" // CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 @@ -65,6 +79,7 @@ void do_red(int n, int *v, int &sum_v) // CHECK: @.gomp_critical_user_.reduction_critical.var = common global [8 x i32] zeroinitializer, align 8 // CHECK: @[[GLOB4:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 // CHECK: @.omp.reduction..internal_private_var.1 = common global i32 0, align 4 +// CHECK: @.omp.reduction..internal_private_var.2 = common global i32 0, align 4 //. // CHECK-LABEL: define dso_local void @_Z8func_redv( // CHECK-SAME: ) #[[ATTR0:[0-9]+]] { @@ -214,11 +229,10 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var) // CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] // CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]] -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP2]], 0 // CHECK-NEXT: br i1 [[TMP13]], label %[[INIT:.*]], label %[[INIT_END:.*]] // CHECK: [[INIT]]: -// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 @.omp.reduction..internal_private_var, i8 0, i64 4, i1 false) +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) @.omp.reduction..internal_private_var, i32 noundef 0) // CHECK-NEXT: br label %[[INIT_END]] // CHECK: [[INIT_END]]: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) @@ -445,7 +459,6 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) // CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] // CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]] -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP0]], 0 // CHECK-NEXT: br i1 [[TMP28]], label %[[INIT:.*]], label %[[INIT_END:.*]] // CHECK: [[INIT]]: @@ -489,12 +502,233 @@ void do_red(int n, int *v, int &sum_v) // CHECK-NEXT: ret void // // +// CHECK-LABEL: define dso_local void @_Z15do_red_extendediPiRiS0_( +// CHECK-SAME: i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[PROD_V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[SUM_V_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[PROD_V_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP2:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[SUM_V5:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP6:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[PROD_V7:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP8:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[I9:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 8 +// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP19:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) +// CHECK-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8 +// CHECK-NEXT: store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: store ptr [[PROD_V]], ptr [[PROD_V_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PROD_V_ADDR]], align 8 +// CHECK-NEXT: store i32 1, ptr [[TMP2]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[PROD_V_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK-NEXT: br i1 [[CMP]], label %[[OMP_PRECOND_THEN:.*]], label %[[OMP_PRECOND_END:.*]] +// CHECK: [[OMP_PRECOND_THEN]]: +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK-NEXT: store i32 0, ptr [[SUM_V5]], align 4 +// CHECK-NEXT: store ptr [[SUM_V5]], ptr [[_TMP6]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[_TMP1]], align 8 +// CHECK-NEXT: store i32 1, ptr [[PROD_V7]], align 4 +// CHECK-NEXT: store ptr [[PROD_V7]], ptr [[_TMP8]], align 8 +// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] +// CHECK-NEXT: br i1 [[CMP10]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] +// CHECK: [[COND_TRUE]]: +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK-NEXT: br label %[[COND_END:.*]] +// CHECK: [[COND_FALSE]]: +// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: br label %[[COND_END]] +// CHECK: [[COND_END]]: +// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP13]], %[[COND_TRUE]] ], [ [[TMP14]], %[[COND_FALSE]] ] +// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: store i32 [[TMP15]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] +// CHECK: [[OMP_INNER_FOR_COND]]: +// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK-NEXT: br i1 [[CMP11]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]] +// CHECK: [[OMP_INNER_FOR_BODY]]: +// CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I9]], align 4 +// CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[V_ADDR]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[I9]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP20]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP22:%.*]] = load ptr, ptr [[_TMP6]], align 8 +// CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 +// CHECK-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP23]], [[TMP21]] +// CHECK-NEXT: store i32 [[ADD12]], ptr [[TMP22]], align 4 +// CHECK-NEXT: [[TMP24:%.*]] = load ptr, ptr [[V_ADDR]], align 8 +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[I9]], align 4 +// CHECK-NEXT: [[IDXPROM13:%.*]] = sext i32 [[TMP25]] to i64 +// CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[IDXPROM13]] +// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX14]], align 4 +// CHECK-NEXT: [[TMP27:%.*]] = load ptr, ptr [[_TMP8]], align 8 +// CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4 +// CHECK-NEXT: [[MUL15:%.*]] = mul nsw i32 [[TMP28]], [[TMP26]] +// CHECK-NEXT: store i32 [[MUL15]], ptr [[TMP27]], align 4 +// CHECK-NEXT: br label %[[OMP_BODY_CONTINUE:.*]] +// CHECK: [[OMP_BODY_CONTINUE]]: +// CHECK-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] +// CHECK: [[OMP_INNER_FOR_INC]]: +// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP29]], 1 +// CHECK-NEXT: store i32 [[ADD16]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND]] +// CHECK: [[OMP_INNER_FOR_END]]: +// CHECK-NEXT: br label %[[OMP_LOOP_EXIT:.*]] +// CHECK: [[OMP_LOOP_EXIT]]: +// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK-NEXT: store ptr [[SUM_V5]], ptr [[TMP30]], align 8 +// CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 +// CHECK-NEXT: store ptr [[PROD_V7]], ptr [[TMP31]], align 8 +// CHECK-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 2, i64 16, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z15do_red_extendediPiRiS0_.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: switch i32 [[TMP32]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [ +// CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]] +// CHECK-NEXT: i32 2, label %[[DOTOMP_REDUCTION_CASE2:.*]] +// CHECK-NEXT: ] +// CHECK: [[_OMP_REDUCTION_CASE1:.*:]] +// CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP9]], align 4 +// CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[SUM_V5]], align 4 +// CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] +// CHECK-NEXT: store i32 [[ADD17]], ptr [[TMP9]], align 4 +// CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP10]], align 4 +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[PROD_V7]], align 4 +// CHECK-NEXT: [[MUL18:%.*]] = mul nsw i32 [[TMP35]], [[TMP36]] +// CHECK-NEXT: store i32 [[MUL18]], ptr [[TMP10]], align 4 +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: [[_OMP_REDUCTION_CASE2:.*:]] +// CHECK-NEXT: [[TMP37:%.*]] = load i32, ptr [[SUM_V5]], align 4 +// CHECK-NEXT: [[TMP38:%.*]] = atomicrmw add ptr [[TMP9]], i32 [[TMP37]] monotonic, align 4 +// CHECK-NEXT: [[TMP39:%.*]] = load i32, ptr [[PROD_V7]], align 4 +// CHECK-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic i32, ptr [[TMP10]] monotonic, align 4 +// CHECK-NEXT: br label %[[ATOMIC_CONT:.*]] +// CHECK: [[ATOMIC_CONT]]: +// CHECK-NEXT: [[TMP40:%.*]] = phi i32 [ [[ATOMIC_LOAD]], %[[DOTOMP_REDUCTION_CASE2]] ], [ [[TMP45:%.*]], %[[ATOMIC_CONT]] ] +// CHECK-NEXT: store i32 [[TMP40]], ptr [[_TMP19]], align 4 +// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[_TMP19]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = load i32, ptr [[PROD_V7]], align 4 +// CHECK-NEXT: [[MUL20:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]] +// CHECK-NEXT: store i32 [[MUL20]], ptr [[ATOMIC_TEMP]], align 4 +// CHECK-NEXT: [[TMP43:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 +// CHECK-NEXT: [[TMP44:%.*]] = cmpxchg ptr [[TMP10]], i32 [[TMP40]], i32 [[TMP43]] monotonic monotonic, align 4 +// CHECK-NEXT: [[TMP45]] = extractvalue { i32, i1 } [[TMP44]], 0 +// CHECK-NEXT: [[TMP46:%.*]] = extractvalue { i32, i1 } [[TMP44]], 1 +// CHECK-NEXT: br i1 [[TMP46]], label %[[ATOMIC_EXIT:.*]], label %[[ATOMIC_CONT]] +// CHECK: [[ATOMIC_EXIT]]: +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]] +// CHECK-NEXT: [[TMP47:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[TMP47]], label %[[INIT:.*]], label %[[INIT_END:.*]] +// CHECK: [[INIT]]: +// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 @.omp.reduction..internal_private_var.2, i8 0, i64 4, i1 false) +// CHECK-NEXT: br label %[[INIT_END]] +// CHECK: [[INIT_END]]: +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: [[TMP48:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.2, align 4 +// CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP9]], align 4 +// CHECK-NEXT: [[ADD21:%.*]] = add nsw i32 [[TMP48]], [[TMP49]] +// CHECK-NEXT: store i32 [[ADD21]], ptr @.omp.reduction..internal_private_var.2, align 4 +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.2, align 4 +// CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP10]], align 4 +// CHECK-NEXT: [[MUL22:%.*]] = mul nsw i32 [[TMP50]], [[TMP51]] +// CHECK-NEXT: store i32 [[MUL22]], ptr @.omp.reduction..internal_private_var.2, align 4 +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP52:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.2, align 4 +// CHECK-NEXT: store i32 [[TMP52]], ptr [[TMP9]], align 4 +// CHECK-NEXT: store i32 [[TMP52]], ptr [[TMP10]], align 4 +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: br label %[[OMP_PRECOND_END]] +// CHECK: [[OMP_PRECOND_END]]: +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define internal void @_Z15do_red_extendediPiRiS0_.omp.reduction.reduction_func( +// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 0 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP2]], i64 0, i64 0 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1 +// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP2]], i64 0, i64 1 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP7]], align 4 +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP5]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP11]], align 4 +// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP14]], [[TMP15]] +// CHECK-NEXT: store i32 [[MUL]], ptr [[TMP11]], align 4 +// CHECK-NEXT: ret void +// +// // CHECK-LABEL: define dso_local noundef i32 @main( // CHECK-SAME: ) #[[ATTR8:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[V:%.*]] = alloca [10 x i32], align 16 // CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[SUM_V_EXT:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[PROD_V_EXT:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) // CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK-NEXT: store i32 0, ptr [[I]], align 4 @@ -518,6 +752,10 @@ void do_red(int n, int *v, int &sum_v) // CHECK: [[FOR_END]]: // CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4) // CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @main.omp_outlined, ptr [[V]]) +// CHECK-NEXT: store i32 0, ptr [[SUM_V_EXT]], align 4 +// CHECK-NEXT: store i32 1, ptr [[PROD_V_EXT]], align 4 +// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4) +// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @main.omp_outlined.3, ptr [[V]], ptr [[SUM_V_EXT]], ptr [[PROD_V_EXT]]) // CHECK-NEXT: ret i32 0 // // From 9335af1858c8befc1796253b1ed05aacf62109a9 Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Thu, 8 May 2025 08:07:20 -0500 Subject: [PATCH 13/29] multiple reduced value change --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 144 +++++++++--------- clang/lib/CodeGen/CGOpenMPRuntime.h | 6 +- .../OpenMP/for_private_reduction_codegen.cpp | 54 ++++--- 3 files changed, 110 insertions(+), 94 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index c7a010b642c41..9d13397ee9fb5 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4900,15 +4900,8 @@ void CGOpenMPRuntime::emitSingleReductionCombiner(CodeGenFunction &CGF, } void CGOpenMPRuntime::emitPrivateReduction( - CodeGenFunction &CGF, SourceLocation Loc, ArrayRef Privates, - ArrayRef LHSExprs, ArrayRef RHSExprs, - ArrayRef ReductionOps) { - if (LHSExprs.empty() || Privates.empty() || ReductionOps.empty()) - return; - - if (LHSExprs.size() != Privates.size() || - LHSExprs.size() != ReductionOps.size()) - return; + CodeGenFunction &CGF, SourceLocation Loc, const Expr *Privates, + const Expr *LHSExprs, const Expr *RHSExprs, const Expr *ReductionOps) { // Create a shared global variable (__shared_reduction_var) to accumulate the // final result. @@ -4931,15 +4924,15 @@ void CGOpenMPRuntime::emitPrivateReduction( // Each thread copies __shared_reduction_var[i] back to LHSExprs[i]. // // Final __kmpc_barrier to synchronize after broadcasting - QualType PrivateType = Privates[0]->getType(); + QualType PrivateType = Privates->getType(); llvm::Type *LLVMType = CGF.ConvertTypeForMem(PrivateType); llvm::Constant *InitVal = nullptr; - const OMPDeclareReductionDecl *UDR = getReductionInit(ReductionOps[0]); + const OMPDeclareReductionDecl *UDR = getReductionInit(ReductionOps); // Determine the initial value for the shared reduction variable if (!UDR) { InitVal = llvm::Constant::getNullValue(LLVMType); - if (const auto *DRE = dyn_cast(Privates[0])) { + if (const auto *DRE = dyn_cast(Privates)) { if (const auto *VD = dyn_cast(DRE->getDecl())) { const Expr *InitExpr = VD->getInit(); if (InitExpr && !PrivateType->isAggregateType() && @@ -4956,11 +4949,18 @@ void CGOpenMPRuntime::emitPrivateReduction( } else { InitVal = llvm::Constant::getNullValue(LLVMType); } + std::string ReductionVarNameStr; + if (const auto *DRE = dyn_cast(Privates->IgnoreParenCasts())) { + ReductionVarNameStr = DRE->getDecl()->getNameAsString(); + } else { + ReductionVarNameStr = "unnamed_priv_var"; + } // Create an internal shared variable - std::string SharedName = getName({"internal_private_var"}); + std::string SharedName = + CGM.getOpenMPRuntime().getName({"internal_pivate_", ReductionVarNameStr}); llvm::GlobalVariable *SharedVar = new llvm::GlobalVariable( - CGM.getModule(), LLVMType, false, llvm::GlobalValue::CommonLinkage, + CGM.getModule(), LLVMType, false, llvm::GlobalValue::InternalLinkage, InitVal, ".omp.reduction." + SharedName, nullptr, llvm::GlobalVariable::NotThreadLocal); @@ -4996,7 +4996,7 @@ void CGOpenMPRuntime::emitPrivateReduction( } return; // UDR initialization handled } - if (const auto *DRE = dyn_cast(Privates[0])) { + if (const auto *DRE = dyn_cast(Privates)) { if (const auto *VD = dyn_cast(DRE->getDecl())) { const Expr *InitExpr = VD->getInit(); if (InitExpr && (PrivateType->isAggregateType() || @@ -5021,47 +5021,45 @@ void CGOpenMPRuntime::emitPrivateReduction( CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( CGM.getModule(), OMPRTL___kmpc_barrier), BarrierArgs); - for (unsigned I : - llvm::seq(std::min(ReductionOps.size(), LHSExprs.size()))) { - const Expr *ReductionOp = ReductionOps[I]; - const OMPDeclareReductionDecl *CurrentUDR = getReductionInit(ReductionOp); - LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType); - LValue LHSLV = CGF.EmitLValue(LHSExprs[I]); + const Expr *ReductionOp = ReductionOps; + const OMPDeclareReductionDecl *CurrentUDR = getReductionInit(ReductionOp); + LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType); + LValue LHSLV = CGF.EmitLValue(LHSExprs); - auto EmitCriticalReduction = [&](auto ReductionGen) { - std::string CriticalName = getName({"reduction_critical"}); - emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc); - }; + auto EmitCriticalReduction = [&](auto ReductionGen) { + std::string CriticalName = getName({"reduction_critical"}); + emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc); + }; - if (CurrentUDR) { - // Handle user-defined reduction. - auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) { - Action.Enter(CGF); - std::pair FnPair = - getUserDefinedReduction(CurrentUDR); - if (FnPair.first) { - if (const auto *CE = dyn_cast(ReductionOp)) { - const auto *OutDRE = cast( - cast(CE->getArg(0)->IgnoreParenImpCasts()) - ->getSubExpr()); - const auto *InDRE = cast( - cast(CE->getArg(1)->IgnoreParenImpCasts()) - ->getSubExpr()); - CodeGenFunction::OMPPrivateScope LocalScope(CGF); - LocalScope.addPrivate(cast(OutDRE->getDecl()), - SharedLV.getAddress()); - LocalScope.addPrivate(cast(InDRE->getDecl()), - LHSLV.getAddress()); - (void)LocalScope.Privatize(); - emitReductionCombiner(CGF, ReductionOp); - } + if (CurrentUDR) { + // Handle user-defined reduction. + auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) { + Action.Enter(CGF); + std::pair FnPair = + getUserDefinedReduction(CurrentUDR); + if (FnPair.first) { + if (const auto *CE = dyn_cast(ReductionOp)) { + const auto *OutDRE = cast( + cast(CE->getArg(0)->IgnoreParenImpCasts()) + ->getSubExpr()); + const auto *InDRE = cast( + cast(CE->getArg(1)->IgnoreParenImpCasts()) + ->getSubExpr()); + CodeGenFunction::OMPPrivateScope LocalScope(CGF); + LocalScope.addPrivate(cast(OutDRE->getDecl()), + SharedLV.getAddress()); + LocalScope.addPrivate(cast(InDRE->getDecl()), + LHSLV.getAddress()); + (void)LocalScope.Privatize(); + emitReductionCombiner(CGF, ReductionOp); } - }; - EmitCriticalReduction(ReductionGen); - continue; - } - // Handle built-in reduction operations. + } + }; + EmitCriticalReduction(ReductionGen); + } + // Handle built-in reduction operations. + else { const Expr *ReductionClauseExpr = ReductionOp->IgnoreParenCasts(); if (const auto *Cleanup = dyn_cast(ReductionClauseExpr)) ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts(); @@ -5077,7 +5075,7 @@ void CGOpenMPRuntime::emitPrivateReduction( } if (!AssignRHS) - continue; + return; const Expr *CombinerExpr = AssignRHS->IgnoreParenImpCasts(); if (const auto *MTE = dyn_cast(CombinerExpr)) @@ -5086,9 +5084,9 @@ void CGOpenMPRuntime::emitPrivateReduction( auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) { Action.Enter(CGF); const auto *OmpOutDRE = - dyn_cast(LHSExprs[I]->IgnoreParenImpCasts()); + dyn_cast(LHSExprs->IgnoreParenImpCasts()); const auto *OmpInDRE = - dyn_cast(RHSExprs[I]->IgnoreParenImpCasts()); + dyn_cast(RHSExprs->IgnoreParenImpCasts()); if (!OmpOutDRE || !OmpInDRE) return; const VarDecl *OmpOutVD = cast(OmpOutDRE->getDecl()); @@ -5109,24 +5107,22 @@ void CGOpenMPRuntime::emitPrivateReduction( // Broadcast final result bool IsAggregate = PrivateType->isAggregateType(); - LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType); + LValue SharedLV1 = CGF.MakeAddrLValue(SharedResult, PrivateType); llvm::Value *FinalResultVal = nullptr; Address FinalResultAddr = Address::invalid(); if (IsAggregate) FinalResultAddr = SharedResult; else - FinalResultVal = CGF.EmitLoadOfScalar(SharedLV, Loc); - - for (unsigned I : llvm::seq(Privates.size())) { - LValue TargetLHSLV = CGF.EmitLValue(LHSExprs[I]); - if (IsAggregate) { - CGF.EmitAggregateCopy(TargetLHSLV, - CGF.MakeAddrLValue(FinalResultAddr, PrivateType), - PrivateType, AggValueSlot::DoesNotOverlap, false); - } else { - CGF.EmitStoreOfScalar(FinalResultVal, TargetLHSLV); - } + FinalResultVal = CGF.EmitLoadOfScalar(SharedLV1, Loc); + + LValue TargetLHSLV = CGF.EmitLValue(LHSExprs); + if (IsAggregate) { + CGF.EmitAggregateCopy(TargetLHSLV, + CGF.MakeAddrLValue(FinalResultAddr, PrivateType), + PrivateType, AggValueSlot::DoesNotOverlap, false); + } else { + CGF.EmitStoreOfScalar(FinalResultVal, TargetLHSLV); } // Final synchronization barrier CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( @@ -5436,8 +5432,18 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, CGF.EmitBranch(DefaultBB); CGF.EmitBlock(DefaultBB, /*IsFinished=*/true); - if (Options.IsPrivateVarReduction) - emitPrivateReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, ReductionOps); + if (Options.IsPrivateVarReduction) { + if (LHSExprs.empty() || Privates.empty() || ReductionOps.empty()) + return; + if (LHSExprs.size() != Privates.size() || + LHSExprs.size() != ReductionOps.size()) + return; + for (unsigned I : + llvm::seq(std::min(ReductionOps.size(), LHSExprs.size()))) { + emitPrivateReduction(CGF, Loc, Privates[I], LHSExprs[I], RHSExprs[I], + ReductionOps[I]); + } + } } /// Generates unique name for artificial threadprivate variables. diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index 50ba28b565b6d..51595cdb31fa8 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -1212,10 +1212,8 @@ class CGOpenMPRuntime { /// \param ReductionOps List of reduction operations in form 'LHS binop RHS' /// or 'operator binop(LHS, RHS)'. void emitPrivateReduction(CodeGenFunction &CGF, SourceLocation Loc, - ArrayRef Privates, - ArrayRef LHSExprs, - ArrayRef RHSExprs, - ArrayRef ReductionOps); + const Expr *Privates, const Expr *LHSExprs, + const Expr *RHSExprs, const Expr *ReductionOps); /// Emit a code for reduction clause. Next code should be emitted for /// reduction: diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp index 31a52b611068c..8532bf4125787 100644 --- a/clang/test/OpenMP/for_private_reduction_codegen.cpp +++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp @@ -68,6 +68,7 @@ int main(void) { } return 0; } + //. // CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" // CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 @@ -75,11 +76,12 @@ int main(void) { // CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 18, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 // CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 // CHECK: @.gomp_critical_user_.atomic_reduction.var = common global [8 x i32] zeroinitializer, align 8 -// CHECK: @.omp.reduction..internal_private_var = common global %class.Sum zeroinitializer, align 4 +// CHECK: @.omp.reduction..internal_pivate_.result = internal global %class.Sum zeroinitializer, align 4 // CHECK: @.gomp_critical_user_.reduction_critical.var = common global [8 x i32] zeroinitializer, align 8 // CHECK: @[[GLOB4:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 -// CHECK: @.omp.reduction..internal_private_var.1 = common global i32 0, align 4 -// CHECK: @.omp.reduction..internal_private_var.2 = common global i32 0, align 4 +// CHECK: @.omp.reduction..internal_pivate_.sum_v = internal global i32 0, align 4 +// CHECK: @.omp.reduction..internal_pivate_.sum_v.1 = internal global i32 0, align 4 +// CHECK: @.omp.reduction..internal_pivate_.prod_v = internal global i32 1, align 4 //. // CHECK-LABEL: define dso_local void @_Z8func_redv( // CHECK-SAME: ) #[[ATTR0:[0-9]+]] { @@ -232,15 +234,15 @@ int main(void) { // CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP2]], 0 // CHECK-NEXT: br i1 [[TMP13]], label %[[INIT:.*]], label %[[INIT_END:.*]] // CHECK: [[INIT]]: -// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) @.omp.reduction..internal_private_var, i32 noundef 0) +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) @.omp.reduction..internal_pivate_.result, i32 noundef 0) // CHECK-NEXT: br label %[[INIT_END]] // CHECK: [[INIT_END]]: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) -// CHECK-NEXT: call void @.omp_combiner.(ptr noundef @.omp.reduction..internal_private_var, ptr noundef [[RESULT]]) +// CHECK-NEXT: call void @.omp_combiner.(ptr noundef @.omp.reduction..internal_pivate_.result, ptr noundef [[RESULT]]) // CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) -// CHECK-NEXT: [[TMP14:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_private_var, align 4 +// CHECK-NEXT: [[TMP14:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_pivate_.result, align 4 // CHECK-NEXT: store [[CLASS_SUM]] [[TMP14]], ptr [[RESULT]], align 4 // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP2]]) @@ -462,18 +464,18 @@ int main(void) { // CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP0]], 0 // CHECK-NEXT: br i1 [[TMP28]], label %[[INIT:.*]], label %[[INIT_END:.*]] // CHECK: [[INIT]]: -// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 @.omp.reduction..internal_private_var.1, i8 0, i64 4, i1 false) +// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 @.omp.reduction..internal_pivate_.sum_v, i8 0, i64 4, i1 false) // CHECK-NEXT: br label %[[INIT_END]] // CHECK: [[INIT_END]]: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) -// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.1, align 4 +// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v, align 4 // CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP7]], align 4 // CHECK-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] -// CHECK-NEXT: store i32 [[ADD12]], ptr @.omp.reduction..internal_private_var.1, align 4 +// CHECK-NEXT: store i32 [[ADD12]], ptr @.omp.reduction..internal_pivate_.sum_v, align 4 // CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.1, align 4 +// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v, align 4 // CHECK-NEXT: store i32 [[TMP31]], ptr [[TMP7]], align 4 // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: br label %[[OMP_PRECOND_END]] @@ -666,26 +668,36 @@ int main(void) { // CHECK-NEXT: [[TMP47:%.*]] = icmp eq i32 [[TMP0]], 0 // CHECK-NEXT: br i1 [[TMP47]], label %[[INIT:.*]], label %[[INIT_END:.*]] // CHECK: [[INIT]]: -// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 @.omp.reduction..internal_private_var.2, i8 0, i64 4, i1 false) +// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 @.omp.reduction..internal_pivate_.sum_v.1, i8 0, i64 4, i1 false) // CHECK-NEXT: br label %[[INIT_END]] // CHECK: [[INIT_END]]: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) -// CHECK-NEXT: [[TMP48:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.2, align 4 +// CHECK-NEXT: [[TMP48:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.1, align 4 // CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP9]], align 4 // CHECK-NEXT: [[ADD21:%.*]] = add nsw i32 [[TMP48]], [[TMP49]] -// CHECK-NEXT: store i32 [[ADD21]], ptr @.omp.reduction..internal_private_var.2, align 4 +// CHECK-NEXT: store i32 [[ADD21]], ptr @.omp.reduction..internal_pivate_.sum_v.1, align 4 // CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.1, align 4 +// CHECK-NEXT: store i32 [[TMP50]], ptr [[TMP9]], align 4 +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP51:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[TMP51]], label %[[INIT22:.*]], label %[[INIT_END23:.*]] +// CHECK: [[INIT22]]: +// CHECK-NEXT: store i32 1, ptr @.omp.reduction..internal_pivate_.prod_v, align 4 +// CHECK-NEXT: br label %[[INIT_END23]] +// CHECK: [[INIT_END23]]: +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) -// CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.2, align 4 -// CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP10]], align 4 -// CHECK-NEXT: [[MUL22:%.*]] = mul nsw i32 [[TMP50]], [[TMP51]] -// CHECK-NEXT: store i32 [[MUL22]], ptr @.omp.reduction..internal_private_var.2, align 4 +// CHECK-NEXT: [[TMP52:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v, align 4 +// CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[TMP10]], align 4 +// CHECK-NEXT: [[MUL24:%.*]] = mul nsw i32 [[TMP52]], [[TMP53]] +// CHECK-NEXT: store i32 [[MUL24]], ptr @.omp.reduction..internal_pivate_.prod_v, align 4 // CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP52:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.2, align 4 -// CHECK-NEXT: store i32 [[TMP52]], ptr [[TMP9]], align 4 -// CHECK-NEXT: store i32 [[TMP52]], ptr [[TMP10]], align 4 +// CHECK-NEXT: [[TMP54:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v, align 4 +// CHECK-NEXT: store i32 [[TMP54]], ptr [[TMP10]], align 4 // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: br label %[[OMP_PRECOND_END]] // CHECK: [[OMP_PRECOND_END]]: @@ -755,7 +767,7 @@ int main(void) { // CHECK-NEXT: store i32 0, ptr [[SUM_V_EXT]], align 4 // CHECK-NEXT: store i32 1, ptr [[PROD_V_EXT]], align 4 // CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4) -// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @main.omp_outlined.3, ptr [[V]], ptr [[SUM_V_EXT]], ptr [[PROD_V_EXT]]) +// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @main.omp_outlined.2, ptr [[V]], ptr [[SUM_V_EXT]], ptr [[PROD_V_EXT]]) // CHECK-NEXT: ret i32 0 // // From e1a19983c9beeb11b045794025bf5e36761184dc Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Thu, 8 May 2025 13:39:36 -0500 Subject: [PATCH 14/29] UDR init logic leveraged from emitInitWithReductionInitializer fn --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 35 ++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 9d13397ee9fb5..b87583049b253 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4986,9 +4986,38 @@ void CGOpenMPRuntime::emitPrivateReduction( auto EmitSharedInit = [&]() { if (UDR) { // Check if it's a User-Defined Reduction if (const Expr *UDRInitExpr = UDR->getInitializer()) { - // Use the initializer from the OMPDeclareReductionDecl - CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult, - PrivateType.getQualifiers(), true); + std::pair FnPair = + getUserDefinedReduction(UDR); + llvm::Function *InitializerFn = FnPair.second; + if (InitializerFn) { + if (const auto *CE = + dyn_cast(UDRInitExpr->IgnoreParenImpCasts())) { + const auto *OutDRE = cast( + cast(CE->getArg(0)->IgnoreParenImpCasts()) + ->getSubExpr()); + const VarDecl *OutVD = cast(OutDRE->getDecl()); + + CodeGenFunction::OMPPrivateScope LocalScope(CGF); + LocalScope.addPrivate(OutVD, SharedResult); + + (void)LocalScope.Privatize(); + if (const auto *OVE = dyn_cast( + CE->getCallee()->IgnoreParenImpCasts())) { + CodeGenFunction::OpaqueValueMapping OpaqueMap( + CGF, OVE, RValue::get(InitializerFn)); + CGF.EmitIgnoredExpr(CE); + } else { + CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult, + PrivateType.getQualifiers(), true); + } + } else { + CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult, + PrivateType.getQualifiers(), true); + } + } else { + CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult, + PrivateType.getQualifiers(), true); + } } else { // EmitNullInitialization handles default construction for C++ classes // and zeroing for scalars, which is a reasonable default. From efd69bb41b827690f46485d24194e5b1b6f98bb9 Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Fri, 9 May 2025 05:20:58 -0500 Subject: [PATCH 15/29] runtime tests --- .../for/omp_for_private_reduction.cpp | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp diff --git a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp new file mode 100644 index 0000000000000..0a3bbafd9331f --- /dev/null +++ b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp @@ -0,0 +1,93 @@ +//RUN: %libomp-cxx-compile -fopenmp-version=60 && %libomp-run +#include +#include +#include "omp_testsuite.h" + +#define N 10 +class Sum { + int val; + +public: + Sum(int v = 0) : val(v) {} + Sum operator+(const Sum &rhs) const { return Sum(val + rhs.val); } + Sum &operator+=(const Sum &rhs) { + val += rhs.val; + return *this; + } + int getValue() const { return val; } +}; + +// Declare OpenMP reduction +#pragma omp declare reduction(sum_reduction:Sum : omp_out += omp_in) \ + initializer(omp_priv = Sum(0)) + +int checkUserDefinedReduction() { + Sum final_result_udr(0); + Sum array_sum[N]; + int error_flag = 0; + int expected_value = 0; + for (int i = 0; i < N; ++i) { + array_sum[i] = Sum(i); + expected_value += i; // Calculate expected sum: 0 + 1 + ... + (N-1) + } +#pragma omp parallel num_threads(4) + { +#pragma omp for reduction(sum_reduction : final_result_udr) + for (int i = 0; i < N; ++i) { + final_result_udr += array_sum[i]; + } + + if (final_result_udr.getValue() != expected_value) + error_flag += 1; + } + return error_flag; +} + +void performReductions(int n_elements, const int *input_values, + int &sum_val_out, int &prod_val_out, + float &float_sum_val_out) { + // private variables for this thread's reduction. + sum_val_out = 0; + prod_val_out = 1; + float_sum_val_out = 0.0f; + + const float kPiValue = 3.14f; +#pragma omp for reduction(original(private), + : sum_val_out) \ + reduction(original(private), * : prod_val_out) \ + reduction(original(private), + : float_sum_val_out) + for (int i = 0; i < n_elements; ++i) { + sum_val_out += input_values[i]; + prod_val_out *= (i + 1); + float_sum_val_out += kPiValue; + } +} +int main(void) { + int input_array[N]; + int total_errors = 0; + const float kPiVal = 3.14f; + const int kExpectedSum = 45; // Sum of 0..9 + const int kExpectedProd = 3628800; // 10! + const float kExpectedFsum = kPiVal * N; // 3.14f * 10 + + for (int i = 0; i < N; i++) + input_array[i] = i; +#pragma omp parallel num_threads(4) + { + + int t_sum_v; + int t_prod_v; + float t_fsum_v; + performReductions(N, input_array, t_sum_v, t_prod_v, t_fsum_v); + if (t_sum_v != kExpectedSum) + total_errors++; + if (t_prod_v != kExpectedProd) + total_errors++; + if (t_fsum_v != kExpectedFsum) + total_errors++; + } + total_errors += checkUserDefinedReduction(); + if (total_errors != 0) + fprintf(stderr, "ERROR: reduction on private variable %d\n", total_errors); + + return total_errors; +} From c01671e8f164c59d1ff625f7610e8bcc0b341519 Mon Sep 17 00:00:00 2001 From: CHANDRA GHALE Date: Fri, 9 May 2025 15:56:23 +0530 Subject: [PATCH 16/29] Update omp_for_private_reduction.cpp --- .../runtime/test/worksharing/for/omp_for_private_reduction.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp index 0a3bbafd9331f..799955266cd67 100644 --- a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp +++ b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp @@ -30,7 +30,7 @@ int checkUserDefinedReduction() { array_sum[i] = Sum(i); expected_value += i; // Calculate expected sum: 0 + 1 + ... + (N-1) } -#pragma omp parallel num_threads(4) +#pragma omp parallel num_threads(4) private(final_result_udr) { #pragma omp for reduction(sum_reduction : final_result_udr) for (int i = 0; i < N; ++i) { From ad0d2f0d43a0783491d804b14f2d9af69dae1493 Mon Sep 17 00:00:00 2001 From: CHANDRA GHALE Date: Fri, 9 May 2025 16:02:19 +0530 Subject: [PATCH 17/29] Update omp_for_private_reduction.cpp Formating --- .../test/worksharing/for/omp_for_private_reduction.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp index 799955266cd67..faf85a82b6232 100644 --- a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp +++ b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp @@ -1,4 +1,4 @@ -//RUN: %libomp-cxx-compile -fopenmp-version=60 && %libomp-run +// RUN: %libomp-cxx-compile -fopenmp-version=60 && %libomp-run #include #include #include "omp_testsuite.h" @@ -65,8 +65,8 @@ int main(void) { int input_array[N]; int total_errors = 0; const float kPiVal = 3.14f; - const int kExpectedSum = 45; // Sum of 0..9 - const int kExpectedProd = 3628800; // 10! + const int kExpectedSum = 45; // Sum of 0..9 + const int kExpectedProd = 3628800; // 10! const float kExpectedFsum = kPiVal * N; // 3.14f * 10 for (int i = 0; i < N; i++) From 4df291081a6d0ffe1611f8717299d983838d5768 Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Fri, 9 May 2025 06:35:57 -0500 Subject: [PATCH 18/29] update test --- .../for/omp_for_private_reduction.cpp | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp index faf85a82b6232..f907719cf5f34 100644 --- a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp +++ b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp @@ -1,6 +1,7 @@ // RUN: %libomp-cxx-compile -fopenmp-version=60 && %libomp-run #include #include +#include #include "omp_testsuite.h" #define N 10 @@ -42,7 +43,20 @@ int checkUserDefinedReduction() { } return error_flag; } - +void performMinMaxRed(int &min_val, int &max_val) { + int input_data[] = {7, 3, 12, 5, 8}; + int n_size = sizeof(input_data) / sizeof(input_data[0]); + min_val = INT_MAX; + max_val = INT_MIN; +#pragma omp for reduction(original(private), min : min_val) \ + reduction(original(private), max : max_val) + for (int i = 0; i < n_size; ++i) { + if (input_data[i] < min_val) + min_val = input_data[i]; + if (input_data[i] > max_val) + max_val = input_data[i]; + } +} void performReductions(int n_elements, const int *input_values, int &sum_val_out, int &prod_val_out, float &float_sum_val_out) { @@ -68,6 +82,8 @@ int main(void) { const int kExpectedSum = 45; // Sum of 0..9 const int kExpectedProd = 3628800; // 10! const float kExpectedFsum = kPiVal * N; // 3.14f * 10 + const int kExpectedMin = 3; + const int kExpectedMax = 12; for (int i = 0; i < N; i++) input_array[i] = i; @@ -85,6 +101,16 @@ int main(void) { if (t_fsum_v != kExpectedFsum) total_errors++; } +#pragma omp parallel num_threads(4) + { + int t_min_v; + int t_max_v; + performMinMaxRed(t_min_v, t_max_v); + if (t_min_v != kExpectedMin) + total_errors++; + if (t_max_v != kExpectedMax) + total_errors++; + } total_errors += checkUserDefinedReduction(); if (total_errors != 0) fprintf(stderr, "ERROR: reduction on private variable %d\n", total_errors); From 2468be32feb8ffa0c3602ebeab1508527fa6d365 Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Fri, 9 May 2025 10:07:21 -0500 Subject: [PATCH 19/29] test update --- .../for/omp_for_private_reduction.cpp | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp index f907719cf5f34..8fffee71dc879 100644 --- a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp +++ b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp @@ -22,23 +22,37 @@ class Sum { #pragma omp declare reduction(sum_reduction:Sum : omp_out += omp_in) \ initializer(omp_priv = Sum(0)) +#pragma omp declare reduction(sum_pctor_reduction:Sum : omp_out += omp_in) \ + initializer(omp_priv = Sum(1)) // non-default ctor + int checkUserDefinedReduction() { Sum final_result_udr(0); + Sum final_result_udr_pctor(1); Sum array_sum[N]; int error_flag = 0; int expected_value = 0; + int expected_value_pctor = 0; for (int i = 0; i < N; ++i) { array_sum[i] = Sum(i); expected_value += i; // Calculate expected sum: 0 + 1 + ... + (N-1) + expected_value_pctor += i; } -#pragma omp parallel num_threads(4) private(final_result_udr) + int num_threads_for_pctor_calc = 4; // num_threads(4) + int priv_initializer_val_pctor = 1; // initializer(omp_priv = Sum(1)) + expected_value_pctor += + num_threads_for_pctor_calc + priv_initializer_val_pctor; +#pragma omp parallel num_threads(4) private(final_result_udr) private( \ + final_result_udr_pctor) { -#pragma omp for reduction(sum_reduction : final_result_udr) +#pragma omp for reduction(sum_reduction : final_result_udr) \ + reduction(sum_pctor_reduction : final_result_udr_pctor) for (int i = 0; i < N; ++i) { final_result_udr += array_sum[i]; + final_result_udr_pctor += array_sum[i]; } - if (final_result_udr.getValue() != expected_value) + if (final_result_udr.getValue() != expected_value || + final_result_udr_pctor.getValue() != expected_value_pctor) error_flag += 1; } return error_flag; From 9576c87a0a05caddaf27029a0f0667e20ab920d9 Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Fri, 9 May 2025 12:04:17 -0500 Subject: [PATCH 20/29] Resolve mergeconflict rel notes --- clang/docs/ReleaseNotes.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 67d56b9cb5ec9..0fec16f649f65 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -530,6 +530,12 @@ OpenMP Support - Added support 'no_openmp_constructs' assumption clause. - Added support for 'self_maps' in map and requirement clause. - Added support for 'omp stripe' directive. +- Fixed a crashing bug with ``omp unroll partial`` if the argument to + ``partial`` was an invalid expression. (#GH139267) +- Fixed a crashing bug with ``omp tile sizes`` if the argument to ``sizes`` was + an invalid expression. (#GH139073) +- Fixed a crashing bug with ``omp distribute dist_schedule`` if the argument to + ``dist_schedule`` was not strictly positive. (#GH139266) - Added support for reduction over private variable with 'reduction' clause. Improvements From 7e324bda008cb6f43ac83c840d5216ace19b145d Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Fri, 9 May 2025 12:10:37 -0500 Subject: [PATCH 21/29] Resolve mergeconflict rel notes --- clang/docs/ReleaseNotes.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 0fec16f649f65..b07e83377bd29 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -536,7 +536,6 @@ OpenMP Support an invalid expression. (#GH139073) - Fixed a crashing bug with ``omp distribute dist_schedule`` if the argument to ``dist_schedule`` was not strictly positive. (#GH139266) -- Added support for reduction over private variable with 'reduction' clause. Improvements ^^^^^^^^^^^^ From 262a861ce0d0cdc4b68dc1cb2afcd09b6026663e Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Fri, 9 May 2025 13:23:18 -0500 Subject: [PATCH 22/29] Release notes update --- clang/docs/ReleaseNotes.rst | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index b07e83377bd29..62e41a9f54f83 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -530,12 +530,7 @@ OpenMP Support - Added support 'no_openmp_constructs' assumption clause. - Added support for 'self_maps' in map and requirement clause. - Added support for 'omp stripe' directive. -- Fixed a crashing bug with ``omp unroll partial`` if the argument to - ``partial`` was an invalid expression. (#GH139267) -- Fixed a crashing bug with ``omp tile sizes`` if the argument to ``sizes`` was - an invalid expression. (#GH139073) -- Fixed a crashing bug with ``omp distribute dist_schedule`` if the argument to - ``dist_schedule`` was not strictly positive. (#GH139266) +- Added support for private variable reduction. Improvements ^^^^^^^^^^^^ From a0d29ab2fee9ffeeac18975e8a263e4dc29b2d6a Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Tue, 13 May 2025 09:40:57 -0500 Subject: [PATCH 23/29] address comments,support all types --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 58 ++++++++++++------- .../OpenMP/for_private_reduction_codegen.cpp | 7 +-- 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index b87583049b253..8be8c6603b0aa 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4935,13 +4935,34 @@ void CGOpenMPRuntime::emitPrivateReduction( if (const auto *DRE = dyn_cast(Privates)) { if (const auto *VD = dyn_cast(DRE->getDecl())) { const Expr *InitExpr = VD->getInit(); - if (InitExpr && !PrivateType->isAggregateType() && - !PrivateType->isAnyComplexType()) { + if (InitExpr) { Expr::EvalResult Result; if (InitExpr->EvaluateAsRValue(Result, CGF.getContext())) { APValue &InitValue = Result.Val; if (InitValue.isInt()) InitVal = llvm::ConstantInt::get(LLVMType, InitValue.getInt()); + else if (InitValue.isFloat()) + InitVal = llvm::ConstantFP::get(LLVMType, InitValue.getFloat()); + else if (InitValue.isComplexInt()) { + // For complex int: create struct { real, imag } + llvm::Constant *Real = llvm::ConstantInt::get( + cast(LLVMType)->getElementType(0), + InitValue.getComplexIntReal()); + llvm::Constant *Imag = llvm::ConstantInt::get( + cast(LLVMType)->getElementType(1), + InitValue.getComplexIntImag()); + InitVal = llvm::ConstantStruct::get( + cast(LLVMType), {Real, Imag}); + } else if (InitValue.isComplexFloat()) { + llvm::Constant *Real = llvm::ConstantFP::get( + cast(LLVMType)->getElementType(0), + InitValue.getComplexFloatReal()); + llvm::Constant *Imag = llvm::ConstantFP::get( + cast(LLVMType)->getElementType(1), + InitValue.getComplexFloatImag()); + InitVal = llvm::ConstantStruct::get( + cast(LLVMType), {Real, Imag}); + } } } } @@ -4950,11 +4971,10 @@ void CGOpenMPRuntime::emitPrivateReduction( InitVal = llvm::Constant::getNullValue(LLVMType); } std::string ReductionVarNameStr; - if (const auto *DRE = dyn_cast(Privates->IgnoreParenCasts())) { + if (const auto *DRE = dyn_cast(Privates->IgnoreParenCasts())) ReductionVarNameStr = DRE->getDecl()->getNameAsString(); - } else { + else ReductionVarNameStr = "unnamed_priv_var"; - } // Create an internal shared variable std::string SharedName = @@ -5027,18 +5047,11 @@ void CGOpenMPRuntime::emitPrivateReduction( } if (const auto *DRE = dyn_cast(Privates)) { if (const auto *VD = dyn_cast(DRE->getDecl())) { - const Expr *InitExpr = VD->getInit(); - if (InitExpr && (PrivateType->isAggregateType() || - PrivateType->isAnyComplexType())) { + if (const Expr *InitExpr = VD->getInit()) { CGF.EmitAnyExprToMem(InitExpr, SharedResult, PrivateType.getQualifiers(), true); return; } - if (!InitVal->isNullValue()) { - CGF.EmitStoreOfScalar(InitVal, - CGF.MakeAddrLValue(SharedResult, PrivateType)); - return; - } } } CGF.EmitNullInitialization(SharedResult, PrivateType); @@ -5086,9 +5099,8 @@ void CGOpenMPRuntime::emitPrivateReduction( } }; EmitCriticalReduction(ReductionGen); - } - // Handle built-in reduction operations. - else { + } else { + // Handle built-in reduction operations. const Expr *ReductionClauseExpr = ReductionOp->IgnoreParenCasts(); if (const auto *Cleanup = dyn_cast(ReductionClauseExpr)) ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts(); @@ -5106,10 +5118,6 @@ void CGOpenMPRuntime::emitPrivateReduction( if (!AssignRHS) return; - const Expr *CombinerExpr = AssignRHS->IgnoreParenImpCasts(); - if (const auto *MTE = dyn_cast(CombinerExpr)) - CombinerExpr = MTE->getSubExpr()->IgnoreParenImpCasts(); - auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) { Action.Enter(CGF); const auto *OmpOutDRE = @@ -5414,7 +5422,7 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, } else { // Emit as a critical region. auto &&CritRedGen = [E, Loc](CodeGenFunction &CGF, const Expr *, - const Expr *, const Expr *) { + const Expr *, const Expr *) { CGOpenMPRuntime &RT = CGF.CGM.getOpenMPRuntime(); std::string Name = RT.getName({"atomic_reduction"}); RT.emitCriticalRegion( @@ -5467,6 +5475,14 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, if (LHSExprs.size() != Privates.size() || LHSExprs.size() != ReductionOps.size()) return; + assert(!LHSExprs.empty() && "PrivateVarReduction: LHSExprs is empty"); + assert(!Privates.empty() && "PrivateVarReduction: Privates is empty"); + assert(!ReductionOps.empty() && + "PrivateVarReduction: ReductionOps is empty"); + assert(LHSExprs.size() == Privates.size() && + "PrivateVarReduction: Privates size mismatch"); + assert(LHSExprs.size() == ReductionOps.size() && + "PrivateVarReduction: ReductionOps size mismatch"); for (unsigned I : llvm::seq(std::min(ReductionOps.size(), LHSExprs.size()))) { emitPrivateReduction(CGF, Loc, Privates[I], LHSExprs[I], RHSExprs[I], diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp index 8532bf4125787..e3327d513e1af 100644 --- a/clang/test/OpenMP/for_private_reduction_codegen.cpp +++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp @@ -464,7 +464,7 @@ int main(void) { // CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP0]], 0 // CHECK-NEXT: br i1 [[TMP28]], label %[[INIT:.*]], label %[[INIT_END:.*]] // CHECK: [[INIT]]: -// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 @.omp.reduction..internal_pivate_.sum_v, i8 0, i64 4, i1 false) +// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_pivate_.sum_v, align 4 // CHECK-NEXT: br label %[[INIT_END]] // CHECK: [[INIT_END]]: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) @@ -668,7 +668,7 @@ int main(void) { // CHECK-NEXT: [[TMP47:%.*]] = icmp eq i32 [[TMP0]], 0 // CHECK-NEXT: br i1 [[TMP47]], label %[[INIT:.*]], label %[[INIT_END:.*]] // CHECK: [[INIT]]: -// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 @.omp.reduction..internal_pivate_.sum_v.1, i8 0, i64 4, i1 false) +// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_pivate_.sum_v.1, align 4 // CHECK-NEXT: br label %[[INIT_END]] // CHECK: [[INIT_END]]: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) @@ -734,7 +734,7 @@ int main(void) { // // // CHECK-LABEL: define dso_local noundef i32 @main( -// CHECK-SAME: ) #[[ATTR8:[0-9]+]] { +// CHECK-SAME: ) #[[ATTR7:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[V:%.*]] = alloca [10 x i32], align 16 @@ -785,4 +785,3 @@ int main(void) { // CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 0 // CHECK-NEXT: call void @_Z6do_rediPiRi(i32 noundef 10, ptr noundef [[ARRAYDECAY]], ptr noundef nonnull align 4 dereferenceable(4) [[S_V]]) // CHECK-NEXT: ret void - From 0c2978c21d758bc9c3af9832f42d07265cbc870d Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Tue, 13 May 2025 11:37:11 -0500 Subject: [PATCH 24/29] complex type test for priv redn --- .../for/omp_for_private_reduction.cpp | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp index 8fffee71dc879..824a8f6e7ec9b 100644 --- a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp +++ b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp @@ -2,6 +2,8 @@ #include #include #include +#include +#include #include "omp_testsuite.h" #define N 10 @@ -71,6 +73,33 @@ void performMinMaxRed(int &min_val, int &max_val) { max_val = input_data[i]; } } +int performComplexReduction() { + double _Complex arr[N]; + double _Complex expected = 0.0 + 0.0 * I; + double _Complex result = 0.0 + 0.0 * I; + int error = 0; + + // Initialize the array and compute serial sum + for (int i = 0; i < N; ++i) { + arr[i] = i - i * I; + expected += arr[i]; + } + double real_sum = 0.0, imag_sum = 0.0; +#pragma omp parallel private(real_sum) private(imag_sum) + { +#pragma omp for reduction(+ : real_sum, imag_sum) + for (int i = 0; i < N; ++i) { + real_sum += creal(arr[i]); + imag_sum += cimag(arr[i]); + } + + result = real_sum + imag_sum * I; + if (cabs(result - expected) > 1e-6) { + error++; + } + } + return error; +} void performReductions(int n_elements, const int *input_values, int &sum_val_out, int &prod_val_out, float &float_sum_val_out) { @@ -126,6 +155,7 @@ int main(void) { total_errors++; } total_errors += checkUserDefinedReduction(); + total_errors += performComplexReduction(); if (total_errors != 0) fprintf(stderr, "ERROR: reduction on private variable %d\n", total_errors); From 384cd4af85bc48eb7df493b212643536586acbcc Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Wed, 14 May 2025 06:27:01 -0500 Subject: [PATCH 25/29] add addtional complex test --- .../for/omp_for_private_reduction.cpp | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp index 824a8f6e7ec9b..9727bbeda7885 100644 --- a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp +++ b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp @@ -100,6 +100,20 @@ int performComplexReduction() { } return error; } + +std::complex doComplexReduction(std::complex *arr) { + std::complex result(1, 0); + +#pragma omp declare reduction(* : std::complex : omp_out *= omp_in) \ + initializer(omp_priv = std::complex(1, 0)) + +#pragma omp for reduction(original(private), * : result) + for (int i = 0; i < N; ++i) + result *= arr[i]; + + return result; +} + void performReductions(int n_elements, const int *input_values, int &sum_val_out, int &prod_val_out, float &float_sum_val_out) { @@ -127,6 +141,14 @@ int main(void) { const float kExpectedFsum = kPiVal * N; // 3.14f * 10 const int kExpectedMin = 3; const int kExpectedMax = 12; + std::complex arr[N]; + std::complex kExpectedComplex(1, 0); + // Initialize the array + for (int i = 1; i <= N; ++i) { + arr[i - 1] = std::complex( + 1.0 + 0.1 * i, 0.5 * i); // Avoid zero to prevent multiplication by zero + kExpectedComplex *= arr[i - 1]; + } for (int i = 0; i < N; i++) input_array[i] = i; @@ -156,6 +178,15 @@ int main(void) { } total_errors += checkUserDefinedReduction(); total_errors += performComplexReduction(); + #pragma omp parallel num_threads(4) + { + std::complex result(1, 0); + result = doComplexReduction(arr); + if (std::abs(result.real() - kExpectedComplex.real()) > 1e-6 || + std::abs(result.imag() - kExpectedComplex.imag()) > 1e-6) { + total_errors++; + } + } if (total_errors != 0) fprintf(stderr, "ERROR: reduction on private variable %d\n", total_errors); From 0b59740467798131042a37ba4ed5fd2bcc3137f8 Mon Sep 17 00:00:00 2001 From: CHANDRA GHALE Date: Wed, 14 May 2025 17:09:52 +0530 Subject: [PATCH 26/29] format error fix --- .../runtime/test/worksharing/for/omp_for_private_reduction.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp index 9727bbeda7885..9bf3be1e9e45d 100644 --- a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp +++ b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp @@ -178,7 +178,7 @@ int main(void) { } total_errors += checkUserDefinedReduction(); total_errors += performComplexReduction(); - #pragma omp parallel num_threads(4) +#pragma omp parallel num_threads(4) { std::complex result(1, 0); result = doComplexReduction(arr); From 4c36ba7540c451f1bdaa7320446ef64f1c2ddd7b Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Fri, 23 May 2025 11:14:33 -0500 Subject: [PATCH 27/29] Format fix --- clang/lib/CodeGen/CGStmtOpenMP.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 5f415074b41e7..ff09ec8ee494c 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -3947,7 +3947,7 @@ static void emitScanBasedDirective( CGF.CGM.getOpenMPRuntime().emitReduction( CGF, S.getEndLoc(), Privates, LHSs, RHSs, ReductionOps, {/*WithNowait=*/true, /*SimpleReduction=*/true, - /*IsPrivateVarReduction*/{}, OMPD_unknown}); + /*IsPrivateVarReduction*/ {}, OMPD_unknown}); } llvm::Value *NextIVal = CGF.Builder.CreateNUWSub(IVal, llvm::ConstantInt::get(CGF.SizeTy, 1)); @@ -5753,7 +5753,7 @@ void CodeGenFunction::EmitOMPScanDirective(const OMPScanDirective &S) { CGM.getOpenMPRuntime().emitReduction( *this, ParentDir.getEndLoc(), Privates, LHSs, RHSs, ReductionOps, {/*WithNowait=*/true, /*SimpleReduction=*/true, - /*IsPrivateVarReduction*/{}, OMPD_simd}); + /*IsPrivateVarReduction*/ {}, OMPD_simd}); for (unsigned I = 0, E = CopyArrayElems.size(); I < E; ++I) { const Expr *PrivateExpr = Privates[I]; LValue DestLVal; From b146a1a27b2891df6fd193ffbc9f25e6f254ae2d Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Fri, 30 May 2025 07:54:25 -0500 Subject: [PATCH 28/29] Few more fixes with ref from spec --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 63 ++++-- clang/lib/Sema/SemaOpenMP.cpp | 41 ++-- .../OpenMP/distribute_simd_misc_messages.c | 3 +- .../OpenMP/for_private_reduction_codegen.cpp | 183 ++++++------------ clang/test/OpenMP/for_reduction_messages.cpp | 2 + .../OpenMP/for_simd_reduction_messages.cpp | 2 +- .../OpenMP/sections_reduction_messages.cpp | 2 +- 7 files changed, 136 insertions(+), 160 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index ce2f45617f87c..354c46a064b25 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4915,12 +4915,12 @@ void CGOpenMPRuntime::emitPrivateReduction( // - Thread enters critical section. // - Reads its private value from LHSExprs[i]. // - Updates __shared_reduction_var[i] = RedOp_i(__shared_reduction_var[i], - // LHSExprs[i]). + // Privates[i]). // - Exits critical section. // // Call __kmpc_barrier after combining. // - // Each thread copies __shared_reduction_var[i] back to LHSExprs[i]. + // Each thread copies __shared_reduction_var[i] back to RHSExprs[i]. // // Final __kmpc_barrier to synchronize after broadcasting QualType PrivateType = Privates->getType(); @@ -5025,7 +5025,7 @@ void CGOpenMPRuntime::emitPrivateReduction( const Expr *ReductionOp = ReductionOps; const OMPDeclareReductionDecl *CurrentUDR = getReductionInit(ReductionOp); LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType); - LValue LHSLV = CGF.EmitLValue(LHSExprs); + LValue LHSLV = CGF.EmitLValue(Privates); auto EmitCriticalReduction = [&](auto ReductionGen) { std::string CriticalName = getName({"reduction_critical"}); @@ -5114,7 +5114,7 @@ void CGOpenMPRuntime::emitPrivateReduction( else FinalResultVal = CGF.EmitLoadOfScalar(SharedLV1, Loc); - LValue TargetLHSLV = CGF.EmitLValue(LHSExprs); + LValue TargetLHSLV = CGF.EmitLValue(RHSExprs); if (IsAggregate) { CGF.EmitAggregateCopy(TargetLHSLV, CGF.MakeAddrLValue(FinalResultAddr, PrivateType), @@ -5126,13 +5126,23 @@ void CGOpenMPRuntime::emitPrivateReduction( CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( CGM.getModule(), OMPRTL___kmpc_barrier), BarrierArgs); + + // Combiner with original list item + auto OriginalListCombiner = [&](CodeGenFunction &CGF, + PrePostActionTy &Action) { + Action.Enter(CGF); + emitSingleReductionCombiner(CGF, ReductionOps, Privates, + cast(LHSExprs), + cast(RHSExprs)); + }; + EmitCriticalReduction(OriginalListCombiner); } void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, - ArrayRef Privates, - ArrayRef LHSExprs, - ArrayRef RHSExprs, - ArrayRef ReductionOps, + ArrayRef OrgPrivates, + ArrayRef OrgLHSExprs, + ArrayRef OrgRHSExprs, + ArrayRef OrgReductionOps, ReductionOptionsTy Options) { if (!CGF.HaveInsertPoint()) return; @@ -5179,10 +5189,10 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, if (SimpleReduction) { CodeGenFunction::RunCleanupsScope Scope(CGF); - const auto *IPriv = Privates.begin(); - const auto *ILHS = LHSExprs.begin(); - const auto *IRHS = RHSExprs.begin(); - for (const Expr *E : ReductionOps) { + const auto *IPriv = OrgPrivates.begin(); + const auto *ILHS = OrgLHSExprs.begin(); + const auto *IRHS = OrgRHSExprs.begin(); + for (const Expr *E : OrgReductionOps) { emitSingleReductionCombiner(CGF, E, *IPriv, cast(*ILHS), cast(*IRHS)); ++IPriv; @@ -5192,6 +5202,26 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, return; } + // Filter out shared reduction variables based on IsPrivateVarReduction flag. + // Only keep entries where the corresponding variable is not private. + SmallVector FilteredPrivates, FilteredLHSExprs, + FilteredRHSExprs, FilteredReductionOps; + for (unsigned I : llvm::seq( + std::min(OrgReductionOps.size(), OrgLHSExprs.size()))) { + if (!Options.IsPrivateVarReduction[I]) { + FilteredPrivates.emplace_back(OrgPrivates[I]); + FilteredLHSExprs.emplace_back(OrgLHSExprs[I]); + FilteredRHSExprs.emplace_back(OrgRHSExprs[I]); + FilteredReductionOps.emplace_back(OrgReductionOps[I]); + } + } + // Wrap filtered vectors in ArrayRef for downstream shared reduction + // processing. + ArrayRef Privates = FilteredPrivates; + ArrayRef LHSExprs = FilteredLHSExprs; + ArrayRef RHSExprs = FilteredRHSExprs; + ArrayRef ReductionOps = FilteredReductionOps; + // 1. Build a list of reduction variables. // void *RedList[] = {[0], ..., [-1]}; auto Size = RHSExprs.size(); @@ -5439,11 +5469,12 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, "PrivateVarReduction: ReductionOps size mismatch"); assert(LHSExprs.size() == Options.IsPrivateVarReduction.size() && "PrivateVarReduction: IsPrivateVarReduction size mismatch"); - for (unsigned I : - llvm::seq(std::min(ReductionOps.size(), LHSExprs.size()))) { + + for (unsigned I : llvm::seq( + std::min(OrgReductionOps.size(), OrgLHSExprs.size()))) { if (Options.IsPrivateVarReduction[I]) - emitPrivateReduction(CGF, Loc, Privates[I], LHSExprs[I], RHSExprs[I], - ReductionOps[I]); + emitPrivateReduction(CGF, Loc, OrgPrivates[I], OrgLHSExprs[I], + OrgRHSExprs[I], OrgReductionOps[I]); } } diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index f16f841d62edd..dbc46c898a0a8 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -19015,34 +19015,14 @@ static bool actOnOMPReductionKindClause( reportOriginalDsa(S, Stack, D, DVar); continue; } - // OpenMP 6.0 [ 7.6.10 ] - // Support Reduction over private variables with reduction clause. - // A list item in a reduction clause can now be private in the enclosing - // context. For orphaned constructs it is assumed to be shared unless the - // original(private) modifier appears in the clause. - DVar = Stack->getImplicitDSA(D, true); - bool IsOrphaned = false; - OpenMPDirectiveKind CurrDir = Stack->getCurrentDirective(); - OpenMPDirectiveKind ParentDir = Stack->getParentDirective(); - // Check if the construct is orphaned (has no enclosing OpenMP context) - IsOrphaned = ParentDir == OMPD_unknown; - // OpenMP 6.0: Private DSA check - IsPrivate = - (S.getLangOpts().OpenMP > 52) && - ((isOpenMPPrivate(DVar.CKind) && DVar.CKind != OMPC_reduction && - isOpenMPWorksharingDirective(CurrDir) && - !isOpenMPParallelDirective(CurrDir) && - !isOpenMPTeamsDirective(CurrDir) && - !isOpenMPSimdDirective(ParentDir)) || - (IsOrphaned && DVar.CKind == OMPC_unknown) || - RD.OrigSharingModifier != OMPC_ORIGINAL_SHARING_shared); // OpenMP [2.14.3.6, Restrictions, p.1] // A list item that appears in a reduction clause of a worksharing // construct must be shared in the parallel regions to which any of the // worksharing regions arising from the worksharing construct bind. - if (!IsPrivate && isOpenMPWorksharingDirective(CurrDir) && + if (S.getLangOpts().OpenMP <= 52 && + isOpenMPWorksharingDirective(CurrDir) && !isOpenMPParallelDirective(CurrDir) && !isOpenMPTeamsDirective(CurrDir)) { DVar = Stack->getImplicitDSA(D, true); @@ -19053,6 +19033,23 @@ static bool actOnOMPReductionKindClause( reportOriginalDsa(S, Stack, D, DVar); continue; } + } else if (isOpenMPWorksharingDirective(CurrDir) && + !isOpenMPParallelDirective(CurrDir) && + !isOpenMPTeamsDirective(CurrDir)) { + // OpenMP 6.0 [ 7.6.10 ] + // Support Reduction over private variables with reduction clause. + // A list item in a reduction clause can now be private in the enclosing + // context. For orphaned constructs it is assumed to be shared unless + // the original(private) modifier appears in the clause. + DVar = Stack->getImplicitDSA(D, true); + // Determine if the variable should be considered private + IsPrivate = DVar.CKind != OMPC_shared; + bool IsOrphaned = false; + OpenMPDirectiveKind ParentDir = Stack->getParentDirective(); + IsOrphaned = ParentDir == OMPD_unknown; + if ((IsOrphaned && + RD.OrigSharingModifier == OMPC_ORIGINAL_SHARING_private)) + IsPrivate = true; } } else { // Threadprivates cannot be shared between threads, so dignose if the base diff --git a/clang/test/OpenMP/distribute_simd_misc_messages.c b/clang/test/OpenMP/distribute_simd_misc_messages.c index 8cbf96cd7a014..270e17dcb89bb 100644 --- a/clang/test/OpenMP/distribute_simd_misc_messages.c +++ b/clang/test/OpenMP/distribute_simd_misc_messages.c @@ -508,6 +508,7 @@ void test_collapse(void) { #pragma omp distribute simd collapse(5 - 5) for (i = 0; i < 16; ++i) ; +#if defined(_OPENMP) && (_OPENMP <= 202111) // expected-note@+3 2 {{defined as reduction}} #pragma omp target #pragma omp teams @@ -520,7 +521,7 @@ void test_collapse(void) { #pragma omp for reduction(+ : i, j) for (int k = 0; k < 16; ++k) i += j; - +#endif #pragma omp target #pragma omp teams for (i = 0; i < 16; ++i) diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp index 57f1eeeaa6b10..318bd1f577bbf 100644 --- a/clang/test/OpenMP/for_private_reduction_codegen.cpp +++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp @@ -74,9 +74,8 @@ int main(void) { // CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 // CHECK: @.gomp_critical_user_.reduction.var = common global [8 x i32] zeroinitializer, align 8 // CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 18, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 -// CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 -// CHECK: @.gomp_critical_user_.atomic_reduction.var = common global [8 x i32] zeroinitializer, align 8 // CHECK: @.omp.reduction..internal_pivate_.result = common global %class.Sum zeroinitializer, align 4 +// CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 // CHECK: @.gomp_critical_user_.reduction_critical.var = common global [8 x i32] zeroinitializer, align 8 // CHECK: @[[GLOB4:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 // CHECK: @.omp.reduction..internal_pivate_.sum_v = common global i32 0, align 4 @@ -155,7 +154,7 @@ int main(void) { // CHECK-NEXT: [[RESULT1:%.*]] = alloca [[CLASS_SUM]], align 4 // CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4 -// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [0 x ptr], align 8 // CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK-NEXT: store ptr [[ARRAY]], ptr [[ARRAY_ADDR]], align 8 @@ -212,38 +211,35 @@ int main(void) { // CHECK-NEXT: br label %[[OMP_LOOP_EXIT:.*]] // CHECK: [[OMP_LOOP_EXIT]]: // CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]]) -// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK-NEXT: store ptr [[RESULT1]], ptr [[TMP11]], align 8 -// CHECK-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z8func_redv.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) -// CHECK-NEXT: switch i32 [[TMP12]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [ +// CHECK-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z8func_redv.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: switch i32 [[TMP11]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [ // CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]] // CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]] // CHECK-NEXT: ] // CHECK: [[_OMP_REDUCTION_CASE1:.*:]] -// CHECK-NEXT: call void @.omp_combiner.(ptr noundef [[RESULT]], ptr noundef [[RESULT1]]) // CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var) // CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] // CHECK: [[_OMP_REDUCTION_CASE2:.*:]] -// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.atomic_reduction.var) -// CHECK-NEXT: call void @.omp_combiner.(ptr noundef [[RESULT]], ptr noundef [[RESULT1]]) -// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.atomic_reduction.var) // CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var) // CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] // CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]] -// CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP2]], 0 -// CHECK-NEXT: br i1 [[TMP13]], label %[[INIT:.*]], label %[[INIT_END:.*]] +// CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[TMP12]], label %[[INIT:.*]], label %[[INIT_END:.*]] // CHECK: [[INIT]]: // CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) @.omp.reduction..internal_pivate_.result, i32 noundef 0) // CHECK-NEXT: br label %[[INIT_END]] // CHECK: [[INIT_END]]: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) -// CHECK-NEXT: call void @.omp_combiner.(ptr noundef @.omp.reduction..internal_pivate_.result, ptr noundef [[RESULT]]) +// CHECK-NEXT: call void @.omp_combiner.(ptr noundef @.omp.reduction..internal_pivate_.result, ptr noundef [[RESULT1]]) // CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) -// CHECK-NEXT: [[TMP14:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_pivate_.result, align 4 -// CHECK-NEXT: store [[CLASS_SUM]] [[TMP14]], ptr [[RESULT]], align 4 +// CHECK-NEXT: [[TMP13:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_pivate_.result, align 4 +// CHECK-NEXT: store [[CLASS_SUM]] [[TMP13]], ptr [[RESULT1]], align 4 // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: call void @.omp_combiner.(ptr noundef [[RESULT]], ptr noundef [[RESULT1]]) +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP2]]) // CHECK-NEXT: ret void // @@ -322,11 +318,6 @@ int main(void) { // CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8 // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0 -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0 -// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 -// CHECK-NEXT: call void @.omp_combiner.(ptr noundef [[TMP7]], ptr noundef [[TMP5]]) // CHECK-NEXT: ret void // // @@ -363,7 +354,7 @@ int main(void) { // CHECK-NEXT: [[SUM_V4:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[_TMP5:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[I6:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [0 x ptr], align 8 // CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) // CHECK-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 // CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8 @@ -440,43 +431,41 @@ int main(void) { // CHECK-NEXT: br label %[[OMP_LOOP_EXIT:.*]] // CHECK: [[OMP_LOOP_EXIT]]: // CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK-NEXT: store ptr [[SUM_V4]], ptr [[TMP22]], align 8 -// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z6do_rediPiRi.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) -// CHECK-NEXT: switch i32 [[TMP23]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [ +// CHECK-NEXT: [[TMP22:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z6do_rediPiRi.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: switch i32 [[TMP22]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [ // CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]] // CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]] // CHECK-NEXT: ] // CHECK: [[_OMP_REDUCTION_CASE1:.*:]] -// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP7]], align 4 -// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SUM_V4]], align 4 -// CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK-NEXT: store i32 [[ADD11]], ptr [[TMP7]], align 4 // CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) // CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] // CHECK: [[_OMP_REDUCTION_CASE2:.*:]] -// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[SUM_V4]], align 4 -// CHECK-NEXT: [[TMP27:%.*]] = atomicrmw add ptr [[TMP7]], i32 [[TMP26]] monotonic, align 4 // CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) // CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] // CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]] -// CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP0]], 0 -// CHECK-NEXT: br i1 [[TMP28]], label %[[INIT:.*]], label %[[INIT_END:.*]] +// CHECK-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[TMP23]], label %[[INIT:.*]], label %[[INIT_END:.*]] // CHECK: [[INIT]]: // CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_pivate_.sum_v, align 4 // CHECK-NEXT: br label %[[INIT_END]] // CHECK: [[INIT_END]]: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) -// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v, align 4 -// CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP7]], align 4 -// CHECK-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP29]], [[TMP30]] -// CHECK-NEXT: store i32 [[ADD12]], ptr @.omp.reduction..internal_pivate_.sum_v, align 4 +// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v, align 4 +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SUM_V4]], align 4 +// CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK-NEXT: store i32 [[ADD11]], ptr @.omp.reduction..internal_pivate_.sum_v, align 4 // CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v, align 4 -// CHECK-NEXT: store i32 [[TMP31]], ptr [[TMP7]], align 4 +// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v, align 4 +// CHECK-NEXT: store i32 [[TMP26]], ptr [[SUM_V4]], align 4 // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP7]], align 4 +// CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[SUM_V4]], align 4 +// CHECK-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP27]], [[TMP28]] +// CHECK-NEXT: store i32 [[ADD12]], ptr [[TMP7]], align 4 +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) // CHECK-NEXT: br label %[[OMP_PRECOND_END]] // CHECK: [[OMP_PRECOND_END]]: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]]) @@ -492,14 +481,6 @@ int main(void) { // CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8 // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0 -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0 -// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 -// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4 -// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]] -// CHECK-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4 // CHECK-NEXT: ret void // // @@ -526,9 +507,7 @@ int main(void) { // CHECK-NEXT: [[PROD_V7:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[_TMP8:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[I9:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 8 -// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 -// CHECK-NEXT: [[_TMP19:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [0 x ptr], align 8 // CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) // CHECK-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 // CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8 @@ -622,82 +601,64 @@ int main(void) { // CHECK-NEXT: br label %[[OMP_LOOP_EXIT:.*]] // CHECK: [[OMP_LOOP_EXIT]]: // CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 -// CHECK-NEXT: store ptr [[SUM_V5]], ptr [[TMP30]], align 8 -// CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 -// CHECK-NEXT: store ptr [[PROD_V7]], ptr [[TMP31]], align 8 -// CHECK-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 2, i64 16, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z15do_red_extendediPiRiS0_.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) -// CHECK-NEXT: switch i32 [[TMP32]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [ +// CHECK-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z15do_red_extendediPiRiS0_.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: switch i32 [[TMP30]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [ // CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]] -// CHECK-NEXT: i32 2, label %[[DOTOMP_REDUCTION_CASE2:.*]] +// CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]] // CHECK-NEXT: ] // CHECK: [[_OMP_REDUCTION_CASE1:.*:]] -// CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP9]], align 4 -// CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[SUM_V5]], align 4 -// CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP33]], [[TMP34]] -// CHECK-NEXT: store i32 [[ADD17]], ptr [[TMP9]], align 4 -// CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP10]], align 4 -// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[PROD_V7]], align 4 -// CHECK-NEXT: [[MUL18:%.*]] = mul nsw i32 [[TMP35]], [[TMP36]] -// CHECK-NEXT: store i32 [[MUL18]], ptr [[TMP10]], align 4 // CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) // CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] // CHECK: [[_OMP_REDUCTION_CASE2:.*:]] -// CHECK-NEXT: [[TMP37:%.*]] = load i32, ptr [[SUM_V5]], align 4 -// CHECK-NEXT: [[TMP38:%.*]] = atomicrmw add ptr [[TMP9]], i32 [[TMP37]] monotonic, align 4 -// CHECK-NEXT: [[TMP39:%.*]] = load i32, ptr [[PROD_V7]], align 4 -// CHECK-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic i32, ptr [[TMP10]] monotonic, align 4 -// CHECK-NEXT: br label %[[ATOMIC_CONT:.*]] -// CHECK: [[ATOMIC_CONT]]: -// CHECK-NEXT: [[TMP40:%.*]] = phi i32 [ [[ATOMIC_LOAD]], %[[DOTOMP_REDUCTION_CASE2]] ], [ [[TMP45:%.*]], %[[ATOMIC_CONT]] ] -// CHECK-NEXT: store i32 [[TMP40]], ptr [[_TMP19]], align 4 -// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[_TMP19]], align 4 -// CHECK-NEXT: [[TMP42:%.*]] = load i32, ptr [[PROD_V7]], align 4 -// CHECK-NEXT: [[MUL20:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]] -// CHECK-NEXT: store i32 [[MUL20]], ptr [[ATOMIC_TEMP]], align 4 -// CHECK-NEXT: [[TMP43:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 -// CHECK-NEXT: [[TMP44:%.*]] = cmpxchg ptr [[TMP10]], i32 [[TMP40]], i32 [[TMP43]] monotonic monotonic, align 4 -// CHECK-NEXT: [[TMP45]] = extractvalue { i32, i1 } [[TMP44]], 0 -// CHECK-NEXT: [[TMP46:%.*]] = extractvalue { i32, i1 } [[TMP44]], 1 -// CHECK-NEXT: br i1 [[TMP46]], label %[[ATOMIC_EXIT:.*]], label %[[ATOMIC_CONT]] -// CHECK: [[ATOMIC_EXIT]]: // CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) // CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] // CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]] -// CHECK-NEXT: [[TMP47:%.*]] = icmp eq i32 [[TMP0]], 0 -// CHECK-NEXT: br i1 [[TMP47]], label %[[INIT:.*]], label %[[INIT_END:.*]] +// CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[TMP31]], label %[[INIT:.*]], label %[[INIT_END:.*]] // CHECK: [[INIT]]: // CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_pivate_.sum_v, align 4 // CHECK-NEXT: br label %[[INIT_END]] // CHECK: [[INIT_END]]: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) -// CHECK-NEXT: [[TMP48:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v, align 4 -// CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP9]], align 4 -// CHECK-NEXT: [[ADD21:%.*]] = add nsw i32 [[TMP48]], [[TMP49]] -// CHECK-NEXT: store i32 [[ADD21]], ptr @.omp.reduction..internal_pivate_.sum_v, align 4 +// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v, align 4 +// CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[SUM_V5]], align 4 +// CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] +// CHECK-NEXT: store i32 [[ADD17]], ptr @.omp.reduction..internal_pivate_.sum_v, align 4 // CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v, align 4 -// CHECK-NEXT: store i32 [[TMP50]], ptr [[TMP9]], align 4 +// CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v, align 4 +// CHECK-NEXT: store i32 [[TMP34]], ptr [[SUM_V5]], align 4 // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP51:%.*]] = icmp eq i32 [[TMP0]], 0 -// CHECK-NEXT: br i1 [[TMP51]], label %[[INIT22:.*]], label %[[INIT_END23:.*]] -// CHECK: [[INIT22]]: +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP9]], align 4 +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[SUM_V5]], align 4 +// CHECK-NEXT: [[ADD18:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK-NEXT: store i32 [[ADD18]], ptr [[TMP9]], align 4 +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: [[TMP37:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[TMP37]], label %[[INIT19:.*]], label %[[INIT_END20:.*]] +// CHECK: [[INIT19]]: // CHECK-NEXT: store i32 1, ptr @.omp.reduction..internal_pivate_.prod_v, align 4 -// CHECK-NEXT: br label %[[INIT_END23]] -// CHECK: [[INIT_END23]]: +// CHECK-NEXT: br label %[[INIT_END20]] +// CHECK: [[INIT_END20]]: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) -// CHECK-NEXT: [[TMP52:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v, align 4 -// CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[TMP10]], align 4 -// CHECK-NEXT: [[MUL24:%.*]] = mul nsw i32 [[TMP52]], [[TMP53]] -// CHECK-NEXT: store i32 [[MUL24]], ptr @.omp.reduction..internal_pivate_.prod_v, align 4 +// CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v, align 4 +// CHECK-NEXT: [[TMP39:%.*]] = load i32, ptr [[PROD_V7]], align 4 +// CHECK-NEXT: [[MUL21:%.*]] = mul nsw i32 [[TMP38]], [[TMP39]] +// CHECK-NEXT: store i32 [[MUL21]], ptr @.omp.reduction..internal_pivate_.prod_v, align 4 // CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP54:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v, align 4 -// CHECK-NEXT: store i32 [[TMP54]], ptr [[TMP10]], align 4 +// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v, align 4 +// CHECK-NEXT: store i32 [[TMP40]], ptr [[PROD_V7]], align 4 // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP10]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = load i32, ptr [[PROD_V7]], align 4 +// CHECK-NEXT: [[MUL22:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]] +// CHECK-NEXT: store i32 [[MUL22]], ptr [[TMP10]], align 4 +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) // CHECK-NEXT: br label %[[OMP_PRECOND_END]] // CHECK: [[OMP_PRECOND_END]]: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]]) @@ -713,22 +674,6 @@ int main(void) { // CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8 // CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 0 -// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP2]], i64 0, i64 0 -// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1 -// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8 -// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP2]], i64 0, i64 1 -// CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8 -// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP7]], align 4 -// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP5]], align 4 -// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]] -// CHECK-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4 -// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP11]], align 4 -// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4 -// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP14]], [[TMP15]] -// CHECK-NEXT: store i32 [[MUL]], ptr [[TMP11]], align 4 // CHECK-NEXT: ret void // // diff --git a/clang/test/OpenMP/for_reduction_messages.cpp b/clang/test/OpenMP/for_reduction_messages.cpp index de28ba2c3be02..2fdac3048c9cd 100644 --- a/clang/test/OpenMP/for_reduction_messages.cpp +++ b/clang/test/OpenMP/for_reduction_messages.cpp @@ -417,10 +417,12 @@ int main(int argc, char **argv) { #pragma omp for reduction(+ : qa[1], qa[0]) for (int i = 0; i < 10; ++i) foo(); +#if defined(_OPENMP) && (_OPENMP <= 202111) #pragma omp parallel reduction(* : fl) // expected-note {{defined as reduction}} #pragma omp for reduction(+ : fl) // expected-error {{reduction variable must be shared}} for (int i = 0; i < 10; ++i) foo(); +#endif static int m=0; #pragma omp for reduction(+:m) for (int i = 0; i < 10; ++i) diff --git a/clang/test/OpenMP/for_simd_reduction_messages.cpp b/clang/test/OpenMP/for_simd_reduction_messages.cpp index 96b3805b10a86..a9ef6c39cb5d2 100644 --- a/clang/test/OpenMP/for_simd_reduction_messages.cpp +++ b/clang/test/OpenMP/for_simd_reduction_messages.cpp @@ -396,11 +396,11 @@ int main(int argc, char **argv) { #pragma omp for simd reduction(+ : fl) // expected-error {{reduction variable must be shared}} for (int i = 0; i < 10; ++i) foo(); -#endif #pragma omp parallel reduction(* : fl) // expected-note {{defined as reduction}} #pragma omp for simd reduction(+ : fl) // expected-error {{reduction variable must be shared}} for (int i = 0; i < 10; ++i) foo(); +#endif static int m; #pragma omp for simd reduction(+ : m) for (int i = 0; i < 10; ++i) diff --git a/clang/test/OpenMP/sections_reduction_messages.cpp b/clang/test/OpenMP/sections_reduction_messages.cpp index 42ec3ed6d58e8..8cde6489f325f 100644 --- a/clang/test/OpenMP/sections_reduction_messages.cpp +++ b/clang/test/OpenMP/sections_reduction_messages.cpp @@ -461,12 +461,12 @@ int main(int argc, char **argv) { { foo(); } -#endif #pragma omp parallel reduction(* : fl) // expected-note {{defined as reduction}} #pragma omp sections reduction(+ : fl) // expected-error {{reduction variable must be shared}} { foo(); } +#endif static int m; #pragma omp sections reduction(+ : m) // OK { From e1ca648e6f4a89e2a2cab624c4434d069c93c120 Mon Sep 17 00:00:00 2001 From: Chandra Ghale Date: Fri, 30 May 2025 08:31:08 -0500 Subject: [PATCH 29/29] removing wrong asserts --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 16 +- .../OpenMP/for_private_reduction_codegen.cpp | 407 +++++++++--------- 2 files changed, 200 insertions(+), 223 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 354c46a064b25..8f7e3d8b39beb 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4898,6 +4898,9 @@ void CGOpenMPRuntime::emitSingleReductionCombiner(CodeGenFunction &CGF, } } +static std::string generateUniqueName(CodeGenModule &CGM, + llvm::StringRef Prefix, const Expr *Ref); + void CGOpenMPRuntime::emitPrivateReduction( CodeGenFunction &CGF, SourceLocation Loc, const Expr *Privates, const Expr *LHSExprs, const Expr *RHSExprs, const Expr *ReductionOps) { @@ -4929,7 +4932,8 @@ void CGOpenMPRuntime::emitPrivateReduction( const OMPDeclareReductionDecl *UDR = getReductionInit(ReductionOps); std::string ReductionVarNameStr; if (const auto *DRE = dyn_cast(Privates->IgnoreParenCasts())) - ReductionVarNameStr = DRE->getDecl()->getNameAsString(); + ReductionVarNameStr = + generateUniqueName(CGM, DRE->getDecl()->getNameAsString(), Privates); else ReductionVarNameStr = "unnamed_priv_var"; @@ -5460,16 +5464,10 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, CGF.EmitBranch(DefaultBB); CGF.EmitBlock(DefaultBB, /*IsFinished=*/true); - assert(!LHSExprs.empty() && "PrivateVarReduction: LHSExprs is empty"); - assert(!Privates.empty() && "PrivateVarReduction: Privates is empty"); - assert(!ReductionOps.empty() && "PrivateVarReduction: ReductionOps is empty"); - assert(LHSExprs.size() == Privates.size() && + assert(OrgLHSExprs.size() == OrgPrivates.size() && "PrivateVarReduction: Privates size mismatch"); - assert(LHSExprs.size() == ReductionOps.size() && + assert(OrgLHSExprs.size() == OrgReductionOps.size() && "PrivateVarReduction: ReductionOps size mismatch"); - assert(LHSExprs.size() == Options.IsPrivateVarReduction.size() && - "PrivateVarReduction: IsPrivateVarReduction size mismatch"); - for (unsigned I : llvm::seq( std::min(OrgReductionOps.size(), OrgLHSExprs.size()))) { if (Options.IsPrivateVarReduction[I]) diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp index 318bd1f577bbf..c8a6863299fb3 100644 --- a/clang/test/OpenMP/for_private_reduction_codegen.cpp +++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --prefix-filecheck-ir-name _ --version 5 +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals --include-generated-funcs --replace-value-regex "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --global-value-regex ".omp.reduction..internal[a-zA-Z_0-9.]+" // RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -fopenmp-version=60 -x c++ -std=c++17 -emit-llvm %s -o - | FileCheck %s // expected-no-diagnostics #define N 10 @@ -70,64 +70,58 @@ int main(void) { } //. -// CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" -// CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 -// CHECK: @.gomp_critical_user_.reduction.var = common global [8 x i32] zeroinitializer, align 8 -// CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 18, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 -// CHECK: @.omp.reduction..internal_pivate_.result = common global %class.Sum zeroinitializer, align 4 -// CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 -// CHECK: @.gomp_critical_user_.reduction_critical.var = common global [8 x i32] zeroinitializer, align 8 -// CHECK: @[[GLOB4:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 -// CHECK: @.omp.reduction..internal_pivate_.sum_v = common global i32 0, align 4 -// CHECK: @.omp.reduction..internal_pivate_.prod_v = common global i32 0, align 4 +// CHECK: @.omp.reduction..internal_pivate_.result.result_996 = common global %class.Sum zeroinitializer, align 4 +// CHECK: @.omp.reduction..internal_pivate_.sum_v.sum_v_1188 = common global i32 0, align 4 +// CHECK: @.omp.reduction..internal_pivate_.sum_v.sum_v_1392 = common global i32 0, align 4 +// CHECK: @.omp.reduction..internal_pivate_.prod_v.prod_v_1461 = common global i32 0, align 4 //. -// CHECK-LABEL: define dso_local void @_Z8func_redv( -// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*]]: +// CHECK-LABEL: define {{[^@]+}}@_Z8func_redv +// CHECK-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: // CHECK-NEXT: [[RESULT:%.*]] = alloca [[CLASS_SUM:%.*]], align 4 // CHECK-NEXT: [[ARRAY:%.*]] = alloca [10 x %class.Sum], align 16 // CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]]) // CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], i32 noundef 0) // CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[ARRAY]], i32 0, i32 0 // CHECK-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[CLASS_SUM]], ptr [[ARRAY_BEGIN]], i64 10 -// CHECK-NEXT: br label %[[ARRAYCTOR_LOOP:.*]] -// CHECK: [[ARRAYCTOR_LOOP]]: -// CHECK-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], %[[ENTRY]] ], [ [[ARRAYCTOR_NEXT:%.*]], %[[ARRAYCTOR_LOOP]] ] +// CHECK-NEXT: br label [[ARRAYCTOR_LOOP:%.*]] +// CHECK: arrayctor.loop: +// CHECK-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ] // CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]], i32 noundef 0) // CHECK-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[CLASS_SUM]], ptr [[ARRAYCTOR_CUR]], i64 1 // CHECK-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]] -// CHECK-NEXT: br i1 [[ARRAYCTOR_DONE]], label %[[ARRAYCTOR_CONT:.*]], label %[[ARRAYCTOR_LOOP]] -// CHECK: [[ARRAYCTOR_CONT]]: +// CHECK-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]] +// CHECK: arrayctor.cont: // CHECK-NEXT: store i32 0, ptr [[I]], align 4 -// CHECK-NEXT: br label %[[FOR_COND:.*]] -// CHECK: [[FOR_COND]]: +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10 -// CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] -// CHECK: [[FOR_BODY]]: +// CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[REF_TMP]], i32 noundef [[TMP2]]) // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64 // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ARRAYIDX]], ptr align 4 [[REF_TMP]], i64 4, i1 false) -// CHECK-NEXT: br label %[[FOR_INC:.*]] -// CHECK: [[FOR_INC]]: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 // CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4 -// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] -// CHECK: [[FOR_END]]: +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] +// CHECK: for.end: // CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4) // CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @_Z8func_redv.omp_outlined, ptr [[ARRAY]]) // CHECK-NEXT: ret void // // -// CHECK-LABEL: define linkonce_odr void @_ZN3SumC1Ei( -// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { -// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-LABEL: define {{[^@]+}}@_ZN3SumC1Ei +// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { +// CHECK-NEXT: entry: // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[V_ADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -138,9 +132,9 @@ int main(void) { // CHECK-NEXT: ret void // // -// CHECK-LABEL: define internal void @_Z8func_redv.omp_outlined( -// CHECK-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[ARRAY:%.*]]) #[[ATTR2:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-LABEL: define {{[^@]+}}@_Z8func_redv.omp_outlined +// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[ARRAY:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: entry: // CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[ARRAY_ADDR:%.*]] = alloca ptr, align 8 @@ -167,27 +161,27 @@ int main(void) { // CHECK-NEXT: call void @.omp_initializer.(ptr noundef [[RESULT1]], ptr noundef [[RESULT]]) // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 -// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP2]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 -// CHECK-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] -// CHECK: [[COND_TRUE]]: -// CHECK-NEXT: br label %[[COND_END:.*]] -// CHECK: [[COND_FALSE]]: +// CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK: cond.true: +// CHECK-NEXT: br label [[COND_END:%.*]] +// CHECK: cond.false: // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 -// CHECK-NEXT: br label %[[COND_END]] -// CHECK: [[COND_END]]: -// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ] +// CHECK-NEXT: br label [[COND_END]] +// CHECK: cond.end: +// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] // CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 // CHECK-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4 -// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] -// CHECK: [[OMP_INNER_FOR_COND]]: +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK: omp.inner.for.cond: // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]] -// CHECK-NEXT: br i1 [[CMP2]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]] -// CHECK: [[OMP_INNER_FOR_BODY]]: +// CHECK-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK: omp.inner.for.body: // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 // CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] @@ -199,54 +193,54 @@ int main(void) { // CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP]], i32 0, i32 0 // CHECK-NEXT: store i32 [[CALL]], ptr [[COERCE_DIVE]], align 4 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT1]], ptr align 4 [[REF_TMP]], i64 4, i1 false) -// CHECK-NEXT: br label %[[OMP_BODY_CONTINUE:.*]] -// CHECK: [[OMP_BODY_CONTINUE]]: -// CHECK-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] -// CHECK: [[OMP_INNER_FOR_INC]]: +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK: omp.inner.for.inc: // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 // CHECK-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4 -// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND]] -// CHECK: [[OMP_INNER_FOR_END]]: -// CHECK-NEXT: br label %[[OMP_LOOP_EXIT:.*]] -// CHECK: [[OMP_LOOP_EXIT]]: +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK: omp.inner.for.end: +// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK: omp.loop.exit: // CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]]) -// CHECK-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z8func_redv.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) -// CHECK-NEXT: switch i32 [[TMP11]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [ -// CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]] -// CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]] +// CHECK-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z8func_redv.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: switch i32 [[TMP11]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ +// CHECK-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK-NEXT: ] -// CHECK: [[_OMP_REDUCTION_CASE1:.*:]] +// CHECK: .omp.reduction.case1: // CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var) -// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] -// CHECK: [[_OMP_REDUCTION_CASE2:.*:]] +// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: .omp.reduction.case2: // CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var) -// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] -// CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]] +// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: .omp.reduction.default: // CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP2]], 0 -// CHECK-NEXT: br i1 [[TMP12]], label %[[INIT:.*]], label %[[INIT_END:.*]] -// CHECK: [[INIT]]: -// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) @.omp.reduction..internal_pivate_.result, i32 noundef 0) -// CHECK-NEXT: br label %[[INIT_END]] -// CHECK: [[INIT_END]]: +// CHECK-NEXT: br i1 [[TMP12]], label [[INIT:%.*]], label [[INIT_END:%.*]] +// CHECK: init: +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) @.omp.reduction..internal_pivate_.result.result_996, i32 noundef 0) +// CHECK-NEXT: br label [[INIT_END]] +// CHECK: init.end: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) -// CHECK-NEXT: call void @.omp_combiner.(ptr noundef @.omp.reduction..internal_pivate_.result, ptr noundef [[RESULT1]]) +// CHECK-NEXT: call void @.omp_combiner.(ptr noundef @.omp.reduction..internal_pivate_.result.result_996, ptr noundef [[RESULT1]]) // CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) -// CHECK-NEXT: [[TMP13:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_pivate_.result, align 4 +// CHECK-NEXT: [[TMP13:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_pivate_.result.result_996, align 4 // CHECK-NEXT: store [[CLASS_SUM]] [[TMP13]], ptr [[RESULT1]], align 4 // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) // CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) // CHECK-NEXT: call void @.omp_combiner.(ptr noundef [[RESULT]], ptr noundef [[RESULT1]]) // CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) -// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP2]]) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4:[0-9]+]], i32 [[TMP2]]) // CHECK-NEXT: ret void // // -// CHECK-LABEL: define internal void @.omp_combiner.( -// CHECK-SAME: ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-LABEL: define {{[^@]+}}@.omp_combiner. +// CHECK-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK-NEXT: entry: // CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 @@ -257,9 +251,9 @@ int main(void) { // CHECK-NEXT: ret void // // -// CHECK-LABEL: define linkonce_odr noundef nonnull align 4 dereferenceable(4) ptr @_ZN3SumpLERKS_( -// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 { -// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-LABEL: define {{[^@]+}}@_ZN3SumpLERKS_ +// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 { +// CHECK-NEXT: entry: // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[RHS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -275,9 +269,9 @@ int main(void) { // CHECK-NEXT: ret ptr [[THIS1]] // // -// CHECK-LABEL: define internal void @.omp_initializer.( -// CHECK-SAME: ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3]] { -// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-LABEL: define {{[^@]+}}@.omp_initializer. +// CHECK-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3]] { +// CHECK-NEXT: entry: // CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 @@ -288,9 +282,9 @@ int main(void) { // CHECK-NEXT: ret void // // -// CHECK-LABEL: define linkonce_odr i32 @_ZNK3SumplERKS_( -// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 { -// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-LABEL: define {{[^@]+}}@_ZNK3SumplERKS_ +// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 { +// CHECK-NEXT: entry: // CHECK-NEXT: [[RETVAL:%.*]] = alloca [[CLASS_SUM:%.*]], align 4 // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[RHS_ADDR:%.*]] = alloca ptr, align 8 @@ -309,9 +303,9 @@ int main(void) { // CHECK-NEXT: ret i32 [[TMP3]] // // -// CHECK-LABEL: define internal void @_Z8func_redv.omp_outlined.omp.reduction.reduction_func( -// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-LABEL: define {{[^@]+}}@_Z8func_redv.omp_outlined.omp.reduction.reduction_func +// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK-NEXT: entry: // CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 @@ -321,9 +315,9 @@ int main(void) { // CHECK-NEXT: ret void // // -// CHECK-LABEL: define linkonce_odr void @_ZN3SumC2Ei( -// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { -// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-LABEL: define {{[^@]+}}@_ZN3SumC2Ei +// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { +// CHECK-NEXT: entry: // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[V_ADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 @@ -335,9 +329,9 @@ int main(void) { // CHECK-NEXT: ret void // // -// CHECK-LABEL: define dso_local void @_Z6do_rediPiRi( -// CHECK-SAME: i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-LABEL: define {{[^@]+}}@_Z6do_rediPiRi +// CHECK-SAME: (i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: // CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[SUM_V_ADDR:%.*]] = alloca ptr, align 8 @@ -373,8 +367,8 @@ int main(void) { // CHECK-NEXT: store i32 0, ptr [[I]], align 4 // CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 // CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] -// CHECK-NEXT: br i1 [[CMP]], label %[[OMP_PRECOND_THEN:.*]], label %[[OMP_PRECOND_END:.*]] -// CHECK: [[OMP_PRECOND_THEN]]: +// CHECK-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK: omp.precond.then: // CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 // CHECK-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4 @@ -387,25 +381,25 @@ int main(void) { // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 // CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] -// CHECK-NEXT: br i1 [[CMP7]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] -// CHECK: [[COND_TRUE]]: +// CHECK-NEXT: br i1 [[CMP7]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK: cond.true: // CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 -// CHECK-NEXT: br label %[[COND_END:.*]] -// CHECK: [[COND_FALSE]]: +// CHECK-NEXT: br label [[COND_END:%.*]] +// CHECK: cond.false: // CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 -// CHECK-NEXT: br label %[[COND_END]] -// CHECK: [[COND_END]]: -// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], %[[COND_TRUE]] ], [ [[TMP11]], %[[COND_FALSE]] ] +// CHECK-NEXT: br label [[COND_END]] +// CHECK: cond.end: +// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ] // CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 // CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 // CHECK-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4 -// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] -// CHECK: [[OMP_INNER_FOR_COND]]: +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK: omp.inner.for.cond: // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK-NEXT: [[CMP8:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] -// CHECK-NEXT: br i1 [[CMP8]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]] -// CHECK: [[OMP_INNER_FOR_BODY]]: +// CHECK-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK: omp.inner.for.body: // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1 // CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] @@ -419,45 +413,45 @@ int main(void) { // CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 // CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP20]], [[TMP18]] // CHECK-NEXT: store i32 [[ADD9]], ptr [[TMP19]], align 4 -// CHECK-NEXT: br label %[[OMP_BODY_CONTINUE:.*]] -// CHECK: [[OMP_BODY_CONTINUE]]: -// CHECK-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] -// CHECK: [[OMP_INNER_FOR_INC]]: +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK: omp.inner.for.inc: // CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP21]], 1 // CHECK-NEXT: store i32 [[ADD10]], ptr [[DOTOMP_IV]], align 4 -// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND]] -// CHECK: [[OMP_INNER_FOR_END]]: -// CHECK-NEXT: br label %[[OMP_LOOP_EXIT:.*]] -// CHECK: [[OMP_LOOP_EXIT]]: +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK: omp.inner.for.end: +// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK: omp.loop.exit: // CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK-NEXT: [[TMP22:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z6do_rediPiRi.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) -// CHECK-NEXT: switch i32 [[TMP22]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [ -// CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]] -// CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]] +// CHECK-NEXT: switch i32 [[TMP22]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ +// CHECK-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK-NEXT: ] -// CHECK: [[_OMP_REDUCTION_CASE1:.*:]] +// CHECK: .omp.reduction.case1: // CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) -// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] -// CHECK: [[_OMP_REDUCTION_CASE2:.*:]] +// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: .omp.reduction.case2: // CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) -// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] -// CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]] +// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: .omp.reduction.default: // CHECK-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP0]], 0 -// CHECK-NEXT: br i1 [[TMP23]], label %[[INIT:.*]], label %[[INIT_END:.*]] -// CHECK: [[INIT]]: -// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_pivate_.sum_v, align 4 -// CHECK-NEXT: br label %[[INIT_END]] -// CHECK: [[INIT_END]]: +// CHECK-NEXT: br i1 [[TMP23]], label [[INIT:%.*]], label [[INIT_END:%.*]] +// CHECK: init: +// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4 +// CHECK-NEXT: br label [[INIT_END]] +// CHECK: init.end: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) -// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v, align 4 +// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4 // CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SUM_V4]], align 4 // CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] -// CHECK-NEXT: store i32 [[ADD11]], ptr @.omp.reduction..internal_pivate_.sum_v, align 4 +// CHECK-NEXT: store i32 [[ADD11]], ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4 // CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v, align 4 +// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4 // CHECK-NEXT: store i32 [[TMP26]], ptr [[SUM_V4]], align 4 // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) @@ -466,15 +460,15 @@ int main(void) { // CHECK-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP27]], [[TMP28]] // CHECK-NEXT: store i32 [[ADD12]], ptr [[TMP7]], align 4 // CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) -// CHECK-NEXT: br label %[[OMP_PRECOND_END]] -// CHECK: [[OMP_PRECOND_END]]: +// CHECK-NEXT: br label [[OMP_PRECOND_END]] +// CHECK: omp.precond.end: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]]) // CHECK-NEXT: ret void // // -// CHECK-LABEL: define internal void @_Z6do_rediPiRi.omp.reduction.reduction_func( -// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] { -// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-LABEL: define {{[^@]+}}@_Z6do_rediPiRi.omp.reduction.reduction_func +// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] { +// CHECK-NEXT: entry: // CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 @@ -484,9 +478,9 @@ int main(void) { // CHECK-NEXT: ret void // // -// CHECK-LABEL: define dso_local void @_Z15do_red_extendediPiRiS0_( -// CHECK-SAME: i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[PROD_V:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-LABEL: define {{[^@]+}}@_Z15do_red_extendediPiRiS0_ +// CHECK-SAME: (i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[PROD_V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: // CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[SUM_V_ADDR:%.*]] = alloca ptr, align 8 @@ -531,8 +525,8 @@ int main(void) { // CHECK-NEXT: store i32 0, ptr [[I]], align 4 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 // CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] -// CHECK-NEXT: br i1 [[CMP]], label %[[OMP_PRECOND_THEN:.*]], label %[[OMP_PRECOND_END:.*]] -// CHECK: [[OMP_PRECOND_THEN]]: +// CHECK-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK: omp.precond.then: // CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4 // CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_UB]], align 4 @@ -548,25 +542,25 @@ int main(void) { // CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4 // CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] -// CHECK-NEXT: br i1 [[CMP10]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]] -// CHECK: [[COND_TRUE]]: +// CHECK-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK: cond.true: // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4 -// CHECK-NEXT: br label %[[COND_END:.*]] -// CHECK: [[COND_FALSE]]: +// CHECK-NEXT: br label [[COND_END:%.*]] +// CHECK: cond.false: // CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 -// CHECK-NEXT: br label %[[COND_END]] -// CHECK: [[COND_END]]: -// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP13]], %[[COND_TRUE]] ], [ [[TMP14]], %[[COND_FALSE]] ] +// CHECK-NEXT: br label [[COND_END]] +// CHECK: cond.end: +// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP13]], [[COND_TRUE]] ], [ [[TMP14]], [[COND_FALSE]] ] // CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 // CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 // CHECK-NEXT: store i32 [[TMP15]], ptr [[DOTOMP_IV]], align 4 -// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND:.*]] -// CHECK: [[OMP_INNER_FOR_COND]]: +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK: omp.inner.for.cond: // CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 // CHECK-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] -// CHECK-NEXT: br i1 [[CMP11]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]] -// CHECK: [[OMP_INNER_FOR_BODY]]: +// CHECK-NEXT: br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK: omp.inner.for.body: // CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 // CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] @@ -589,45 +583,45 @@ int main(void) { // CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4 // CHECK-NEXT: [[MUL15:%.*]] = mul nsw i32 [[TMP28]], [[TMP26]] // CHECK-NEXT: store i32 [[MUL15]], ptr [[TMP27]], align 4 -// CHECK-NEXT: br label %[[OMP_BODY_CONTINUE:.*]] -// CHECK: [[OMP_BODY_CONTINUE]]: -// CHECK-NEXT: br label %[[OMP_INNER_FOR_INC:.*]] -// CHECK: [[OMP_INNER_FOR_INC]]: +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK: omp.inner.for.inc: // CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 // CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP29]], 1 // CHECK-NEXT: store i32 [[ADD16]], ptr [[DOTOMP_IV]], align 4 -// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND]] -// CHECK: [[OMP_INNER_FOR_END]]: -// CHECK-NEXT: br label %[[OMP_LOOP_EXIT:.*]] -// CHECK: [[OMP_LOOP_EXIT]]: +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK: omp.inner.for.end: +// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK: omp.loop.exit: // CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) // CHECK-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z15do_red_extendediPiRiS0_.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) -// CHECK-NEXT: switch i32 [[TMP30]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [ -// CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]] -// CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]] +// CHECK-NEXT: switch i32 [[TMP30]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ +// CHECK-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] // CHECK-NEXT: ] -// CHECK: [[_OMP_REDUCTION_CASE1:.*:]] +// CHECK: .omp.reduction.case1: // CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) -// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] -// CHECK: [[_OMP_REDUCTION_CASE2:.*:]] +// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: .omp.reduction.case2: // CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) -// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]] -// CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]] +// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: .omp.reduction.default: // CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[TMP0]], 0 -// CHECK-NEXT: br i1 [[TMP31]], label %[[INIT:.*]], label %[[INIT_END:.*]] -// CHECK: [[INIT]]: -// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_pivate_.sum_v, align 4 -// CHECK-NEXT: br label %[[INIT_END]] -// CHECK: [[INIT_END]]: +// CHECK-NEXT: br i1 [[TMP31]], label [[INIT:%.*]], label [[INIT_END:%.*]] +// CHECK: init: +// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4 +// CHECK-NEXT: br label [[INIT_END]] +// CHECK: init.end: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) -// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v, align 4 +// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4 // CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[SUM_V5]], align 4 // CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] -// CHECK-NEXT: store i32 [[ADD17]], ptr @.omp.reduction..internal_pivate_.sum_v, align 4 +// CHECK-NEXT: store i32 [[ADD17]], ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4 // CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v, align 4 +// CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4 // CHECK-NEXT: store i32 [[TMP34]], ptr [[SUM_V5]], align 4 // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) @@ -637,20 +631,20 @@ int main(void) { // CHECK-NEXT: store i32 [[ADD18]], ptr [[TMP9]], align 4 // CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) // CHECK-NEXT: [[TMP37:%.*]] = icmp eq i32 [[TMP0]], 0 -// CHECK-NEXT: br i1 [[TMP37]], label %[[INIT19:.*]], label %[[INIT_END20:.*]] -// CHECK: [[INIT19]]: -// CHECK-NEXT: store i32 1, ptr @.omp.reduction..internal_pivate_.prod_v, align 4 -// CHECK-NEXT: br label %[[INIT_END20]] -// CHECK: [[INIT_END20]]: +// CHECK-NEXT: br i1 [[TMP37]], label [[INIT19:%.*]], label [[INIT_END20:%.*]] +// CHECK: init19: +// CHECK-NEXT: store i32 1, ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4 +// CHECK-NEXT: br label [[INIT_END20]] +// CHECK: init.end20: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) -// CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v, align 4 +// CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4 // CHECK-NEXT: [[TMP39:%.*]] = load i32, ptr [[PROD_V7]], align 4 // CHECK-NEXT: [[MUL21:%.*]] = mul nsw i32 [[TMP38]], [[TMP39]] -// CHECK-NEXT: store i32 [[MUL21]], ptr @.omp.reduction..internal_pivate_.prod_v, align 4 +// CHECK-NEXT: store i32 [[MUL21]], ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4 // CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) -// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v, align 4 +// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4 // CHECK-NEXT: store i32 [[TMP40]], ptr [[PROD_V7]], align 4 // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) // CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) @@ -659,15 +653,15 @@ int main(void) { // CHECK-NEXT: [[MUL22:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]] // CHECK-NEXT: store i32 [[MUL22]], ptr [[TMP10]], align 4 // CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) -// CHECK-NEXT: br label %[[OMP_PRECOND_END]] -// CHECK: [[OMP_PRECOND_END]]: +// CHECK-NEXT: br label [[OMP_PRECOND_END]] +// CHECK: omp.precond.end: // CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]]) // CHECK-NEXT: ret void // // -// CHECK-LABEL: define internal void @_Z15do_red_extendediPiRiS0_.omp.reduction.reduction_func( -// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] { -// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-LABEL: define {{[^@]+}}@_Z15do_red_extendediPiRiS0_.omp.reduction.reduction_func +// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] { +// CHECK-NEXT: entry: // CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 @@ -677,9 +671,9 @@ int main(void) { // CHECK-NEXT: ret void // // -// CHECK-LABEL: define dso_local noundef i32 @main( -// CHECK-SAME: ) #[[ATTR7:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-LABEL: define {{[^@]+}}@main +// CHECK-SAME: () #[[ATTR7:[0-9]+]] { +// CHECK-NEXT: entry: // CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[V:%.*]] = alloca [10 x i32], align 16 // CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 @@ -688,24 +682,24 @@ int main(void) { // CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) // CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK-NEXT: store i32 0, ptr [[I]], align 4 -// CHECK-NEXT: br label %[[FOR_COND:.*]] -// CHECK: [[FOR_COND]]: +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10 -// CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]] -// CHECK: [[FOR_BODY]]: +// CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64 // CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[V]], i64 0, i64 [[IDXPROM]] // CHECK-NEXT: store i32 [[TMP2]], ptr [[ARRAYIDX]], align 4 -// CHECK-NEXT: br label %[[FOR_INC:.*]] -// CHECK: [[FOR_INC]]: +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4 // CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 // CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4 -// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]] -// CHECK: [[FOR_END]]: +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]] +// CHECK: for.end: // CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4) // CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @main.omp_outlined, ptr [[V]]) // CHECK-NEXT: store i32 0, ptr [[SUM_V_EXT]], align 4 @@ -713,19 +707,4 @@ int main(void) { // CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4) // CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @main.omp_outlined.1, ptr [[V]], ptr [[SUM_V_EXT]], ptr [[PROD_V_EXT]]) // CHECK-NEXT: ret i32 0 -// -// -// CHECK-LABEL: define internal void @main.omp_outlined( -// CHECK-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[V:%.*]]) #[[ATTR2]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8 -// CHECK-NEXT: [[S_V:%.*]] = alloca i32, align 4 -// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 -// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 -// CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[V_ADDR]], align 8 -// CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 0 -// CHECK-NEXT: call void @_Z6do_rediPiRi(i32 noundef 10, ptr noundef [[ARRAYDECAY]], ptr noundef nonnull align 4 dereferenceable(4) [[S_V]]) -// CHECK-NEXT: ret void +