diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index d6507071d4693..986aaabe1eed4 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -406,7 +406,8 @@ implementation. +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | Extensions to atomic construct | :none:`unclaimed` | :none:`unclaimed` | | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Private reductions | :part:`partial` | :none:`unclaimed` | Parse/Sema:https://github.com/llvm/llvm-project/pull/129938 | +| Private reductions | :good:`mostly` | :none:`unclaimed` | Parse/Sema:https://github.com/llvm/llvm-project/pull/129938 | +| | | | Codegen: https://github.com/llvm/llvm-project/pull/134709 | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | Self maps | :part:`partial` | :none:`unclaimed` | parsing/sema done: https://github.com/llvm/llvm-project/pull/129888 | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 31c517338c21f..a9bf96d8f709c 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -950,6 +950,7 @@ OpenMP Support open parenthesis. (#GH139665) - An error is now emitted when OpenMP ``collapse`` and ``ordered`` clauses have an argument larger than what can fit within a 64-bit integer. +- Added support for private variable reduction. Improvements ^^^^^^^^^^^^ diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 918b064c3cfd5..8f7e3d8b39beb 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4898,11 +4898,255 @@ void CGOpenMPRuntime::emitSingleReductionCombiner(CodeGenFunction &CGF, } } +static std::string generateUniqueName(CodeGenModule &CGM, + llvm::StringRef Prefix, const Expr *Ref); + +void CGOpenMPRuntime::emitPrivateReduction( + CodeGenFunction &CGF, SourceLocation Loc, const Expr *Privates, + const Expr *LHSExprs, const Expr *RHSExprs, const Expr *ReductionOps) { + + // Create a shared global variable (__shared_reduction_var) to accumulate the + // final result. + // + // Call __kmpc_barrier to synchronize threads before initialization. + // + // The master thread (thread_id == 0) initializes __shared_reduction_var + // with the identity value or initializer. + // + // Call __kmpc_barrier to synchronize before combining. + // For each i: + // - Thread enters critical section. + // - Reads its private value from LHSExprs[i]. + // - Updates __shared_reduction_var[i] = RedOp_i(__shared_reduction_var[i], + // Privates[i]). + // - Exits critical section. + // + // Call __kmpc_barrier after combining. + // + // Each thread copies __shared_reduction_var[i] back to RHSExprs[i]. 
+ // + // Final __kmpc_barrier to synchronize after broadcasting + QualType PrivateType = Privates->getType(); + llvm::Type *LLVMType = CGF.ConvertTypeForMem(PrivateType); + + const OMPDeclareReductionDecl *UDR = getReductionInit(ReductionOps); + std::string ReductionVarNameStr; + if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates->IgnoreParenCasts())) + ReductionVarNameStr = + generateUniqueName(CGM, DRE->getDecl()->getNameAsString(), Privates); + else + ReductionVarNameStr = "unnamed_priv_var"; + + // Create an internal shared variable + std::string SharedName = + CGM.getOpenMPRuntime().getName({"internal_pivate_", ReductionVarNameStr}); + llvm::GlobalVariable *SharedVar = OMPBuilder.getOrCreateInternalVariable( + LLVMType, ".omp.reduction." + SharedName); + + SharedVar->setAlignment( + llvm::MaybeAlign(CGF.getContext().getTypeAlign(PrivateType) / 8)); + + Address SharedResult = + CGF.MakeNaturalAlignRawAddrLValue(SharedVar, PrivateType).getAddress(); + + llvm::Value *ThreadId = getThreadID(CGF, Loc); + llvm::Value *BarrierLoc = emitUpdateLocation(CGF, Loc, OMP_ATOMIC_REDUCE); + llvm::Value *BarrierArgs[] = {BarrierLoc, ThreadId}; + + llvm::BasicBlock *InitBB = CGF.createBasicBlock("init"); + llvm::BasicBlock *InitEndBB = CGF.createBasicBlock("init.end"); + + llvm::Value *IsWorker = CGF.Builder.CreateICmpEQ( + ThreadId, llvm::ConstantInt::get(ThreadId->getType(), 0)); + CGF.Builder.CreateCondBr(IsWorker, InitBB, InitEndBB); + + CGF.EmitBlock(InitBB); +
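+ // Choose the initial value for the shared accumulator, in order of + // preference: the UDR's initializer clause (routed through its emitted + // initializer function when one exists), the private variable's own + // initializer, and finally null-initialization as the fallback.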
+ auto EmitSharedInit = [&]() { + if (UDR) { // Check if it's a User-Defined Reduction + if (const Expr *UDRInitExpr = UDR->getInitializer()) { + std::pair<llvm::Function *, llvm::Function *> FnPair = + getUserDefinedReduction(UDR); + llvm::Function *InitializerFn = FnPair.second; + if (InitializerFn) { + if (const auto *CE = + dyn_cast<CallExpr>(UDRInitExpr->IgnoreParenImpCasts())) { + const auto *OutDRE = cast<DeclRefExpr>( + cast<UnaryOperator>(CE->getArg(0)->IgnoreParenImpCasts()) + ->getSubExpr()); + const VarDecl *OutVD = cast<VarDecl>(OutDRE->getDecl()); + + CodeGenFunction::OMPPrivateScope LocalScope(CGF); + LocalScope.addPrivate(OutVD, SharedResult); + + (void)LocalScope.Privatize(); + if (const auto *OVE = dyn_cast<OpaqueValueExpr>( + CE->getCallee()->IgnoreParenImpCasts())) { + CodeGenFunction::OpaqueValueMapping OpaqueMap( + CGF, OVE, RValue::get(InitializerFn)); + CGF.EmitIgnoredExpr(CE); + } else { + CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult, + PrivateType.getQualifiers(), + /*IsInitializer=*/true); + } + } else { + CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult, + PrivateType.getQualifiers(), + /*IsInitializer=*/true); + } + } else { + CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult, + PrivateType.getQualifiers(), + /*IsInitializer=*/true); + } + } else { + // EmitNullInitialization handles default construction for C++ classes + // and zeroing for scalars, which is a reasonable default. + CGF.EmitNullInitialization(SharedResult, PrivateType); + } + return; // UDR initialization handled + } + if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates)) { + if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) { + if (const Expr *InitExpr = VD->getInit()) { + CGF.EmitAnyExprToMem(InitExpr, SharedResult, + PrivateType.getQualifiers(), true); + return; + } + } + } + CGF.EmitNullInitialization(SharedResult, PrivateType); + }; + EmitSharedInit(); + CGF.Builder.CreateBr(InitEndBB); + CGF.EmitBlock(InitEndBB); + + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_barrier), + BarrierArgs); + + const Expr *ReductionOp = ReductionOps; + const OMPDeclareReductionDecl *CurrentUDR = getReductionInit(ReductionOp); + LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType); + LValue LHSLV = CGF.EmitLValue(Privates); + + auto EmitCriticalReduction = [&](auto ReductionGen) { + std::string CriticalName = getName({"reduction_critical"}); + emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc); + }; + + if (CurrentUDR) { + // Handle user-defined reduction. + auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) { + Action.Enter(CGF); + std::pair<llvm::Function *, llvm::Function *> FnPair = + getUserDefinedReduction(CurrentUDR); + if (FnPair.first) { + if (const auto *CE = dyn_cast<CallExpr>(ReductionOp)) { + const auto *OutDRE = cast<DeclRefExpr>( + cast<UnaryOperator>(CE->getArg(0)->IgnoreParenImpCasts()) + ->getSubExpr()); + const auto *InDRE = cast<DeclRefExpr>( + cast<UnaryOperator>(CE->getArg(1)->IgnoreParenImpCasts()) + ->getSubExpr()); + CodeGenFunction::OMPPrivateScope LocalScope(CGF); + LocalScope.addPrivate(cast<VarDecl>(OutDRE->getDecl()), + SharedLV.getAddress()); + LocalScope.addPrivate(cast<VarDecl>(InDRE->getDecl()), + LHSLV.getAddress()); + (void)LocalScope.Privatize(); + emitReductionCombiner(CGF, ReductionOp); + } + } + }; + EmitCriticalReduction(ReductionGen); + } else { + // Handle built-in reduction operations.
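+ // Debug-only shape check: the combiner must be the canonical assignment + // form 'LHS = LHS op RHS' (a BO_Assign BinaryOperator) or the class-type + // 'operator=' call, since it is re-emitted below against the shared copy.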
+#ifndef NDEBUG + const Expr *ReductionClauseExpr = ReductionOp->IgnoreParenCasts(); + if (const auto *Cleanup = dyn_cast<ExprWithCleanups>(ReductionClauseExpr)) + ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts(); + + const Expr *AssignRHS = nullptr; + if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionClauseExpr)) { + if (BinOp->getOpcode() == BO_Assign) + AssignRHS = BinOp->getRHS(); + } else if (const auto *OpCall = + dyn_cast<CXXOperatorCallExpr>(ReductionClauseExpr)) { + if (OpCall->getOperator() == OO_Equal) + AssignRHS = OpCall->getArg(1); + } + + assert(AssignRHS && + "Private Variable Reduction : Invalid ReductionOp expression"); +#endif + + auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) { + Action.Enter(CGF); + const auto *OmpOutDRE = + dyn_cast<DeclRefExpr>(LHSExprs->IgnoreParenImpCasts()); + const auto *OmpInDRE = + dyn_cast<DeclRefExpr>(RHSExprs->IgnoreParenImpCasts()); + assert( + OmpOutDRE && OmpInDRE && + "Private Variable Reduction : LHSExpr/RHSExpr must be DeclRefExprs"); + const VarDecl *OmpOutVD = cast<VarDecl>(OmpOutDRE->getDecl()); + const VarDecl *OmpInVD = cast<VarDecl>(OmpInDRE->getDecl()); + CodeGenFunction::OMPPrivateScope LocalScope(CGF); + LocalScope.addPrivate(OmpOutVD, SharedLV.getAddress()); + LocalScope.addPrivate(OmpInVD, LHSLV.getAddress()); + (void)LocalScope.Privatize(); + // Emit the actual reduction operation + CGF.EmitIgnoredExpr(ReductionOp); + }; + EmitCriticalReduction(ReductionGen); + } + + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_barrier), + BarrierArgs); + + // Broadcast final result + bool IsAggregate = PrivateType->isAggregateType(); + LValue SharedLV1 = CGF.MakeAddrLValue(SharedResult, PrivateType); + llvm::Value *FinalResultVal = nullptr; + Address FinalResultAddr = Address::invalid(); + + if (IsAggregate) + FinalResultAddr = SharedResult; + else + FinalResultVal = CGF.EmitLoadOfScalar(SharedLV1, Loc); + + LValue TargetLHSLV = CGF.EmitLValue(RHSExprs); + if (IsAggregate) { + CGF.EmitAggregateCopy(TargetLHSLV, + CGF.MakeAddrLValue(FinalResultAddr, PrivateType), + PrivateType, AggValueSlot::DoesNotOverlap, false); + } else { + CGF.EmitStoreOfScalar(FinalResultVal, TargetLHSLV); + } + // Final synchronization barrier + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_barrier), + BarrierArgs); + + // Combiner with original list item + auto OriginalListCombiner = [&](CodeGenFunction &CGF, + PrePostActionTy &Action) { + Action.Enter(CGF); + emitSingleReductionCombiner(CGF, ReductionOps, Privates, + cast<DeclRefExpr>(LHSExprs), + cast<DeclRefExpr>(RHSExprs)); + }; + EmitCriticalReduction(OriginalListCombiner); +} +
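+ // emitReduction now receives the full original (Org*) clause lists; items + // flagged in Options.IsPrivateVarReduction are filtered out of the shared + // lowering below and dispatched to emitPrivateReduction at the end.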
 void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, - ArrayRef<const Expr *> Privates, - ArrayRef<const Expr *> LHSExprs, - ArrayRef<const Expr *> RHSExprs, - ArrayRef<const Expr *> ReductionOps, + ArrayRef<const Expr *> OrgPrivates, + ArrayRef<const Expr *> OrgLHSExprs, + ArrayRef<const Expr *> OrgRHSExprs, + ArrayRef<const Expr *> OrgReductionOps, ReductionOptionsTy Options) { if (!CGF.HaveInsertPoint()) return; @@ -4949,10 +5193,10 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, if (SimpleReduction) { CodeGenFunction::RunCleanupsScope Scope(CGF); - const auto *IPriv = Privates.begin(); - const auto *ILHS = LHSExprs.begin(); - const auto *IRHS = RHSExprs.begin(); - for (const Expr *E : ReductionOps) { + const auto *IPriv = OrgPrivates.begin(); + const auto *ILHS = OrgLHSExprs.begin(); + const auto *IRHS = OrgRHSExprs.begin(); + for (const Expr *E : OrgReductionOps) { emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS), cast<DeclRefExpr>(*IRHS)); ++IPriv; @@ -4962,6 +5206,26 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, return; } + // Filter out shared reduction variables based on IsPrivateVarReduction flag. + // Only keep entries where the corresponding variable is not private. + SmallVector<const Expr *> FilteredPrivates, FilteredLHSExprs, + FilteredRHSExprs, FilteredReductionOps; + for (unsigned I : llvm::seq<unsigned>( + std::min(OrgReductionOps.size(), OrgLHSExprs.size()))) { + if (!Options.IsPrivateVarReduction[I]) { + FilteredPrivates.emplace_back(OrgPrivates[I]); + FilteredLHSExprs.emplace_back(OrgLHSExprs[I]); + FilteredRHSExprs.emplace_back(OrgRHSExprs[I]); + FilteredReductionOps.emplace_back(OrgReductionOps[I]); + } + } + // Wrap filtered vectors in ArrayRef for downstream shared reduction + // processing. + ArrayRef<const Expr *> Privates = FilteredPrivates; + ArrayRef<const Expr *> LHSExprs = FilteredLHSExprs; + ArrayRef<const Expr *> RHSExprs = FilteredRHSExprs; + ArrayRef<const Expr *> ReductionOps = FilteredReductionOps; + // 1. Build a list of reduction variables. // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]}; auto Size = RHSExprs.size(); @@ -5153,7 +5417,7 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, } else { // Emit as a critical region. auto &&CritRedGen = [E, Loc](CodeGenFunction &CGF, const Expr *, - const Expr *, const Expr *) { + const Expr *, const Expr *) { CGOpenMPRuntime &RT = CGF.CGM.getOpenMPRuntime(); std::string Name = RT.getName({"atomic_reduction"}); RT.emitCriticalRegion( @@ -5200,6 +5464,16 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, CGF.EmitBranch(DefaultBB); CGF.EmitBlock(DefaultBB, /*IsFinished=*/true); + assert(OrgLHSExprs.size() == OrgPrivates.size() && + "PrivateVarReduction: Privates size mismatch"); + assert(OrgLHSExprs.size() == OrgReductionOps.size() && + "PrivateVarReduction: ReductionOps size mismatch"); + for (unsigned I : llvm::seq<unsigned>( + std::min(OrgReductionOps.size(), OrgLHSExprs.size()))) { + if (Options.IsPrivateVarReduction[I]) + emitPrivateReduction(CGF, Loc, OrgPrivates[I], OrgLHSExprs[I], + OrgRHSExprs[I], OrgReductionOps[I]); + } } /// Generates unique name for artificial threadprivate variables. diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index 4321712e1521d..5be48b439f4fd 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -1201,8 +1201,20 @@ class CGOpenMPRuntime { struct ReductionOptionsTy { bool WithNowait; bool SimpleReduction;
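+ /// Per-item flags (parallel to the reduction lists): 'true' marks a list + /// item that is private in the enclosing context (OpenMP 6.0 private + /// reductions) and is lowered via emitPrivateReduction.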
+ llvm::SmallVector<bool, 8> IsPrivateVarReduction; OpenMPDirectiveKind ReductionKind; }; + + /// Emits code for private variable reduction. + /// \param Privates List of private copies for original reduction arguments. + /// \param LHSExprs List of LHS in \a ReductionOps reduction operations. + /// \param RHSExprs List of RHS in \a ReductionOps reduction operations. + /// \param ReductionOps List of reduction operations in form 'LHS binop RHS' + /// or 'operator binop(LHS, RHS)'. + void emitPrivateReduction(CodeGenFunction &CGF, SourceLocation Loc, + const Expr *Privates, const Expr *LHSExprs, + const Expr *RHSExprs, const Expr *ReductionOps); + /// Emit a code for reduction clause. Next code should be emitted for /// reduction: /// \code diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 803c7ed37635e..ff09ec8ee494c 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -1471,6 +1471,7 @@ void CodeGenFunction::EmitOMPReductionClauseFinal( llvm::SmallVector<const Expr *, 8> LHSExprs; llvm::SmallVector<const Expr *, 8> RHSExprs; llvm::SmallVector<const Expr *, 8> ReductionOps; + llvm::SmallVector<bool, 8> IsPrivateVarReduction; bool HasAtLeastOneReduction = false; bool IsReductionWithTaskMod = false; for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) { @@ -1481,6 +1482,8 @@ void CodeGenFunction::EmitOMPReductionClauseFinal( Privates.append(C->privates().begin(), C->privates().end()); LHSExprs.append(C->lhs_exprs().begin(), C->lhs_exprs().end()); RHSExprs.append(C->rhs_exprs().begin(), C->rhs_exprs().end()); + IsPrivateVarReduction.append(C->private_var_reduction_flags().begin(), + C->private_var_reduction_flags().end()); ReductionOps.append(C->reduction_ops().begin(), C->reduction_ops().end()); IsReductionWithTaskMod = IsReductionWithTaskMod || C->getModifier() == OMPC_REDUCTION_task; @@ -1502,7 +1505,7 @@ void CodeGenFunction::EmitOMPReductionClauseFinal( // parallel directive (it always has implicit barrier). CGM.getOpenMPRuntime().emitReduction( *this, D.getEndLoc(), Privates, LHSExprs, RHSExprs, ReductionOps, - {WithNowait, SimpleReduction, ReductionKind}); + {WithNowait, SimpleReduction, IsPrivateVarReduction, ReductionKind}); } } @@ -3943,7 +3946,8 @@ static void emitScanBasedDirective( PrivScope.Privatize(); CGF.CGM.getOpenMPRuntime().emitReduction( CGF, S.getEndLoc(), Privates, LHSs, RHSs, ReductionOps, - {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_unknown}); + {/*WithNowait=*/true, /*SimpleReduction=*/true, + /*IsPrivateVarReduction*/ {}, OMPD_unknown}); } llvm::Value *NextIVal = CGF.Builder.CreateNUWSub(IVal, llvm::ConstantInt::get(CGF.SizeTy, 1)); @@ -5748,7 +5752,8 @@ void CodeGenFunction::EmitOMPScanDirective(const OMPScanDirective &S) { } CGM.getOpenMPRuntime().emitReduction( *this, ParentDir.getEndLoc(), Privates, LHSs, RHSs, ReductionOps, - {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_simd}); + {/*WithNowait=*/true, /*SimpleReduction=*/true, + /*IsPrivateVarReduction*/ {}, OMPD_simd}); for (unsigned I = 0, E = CopyArrayElems.size(); I < E; ++I) { const Expr *PrivateExpr = Privates[I]; LValue DestLVal; diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index f16f841d62edd..dbc46c898a0a8 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -19015,34 +19015,14 @@ static bool actOnOMPReductionKindClause( reportOriginalDsa(S, Stack, D, DVar); continue; } - // OpenMP 6.0 [ 7.6.10 ] - // Support Reduction over private variables with reduction clause. - // A list item in a reduction clause can now be private in the enclosing - // context. For orphaned constructs it is assumed to be shared unless the - // original(private) modifier appears in the clause.
- DVar = Stack->getImplicitDSA(D, true); - bool IsOrphaned = false; - OpenMPDirectiveKind CurrDir = Stack->getCurrentDirective(); - OpenMPDirectiveKind ParentDir = Stack->getParentDirective(); - // Check if the construct is orphaned (has no enclosing OpenMP context) - IsOrphaned = ParentDir == OMPD_unknown; - // OpenMP 6.0: Private DSA check - IsPrivate = - (S.getLangOpts().OpenMP > 52) && - ((isOpenMPPrivate(DVar.CKind) && DVar.CKind != OMPC_reduction && - isOpenMPWorksharingDirective(CurrDir) && - !isOpenMPParallelDirective(CurrDir) && - !isOpenMPTeamsDirective(CurrDir) && - !isOpenMPSimdDirective(ParentDir)) || - (IsOrphaned && DVar.CKind == OMPC_unknown) || - RD.OrigSharingModifier != OMPC_ORIGINAL_SHARING_shared); // OpenMP [2.14.3.6, Restrictions, p.1] // A list item that appears in a reduction clause of a worksharing // construct must be shared in the parallel regions to which any of the // worksharing regions arising from the worksharing construct bind. - if (!IsPrivate && isOpenMPWorksharingDirective(CurrDir) && + if (S.getLangOpts().OpenMP <= 52 && + isOpenMPWorksharingDirective(CurrDir) && !isOpenMPParallelDirective(CurrDir) && !isOpenMPTeamsDirective(CurrDir)) { DVar = Stack->getImplicitDSA(D, true); @@ -19053,6 +19033,23 @@ static bool actOnOMPReductionKindClause( reportOriginalDsa(S, Stack, D, DVar); continue; } + } else if (isOpenMPWorksharingDirective(CurrDir) && + !isOpenMPParallelDirective(CurrDir) && + !isOpenMPTeamsDirective(CurrDir)) { + // OpenMP 6.0 [ 7.6.10 ] + // Support Reduction over private variables with reduction clause. + // A list item in a reduction clause can now be private in the enclosing + // context. For orphaned constructs it is assumed to be shared unless + // the original(private) modifier appears in the clause. 
+ DVar = Stack->getImplicitDSA(D, true); + // Determine if the variable should be considered private + IsPrivate = DVar.CKind != OMPC_shared; + bool IsOrphaned = false; + OpenMPDirectiveKind ParentDir = Stack->getParentDirective(); + IsOrphaned = ParentDir == OMPD_unknown; + if ((IsOrphaned && + RD.OrigSharingModifier == OMPC_ORIGINAL_SHARING_private)) + IsPrivate = true; } } else { // Threadprivates cannot be shared between threads, so dignose if the base diff --git a/clang/test/OpenMP/distribute_simd_misc_messages.c b/clang/test/OpenMP/distribute_simd_misc_messages.c index 8cbf96cd7a014..270e17dcb89bb 100644 --- a/clang/test/OpenMP/distribute_simd_misc_messages.c +++ b/clang/test/OpenMP/distribute_simd_misc_messages.c @@ -508,6 +508,7 @@ void test_collapse(void) { #pragma omp distribute simd collapse(5 - 5) for (i = 0; i < 16; ++i) ; +#if defined(_OPENMP) && (_OPENMP <= 202111) // expected-note@+3 2 {{defined as reduction}} #pragma omp target #pragma omp teams @@ -520,7 +521,7 @@ void test_collapse(void) { #pragma omp for reduction(+ : i, j) for (int k = 0; k < 16; ++k) i += j; - +#endif #pragma omp target #pragma omp teams for (i = 0; i < 16; ++i) diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp new file mode 100644 index 0000000000000..c8a6863299fb3 --- /dev/null +++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp @@ -0,0 +1,710 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals --include-generated-funcs --replace-value-regex "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --global-value-regex ".omp.reduction..internal[a-zA-Z_0-9.]+" +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -fopenmp-version=60 -x c++ -std=c++17 -emit-llvm %s -o - | FileCheck %s +// expected-no-diagnostics +#define N 10 +class Sum { + int val; + +public: + Sum(int v = 0) : val(v) {} + Sum operator+(const Sum &rhs) const { return Sum(val + rhs.val); } + Sum &operator+=(const Sum &rhs) { + val += rhs.val; + return *this; + } +}; +#pragma omp declare reduction(sum_reduction:Sum : omp_out += omp_in) \ + initializer(omp_priv = Sum(0)) + +void func_red() { + Sum result(0); + Sum array[N]; + + for (int i = 0; i < N; i++) { + array[i] = Sum(i); + } + +#pragma omp parallel private(result) num_threads(4) + { +#pragma omp for reduction(sum_reduction : result) + for (int i = 0; i < N; i++) { + result = result + array[i]; + } + } +} + +void do_red(int n, int *v, int &sum_v) { + sum_v = 0; +#pragma omp for reduction(original(private), + : sum_v) + for (int i = 0; i < n; i++) { + sum_v += v[i]; + } +} +void do_red_extended(int n, int *v, int &sum_v, int &prod_v) { + sum_v = 0; + prod_v = 1; + +#pragma omp for reduction(original(private), + : sum_v) \ + reduction(original(private), * : prod_v) + for (int i = 0; i < n; i++) { + sum_v += v[i]; + prod_v *= v[i]; + } +} +int main(void) { + int v[N]; + for (int i = 0; i < N; i++) + v[i] = i; +#pragma omp parallel num_threads(4) + { + int s_v; + do_red(N, v, s_v); + } + + int sum_v_ext = 0, prod_v_ext = 1; +#pragma omp parallel num_threads(4) + { + do_red_extended(N, v, sum_v_ext, prod_v_ext); + } + return 0; +} + +//. 
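+// Schematic of the privatized-reduction sequence the checks below verify: +// thread 0 initializes the internal shared global, __kmpc_barrier, each +// thread combines its private copy into the global inside the named critical +// section, __kmpc_barrier, the global value is copied back to every private +// copy, __kmpc_barrier, then the private copy is combined into the original +// list item under the same critical section.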
+// CHECK: @.omp.reduction..internal_pivate_.result.result_996 = common global %class.Sum zeroinitializer, align 4 +// CHECK: @.omp.reduction..internal_pivate_.sum_v.sum_v_1188 = common global i32 0, align 4 +// CHECK: @.omp.reduction..internal_pivate_.sum_v.sum_v_1392 = common global i32 0, align 4 +// CHECK: @.omp.reduction..internal_pivate_.prod_v.prod_v_1461 = common global i32 0, align 4 +//. +// CHECK-LABEL: define {{[^@]+}}@_Z8func_redv +// CHECK-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RESULT:%.*]] = alloca [[CLASS_SUM:%.*]], align 4 +// CHECK-NEXT: [[ARRAY:%.*]] = alloca [10 x %class.Sum], align 16 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]]) +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], i32 noundef 0) +// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[ARRAY]], i32 0, i32 0 +// CHECK-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[CLASS_SUM]], ptr [[ARRAY_BEGIN]], i64 10 +// CHECK-NEXT: br label [[ARRAYCTOR_LOOP:%.*]] +// CHECK: arrayctor.loop: +// CHECK-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ] +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]], i32 noundef 0) +// CHECK-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[CLASS_SUM]], ptr [[ARRAYCTOR_CUR]], i64 1 +// CHECK-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]] +// CHECK-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]] +// CHECK: arrayctor.cont: +// CHECK-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10 +// CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[REF_TMP]], i32 noundef [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ARRAYIDX]], ptr align 4 [[REF_TMP]], i64 4, i1 false) +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4) +// CHECK-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @_Z8func_redv.omp_outlined, ptr [[ARRAY]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN3SumC1Ei +// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store i32 [[V]], ptr [[V_ADDR]], align 4 +// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[V_ADDR]], align 4 +// CHECK-NEXT: call void @_ZN3SumC2Ei(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_Z8func_redv.omp_outlined +// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[ARRAY:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[ARRAY_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[RESULT:%.*]] = alloca [[CLASS_SUM:%.*]], align 4 +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[RESULT1:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [0 x ptr], align 8 +// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: store ptr [[ARRAY]], ptr [[ARRAY_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAY_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], i32 noundef 0) +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK-NEXT: call void @.omp_initializer.(ptr noundef [[RESULT1]], ptr noundef [[RESULT]]) +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK: cond.true: +// CHECK-NEXT: br label [[COND_END:%.*]] +// CHECK: cond.false: +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: br label [[COND_END]] +// CHECK: cond.end: +// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// 
CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK: omp.inner.for.cond: +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]] +// CHECK-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK: omp.inner.for.body: +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK-NEXT: [[CALL:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]], ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYIDX]]) +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP]], i32 0, i32 0 +// CHECK-NEXT: store i32 [[CALL]], ptr [[COERCE_DIVE]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT1]], ptr align 4 [[REF_TMP]], i64 4, i1 false) +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK: omp.inner.for.inc: +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK: omp.inner.for.end: +// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK: omp.loop.exit: +// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z8func_redv.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: switch i32 [[TMP11]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ +// CHECK-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK-NEXT: ] +// CHECK: .omp.reduction.case1: +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: .omp.reduction.case2: +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: .omp.reduction.default: +// CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[TMP12]], label [[INIT:%.*]], label [[INIT_END:%.*]] +// CHECK: init: +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) @.omp.reduction..internal_pivate_.result.result_996, i32 noundef 0) +// CHECK-NEXT: br label [[INIT_END]] +// CHECK: init.end: +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr 
@.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: call void @.omp_combiner.(ptr noundef @.omp.reduction..internal_pivate_.result.result_996, ptr noundef [[RESULT1]]) +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP13:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_pivate_.result.result_996, align 4 +// CHECK-NEXT: store [[CLASS_SUM]] [[TMP13]], ptr [[RESULT1]], align 4 +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: call void @.omp_combiner.(ptr noundef [[RESULT]], ptr noundef [[RESULT1]]) +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4:[0-9]+]], i32 [[TMP2]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_combiner. +// CHECK-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK-NEXT: [[CALL:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN3SumpLERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[TMP3]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP2]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN3SumpLERKS_ +// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[RHS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[RHS]], ptr [[RHS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[RHS_ADDR]], align 8 +// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[VAL]], align 4 +// CHECK-NEXT: [[VAL2:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[VAL2]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[VAL2]], align 4 +// CHECK-NEXT: ret ptr [[THIS1]] +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_initializer. 
+// CHECK-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[TMP3]], i32 noundef 0) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZNK3SumplERKS_ +// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[CLASS_SUM:%.*]], align 4 +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[RHS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[RHS]], ptr [[RHS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[VAL]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[RHS_ADDR]], align 8 +// CHECK-NEXT: [[VAL2:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[TMP1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[VAL2]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP2]] +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RETVAL]], i32 noundef [[ADD]]) +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[COERCE_DIVE]], align 4 +// CHECK-NEXT: ret i32 [[TMP3]] +// +// +// CHECK-LABEL: define {{[^@]+}}@_Z8func_redv.omp_outlined.omp.reduction.reduction_func +// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN3SumC2Ei +// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store i32 [[V]], ptr [[V_ADDR]], align 4 +// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[V_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[VAL]], align 4 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_Z6do_rediPiRi +// CHECK-SAME: (i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 
dereferenceable(4) [[SUM_V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[SUM_V_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[SUM_V4:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP5:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [0 x ptr], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) +// CHECK-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8 +// CHECK-NEXT: store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK-NEXT: store i32 [[SUB3]], ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK: omp.precond.then: +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK-NEXT: store i32 0, ptr [[SUM_V4]], align 4 +// CHECK-NEXT: store ptr [[SUM_V4]], ptr [[_TMP5]], align 8 +// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] +// CHECK-NEXT: br i1 [[CMP7]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK: cond.true: +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: br label [[COND_END:%.*]] +// CHECK: cond.false: +// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: br label [[COND_END]] +// CHECK: cond.end: +// CHECK-NEXT: 
[[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ] +// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK: omp.inner.for.cond: +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[CMP8:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] +// CHECK-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK: omp.inner.for.body: +// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I6]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[V_ADDR]], align 8 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[I6]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP5]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 +// CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP20]], [[TMP18]] +// CHECK-NEXT: store i32 [[ADD9]], ptr [[TMP19]], align 4 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK: omp.inner.for.inc: +// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK-NEXT: store i32 [[ADD10]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK: omp.inner.for.end: +// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK: omp.loop.exit: +// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP22:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z6do_rediPiRi.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: switch i32 [[TMP22]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ +// CHECK-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK-NEXT: ] +// CHECK: .omp.reduction.case1: +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: .omp.reduction.case2: +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: .omp.reduction.default: +// CHECK-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[TMP23]], label [[INIT:%.*]], label [[INIT_END:%.*]] +// CHECK: init: +// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4 +// CHECK-NEXT: br label [[INIT_END]] +// CHECK: init.end: +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr 
@.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4 +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SUM_V4]], align 4 +// CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK-NEXT: store i32 [[ADD11]], ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4 +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4 +// CHECK-NEXT: store i32 [[TMP26]], ptr [[SUM_V4]], align 4 +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP7]], align 4 +// CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[SUM_V4]], align 4 +// CHECK-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP27]], [[TMP28]] +// CHECK-NEXT: store i32 [[ADD12]], ptr [[TMP7]], align 4 +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: br label [[OMP_PRECOND_END]] +// CHECK: omp.precond.end: +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_Z6do_rediPiRi.omp.reduction.reduction_func +// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_Z15do_red_extendediPiRiS0_ +// CHECK-SAME: (i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[PROD_V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[SUM_V_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[PROD_V_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP2:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[SUM_V5:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP6:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[PROD_V7:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP8:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[I9:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [0 x ptr], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) +// CHECK-NEXT: 
store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8 +// CHECK-NEXT: store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: store ptr [[PROD_V]], ptr [[PROD_V_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PROD_V_ADDR]], align 8 +// CHECK-NEXT: store i32 1, ptr [[TMP2]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[PROD_V_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK: omp.precond.then: +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK-NEXT: store i32 0, ptr [[SUM_V5]], align 4 +// CHECK-NEXT: store ptr [[SUM_V5]], ptr [[_TMP6]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[_TMP1]], align 8 +// CHECK-NEXT: store i32 1, ptr [[PROD_V7]], align 4 +// CHECK-NEXT: store ptr [[PROD_V7]], ptr [[_TMP8]], align 8 +// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] +// CHECK-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK: cond.true: +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK-NEXT: br label [[COND_END:%.*]] +// CHECK: cond.false: +// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: br label [[COND_END]] +// CHECK: cond.end: +// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP13]], [[COND_TRUE]] ], [ [[TMP14]], [[COND_FALSE]] ] +// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: store i32 [[TMP15]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK: omp.inner.for.cond: +// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK-NEXT: br i1 [[CMP11]], label 
[[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK: omp.inner.for.body:
+// CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT: store i32 [[ADD]], ptr [[I9]], align 4
+// CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[I9]], align 4
+// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP20]] to i64
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[IDXPROM]]
+// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: [[TMP22:%.*]] = load ptr, ptr [[_TMP6]], align 8
+// CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
+// CHECK-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP23]], [[TMP21]]
+// CHECK-NEXT: store i32 [[ADD12]], ptr [[TMP22]], align 4
+// CHECK-NEXT: [[TMP24:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[I9]], align 4
+// CHECK-NEXT: [[IDXPROM13:%.*]] = sext i32 [[TMP25]] to i64
+// CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[IDXPROM13]]
+// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX14]], align 4
+// CHECK-NEXT: [[TMP27:%.*]] = load ptr, ptr [[_TMP8]], align 8
+// CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+// CHECK-NEXT: [[MUL15:%.*]] = mul nsw i32 [[TMP28]], [[TMP26]]
+// CHECK-NEXT: store i32 [[MUL15]], ptr [[TMP27]], align 4
+// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK: omp.body.continue:
+// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK: omp.inner.for.inc:
+// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP29]], 1
+// CHECK-NEXT: store i32 [[ADD16]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
+// CHECK: omp.inner.for.end:
+// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK: omp.loop.exit:
+// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z15do_red_extendediPiRiS0_.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: switch i32 [[TMP30]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK-NEXT: ]
+// CHECK: .omp.reduction.case1:
+// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK: .omp.reduction.case2:
+// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK: .omp.reduction.default:
+// CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[TMP0]], 0
+// CHECK-NEXT: br i1 [[TMP31]], label [[INIT:%.*]], label [[INIT_END:%.*]]
+// CHECK: init:
+// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4
+// CHECK-NEXT: br label [[INIT_END]]
+// CHECK: init.end:
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4
+// CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[SUM_V5]], align 4
+// CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP32]], [[TMP33]]
+// CHECK-NEXT: store i32 [[ADD17]], ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4
+// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4
+// CHECK-NEXT: store i32 [[TMP34]], ptr [[SUM_V5]], align 4
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP9]], align 4
+// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[SUM_V5]], align 4
+// CHECK-NEXT: [[ADD18:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
+// CHECK-NEXT: store i32 [[ADD18]], ptr [[TMP9]], align 4
+// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT: [[TMP37:%.*]] = icmp eq i32 [[TMP0]], 0
+// CHECK-NEXT: br i1 [[TMP37]], label [[INIT19:%.*]], label [[INIT_END20:%.*]]
+// CHECK: init19:
+// CHECK-NEXT: store i32 1, ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4
+// CHECK-NEXT: br label [[INIT_END20]]
+// CHECK: init.end20:
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4
+// CHECK-NEXT: [[TMP39:%.*]] = load i32, ptr [[PROD_V7]], align 4
+// CHECK-NEXT: [[MUL21:%.*]] = mul nsw i32 [[TMP38]], [[TMP39]]
+// CHECK-NEXT: store i32 [[MUL21]], ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4
+// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4
+// CHECK-NEXT: store i32 [[TMP40]], ptr [[PROD_V7]], align 4
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP10]], align 4
+// CHECK-NEXT: [[TMP42:%.*]] = load i32, ptr [[PROD_V7]], align 4
+// CHECK-NEXT: [[MUL22:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]]
+// CHECK-NEXT: store i32 [[MUL22]], ptr [[TMP10]], align 4
+// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT: br label [[OMP_PRECOND_END]]
+// CHECK: omp.precond.end:
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_Z15do_red_extendediPiRiS0_.omp.reduction.reduction_func
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@main
+// CHECK-SAME: () #[[ATTR7:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[V:%.*]] = alloca [10 x i32], align 16
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[SUM_V_EXT:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[PROD_V_EXT:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK-NEXT: br label [[FOR_COND:%.*]]
+// CHECK: for.cond:
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10
+// CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// CHECK: for.body:
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[V]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: br label [[FOR_INC:%.*]]
+// CHECK: for.inc:
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
+// CHECK: for.end:
+// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4)
+// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @main.omp_outlined, ptr [[V]])
+// CHECK-NEXT: store i32 0, ptr [[SUM_V_EXT]], align 4
+// CHECK-NEXT: store i32 1, ptr [[PROD_V_EXT]], align 4
+// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4)
+// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @main.omp_outlined.1, ptr [[V]], ptr [[SUM_V_EXT]], ptr [[PROD_V_EXT]])
+// CHECK-NEXT: ret i32 0
+
diff --git a/clang/test/OpenMP/for_reduction_messages.cpp b/clang/test/OpenMP/for_reduction_messages.cpp
index de28ba2c3be02..2fdac3048c9cd 100644
--- a/clang/test/OpenMP/for_reduction_messages.cpp
+++ b/clang/test/OpenMP/for_reduction_messages.cpp
@@ -417,10 +417,12 @@ int main(int argc, char **argv) {
 #pragma omp for reduction(+ : qa[1], qa[0])
   for (int i = 0; i < 10; ++i)
     foo();
+#if defined(_OPENMP) && (_OPENMP <= 202111)
 #pragma omp parallel reduction(* : fl) // expected-note {{defined as reduction}}
 #pragma omp for reduction(+ : fl) // expected-error {{reduction variable must be shared}}
   for (int i = 0; i < 10; ++i)
     foo();
+#endif
   static int m=0;
 #pragma omp for reduction(+:m)
   for (int i = 0; i < 10; ++i)
diff --git a/clang/test/OpenMP/for_simd_reduction_messages.cpp b/clang/test/OpenMP/for_simd_reduction_messages.cpp
index 96b3805b10a86..a9ef6c39cb5d2 100644
--- a/clang/test/OpenMP/for_simd_reduction_messages.cpp
+++ b/clang/test/OpenMP/for_simd_reduction_messages.cpp
@@ -396,11 +396,11 @@ int main(int argc, char **argv) {
 #pragma omp for simd reduction(+ : fl) // expected-error {{reduction variable must be shared}}
   for (int i = 0; i < 10; ++i)
     foo();
-#endif
 #pragma omp parallel reduction(* : fl) // expected-note {{defined as reduction}}
 #pragma omp for simd reduction(+ : fl) // expected-error {{reduction variable must be shared}}
   for (int i = 0; i < 10; ++i)
     foo();
+#endif
   static int m;
 #pragma omp for simd reduction(+ : m)
   for (int i = 0; i < 10; ++i)
diff --git a/clang/test/OpenMP/sections_reduction_messages.cpp b/clang/test/OpenMP/sections_reduction_messages.cpp
index 42ec3ed6d58e8..8cde6489f325f 100644
--- a/clang/test/OpenMP/sections_reduction_messages.cpp
+++ b/clang/test/OpenMP/sections_reduction_messages.cpp
@@ -461,12 +461,12 @@ int main(int argc, char **argv) {
   {
     foo();
   }
-#endif
 #pragma omp parallel reduction(* : fl) // expected-note {{defined as reduction}}
 #pragma omp sections reduction(+ : fl) // expected-error {{reduction variable must be shared}}
   {
     foo();
   }
+#endif
   static int m;
 #pragma omp sections reduction(+ : m) // OK
   {
diff --git a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
new file mode 100644
index 0000000000000..9bf3be1e9e45d
--- /dev/null
+++ b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
@@ -0,0 +1,194 @@
+// RUN: %libomp-cxx-compile -fopenmp-version=60 && %libomp-run
+#include <stdio.h>
+#include <omp.h>
+#include <limits.h>
+#include <complex.h>
+#include <complex>
+#include "omp_testsuite.h"
+
+#define N 10
+class Sum {
+  int val;
+
+public:
+  Sum(int v = 0) : val(v) {}
+  Sum operator+(const Sum &rhs) const { return Sum(val + rhs.val); }
+  Sum &operator+=(const Sum &rhs) {
+    val += rhs.val;
+    return *this;
+  }
+  int getValue() const { return val; }
+};
+
+// Declare OpenMP reduction
+#pragma omp declare reduction(sum_reduction:Sum : omp_out += omp_in) \
+    initializer(omp_priv = Sum(0))
+
+#pragma omp declare reduction(sum_pctor_reduction:Sum : omp_out += omp_in) \
+    initializer(omp_priv = Sum(1)) // non-default ctor
+
+int checkUserDefinedReduction() {
+  Sum final_result_udr(0);
+  Sum final_result_udr_pctor(1);
+  Sum array_sum[N];
+  int error_flag = 0;
+  int expected_value = 0;
+  int expected_value_pctor = 0;
+  for (int i = 0; i < N; ++i) {
+    array_sum[i] = Sum(i);
+    expected_value += i; // Calculate expected sum: 0 + 1 + ... + (N-1)
+    expected_value_pctor += i;
+  }
+  int num_threads_for_pctor_calc = 4; // num_threads(4)
+  int priv_initializer_val_pctor = 1; // initializer(omp_priv = Sum(1))
+  expected_value_pctor +=
+      num_threads_for_pctor_calc + priv_initializer_val_pctor;
+#pragma omp parallel num_threads(4) private(final_result_udr) private( \
+        final_result_udr_pctor)
+  {
+#pragma omp for reduction(sum_reduction : final_result_udr) \
+    reduction(sum_pctor_reduction : final_result_udr_pctor)
+    for (int i = 0; i < N; ++i) {
+      final_result_udr += array_sum[i];
+      final_result_udr_pctor += array_sum[i];
+    }
+
+    if (final_result_udr.getValue() != expected_value ||
+        final_result_udr_pctor.getValue() != expected_value_pctor)
+      error_flag += 1;
+  }
+  return error_flag;
+}
+void performMinMaxRed(int &min_val, int &max_val) {
+  int input_data[] = {7, 3, 12, 5, 8};
+  int n_size = sizeof(input_data) / sizeof(input_data[0]);
+  min_val = INT_MAX;
+  max_val = INT_MIN;
+#pragma omp for reduction(original(private), min : min_val) \
+    reduction(original(private), max : max_val)
+  for (int i = 0; i < n_size; ++i) {
+    if (input_data[i] < min_val)
+      min_val = input_data[i];
+    if (input_data[i] > max_val)
+      max_val = input_data[i];
+  }
+}
+int performComplexReduction() {
+  double _Complex arr[N];
+  double _Complex expected = 0.0 + 0.0 * I;
+  double _Complex result = 0.0 + 0.0 * I;
+  int error = 0;
+
+  // Initialize the array and compute serial sum
+  for (int i = 0; i < N; ++i) {
+    arr[i] = i - i * I;
+    expected += arr[i];
+  }
+  double real_sum = 0.0, imag_sum = 0.0;
+#pragma omp parallel private(real_sum) private(imag_sum)
+  {
+#pragma omp for reduction(+ : real_sum, imag_sum)
+    for (int i = 0; i < N; ++i) {
+      real_sum += creal(arr[i]);
+      imag_sum += cimag(arr[i]);
+    }
+
+    result = real_sum + imag_sum * I;
+    if (cabs(result - expected) > 1e-6) {
+      error++;
+    }
+  }
+  return error;
+}
+
+std::complex<double> doComplexReduction(std::complex<double> *arr) {
+  std::complex<double> result(1, 0);
+
+#pragma omp declare reduction(* : std::complex<double> : omp_out *= omp_in) \
+    initializer(omp_priv = std::complex<double>(1, 0))
+
+#pragma omp for reduction(original(private), * : result)
+  for (int i = 0; i < N; ++i)
+    result *= arr[i];
+
+  return result;
+}
+
+void performReductions(int n_elements, const int *input_values,
+                       int &sum_val_out, int &prod_val_out,
+                       float &float_sum_val_out) {
+  // private variables for this thread's reduction.
+  sum_val_out = 0;
+  prod_val_out = 1;
+  float_sum_val_out = 0.0f;
+
+  const float kPiValue = 3.14f;
+#pragma omp for reduction(original(private), + : sum_val_out) \
+    reduction(original(private), * : prod_val_out) \
+    reduction(original(private), + : float_sum_val_out)
+  for (int i = 0; i < n_elements; ++i) {
+    sum_val_out += input_values[i];
+    prod_val_out *= (i + 1);
+    float_sum_val_out += kPiValue;
+  }
+}
+int main(void) {
+  int input_array[N];
+  int total_errors = 0;
+  const float kPiVal = 3.14f;
+  const int kExpectedSum = 45;       // Sum of 0..9
+  const int kExpectedProd = 3628800; // 10!
+  const float kExpectedFsum = kPiVal * N; // 3.14f * 10
+  const int kExpectedMin = 3;
+  const int kExpectedMax = 12;
+  std::complex<double> arr[N];
+  std::complex<double> kExpectedComplex(1, 0);
+  // Initialize the array
+  for (int i = 1; i <= N; ++i) {
+    arr[i - 1] = std::complex<double>(
+        1.0 + 0.1 * i, 0.5 * i); // Avoid zero to prevent multiplication by zero
+    kExpectedComplex *= arr[i - 1];
+  }
+
+  for (int i = 0; i < N; i++)
+    input_array[i] = i;
+#pragma omp parallel num_threads(4)
+  {
+
+    int t_sum_v;
+    int t_prod_v;
+    float t_fsum_v;
+    performReductions(N, input_array, t_sum_v, t_prod_v, t_fsum_v);
+    if (t_sum_v != kExpectedSum)
+      total_errors++;
+    if (t_prod_v != kExpectedProd)
+      total_errors++;
+    if (t_fsum_v != kExpectedFsum)
+      total_errors++;
+  }
+#pragma omp parallel num_threads(4)
+  {
+    int t_min_v;
+    int t_max_v;
+    performMinMaxRed(t_min_v, t_max_v);
+    if (t_min_v != kExpectedMin)
+      total_errors++;
+    if (t_max_v != kExpectedMax)
+      total_errors++;
+  }
+  total_errors += checkUserDefinedReduction();
+  total_errors += performComplexReduction();
+#pragma omp parallel num_threads(4)
+  {
+    std::complex<double> result(1, 0);
+    result = doComplexReduction(arr);
+    if (std::abs(result.real() - kExpectedComplex.real()) > 1e-6 ||
+        std::abs(result.imag() - kExpectedComplex.imag()) > 1e-6) {
+      total_errors++;
+    }
+  }
+  if (total_errors != 0)
+    fprintf(stderr, "ERROR: reduction on private variable %d\n", total_errors);
+
+  return total_errors;
+}
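The new libomp test above exercises the OpenMP 6.0 original(private) reduction modifier: the original list item may itself be a private variable, and after the worksharing construct every thread's private copy holds the combined value (which is why each thread in the test compares its own t_sum_v and t_prod_v against the expected totals). The following is a minimal standalone usage sketch distilled from performReductions; it is an illustrative example, not part of the patch, and it assumes a compiler that accepts -fopenmp-version=60, as in the test's RUN line. The printf and the thread count are illustrative only.

#include <omp.h>
#include <stdio.h>

int main(void) {
  int sum; // the "original" variable is itself private in the parallel region
#pragma omp parallel num_threads(4) private(sum)
  {
    sum = 0; // initialize this thread's copy, as performReductions does
#pragma omp for reduction(original(private), + : sum)
    for (int i = 0; i < 10; ++i)
      sum += i;
    // After the construct, every thread's private copy holds 0+1+...+9 == 45.
    printf("thread %d: sum = %d\n", omp_get_thread_num(), sum);
  }
  return 0;
}

Under the codegen added in this patch, the combined value is accumulated in an internal shared variable under a critical section and then broadcast back to each thread's private copy, matching the barrier/critical sequence visible in the CHECK lines above.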