diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index 8b6656834cc06..64f371e9f8db8 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -188,9 +188,6 @@ jobs: with: ref: ${{ needs.prepare.outputs.ref }} - - name: Install Ninja - uses: llvm/actions/install-ninja@5dd955034a6742a2e21d82bf165fcb1050ae7b49 # main - - name: Set Build Prefix id: setup-stage shell: bash diff --git a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h index 7afe97aac57e8..bf87654979cc9 100644 --- a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h +++ b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h @@ -16,7 +16,9 @@ #include "mlir/Dialect/OpenACC/OpenACC.h" namespace fir { +class AddrOfOp; class DeclareOp; +class GlobalOp; } // namespace fir namespace hlfir { @@ -53,6 +55,18 @@ struct PartialEntityAccessModel bool isCompleteView(mlir::Operation *op) const; }; +struct AddressOfGlobalModel + : public mlir::acc::AddressOfGlobalOpInterface::ExternalModel< + AddressOfGlobalModel, fir::AddrOfOp> { + mlir::SymbolRefAttr getSymbol(mlir::Operation *op) const; +}; + +struct GlobalVariableModel + : public mlir::acc::GlobalVariableOpInterface::ExternalModel< + GlobalVariableModel, fir::GlobalOp> { + bool isConstant(mlir::Operation *op) const; +}; + } // namespace fir::acc #endif // FLANG_OPTIMIZER_OPENACC_FIROPENACC_OPS_INTERFACES_H_ diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp index c1734be5185f4..11fbaf2dc2bb8 100644 --- a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp +++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp @@ -59,4 +59,13 @@ bool PartialEntityAccessModel::isCompleteView( return !getBaseEntity(op); } +mlir::SymbolRefAttr AddressOfGlobalModel::getSymbol(mlir::Operation *op) const { + return mlir::cast(op).getSymbolAttr(); +} + +bool GlobalVariableModel::isConstant(mlir::Operation *op) const { + auto globalOp = mlir::cast(op); + return globalOp.getConstant().has_value(); +} + } // namespace fir::acc diff --git a/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp b/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp index d71c40dfac03c..5c7f9985d41ca 100644 --- a/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp +++ b/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp @@ -49,6 +49,9 @@ void registerOpenACCExtensions(mlir::DialectRegistry ®istry) { PartialEntityAccessModel>(*ctx); fir::DeclareOp::attachInterface>( *ctx); + + fir::AddrOfOp::attachInterface(*ctx); + fir::GlobalOp::attachInterface(*ctx); }); // Register HLFIR operation interfaces diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index f2b168f6cb0e3..5f7fb00889655 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -796,6 +796,112 @@ every time. For more information, refer PTX ISA Membar/Fences ------------- +'``llvm.nvvm.fence.acquire/release.sync_restrict.*``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. 
code-block:: llvm

  declare void @llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster()
  declare void @llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster()

Overview:
"""""""""

The `nvvm.fence.{semantics}.sync_restrict.*` intrinsics restrict the class of
memory operations for which the fence instruction provides the memory ordering
guarantees. When `.sync_restrict` is restricted to `shared_cta`, the memory
semantics must be `release` and the effect of the fence only applies to
operations performed on objects in the `shared_cta` space. Likewise, when
`.sync_restrict` is restricted to `shared_cluster`, the memory semantics must be
`acquire` and the effect of the fence only applies to operations performed on
objects in the `shared_cluster` memory space. The scope for both operations is
`cluster`. For more details, please refer to the `PTX ISA `__

'``llvm.nvvm.fence.mbarrier_init.release.cluster``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare void @llvm.nvvm.fence.mbarrier_init.release.cluster()

Overview:
"""""""""

The `nvvm.fence.mbarrier_init.release.cluster` intrinsic restricts the class of
memory operations for which the fence instruction provides the memory ordering
guarantees. The `mbarrier_init` modifier restricts the synchronizing effect to
the prior `mbarrier_init` operation executed by the same thread on mbarrier
objects in the `shared_cta` memory space. For more details, please refer to the
`PTX ISA `__

'``llvm.nvvm.fence.proxy.async_generic.acquire/release.sync_restrict``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare void @llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster()
  declare void @llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster()

Overview:
"""""""""

The `nvvm.fence.proxy.async_generic.{semantics}.sync_restrict` intrinsics are
used to establish ordering between a prior memory access performed via the
`async proxy__` and a subsequent memory access performed via the generic proxy.
``nvvm.fence.proxy.async_generic.release.sync_restrict`` can form a release
sequence that synchronizes with an acquire sequence containing the
``nvvm.fence.proxy.async_generic.acquire.sync_restrict`` proxy fence. When
`.sync_restrict` is restricted to `shared_cta`, the memory semantics must be
`release` and the effect of the fence only applies to operations performed on
objects in the `shared_cta` space. Likewise, when `.sync_restrict` is restricted
to `shared_cluster`, the memory semantics must be `acquire` and the effect of
the fence only applies to operations performed on objects in the
`shared_cluster` memory space. The scope for both operations is `cluster`.
For more details, please refer to the `PTX ISA `__

'``llvm.nvvm.fence.proxy.<proxykind>``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. 
code-block:: llvm + + declare void @llvm.nvvm.fence.proxy.alias() + declare void @llvm.nvvm.fence.proxy.async() + declare void @llvm.nvvm.fence.proxy.async.global() + declare void @llvm.nvvm.fence.proxy.async.shared_cluster() + declare void @llvm.nvvm.fence.proxy.async.shared_cta() + +Overview: +""""""""" + +`nvvm.fence.proxy.{proxykind}` intrinsics represent a fence with bi-directional +proxy ordering that is established between the memory accesses done between the +`generic proxy__` +and the proxy specified by `proxykind`. A `bi-directional proxy` ordering between +two proxykinds establishes two `uni-directional` proxy orderings: one from the +first proxykind to the second proxykind and the other from the second proxykind +to the first proxykind. + +`alias` proxykind refers to memory accesses performed using virtually aliased +addresses to the same memory location + +`async` proxykind specifies that the memory ordering is established between the +`async proxy` and the `generic proxy`. The memory ordering is limited only to +operations performed on objects in the state space specified (`generic`, `global`, +`shared_cluster`, `shared_cta`). If no state space is specified, then the memory +ordering applies on all state spaces. For more details, please refer the +`PTX ISA `__ + '``llvm.nvvm.fence.proxy.tensormap_generic.*``' ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 0f17312b03827..a65e4667ab76c 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -123,6 +123,32 @@ struct HardwareLoopInfo { LLVM_ABI bool canAnalyze(LoopInfo &LI); }; +/// Information for memory intrinsic cost model. +class MemIntrinsicCostAttributes { + /// Vector type of the data to be loaded or stored. + Type *DataTy = nullptr; + + /// ID of the memory intrinsic. + Intrinsic::ID IID; + + /// Address space of the pointer. + unsigned AddressSpace = 0; + + /// Alignment of single element. + Align Alignment; + +public: + LLVM_ABI MemIntrinsicCostAttributes(Intrinsic::ID Id, Type *DataTy, + Align Alignment, unsigned AddressSpace) + : DataTy(DataTy), IID(Id), AddressSpace(AddressSpace), + Alignment(Alignment) {} + + Intrinsic::ID getID() const { return IID; } + Type *getDataType() const { return DataTy; } + unsigned getAddressSpace() const { return AddressSpace; } + Align getAlignment() const { return Alignment; } +}; + class IntrinsicCostAttributes { const IntrinsicInst *II = nullptr; Type *RetTy = nullptr; @@ -1556,7 +1582,7 @@ class TargetTransformInfo { /// \return The cost of masked Load and Store instructions. 
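
  /// The memory operation is described by \p MICA: the intrinsic ID
  /// (masked_load or masked_store), the data type, the alignment, and the
  /// address space of the access.
  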
LLVM_ABI InstructionCost getMaskedMemoryOpCost( - unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, + const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; /// \return The cost of Gather or Scatter operation diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index aacb88d2f9684..d8e35748f53e5 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -842,8 +842,7 @@ class TargetTransformInfoImplBase { } virtual InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { return 1; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 944e1714e8f98..cb389ae74ef46 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1558,9 +1558,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { } InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override { + Type *DataTy = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + unsigned Opcode = MICA.getID() == Intrinsic::masked_load + ? Instruction::Load + : Instruction::Store; // TODO: Pass on AddressSpace when we have test coverage. return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false, CostKind); @@ -1617,10 +1621,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // Firstly, the cost of load/store operation. InstructionCost Cost; - if (UseMaskForCond || UseMaskForGaps) - Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment, - AddressSpace, CostKind); - else + if (UseMaskForCond || UseMaskForGaps) { + unsigned IID = Opcode == Instruction::Load ? 
Intrinsic::masked_load + : Intrinsic::masked_store; + Cost = thisT()->getMaskedMemoryOpCost( + {IID, VecTy, Alignment, AddressSpace}, CostKind); + } else Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind); @@ -2403,14 +2409,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { case Intrinsic::masked_store: { Type *Ty = Tys[0]; Align TyAlign = thisT()->DL.getABITypeAlign(Ty); - return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, - CostKind); + return thisT()->getMaskedMemoryOpCost({IID, Ty, TyAlign, 0}, CostKind); } case Intrinsic::masked_load: { Type *Ty = RetTy; Align TyAlign = thisT()->DL.getABITypeAlign(Ty); - return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, - CostKind); + return thisT()->getMaskedMemoryOpCost({IID, Ty, TyAlign, 0}, CostKind); } case Intrinsic::experimental_vp_strided_store: { auto *Ty = cast(ICA.getArgTypes()[0]); diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 1b485dc8ccd1e..c8e776e6499df 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1746,33 +1746,65 @@ let TargetPrefix = "nvvm" in { def int_nvvm_barrier_cluster_wait_aligned : Intrinsic<[]>; } - // - // Membar - // - let IntrProperties = [IntrNoCallback] in { +// +// Membar / Fence +// +let IntrProperties = [IntrNoCallback] in { def int_nvvm_membar_cta : NVVMBuiltin, Intrinsic<[]>; def int_nvvm_membar_gl : NVVMBuiltin, Intrinsic<[]>; def int_nvvm_membar_sys : NVVMBuiltin, Intrinsic<[]>; def int_nvvm_fence_sc_cluster : Intrinsic<[]>; - } - // - // Proxy fence (uni-directional) - // + // Operation fence + def int_nvvm_fence_mbarrier_init_release_cluster: Intrinsic<[], [], [], + "llvm.nvvm.fence.mbarrier_init.release.cluster">; + + // Thread fence + def int_nvvm_fence_acquire_sync_restrict_space_cluster_scope_cluster : + Intrinsic<[], [], [], + "llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster">; + + def int_nvvm_fence_release_sync_restrict_space_cta_scope_cluster : + Intrinsic<[], [], [], + "llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster">; + +// +// Proxy fence (uni-directional) +// + + def int_nvvm_fence_proxy_async_generic_acquire_sync_restrict_space_cluster_scope_cluster : + Intrinsic<[], [], [], + "llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster">; + + def int_nvvm_fence_proxy_async_generic_release_sync_restrict_space_cta_scope_cluster : + Intrinsic<[], [], [], + "llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster">; + foreach scope = ["cta", "cluster", "gpu", "sys"] in { def int_nvvm_fence_proxy_tensormap_generic_release_ # scope : - Intrinsic<[], [], [IntrNoCallback], + Intrinsic<[], [], [], "llvm.nvvm.fence.proxy.tensormap_generic.release." # scope>; // The imm-arg 'size' can only be 128. def int_nvvm_fence_proxy_tensormap_generic_acquire_ # scope : - Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], - [IntrNoCallback, IntrArgMemOnly, ImmArg>, - Range, 128, 129>], - "llvm.nvvm.fence.proxy.tensormap_generic.acquire." # scope>; + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [], + "llvm.nvvm.fence.proxy.tensormap_generic.acquire." 
# scope> { + let IntrProperties = [IntrNoCallback, IntrArgMemOnly, + ImmArg>, Range, 128, 129>]; + } } +// +// Proxy fence (bi-directional) +// + foreach proxykind = ["alias", "async", "async.global", "async.shared_cta", + "async.shared_cluster"] in { + defvar Intr = IntrinsicName<"llvm.nvvm.fence.proxy." # proxykind>; + def Intr.record_name: Intrinsic<[], [], [], Intr.intr_name>; + } +} + // // Async Copy // diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index db99885121ec1..6abde996e6dc8 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -1579,6 +1579,99 @@ def CONVERGENCECTRL_GLUE : StandardPseudoInstruction { } } +/// Allow a target to replace the instruction definition of a +/// StandardPseudoInstruction. A target should only define one +/// instance of this per instruction. +/// +/// This is intended to allow targets to specify the register class +/// used for pointers. It should not be used to change the fundamental +/// operand structure (e.g., this should not add or remove operands, +/// or change the operand types). +class TargetSpecializedStandardPseudoInstruction< + StandardPseudoInstruction base_inst> : Instruction { + + StandardPseudoInstruction Instruction = base_inst; + let OutOperandList = base_inst.OutOperandList; + let InOperandList = base_inst.InOperandList; + + // TODO: Copy everything + let usesCustomInserter = base_inst.usesCustomInserter; + let hasSideEffects = base_inst.hasSideEffects; + let mayLoad = base_inst.mayLoad; + let mayStore = base_inst.mayStore; + let isTerminator = base_inst.isTerminator; + let isBranch = base_inst.isBranch; + let isIndirectBranch = base_inst.isIndirectBranch; + let isEHScopeReturn = base_inst.isEHScopeReturn; + let isReturn = base_inst.isReturn; + let isCall = base_inst.isCall; + let hasCtrlDep = base_inst.hasCtrlDep; + let isReMaterializable = base_inst.isReMaterializable; + let isMeta = base_inst.isMeta; + let Size = base_inst.Size; + let isAsCheapAsAMove = base_inst.isAsCheapAsAMove; + let isPseudo = true; + let hasNoSchedulingInfo = true; + let isNotDuplicable = base_inst.isNotDuplicable; + let isConvergent = base_inst.isConvergent; + let hasExtraSrcRegAllocReq = base_inst.hasExtraSrcRegAllocReq; + let hasExtraDefRegAllocReq = base_inst.hasExtraDefRegAllocReq; +} + +// All pseudo instructions which need a pointer register class, which +// should be specialized by a target. +defvar PseudosWithPtrOps = [ + LOAD_STACK_GUARD, + PREALLOCATED_ARG, + PATCHABLE_EVENT_CALL, + PATCHABLE_TYPED_EVENT_CALL +]; + + +/// Replace PointerLikeRegClass operands in OperandList with new_rc. +class RemapPointerOperandList { + // Collect the set of names so we can query and rewrite them. + list op_names = !foreach(i, !range(!size(OperandList)), + !getdagname(OperandList, i)); + + // Beautiful language. This would be a lot easier if !getdagarg + // didn't require a specific type. We can't just collect a list of + // the operand values and reconstruct the dag, since there isn't a + // common base class for all the field kinds used in + // pseudoinstruction definitions; therefore everything must be + // maintained as a dag, so use a foldl. Additionally, ? doesn't + // evaluate as false so we get even more noise. 
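
+  // For example, remapping LOAD_STACK_GUARD's (outs ptr_rc:$dst) with
+  // new_rc = XRegs produces (outs XRegs:$dst); operands that are not a
+  // PointerLikeRegClass are passed through unchanged.
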
+ dag ret = + !foldl(OperandList, op_names, acc, name, + !cond( + !initialized(!getdagarg(OperandList, name)) + : !setdagarg(acc, name, new_rc), + !initialized(!getdagarg(OperandList, name)) : acc, + !initialized(!getdagarg(OperandList, name)) : acc + ) + ); +} + +/// Define an override for a pseudoinstruction which uses a pointer +/// register class, specialized to the target's pointer type. +class RemapPointerOperands : + TargetSpecializedStandardPseudoInstruction { + let OutOperandList = + RemapPointerOperandList.ret; + let InOperandList = + RemapPointerOperandList.ret; +} + +/// Helper to replace all pseudoinstructions using pointers to a +/// target register class. Most targets should use this. +multiclass RemapAllTargetPseudoPointerOperands< + RegisterClassLike default_ptr_rc> { + foreach inst = PseudosWithPtrOps in { + def : RemapPointerOperands; + } +} + // Generic opcodes used in GlobalISel. include "llvm/Target/GenericOpcodes.td" diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 0426ac7e62fab..45369f0ffe137 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1183,10 +1183,9 @@ InstructionCost TargetTransformInfo::getMemoryOpCost( } InstructionCost TargetTransformInfo::getMaskedMemoryOpCost( - unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, + const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { - InstructionCost Cost = TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, - AddressSpace, CostKind); + InstructionCost Cost = TTIImpl->getMaskedMemoryOpCost(MICA, CostKind); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 9dfb6af58323a..1099faca9fa46 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -459,6 +459,15 @@ void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute, addBlock(Die, Attribute, Block->BestForm(), Block); } +void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute, + const DIExpression *Expr) { + DIELoc *Loc = new (DIEValueAllocator) DIELoc; + DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); + DwarfExpr.setMemoryLocationKind(); + DwarfExpr.addExpression(Expr); + addBlock(Die, Attribute, DwarfExpr.finalize()); +} + void DwarfUnit::addSourceLine(DIE &Die, unsigned Line, unsigned Column, const DIFile *File) { if (Line == 0) @@ -842,27 +851,14 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIStringType *STy) { if (auto *VarDIE = getDIE(Var)) addDIEEntry(Buffer, dwarf::DW_AT_string_length, *VarDIE); } else if (DIExpression *Expr = STy->getStringLengthExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - // This is to describe the memory location of the - // length of a Fortran deferred length string, so - // lock it down as such. 
- DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(Buffer, dwarf::DW_AT_string_length, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_string_length, Expr); } else { uint64_t Size = STy->getSizeInBits() >> 3; addUInt(Buffer, dwarf::DW_AT_byte_size, std::nullopt, Size); } if (DIExpression *Expr = STy->getStringLocationExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - // This is to describe the memory location of the - // string, so lock it down as such. - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(Buffer, dwarf::DW_AT_data_location, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_data_location, Expr); } if (STy->getEncoding()) { @@ -1618,11 +1614,7 @@ void DwarfUnit::constructSubrangeDIE(DIE &DW_Subrange, const DISubrangeType *SR, if (auto *VarDIE = getDIE(BV)) addDIEEntry(DW_Subrange, Attr, *VarDIE); } else if (auto *BE = dyn_cast_if_present(Bound)) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(BE); - addBlock(DW_Subrange, Attr, DwarfExpr.finalize()); + addBlock(DW_Subrange, Attr, BE); } else if (auto *BI = dyn_cast_if_present(Bound)) { if (Attr == dwarf::DW_AT_GNU_bias) { if (BI->getSExtValue() != 0) @@ -1660,11 +1652,7 @@ void DwarfUnit::constructSubrangeDIE(DIE &Buffer, const DISubrange *SR) { if (auto *VarDIE = getDIE(BV)) addDIEEntry(DW_Subrange, Attr, *VarDIE); } else if (auto *BE = dyn_cast_if_present(Bound)) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(BE); - addBlock(DW_Subrange, Attr, DwarfExpr.finalize()); + addBlock(DW_Subrange, Attr, BE); } else if (auto *BI = dyn_cast_if_present(Bound)) { if (Attr == dwarf::DW_AT_count) { if (BI->getSExtValue() != -1) @@ -1710,11 +1698,7 @@ void DwarfUnit::constructGenericSubrangeDIE(DIE &Buffer, addSInt(DwGenericSubrange, Attr, dwarf::DW_FORM_sdata, BE->getElement(1)); } else { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(BE); - addBlock(DwGenericSubrange, Attr, DwarfExpr.finalize()); + addBlock(DwGenericSubrange, Attr, BE); } } }; @@ -1781,44 +1765,28 @@ void DwarfUnit::constructArrayTypeDIE(DIE &Buffer, const DICompositeType *CTy) { if (auto *VarDIE = getDIE(Var)) addDIEEntry(Buffer, dwarf::DW_AT_data_location, *VarDIE); } else if (DIExpression *Expr = CTy->getDataLocationExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(Buffer, dwarf::DW_AT_data_location, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_data_location, Expr); } if (DIVariable *Var = CTy->getAssociated()) { if (auto *VarDIE = getDIE(Var)) addDIEEntry(Buffer, dwarf::DW_AT_associated, *VarDIE); } else if (DIExpression *Expr = CTy->getAssociatedExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(Buffer, dwarf::DW_AT_associated, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_associated, Expr); } if (DIVariable *Var = CTy->getAllocated()) { if (auto *VarDIE = getDIE(Var)) addDIEEntry(Buffer, 
dwarf::DW_AT_allocated, *VarDIE); } else if (DIExpression *Expr = CTy->getAllocatedExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(Buffer, dwarf::DW_AT_allocated, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_allocated, Expr); } if (auto *RankConst = CTy->getRankConst()) { addSInt(Buffer, dwarf::DW_AT_rank, dwarf::DW_FORM_sdata, RankConst->getSExtValue()); } else if (auto *RankExpr = CTy->getRankExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(RankExpr); - addBlock(Buffer, dwarf::DW_AT_rank, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_rank, RankExpr); } if (auto *BitStride = CTy->getBitStrideConst()) { diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h index 7841ff7fa5952..6875c415057d2 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -218,6 +218,9 @@ class DwarfUnit : public DIEUnit { void addBlock(DIE &Die, dwarf::Attribute Attribute, dwarf::Form Form, DIEBlock *Block); + /// Add an expression as block data. + void addBlock(DIE &Die, dwarf::Attribute Attribute, const DIExpression *Expr); + /// Add location information to specified debug information entry. void addSourceLine(DIE &Die, unsigned Line, unsigned Column, const DIFile *File); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index bf195ca210e9b..0bae00bafee3c 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4730,12 +4730,12 @@ bool AArch64TTIImpl::prefersVectorizedAddressing() const { } InstructionCost -AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, - Align Alignment, unsigned AddressSpace, +AArch64TTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { + Type *Src = MICA.getDataType(); + if (useNeonVector(Src)) - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); + return BaseT::getMaskedMemoryOpCost(MICA, CostKind); auto LT = getTypeLegalizationCost(Src); if (!LT.first.isValid()) return InstructionCost::getInvalid(); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index d189f563f99a1..6cc4987428567 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -188,8 +188,7 @@ class AArch64TTIImpl final : public BasicTTIImplBase { unsigned Opcode2) const; InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override; InstructionCost diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index ca98b80787fb4..a87f9f274a4d3 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1423,7 +1423,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, /// \returns true if the target has packed f32 instructions that only read 32 /// bits from a scalar operand (SGPR or literal) and replicates the bits to /// both channels. 
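
  /// For example, a V_PK_ADD_F32 whose scalar source is a 64-bit literal uses
  /// only the low 32 bits of that literal for both f32 lanes.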
- bool hasPKF32InstsReplicatingLow32BitsOfScalarInput() const { + bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const { return getGeneration() == GFX12 && GFX1250Insts; } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index cfc0c16ea30e5..8a959125de402 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -766,29 +766,21 @@ static void appendFoldCandidate(SmallVectorImpl &FoldList, FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp)); } -// Returns true if the instruction is a packed f32 instruction that only reads -// 32 bits from a scalar operand (SGPR or literal) and replicates the bits to -// both channels. -static bool -isPKF32InstrReplicatingLow32BitsOfScalarInput(const GCNSubtarget *ST, - MachineInstr *MI) { - if (!ST->hasPKF32InstsReplicatingLow32BitsOfScalarInput()) +// Returns true if the instruction is a packed F32 instruction and the +// corresponding scalar operand reads 32 bits and replicates the bits to both +// channels. +static bool isPKF32InstrReplicatesLower32BitsOfScalarOperand( + const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo) { + if (!ST->hasPKF32InstsReplicatingLower32BitsOfScalarInput()) return false; - switch (MI->getOpcode()) { - case AMDGPU::V_PK_ADD_F32: - case AMDGPU::V_PK_MUL_F32: - case AMDGPU::V_PK_FMA_F32: - return true; - default: - return false; - } - llvm_unreachable("unknown instruction"); + const MCOperandInfo &OpDesc = MI->getDesc().operands()[OpNo]; + return OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32; } // Packed FP32 instructions only read 32 bits from a scalar operand (SGPR or // literal) and replicates the bits to both channels. Therefore, if the hi and // lo are not same, we can't fold it. -static bool checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput( +static bool checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand( const FoldableDef &OpToFold) { assert(OpToFold.isImm() && "Expected immediate operand"); uint64_t ImmVal = OpToFold.getEffectiveImmVal().value(); @@ -953,8 +945,8 @@ bool SIFoldOperandsImpl::tryAddToFoldList( // Special case for PK_F32 instructions if we are trying to fold an imm to // src0 or src1. 
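
  // Only immediates whose high and low 32 bits are identical (e.g. 1.0f
  // replicated into both halves) can be folded, since only the low 32 bits
  // are read and replicated.
  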
if (OpToFold.isImm() && - isPKF32InstrReplicatingLow32BitsOfScalarInput(ST, MI) && - !checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(OpToFold)) + isPKF32InstrReplicatesLower32BitsOfScalarOperand(ST, MI, OpNo) && + !checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(OpToFold)) return false; appendFoldCandidate(FoldList, MI, OpNo, OpToFold); @@ -1171,8 +1163,8 @@ bool SIFoldOperandsImpl::tryToFoldACImm( return false; if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) { - if (isPKF32InstrReplicatingLow32BitsOfScalarInput(ST, UseMI) && - !checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(OpToFold)) + if (isPKF32InstrReplicatesLower32BitsOfScalarOperand(ST, UseMI, UseOpIdx) && + !checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(OpToFold)) return false; appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold); return true; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 24f58a68c345d..d12b802fe234f 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1631,20 +1631,22 @@ InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, } InstructionCost -ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, +ARMTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { + unsigned IID = MICA.getID(); + Type *Src = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + unsigned AddressSpace = MICA.getAddressSpace(); if (ST->hasMVEIntegerOps()) { - if (Opcode == Instruction::Load && + if (IID == Intrinsic::masked_load && isLegalMaskedLoad(Src, Alignment, AddressSpace)) return ST->getMVEVectorCostFactor(CostKind); - if (Opcode == Instruction::Store && + if (IID == Intrinsic::masked_store && isLegalMaskedStore(Src, Alignment, AddressSpace)) return ST->getMVEVectorCostFactor(CostKind); } if (!isa(Src)) - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); + return BaseT::getMaskedMemoryOpCost(MICA, CostKind); // Scalar cost, which is currently very high due to the efficiency of the // generated code. 
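
  // Assume on the order of eight instructions per scalarized element.
  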
return cast(Src)->getNumElements() * 8; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 0810c5532ed91..919a6fc9fd0b0 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -275,8 +275,7 @@ class ARMTTIImpl final : public BasicTTIImplBase { const Instruction *I = nullptr) const override; InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override; InstructionCost getInterleavedMemoryOpCost( diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index e925e041eb64e..8f3f0cc8abb01 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -224,11 +224,9 @@ InstructionCost HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, } InstructionCost -HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, - Align Alignment, unsigned AddressSpace, +HexagonTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); + return BaseT::getMaskedMemoryOpCost(MICA, CostKind); } InstructionCost diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index cec2bf9656ffc..e95b5a10b76a7 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -120,8 +120,7 @@ class HexagonTTIImpl final : public BasicTTIImplBase { TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I = nullptr) const override; InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override; InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index bcdb46eca9744..cd7bc37942ca4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -364,7 +364,42 @@ def INT_FENCE_SC_CLUSTER: NullaryInst<"fence.sc.cluster", int_nvvm_fence_sc_cluster>, Requires<[hasPTX<78>, hasSM<90>]>; +def INT_FENCE_MBARRIER_INIT_RELEASE_CLUSTER: + NullaryInst<"fence.mbarrier_init.release.cluster", + int_nvvm_fence_mbarrier_init_release_cluster>, + Requires<[hasPTX<80>, hasSM<90>]>; + +let Predicates = [hasPTX<86>, hasSM<90>] in { +def INT_FENCE_ACQUIRE_SYNC_RESTRICT_CLUSTER_CLUSTER: + NullaryInst<"fence.acquire.sync_restrict::shared::cluster.cluster", + int_nvvm_fence_acquire_sync_restrict_space_cluster_scope_cluster>; + +def INT_FENCE_RELEASE_SYNC_RESTRICT_CTA_CLUSTER: + NullaryInst<"fence.release.sync_restrict::shared::cta.cluster", + int_nvvm_fence_release_sync_restrict_space_cta_scope_cluster>; +} + // Proxy fence (uni-directional) +let Predicates = [hasPTX<86>, hasSM<90>] in { +def INT_NVVM_FENCE_PROXY_ASYNC_GENERIC_ACQUIRE_SYNC_RESTRICT_SPACE_CLUSTER_SCOPE_CLUSTER: + NullaryInst<"fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster", + 
int_nvvm_fence_proxy_async_generic_acquire_sync_restrict_space_cluster_scope_cluster>; + +def INT_NVVM_FENCE_PROXY_ASYNC_GENERIC_RELEASE_SYNC_RESTRICT_SPACE_CTA_SCOPE_CLUSTER: + NullaryInst<"fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster", + int_nvvm_fence_proxy_async_generic_release_sync_restrict_space_cta_scope_cluster>; +} + +// Proxy fence (bi-directional) +foreach proxykind = ["alias", "async", "async.global", "async.shared_cta", + "async.shared_cluster"] in { + defvar Preds = !if(!eq(proxykind, "alias"), [hasPTX<75>, hasSM<70>], + [hasPTX<80>, hasSM<90>]); + defvar Intr = IntrinsicName<"llvm.nvvm.fence.proxy." # proxykind>; + def : NullaryInst<"fence.proxy." # !subst("_", "::", proxykind), + !cast(Intr.record_name)>, Requires; +} + class FENCE_PROXY_TENSORMAP_GENERIC_RELEASE : NullaryInst<"fence.proxy.tensormap::generic.release." # Scope, Intr>, Requires<[hasPTX<83>, hasSM<90>]>; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index dca6e9cffebb0..1a1a93a9cb178 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1008,13 +1008,17 @@ InstructionCost RISCVTTIImpl::getScalarizationOverhead( } InstructionCost -RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, +RISCVTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { + unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load + : Instruction::Store; + Type *Src = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + unsigned AddressSpace = MICA.getAddressSpace(); + if (!isLegalMaskedLoadStore(Src, Alignment) || CostKind != TTI::TCK_RecipThroughput) - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); + return BaseT::getMaskedMemoryOpCost(MICA, CostKind); return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); } diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 6886e8964e29e..39c1173e2986c 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -144,8 +144,7 @@ class RISCVTTIImpl final : public BasicTTIImplBase { bool shouldConsiderVectorizationRegPressure() const override { return true; } InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override; InstructionCost diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 0b1430e373fc7..4b77bf925b2ba 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -5411,9 +5411,14 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, } InstructionCost -X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, - unsigned AddressSpace, +X86TTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { + unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? 
Instruction::Load + : Instruction::Store; + Type *SrcTy = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + unsigned AddressSpace = MICA.getAddressSpace(); + bool IsLoad = (Instruction::Load == Opcode); bool IsStore = (Instruction::Store == Opcode); @@ -6647,10 +6652,12 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( LegalVT.getVectorNumElements()); InstructionCost MemOpCost; bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps; - if (UseMaskedMemOp) - MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment, - AddressSpace, CostKind); - else + if (UseMaskedMemOp) { + unsigned IID = Opcode == Instruction::Load ? Intrinsic::masked_load + : Intrinsic::masked_store; + MemOpCost = getMaskedMemoryOpCost( + {IID, SingleMemOpTy, Alignment, AddressSpace}, CostKind); + } else MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace, CostKind); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index de5e1c297b1e4..df1393ce16ca1 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -183,8 +183,7 @@ class X86TTIImpl final : public BasicTTIImplBase { TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I = nullptr) const override; InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override; InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 94663ff928a0b..fa35eef2c00f5 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -1469,6 +1469,9 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr( Constant::getNullValue(F->getType())); Value *Select = Builder.CreateSelect(ICmp, JT, Constant::getNullValue(F->getType())); + + if (auto *SI = dyn_cast(Select)) + setExplicitlyUnknownBranchWeightsIfProfiled(*SI, DEBUG_TYPE); // For phi nodes, we need to update the incoming value for all operands // with the same predecessor. if (PN) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index c680b6fca84cd..aa52f9e2a53ca 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5251,8 +5251,10 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, const Align Alignment = getLoadStoreAlignment(I); InstructionCost Cost = 0; if (Legal->isMaskRequired(I)) { - Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, - CostKind); + unsigned IID = I->getOpcode() == Instruction::Load + ? 
Intrinsic::masked_load + : Intrinsic::masked_store; + Cost += TTI.getMaskedMemoryOpCost({IID, VectorTy, Alignment, AS}, CostKind); } else { TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index deb8ee2d88055..e33ff724ccdd5 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6904,9 +6904,10 @@ static bool isMaskedLoadCompress( ScalarLoadsCost; InstructionCost LoadCost = 0; if (IsMasked) { - LoadCost = - TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment, - LI->getPointerAddressSpace(), CostKind); + LoadCost = TTI.getMaskedMemoryOpCost({Intrinsic::masked_load, LoadVecTy, + CommonAlignment, + LI->getPointerAddressSpace()}, + CostKind); } else { LoadCost = TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment, @@ -7305,8 +7306,9 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( break; case LoadsState::CompressVectorize: VecLdCost += TTI.getMaskedMemoryOpCost( - Instruction::Load, SubVecTy, CommonAlignment, - LI0->getPointerAddressSpace(), CostKind) + + {Intrinsic::masked_load, SubVecTy, CommonAlignment, + LI0->getPointerAddressSpace()}, + CostKind) + VectorGEPCost + ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy, {}, CostKind); @@ -15102,8 +15104,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, CommonAlignment, LI0->getPointerAddressSpace(), CostKind); } else if (IsMasked) { VecLdCost = TTI->getMaskedMemoryOpCost( - Instruction::Load, LoadVecTy, CommonAlignment, - LI0->getPointerAddressSpace(), CostKind); + {Intrinsic::masked_load, LoadVecTy, CommonAlignment, + LI0->getPointerAddressSpace()}, + CostKind); // TODO: include this cost into CommonCost. VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 94657f5d39390..e89e91b959926 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3592,8 +3592,10 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, InstructionCost Cost = 0; if (IsMasked) { + unsigned IID = isa(this) ? Intrinsic::masked_load + : Intrinsic::masked_store; Cost += - Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind); + Ctx.TTI.getMaskedMemoryOpCost({IID, Ty, Alignment, AS}, Ctx.CostKind); } else { TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo( isa(this) ? getOperand(0) @@ -3711,8 +3713,10 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); unsigned AS = cast(Ctx.Types.inferScalarType(getAddr())) ->getAddressSpace(); + // FIXME: getMaskedMemoryOpCost assumes masked_* intrinsics. + // After migrating to getMemIntrinsicInstrCost, switch this to vp_load. 
InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( - Instruction::Load, Ty, Alignment, AS, Ctx.CostKind); + {Intrinsic::masked_load, Ty, Alignment, AS}, Ctx.CostKind); if (!Reverse) return Cost; @@ -3820,8 +3824,10 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); unsigned AS = cast(Ctx.Types.inferScalarType(getAddr())) ->getAddressSpace(); + // FIXME: getMaskedMemoryOpCost assumes masked_* intrinsics. + // After migrating to getMemIntrinsicInstrCost, switch this to vp_store. InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( - Instruction::Store, Ty, Alignment, AS, Ctx.CostKind); + {Intrinsic::masked_store, Ty, Alignment, AS}, Ctx.CostKind); if (!Reverse) return Cost; diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll new file mode 100644 index 0000000000000..d46408e31752f --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | %ptxas-verify -arch=sm_90 %} + +define void @test_nvvm_fence_proxy_async_generic_acquire_sync_restrict() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_generic_acquire_sync_restrict( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster() + ret void +} + +define void @test_nvvm_fence_proxy_async_generic_release_sync_restrict() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_generic_release_sync_restrict( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster() + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll new file mode 100644 index 0000000000000..896c624602a60 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %} + +define void @test_nvvm_fence_proxy_async() { +; CHECK-LABEL: test_nvvm_fence_proxy_async( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async() + ret void +} + +define void @test_nvvm_fence_proxy_async_global() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_global( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async.global; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async.global() + ret void +} + +define void @test_nvvm_fence_proxy_async_shared_cluster() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_shared_cluster( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: 
// %bb.0: +; CHECK-NEXT: fence.proxy.async.shared::cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async.shared_cluster() + ret void +} + +define void @test_nvvm_fence_proxy_async_shared_cta() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_shared_cta( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async.shared::cta; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async.shared_cta() + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll new file mode 100644 index 0000000000000..ab35e4fb396d6 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll @@ -0,0 +1,8 @@ +; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx83 -o /dev/null 2>&1 | FileCheck %s + +define void @test_fence_proxy_tensormap_generic_acquire(ptr addrspace(0) %addr) { + ; CHECK: immarg value 130 out of range [128, 129) + call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr addrspace(0) %addr, i32 130); + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy.ll b/llvm/test/CodeGen/NVPTX/fence-proxy.ll new file mode 100644 index 0000000000000..cb5679e68944d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-proxy.ll @@ -0,0 +1,15 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx75 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_70 && ptxas-isa-7.5 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx75 | %ptxas-verify -arch=sm_70 %} + +define void @test_nvvm_fence_proxy_alias() { +; CHECK-LABEL: test_nvvm_fence_proxy_alias( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.alias; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.alias() + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/op-fence.ll b/llvm/test/CodeGen/NVPTX/op-fence.ll new file mode 100644 index 0000000000000..629b702742afb --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/op-fence.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %} + +; CHECK-LABEL: test_fence_mbarrier_init +define void @test_fence_mbarrier_init() { +; CHECK-LABEL: test_fence_mbarrier_init( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.mbarrier_init.release.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.mbarrier_init.release.cluster(); + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/thread-fence.ll b/llvm/test/CodeGen/NVPTX/thread-fence.ll new file mode 100644 index 0000000000000..185461bd183d0 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/thread-fence.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | %ptxas-verify -arch=sm_90 %} + +; CHECK-LABEL: test_fence_acquire +define void @test_fence_acquire() { +; CHECK-LABEL: test_fence_acquire( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; 
CHECK-NEXT: fence.acquire.sync_restrict::shared::cluster.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster(); + + ret void +} + +; CHECK-LABEL: test_fence_release +define void @test_fence_release() { +; CHECK-LABEL: test_fence_release( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.release.sync_restrict::shared::cta.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster(); + + ret void +} diff --git a/llvm/test/TableGen/target-specialized-pseudos.td b/llvm/test/TableGen/target-specialized-pseudos.td new file mode 100644 index 0000000000000..99c63f3ec29d9 --- /dev/null +++ b/llvm/test/TableGen/target-specialized-pseudos.td @@ -0,0 +1,101 @@ +// RUN: llvm-tblgen -gen-instr-info -I %p/../../include %s -DONECASE -o - | FileCheck -check-prefixes=CHECK,ONECASE %s +// RUN: llvm-tblgen -gen-instr-info -I %p/../../include %s -DALLCASES -o - | FileCheck -check-prefixes=CHECK,ALLCASES %s +// RUN: not llvm-tblgen -gen-instr-info -I %p/../../include %s -DERROR -o /dev/null 2>&1 | FileCheck -check-prefix=ERROR %s + +// CHECK: namespace llvm::MyTarget { +// CHECK: enum { +// CHECK: LOAD_STACK_GUARD = [[LOAD_STACK_GUARD_OPCODE:[0-9]+]], +// CHECK: PREALLOCATED_ARG = [[PREALLOCATED_ARG_OPCODE:[0-9]+]], +// CHECK: PATCHABLE_EVENT_CALL = [[PATCHABLE_EVENT_CALL_OPCODE:[0-9]+]], +// CHECK: PATCHABLE_TYPED_EVENT_CALL = [[PATCHABLE_TYPED_EVENT_CALL_OPCODE:[0-9]+]], + +// Make sure no enum entry is emitted for MY_LOAD_STACK_GUARD +// CHECK: G_UBFX = [[G_UBFX_OPCODE:[0-9]+]], +// CHECK-NEXT: MY_MOV = [[MY_MOV_OPCODE:[0-9]+]], +// CHECK-NEXT: INSTRUCTION_LIST_END = [[INSTR_LIST_END_OPCODE:[0-9]+]] + + +// CHECK: extern const MyTargetInstrTable MyTargetDescs = { +// CHECK-NEXT: { +// CHECK-NEXT: { [[MY_MOV_OPCODE]], 2, 1, 2, 0, 0, 0, {{[0-9]+}}, MyTargetImpOpBase + 0, 0|(1ULL< + : Register { + let Namespace = "MyTarget"; +} + +class MyClass types, dag registers> + : RegisterClass<"MyTarget", types, size, registers> { + let Size = size; +} + +def X0 : MyReg<"x0">; +def X1 : MyReg<"x1">; +def XRegs : RegisterClass<"MyTarget", [i64], 64, (add X0, X1)>; + + +class TestInstruction : Instruction { + let Size = 2; + let Namespace = "MyTarget"; + let hasSideEffects = false; +} + +#ifdef ONECASE + +// Example setting the pointer register class manually +def MY_LOAD_STACK_GUARD : + TargetSpecializedStandardPseudoInstruction { + let Namespace = "MyTarget"; + let OutOperandList = (outs XRegs:$dst); +} + +#endif + +#ifdef ALLCASES + +defm my_remaps : RemapAllTargetPseudoPointerOperands; + +#endif + + +#ifdef ERROR + +def MY_LOAD_STACK_GUARD_0 : TargetSpecializedStandardPseudoInstruction; + +// ERROR: :[[@LINE+1]]:5: error: multiple overrides of 'LOAD_STACK_GUARD' defined +def MY_LOAD_STACK_GUARD_1 : TargetSpecializedStandardPseudoInstruction; + +#endif + +def MY_MOV : TestInstruction { + let OutOperandList = (outs XRegs:$dst); + let InOperandList = (ins XRegs:$src); + let AsmString = "my_mov $dst, $src"; +} + + +def MyTargetISA : InstrInfo; +def MyTarget : Target { let InstructionSet = MyTargetISA; } diff --git a/llvm/test/Transforms/LowerTypeTests/function-weak.ll b/llvm/test/Transforms/LowerTypeTests/function-weak.ll index 4ea03b6c2c1fa..dbbe8fa4a0a9a 100644 --- a/llvm/test/Transforms/LowerTypeTests/function-weak.ll +++ b/llvm/test/Transforms/LowerTypeTests/function-weak.ll @@ -32,10 +32,10 @@ target triple = "x86_64-unknown-linux-gnu" declare !type !0 extern_weak void @f() ; 
CHECK: define zeroext i1 @check_f() -define zeroext i1 @check_f() { +define zeroext i1 @check_f() !prof !{!"function_entry_count", i32 10} { entry: ; CHECK: [[CMP:%.*]] = icmp ne ptr @f, null -; CHECK: [[SEL:%.*]] = select i1 [[CMP]], ptr @[[JT:.*]], ptr null +; CHECK: [[SEL:%.*]] = select i1 [[CMP]], ptr @[[JT:.*]], ptr null, !prof ![[SELPROF:[0-9]+]] ; CHECK: [[PTI:%.*]] = ptrtoint ptr [[SEL]] to i1 ; CHECK: ret i1 [[PTI]] ret i1 ptrtoint (ptr @f to i1) @@ -165,3 +165,4 @@ define i1 @foo(ptr %p) { ; CHECK-NEXT: } !0 = !{i32 0, !"typeid1"} +; CHECK: ![[SELPROF]] = !{!"unknown", !"lowertypetests"} \ No newline at end of file diff --git a/llvm/utils/TableGen/CodeGenMapTable.cpp b/llvm/utils/TableGen/CodeGenMapTable.cpp index e5025784d304d..35ec495b93ba2 100644 --- a/llvm/utils/TableGen/CodeGenMapTable.cpp +++ b/llvm/utils/TableGen/CodeGenMapTable.cpp @@ -80,6 +80,7 @@ #include "TableGenBackends.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/TableGen/CodeGenHelpers.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" @@ -549,9 +550,8 @@ void llvm::EmitMapTable(const RecordKeeper &Records, raw_ostream &OS) { if (InstrMapVec.empty()) return; - OS << "#ifdef GET_INSTRMAP_INFO\n"; - OS << "#undef GET_INSTRMAP_INFO\n"; - OS << "namespace llvm::" << NameSpace << " {\n\n"; + IfDefEmitter IfDef(OS, "GET_INSTRMAP_INFO"); + NamespaceEmitter NS(OS, ("llvm::" + NameSpace).str()); // Emit coulumn field names and their values as enums. emitEnums(OS, Records); @@ -574,6 +574,4 @@ void llvm::EmitMapTable(const RecordKeeper &Records, raw_ostream &OS) { // Emit map tables and the functions to query them. IMap.emitTablesWithFunc(OS); } - OS << "} // end namespace llvm::" << NameSpace << '\n'; - OS << "#endif // GET_INSTRMAP_INFO\n\n"; } diff --git a/llvm/utils/TableGen/Common/CodeGenTarget.cpp b/llvm/utils/TableGen/Common/CodeGenTarget.cpp index c0daac127f71a..e080ca0aa0b31 100644 --- a/llvm/utils/TableGen/Common/CodeGenTarget.cpp +++ b/llvm/utils/TableGen/Common/CodeGenTarget.cpp @@ -283,15 +283,25 @@ void CodeGenTarget::ComputeInstrsByEnum() const { assert(EndOfPredefines == getNumFixedInstructions() && "Missing generic opcode"); + [[maybe_unused]] unsigned SkippedInsts = 0; + for (const auto &[_, CGIUp] : InstMap) { const CodeGenInstruction *CGI = CGIUp.get(); if (CGI->Namespace != "TargetOpcode") { + + if (CGI->TheDef->isSubClassOf( + "TargetSpecializedStandardPseudoInstruction")) { + ++SkippedInsts; + continue; + } + InstrsByEnum.push_back(CGI); NumPseudoInstructions += CGI->TheDef->getValueAsBit("isPseudo"); } } - assert(InstrsByEnum.size() == InstMap.size() && "Missing predefined instr"); + assert(InstrsByEnum.size() + SkippedInsts == InstMap.size() && + "Missing predefined instr"); // All of the instructions are now in random order based on the map iteration. llvm::sort( diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index 32994c12aa98b..d46c9d811753a 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -72,6 +72,13 @@ class InstrInfoEmitter { using OperandInfoListTy = std::vector; using OperandInfoMapTy = std::map; + DenseMap + TargetSpecializedPseudoInsts; + + /// Compute mapping of opcodes which should have their definitions overridden + /// by a target version. + void buildTargetSpecializedPseudoInstsMap(); + /// Generate member functions in the target-specific GenInstrInfo class. 
   ///
   /// This method is used to custom expand TIIPredicate definitions.
@@ -216,6 +223,10 @@ InstrInfoEmitter::CollectOperandInfo(OperandInfoListTy &OperandInfoList,
   const CodeGenTarget &Target = CDP.getTargetInfo();
   unsigned Offset = 0;
   for (const CodeGenInstruction *Inst : Target.getInstructions()) {
+    auto OverrideEntry = TargetSpecializedPseudoInsts.find(Inst);
+    if (OverrideEntry != TargetSpecializedPseudoInsts.end())
+      Inst = OverrideEntry->second;
+
     OperandInfoTy OperandInfo = GetOperandInfo(*Inst);
     if (OperandInfoMap.try_emplace(OperandInfo, Offset).second) {
       OperandInfoList.push_back(OperandInfo);
@@ -859,6 +870,25 @@ void InstrInfoEmitter::emitTIIHelperMethods(raw_ostream &OS,
   }
 }
 
+void InstrInfoEmitter::buildTargetSpecializedPseudoInstsMap() {
+  ArrayRef<const Record *> SpecializedInsts = Records.getAllDerivedDefinitions(
+      "TargetSpecializedStandardPseudoInstruction");
+  const CodeGenTarget &Target = CDP.getTargetInfo();
+
+  for (const Record *SpecializedRec : SpecializedInsts) {
+    const CodeGenInstruction &SpecializedInst =
+        Target.getInstruction(SpecializedRec);
+    const Record *BaseInstRec = SpecializedRec->getValueAsDef("Instruction");
+
+    const CodeGenInstruction &BaseInst = Target.getInstruction(BaseInstRec);
+
+    if (!TargetSpecializedPseudoInsts.insert({&BaseInst, &SpecializedInst})
+             .second)
+      PrintFatalError(SpecializedRec, "multiple overrides of '" +
+                                          BaseInst.getName() + "' defined");
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // Main Output.
 //===----------------------------------------------------------------------===//
@@ -881,6 +911,8 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
 
   // Collect all of the operand info records.
   Timer.startTimer("Collect operand info");
+  buildTargetSpecializedPseudoInstsMap();
+
   OperandInfoListTy OperandInfoList;
   OperandInfoMapTy OperandInfoMap;
   unsigned OperandInfoSize =
@@ -963,6 +995,11 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
   for (const CodeGenInstruction *Inst : reverse(NumberedInstructions)) {
     // Keep a list of the instruction names.
     InstrNames.add(Inst->getName());
+
+    auto OverrideEntry = TargetSpecializedPseudoInsts.find(Inst);
+    if (OverrideEntry != TargetSpecializedPseudoInsts.end())
+      Inst = OverrideEntry->second;
+
     // Emit the record into the table.
     emitRecord(*Inst, --Num, InstrInfo, EmittedLists, OperandInfoMap, OS);
   }
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td
index 054c13a88a552..6b0c84d31d1ba 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td
@@ -44,4 +44,35 @@ def PartialEntityAccessOpInterface : OpInterface<"PartialEntityAccessOpInterface
   ];
 }
 
+def AddressOfGlobalOpInterface : OpInterface<"AddressOfGlobalOpInterface"> {
+  let cppNamespace = "::mlir::acc";
+
+  let description = [{
+    An interface for operations that compute the address of a global variable
+    or symbol.
+  }];
+
+  let methods = [
+    InterfaceMethod<"Get the symbol reference to the global", "::mlir::SymbolRefAttr",
+      "getSymbol", (ins)>,
+  ];
+}
+
+def GlobalVariableOpInterface : OpInterface<"GlobalVariableOpInterface"> {
+  let cppNamespace = "::mlir::acc";
+
+  let description = [{
+    An interface for operations that define global variables. This interface
+    provides a uniform way to query properties of global variables across
+    different dialects.
+  }];
+
+  let methods = [
+    InterfaceMethod<"Check if the global variable is constant", "bool",
+      "isConstant", (ins), [{
+        return false;
+      }]>,
+  ];
+}
+
 #endif // OPENACC_OPS_INTERFACES
diff --git a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
index 970d9304d8289..cad78df2fbb0b 100644
--- a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
@@ -97,7 +97,14 @@ def ACCImplicitRoutine : Pass<"acc-implicit-routine", "mlir::ModuleOp"> {
            "mlir::acc::DeviceType::None",
            "Target device type for implicit routine generation. "
            "Ensures that `acc routine` device_type clauses are "
-           "properly considered not just default clauses.">
+           "properly considered not just default clauses.",
+           [{::llvm::cl::values(
+              clEnumValN(mlir::acc::DeviceType::None, "none", "none"),
+              clEnumValN(mlir::acc::DeviceType::Host, "host", "host"),
+              clEnumValN(mlir::acc::DeviceType::Multicore, "multicore", "multicore"),
+              clEnumValN(mlir::acc::DeviceType::Nvidia, "nvidia", "nvidia"),
+              clEnumValN(mlir::acc::DeviceType::Radeon, "radeon", "radeon"))
+           }]>
   ];
 }
 
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index 8c9c137b8aebb..5749e6ded73ba 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -211,6 +211,24 @@ struct LLVMPointerPointerLikeModel
   Type getElementType(Type pointer) const { return Type(); }
 };
 
+struct MemrefAddressOfGlobalModel
+    : public AddressOfGlobalOpInterface::ExternalModel<
+          MemrefAddressOfGlobalModel, memref::GetGlobalOp> {
+  SymbolRefAttr getSymbol(Operation *op) const {
+    auto getGlobalOp = cast<memref::GetGlobalOp>(op);
+    return getGlobalOp.getNameAttr();
+  }
+};
+
+struct MemrefGlobalVariableModel
+    : public GlobalVariableOpInterface::ExternalModel<
+          MemrefGlobalVariableModel, memref::GlobalOp> {
+  bool isConstant(Operation *op) const {
+    auto globalOp = cast<memref::GlobalOp>(op);
+    return globalOp.getConstant();
+  }
+};
+
 /// Helper function for any of the times we need to modify an ArrayAttr based on
 /// a device type list. Returns a new ArrayAttr with all of the
 /// existingDeviceTypes, plus the effective new ones(or an added none if hte new
@@ -302,6 +320,11 @@ void OpenACCDialect::initialize() {
       MemRefPointerLikeModel>(*getContext());
   LLVM::LLVMPointerType::attachInterface<LLVMPointerPointerLikeModel>(
       *getContext());
+
+  // Attach operation interfaces
+  memref::GetGlobalOp::attachInterface<MemrefAddressOfGlobalModel>(
+      *getContext());
+  memref::GlobalOp::attachInterface<MemrefGlobalVariableModel>(*getContext());
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/unittests/Dialect/OpenACC/CMakeLists.txt b/mlir/unittests/Dialect/OpenACC/CMakeLists.txt
index 177c8680b0040..c8c2bb96b0539 100644
--- a/mlir/unittests/Dialect/OpenACC/CMakeLists.txt
+++ b/mlir/unittests/Dialect/OpenACC/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_unittest(MLIROpenACCTests
   OpenACCOpsTest.cpp
+  OpenACCOpsInterfacesTest.cpp
   OpenACCUtilsTest.cpp
 )
 mlir_target_link_libraries(MLIROpenACCTests
diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCOpsInterfacesTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCOpsInterfacesTest.cpp
new file mode 100644
index 0000000000000..261f5c513ea24
--- /dev/null
+++ b/mlir/unittests/Dialect/OpenACC/OpenACCOpsInterfacesTest.cpp
@@ -0,0 +1,95 @@
+//===- OpenACCOpsInterfacesTest.cpp - Unit tests for OpenACC interfaces --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/OwningOpRef.h"
+#include "gtest/gtest.h"
+
+using namespace mlir;
+using namespace mlir::acc;
+
+//===----------------------------------------------------------------------===//
+// Test Fixture
+//===----------------------------------------------------------------------===//
+
+class OpenACCOpsInterfacesTest : public ::testing::Test {
+protected:
+  OpenACCOpsInterfacesTest()
+      : context(), builder(&context), loc(UnknownLoc::get(&context)) {
+    context.loadDialect<acc::OpenACCDialect, memref::MemRefDialect>();
+  }
+
+  MLIRContext context;
+  OpBuilder builder;
+  Location loc;
+};
+
+//===----------------------------------------------------------------------===//
+// GlobalVariableOpInterface Tests
+//===----------------------------------------------------------------------===//
+
+TEST_F(OpenACCOpsInterfacesTest, GlobalVariableOpInterfaceNonConstant) {
+  // Test that a non-constant global returns false for isConstant()
+
+  auto memrefType = MemRefType::get({10}, builder.getF32Type());
+  OwningOpRef<memref::GlobalOp> globalOp = memref::GlobalOp::create(
+      builder, loc,
+      /*sym_name=*/builder.getStringAttr("mutable_global"),
+      /*sym_visibility=*/builder.getStringAttr("private"),
+      /*type=*/TypeAttr::get(memrefType),
+      /*initial_value=*/Attribute(),
+      /*constant=*/UnitAttr(),
+      /*alignment=*/IntegerAttr());
+
+  auto globalVarIface =
+      dyn_cast<GlobalVariableOpInterface>(globalOp->getOperation());
+  ASSERT_TRUE(globalVarIface != nullptr);
+  EXPECT_FALSE(globalVarIface.isConstant());
+}
+
+TEST_F(OpenACCOpsInterfacesTest, GlobalVariableOpInterfaceConstant) {
+  // Test that a constant global returns true for isConstant()
+
+  auto memrefType = MemRefType::get({5}, builder.getI32Type());
+  OwningOpRef<memref::GlobalOp> constantGlobalOp = memref::GlobalOp::create(
+      builder, loc,
+      /*sym_name=*/builder.getStringAttr("constant_global"),
+      /*sym_visibility=*/builder.getStringAttr("public"),
+      /*type=*/TypeAttr::get(memrefType),
+      /*initial_value=*/Attribute(),
+      /*constant=*/builder.getUnitAttr(),
+      /*alignment=*/IntegerAttr());
+
+  auto globalVarIface =
+      dyn_cast<GlobalVariableOpInterface>(constantGlobalOp->getOperation());
+  ASSERT_TRUE(globalVarIface != nullptr);
+  EXPECT_TRUE(globalVarIface.isConstant());
+}
+
+//===----------------------------------------------------------------------===//
+// AddressOfGlobalOpInterface Tests
+//===----------------------------------------------------------------------===//
+
+TEST_F(OpenACCOpsInterfacesTest, AddressOfGlobalOpInterfaceGetSymbol) {
+  // Test that getSymbol() returns the correct symbol reference
+
+  auto memrefType = MemRefType::get({5}, builder.getI32Type());
+  const auto *symbolName = "test_global_symbol";
+
+  OwningOpRef<memref::GetGlobalOp> getGlobalOp = memref::GetGlobalOp::create(
+      builder, loc, memrefType, FlatSymbolRefAttr::get(&context, symbolName));
+
+  auto addrOfGlobalIface =
+      dyn_cast<AddressOfGlobalOpInterface>(getGlobalOp->getOperation());
+  ASSERT_TRUE(addrOfGlobalIface != nullptr);
+  EXPECT_EQ(addrOfGlobalIface.getSymbol().getLeafReference(), symbolName);
+}
diff --git a/orc-rt/lib/executor/TaskDispatcher.cpp b/orc-rt/lib/executor/TaskDispatcher.cpp
index 5f34627fb5150..9e42a66c2ea94 100644
--- a/orc-rt/lib/executor/TaskDispatcher.cpp
+++ b/orc-rt/lib/executor/TaskDispatcher.cpp
@@ -1,4 +1,4 @@
-//===- TaskDispatch.cpp ---------------------------------------------------===//
+//===- TaskDispatcher.cpp -------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Contains the implementation of APIs in the orc-rt/TaskDispatch.h header.
+// Contains the implementation of APIs in the orc-rt/TaskDispatcher.h header.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/orc-rt/lib/executor/ThreadPoolTaskDispatcher.cpp b/orc-rt/lib/executor/ThreadPoolTaskDispatcher.cpp
index d6d301302220d..4bf7e5df69654 100644
--- a/orc-rt/lib/executor/ThreadPoolTaskDispatcher.cpp
+++ b/orc-rt/lib/executor/ThreadPoolTaskDispatcher.cpp
@@ -1,4 +1,4 @@
-//===- ThreadPoolTaskDispatch.cpp -----------------------------------------===//
+//===- ThreadPoolTaskDispatcher.cpp ---------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Contains the implementation of APIs in the orc-rt/ThreadPoolTaskDispatch.h
+// Contains the implementation of APIs in the orc-rt/ThreadPoolTaskDispatcher.h
 // header.
 //
 //===----------------------------------------------------------------------===//
diff --git a/revert_patches.txt b/revert_patches.txt
index 9e465ba90ae6a..a3a76b6ac1e40 100644
--- a/revert_patches.txt
+++ b/revert_patches.txt
@@ -5,3 +5,6 @@ d57230c7 [AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions (#100485)
 breaks build of ROCmValidationSuite
 [C2y] Support WG14 N3457, the __COUNTER__ macro (#162662)
 ---
+Shore will help land downstream
+[AMDGPU] Adding instruction specific features (#167809)
+---
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index b027d82d98177..643f8ab03f724 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -2874,6 +2874,10 @@ llvm_target_lib_list = [lib for lib in [
             ["-gen-subtarget"],
             "lib/Target/NVPTX/NVPTXGenSubtargetInfo.inc",
         ),
+        (
+            ["-gen-sd-node-info"],
+            "lib/Target/NVPTX/NVPTXGenSDNodeInfo.inc",
+        ),
     ],
 },
 {