From 61372fc5db9b14fd612be8a58a76edd7f0ee38aa Mon Sep 17 00:00:00 2001 From: Helena Kotas Date: Tue, 10 Sep 2024 12:52:02 -0700 Subject: [PATCH 001/114] [HLSL] Warn on duplicate is_rov attribute; remove unnecessary parentheses (#107973) We should issue a warning whenever a duplicate resource type attribute is found. Currently we do that only for `resource_class`. This PR fixes that by checking for duplicate `is_rov` attributes as well. Also removes unnecessary parenthesis on `is_rov`. --- clang/lib/AST/TypePrinter.cpp | 2 +- clang/lib/Sema/SemaHLSL.cpp | 4 ++++ clang/lib/Sema/SemaType.cpp | 6 +++++- clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl | 6 +++--- clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl | 8 ++++---- clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl | 2 +- 6 files changed, 18 insertions(+), 10 deletions(-) diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index add6a5d10d61f..be627a6242eb4 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -2077,7 +2077,7 @@ void TypePrinter::printHLSLAttributedResourceAfter( << HLSLResourceClassAttr::ConvertResourceClassToStr(Attrs.ResourceClass) << ")]]"; if (Attrs.IsROV) - OS << " [[hlsl::is_rov()]]"; + OS << " [[hlsl::is_rov]]"; } void TypePrinter::printObjCInterfaceBefore(const ObjCInterfaceType *T, diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index f2158226e6ca7..4e44813fe515c 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -592,6 +592,10 @@ bool clang::CreateHLSLAttributedResourceType(Sema &S, QualType Wrapped, break; } case attr::HLSLROV: + if (ResAttrs.IsROV) { + S.Diag(A->getLocation(), diag::warn_duplicate_attribute_exact) << A; + return false; + } ResAttrs.IsROV = true; break; default: diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 520dce870b7b7..e627fee51b66b 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -8844,7 +8844,11 @@ static void processTypeAttrs(TypeProcessingState &state, QualType &type, } case ParsedAttr::AT_HLSLResourceClass: case ParsedAttr::AT_HLSLROV: { - if (state.getSema().HLSL().handleResourceTypeAttr(attr)) + // Only collect HLSL resource type attributes that are in + // decl-specifier-seq; do not collect attributes on declarations or those + // that get to slide after declaration name. 
+ if (TAL == TAL_DeclSpec && + state.getSema().HLSL().handleResourceTypeAttr(attr)) attr.setUsedAsTypeAttr(); break; } diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl index 24c85c6ccf7d7..cf21ec4d380db 100644 --- a/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl +++ b/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl @@ -1,16 +1,16 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s // CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition -// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:68 h '__hlsl_resource_t {{\[\[}}hlsl::resource_class(UAV)]] {{\[\[}}hlsl::is_rov()]]':'__hlsl_resource_t' +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:68 h '__hlsl_resource_t {{\[\[}}hlsl::resource_class(UAV)]] {{\[\[}}hlsl::is_rov]]':'__hlsl_resource_t' struct MyBuffer { __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] h; }; -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:66 res '__hlsl_resource_t {{\[\[}}hlsl::resource_class(SRV)]] {{\[\[}}hlsl::is_rov()]]':'__hlsl_resource_t' +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:66 res '__hlsl_resource_t {{\[\[}}hlsl::resource_class(SRV)]] {{\[\[}}hlsl::is_rov]]':'__hlsl_resource_t' __hlsl_resource_t [[hlsl::is_rov]] [[hlsl::resource_class(SRV)]] res; // CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:14:6 f 'void () -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 r '__hlsl_resource_t {{\[\[}}hlsl::resource_class(Sampler)]] {{\[\[}}hlsl::is_rov()]]':'__hlsl_resource_t' +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 r '__hlsl_resource_t {{\[\[}}hlsl::resource_class(Sampler)]] {{\[\[}}hlsl::is_rov]]':'__hlsl_resource_t' void f() { __hlsl_resource_t [[hlsl::resource_class(Sampler)]] [[hlsl::is_rov]] r; } diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl index 68b2d9ecb190a..15685bd1a3baa 100644 --- a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl +++ b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl @@ -1,10 +1,10 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify // expected-error@+1{{'is_rov' attribute cannot be applied to a declaration}} -[[hlsl::is_rov()]] __hlsl_resource_t res0; +[[hlsl::is_rov]] __hlsl_resource_t res0; // expected-error@+1{{HLSL resource needs to have [[hlsl::resource_class()]] attribute}} -__hlsl_resource_t [[hlsl::is_rov()]] res1; +__hlsl_resource_t [[hlsl::is_rov]] res1; // expected-error@+1{{'is_rov' attribute takes no arguments}} __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(3)]] res2; @@ -12,5 +12,5 @@ __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(3)]] res2; // expected-error@+1{{use of undeclared identifier 'gibberish'}} __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(gibberish)]] res3; -// duplicate attribute with the same meaning - no error -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov()]] [[hlsl::is_rov()]] res4; +// expected-warning@+1{{attribute 'is_rov' is already applied}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] [[hlsl::is_rov]] res4; diff --git a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl index 6324a11fc8a2d..7c3830a291970 100644 --- a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl @@ -11,6 +11,6 @@ RWBuffer Buffer1; // CHECK: -TemplateArgument type 'vector' // CHECK: `-ExtVectorType 0x{{[0-9a-f]+}} 'vector' 4 // CHECK: 
`-BuiltinType 0x{{[0-9a-f]+}} 'float' -// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit referenced h 'vector {{\[\[}}hlsl::resource_class(UAV)]] {{\[\[}}hlsl::is_rov()]]':'vector' +// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit referenced h 'vector {{\[\[}}hlsl::resource_class(UAV)]] {{\[\[}}hlsl::is_rov]]':'vector' // CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <> Implicit TypedBuffer RasterizerOrderedBuffer > BufferArray3[4]; From 0b12cd227e593f5518da5170a399730bb314223e Mon Sep 17 00:00:00 2001 From: Chris Apple Date: Tue, 10 Sep 2024 13:02:15 -0700 Subject: [PATCH 002/114] [rtsan] Ensure pthread is initialized in test (#108040) --- .../lib/rtsan/tests/rtsan_test_interceptors.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp index 0eeaf9da67098..1ef4c66a28de8 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors.cpp @@ -472,11 +472,12 @@ TEST_F(PthreadMutexLockTest, PthreadMutexUnlockSurvivesWhenNotRealtime) { ExpectNonRealtimeSurvival(Func); } -TEST(TestRtsanInterceptors, PthreadMutexJoinDiesWhenRealtime) { - auto Func = []() { - pthread_t thread{}; - pthread_join(thread, nullptr); - }; +TEST(TestRtsanInterceptors, PthreadJoinDiesWhenRealtime) { + pthread_t thread{}; + ASSERT_EQ(0, + pthread_create(&thread, nullptr, &FakeThreadEntryPoint, nullptr)); + + auto Func = [&thread]() { pthread_join(thread, nullptr); }; ExpectRealtimeDeath(Func, "pthread_join"); ExpectNonRealtimeSurvival(Func); From b9703cb1a535b72f6f0812322225e50f9e325850 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 10 Sep 2024 20:04:01 +0000 Subject: [PATCH 003/114] [gn build] Port becb03f3c624 --- llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn index 28f250ad3b7ba..ff4f558ca2fcf 100644 --- a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn @@ -110,6 +110,7 @@ static_library("CodeGen") { "Targets/AVR.cpp", "Targets/BPF.cpp", "Targets/CSKY.cpp", + "Targets/DirectX.cpp", "Targets/Hexagon.cpp", "Targets/Lanai.cpp", "Targets/LoongArch.cpp", From cb3eb068e6b008b4784a93ac181516ae69350bf1 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 10 Sep 2024 20:04:02 +0000 Subject: [PATCH 004/114] [gn build] Port f4e2d7bfc143 --- llvm/utils/gn/secondary/llvm/lib/Transforms/Coroutines/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Coroutines/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Coroutines/BUILD.gn index 009aba221a0bc..e296a7b93c760 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Coroutines/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Coroutines/BUILD.gn @@ -18,6 +18,7 @@ static_library("Coroutines") { "CoroFrame.cpp", "CoroSplit.cpp", "Coroutines.cpp", + "SpillUtils.cpp", "SuspendCrossingInfo.cpp", ] } From 6dacc382f5158b28550c25cd452848f4ab3ecd63 Mon Sep 17 00:00:00 2001 From: erichkeane Date: Tue, 10 Sep 2024 13:09:27 -0700 Subject: [PATCH 005/114] [OpenACC] Properly ignore side-effects in clause arguments The OpenACC standard makes depending on side effects to be effectively UB, so this patch ensures we handle them reaonably by making it a potentially evaluated context, and ignoring cleanups. 
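
For reference, the new AST test below boils down to this construct, in
which the clause argument creates a temporary that needs a cleanup;
parsing the argument in its own potentially evaluated context and then
discarding the cleanups keeps them from affecting the enclosing context:

  struct HasCtor { HasCtor(); operator int(); ~HasCtor(); };

  void useCtorType() {
    // OpenACC 3.3 2.1: a program must not depend on the order of
    // evaluation of expressions in clause arguments or on any side
    // effects of those evaluations.
    #pragma acc kernels num_workers(HasCtor{})
    while(true);
  }
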
--- clang/lib/Sema/SemaOpenACC.cpp | 12 ++++++++++ .../SemaOpenACC/compute-construct-ast.cpp | 23 ++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index cf207be33175c..e1fc9cea1eb2b 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -1210,6 +1210,10 @@ ExprResult SemaOpenACC::CheckReductionVar(Expr *VarExpr) { void SemaOpenACC::ActOnConstruct(OpenACCDirectiveKind K, SourceLocation DirLoc) { + // Start an evaluation context to parse the clause arguments on. + SemaRef.PushExpressionEvaluationContext( + Sema::ExpressionEvaluationContext::PotentiallyEvaluated); + switch (K) { case OpenACCDirectiveKind::Invalid: // Nothing to do here, an invalid kind has nothing we can check here. We @@ -1626,6 +1630,8 @@ ExprResult SemaOpenACC::ActOnArraySectionExpr(Expr *Base, SourceLocation LBLoc, bool SemaOpenACC::ActOnStartStmtDirective(OpenACCDirectiveKind K, SourceLocation StartLoc) { + SemaRef.DiscardCleanupsInEvaluationContext(); + SemaRef.PopExpressionEvaluationContext(); return diagnoseConstructAppertainment(*this, K, StartLoc, /*IsStmt=*/true); } @@ -1649,6 +1655,7 @@ StmtResult SemaOpenACC::ActOnEndStmtDirective(OpenACCDirectiveKind K, ParentlessLoopConstructs); ParentlessLoopConstructs.clear(); + return ComputeConstruct; } case OpenACCDirectiveKind::Loop: { @@ -1704,6 +1711,11 @@ StmtResult SemaOpenACC::ActOnAssociatedStmt(SourceLocation DirectiveLoc, bool SemaOpenACC::ActOnStartDeclDirective(OpenACCDirectiveKind K, SourceLocation StartLoc) { + // OpenCC3.3 2.1 (line 889) + // A program must not depend on the order of evaluation of expressions in + // clause arguments or on any side effects of the evaluations. + SemaRef.DiscardCleanupsInEvaluationContext(); + SemaRef.PopExpressionEvaluationContext(); return diagnoseConstructAppertainment(*this, K, StartLoc, /*IsStmt=*/false); } diff --git a/clang/test/SemaOpenACC/compute-construct-ast.cpp b/clang/test/SemaOpenACC/compute-construct-ast.cpp index e632522f877b5..7a33aeb80570c 100644 --- a/clang/test/SemaOpenACC/compute-construct-ast.cpp +++ b/clang/test/SemaOpenACC/compute-construct-ast.cpp @@ -117,5 +117,26 @@ struct S { void use() { TemplFunc(); } -#endif +struct HasCtor { HasCtor(); operator int(); ~HasCtor();}; + +void useCtorType() { + // CHECK-LABEL: useCtorType + // CHECK-NEXT: CompoundStmt + +#pragma acc kernels num_workers(HasCtor{}) + // CHECK-NEXT: OpenACCComputeConstruct{{.*}} kernels + // CHECK-NEXT: num_workers clause + // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' + // CHECK-NEXT: CXXMemberCallExpr{{.*}}'int' + // CHECK-NEXT: MemberExpr{{.*}}.operator int + // CHECK-NEXT: MaterializeTemporaryExpr{{.*}}'HasCtor' + // CHECK-NEXT: CXXBindTemporaryExpr{{.*}}'HasCtor' + // CHECK-NEXT: CXXTemporaryObjectExpr{{.*}}'HasCtor' + + while(true); + // CHECK-NEXT: WhileStmt + // CHECK-NEXT: CXXBoolLiteralExpr + // CHECK-NEXT: NullStmt +} +#endif From 27a01f6b4c47baefc347e47e4d38ea26bb721b2d Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Tue, 10 Sep 2024 17:11:49 -0300 Subject: [PATCH 006/114] [clang] correct argument offset for function template partial ordering (#107972) --- clang/lib/Sema/SemaTemplateDeduction.cpp | 28 ++++++++++++------------ clang/test/SemaTemplate/GH18291.cpp | 27 ++++++++++++++++++++++- 2 files changed, 40 insertions(+), 15 deletions(-) diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 4c88159ea4ced..562c57a41299a 100644 --- 
a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -5502,10 +5502,6 @@ static TemplateDeductionResult CheckDeductionConsistency( ArrayRef DeducedArgs, bool CheckConsistency) { MultiLevelTemplateArgumentList MLTAL(FTD, DeducedArgs, /*Final=*/true); - if (ArgIdx != -1) - if (auto *MD = dyn_cast(FTD->getTemplatedDecl()); - MD && MD->isImplicitObjectMemberFunction()) - ArgIdx -= 1; Sema::ArgumentPackSubstitutionIndexRAII PackIndex( S, ArgIdx != -1 ? ::getPackIndexForParam(S, FTD, MLTAL, ArgIdx) : -1); bool IsIncompleteSubstitution = false; @@ -5576,12 +5572,10 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction( /// Determine whether the function template \p FT1 is at least as /// specialized as \p FT2. -static bool isAtLeastAsSpecializedAs(Sema &S, SourceLocation Loc, - FunctionTemplateDecl *FT1, - FunctionTemplateDecl *FT2, - TemplatePartialOrderingContext TPOC, - ArrayRef Args1, - ArrayRef Args2) { +static bool isAtLeastAsSpecializedAs( + Sema &S, SourceLocation Loc, FunctionTemplateDecl *FT1, + FunctionTemplateDecl *FT2, TemplatePartialOrderingContext TPOC, + ArrayRef Args1, ArrayRef Args2, bool Args1Offset) { FunctionDecl *FD1 = FT1->getTemplatedDecl(); FunctionDecl *FD2 = FT2->getTemplatedDecl(); const FunctionProtoType *Proto1 = FD1->getType()->getAs(); @@ -5676,6 +5670,8 @@ static bool isAtLeastAsSpecializedAs(Sema &S, SourceLocation Loc, TemplateDeductionInfo &Info, SmallVectorImpl &Deduced, PartialOrderingKind) { + if (ArgIdx != -1) + ArgIdx -= Args1Offset; return ::CheckDeductionConsistency( S, FTD, ArgIdx, P, A, DeducedArgs, /*CheckConsistency=*/HasDeducedParam[ParamIdx]); @@ -5763,6 +5759,8 @@ FunctionTemplateDecl *Sema::getMoreSpecializedTemplate( const FunctionDecl *FD2 = FT2->getTemplatedDecl(); bool ShouldConvert1 = false; bool ShouldConvert2 = false; + bool Args1Offset = false; + bool Args2Offset = false; QualType Obj1Ty; QualType Obj2Ty; if (TPOC == TPOC_Call) { @@ -5811,6 +5809,7 @@ FunctionTemplateDecl *Sema::getMoreSpecializedTemplate( Obj1Ty = GetImplicitObjectParameterType(this->Context, Method1, RawObj1Ty, IsRValRef2); Args1.push_back(Obj1Ty); + Args1Offset = true; } if (ShouldConvert2) { bool IsRValRef1 = @@ -5821,6 +5820,7 @@ FunctionTemplateDecl *Sema::getMoreSpecializedTemplate( Obj2Ty = GetImplicitObjectParameterType(this->Context, Method2, RawObj2Ty, IsRValRef1); Args2.push_back(Obj2Ty); + Args2Offset = true; } } else { if (NonStaticMethod1 && Method1->hasCXXExplicitFunctionObjectParameter()) @@ -5842,10 +5842,10 @@ FunctionTemplateDecl *Sema::getMoreSpecializedTemplate( } else { assert(!Reversed && "Only call context could have reversed arguments"); } - bool Better1 = - isAtLeastAsSpecializedAs(*this, Loc, FT1, FT2, TPOC, Args1, Args2); - bool Better2 = - isAtLeastAsSpecializedAs(*this, Loc, FT2, FT1, TPOC, Args2, Args1); + bool Better1 = isAtLeastAsSpecializedAs(*this, Loc, FT1, FT2, TPOC, Args1, + Args2, Args2Offset); + bool Better2 = isAtLeastAsSpecializedAs(*this, Loc, FT2, FT1, TPOC, Args2, + Args1, Args1Offset); // C++ [temp.deduct.partial]p10: // F is more specialized than G if F is at least as specialized as G and G // is not at least as specialized as F. 
diff --git a/clang/test/SemaTemplate/GH18291.cpp b/clang/test/SemaTemplate/GH18291.cpp index ca1e69e4ca3f5..820564ffa6f1a 100644 --- a/clang/test/SemaTemplate/GH18291.cpp +++ b/clang/test/SemaTemplate/GH18291.cpp @@ -86,4 +86,29 @@ namespace func_pointer { template void pow(_Tp, complex::type>) = delete; void (*ptr)(const complex &, complex){pow}; } // namespace param -} // namespace t3 +} // namespace func_pointer + +namespace static_vs_nonstatic { + namespace implicit_obj_param { + struct A { + template + static void f(int a, Args... args) {} + template + void f(Args... args) = delete; + }; + void g(){ + A::f(0); + } + } // namespace implicit_obj_param + namespace explicit_obj_param { + struct A { + template + static void f(int, Args... args) {} + template + void f(this A *, Args... args) = delete; + }; + void g(){ + A::f(0); + } + } // namespace explicit_obj_param +} // namespace static_vs_nonstatic From 3363760f9a00c5d4dac1e08d44f9d79b8e322511 Mon Sep 17 00:00:00 2001 From: vporpo Date: Tue, 10 Sep 2024 13:17:26 -0700 Subject: [PATCH 007/114] [SandboxIR] PassManager (#107932) This patch implements a simple pass manager for Sandbox IR. --- llvm/include/llvm/SandboxIR/Pass.h | 4 +- llvm/include/llvm/SandboxIR/PassManager.h | 70 +++++++++++++++++++++++ llvm/lib/SandboxIR/CMakeLists.txt | 1 + llvm/lib/SandboxIR/PassManager.cpp | 22 +++++++ llvm/unittests/SandboxIR/PassTest.cpp | 51 +++++++++++++++++ 5 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 llvm/include/llvm/SandboxIR/PassManager.h create mode 100644 llvm/lib/SandboxIR/PassManager.cpp diff --git a/llvm/include/llvm/SandboxIR/Pass.h b/llvm/include/llvm/SandboxIR/Pass.h index d659e96839213..caf1c70a84147 100644 --- a/llvm/include/llvm/SandboxIR/Pass.h +++ b/llvm/include/llvm/SandboxIR/Pass.h @@ -37,8 +37,8 @@ class Pass { Pass.print(OS); return OS; } - void print(raw_ostream &OS) const { OS << Name; } - LLVM_DUMP_METHOD void dump() const; + virtual void print(raw_ostream &OS) const { OS << Name; } + LLVM_DUMP_METHOD virtual void dump() const; #endif }; diff --git a/llvm/include/llvm/SandboxIR/PassManager.h b/llvm/include/llvm/SandboxIR/PassManager.h new file mode 100644 index 0000000000000..cb321fe699a56 --- /dev/null +++ b/llvm/include/llvm/SandboxIR/PassManager.h @@ -0,0 +1,70 @@ +//===- PassManager.h --------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Registers and executes the Sandbox IR passes. +// +// The pass manager contains an ordered sequence of passes that it runs in +// order. The passes are owned by the PassRegistry, not by the PassManager. +// +// Note that in this design a pass manager is also a pass. So a pass manager +// runs when it is it's turn to run in its parent pass-manager pass pipeline. +// + +#ifndef LLVM_SANDBOXIR_PASSMANAGER_H +#define LLVM_SANDBOXIR_PASSMANAGER_H + +#include "llvm/ADT/STLExtras.h" +#include "llvm/SandboxIR/Pass.h" +#include "llvm/Support/Debug.h" + +namespace llvm::sandboxir { + +class Value; + +/// Base class. +template +class PassManager : public ParentPass { +protected: + /// The list of passes that this pass manager will run. 
+ SmallVector Passes; + + PassManager(StringRef Name) : ParentPass(Name) {} + PassManager(const PassManager &) = delete; + virtual ~PassManager() = default; + PassManager &operator=(const PassManager &) = delete; + +public: + /// Adds \p Pass to the pass pipeline. + void addPass(ContainedPass *Pass) { + // TODO: Check that Pass's class type works with this PassManager type. + Passes.push_back(Pass); + } +#ifndef NDEBUG + void print(raw_ostream &OS) const override { + OS << this->getName(); + OS << "("; + interleave(Passes, OS, [&OS](auto *Pass) { OS << Pass->getName(); }, ","); + OS << ")"; + } + LLVM_DUMP_METHOD void dump() const override { + print(dbgs()); + dbgs() << "\n"; + } +#endif +}; + +class FunctionPassManager final + : public PassManager { +public: + FunctionPassManager(StringRef Name) : PassManager(Name) {} + bool runOnFunction(Function &F) final; +}; + +} // namespace llvm::sandboxir + +#endif // LLVM_SANDBOXIR_PASSMANAGER_H diff --git a/llvm/lib/SandboxIR/CMakeLists.txt b/llvm/lib/SandboxIR/CMakeLists.txt index 2f047944e0335..03474be0c7b80 100644 --- a/llvm/lib/SandboxIR/CMakeLists.txt +++ b/llvm/lib/SandboxIR/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_component_library(LLVMSandboxIR Pass.cpp + PassManager.cpp SandboxIR.cpp Tracker.cpp Type.cpp diff --git a/llvm/lib/SandboxIR/PassManager.cpp b/llvm/lib/SandboxIR/PassManager.cpp new file mode 100644 index 0000000000000..d10f3926f7bcd --- /dev/null +++ b/llvm/lib/SandboxIR/PassManager.cpp @@ -0,0 +1,22 @@ +//===- PassManager.cpp - Runs a pipeline of Sandbox IR passes -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/SandboxIR/PassManager.h" +#include "llvm/SandboxIR/SandboxIR.h" + +using namespace llvm::sandboxir; + +bool FunctionPassManager::runOnFunction(Function &F) { + bool Change = false; + for (FunctionPass *Pass : Passes) { + Change |= Pass->runOnFunction(F); + // TODO: run the verifier. + } + // TODO: Check ChangeAll against hashes before/after. 
+ return Change; +} diff --git a/llvm/unittests/SandboxIR/PassTest.cpp b/llvm/unittests/SandboxIR/PassTest.cpp index 65992d8cb95ee..8e080128b15b3 100644 --- a/llvm/unittests/SandboxIR/PassTest.cpp +++ b/llvm/unittests/SandboxIR/PassTest.cpp @@ -9,6 +9,7 @@ #include "llvm/SandboxIR/Pass.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Module.h" +#include "llvm/SandboxIR/PassManager.h" #include "llvm/SandboxIR/SandboxIR.h" #include "llvm/Support/SourceMgr.h" #include "gtest/gtest.h" @@ -82,3 +83,53 @@ define void @foo() { EXPECT_DEATH(TestNamePass("-dash"), ".*start with.*"); #endif } + +TEST_F(PassTest, FunctionPassManager) { + auto *F = parseFunction(R"IR( +define void @foo() { + ret void +} +)IR", + "foo"); + class TestPass1 final : public FunctionPass { + unsigned &BBCnt; + + public: + TestPass1(unsigned &BBCnt) : FunctionPass("test-pass1"), BBCnt(BBCnt) {} + bool runOnFunction(Function &F) final { + for ([[maybe_unused]] auto &BB : F) + ++BBCnt; + return false; + } + }; + class TestPass2 final : public FunctionPass { + unsigned &BBCnt; + + public: + TestPass2(unsigned &BBCnt) : FunctionPass("test-pass2"), BBCnt(BBCnt) {} + bool runOnFunction(Function &F) final { + for ([[maybe_unused]] auto &BB : F) + ++BBCnt; + return false; + } + }; + unsigned BBCnt1 = 0; + unsigned BBCnt2 = 0; + TestPass1 TPass1(BBCnt1); + TestPass2 TPass2(BBCnt2); + + FunctionPassManager FPM("test-fpm"); + FPM.addPass(&TPass1); + FPM.addPass(&TPass2); + // Check runOnFunction(). + FPM.runOnFunction(*F); + EXPECT_EQ(BBCnt1, 1u); + EXPECT_EQ(BBCnt2, 1u); +#ifndef NDEBUG + // Check dump(). + std::string Buff; + llvm::raw_string_ostream SS(Buff); + FPM.print(SS); + EXPECT_EQ(Buff, "test-fpm(test-pass1,test-pass2)"); +#endif // NDEBUG +} From 99fb1506a869fa5e82dbd36e1a63cd21450f1502 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 10 Sep 2024 20:18:56 +0000 Subject: [PATCH 008/114] [gn build] Port 3363760f9a00 --- llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn index aa49c76e84cec..e69104909330d 100644 --- a/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/SandboxIR/BUILD.gn @@ -6,6 +6,7 @@ static_library("SandboxIR") { ] sources = [ "Pass.cpp", + "PassManager.cpp", "SandboxIR.cpp", "Tracker.cpp", "Type.cpp", From e3c537ff903af9a92ff43bab6d21c0ea759d65e5 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 10 Sep 2024 21:37:12 +0100 Subject: [PATCH 009/114] [VPlan] Consider non-header phis in planContainsAdditionalSimp. Update planContainsAdditionalSimplifications to also check phis not in the loop header. This ensures we don't miss cases where VPBlendRecipes (which correspond to such phis) have been simplified. Fixes https://github.com/llvm/llvm-project/issues/107473. 
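
Roughly, the new IR test (test_phi_in_latch_redundant, added below)
corresponds to this hypothetical C++ reduction, in which %p becomes a
phi in the loop latch rather than the loop header, and its blend folds
away because the branch condition is a known constant:

  void test(int *dst, int a) {
    for (long i = 0; i < 325; i += 9) {
      int p = false ? ~a : 0; // blend of {~a, 0} simplifies to 0
      dst[i] = p;
    }
  }

Previously such simplified non-header phis went undetected, so a benign
divergence between the legacy and VPlan-based cost models could trip
the cost-model consistency check.
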
--- .../Transforms/Vectorize/LoopVectorize.cpp | 7 +- .../LoopVectorize/RISCV/dead-ops-cost.ll | 82 +++++++++++++++++++ 2 files changed, 86 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 2be3b57752925..b821da03c16e9 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7314,9 +7314,10 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, // Return true if the loop contains any instructions that are not also part of // the VPlan or are skipped for VPlan-based cost computations. This indicates // that the VPlan contains extra simplifications. - return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx](BasicBlock *BB) { - return any_of(*BB, [&SeenInstrs, &CostCtx](Instruction &I) { - if (isa(&I)) + return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx, + TheLoop](BasicBlock *BB) { + return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) { + if (isa(&I) && BB == TheLoop->getHeader()) return false; return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true); }); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll index 54c7299f6db0f..6d309c4453c7e 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll @@ -322,7 +322,87 @@ return: ret i32 0 } +; Test case for https://github.com/llvm/llvm-project/issues/107473. +define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) { +; CHECK-LABEL: define void @test_phi_in_latch_redundant( +; CHECK-SAME: ptr [[DST:%.*]], i32 [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 37, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 37, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 37, [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 9 +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i64() +; CHECK-NEXT: [[TMP7:%.*]] = add [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP7]], shufflevector ( insertelement ( poison, i64 9, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 9, [[TMP5]] +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = xor [[BROADCAST_SPLAT]], 
shufflevector ( insertelement ( poison, i32 -1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[DST]], [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv2i32.nxv2p0( [[TMP10]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 37, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: br i1 false, label %[[LOOP_LATCH]], label %[[THEN:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[NOT_A:%.*]] = xor i32 [[A]], -1 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[P:%.*]] = phi i32 [ [[NOT_A]], %[[THEN]] ], [ 0, %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i32 [[P]], ptr [[GEP]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 9 +; CHECK-NEXT: [[EC:%.*]] = icmp slt i64 [[IV]], 322 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 false, label %loop.latch, label %then + +then: + %not.a = xor i32 %a, -1 + br label %loop.latch + +loop.latch: + %p = phi i32 [ %not.a, %then ], [ 0, %loop.header ] + %gep = getelementptr i32, ptr %dst, i64 %iv + store i32 %p, ptr %gep, align 4 + %iv.next = add i64 %iv, 9 + %ec = icmp slt i64 %iv, 322 + br i1 %ec, label %loop.header, label %exit + +exit: + ret void +} ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} @@ -343,4 +423,6 @@ return: ; CHECK: [[META15]] = distinct !{[[META15]], [[META13]]} ; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]} ; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]]} +; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]} +; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]} ;. From 0f56ba13bff7ab72bfafcf7c5cf9e5b8bd16d895 Mon Sep 17 00:00:00 2001 From: "Henrik G. Olsson" Date: Tue, 10 Sep 2024 13:38:04 -0700 Subject: [PATCH 010/114] [llvm-lit] Process ANSI color codes in test output when formatting (#106776) Test output that carried color across newlines previously resulted in the formatting around the output also being colored. Detect the current ANSI color and reset it when printing formatting, and then reapply it. As an added bonus an unterminated color code is also detected, preventing it from leaking out into the rest of the terminal. 
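
A minimal sketch of the intended behavior (assuming a lit checkout on
PYTHONPATH; the real coverage is the escape-color test added below):

  from lit.TestRunner import formatOutput

  # "still red" inherits red from the line above; the "# | " framing
  # stays uncolored, and the dangling green is reset at the very end.
  data = "\33[31mred\nstill red\33[0m\nplain\n\33[32mnever terminated\n"
  print(formatOutput("command stdout", data))
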
Fixes #106633 --- llvm/utils/lit/lit/TestRunner.py | 28 +++++++++++++++++-- .../Inputs/escape-color/color-escaped.txt | 10 +++++++ .../lit/tests/Inputs/escape-color/color.txt | 6 ++++ .../lit/tests/Inputs/escape-color/lit.cfg | 8 ++++++ llvm/utils/lit/tests/escape-color.py | 4 +++ 5 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 llvm/utils/lit/tests/Inputs/escape-color/color-escaped.txt create mode 100644 llvm/utils/lit/tests/Inputs/escape-color/color.txt create mode 100644 llvm/utils/lit/tests/Inputs/escape-color/lit.cfg create mode 100644 llvm/utils/lit/tests/escape-color.py diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index 19f35fc7e212f..a2c76d41a43e0 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -1017,6 +1017,20 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): return exitCode +def findColor(line, curr_color): + start = line.rfind("\33[") + if start == -1: + return curr_color + end = line.find("m", start+2) + if end == -1: + return curr_color + match = line[start:end+1] + # "\33[0m" means "reset all formatting". Sometimes the 0 is skipped. + if match == "\33[m" or match == "\33[0m": + return None + return match + + def formatOutput(title, data, limit=None): if not data.strip(): return "" @@ -1027,8 +1041,18 @@ def formatOutput(title, data, limit=None): msg = "" ndashes = 30 # fmt: off - out = f"# .---{title}{'-' * (ndashes - 4 - len(title))}\n" - out += f"# | " + "\n# | ".join(data.splitlines()) + "\n" + out = f"# .---{title}{'-' * (ndashes - 4 - len(title))}\n" + curr_color = None + for line in data.splitlines(): + if curr_color: + out += "\33[0m" + out += "# | " + if curr_color: + out += curr_color + out += line + "\n" + curr_color = findColor(line, curr_color) + if curr_color: + out += "\33[0m" # prevent unterminated formatting from leaking out += f"# `---{msg}{'-' * (ndashes - 4 - len(msg))}\n" # fmt: on return out diff --git a/llvm/utils/lit/tests/Inputs/escape-color/color-escaped.txt b/llvm/utils/lit/tests/Inputs/escape-color/color-escaped.txt new file mode 100644 index 0000000000000..e7a33e380b351 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/escape-color/color-escaped.txt @@ -0,0 +1,10 @@ +# .---command stdout------------ +# | # RUN: cat %s +# | red +# | still red(B +# | plain +# | green +# | still green (never terminated) +# `----------------------------- + +-- diff --git a/llvm/utils/lit/tests/Inputs/escape-color/color.txt b/llvm/utils/lit/tests/Inputs/escape-color/color.txt new file mode 100644 index 0000000000000..15ffc22d134f0 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/escape-color/color.txt @@ -0,0 +1,6 @@ +# RUN: cat %s +red +still red(B +plain +green +still green (never terminated) diff --git a/llvm/utils/lit/tests/Inputs/escape-color/lit.cfg b/llvm/utils/lit/tests/Inputs/escape-color/lit.cfg new file mode 100644 index 0000000000000..36f4eb69d4858 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/escape-color/lit.cfg @@ -0,0 +1,8 @@ +import lit.formats + +config.name = "escape-color" +config.suffixes = [".txt"] +config.test_format = lit.formats.ShTest() +config.test_source_root = None +config.test_exec_root = None + diff --git a/llvm/utils/lit/tests/escape-color.py b/llvm/utils/lit/tests/escape-color.py new file mode 100644 index 0000000000000..1d0b93b004e9d --- /dev/null +++ b/llvm/utils/lit/tests/escape-color.py @@ -0,0 +1,4 @@ +# cut off the first 9 lines to avoid absolute file paths in the output +# then keep only the next 10 lines to avoid test timing in 
the output +# RUN: %{lit} %{inputs}/escape-color/color.txt -a | tail -n +10 | head -n 10 > %t +# RUN: diff %{inputs}/escape-color/color-escaped.txt %t From b5ce7a9fbd898767d0c1fa13ef5c33bbbe67981f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 10 Sep 2024 14:07:26 -0700 Subject: [PATCH 011/114] [flang][cuda] Avoid generating data transfer when calling size intrinsic (#108081) cuf.data_transfer was wrongly generated when calling the `size` intrinsic on a device allocatable variable. Since the descriptor is available on the host, there is no transfer needed. Add `DescriptorInquiry` in the `CollectCudaSymbolsHelper` to filter out symbols that are not needed for the transfer decision to be made. --- flang/lib/Evaluate/tools.cpp | 3 +++ flang/test/Lower/CUDA/cuda-data-transfer.cuf | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp index 6b3db619c1e2f..400f27aef98da 100644 --- a/flang/lib/Evaluate/tools.cpp +++ b/flang/lib/Evaluate/tools.cpp @@ -1011,6 +1011,9 @@ struct CollectCudaSymbolsHelper : public SetTraverse>> ! CHECK-NOT: cuf.data_transfer + +subroutine sub18() + integer, device, allocatable :: a(:) + integer :: isz + + isz = size(a) +end subroutine + +! CHECK-LABEL: func.func @_QPsub18() +! CHECK-NOT: cuf.data_transfer From d8a8eae6daa6523b28526e2b8f65879e74858f68 Mon Sep 17 00:00:00 2001 From: Daniel Thornburgh Date: Tue, 10 Sep 2024 14:08:27 -0700 Subject: [PATCH 012/114] Revert "[libc++][string] Remove potential non-trailing 0-length array" (#108091) Reverts llvm/llvm-project#105865 This breaks a pair of LLDB tests in CI. --- libcxx/include/string | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/libcxx/include/string b/libcxx/include/string index aba79a74912f5..3480b57375c11 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -749,14 +749,6 @@ struct __can_be_converted_to_string_view struct __uninitialized_size_tag {}; struct __init_with_sentinel_tag {}; -template -struct __padding { - char __padding_[_PaddingSize]; -}; - -template <> -struct __padding<0> {}; - template class basic_string { private: @@ -861,7 +853,7 @@ private: struct __short { value_type __data_[__min_cap]; - _LIBCPP_NO_UNIQUE_ADDRESS __padding __padding_; + unsigned char __padding_[sizeof(value_type) - 1]; unsigned char __size_ : 7; unsigned char __is_long_ : 1; }; @@ -913,7 +905,7 @@ private: unsigned char __is_long_ : 1; unsigned char __size_ : 7; }; - _LIBCPP_NO_UNIQUE_ADDRESS __padding __padding_; + char __padding_[sizeof(value_type) - 1]; value_type __data_[__min_cap]; }; From ce392471c0d9cb3ef88d05fcbcff59de8ea0c1e1 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 10 Sep 2024 14:08:55 -0700 Subject: [PATCH 013/114] [flang] Silence spurious error on non-CUDA use of CUDA module (#107444) When a module file has been compiled with CUDA enabled, don't emit spurious errors about non-interoperable types when that module is read by a USE statement in a later non-CUDA compilation. 
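
The new test reduces to this pair of compilations, where the module is
built as CUDA Fortran (so REAL(2) is interoperable there) and is then
USE-associated from a non-CUDA source file:

  ! modfile66.cuf, compiled with CUDA enabled
  module usereal2
    real(2), bind(c) :: x
  end

  ! modfile66.f90, compiled without CUDA; must not diagnose x,
  ! which is not even used here
  use usereal2
  end
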
--- flang/include/flang/Semantics/type.h | 3 --- flang/lib/Evaluate/type.cpp | 4 ++-- flang/lib/Semantics/check-declarations.cpp | 21 +++++++++++++-------- flang/lib/Semantics/expression.cpp | 4 ++-- flang/lib/Semantics/type.cpp | 9 --------- flang/test/Semantics/Inputs/modfile66.cuf | 4 ++++ flang/test/Semantics/modfile66.f90 | 3 +++ 7 files changed, 24 insertions(+), 24 deletions(-) create mode 100644 flang/test/Semantics/Inputs/modfile66.cuf create mode 100644 flang/test/Semantics/modfile66.f90 diff --git a/flang/include/flang/Semantics/type.h b/flang/include/flang/Semantics/type.h index 04f8b11e992a0..e2d47d38f927f 100644 --- a/flang/include/flang/Semantics/type.h +++ b/flang/include/flang/Semantics/type.h @@ -459,8 +459,5 @@ inline const DerivedTypeSpec *DeclTypeSpec::AsDerived() const { return const_cast(this)->AsDerived(); } -std::optional IsInteroperableIntrinsicType( - const DeclTypeSpec &, const common::LanguageFeatureControl &); - } // namespace Fortran::semantics #endif // FORTRAN_SEMANTICS_TYPE_H_ diff --git a/flang/lib/Evaluate/type.cpp b/flang/lib/Evaluate/type.cpp index 5ecc3701b4f24..a1df40667471a 100644 --- a/flang/lib/Evaluate/type.cpp +++ b/flang/lib/Evaluate/type.cpp @@ -820,8 +820,8 @@ std::optional IsInteroperableIntrinsicType(const DynamicType &type, return true; case TypeCategory::Real: case TypeCategory::Complex: - return (features && features->IsEnabled(common::LanguageFeature::CUDA)) || - type.kind() >= 4; // no short or half floats + return type.kind() >= 4 /* not a short or half float */ || !features || + features->IsEnabled(common::LanguageFeature::CUDA); case TypeCategory::Logical: return type.kind() == 1; // C_BOOL case TypeCategory::Character: diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index 734c34276b13b..c896ee7d29381 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -3003,17 +3003,17 @@ parser::Messages CheckHelper::WhyNotInteroperableDerivedType( } else { msgs.Annex(std::move(bad)); } - } else if (!IsInteroperableIntrinsicType( - *type, context_.languageFeatures()) + } else if (auto dyType{evaluate::DynamicType::From(*type)}; dyType && + !evaluate::IsInteroperableIntrinsicType( + *dyType, &context_.languageFeatures()) .value_or(false)) { - auto maybeDyType{evaluate::DynamicType::From(*type)}; if (type->category() == DeclTypeSpec::Logical) { if (context_.ShouldWarn(common::UsageWarning::LogicalVsCBool)) { msgs.Say(component.name(), "A LOGICAL component of an interoperable type should have the interoperable KIND=C_BOOL"_port_en_US); } - } else if (type->category() == DeclTypeSpec::Character && - maybeDyType && maybeDyType->kind() == 1) { + } else if (type->category() == DeclTypeSpec::Character && dyType && + dyType->kind() == 1) { if (context_.ShouldWarn(common::UsageWarning::BindCCharLength)) { msgs.Say(component.name(), "A CHARACTER component of an interoperable type should have length 1"_port_en_US); @@ -3106,10 +3106,15 @@ parser::Messages CheckHelper::WhyNotInteroperableObject(const Symbol &symbol) { type->category() == DeclTypeSpec::Character && type->characterTypeSpec().length().isDeferred()) { // ok; F'2023 18.3.7 p2(6) - } else if (derived || - IsInteroperableIntrinsicType(*type, context_.languageFeatures()) - .value_or(false)) { + } else if (derived) { // type has been checked + } else if (auto dyType{evaluate::DynamicType::From(*type)}; dyType && + evaluate::IsInteroperableIntrinsicType(*dyType, + InModuleFile() ? 
nullptr : &context_.languageFeatures()) + .value_or(false)) { // F'2023 18.3.7 p2(4,5) + // N.B. Language features are not passed to IsInteroperableIntrinsicType + // when processing a module file, since the module file might have been + // compiled with CUDA while the client is not. } else if (type->category() == DeclTypeSpec::Logical) { if (context_.ShouldWarn(common::UsageWarning::LogicalVsCBool) && !InModuleFile()) { diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 3684839c187e6..0eabe532cfe0c 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -1956,7 +1956,7 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::ArrayConstructor &array) { // Check if implicit conversion of expr to the symbol type is legal (if needed), // and make it explicit if requested. -static MaybeExpr implicitConvertTo(const semantics::Symbol &sym, +static MaybeExpr ImplicitConvertTo(const semantics::Symbol &sym, Expr &&expr, bool keepConvertImplicit) { if (!keepConvertImplicit) { return ConvertToType(sym, std::move(expr)); @@ -2196,7 +2196,7 @@ MaybeExpr ExpressionAnalyzer::Analyze( // convert would cause a segfault. Lowering will deal with // conditionally converting and preserving the lower bounds in this // case. - if (MaybeExpr converted{implicitConvertTo( + if (MaybeExpr converted{ImplicitConvertTo( *symbol, std::move(*value), IsAllocatable(*symbol))}) { if (auto componentShape{GetShape(GetFoldingContext(), *symbol)}) { if (auto valueShape{GetShape(GetFoldingContext(), *converted)}) { diff --git a/flang/lib/Semantics/type.cpp b/flang/lib/Semantics/type.cpp index cfaee0b8ba6dc..aa6e8973ebd30 100644 --- a/flang/lib/Semantics/type.cpp +++ b/flang/lib/Semantics/type.cpp @@ -893,13 +893,4 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &o, const DeclTypeSpec &x) { return o << x.AsFortran(); } -std::optional IsInteroperableIntrinsicType( - const DeclTypeSpec &type, const common::LanguageFeatureControl &features) { - if (auto dyType{evaluate::DynamicType::From(type)}) { - return IsInteroperableIntrinsicType(*dyType, &features); - } else { - return std::nullopt; - } -} - } // namespace Fortran::semantics diff --git a/flang/test/Semantics/Inputs/modfile66.cuf b/flang/test/Semantics/Inputs/modfile66.cuf new file mode 100644 index 0000000000000..be400da749148 --- /dev/null +++ b/flang/test/Semantics/Inputs/modfile66.cuf @@ -0,0 +1,4 @@ +module usereal2 + !REAL(2) is interoperable under CUDA + real(2), bind(c) :: x +end diff --git a/flang/test/Semantics/modfile66.f90 b/flang/test/Semantics/modfile66.f90 new file mode 100644 index 0000000000000..51b4d8375d50d --- /dev/null +++ b/flang/test/Semantics/modfile66.f90 @@ -0,0 +1,3 @@ +! RUN: %flang_fc1 -fsyntax-only %S/Inputs/modfile66.cuf && %flang_fc1 -fsyntax-only %s +use usereal2 ! valid since x is not used +end From 5a229dbca19f5ad7ebd15c85b432cd54e70fd09a Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 10 Sep 2024 14:10:40 -0700 Subject: [PATCH 014/114] [flang] Relax error into a warning (#107489) The standard requires that a generic interface with the same name as a derived type contain only functions. We generally allow a generic interface to contain both functions and subroutines, since there's never any ambiguity at the point of call; these is helpful when the specific procedures of two generics are combined during USE association. 
Emit a warning instead of a hard error when a generic interface with the same name as a derived type contains a subroutine to improve portability of code from compilers that don't check for this condition. --- flang/docs/Extensions.md | 5 +-- flang/lib/Semantics/resolve-names.cpp | 44 +++++++++++++-------------- flang/test/Semantics/resolve24.f90 | 8 ++--- 3 files changed, 27 insertions(+), 30 deletions(-) diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index fb57744c21570..340939baca99f 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -507,10 +507,7 @@ end f18 supports them with a portability warning. * f18 does not enforce a blanket prohibition against generic interfaces containing a mixture of functions and subroutines. - Apart from some contexts in which the standard requires all of - a particular generic interface to have only all functions or - all subroutines as its specific procedures, we allow both to - appear, unlike several other Fortran compilers. + We allow both to appear, unlike several other Fortran compilers. This is especially desirable when two generics of the same name are combined due to USE association and the mixture may be inadvertent. diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 2e86e0afc9bd0..b764678357db3 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -3639,36 +3639,36 @@ void InterfaceVisitor::CheckGenericProcedures(Symbol &generic) { } return; } - const Symbol &firstSpecific{specifics.front()}; - bool isFunction{firstSpecific.test(Symbol::Flag::Function)}; - bool isBoth{false}; + const Symbol *function{nullptr}; + const Symbol *subroutine{nullptr}; for (const Symbol &specific : specifics) { - if (isFunction != specific.test(Symbol::Flag::Function)) { // C1514 - if (context().ShouldWarn( + if (!function && specific.test(Symbol::Flag::Function)) { + function = &specific; + } else if (!subroutine && specific.test(Symbol::Flag::Subroutine)) { + subroutine = &specific; + if (details.derivedType() && + context().ShouldWarn( common::LanguageFeature::SubroutineAndFunctionSpecifics)) { + SayDerivedType(generic.name(), + "Generic interface '%s' should only contain functions due to derived type with same name"_warn_en_US, + *details.derivedType()->GetUltimate().scope()); + } + } + if (function && subroutine) { + if (context().ShouldWarn(common::LanguageFeature:: + SubroutineAndFunctionSpecifics)) { // C1514 auto &msg{Say(generic.name(), "Generic interface '%s' has both a function and a subroutine"_warn_en_US)}; - if (isFunction) { - msg.Attach(firstSpecific.name(), "Function declaration"_en_US); - msg.Attach(specific.name(), "Subroutine declaration"_en_US); - } else { - msg.Attach(firstSpecific.name(), "Subroutine declaration"_en_US); - msg.Attach(specific.name(), "Function declaration"_en_US); - } + msg.Attach(function->name(), "Function declaration"_en_US); + msg.Attach(subroutine->name(), "Subroutine declaration"_en_US); } - isFunction = false; - isBoth = true; break; } } - if (!isFunction && details.derivedType()) { - SayDerivedType(generic.name(), - "Generic interface '%s' may only contain functions due to derived type" - " with same name"_err_en_US, - *details.derivedType()->GetUltimate().scope()); - } - if (!isBoth) { - generic.set(isFunction ? 
Symbol::Flag::Function : Symbol::Flag::Subroutine);
+  if (function && !subroutine) {
+    generic.set(Symbol::Flag::Function);
+  } else if (subroutine && !function) {
+    generic.set(Symbol::Flag::Subroutine);
   }
 }
 
diff --git a/flang/test/Semantics/resolve24.f90 b/flang/test/Semantics/resolve24.f90
index 4af6f202cf4f1..72d6719665bb5 100644
--- a/flang/test/Semantics/resolve24.f90
+++ b/flang/test/Semantics/resolve24.f90
@@ -1,6 +1,6 @@
 ! RUN: %python %S/test_errors.py %s %flang_fc1
 subroutine test1
-  !ERROR: Generic interface 'foo' has both a function and a subroutine
+  !WARNING: Generic interface 'foo' has both a function and a subroutine
   interface foo
     subroutine s1(x)
     end subroutine
@@ -12,7 +12,7 @@ function f()
 end subroutine
 
 subroutine test2
-  !ERROR: Generic interface 'foo' has both a function and a subroutine
+  !WARNING: Generic interface 'foo' has both a function and a subroutine
   interface foo
     function t2f1(x)
     end function
@@ -24,7 +24,7 @@ function t2f2(x, y)
 end subroutine
 
 module test3
-  !ERROR: Generic interface 'foo' has both a function and a subroutine
+  !WARNING: Generic interface 'foo' has both a function and a subroutine
   interface foo
     module procedure s
     module procedure f
@@ -39,7 +39,7 @@ function f()
 subroutine test4
   type foo
   end type
-  !ERROR: Generic interface 'foo' may only contain functions due to derived type with same name
+  !WARNING: Generic interface 'foo' should only contain functions due to derived type with same name
   interface foo
     subroutine s()
     end subroutine

From d2126ec1af543b23817c742c283ec441f21bf42b Mon Sep 17 00:00:00 2001
From: Peter Klausler
Date: Tue, 10 Sep 2024 14:13:09 -0700
Subject: [PATCH 015/114] [flang] Fix bogus error about procedure
 incompatibility (#107645)

This was a subtle problem. When the shape of a function result is
explicit but not constant, it is characterized with bounds expressions
that use Extremum operations to force extents to 0 rather than be
negative. These Extremum operations are formatted as "max()" intrinsic
functions in the module file. Upon being read from the module file,
they are not folded back into Extremum operations, but remain as
function references; and this then leads to expressions not comparing
equal when the procedure characteristics are compared to those of a
local procedure declared identically.

The real fix here would be for folding to just always change max and
min function references into Extremum operations, constant operands or
not, and I tried that, but it led to test failures and crashes in
lowering that I couldn't resolve. So, until those can be fixed, here's
a change that will read max/min operations in module file declarations
back into Extremum operations to solve the compatibility checking
problem, but leave other non-constant max/min operations as function
calls.
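
Concretely, the new test's module file (checked in below) declares a
function result whose bounds were normalized with Extremum operations
and are formatted as max() references:

  pure function foo(n,a) result(r)
    integer(4),intent(in)::n
    real(4),intent(in)::a(1_8:int(n,kind=8))
    logical(4)::r(1_8:int(int(max(0_8,int(n,kind=8)),kind=4),kind=8))
  end

Unless that max(...) is folded back into an Extremum when the module
file is read, a client procedure declared with the equivalent
"logical, dimension(size(a)) :: r" no longer compares equal to it
during interface compatibility checking.
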
--- flang/include/flang/Evaluate/expression.h | 3 ++ flang/include/flang/Evaluate/tools.h | 16 ++++++++ flang/lib/Evaluate/expression.cpp | 22 ++++++++-- flang/lib/Evaluate/fold-implementation.h | 50 +++++++++++++++-------- flang/test/Semantics/Inputs/modfile67.mod | 16 ++++++++ flang/test/Semantics/modfile67.f90 | 35 ++++++++++++++++ 6 files changed, 122 insertions(+), 20 deletions(-) create mode 100644 flang/test/Semantics/Inputs/modfile67.mod create mode 100644 flang/test/Semantics/modfile67.f90 diff --git a/flang/include/flang/Evaluate/expression.h b/flang/include/flang/Evaluate/expression.h index 3ba46edba717b..2a40193e32306 100644 --- a/flang/include/flang/Evaluate/expression.h +++ b/flang/include/flang/Evaluate/expression.h @@ -342,6 +342,7 @@ template struct Extremum : public Operation, A, A, A> { : Base{x, y}, ordering{ord} {} Extremum(Ordering ord, Expr &&x, Expr &&y) : Base{std::move(x), std::move(y)}, ordering{ord} {} + bool operator==(const Extremum &) const; Ordering ordering{Ordering::Greater}; }; @@ -381,6 +382,7 @@ struct LogicalOperation : Base{x, y}, logicalOperator{opr} {} LogicalOperation(LogicalOperator opr, Expr &&x, Expr &&y) : Base{std::move(x), std::move(y)}, logicalOperator{opr} {} + bool operator==(const LogicalOperation &) const; LogicalOperator logicalOperator; }; @@ -634,6 +636,7 @@ class Relational : public Operation, LogicalResult, T, T> { : Base{a, b}, opr{r} {} Relational(RelationalOperator r, Expr &&a, Expr &&b) : Base{std::move(a), std::move(b)}, opr{r} {} + bool operator==(const Relational &) const; RelationalOperator opr; }; diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h index 3675d9f924876..a0487e399d936 100644 --- a/flang/include/flang/Evaluate/tools.h +++ b/flang/include/flang/Evaluate/tools.h @@ -218,6 +218,22 @@ template A *UnwrapExpr(std::optional &x) { } } +template const A *UnwrapExpr(const B *x) { + if (x) { + return UnwrapExpr(*x); + } else { + return nullptr; + } +} + +template A *UnwrapExpr(B *x) { + if (x) { + return UnwrapExpr(*x); + } else { + return nullptr; + } +} + // A variant of UnwrapExpr above that also skips through (parentheses) // and conversions of kinds within a category. Useful for extracting LEN // type parameter inquiries, at least. 
diff --git a/flang/lib/Evaluate/expression.cpp b/flang/lib/Evaluate/expression.cpp index 5b0bc14dc3e1b..1a65d4c7362fe 100644 --- a/flang/lib/Evaluate/expression.cpp +++ b/flang/lib/Evaluate/expression.cpp @@ -125,6 +125,24 @@ template LLVM_DUMP_METHOD void ExpressionBase::dump() const { // Equality testing +template bool Extremum::operator==(const Extremum &that) const { + return ordering == that.ordering && Base::operator==(that); +} + +template +bool LogicalOperation::operator==(const LogicalOperation &that) const { + return logicalOperator == that.logicalOperator && Base::operator==(that); +} + +template +bool Relational::operator==(const Relational &that) const { + return opr == that.opr && Base::operator==(that); +} + +bool Relational::operator==(const Relational &that) const { + return u == that.u; +} + bool ImpliedDoIndex::operator==(const ImpliedDoIndex &that) const { return name == that.name; } @@ -181,10 +199,6 @@ bool StructureConstructor::operator==(const StructureConstructor &that) const { return result_ == that.result_ && values_ == that.values_; } -bool Relational::operator==(const Relational &that) const { - return u == that.u; -} - template bool Expr>::operator==( const Expr> &that) const { diff --git a/flang/lib/Evaluate/fold-implementation.h b/flang/lib/Evaluate/fold-implementation.h index 9ce0edbdcb779..1b14a305b87f4 100644 --- a/flang/lib/Evaluate/fold-implementation.h +++ b/flang/lib/Evaluate/fold-implementation.h @@ -1088,24 +1088,42 @@ Expr FoldMINorMAX( static_assert(T::category == TypeCategory::Integer || T::category == TypeCategory::Real || T::category == TypeCategory::Character); - std::vector *> constantArgs; - // Call Folding on all arguments, even if some are not constant, - // to make operand promotion explicit. - for (auto &arg : funcRef.arguments()) { - if (auto *cst{Folder{context}.Folding(arg)}) { - constantArgs.push_back(cst); + auto &args{funcRef.arguments()}; + bool ok{true}; + std::optional> result; + Folder folder{context}; + for (std::optional &arg : args) { + // Call Folding on all arguments to make operand promotion explicit. + if (!folder.Folding(arg)) { + // TODO: Lowering can't handle having every FunctionRef for max and min + // being converted into Extremum. That needs fixing. Until that + // is corrected, however, it is important that max and min references + // in module files be converted into Extremum even when not constant; + // the Extremum operations created to normalize the + // values of array bounds are formatted as max operations in the + // declarations in modules, and need to be read back in as such in + // order for expression comparison to not produce false inequalities + // when checking function results for procedure interface compatibility. + if (!context.moduleFileName()) { + ok = false; + } + } + Expr *argExpr{arg ? 
arg->UnwrapExpr() : nullptr}; + if (argExpr) { + *argExpr = Fold(context, std::move(*argExpr)); + } + if (Expr * tExpr{UnwrapExpr>(argExpr)}) { + if (result) { + result = FoldOperation( + context, Extremum{order, std::move(*result), Expr{*tExpr}}); + } else { + result = Expr{*tExpr}; + } + } else { + ok = false; } } - if (constantArgs.size() != funcRef.arguments().size()) { - return Expr(std::move(funcRef)); - } - CHECK(!constantArgs.empty()); - Expr result{std::move(*constantArgs[0])}; - for (std::size_t i{1}; i < constantArgs.size(); ++i) { - Extremum extremum{order, result, Expr{std::move(*constantArgs[i])}}; - result = FoldOperation(context, std::move(extremum)); - } - return result; + return ok && result ? std::move(*result) : Expr{std::move(funcRef)}; } // For AMAX0, AMIN0, AMAX1, AMIN1, DMAX1, DMIN1, MAX0, MIN0, MAX1, and MIN1 diff --git a/flang/test/Semantics/Inputs/modfile67.mod b/flang/test/Semantics/Inputs/modfile67.mod new file mode 100644 index 0000000000000..1aa0158e35089 --- /dev/null +++ b/flang/test/Semantics/Inputs/modfile67.mod @@ -0,0 +1,16 @@ +!mod$ v1 sum:37cfecee3234c8ab +module modfile67 +type::t +procedure(foo),nopass,pointer::p +end type +contains +pure function foo(n,a) result(r) +integer(4),intent(in)::n +real(4),intent(in)::a(1_8:int(n,kind=8)) +logical(4)::r(1_8:int(int(max(0_8,int(n,kind=8)),kind=4),kind=8)) +end +function fooptr(f) +procedure(foo)::f +type(t)::fooptr +end +end diff --git a/flang/test/Semantics/modfile67.f90 b/flang/test/Semantics/modfile67.f90 new file mode 100644 index 0000000000000..18cf95bd42fbf --- /dev/null +++ b/flang/test/Semantics/modfile67.f90 @@ -0,0 +1,35 @@ +!RUN: %flang_fc1 -fsyntax-only -J%S/Inputs %s + +#if 0 +!modfile67.mod was produced from this source, and must be read into this +!compilation from its module file in order to truly test this fix. +module modfile67 + type t + procedure(foo), nopass, pointer :: p + end type + contains + pure function foo(n,a) result(r) + integer, intent(in) :: n + real, intent(in), dimension(n) :: a + logical, dimension(size(a)) :: r + r = .false. + end + type(t) function fooptr(f) + procedure(foo) f + fooptr%p => f + end +end +#endif + +program test + use modfile67 + type(t) x + x = fooptr(bar) ! ensure no bogus error about procedure incompatibility + contains + pure function bar(n,a) result(r) + integer, intent(in) :: n + real, intent(in), dimension(n) :: a + logical, dimension(size(a)) :: r + r = .false. + end +end From fe58527305d86df8bd9770f3d41a6de420958af7 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 10 Sep 2024 14:11:37 -0700 Subject: [PATCH 016/114] [flang] Relax ETIME(VALUES=) runtime checking (#107647) Don't require the "VALUES=" argument to the extension intrinsic procedure ETIME to have exactly two elements. Other compilers that support ETIME do not, and it's easy to adapt the behavior to whatever the dynamic size turns out to be. --- flang/runtime/time-intrinsic.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/flang/runtime/time-intrinsic.cpp b/flang/runtime/time-intrinsic.cpp index 7e590eabf3966..e6f6e81c7b50c 100644 --- a/flang/runtime/time-intrinsic.cpp +++ b/flang/runtime/time-intrinsic.cpp @@ -490,16 +490,20 @@ void RTNAME(Etime)(const Descriptor *values, const Descriptor *time, auto typeCode{values->type().GetCategoryAndKind()}; // ETIME values argument must have decimal range == 2. 
  RUNTIME_CHECK(terminator,
-      values->rank() == 1 && values->GetDimension(0).Extent() == 2 &&
-          typeCode && typeCode->first == Fortran::common::TypeCategory::Real);
+      values->rank() == 1 && typeCode &&
+          typeCode->first == Fortran::common::TypeCategory::Real);
   // Only accept KIND=4 here.
   int kind{typeCode->second};
   RUNTIME_CHECK(terminator, kind == 4);
-
-  ApplyFloatingPointKind(
-      kind, terminator, *values, /* atIndex = */ 0, usrTime);
-  ApplyFloatingPointKind(
-      kind, terminator, *values, /* atIndex = */ 1, sysTime);
+  auto extent{values->GetDimension(0).Extent()};
+  if (extent >= 1) {
+    ApplyFloatingPointKind(
+        kind, terminator, *values, /* atIndex = */ 0, usrTime);
+  }
+  if (extent >= 2) {
+    ApplyFloatingPointKind(
+        kind, terminator, *values, /* atIndex = */ 1, sysTime);
+  }
  }

  if (time) {

From 26ac30bcec71ae97ba740fb6cf473eac3ac37887 Mon Sep 17 00:00:00 2001
From: Peter Klausler
Date: Tue, 10 Sep 2024 14:13:09 -0700
Subject: [PATCH 017/114] [flang] Accept initialized SAVE local in
 specification expression (#107656)

Specification expressions may contain references to dummy arguments,
host objects, module variables, and variables in COMMON blocks, since
they will have values on entry to the scope. A local variable with an
initializer and the SAVE attribute (which will always be implied by an
explicit initialization) will also always work, and is accepted by at
least one other compiler, so accept it with a warning.

---
 flang/docs/Extensions.md                      |  3 +++
 flang/include/flang/Common/Fortran-features.h |  5 ++++-
 flang/lib/Evaluate/check-expression.cpp       | 17 +++++++++++++++--
 flang/test/Semantics/resolve69.f90            |  6 +++---
 flang/test/Semantics/resolve77.f90            |  1 +
 flang/test/Semantics/spec-expr.f90            |  2 +-
 6 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 340939baca99f..ed1ef49f8b77a 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -386,6 +386,9 @@ end
   probably by a C or C++ external definition.
 * An automatic data object may be declared in the specification part
   of the main program.
+* A local data object may appear in a specification expression, even
+  when it is not a dummy argument or in COMMON, so long as it has
+  the SAVE attribute and was initialized.
### Extensions supported when enabled by options diff --git a/flang/include/flang/Common/Fortran-features.h b/flang/include/flang/Common/Fortran-features.h index 0c8a3d2bd5281..86c6e02b0f2ff 100644 --- a/flang/include/flang/Common/Fortran-features.h +++ b/flang/include/flang/Common/Fortran-features.h @@ -51,7 +51,8 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines, BadBranchTarget, ConvertedArgument, HollerithPolymorphic, ListDirectedSize, NonBindCInteroperability, CudaManaged, CudaUnified, PolymorphicActualAllocatableOrPointerToMonomorphicDummy, RelaxedPureDummy, - UndefinableAsynchronousOrVolatileActual, AutomaticInMainProgram, PrintCptr) + UndefinableAsynchronousOrVolatileActual, AutomaticInMainProgram, PrintCptr, + SavedLocalInSpecExpr) // Portability and suspicious usage warnings ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable, @@ -146,6 +147,8 @@ class LanguageFeatureControl { warnUsage_.set(UsageWarning::VectorSubscriptFinalization); warnUsage_.set(UsageWarning::UndefinedFunctionResult); warnUsage_.set(UsageWarning::UselessIomsg); + // New warnings, on by default + warnLanguage_.set(LanguageFeature::SavedLocalInSpecExpr); } LanguageFeatureControl(const LanguageFeatureControl &) = default; diff --git a/flang/lib/Evaluate/check-expression.cpp b/flang/lib/Evaluate/check-expression.cpp index fef4620857a08..8a90404db0456 100644 --- a/flang/lib/Evaluate/check-expression.cpp +++ b/flang/lib/Evaluate/check-expression.cpp @@ -554,6 +554,18 @@ class CheckSpecificationExprHelper } } else if (&symbol.owner() != &scope_ || &ultimate.owner() != &scope_) { return std::nullopt; // host association is in play + } else if (semantics::IsSaved(ultimate) && + semantics::IsInitialized(ultimate) && + context_.languageFeatures().IsEnabled( + common::LanguageFeature::SavedLocalInSpecExpr)) { + if (!scope_.IsModuleFile() && + context_.languageFeatures().ShouldWarn( + common::LanguageFeature::SavedLocalInSpecExpr)) { + context_.messages().Say( + "specification expression refers to local object '%s' (initialized and saved)"_port_en_US, + ultimate.name().ToString()); + } + return std::nullopt; } else if (const auto *object{ ultimate.detailsIf()}) { if (object->commonBlock()) { @@ -781,8 +793,9 @@ bool CheckSpecificationExprHelper::IsPermissibleInquiry( template void CheckSpecificationExpr(const A &x, const semantics::Scope &scope, FoldingContext &context, bool forElementalFunctionResult) { - if (auto why{CheckSpecificationExprHelper{ - scope, context, forElementalFunctionResult}(x)}) { + CheckSpecificationExprHelper helper{ + scope, context, forElementalFunctionResult}; + if (auto why{helper(x)}) { context.messages().Say("Invalid specification expression%s: %s"_err_en_US, forElementalFunctionResult ? " for elemental function result" : "", *why); diff --git a/flang/test/Semantics/resolve69.f90 b/flang/test/Semantics/resolve69.f90 index e1f7773eee9da..5acfd30604fe3 100644 --- a/flang/test/Semantics/resolve69.f90 +++ b/flang/test/Semantics/resolve69.f90 @@ -16,7 +16,7 @@ subroutine s1() ! 
integer, parameter :: constVal = 1 integer :: nonConstVal = 1 -!ERROR: Invalid specification expression: reference to local entity 'nonconstval' +!PORTABILITY: specification expression refers to local object 'nonconstval' (initialized and saved) character(nonConstVal) :: colonString1 character(len=20, kind=constVal + 1) :: constKindString character(len=:, kind=constVal + 1), pointer :: constKindString1 @@ -53,13 +53,13 @@ function foo3() type (derived(constVal, 3)) :: constDerivedKind !ERROR: Value of KIND type parameter 'typekind' must be constant -!ERROR: Invalid specification expression: reference to local entity 'nonconstval' +!PORTABILITY: specification expression refers to local object 'nonconstval' (initialized and saved) type (derived(nonConstVal, 3)) :: nonConstDerivedKind !OK because all type-params are constants type (derived(3, constVal)) :: constDerivedLen -!ERROR: Invalid specification expression: reference to local entity 'nonconstval' +!PORTABILITY: specification expression refers to local object 'nonconstval' (initialized and saved) type (derived(3, nonConstVal)) :: nonConstDerivedLen !ERROR: 'colonderivedlen' has a type derived(typekind=3_4,typelen=:) with a deferred type parameter but is neither an allocatable nor an object pointer type (derived(3, :)) :: colonDerivedLen diff --git a/flang/test/Semantics/resolve77.f90 b/flang/test/Semantics/resolve77.f90 index 943993ee74d76..0133fac3bfbc5 100644 --- a/flang/test/Semantics/resolve77.f90 +++ b/flang/test/Semantics/resolve77.f90 @@ -60,6 +60,7 @@ pure integer function if2(n) block data common /blk2/ n data n/100/ + !PORTABILITY: specification expression refers to local object 'n' (initialized and saved) !ERROR: Automatic data object 'a' may not appear in a BLOCK DATA subprogram real a(n) end diff --git a/flang/test/Semantics/spec-expr.f90 b/flang/test/Semantics/spec-expr.f90 index aa010ed0bf7ed..9d209c3583b43 100644 --- a/flang/test/Semantics/spec-expr.f90 +++ b/flang/test/Semantics/spec-expr.f90 @@ -104,7 +104,7 @@ subroutine s7biii(x, y) integer :: local = 5 ! OK, since "localConst" is a constant real, dimension(localConst) :: realArray1 - !ERROR: Invalid specification expression: reference to local entity 'local' + !PORTABILITY: specification expression refers to local object 'local' (initialized and saved) real, dimension(local) :: realArray2 real, dimension(size(realArray1)) :: realArray3 ! ok real, dimension(size(x)) :: realArray4 ! ok From cd92c4255582299b9a55fa0dc485982b8f54c49a Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 10 Sep 2024 14:13:28 -0700 Subject: [PATCH 018/114] [flang][runtime] Don't emit runtime error for "AA" editing (#107714) Commas are optional between edit descriptors in a format, so treat "AA" as if it were "A,A". 
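As a quick illustration (this small program is invented here, not part
of the patch), an internal read like the following used to fail with a
runtime format error and is now accepted, with "(AA)" read as "(A,A)":

    program aa_editing
      character(4) :: buf = 'abcd'
      character(2) :: c1, c2
      ! Two A edit descriptors with no separating comma.
      read(buf, '(AA)') c1, c2
      print '(A,1X,A)', c1, c2 ! prints: ab cd
    end program

Each A descriptor without a width takes its field width from the length
of the corresponding list item, so the two reads consume 'ab' and 'cd'.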
---
 flang/runtime/format-implementation.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/flang/runtime/format-implementation.h b/flang/runtime/format-implementation.h
index 74254bebe6e7a..46204ca927c13 100644
--- a/flang/runtime/format-implementation.h
+++ b/flang/runtime/format-implementation.h
@@ -443,8 +443,9 @@ RT_API_ATTRS int FormatControl<CONTEXT>::CueUpNextDataEdit(
     if (ch != 'P') { // 1PE5.2 - comma not required (C1302)
       CharType peek{Capitalize(PeekNext())};
       if (peek >= 'A' && peek <= 'Z') {
-        if (ch == 'A' /* anticipate F'202X AT editing */ || ch == 'B' ||
-            ch == 'D' || ch == 'E' || ch == 'R' || ch == 'S' || ch == 'T') {
+        if ((ch == 'A' && peek == 'T' /* anticipate F'202X AT editing */) ||
+            ch == 'B' || ch == 'D' || ch == 'E' || ch == 'R' || ch == 'S' ||
+            ch == 'T') {
           // Assume a two-letter edit descriptor
           next = peek;
           ++offset_;

From ea858e39bf5b1d09021d142f0c82ef1d4a82d367 Mon Sep 17 00:00:00 2001
From: Peter Klausler
Date: Tue, 10 Sep 2024 14:13:47 -0700
Subject: [PATCH 019/114] [flang][runtime] Accept '\n' as space in internal
 list-directed input (#107716)

When scanning ahead for the first character in the next input item in
list-directed internal input, allow a newline character to appear and
treat it as a space, matching the behavior of nearly all other Fortran
compilers.

---
 flang/runtime/io-stmt.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/flang/runtime/io-stmt.h b/flang/runtime/io-stmt.h
index d67d1ec80afce..2e0ca46078ecd 100644
--- a/flang/runtime/io-stmt.h
+++ b/flang/runtime/io-stmt.h
@@ -194,8 +194,9 @@ class IoStatementState {
       std::size_t &byteCount) {
     auto ch{GetCurrentChar(byteCount)};
     bool inNamelist{mutableModes().inNamelist};
-    while (!ch || *ch == ' ' || *ch == '\t' || (inNamelist && *ch == '!')) {
-      if (ch && (*ch == ' ' || *ch == '\t')) {
+    while (!ch || *ch == ' ' || *ch == '\t' || *ch == '\n' ||
+        (inNamelist && *ch == '!')) {
+      if (ch && (*ch == ' ' || *ch == '\t' || *ch == '\n')) {
         HandleRelativePosition(byteCount);
       } else if (!AdvanceRecord()) {
         return Fortran::common::nullopt;

From 15106c26662a573df31e8dfdd9350c313b8bfd84 Mon Sep 17 00:00:00 2001
From: Peter Klausler
Date: Tue, 10 Sep 2024 14:14:08 -0700
Subject: [PATCH 020/114] [flang][runtime] Fix odd "invalid descriptor" runtime
 crash (#107785)

A defined assignment generic interface for a given LHS/RHS type & rank
combination may have a specific procedure with an LHS dummy argument
that is neither allocatable nor pointer, or specific procedure(s) whose
LHS dummy arguments are allocatable or pointer. It is possible to have
two specific procedures if one's LHS dummy argument is allocatable and
the other's is pointer. However, the runtime doesn't handle LHS dummy
arguments that are allocatable, and will crash with a mysterious
"invalid descriptor" error message.

Extend the list of special bindings to include
ScalarAllocatableAssignment and ScalarPointerAssignment, use them when
appropriate in the runtime type information tables, and handle them in
Assign() in the runtime support library.
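For reference, a minimal sketch of the affected pattern (the type and
procedure names are invented; the new typeinfo12.f90 test below is the
authoritative example): a type-bound defined assignment whose LHS dummy
argument is allocatable, which formerly crashed at run time.

    module m
      type t
        integer :: i
      contains
        ! The LHS is allocatable, so the passed-object dummy must be the RHS.
        procedure, pass(rhs) :: assign_alloc
        generic :: assignment(=) => assign_alloc
      end type
    contains
      subroutine assign_alloc(lhs, rhs)
        class(t), allocatable, intent(out) :: lhs
        class(t), intent(in) :: rhs
        allocate(lhs, source=rhs)
      end subroutine
    end module

    program p
      use m
      class(t), allocatable :: x
      x = t(1) ! formerly died with "invalid descriptor" at run time
    end program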
--- flang/lib/Semantics/expression.cpp | 3 +- flang/lib/Semantics/runtime-type-info.cpp | 21 +++++++-- flang/module/__fortran_type_info.f90 | 13 +++--- flang/runtime/assign.cpp | 16 +++++-- flang/runtime/descriptor-io.h | 2 +- flang/runtime/namelist.cpp | 4 +- flang/runtime/type-info.cpp | 9 +++- flang/runtime/type-info.h | 22 +++++----- flang/test/Semantics/typeinfo01.f90 | 18 ++++---- flang/test/Semantics/typeinfo02.f90 | 4 +- flang/test/Semantics/typeinfo04.f90 | 2 +- flang/test/Semantics/typeinfo12.f90 | 52 +++++++++++++++++++++++ 12 files changed, 126 insertions(+), 40 deletions(-) create mode 100644 flang/test/Semantics/typeinfo12.f90 diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 0eabe532cfe0c..a5b5d48e2bfee 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -4605,7 +4605,8 @@ std::optional ArgumentAnalyzer::GetDefinedAssignmentProc() { } for (std::size_t i{0}; !proc && i < actuals_.size(); ++i) { const Symbol *generic{nullptr}; - if (const Symbol *binding{FindBoundOp(oprName, i, generic, true)}) { + if (const Symbol * + binding{FindBoundOp(oprName, i, generic, /*isSubroutine=*/true)}) { if (CheckAccessibleSymbol(scope, DEREF(generic))) { // ignore inaccessible type-bound ASSIGNMENT(=) generic } else if (const Symbol * diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp index 9f3eb5fbe11a1..427a8421aeaf9 100644 --- a/flang/lib/Semantics/runtime-type-info.cpp +++ b/flang/lib/Semantics/runtime-type-info.cpp @@ -149,6 +149,10 @@ class RuntimeTableBuilder { SomeExpr explicitEnum_; // Value::Genre::Explicit SomeExpr lenParameterEnum_; // Value::Genre::LenParameter SomeExpr scalarAssignmentEnum_; // SpecialBinding::Which::ScalarAssignment + SomeExpr + scalarAllocatableAssignmentEnum_; // SpecialBinding::Which::ScalarAllocatableAssignment + SomeExpr + scalarPointerAssignmentEnum_; // SpecialBinding::Which::ScalarPointerAssignment SomeExpr elementalAssignmentEnum_; // SpecialBinding::Which::ElementalAssignment SomeExpr readFormattedEnum_; // SpecialBinding::Which::ReadFormatted @@ -174,6 +178,9 @@ RuntimeTableBuilder::RuntimeTableBuilder( explicitEnum_{GetEnumValue("explicit")}, lenParameterEnum_{GetEnumValue("lenparameter")}, scalarAssignmentEnum_{GetEnumValue("scalarassignment")}, + scalarAllocatableAssignmentEnum_{ + GetEnumValue("scalarallocatableassignment")}, + scalarPointerAssignmentEnum_{GetEnumValue("scalarpointerassignment")}, elementalAssignmentEnum_{GetEnumValue("elementalassignment")}, readFormattedEnum_{GetEnumValue("readformatted")}, readUnformattedEnum_{GetEnumValue("readunformatted")}, @@ -1122,10 +1129,10 @@ void RuntimeTableBuilder::DescribeSpecialProc( // Non-type-bound generic INTERFACEs and assignments from distinct // types must not be used for component intrinsic assignment. CHECK(proc->dummyArguments.size() == 2); - const auto t1{ + const auto &ddo1{ DEREF(std::get_if( - &proc->dummyArguments[0].u)) - .type.type()}; + &proc->dummyArguments[0].u))}; + const auto t1{ddo1.type.type()}; const auto t2{ DEREF(std::get_if( &proc->dummyArguments[1].u)) @@ -1137,7 +1144,13 @@ void RuntimeTableBuilder::DescribeSpecialProc( return; } which = proc->IsElemental() ? elementalAssignmentEnum_ - : scalarAssignmentEnum_; + : ddo1.attrs.test( + evaluate::characteristics::DummyDataObject::Attr::Allocatable) + ? scalarAllocatableAssignmentEnum_ + : ddo1.attrs.test( + evaluate::characteristics::DummyDataObject::Attr::Pointer) + ? 
scalarPointerAssignmentEnum_ + : scalarAssignmentEnum_; if (binding && binding->passName() && *binding->passName() == proc->dummyArguments[1].name) { argThatMightBeDescriptor = 1; diff --git a/flang/module/__fortran_type_info.f90 b/flang/module/__fortran_type_info.f90 index 5f2273de1e3d1..7dfcfe71fcb32 100644 --- a/flang/module/__fortran_type_info.f90 +++ b/flang/module/__fortran_type_info.f90 @@ -106,11 +106,14 @@ end type enum, bind(c) ! SpecialBinding::Which - enumerator :: ScalarAssignment = 1, ElementalAssignment = 2 - enumerator :: ReadFormatted = 3, ReadUnformatted = 4 - enumerator :: WriteFormatted = 5, WriteUnformatted = 6 - enumerator :: ElementalFinal = 7, AssumedRankFinal = 8 - enumerator :: ScalarFinal = 9 ! higher-rank final procedures follow + enumerator :: ScalarAssignment = 1 + enumerator :: ScalarAllocatableAssignment = 2 + enumerator :: ScalarPointerAssignment = 3 + enumerator :: ElementalAssignment = 4 + enumerator :: ReadFormatted = 5, ReadUnformatted = 6 + enumerator :: WriteFormatted = 7, WriteUnformatted = 8 + enumerator :: ElementalFinal = 9, AssumedRankFinal = 10 + enumerator :: ScalarFinal = 11 ! higher-rank final procedures follow end enum type, bind(c) :: SpecialBinding diff --git a/flang/runtime/assign.cpp b/flang/runtime/assign.cpp index d558ada51cd21..166cf54778921 100644 --- a/flang/runtime/assign.cpp +++ b/flang/runtime/assign.cpp @@ -352,6 +352,17 @@ RT_API_ATTRS static void Assign( // the Assign() is invoked recursively for component-per-component // assignments. if (to.rank() == 0) { + if (to.IsAllocatable()) { + if (const auto *special{toDerived->FindSpecialBinding(typeInfo:: + SpecialBinding::Which::ScalarAllocatableAssignment)}) { + return DoScalarDefinedAssignment(to, from, *special); + } + } else if (to.IsPointer()) { + if (const auto *special{toDerived->FindSpecialBinding( + typeInfo::SpecialBinding::Which::ScalarPointerAssignment)}) { + return DoScalarDefinedAssignment(to, from, *special); + } + } if (const auto *special{toDerived->FindSpecialBinding( typeInfo::SpecialBinding::Which::ScalarAssignment)}) { return DoScalarDefinedAssignment(to, from, *special); @@ -417,9 +428,8 @@ RT_API_ATTRS static void Assign( StaticDescriptor statDesc[2]; Descriptor &toCompDesc{statDesc[0].descriptor()}; Descriptor &fromCompDesc{statDesc[1].descriptor()}; - comp.CreatePointerDescriptor(toCompDesc, to, terminator, toAt); - comp.CreatePointerDescriptor( - fromCompDesc, from, terminator, fromAt); + comp.CreateTargetDescriptor(toCompDesc, to, terminator, toAt); + comp.CreateTargetDescriptor(fromCompDesc, from, terminator, fromAt); Assign(toCompDesc, fromCompDesc, terminator, nestedFlags); } else { // Component has intrinsic type; simply copy raw bytes std::size_t componentByteSize{comp.SizeInBytes(to)}; diff --git a/flang/runtime/descriptor-io.h b/flang/runtime/descriptor-io.h index ff5f683c6da52..66158b4076164 100644 --- a/flang/runtime/descriptor-io.h +++ b/flang/runtime/descriptor-io.h @@ -255,7 +255,7 @@ static RT_API_ATTRS bool DefaultComponentIO(IoStatementState &io, // Create a descriptor for the component StaticDescriptor statDesc; Descriptor &desc{statDesc.descriptor()}; - component.CreatePointerDescriptor( + component.CreateTargetDescriptor( desc, origDescriptor, terminator, origSubscripts); return DescriptorIO(io, desc, table); } else { diff --git a/flang/runtime/namelist.cpp b/flang/runtime/namelist.cpp index af092de70f781..fe26a0d3a6e89 100644 --- a/flang/runtime/namelist.cpp +++ b/flang/runtime/namelist.cpp @@ -362,7 +362,7 @@ static RT_API_ATTRS bool 
HandleComponent(IoStatementState &io, Descriptor &desc, io.HandleRelativePosition(byteCount); // skip over '(' StaticDescriptor staticDesc; Descriptor &tmpDesc{staticDesc.descriptor()}; - comp->CreatePointerDescriptor(tmpDesc, source, handler); + comp->CreateTargetDescriptor(tmpDesc, source, handler); if (!HandleSubscripts(io, desc, tmpDesc, compName)) { return false; } @@ -370,7 +370,7 @@ static RT_API_ATTRS bool HandleComponent(IoStatementState &io, Descriptor &desc, } } if (!createdDesc) { - comp->CreatePointerDescriptor(desc, source, handler); + comp->CreateTargetDescriptor(desc, source, handler); } if (source.rank() > 0) { if (desc.rank() > 0) { diff --git a/flang/runtime/type-info.cpp b/flang/runtime/type-info.cpp index cb18c5669b5ff..531944086c7f7 100644 --- a/flang/runtime/type-info.cpp +++ b/flang/runtime/type-info.cpp @@ -134,7 +134,7 @@ RT_API_ATTRS void Component::EstablishDescriptor(Descriptor &descriptor, } } -RT_API_ATTRS void Component::CreatePointerDescriptor(Descriptor &descriptor, +RT_API_ATTRS void Component::CreateTargetDescriptor(Descriptor &descriptor, const Descriptor &container, Terminator &terminator, const SubscriptValue *subscripts) const { RUNTIME_CHECK(terminator, genre_ == Genre::Data); @@ -144,7 +144,6 @@ RT_API_ATTRS void Component::CreatePointerDescriptor(Descriptor &descriptor, } else { descriptor.set_base_addr(container.OffsetElement() + offset_); } - descriptor.raw().attribute = CFI_attribute_pointer; } RT_API_ATTRS const DerivedType *DerivedType::GetParentType() const { @@ -297,6 +296,12 @@ FILE *SpecialBinding::Dump(FILE *f) const { case Which::ScalarAssignment: std::fputs(" ScalarAssignment", f); break; + case Which::ScalarAllocatableAssignment: + std::fputs(" ScalarAllocatableAssignment", f); + break; + case Which::ScalarPointerAssignment: + std::fputs(" ScalarPointerAssignment", f); + break; case Which::ElementalAssignment: std::fputs(" ElementalAssignment", f); break; diff --git a/flang/runtime/type-info.h b/flang/runtime/type-info.h index c3f3595e32ef2..340971bfacf3e 100644 --- a/flang/runtime/type-info.h +++ b/flang/runtime/type-info.h @@ -89,9 +89,9 @@ class Component { RT_API_ATTRS void EstablishDescriptor( Descriptor &, const Descriptor &container, Terminator &) const; - // Creates a pointer descriptor from this component description, possibly + // Creates a descriptor from this component description, possibly // with subscripts - RT_API_ATTRS void CreatePointerDescriptor(Descriptor &, + RT_API_ATTRS void CreateTargetDescriptor(Descriptor &, const Descriptor &container, Terminator &, const SubscriptValue * = nullptr) const; @@ -126,14 +126,16 @@ class SpecialBinding { enum class Which : std::uint8_t { None = 0, ScalarAssignment = 1, - ElementalAssignment = 2, - ReadFormatted = 3, - ReadUnformatted = 4, - WriteFormatted = 5, - WriteUnformatted = 6, - ElementalFinal = 7, - AssumedRankFinal = 8, - ScalarFinal = 9, + ScalarAllocatableAssignment = 2, + ScalarPointerAssignment = 3, + ElementalAssignment = 4, + ReadFormatted = 5, + ReadUnformatted = 6, + WriteFormatted = 7, + WriteUnformatted = 8, + ElementalFinal = 9, + AssumedRankFinal = 10, + ScalarFinal = 11, // higher-ranked final procedures follow }; diff --git a/flang/test/Semantics/typeinfo01.f90 b/flang/test/Semantics/typeinfo01.f90 index 0d381f10b0483..b6f0e2e12ff6f 100644 --- a/flang/test/Semantics/typeinfo01.f90 +++ b/flang/test/Semantics/typeinfo01.f90 @@ -102,8 +102,8 @@ impure elemental subroutine s1(x, y) class(t), intent(out) :: x class(t), intent(in) :: y end subroutine -!CHECK: 
.dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=16_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=4_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] end module @@ -125,8 +125,8 @@ impure elemental subroutine s3(x) subroutine s4(x) type(t), contiguous :: x(:,:,:) end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=0_1,proc=s3),specialbinding(which=10_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1),specialbinding(which=11_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=1_1,proc=s2),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=1_1,proc=s4)] +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=29184_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=9_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=0_1,proc=s3),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1),specialbinding(which=13_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=1_1,proc=s2),specialbinding(which=14_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=1_1,proc=s4)] end module module m09 @@ -167,8 +167,8 @@ subroutine wu(x,u,iostat,iomsg) integer, intent(out) :: iostat character(len=*), intent(inout) :: iomsg end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) 
init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wu)] +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=480_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=7_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=8_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wu)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:3_8 init:[binding::binding(proc=rf,name=.n.rf),binding(proc=ru,name=.n.ru),binding(proc=wf,name=.n.wf),binding(proc=wu,name=.n.wu)] end module @@ -216,8 +216,8 @@ subroutine wu(x,u,iostat,iomsg) integer, intent(out) :: iostat character(len=*), intent(inout) :: iomsg end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wu)] +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=480_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 
init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=6_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=8_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wu)] end module module m11 @@ -260,7 +260,7 @@ module m13 contains procedure :: assign1, assign2 generic :: assignment(=) => assign1, assign2 - ! CHECK: .s.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=assign1)] + ! CHECK: .s.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=4_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=assign1)] end type contains impure elemental subroutine assign1(to, from) diff --git a/flang/test/Semantics/typeinfo02.f90 b/flang/test/Semantics/typeinfo02.f90 index 29d14c7a0f196..2b911e7238f88 100644 --- a/flang/test/Semantics/typeinfo02.f90 +++ b/flang/test/Semantics/typeinfo02.f90 @@ -29,5 +29,5 @@ subroutine wf2(x,u,iot,v,iostat,iomsg) character(len=*), intent(inout) :: iomsg end subroutine end module -!CHECK: .s.base, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf1)] -!CHECK: .s.extended, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf2)] +!CHECK: .s.base, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf1)] +!CHECK: .s.extended, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf2)] diff --git a/flang/test/Semantics/typeinfo04.f90 b/flang/test/Semantics/typeinfo04.f90 index de8464321a409..2527f656da3d1 100644 --- a/flang/test/Semantics/typeinfo04.f90 +++ b/flang/test/Semantics/typeinfo04.f90 @@ -7,7 +7,7 @@ module m contains final :: final end type -!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=128_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) +!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=512_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) type, abstract :: t1 end type !CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity 
type: TYPE(derivedtype) init:derivedtype(name=.n.t1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) diff --git a/flang/test/Semantics/typeinfo12.f90 b/flang/test/Semantics/typeinfo12.f90 new file mode 100644 index 0000000000000..983e09be0055b --- /dev/null +++ b/flang/test/Semantics/typeinfo12.f90 @@ -0,0 +1,52 @@ +!RUN: bbc --dump-symbols %s | FileCheck %s +!RUN: %flang_fc1 -fdebug-dump-symbols %s | FileCheck %s + +! Test defined assignment with allocatable / pointer LHS arguments. +! The special bindings for the defined assignments must reflect that +! their LHS arguments are allocatables and pointers. +! (This program is executable and should print 1; 102; 3 204.) + +module m + type :: base + integer :: i + contains + procedure, pass(src) :: ass1, ass2 + generic :: assignment(=) => ass1, ass2 + end type base + type, extends(base) :: derived + end type + +!CHECK: .dt.base, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.base,name=.n.base,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.base,procptr=NULL(),special=.s.base,specialbitset=12_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.derived, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.derived,name=.n.derived,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.derived,procptr=NULL(),special=.s.derived,specialbitset=12_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .s.base, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:1_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=ass1),specialbinding(which=3_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=ass2)] +!CHECK: .s.derived, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:1_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=ass1),specialbinding(which=3_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=ass2)] + +contains + subroutine ass1(res, src) + class(base), allocatable, intent(out) :: res + class(base), intent(in) :: src + allocate(res, source=src) + res%i = res%i + 100 + end subroutine + subroutine ass2(res, src) + class(base), pointer, intent(in out) :: res + class(base), intent(in) :: src + allocate(res, source=src) + res%i = src%i + 200 + end subroutine +end +program genext + use m + type(derived) :: od1 + class(base), allocatable :: od2 + class(base), pointer :: od3a, od3b + od1 = derived(1) + print *, od1%i + od2 = derived(2) + print *, od2%i + allocate(od3a) + od3a%i = 3 + od3b => od3a + od3b = derived(4) + print *, od3a%i, od3b%i +end program genext From 37f94cd99a5bc4b186651d6967d8595c4786d8ed Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 10 Sep 2024 14:14:33 -0700 Subject: [PATCH 021/114] [flang] Accept KIND(x) when x is assumed-rank (#107787) Don't emit a bogus error about being unable to forward an assumed-rank dummy argument as an actual argument in the case of the KIND intrinsic function. 
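A sketch of the now-accepted usage (essentially what the new test below
checks; the subroutine name here is invented):

    subroutine s(x)
      real(8) :: x(..) ! assumed-rank dummy argument
      print *, kind(x) ! folds to the constant 8; x itself is never forwarded
    end subroutine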
Fixes https://github.com/llvm/llvm-project/issues/107782.
---
 flang/lib/Evaluate/intrinsics.cpp              | 2 +-
 flang/test/Evaluate/fold-assumed-rank-kind.f90 | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)
 create mode 100644 flang/test/Evaluate/fold-assumed-rank-kind.f90

diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp
index ebe946ac60ccb..876c2aed4ffd6 100644
--- a/flang/lib/Evaluate/intrinsics.cpp
+++ b/flang/lib/Evaluate/intrinsics.cpp
@@ -587,7 +587,7 @@ static const IntrinsicInterface genericIntrinsicFunction[]{
     {"izext", {{"i", AnyInt}}, TypePattern{IntType, KindCode::exactKind, 2}},
     {"jzext", {{"i", AnyInt}}, DefaultInt},
     {"kind",
-        {{"x", AnyIntrinsic, Rank::elemental, Optionality::required,
+        {{"x", AnyIntrinsic, Rank::anyOrAssumedRank, Optionality::required,
             common::Intent::In, {ArgFlag::canBeMoldNull}}},
         DefaultInt, Rank::elemental, IntrinsicClass::inquiryFunction},
     {"lbound",
diff --git a/flang/test/Evaluate/fold-assumed-rank-kind.f90 b/flang/test/Evaluate/fold-assumed-rank-kind.f90
new file mode 100644
index 0000000000000..674f60c6a0e2f
--- /dev/null
+++ b/flang/test/Evaluate/fold-assumed-rank-kind.f90
@@ -0,0 +1,6 @@
+! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
+subroutine subr(ar)
+  real(8) :: ar(..)
+!CHECK: PRINT *, 8_4
+  print *, kind(ar)
+end

From d418a03e01e6a31b51b0c9dd42ba46da6c47f89d Mon Sep 17 00:00:00 2001
From: Peter Klausler
Date: Tue, 10 Sep 2024 14:15:20 -0700
Subject: [PATCH 022/114] [flang] Fix error from semantics on use-associated
 procedure pointer (#107928)

Use-associated procedure pointers were eliciting bogus errors from
semantics if their modules also contained generic procedure interfaces
of the same name. (The compiler handles this case correctly when the
specific procedure of the same name is not a pointer.)

With this fix, the test case in
https://github.com/llvm/llvm-project/issues/107784 no longer
experiences semantic errors; however, it now crashes unexpectedly in
lowering.

---
 flang/include/flang/Semantics/scope.h   |  2 +
 flang/lib/Semantics/compute-offsets.cpp |  7 +++
 flang/lib/Semantics/expression.cpp      |  7 ++-
 flang/lib/Semantics/resolve-names.cpp   | 69 +++++++++++++++++++++----
 flang/lib/Semantics/symbol.cpp          |  3 +-
 flang/test/Semantics/generic10.f90      | 17 ++++++
 6 files changed, 92 insertions(+), 13 deletions(-)
 create mode 100644 flang/test/Semantics/generic10.f90

diff --git a/flang/include/flang/Semantics/scope.h b/flang/include/flang/Semantics/scope.h
index a58163f5460c2..e73a507e9b3f5 100644
--- a/flang/include/flang/Semantics/scope.h
+++ b/flang/include/flang/Semantics/scope.h
@@ -138,6 +138,8 @@ class Scope {
   const_iterator cend() const { return symbols_.cend(); }
 
   // Return symbols in declaration order (the iterators above are in name order)
+  // When a generic procedure interface shadows a derived type or specific
+  // procedure, only the generic's symbol appears in the output.
SymbolVector GetSymbols() const; MutableSymbolVector GetSymbols(); diff --git a/flang/lib/Semantics/compute-offsets.cpp b/flang/lib/Semantics/compute-offsets.cpp index d9a9576e9d676..b5a58ddca0ecd 100644 --- a/flang/lib/Semantics/compute-offsets.cpp +++ b/flang/lib/Semantics/compute-offsets.cpp @@ -114,6 +114,13 @@ void ComputeOffsetsHelper::Compute(Scope &scope) { dependents_.find(symbol) == dependents_.end() && equivalenceBlock_.find(symbol) == equivalenceBlock_.end()) { DoSymbol(*symbol); + if (auto *generic{symbol->detailsIf()}) { + if (Symbol * specific{generic->specific()}; + specific && !FindCommonBlockContaining(*specific)) { + // might be a shadowed procedure pointer + DoSymbol(*specific); + } + } } } // Ensure that the size is a multiple of the alignment diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index a5b5d48e2bfee..943512f75d7eb 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -210,7 +210,8 @@ class ArgumentAnalyzer { // or procedure pointer reference in a ProcedureDesignator. MaybeExpr ExpressionAnalyzer::Designate(DataRef &&ref) { const Symbol &last{ref.GetLastSymbol()}; - const Symbol &symbol{BypassGeneric(last).GetUltimate()}; + const Symbol &specific{BypassGeneric(last)}; + const Symbol &symbol{specific.GetUltimate()}; if (semantics::IsProcedure(symbol)) { if (symbol.attrs().test(semantics::Attr::ABSTRACT)) { Say("Abstract procedure interface '%s' may not be used as a designator"_err_en_US, @@ -226,6 +227,10 @@ MaybeExpr ExpressionAnalyzer::Designate(DataRef &&ref) { } else if (!symbol.attrs().test(semantics::Attr::INTRINSIC)) { if (symbol.has()) { Say("'%s' is not a specific procedure"_err_en_US, last.name()); + } else if (IsProcedurePointer(specific)) { + // For procedure pointers, retain associations so that data accesses + // from client modules will work. + return Expr{ProcedureDesignator{specific}}; } else { return Expr{ProcedureDesignator{symbol}}; } diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index b764678357db3..d8f601212d8d0 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -618,6 +618,20 @@ class ScopeHandler : public ImplicitRulesVisitor { return *derivedType; } } + } else if constexpr (std::is_same_v) { + if (auto *d{symbol->detailsIf()}) { + if (!d->derivedType()) { + // procedure pointer with same name as a generic + auto *specific{d->specific()}; + if (!specific) { + specific = &currScope().MakeSymbol(name, attrs, std::move(details)); + d->set_specific(*specific); + } else { + SayAlreadyDeclared(name, *specific); + } + return *specific; + } + } } if (symbol->CanReplaceDetails(details)) { // update the existing symbol @@ -3035,14 +3049,26 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName, return; } const Symbol &useUltimate{useSymbol.GetUltimate()}; + const auto *useGeneric{useUltimate.detailsIf()}; if (localSymbol->has()) { - localSymbol->set_details(UseDetails{localName, useSymbol}); - localSymbol->attrs() = - useSymbol.attrs() & ~Attrs{Attr::PUBLIC, Attr::PRIVATE, Attr::SAVE}; - localSymbol->implicitAttrs() = - localSymbol->attrs() & Attrs{Attr::ASYNCHRONOUS, Attr::VOLATILE}; - localSymbol->flags() = useSymbol.flags(); - return; + if (useGeneric && useGeneric->specific() && + IsProcedurePointer(*useGeneric->specific())) { + // We are use-associating a generic that shadows a procedure pointer. 
+ // Local references that might be made to that procedure pointer should + // use a UseDetails symbol for proper data addressing. So create an + // empty local generic now into which the use-associated generic may + // be copied. + localSymbol->set_details(GenericDetails{}); + localSymbol->get().set_kind(useGeneric->kind()); + } else { // just create UseDetails + localSymbol->set_details(UseDetails{localName, useSymbol}); + localSymbol->attrs() = + useSymbol.attrs() & ~Attrs{Attr::PUBLIC, Attr::PRIVATE, Attr::SAVE}; + localSymbol->implicitAttrs() = + localSymbol->attrs() & Attrs{Attr::ASYNCHRONOUS, Attr::VOLATILE}; + localSymbol->flags() = useSymbol.flags(); + return; + } } Symbol &localUltimate{localSymbol->GetUltimate()}; @@ -3066,10 +3092,7 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName, // - anything other than a derived type, non-generic procedure, or // generic procedure being combined with something other than an // prior USE association of itself - auto *localGeneric{localUltimate.detailsIf()}; - const auto *useGeneric{useUltimate.detailsIf()}; - Symbol *localDerivedType{nullptr}; if (localUltimate.has()) { localDerivedType = &localUltimate; @@ -3261,6 +3284,15 @@ void ModuleVisitor::DoAddUse(SourceName location, SourceName localName, // At this point, there must be at least one generic interface. CHECK(localGeneric || (useGeneric && (localDerivedType || localProcedure))); + // Ensure that a use-associated specific procedure that is a procedure + // pointer is properly represented as a USE association of an entity. + if (IsProcedurePointer(useProcedure)) { + Symbol &combined{currScope().MakeSymbol(localSymbol->name(), + useProcedure->attrs(), UseDetails{localName, *useProcedure})}; + combined.flags() |= useProcedure->flags(); + combinedProcedure = &combined; + } + if (localGeneric) { // Create a local copy of a previously use-associated generic so that // it can be locally extended without corrupting the original. @@ -5079,7 +5111,22 @@ bool DeclarationVisitor::HasCycle( Symbol &DeclarationVisitor::DeclareProcEntity( const parser::Name &name, Attrs attrs, const Symbol *interface) { - Symbol &symbol{DeclareEntity(name, attrs)}; + Symbol *proc{nullptr}; + if (auto *extant{FindInScope(name)}) { + if (auto *d{extant->detailsIf()}; d && !d->derivedType()) { + // procedure pointer with same name as a generic + if (auto *specific{d->specific()}) { + SayAlreadyDeclared(name, *specific); + } else { + // Create the ProcEntityDetails symbol in the scope as the "specific()" + // symbol behind an existing GenericDetails symbol of the same name. + proc = &Resolve(name, + currScope().MakeSymbol(name.source, attrs, ProcEntityDetails{})); + d->set_specific(*proc); + } + } + } + Symbol &symbol{proc ? 
*proc : DeclareEntity(name, attrs)}; if (auto *details{symbol.detailsIf()}) { if (context().HasError(symbol)) { } else if (HasCycle(symbol, interface)) { diff --git a/flang/lib/Semantics/symbol.cpp b/flang/lib/Semantics/symbol.cpp index b593bf89b18bc..14d6564664f2c 100644 --- a/flang/lib/Semantics/symbol.cpp +++ b/flang/lib/Semantics/symbol.cpp @@ -210,8 +210,9 @@ const Symbol *GenericDetails::CheckSpecific() const { } Symbol *GenericDetails::CheckSpecific() { if (specific_ && !specific_->has()) { + const Symbol &ultimate{specific_->GetUltimate()}; for (const Symbol &proc : specificProcs_) { - if (&proc == specific_) { + if (&proc.GetUltimate() == &ultimate) { return nullptr; } } diff --git a/flang/test/Semantics/generic10.f90 b/flang/test/Semantics/generic10.f90 new file mode 100644 index 0000000000000..203d0bb855208 --- /dev/null +++ b/flang/test/Semantics/generic10.f90 @@ -0,0 +1,17 @@ +! RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s +module m + procedure(func), pointer :: foo + interface foo + procedure :: foo + end interface + contains + function func(x) + func = x + end +end + +program main + use m +!CHECK: foo => func + foo => func +end From 5a2071b184e00f086f5b538f2209bcdb8aba3078 Mon Sep 17 00:00:00 2001 From: Chris Apple Date: Tue, 10 Sep 2024 14:20:32 -0700 Subject: [PATCH 023/114] [compiler-rt][rtsan] Improve error message wording to match ASan style (#107620) --- compiler-rt/lib/rtsan/rtsan_context.cpp | 9 +++++---- compiler-rt/lib/rtsan/tests/rtsan_test_utilities.h | 7 ++++--- compiler-rt/test/rtsan/basic.cpp | 3 ++- compiler-rt/test/rtsan/disabler.cpp | 2 +- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/compiler-rt/lib/rtsan/rtsan_context.cpp b/compiler-rt/lib/rtsan/rtsan_context.cpp index a49b70360babb..8609394fa222f 100644 --- a/compiler-rt/lib/rtsan/rtsan_context.cpp +++ b/compiler-rt/lib/rtsan/rtsan_context.cpp @@ -95,10 +95,11 @@ void __rtsan::PrintDiagnostics(const char *intercepted_function_name, uptr pc, uptr bp) { ScopedErrorReportLock l; - fprintf(stderr, - "Real-time violation: intercepted call to real-time unsafe function " - "`%s` in real-time context! Stack trace:\n", - intercepted_function_name); + Report("ERROR: RealtimeSanitizer: unsafe-library-call\n"); + Printf("Intercepted call to real-time unsafe function " + "`%s` in real-time context!\n", + intercepted_function_name); + __rtsan::PrintStackTrace(pc, bp); } diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_utilities.h b/compiler-rt/lib/rtsan/tests/rtsan_test_utilities.h index 6ca09cf657094..4ba4fc5e53086 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_utilities.h +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_utilities.h @@ -30,9 +30,10 @@ void ExpectRealtimeDeath(Function &&Func, auto GetExpectedErrorSubstring = [&]() -> std::string { return intercepted_method_name != nullptr - ? "Real-time violation: intercepted call to real-time unsafe " - "function `" + - std::string(intercepted_method_name) + "`" + ? ".*==ERROR: RealtimeSanitizer: unsafe-library-call.*" + "Intercepted call to real-time unsafe function `" + + std::string(intercepted_method_name) + + "` in real-time context!" 
: ""; }; diff --git a/compiler-rt/test/rtsan/basic.cpp b/compiler-rt/test/rtsan/basic.cpp index f4075bb27e4f9..607db90213a30 100644 --- a/compiler-rt/test/rtsan/basic.cpp +++ b/compiler-rt/test/rtsan/basic.cpp @@ -17,6 +17,7 @@ void violation() [[clang::nonblocking]] { int main() { violation(); return 0; - // CHECK: Real-time violation: intercepted call to real-time unsafe function `malloc` in real-time context! Stack trace: + // CHECK: ==ERROR: RealtimeSanitizer: unsafe-library-call + // CHECK-NEXT: Intercepted call to real-time unsafe function `malloc` in real-time context! // CHECK-NEXT: {{.*malloc*}} } diff --git a/compiler-rt/test/rtsan/disabler.cpp b/compiler-rt/test/rtsan/disabler.cpp index 0a6411a2be694..dd1d4439beae4 100644 --- a/compiler-rt/test/rtsan/disabler.cpp +++ b/compiler-rt/test/rtsan/disabler.cpp @@ -41,7 +41,7 @@ int main() { // CHECK: Allocated pointer {{.*}} in disabled context // CHECK: Allocated second pointer {{.*}} in disabled context // CHECK: Free'd second pointer in disabled context - // CHECK: {{.*Real-time violation.*}} + // CHECK: ==ERROR: RealtimeSanitizer: unsafe-library-call // CHECK-NOT: {{.*malloc*}} // CHECK-NEXT: {{.*free.*}} } From 5495c36104103c4172808a28e8b2df3c806b1d85 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 10 Sep 2024 14:22:57 -0700 Subject: [PATCH 024/114] [WebAssembly] Misc. refactoring in AsmTypeCheck (NFC) (#107978) Existing methods in AsmTypeCheck assumes symbol operand is the 0th operand; they take a `MCInst` and take `getOperand(0)` on it. I think passing a `MCOperand` removes this assumption and also is more intuitive. This was motivated by a new `try_table` instruction, whose support is going to be added to AsmTypeCheck soon, which has tag symbol operands in any position, depending on the number and the kinds of catch clauses. This PR changes all methods' signature that assumes the 0th operand is the relevant one, even if it's not the symbol operand. This also adds `getSignature` method, which factors out the common task when getting a `WasmSignature` from a `MCOperand`. 
--- .../AsmParser/WebAssemblyAsmTypeCheck.cpp | 92 +++++++++++-------- .../AsmParser/WebAssemblyAsmTypeCheck.h | 11 ++- 2 files changed, 62 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp index 9f9e7d1c0ed06..ec3d51d4e0e84 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.cpp @@ -112,9 +112,9 @@ bool WebAssemblyAsmTypeCheck::popRefType(SMLoc ErrorLoc) { return false; } -bool WebAssemblyAsmTypeCheck::getLocal(SMLoc ErrorLoc, const MCInst &Inst, +bool WebAssemblyAsmTypeCheck::getLocal(SMLoc ErrorLoc, const MCOperand &LocalOp, wasm::ValType &Type) { - auto Local = static_cast(Inst.getOperand(0).getImm()); + auto Local = static_cast(LocalOp.getImm()); if (Local >= LocalTypes.size()) return typeError(ErrorLoc, StringRef("no local type specified for index ") + std::to_string(Local)); @@ -178,21 +178,21 @@ bool WebAssemblyAsmTypeCheck::checkSig(SMLoc ErrorLoc, return false; } -bool WebAssemblyAsmTypeCheck::getSymRef(SMLoc ErrorLoc, const MCInst &Inst, +bool WebAssemblyAsmTypeCheck::getSymRef(SMLoc ErrorLoc, const MCOperand &SymOp, const MCSymbolRefExpr *&SymRef) { - auto Op = Inst.getOperand(0); - if (!Op.isExpr()) + if (!SymOp.isExpr()) return typeError(ErrorLoc, StringRef("expected expression operand")); - SymRef = dyn_cast(Op.getExpr()); + SymRef = dyn_cast(SymOp.getExpr()); if (!SymRef) return typeError(ErrorLoc, StringRef("expected symbol operand")); return false; } -bool WebAssemblyAsmTypeCheck::getGlobal(SMLoc ErrorLoc, const MCInst &Inst, +bool WebAssemblyAsmTypeCheck::getGlobal(SMLoc ErrorLoc, + const MCOperand &GlobalOp, wasm::ValType &Type) { const MCSymbolRefExpr *SymRef; - if (getSymRef(ErrorLoc, Inst, SymRef)) + if (getSymRef(ErrorLoc, GlobalOp, SymRef)) return true; auto WasmSym = cast(&SymRef->getSymbol()); switch (WasmSym->getType().value_or(wasm::WASM_SYMBOL_TYPE_DATA)) { @@ -217,10 +217,10 @@ bool WebAssemblyAsmTypeCheck::getGlobal(SMLoc ErrorLoc, const MCInst &Inst, return false; } -bool WebAssemblyAsmTypeCheck::getTable(SMLoc ErrorLoc, const MCInst &Inst, +bool WebAssemblyAsmTypeCheck::getTable(SMLoc ErrorLoc, const MCOperand &TableOp, wasm::ValType &Type) { const MCSymbolRefExpr *SymRef; - if (getSymRef(ErrorLoc, Inst, SymRef)) + if (getSymRef(ErrorLoc, TableOp, SymRef)) return true; auto WasmSym = cast(&SymRef->getSymbol()); if (WasmSym->getType().value_or(wasm::WASM_SYMBOL_TYPE_DATA) != @@ -231,6 +231,34 @@ bool WebAssemblyAsmTypeCheck::getTable(SMLoc ErrorLoc, const MCInst &Inst, return false; } +bool WebAssemblyAsmTypeCheck::getSignature(SMLoc ErrorLoc, + const MCOperand &SigOp, + wasm::WasmSymbolType Type, + const wasm::WasmSignature *&Sig) { + const MCSymbolRefExpr *SymRef = nullptr; + if (getSymRef(ErrorLoc, SigOp, SymRef)) + return true; + const auto *WasmSym = cast(&SymRef->getSymbol()); + Sig = WasmSym->getSignature(); + + if (!Sig || WasmSym->getType() != Type) { + const char *TypeName = nullptr; + switch (Type) { + case wasm::WASM_SYMBOL_TYPE_FUNCTION: + TypeName = "func"; + break; + case wasm::WASM_SYMBOL_TYPE_TAG: + TypeName = "tag"; + break; + default: + return true; + } + return typeError(ErrorLoc, StringRef("symbol ") + WasmSym->getName() + + ": missing ." + TypeName + "type"); + } + return false; +} + bool WebAssemblyAsmTypeCheck::endOfFunction(SMLoc ErrorLoc) { // Check the return types. 
for (auto RVT : llvm::reverse(ReturnTypes)) { @@ -252,48 +280,48 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst, dumpTypeStack("typechecking " + Name + ": "); wasm::ValType Type; if (Name == "local.get") { - if (getLocal(Operands[1]->getStartLoc(), Inst, Type)) + if (getLocal(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; Stack.push_back(Type); } else if (Name == "local.set") { - if (getLocal(Operands[1]->getStartLoc(), Inst, Type)) + if (getLocal(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, Type)) return true; } else if (Name == "local.tee") { - if (getLocal(Operands[1]->getStartLoc(), Inst, Type)) + if (getLocal(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, Type)) return true; Stack.push_back(Type); } else if (Name == "global.get") { - if (getGlobal(Operands[1]->getStartLoc(), Inst, Type)) + if (getGlobal(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; Stack.push_back(Type); } else if (Name == "global.set") { - if (getGlobal(Operands[1]->getStartLoc(), Inst, Type)) + if (getGlobal(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, Type)) return true; } else if (Name == "table.get") { - if (getTable(Operands[1]->getStartLoc(), Inst, Type)) + if (getTable(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, wasm::ValType::I32)) return true; Stack.push_back(Type); } else if (Name == "table.set") { - if (getTable(Operands[1]->getStartLoc(), Inst, Type)) + if (getTable(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, Type)) return true; if (popType(ErrorLoc, wasm::ValType::I32)) return true; } else if (Name == "table.size") { - if (getTable(Operands[1]->getStartLoc(), Inst, Type)) + if (getTable(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; Stack.push_back(wasm::ValType::I32); } else if (Name == "table.grow") { - if (getTable(Operands[1]->getStartLoc(), Inst, Type)) + if (getTable(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, wasm::ValType::I32)) return true; @@ -301,7 +329,7 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst, return true; Stack.push_back(wasm::ValType::I32); } else if (Name == "table.fill") { - if (getTable(Operands[1]->getStartLoc(), Inst, Type)) + if (getTable(Operands[1]->getStartLoc(), Inst.getOperand(0), Type)) return true; if (popType(ErrorLoc, wasm::ValType::I32)) return true; @@ -352,15 +380,10 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst, return true; Unreachable = false; if (Name == "catch") { - const MCSymbolRefExpr *SymRef; - if (getSymRef(Operands[1]->getStartLoc(), Inst, SymRef)) + const wasm::WasmSignature *Sig = nullptr; + if (getSignature(Operands[1]->getStartLoc(), Inst.getOperand(0), + wasm::WASM_SYMBOL_TYPE_TAG, Sig)) return true; - const auto *WasmSym = cast(&SymRef->getSymbol()); - const auto *Sig = WasmSym->getSignature(); - if (!Sig || WasmSym->getType() != wasm::WASM_SYMBOL_TYPE_TAG) - return typeError(Operands[1]->getStartLoc(), StringRef("symbol ") + - WasmSym->getName() + - ": missing .tagtype"); // catch instruction pushes values whose types are specified in the tag's // "params" part Stack.insert(Stack.end(), Sig->Params.begin(), Sig->Params.end()); @@ -383,15 +406,10 @@ bool WebAssemblyAsmTypeCheck::typeCheck(SMLoc ErrorLoc, const MCInst &Inst, if 
(Name == "return_call_indirect" && endOfFunction(ErrorLoc)) return true; } else if (Name == "call" || Name == "return_call") { - const MCSymbolRefExpr *SymRef; - if (getSymRef(Operands[1]->getStartLoc(), Inst, SymRef)) - return true; - auto WasmSym = cast(&SymRef->getSymbol()); - auto Sig = WasmSym->getSignature(); - if (!Sig || WasmSym->getType() != wasm::WASM_SYMBOL_TYPE_FUNCTION) - return typeError(Operands[1]->getStartLoc(), StringRef("symbol ") + - WasmSym->getName() + - ": missing .functype"); + const wasm::WasmSignature *Sig = nullptr; + if (getSignature(Operands[1]->getStartLoc(), Inst.getOperand(0), + wasm::WASM_SYMBOL_TYPE_FUNCTION, Sig)) + return true; if (checkSig(ErrorLoc, *Sig)) return true; if (Name == "return_call" && endOfFunction(ErrorLoc)) diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h index 6fa95c3929753..9ba5693719e91 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmTypeCheck.h @@ -41,14 +41,17 @@ class WebAssemblyAsmTypeCheck final { bool typeError(SMLoc ErrorLoc, const Twine &Msg); bool popType(SMLoc ErrorLoc, std::optional EVT); bool popRefType(SMLoc ErrorLoc); - bool getLocal(SMLoc ErrorLoc, const MCInst &Inst, wasm::ValType &Type); + bool getLocal(SMLoc ErrorLoc, const MCOperand &LocalOp, wasm::ValType &Type); bool checkEnd(SMLoc ErrorLoc, bool PopVals = false); bool checkBr(SMLoc ErrorLoc, size_t Level); bool checkSig(SMLoc ErrorLoc, const wasm::WasmSignature &Sig); - bool getSymRef(SMLoc ErrorLoc, const MCInst &Inst, + bool getSymRef(SMLoc ErrorLoc, const MCOperand &SymOp, const MCSymbolRefExpr *&SymRef); - bool getGlobal(SMLoc ErrorLoc, const MCInst &Inst, wasm::ValType &Type); - bool getTable(SMLoc ErrorLoc, const MCInst &Inst, wasm::ValType &Type); + bool getGlobal(SMLoc ErrorLoc, const MCOperand &GlobalOp, + wasm::ValType &Type); + bool getTable(SMLoc ErrorLoc, const MCOperand &TableOp, wasm::ValType &Type); + bool getSignature(SMLoc ErrorLoc, const MCOperand &SigOp, + wasm::WasmSymbolType Type, const wasm::WasmSignature *&Sig); public: WebAssemblyAsmTypeCheck(MCAsmParser &Parser, const MCInstrInfo &MII, From ace6d5f2ce53ae88205fc39dafa45e5682fd9a52 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Tue, 10 Sep 2024 14:33:04 -0700 Subject: [PATCH 025/114] [SandboxIR] Fix base class of FenceInst. Verify instructions when building a BB in debug mode. (#108078) @vporpo suggested in an offline conversation that verifying all instructions during `BasicBlock::buildBasicBlockFromLLVMIR` would be a good way to get coverage for errors like this during testing. He also suggested not gating it on `SBVEC_EXPENSIVE_CHECKS` for now as the checks are pretty basic at the moment and they only affect Debug builds. 
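For reference, the verification hook boils down to the following (condensed from the diff below; in Debug builds it now runs on every block built from LLVM IR, with no `SBVEC_EXPENSIVE_CHECKS` gate):

```cpp
// Condensed sketch of the new out-of-line BasicBlock::verify(): check the
// block itself, then run the existing per-instruction verify() on every
// instruction, so buildBasicBlockFromLLVMIR() exercises all of them.
#ifndef NDEBUG
void BasicBlock::verify() const {
  assert(isa<llvm::BasicBlock>(Val) && "Expected BasicBlock!");
  for (const auto &I : *this)
    I.verify();
}
#endif // NDEBUG
```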
--- llvm/include/llvm/SandboxIR/SandboxIR.h | 6 ++---- llvm/lib/SandboxIR/SandboxIR.cpp | 10 +++++++++- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 8f025f7257b39..91d6b58cfee00 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -1194,9 +1194,7 @@ class BasicBlock : public Value { Instruction &back() const; #ifndef NDEBUG - void verify() const final { - assert(isa(Val) && "Expected BasicBlock!"); - } + void verify() const final; void dumpOS(raw_ostream &OS) const final; #endif }; @@ -1435,7 +1433,7 @@ template class SingleLLVMInstructionImpl : public Instruction { #endif }; -class FenceInst : public SingleLLVMInstructionImpl { +class FenceInst : public SingleLLVMInstructionImpl { FenceInst(llvm::FenceInst *FI, Context &Ctx) : SingleLLVMInstructionImpl(ClassID::Fence, Opcode::Fence, FI, Ctx) {} friend Context; // For constructor; diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index 07472d1bff47b..a4b68bd8ffd7c 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -3173,7 +3173,7 @@ void BasicBlock::buildBasicBlockFromLLVMIR(llvm::BasicBlock *LLVMBB) { Ctx.getOrCreateValue(Op); } } -#if !defined(NDEBUG) && defined(SBVEC_EXPENSIVE_CHECKS) +#if !defined(NDEBUG) verify(); #endif } @@ -3249,4 +3249,12 @@ void BasicBlock::dumpOS(raw_ostream &OS) const { } } } + +void BasicBlock::verify() const { + assert(isa(Val) && "Expected BasicBlock!"); + for (const auto &I : *this) { + I.verify(); + } +} + #endif // NDEBUG From 0fc4147c6d0c5bfb1fd9ed2a9f1c3a70e9281813 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 10 Sep 2024 14:23:16 -0700 Subject: [PATCH 026/114] [RISCV] Add test coverage for mul X, C where C=2^N*(3,5,9)*(3,5,9) --- llvm/test/CodeGen/RISCV/rv64zba.ll | 85 ++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index 62595fd4a7ad6..19d7324eeff4a 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -569,6 +569,60 @@ define i64 @addmul72(i64 %a, i64 %b) { ret i64 %d } +define i64 @mul50(i64 %a) { +; CHECK-LABEL: mul50: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 50 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 50 + ret i64 %c +} + +define i64 @addmul50(i64 %a, i64 %b) { +; CHECK-LABEL: addmul50: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 50 +; CHECK-NEXT: mul a0, a0, a2 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 50 + %d = add i64 %c, %b + ret i64 %d +} + +define i64 @mul100(i64 %a) { +; CHECK-LABEL: mul100: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 100 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 100 + ret i64 %c +} + +define i64 @addmul100(i64 %a, i64 %b) { +; CHECK-LABEL: addmul100: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 100 +; CHECK-NEXT: mul a0, a0, a2 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 100 + %d = add i64 %c, %b + ret i64 %d +} + +define i64 @mul162(i64 %a) { +; CHECK-LABEL: mul162: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 162 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 162 + ret i64 %c +} + define i64 @addmul162(i64 %a, i64 %b) { ; CHECK-LABEL: addmul162: ; CHECK: # %bb.0: @@ -581,6 +635,16 @@ define i64 @addmul162(i64 %a, i64 %b) { ret i64 %d } +define i64 @mul180(i64 %a) { +; CHECK-LABEL: mul180: +; CHECK: 
# %bb.0: +; CHECK-NEXT: li a1, 180 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 180 + ret i64 %c +} + define i64 @addmul180(i64 %a, i64 %b) { ; CHECK-LABEL: addmul180: ; CHECK: # %bb.0: @@ -605,6 +669,27 @@ define i64 @add255mul180(i64 %a) { ret i64 %d } +define i64 @mul200(i64 %a) { +; CHECK-LABEL: mul200: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 200 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 200 + ret i64 %c +} + +define i64 @addmul200(i64 %a, i64 %b) { +; CHECK-LABEL: addmul200: +; CHECK: # %bb.0: +; CHECK-NEXT: li a2, 200 +; CHECK-NEXT: mul a0, a0, a2 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 200 + %d = add i64 %c, %b + ret i64 %d +} define i64 @addmul4096(i64 %a, i64 %b) { ; CHECK-LABEL: addmul4096: From 2ddf21bc702de25a34bb4a13b3610d8cc6bf3ca0 Mon Sep 17 00:00:00 2001 From: vporpo Date: Tue, 10 Sep 2024 14:42:09 -0700 Subject: [PATCH 027/114] [SandboxIR] Pass registry (#108084) This patch implements a simple Pass Registry class, which takes ownership of the passes registered with it and provides an interface to get the pass pointer by its name. --- llvm/include/llvm/SandboxIR/PassManager.h | 29 +++++++++++++++++++++++ llvm/lib/SandboxIR/Pass.cpp | 1 + llvm/lib/SandboxIR/PassManager.cpp | 6 +++++ llvm/unittests/SandboxIR/PassTest.cpp | 29 +++++++++++++++++++++++ 4 files changed, 65 insertions(+) diff --git a/llvm/include/llvm/SandboxIR/PassManager.h b/llvm/include/llvm/SandboxIR/PassManager.h index cb321fe699a56..5e250641f3b3f 100644 --- a/llvm/include/llvm/SandboxIR/PassManager.h +++ b/llvm/include/llvm/SandboxIR/PassManager.h @@ -18,6 +18,7 @@ #ifndef LLVM_SANDBOXIR_PASSMANAGER_H #define LLVM_SANDBOXIR_PASSMANAGER_H +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/SandboxIR/Pass.h" #include "llvm/Support/Debug.h" @@ -65,6 +66,34 @@ class FunctionPassManager final bool runOnFunction(Function &F) final; }; +/// Owns the passes and provides an API to get a pass by its name. +class PassRegistry { + SmallVector, 8> Passes; + DenseMap NameToPassMap; + +public: + PassRegistry() = default; + /// Registers \p PassPtr and takes ownership. + Pass ®isterPass(std::unique_ptr &&PassPtr) { + auto &PassRef = *PassPtr.get(); + NameToPassMap[PassRef.getName()] = &PassRef; + Passes.push_back(std::move(PassPtr)); + return PassRef; + } + /// \Returns the pass with name \p Name, or null if not registered. + Pass *getPassByName(StringRef Name) const { + auto It = NameToPassMap.find(Name); + return It != NameToPassMap.end() ? 
It->second : nullptr; + } +#ifndef NDEBUG + void print(raw_ostream &OS) const { + for (const auto &PassPtr : Passes) + OS << PassPtr->getName() << "\n"; + } + LLVM_DUMP_METHOD void dump() const; +#endif +}; + } // namespace llvm::sandboxir #endif // LLVM_SANDBOXIR_PASSMANAGER_H diff --git a/llvm/lib/SandboxIR/Pass.cpp b/llvm/lib/SandboxIR/Pass.cpp index 64e1b609a9f49..c6ec1aec48b19 100644 --- a/llvm/lib/SandboxIR/Pass.cpp +++ b/llvm/lib/SandboxIR/Pass.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/SandboxIR/Pass.h" +#include "llvm/SandboxIR/PassManager.h" #include "llvm/Support/Debug.h" using namespace llvm::sandboxir; diff --git a/llvm/lib/SandboxIR/PassManager.cpp b/llvm/lib/SandboxIR/PassManager.cpp index d10f3926f7bcd..2dd19e74734db 100644 --- a/llvm/lib/SandboxIR/PassManager.cpp +++ b/llvm/lib/SandboxIR/PassManager.cpp @@ -20,3 +20,9 @@ bool FunctionPassManager::runOnFunction(Function &F) { // TODO: Check ChangeAll against hashes before/after. return Change; } +#ifndef NDEBUG +void PassRegistry::dump() const { + print(dbgs()); + dbgs() << "\n"; +} +#endif // NDEBUG diff --git a/llvm/unittests/SandboxIR/PassTest.cpp b/llvm/unittests/SandboxIR/PassTest.cpp index 8e080128b15b3..3517f0e32b1bb 100644 --- a/llvm/unittests/SandboxIR/PassTest.cpp +++ b/llvm/unittests/SandboxIR/PassTest.cpp @@ -133,3 +133,32 @@ define void @foo() { EXPECT_EQ(Buff, "test-fpm(test-pass1,test-pass2)"); #endif // NDEBUG } + +TEST_F(PassTest, PassRegistry) { + class TestPass1 final : public FunctionPass { + public: + TestPass1() : FunctionPass("test-pass1") {} + bool runOnFunction(Function &F) final { return false; } + }; + class TestPass2 final : public FunctionPass { + public: + TestPass2() : FunctionPass("test-pass2") {} + bool runOnFunction(Function &F) final { return false; } + }; + + PassRegistry Registry; + auto &TP1 = Registry.registerPass(std::make_unique()); + auto &TP2 = Registry.registerPass(std::make_unique()); + + // Check getPassByName(). + EXPECT_EQ(Registry.getPassByName("test-pass1"), &TP1); + EXPECT_EQ(Registry.getPassByName("test-pass2"), &TP2); + +#ifndef NDEBUG + // Check print(). + std::string Buff; + llvm::raw_string_ostream SS(Buff); + Registry.print(SS); + EXPECT_EQ(Buff, "test-pass1\ntest-pass2\n"); +#endif // NDEBUG +} From d452429821d3263a73b27387324bc272b47ed1bf Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 10 Sep 2024 15:00:42 -0700 Subject: [PATCH 028/114] [flang] Fix shared library flang build (#108101) I broke the shared library builds a few minutes ago by introducing a cyclic dependency between two parts of the compiler. Fix. 
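The essence of the fix, as visible in the diff below, is to replicate the handful of checks this call site needed from `semantics::IsInitialized()` directly in lib/Evaluate, so the call that pulled in the cyclic library dependency goes away. A simplified sketch (names as in the diff below; not a drop-in replacement for the general helper):

```cpp
// Sketch: compute the predicate locally instead of calling the helper whose
// home library completed the dependency cycle. `ultimate` is the resolved
// symbol and `object` its object-entity details, as in the diff below.
bool isInitialized{semantics::IsSaved(ultimate) && !IsAllocatable(ultimate) &&
                   object &&
                   (ultimate.test(Symbol::Flag::InDataStmt) ||
                    object->init().has_value())};
```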
--- flang/lib/Evaluate/check-expression.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/flang/lib/Evaluate/check-expression.cpp b/flang/lib/Evaluate/check-expression.cpp
index 8a90404db0456..a1ede7d7553bf 100644
--- a/flang/lib/Evaluate/check-expression.cpp
+++ b/flang/lib/Evaluate/check-expression.cpp
@@ -525,6 +525,11 @@ class CheckSpecificationExprHelper
   Result operator()(const semantics::Symbol &symbol) const {
     const auto &ultimate{symbol.GetUltimate()};
+    const auto *object{ultimate.detailsIf()};
+    bool isInitialized{semantics::IsSaved(ultimate) &&
+        !IsAllocatable(ultimate) && object &&
+        (ultimate.test(Symbol::Flag::InDataStmt) ||
+            object->init().has_value())};
     if (const auto *assoc{
             ultimate.detailsIf()}) {
       return (*this)(assoc->expr());
@@ -554,8 +559,7 @@ class CheckSpecificationExprHelper
       }
     } else if (&symbol.owner() != &scope_ || &ultimate.owner() != &scope_) {
       return std::nullopt; // host association is in play
-    } else if (semantics::IsSaved(ultimate) &&
-        semantics::IsInitialized(ultimate) &&
+    } else if (isInitialized &&
         context_.languageFeatures().IsEnabled(
             common::LanguageFeature::SavedLocalInSpecExpr)) {
       if (!scope_.IsModuleFile() &&

From 957af7373881e62eec34ca87106fa2a2c2391d8e Mon Sep 17 00:00:00 2001
From: Vitaly Buka
Date: Tue, 10 Sep 2024 15:07:13 -0700
Subject: [PATCH 029/114] [sanitizer] Add CHECKs to validate calculated TLS range (#107941)

---
 compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp
index 666e6f3b35106..a1107ff7d2473 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp
@@ -151,6 +151,10 @@ DTLS::DTV *DTLS_on_tls_get_addr(void *arg_void, void *res,
     // This may happen inside the DTOR of main thread, so just ignore it.
     tls_size = 0;
   }
+  if (tls_size) {
+    CHECK_LE(tls_beg, reinterpret_cast(res));
+    CHECK_LT(reinterpret_cast(res), tls_beg + tls_size);
+  }
   dtv->beg = tls_beg;
   dtv->size = tls_size;
   return dtv;

From 10c04d9873dbbbca26f4d996396da297b9144add Mon Sep 17 00:00:00 2001
From: Jacob Lalonde
Date: Tue, 10 Sep 2024 15:12:56 -0700
Subject: [PATCH 030/114] [LLDB] Skip Summary Statistics Tests for Windows (#108079)

Follow-up to #102708: the tests are failing on Windows. There is a large
variance in these tests between summary strings and built-in types. I'm
disabling these tests on Windows and will add Windows-specific tests as a
follow-up.
--- lldb/test/API/commands/statistics/basic/TestStats.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/test/API/commands/statistics/basic/TestStats.py b/lldb/test/API/commands/statistics/basic/TestStats.py index 03ec169344705..a0a9eeb649320 100644 --- a/lldb/test/API/commands/statistics/basic/TestStats.py +++ b/lldb/test/API/commands/statistics/basic/TestStats.py @@ -921,6 +921,7 @@ def test_order_of_options_do_not_matter(self): f"The order of options '{options[0]}' and '{options[1]}' should not matter", ) + @skipIfWindows def test_summary_statistics_providers(self): """ Test summary timing statistics is included in statistics dump when @@ -960,6 +961,7 @@ def test_summary_statistics_providers(self): self.assertIn("'totalTime':", summary_provider_str) self.assertIn("'type': 'python'", summary_provider_str) + @skipIfWindows def test_summary_statistics_providers_vec(self): """ Test summary timing statistics is included in statistics dump when From 6007ad79afeffb1288781b4a7241290386293aff Mon Sep 17 00:00:00 2001 From: "Henrik G. Olsson" Date: Tue, 10 Sep 2024 15:19:52 -0700 Subject: [PATCH 031/114] Revert "[llvm-lit] Process ANSI color codes in test output when formatting" (#108104) Reverts llvm/llvm-project#106776 because of a test failure on Windows. --- llvm/utils/lit/lit/TestRunner.py | 28 ++----------------- .../Inputs/escape-color/color-escaped.txt | 10 ------- .../lit/tests/Inputs/escape-color/color.txt | 6 ---- .../lit/tests/Inputs/escape-color/lit.cfg | 8 ------ llvm/utils/lit/tests/escape-color.py | 4 --- 5 files changed, 2 insertions(+), 54 deletions(-) delete mode 100644 llvm/utils/lit/tests/Inputs/escape-color/color-escaped.txt delete mode 100644 llvm/utils/lit/tests/Inputs/escape-color/color.txt delete mode 100644 llvm/utils/lit/tests/Inputs/escape-color/lit.cfg delete mode 100644 llvm/utils/lit/tests/escape-color.py diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index a2c76d41a43e0..19f35fc7e212f 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -1017,20 +1017,6 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): return exitCode -def findColor(line, curr_color): - start = line.rfind("\33[") - if start == -1: - return curr_color - end = line.find("m", start+2) - if end == -1: - return curr_color - match = line[start:end+1] - # "\33[0m" means "reset all formatting". Sometimes the 0 is skipped. 
- if match == "\33[m" or match == "\33[0m": - return None - return match - - def formatOutput(title, data, limit=None): if not data.strip(): return "" @@ -1041,18 +1027,8 @@ def formatOutput(title, data, limit=None): msg = "" ndashes = 30 # fmt: off - out = f"# .---{title}{'-' * (ndashes - 4 - len(title))}\n" - curr_color = None - for line in data.splitlines(): - if curr_color: - out += "\33[0m" - out += "# | " - if curr_color: - out += curr_color - out += line + "\n" - curr_color = findColor(line, curr_color) - if curr_color: - out += "\33[0m" # prevent unterminated formatting from leaking + out = f"# .---{title}{'-' * (ndashes - 4 - len(title))}\n" + out += f"# | " + "\n# | ".join(data.splitlines()) + "\n" out += f"# `---{msg}{'-' * (ndashes - 4 - len(msg))}\n" # fmt: on return out diff --git a/llvm/utils/lit/tests/Inputs/escape-color/color-escaped.txt b/llvm/utils/lit/tests/Inputs/escape-color/color-escaped.txt deleted file mode 100644 index e7a33e380b351..0000000000000 --- a/llvm/utils/lit/tests/Inputs/escape-color/color-escaped.txt +++ /dev/null @@ -1,10 +0,0 @@ -# .---command stdout------------ -# | # RUN: cat %s -# | red -# | still red(B -# | plain -# | green -# | still green (never terminated) -# `----------------------------- - --- diff --git a/llvm/utils/lit/tests/Inputs/escape-color/color.txt b/llvm/utils/lit/tests/Inputs/escape-color/color.txt deleted file mode 100644 index 15ffc22d134f0..0000000000000 --- a/llvm/utils/lit/tests/Inputs/escape-color/color.txt +++ /dev/null @@ -1,6 +0,0 @@ -# RUN: cat %s -red -still red(B -plain -green -still green (never terminated) diff --git a/llvm/utils/lit/tests/Inputs/escape-color/lit.cfg b/llvm/utils/lit/tests/Inputs/escape-color/lit.cfg deleted file mode 100644 index 36f4eb69d4858..0000000000000 --- a/llvm/utils/lit/tests/Inputs/escape-color/lit.cfg +++ /dev/null @@ -1,8 +0,0 @@ -import lit.formats - -config.name = "escape-color" -config.suffixes = [".txt"] -config.test_format = lit.formats.ShTest() -config.test_source_root = None -config.test_exec_root = None - diff --git a/llvm/utils/lit/tests/escape-color.py b/llvm/utils/lit/tests/escape-color.py deleted file mode 100644 index 1d0b93b004e9d..0000000000000 --- a/llvm/utils/lit/tests/escape-color.py +++ /dev/null @@ -1,4 +0,0 @@ -# cut off the first 9 lines to avoid absolute file paths in the output -# then keep only the next 10 lines to avoid test timing in the output -# RUN: %{lit} %{inputs}/escape-color/color.txt -a | tail -n +10 | head -n 10 > %t -# RUN: diff %{inputs}/escape-color/color-escaped.txt %t From d14a600b1eb650f05fcd56a7b790e30f1f52e751 Mon Sep 17 00:00:00 2001 From: vporpo Date: Tue, 10 Sep 2024 15:28:21 -0700 Subject: [PATCH 032/114] [SandboxIR] Implement BlockAddress (#107940) This patch implements sandboxir::BlockAddress mirroring llvm:BlockAddress. 
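A short usage sketch of the new constant (hypothetical driver code: `F` and `BB` stand for a `sandboxir::Function *` and a `sandboxir::BasicBlock *` already created in a `sandboxir::Context`; the API names come from the diff below):

```cpp
// Create (or re-use) the address-of-block constant, the sandboxir counterpart
// of LLVM IR's blockaddress(@fn, %bb).
auto *Addr = sandboxir::BlockAddress::get(F, BB);
// get(BB) is equivalent when BB is already embedded in its function.
assert(sandboxir::BlockAddress::get(BB) == Addr);
// The constant remembers what it points at.
assert(Addr->getFunction() == F && Addr->getBasicBlock() == BB);
// lookup() is non-creating: it returns null if BB's address was never taken.
sandboxir::BlockAddress *Existing = sandboxir::BlockAddress::lookup(BB);
```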
--- llvm/include/llvm/SandboxIR/SandboxIR.h | 29 +++++++++++ .../llvm/SandboxIR/SandboxIRValues.def | 1 + llvm/lib/SandboxIR/SandboxIR.cpp | 32 ++++++++++++- llvm/unittests/SandboxIR/SandboxIRTest.cpp | 48 +++++++++++++++++++ 4 files changed, 109 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 91d6b58cfee00..2fdbbbd094650 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -123,6 +123,7 @@ class ConstantFP; class ConstantAggregateZero; class ConstantPointerNull; class PoisonValue; +class BlockAddress; class Context; class Function; class Instruction; @@ -323,6 +324,7 @@ class Value { friend class ConstantPointerNull; // For `Val`. friend class UndefValue; // For `Val`. friend class PoisonValue; // For `Val`. + friend class BlockAddress; // For `Val`. /// All values point to the context. Context &Ctx; @@ -1112,6 +1114,33 @@ class PoisonValue final : public UndefValue { #endif }; +class BlockAddress final : public Constant { + BlockAddress(llvm::BlockAddress *C, Context &Ctx) + : Constant(ClassID::BlockAddress, C, Ctx) {} + friend class Context; // For constructor. + +public: + /// Return a BlockAddress for the specified function and basic block. + static BlockAddress *get(Function *F, BasicBlock *BB); + + /// Return a BlockAddress for the specified basic block. The basic + /// block must be embedded into a function. + static BlockAddress *get(BasicBlock *BB); + + /// Lookup an existing \c BlockAddress constant for the given BasicBlock. + /// + /// \returns 0 if \c !BB->hasAddressTaken(), otherwise the \c BlockAddress. + static BlockAddress *lookup(const BasicBlock *BB); + + Function *getFunction() const; + BasicBlock *getBasicBlock() const; + + /// For isa/dyn_cast. + static bool classof(const sandboxir::Value *From) { + return From->getSubclassID() == ClassID::BlockAddress; + } +}; + /// Iterator for `Instruction`s in a `BasicBlock. /// \Returns an sandboxir::Instruction & when derereferenced. 
class BBIterator { diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index 459226216703d..c29e8be24ea75 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -34,6 +34,7 @@ DEF_CONST(ConstantAggregateZero, ConstantAggregateZero) DEF_CONST(ConstantPointerNull, ConstantPointerNull) DEF_CONST(UndefValue, UndefValue) DEF_CONST(PoisonValue, PoisonValue) +DEF_CONST(BlockAddress, BlockAddress) #ifndef DEF_INSTR #define DEF_INSTR(ID, OPCODE, CLASS) diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index a4b68bd8ffd7c..18fdcda15a1a9 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -2489,6 +2489,32 @@ PoisonValue *PoisonValue::getElementValue(unsigned Idx) const { cast(Val)->getElementValue(Idx))); } +BlockAddress *BlockAddress::get(Function *F, BasicBlock *BB) { + auto *LLVMC = llvm::BlockAddress::get(cast(F->Val), + cast(BB->Val)); + return cast(F->getContext().getOrCreateConstant(LLVMC)); +} + +BlockAddress *BlockAddress::get(BasicBlock *BB) { + auto *LLVMC = llvm::BlockAddress::get(cast(BB->Val)); + return cast(BB->getContext().getOrCreateConstant(LLVMC)); +} + +BlockAddress *BlockAddress::lookup(const BasicBlock *BB) { + auto *LLVMC = llvm::BlockAddress::lookup(cast(BB->Val)); + return cast_or_null(BB->getContext().getValue(LLVMC)); +} + +Function *BlockAddress::getFunction() const { + return cast( + Ctx.getValue(cast(Val)->getFunction())); +} + +BasicBlock *BlockAddress::getBasicBlock() const { + return cast( + Ctx.getValue(cast(Val)->getBasicBlock())); +} + FunctionType *Function::getFunctionType() const { return cast( Ctx.getType(cast(Val)->getFunctionType())); @@ -2585,6 +2611,10 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { It->second = std::unique_ptr( new ConstantFP(cast(C), *this)); return It->second.get(); + case llvm::Value::BlockAddressVal: + It->second = std::unique_ptr( + new BlockAddress(cast(C), *this)); + return It->second.get(); case llvm::Value::ConstantAggregateZeroVal: { auto *CAZ = cast(C); It->second = std::unique_ptr( @@ -2640,7 +2670,7 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { return It->second.get(); } if (auto *BB = dyn_cast(LLVMV)) { - assert(isa(U) && + assert(isa(U) && "This won't create a SBBB, don't call this function directly!"); if (auto *SBBB = getValue(BB)) return SBBB; diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 1b939b4d047aa..b76d24dc297b9 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -729,6 +729,54 @@ define void @foo() { EXPECT_EQ(UndefStruct->getNumElements(), 2u); } +TEST_F(SandboxIRTest, BlockAddress) { + parseIR(C, R"IR( +define void @foo(ptr %ptr) { +bb0: + store ptr blockaddress(@foo, %bb0), ptr %ptr + ret void +bb1: + ret void +bb2: + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + auto &F = *Ctx.createFunction(&LLVMF); + auto *BB0 = cast( + Ctx.getValue(getBasicBlockByName(LLVMF, "bb0"))); + auto *BB1 = cast( + Ctx.getValue(getBasicBlockByName(LLVMF, "bb1"))); + auto *BB2 = cast( + Ctx.getValue(getBasicBlockByName(LLVMF, "bb2"))); + auto It = BB0->begin(); + auto *SI = cast(&*It++); + [[maybe_unused]] auto *Ret = cast(&*It++); + + // Check classof(), creation, getFunction(), getBasicBlock(). 
+ auto *BB0Addr = cast(SI->getValueOperand()); + EXPECT_EQ(BB0Addr->getBasicBlock(), BB0); + EXPECT_EQ(BB0Addr->getFunction(), &F); + // Check get(F, BB). + auto *NewBB0Addr = sandboxir::BlockAddress::get(&F, BB0); + EXPECT_EQ(NewBB0Addr, BB0Addr); + // Check get(BB). + auto *NewBB0Addr2 = sandboxir::BlockAddress::get(BB0); + EXPECT_EQ(NewBB0Addr2, BB0Addr); + auto *BB1Addr = sandboxir::BlockAddress::get(BB1); + EXPECT_EQ(BB1Addr->getBasicBlock(), BB1); + EXPECT_NE(BB1Addr, BB0Addr); + // Check lookup(). + auto *LookupBB0Addr = sandboxir::BlockAddress::lookup(BB0); + EXPECT_EQ(LookupBB0Addr, BB0Addr); + auto *LookupBB1Addr = sandboxir::BlockAddress::lookup(BB1); + EXPECT_EQ(LookupBB1Addr, BB1Addr); + auto *LookupBB2Addr = sandboxir::BlockAddress::lookup(BB2); + EXPECT_EQ(LookupBB2Addr, nullptr); +} + TEST_F(SandboxIRTest, Use) { parseIR(C, R"IR( define i32 @foo(i32 %v0, i32 %v1) { From bb7286515c0b285382f370232f97ffa7cfcbc550 Mon Sep 17 00:00:00 2001 From: Sterling-Augustine <56981066+Sterling-Augustine@users.noreply.github.com> Date: Tue, 10 Sep 2024 22:44:30 +0000 Subject: [PATCH 033/114] [SandboxIR] Implement FixedVectorType (#107930) --- llvm/include/llvm/SandboxIR/Type.h | 46 ++++++++++++++++++++ llvm/lib/SandboxIR/Type.cpp | 5 +++ llvm/unittests/SandboxIR/TypesTest.cpp | 58 ++++++++++++++++++++++++++ 3 files changed, 109 insertions(+) diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h index 44aee4e4a5b46..ec141c249fb21 100644 --- a/llvm/include/llvm/SandboxIR/Type.h +++ b/llvm/include/llvm/SandboxIR/Type.h @@ -25,6 +25,7 @@ class Context; // Forward declare friend classes for MSVC. class PointerType; class VectorType; +class FixedVectorType; class IntegerType; class FunctionType; class ArrayType; @@ -41,6 +42,7 @@ class Type { friend class ArrayType; // For LLVMTy. friend class StructType; // For LLVMTy. friend class VectorType; // For LLVMTy. + friend class FixedVectorType; // For LLVMTy. friend class PointerType; // For LLVMTy. friend class FunctionType; // For LLVMTy. friend class IntegerType; // For LLVMTy. 
@@ -344,6 +346,50 @@ class VectorType : public Type { } }; +class FixedVectorType : public VectorType { +public: + static FixedVectorType *get(Type *ElementType, unsigned NumElts); + + static FixedVectorType *get(Type *ElementType, const FixedVectorType *FVTy) { + return get(ElementType, FVTy->getNumElements()); + } + + static FixedVectorType *getInteger(FixedVectorType *VTy) { + return cast(VectorType::getInteger(VTy)); + } + + static FixedVectorType *getExtendedElementVectorType(FixedVectorType *VTy) { + return cast(VectorType::getExtendedElementVectorType(VTy)); + } + + static FixedVectorType *getTruncatedElementVectorType(FixedVectorType *VTy) { + return cast( + VectorType::getTruncatedElementVectorType(VTy)); + } + + static FixedVectorType *getSubdividedVectorType(FixedVectorType *VTy, + int NumSubdivs) { + return cast( + VectorType::getSubdividedVectorType(VTy, NumSubdivs)); + } + + static FixedVectorType *getHalfElementsVectorType(FixedVectorType *VTy) { + return cast(VectorType::getHalfElementsVectorType(VTy)); + } + + static FixedVectorType *getDoubleElementsVectorType(FixedVectorType *VTy) { + return cast(VectorType::getDoubleElementsVectorType(VTy)); + } + + static bool classof(const Type *T) { + return isa(T->LLVMTy); + } + + unsigned getNumElements() const { + return cast(LLVMTy)->getNumElements(); + } +}; + class FunctionType : public Type { public: // TODO: add missing functions diff --git a/llvm/lib/SandboxIR/Type.cpp b/llvm/lib/SandboxIR/Type.cpp index bf9f02e2ba311..26aa8b3743084 100644 --- a/llvm/lib/SandboxIR/Type.cpp +++ b/llvm/lib/SandboxIR/Type.cpp @@ -103,6 +103,11 @@ bool VectorType::isValidElementType(Type *ElemTy) { return llvm::VectorType::isValidElementType(ElemTy->LLVMTy); } +FixedVectorType *FixedVectorType::get(Type *ElementType, unsigned NumElts) { + return cast(ElementType->getContext().getType( + llvm::FixedVectorType::get(ElementType->LLVMTy, NumElts))); +} + IntegerType *IntegerType::get(Context &Ctx, unsigned NumBits) { return cast( Ctx.getType(llvm::IntegerType::get(Ctx.LLVMCtx, NumBits))); diff --git a/llvm/unittests/SandboxIR/TypesTest.cpp b/llvm/unittests/SandboxIR/TypesTest.cpp index e4f9235c1ef3c..3564ae6683014 100644 --- a/llvm/unittests/SandboxIR/TypesTest.cpp +++ b/llvm/unittests/SandboxIR/TypesTest.cpp @@ -323,6 +323,64 @@ define void @foo(<4 x i16> %vi0, <4 x float> %vf1, i8 %i0) { EXPECT_FALSE(sandboxir::VectorType::isValidElementType(FVecTy)); } +TEST_F(SandboxTypeTest, FixedVectorType) { + parseIR(C, R"IR( +define void @foo(<4 x i16> %vi0, <4 x float> %vf1, i8 %i0) { + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + // Check classof(), creation, accessors + auto *Vec4i16Ty = cast(F->getArg(0)->getType()); + EXPECT_TRUE(Vec4i16Ty->getElementType()->isIntegerTy(16)); + EXPECT_EQ(Vec4i16Ty->getElementCount(), ElementCount::getFixed(4)); + + // get(ElementType, NumElements) + EXPECT_EQ( + sandboxir::FixedVectorType::get(sandboxir::Type::getInt16Ty(Ctx), 4), + F->getArg(0)->getType()); + // get(ElementType, Other) + EXPECT_EQ(sandboxir::FixedVectorType::get( + sandboxir::Type::getInt16Ty(Ctx), + cast(F->getArg(0)->getType())), + F->getArg(0)->getType()); + auto *Vec4FTy = cast(F->getArg(1)->getType()); + EXPECT_TRUE(Vec4FTy->getElementType()->isFloatTy()); + // getInteger + auto *Vec4i32Ty = sandboxir::FixedVectorType::getInteger(Vec4FTy); + EXPECT_TRUE(Vec4i32Ty->getElementType()->isIntegerTy(32)); + EXPECT_EQ(Vec4i32Ty->getElementCount(), 
Vec4FTy->getElementCount()); + // getExtendedElementCountVectorType + auto *Vec4i64Ty = + sandboxir::FixedVectorType::getExtendedElementVectorType(Vec4i16Ty); + EXPECT_TRUE(Vec4i64Ty->getElementType()->isIntegerTy(32)); + EXPECT_EQ(Vec4i64Ty->getElementCount(), Vec4i16Ty->getElementCount()); + // getTruncatedElementVectorType + auto *Vec4i8Ty = + sandboxir::FixedVectorType::getTruncatedElementVectorType(Vec4i16Ty); + EXPECT_TRUE(Vec4i8Ty->getElementType()->isIntegerTy(8)); + EXPECT_EQ(Vec4i8Ty->getElementCount(), Vec4i8Ty->getElementCount()); + // getSubdividedVectorType + auto *Vec8i8Ty = + sandboxir::FixedVectorType::getSubdividedVectorType(Vec4i16Ty, 1); + EXPECT_TRUE(Vec8i8Ty->getElementType()->isIntegerTy(8)); + EXPECT_EQ(Vec8i8Ty->getElementCount(), ElementCount::getFixed(8)); + // getNumElements + EXPECT_EQ(Vec8i8Ty->getNumElements(), 8u); + // getHalfElementsVectorType + auto *Vec2i16Ty = + sandboxir::FixedVectorType::getHalfElementsVectorType(Vec4i16Ty); + EXPECT_TRUE(Vec2i16Ty->getElementType()->isIntegerTy(16)); + EXPECT_EQ(Vec2i16Ty->getElementCount(), ElementCount::getFixed(2)); + // getDoubleElementsVectorType + auto *Vec8i16Ty = + sandboxir::FixedVectorType::getDoubleElementsVectorType(Vec4i16Ty); + EXPECT_TRUE(Vec8i16Ty->getElementType()->isIntegerTy(16)); + EXPECT_EQ(Vec8i16Ty->getElementCount(), ElementCount::getFixed(8)); +} + TEST_F(SandboxTypeTest, FunctionType) { parseIR(C, R"IR( define void @foo() { From 5804193e38680683b370cb3ced46c018d4dbd1b2 Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Tue, 10 Sep 2024 16:14:07 -0700 Subject: [PATCH 034/114] Revert "[sanitizer] Add CHECKs to validate calculated TLS range" (#108112) Reverts llvm/llvm-project#107941 Broke PPC bot --- compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp index a1107ff7d2473..666e6f3b35106 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_tls_get_addr.cpp @@ -151,10 +151,6 @@ DTLS::DTV *DTLS_on_tls_get_addr(void *arg_void, void *res, // This may happen inside the DTOR of main thread, so just ignore it. tls_size = 0; } - if (tls_size) { - CHECK_LE(tls_beg, reinterpret_cast(res)); - CHECK_LT(reinterpret_cast(res), tls_beg + tls_size); - } dtv->beg = tls_beg; dtv->size = tls_size; return dtv; From 829ea59ddaf0ddfa1d9316a9260bd3ba17562ffe Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Tue, 10 Sep 2024 16:21:01 -0700 Subject: [PATCH 035/114] [docs] Add a section on AI-generated content to the developer policy (#91014) Governments around the world are starting to require labelling for AI-generated content, and some LLVM stakeholders have asked if LLVM contains AI-generated content. Defining a policy on the use of AI tools allows us to answer that question affirmatively, one way of the other. The policy proposed here allows the use of AI tools in LLVM contributions, flowing from the idea that any contribution is fine regardless of how it is made, as long as the contributor has the right to license it under the project license. 
I gathered input from the community in this RFC and incorporated it into the policy: https://discourse.llvm.org/t/rfc-define-policy-on-ai-tool-usage-in-contributions/78758 --- llvm/docs/DeveloperPolicy.rst | 28 +++++++++++++++++++++++++++- llvm/docs/FAQ.rst | 7 +++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/llvm/docs/DeveloperPolicy.rst b/llvm/docs/DeveloperPolicy.rst index 49ec310b382f9..f74adc4702d38 100644 --- a/llvm/docs/DeveloperPolicy.rst +++ b/llvm/docs/DeveloperPolicy.rst @@ -1077,6 +1077,8 @@ If you have questions or comments about these topics, please ask on the please realize that most compiler developers are not lawyers, and therefore you will not be getting official legal advice. +.. _LLVM Discourse forums: https://discourse.llvm.org + Copyright --------- @@ -1301,4 +1303,28 @@ to move code from (e.g.) libc++ to the LLVM core without concern, but that code cannot be moved from the LLVM core to libc++ without the copyright owner's permission. -.. _LLVM Discourse forums: https://discourse.llvm.org +.. _ai contributions: + +AI generated contributions +-------------------------- + +Artificial intelligence systems raise many questions around copyright that have +yet to be answered. Our policy on AI tools is guided by our copyright policy: +Contributors are responsible for ensuring that they have the right to contribute +code under the terms of our license, typically meaning that either they, their +employer, or their collaborators hold the copyright. Using AI tools to +regenerate copyrighted material does not remove the copyright, and contributors +are responsible for ensuring that such material does not appear in their +contributions. + +As such, the LLVM policy is that contributors are permitted to use artificial +intelligence tools to produce contributions, provided that they have the right +to license that code under the project license. Contributions found to violate +this policy will be removed just like any other offending contribution. + +While the LLVM project has a liberal policy on AI tool use, contributors are +considered responsible for their contributions. We encourage contributors to +review all generated code before sending it for review to verify its +correctness and to understand it so that they can answer questions during code +review. Reviewing and maintaining generated code that the original contributor +does not understand is not a good use of limited project resources. diff --git a/llvm/docs/FAQ.rst b/llvm/docs/FAQ.rst index 229ac99f703c1..aa20de47a6998 100644 --- a/llvm/docs/FAQ.rst +++ b/llvm/docs/FAQ.rst @@ -22,6 +22,13 @@ Yes. This is why we distribute LLVM under a less restrictive license than GPL, as explained in the first question above. +Can I use AI coding tools, such as GitHub co-pilot, to write LLVM patches? +-------------------------------------------------------------------------- +Yes, as long as the resulting work can be licensed under the project license, as +covered in the :doc:`DeveloperPolicy`. Using an AI tool to reproduce copyrighted +work does not rinse it of copyright and grant you the right to relicense it. + + Source Code =========== From ae5f1a78d3a930466f927989faac8e0b9d820a7b Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Tue, 10 Sep 2024 16:27:56 -0700 Subject: [PATCH 036/114] [MemProf] Convert CallContextInfo to a struct (NFC) (#108086) As suggested in #107918, improve readability by converting this tuple to a struct. 
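To make the readability gain concrete, a before/after fragment (illustrative only; field names match the struct in the diff below):

```cpp
{ // Before: CallContextInfo was a std::tuple, so use sites carried
  // positional indices.
  auto &Ids = std::get<1>(Calls[I]); // stack ids -- which slot was 1 again?
  auto *Func = std::get<2>(Calls[I]);
}
{ // After: the struct names each member at the use site.
  auto &Ids = Calls[I].StackIds;
  auto *Func = Calls[I].Func;
}
```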
--- .../IPO/MemProfContextDisambiguation.cpp | 42 ++++++++++++------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 5c09bb1800cb2..fa25baee2ba03 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -470,8 +470,20 @@ class CallsiteContextGraph { private: using EdgeIter = typename std::vector>::iterator; - using CallContextInfo = std::tuple, - const FuncTy *, DenseSet>; + // Structure to keep track of information for each call as we are matching + // non-allocation callsites onto context nodes created from the allocation + // call metadata / summary contexts. + struct CallContextInfo { + // The callsite we're trying to match. + CallTy Call; + // The callsites stack ids that have a context node in the graph. + std::vector StackIds; + // The function containing this callsite. + const FuncTy *Func; + // Initially empty, if needed this will be updated to contain the context + // ids for use in a new context node created for this callsite. + DenseSet ContextIds; + }; /// Assigns the given Node to calls at or inlined into the location with /// the Node's stack id, after post order traversing and processing its @@ -1458,7 +1470,7 @@ void CallsiteContextGraph::updateStackNodes() { auto &Calls = It.getSecond(); // Skip single calls with a single stack id. These don't need a new node. if (Calls.size() == 1) { - auto &Ids = std::get<1>(Calls[0]); + auto &Ids = Calls[0].StackIds; if (Ids.size() == 1) continue; } @@ -1474,18 +1486,15 @@ void CallsiteContextGraph::updateStackNodes() { // that to sort by. DenseMap FuncToIndex; for (const auto &[Idx, CallCtxInfo] : enumerate(Calls)) - FuncToIndex.insert({std::get<2>(CallCtxInfo), Idx}); + FuncToIndex.insert({CallCtxInfo.Func, Idx}); std::stable_sort( Calls.begin(), Calls.end(), [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) { - auto &IdsA = std::get<1>(A); - auto &IdsB = std::get<1>(B); - auto *FuncA = std::get<2>(A); - auto *FuncB = std::get<2>(B); - return IdsA.size() > IdsB.size() || - (IdsA.size() == IdsB.size() && - (IdsA < IdsB || - (IdsA == IdsB && FuncToIndex[FuncA] < FuncToIndex[FuncB]))); + return A.StackIds.size() > B.StackIds.size() || + (A.StackIds.size() == B.StackIds.size() && + (A.StackIds < B.StackIds || + (A.StackIds == B.StackIds && + FuncToIndex[A.Func] < FuncToIndex[B.Func]))); }); // Find the node for the last stack id, which should be the same @@ -1520,7 +1529,7 @@ void CallsiteContextGraph::updateStackNodes() { #ifndef NDEBUG // If this call has a different set of ids than the last one, clear the // set used to ensure they are sorted properly. - if (I > 0 && Ids != std::get<1>(Calls[I - 1])) + if (I > 0 && Ids != Calls[I - 1].StackIds) MatchingIdsFuncSet.clear(); else // If the prior call had the same stack ids this set would not be empty. @@ -1607,17 +1616,18 @@ void CallsiteContextGraph::updateStackNodes() { // assigned to the same context node, and skip them. bool DuplicateContextIds = false; for (unsigned J = I + 1; J < Calls.size(); J++) { - auto &NextIds = std::get<1>(Calls[J]); + auto &CallCtxInfo = Calls[J]; + auto &NextIds = CallCtxInfo.StackIds; if (NextIds != Ids) break; - auto *NextFunc = std::get<2>(Calls[J]); + auto *NextFunc = CallCtxInfo.Func; if (NextFunc != Func) { // We have another Call with the same ids but that cannot share this // node, must duplicate ids for it. 
DuplicateContextIds = true;
         break;
       }
-      auto &NextCall = std::get<0>(Calls[J]);
+      auto &NextCall = CallCtxInfo.Call;
       CallToMatchingCall[NextCall] = Call;
       // Update I so that it gets incremented correctly to skip this call.
       I = J;

From d2f25e5405cce348913994db71a5efb0c1cf7f28 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 10 Sep 2024 16:38:26 -0700
Subject: [PATCH 037/114] [LegalizeTypes] Avoid creating an unused node in ExpandIntRes_ADDSUB. NFC

The Hi result is sometimes calculated a different way and this node goes
unused. Defer creation until we know for sure it is needed.

The test changes are because the node creation order changed the names in the
debug output.
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp   | 7 ++++---
 .../update_llc_test_checks/Inputs/lanai_isel.ll.expected | 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index a1cb74f43e605..2fa9e46eae506 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -3496,7 +3496,6 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
   if (N->getOpcode() == ISD::ADD) {
     Lo = DAG.getNode(ISD::ADD, dl, NVT, LoOps);
-    Hi = DAG.getNode(ISD::ADD, dl, NVT, ArrayRef(HiOps, 2));
     SDValue Cmp;
     // Special case: X+1 has a carry out if X+1==0. This may reduce the live
     // range of X. We assume comparing with 0 is cheap.
@@ -3521,10 +3520,12 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
     Carry = DAG.getSelect(dl, NVT, Cmp, DAG.getConstant(1, dl, NVT),
                           DAG.getConstant(0, dl, NVT));
-    if (isAllOnesConstant(LoOps[1]) && isAllOnesConstant(HiOps[1]))
+    if (isAllOnesConstant(LoOps[1]) && isAllOnesConstant(HiOps[1])) {
       Hi = DAG.getNode(ISD::SUB, dl, NVT, HiOps[0], Carry);
-    else
+    } else {
+      Hi = DAG.getNode(ISD::ADD, dl, NVT, ArrayRef(HiOps, 2));
       Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry);
+    }
   } else {
     Lo = DAG.getNode(ISD::SUB, dl, NVT, LoOps);
     Hi = DAG.getNode(ISD::SUB, dl, NVT, ArrayRef(HiOps, 2));
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected
index 71e82eca6c3e3..936efa378c1a4 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/lanai_isel.ll.expected
@@ -12,10 +12,10 @@ define i64 @i64_test(i64 %i) nounwind readnone {
 ; CHECK-NEXT: t24: i32 = ADD_R t5, t22, TargetConstant:i32<0>
 ; CHECK-NEXT: t3: i32,ch = LDW_RI TargetFrameIndex:i32<-1>, TargetConstant:i32<0>, TargetConstant:i32<0>, t0
 ; CHECK-NEXT: t19: i32,ch = LDW_RI TargetFrameIndex:i32<0>, TargetConstant:i32<0>, TargetConstant:i32<0>, t0
-; CHECK-NEXT: t25: i32 = ADD_R t3, t19, TargetConstant:i32<0>
+; CHECK-NEXT: t27: i32 = ADD_R t3, t19, TargetConstant:i32<0>
 ; CHECK-NEXT: t30: i32,glue = SFSUB_F_RR t24, t5
 ; CHECK-NEXT: t31: i32 = SCC TargetConstant:i32<4>, t30:1
-; CHECK-NEXT: t28: i32 = ADD_R t25, t31, TargetConstant:i32<0>
+; CHECK-NEXT: t28: i32 = ADD_R t27, t31, TargetConstant:i32<0>
 ; CHECK-NEXT: t15: ch,glue = CopyToReg t0, Register:i32 $rv, t28
 ; CHECK-NEXT: t17: ch,glue = CopyToReg t15, Register:i32 $r9, t24, t15:1
 ; CHECK-NEXT: t18: ch = RET Register:i32 $rv, Register:i32 $r9, t17, t17:1

From db7e8f2ae81fe10170dc202e45ee8b784e75c74c Mon Sep 17 00:00:00 2001
From: Arthur Eubanks
Date: Tue, 10 Sep
2024 17:02:15 -0700 Subject: [PATCH 038/114] [compiler-rt] Hardcode uptr/sptr typedefs on Linux Arm (#108105) After #106155, Android arm32 asan builds stopped working with missing definition linker errors. This is due to inconsistent definitions of `uptr` of either `unsigned long` or `unsigned int` even between TUs in compiler-rt. This is caused by Linux arm32 headers redefining `__UINTPTR_TYPE__` (see `arch/arm/include/uapi/asm/types.h` in the Linux kernel repo), meaning include order/whether or not the Linux header is included changes compiler-rt symbol mangling. As a workaround, this hardcodes `uptr`/`sptr` in compiler-rt to `unsigned int`/`int` on Linux arm32, matching clang/gcc. --- compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h index f8f03454ea169..9208b12552ff5 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h @@ -139,8 +139,14 @@ namespace __sanitizer { #if defined(__UINTPTR_TYPE__) +# if defined(__arm__) && defined(__linux__) +// Linux Arm headers redefine __UINTPTR_TYPE__ and disagree with clang/gcc. +typedef unsigned int uptr; +typedef int sptr; +# else typedef __UINTPTR_TYPE__ uptr; typedef __INTPTR_TYPE__ sptr; +# endif #elif defined(_WIN64) // 64-bit Windows uses LLP64 data model. typedef unsigned long long uptr; From 6e854a6a01d310689a8b5d50126decd46b3880ea Mon Sep 17 00:00:00 2001 From: ChiaHungDuan Date: Tue, 10 Sep 2024 17:46:02 -0700 Subject: [PATCH 039/114] [scudo] Fix the logic of MaxAllowedFragmentedPages (#107927) MTE doesn't support MaxReleasedCachePages which may break the assumption that only the first 4 pages will have memory tagged. --- compiler-rt/lib/scudo/standalone/secondary.h | 25 +++++++++++++------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h index 1a232b9b9fb2d..c79ec1360b00a 100644 --- a/compiler-rt/lib/scudo/standalone/secondary.h +++ b/compiler-rt/lib/scudo/standalone/secondary.h @@ -72,13 +72,16 @@ namespace { struct CachedBlock { static constexpr u16 CacheIndexMax = UINT16_MAX; static constexpr u16 InvalidEntry = CacheIndexMax; - // * MaxReleasedCachePages default is currently 4 - // - We arrived at this value after noticing that mapping - // in larger memory regions performs better than releasing - // memory and forcing a cache hit. According to the data, - // it suggests that beyond 4 pages, the release execution time is - // longer than the map execution time. In this way, the default - // is dependent on the platform. + // We allow a certain amount of fragmentation and part of the fragmented bytes + // will be released by `releaseAndZeroPagesToOS()`. This increases the chance + // of cache hit rate and reduces the overhead to the RSS at the same time. See + // more details in the `MapAllocatorCache::retrieve()` section. + // + // We arrived at this default value after noticing that mapping in larger + // memory regions performs better than releasing memory and forcing a cache + // hit. According to the data, it suggests that beyond 4 pages, the release + // execution time is longer than the map execution time. In this way, + // the default is dependent on the platform. 
static constexpr uptr MaxReleasedCachePages = 4U; uptr CommitBase = 0; @@ -725,8 +728,14 @@ MapAllocator::tryAllocateFromCache(const Options &Options, uptr Size, uptr EntryHeaderPos; uptr MaxAllowedFragmentedPages = MaxUnreleasedCachePages; - if (UNLIKELY(useMemoryTagging(Options))) + if (LIKELY(!useMemoryTagging(Options))) { MaxAllowedFragmentedPages += CachedBlock::MaxReleasedCachePages; + } else { + // TODO: Enable MaxReleasedCachePages may result in pages for an entry being + // partially released and it erases the tag of those pages as well. To + // support this feature for MTE, we need to tag those pages again. + DCHECK_EQ(CachedBlock::MaxReleasedCachePages, 0U); + } Entry = Cache.retrieve(MaxAllowedFragmentedPages, Size, Alignment, getHeadersSize(), EntryHeaderPos); From 68f31aaae95f9824e58001c7f9115034df51039e Mon Sep 17 00:00:00 2001 From: SahilPatidar Date: Wed, 11 Sep 2024 06:20:01 +0530 Subject: [PATCH 040/114] [ORC][Runtime] Add `dlupdate` for MachO (#97441) With the help of @lhames, This pull request introduces the `dlupdate` function in the ORC runtime. `dlupdate` enables incremental execution of new initializers introduced in the REPL environment. Unlike traditional `dlopen`, which manages initializers, code mapping, and library reference counts, `dlupdate` focuses exclusively on running new initializers. --- compiler-rt/lib/orc/dlfcn_wrapper.cpp | 13 +++ compiler-rt/lib/orc/macho_platform.cpp | 86 +++++++++++++++++++ compiler-rt/lib/orc/macho_platform.h | 1 + llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h | 2 + llvm/lib/ExecutionEngine/Orc/LLJIT.cpp | 27 +++++- .../lib/ExecutionEngine/Orc/MachOPlatform.cpp | 1 + 6 files changed, 128 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/orc/dlfcn_wrapper.cpp b/compiler-rt/lib/orc/dlfcn_wrapper.cpp index fd9dce40d6738..bbbc79f607f27 100644 --- a/compiler-rt/lib/orc/dlfcn_wrapper.cpp +++ b/compiler-rt/lib/orc/dlfcn_wrapper.cpp @@ -20,6 +20,7 @@ using namespace orc_rt; extern "C" const char *__orc_rt_jit_dlerror(); extern "C" void *__orc_rt_jit_dlopen(const char *path, int mode); +extern "C" int __orc_rt_jit_dlupdate(void *dso_handle, int mode); extern "C" int __orc_rt_jit_dlclose(void *dso_handle); ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult @@ -41,6 +42,18 @@ __orc_rt_jit_dlopen_wrapper(const char *ArgData, size_t ArgSize) { .release(); } +#ifdef __APPLE__ +ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult +__orc_rt_jit_dlupdate_wrapper(const char *ArgData, size_t ArgSize) { + return WrapperFunction::handle( + ArgData, ArgSize, + [](ExecutorAddr &DSOHandle, int32_t mode) { + return __orc_rt_jit_dlupdate(DSOHandle.toPtr(), mode); + }) + .release(); +} +#endif + ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult __orc_rt_jit_dlclose_wrapper(const char *ArgData, size_t ArgSize) { return WrapperFunction::handle( diff --git a/compiler-rt/lib/orc/macho_platform.cpp b/compiler-rt/lib/orc/macho_platform.cpp index c092545b2a367..8cc3594b5d0cf 100644 --- a/compiler-rt/lib/orc/macho_platform.cpp +++ b/compiler-rt/lib/orc/macho_platform.cpp @@ -331,6 +331,7 @@ class MachOPlatformRuntimeState { const char *dlerror(); void *dlopen(std::string_view Name, int Mode); + int dlupdate(void *DSOHandle, int Mode); int dlclose(void *DSOHandle); void *dlsym(void *DSOHandle, const char *Symbol); @@ -380,6 +381,12 @@ class MachOPlatformRuntimeState { Error dlopenInitialize(std::unique_lock &JDStatesLock, JITDylibState &JDS, MachOJITDylibDepInfoMap &DepInfo); + Error dlupdateImpl(void *DSOHandle, int Mode); + Error dlupdateFull(std::unique_lock 
&JDStatesLock, + JITDylibState &JDS); + Error dlupdateInitialize(std::unique_lock &JDStatesLock, + JITDylibState &JDS); + Error dlcloseImpl(void *DSOHandle); Error dlcloseDeinitialize(std::unique_lock &JDStatesLock, JITDylibState &JDS); @@ -789,6 +796,20 @@ void *MachOPlatformRuntimeState::dlopen(std::string_view Path, int Mode) { } } +int MachOPlatformRuntimeState::dlupdate(void *DSOHandle, int Mode) { + ORC_RT_DEBUG({ + std::string S; + printdbg("MachOPlatform::dlupdate(%p) (%s)\n", DSOHandle, S.c_str()); + }); + std::lock_guard Lock(DyldAPIMutex); + if (auto Err = dlupdateImpl(DSOHandle, Mode)) { + // FIXME: Make dlerror thread safe. + DLFcnError = toString(std::move(Err)); + return -1; + } + return 0; +} + int MachOPlatformRuntimeState::dlclose(void *DSOHandle) { ORC_RT_DEBUG({ auto *JDS = getJITDylibStateByHeader(DSOHandle); @@ -1244,6 +1265,67 @@ Error MachOPlatformRuntimeState::dlopenInitialize( return Error::success(); } +Error MachOPlatformRuntimeState::dlupdateImpl(void *DSOHandle, int Mode) { + std::unique_lock Lock(JDStatesMutex); + + // Try to find JITDylib state by DSOHandle. + auto *JDS = getJITDylibStateByHeader(DSOHandle); + + if (!JDS) { + std::ostringstream ErrStream; + ErrStream << "No registered JITDylib for " << DSOHandle; + return make_error(ErrStream.str()); + } + + if (!JDS->referenced()) + return make_error("dlupdate failed, JITDylib must be open."); + + if (!JDS->Sealed) { + if (auto Err = dlupdateFull(Lock, *JDS)) + return Err; + } + + return Error::success(); +} + +Error MachOPlatformRuntimeState::dlupdateFull( + std::unique_lock &JDStatesLock, JITDylibState &JDS) { + // Call back to the JIT to push the initializers. + Expected DepInfo((MachOJITDylibDepInfoMap())); + // Unlock so that we can accept the initializer update. + JDStatesLock.unlock(); + if (auto Err = WrapperFunction( + SPSExecutorAddr)>:: + call(JITDispatch(&__orc_rt_macho_push_initializers_tag), DepInfo, + ExecutorAddr::fromPtr(JDS.Header))) + return Err; + JDStatesLock.lock(); + + if (!DepInfo) + return DepInfo.takeError(); + + if (auto Err = dlupdateInitialize(JDStatesLock, JDS)) + return Err; + + return Error::success(); +} + +Error MachOPlatformRuntimeState::dlupdateInitialize( + std::unique_lock &JDStatesLock, JITDylibState &JDS) { + ORC_RT_DEBUG({ + printdbg("MachOPlatformRuntimeState::dlupdateInitialize(\"%s\")\n", + JDS.Name.c_str()); + }); + + // Initialize this JITDylib. + if (auto Err = registerObjCRegistrationObjects(JDStatesLock, JDS)) + return Err; + if (auto Err = runModInits(JDStatesLock, JDS)) + return Err; + + return Error::success(); +} + Error MachOPlatformRuntimeState::dlcloseImpl(void *DSOHandle) { std::unique_lock Lock(JDStatesMutex); @@ -1517,6 +1599,10 @@ void *__orc_rt_macho_jit_dlopen(const char *path, int mode) { return MachOPlatformRuntimeState::get().dlopen(path, mode); } +int __orc_rt_macho_jit_dlupdate(void *dso_handle, int mode) { + return MachOPlatformRuntimeState::get().dlupdate(dso_handle, mode); +} + int __orc_rt_macho_jit_dlclose(void *dso_handle) { return MachOPlatformRuntimeState::get().dlclose(dso_handle); } diff --git a/compiler-rt/lib/orc/macho_platform.h b/compiler-rt/lib/orc/macho_platform.h index 62234039437c0..ad70c97809d2f 100644 --- a/compiler-rt/lib/orc/macho_platform.h +++ b/compiler-rt/lib/orc/macho_platform.h @@ -24,6 +24,7 @@ ORC_RT_INTERFACE void __orc_rt_macho_cxa_finalize(void *dso_handle); // dlfcn functions. 
ORC_RT_INTERFACE const char *__orc_rt_macho_jit_dlerror(); ORC_RT_INTERFACE void *__orc_rt_macho_jit_dlopen(const char *path, int mode); +ORC_RT_INTERFACE int __orc_rt_macho_jit_dlupdate(void *dso_handle, int mode); ORC_RT_INTERFACE int __orc_rt_macho_jit_dlclose(void *dso_handle); ORC_RT_INTERFACE void *__orc_rt_macho_jit_dlsym(void *dso_handle, const char *symbol); diff --git a/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h b/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h index 3a71ddc88ce95..2660b9f74f405 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h @@ -13,6 +13,7 @@ #ifndef LLVM_EXECUTIONENGINE_ORC_LLJIT_H #define LLVM_EXECUTIONENGINE_ORC_LLJIT_H +#include "llvm/ADT/SmallSet.h" #include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" @@ -620,6 +621,7 @@ class ORCPlatformSupport : public LLJIT::PlatformSupport { private: orc::LLJIT &J; DenseMap DSOHandles; + SmallPtrSet InitializedDylib; }; } // End namespace orc diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index 2f9f4d33df017..19b3f3d6ea038 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -602,6 +602,7 @@ Error ORCPlatformSupport::initialize(orc::JITDylib &JD) { using llvm::orc::shared::SPSExecutorAddr; using llvm::orc::shared::SPSString; using SPSDLOpenSig = SPSExecutorAddr(SPSString, int32_t); + using SPSDLUpdateSig = int32_t(SPSExecutorAddr, int32_t); enum dlopen_mode : int32_t { ORC_RT_RTLD_LAZY = 0x1, ORC_RT_RTLD_NOW = 0x2, @@ -612,9 +613,30 @@ Error ORCPlatformSupport::initialize(orc::JITDylib &JD) { auto &ES = J.getExecutionSession(); auto MainSearchOrder = J.getMainJITDylib().withLinkOrderDo( [](const JITDylibSearchOrder &SO) { return SO; }); + StringRef WrapperToCall = "__orc_rt_jit_dlopen_wrapper"; + bool dlupdate = false; + if (ES.getTargetTriple().isOSBinFormatMachO()) { + if (InitializedDylib.contains(&JD)) { + WrapperToCall = "__orc_rt_jit_dlupdate_wrapper"; + dlupdate = true; + } else + InitializedDylib.insert(&JD); + } - if (auto WrapperAddr = ES.lookup( - MainSearchOrder, J.mangleAndIntern("__orc_rt_jit_dlopen_wrapper"))) { + if (auto WrapperAddr = + ES.lookup(MainSearchOrder, J.mangleAndIntern(WrapperToCall))) { + if (dlupdate) { + int32_t result; + auto E = ES.callSPSWrapper(WrapperAddr->getAddress(), + result, DSOHandles[&JD], + int32_t(ORC_RT_RTLD_LAZY)); + if (E) + return E; + else if (result) + return make_error("dlupdate failed", + inconvertibleErrorCode()); + return Error::success(); + } return ES.callSPSWrapper(WrapperAddr->getAddress(), DSOHandles[&JD], JD.getName(), int32_t(ORC_RT_RTLD_LAZY)); @@ -641,6 +663,7 @@ Error ORCPlatformSupport::deinitialize(orc::JITDylib &JD) { return make_error("dlclose failed", inconvertibleErrorCode()); DSOHandles.erase(&JD); + InitializedDylib.erase(&JD); } else return WrapperAddr.takeError(); return Error::success(); diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index a71afe1a3162f..e56d6b47799c0 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -428,6 +428,7 @@ MachOPlatform::standardRuntimeUtilityAliases() { {"___orc_rt_run_program", "___orc_rt_macho_run_program"}, {"___orc_rt_jit_dlerror", "___orc_rt_macho_jit_dlerror"}, {"___orc_rt_jit_dlopen", "___orc_rt_macho_jit_dlopen"}, + 
From 77fc8dae22ff1fa38c0271abc5521db76351f1fd Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Wed, 11 Sep 2024 09:13:23 +0800
Subject: [PATCH 041/114] [RISCV] Rematerialize vmv.v.x (#107993)

Even though vmv.v.x has a non-constant scalar operand, we can still
rematerialize it because we have split register allocation between
vectors and scalars.

InlineSpiller will check to make sure that the scalar operand is live at
the point where the rematerialization occurs, so this won't extend any
scalar live ranges. However, this also means we may not be able to
rematerialize in some cases, as shown in @vmv.v.x_needs_extended. It
might be worthwhile teaching InlineSpiller to extend scalar live ranges
in a future patch. I experimented with this locally and it reduced
spills on 531.deepsjeng_r by a further 3%.
---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp      |   1 +
 .../Target/RISCV/RISCVInstrInfoVPseudos.td    |   1 +
 llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll       | 157 ++++++--------
 llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll        | 196 ++++++++----------
 llvm/test/CodeGen/RISCV/rvv/remat.ll          | 141 +++++++++++++
 5 files changed, 300 insertions(+), 196 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 325a50c9f48a1..2bb9df4ead0e9 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -169,6 +169,7 @@ Register RISCVInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
 bool RISCVInstrInfo::isReallyTriviallyReMaterializable(
     const MachineInstr &MI) const {
   switch (RISCV::getRVVMCOpcode(MI.getOpcode())) {
+  case RISCV::VMV_V_X:
   case RISCV::VMV_V_I:
   case RISCV::VID_V:
     if (MI.getOperand(1).isUndef() &&
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index e11f176bfe604..c6cecb7d07182 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -2475,6 +2475,7 @@ multiclass VPseudoUnaryVMV_V_X_I {
     def "_V_" # mx : VPseudoUnaryNoMask<m.vrclass, m.vrclass>,
                      SchedUnary<"WriteVIMovV", "ReadVIMovV", mx,
                                 forcePassthruRead=true>;
+    let isReMaterializable = 1 in
     def "_X_" # mx : VPseudoUnaryNoMask<m.vrclass, GPR>,
                      SchedUnary<"WriteVIMovX", "ReadVIMovX", mx,
                                 forcePassthruRead=true>;
diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
index 01aac122d5957..7031f93edc2c3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll
@@ -2022,14 +2022,9 @@ define <vscale x 16 x i64> @vp_ctpop_nxv16i64(<vscale x 16 x i64> %va, <vscale
@vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64_unmasked( %va,
 ; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill
 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
 ; RV32-NEXT: vand.vv v24, v24, v0
-; RV32-NEXT: vsub.vv v24, v16, v24
+; RV32-NEXT: vsub.vv v16, v16, v24
 ; RV32-NEXT: lui a3, 209715
 ; RV32-NEXT: addi a3, a3, 819
 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma
 ; RV32-NEXT: vmv.v.x v0, a3
 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vand.vv v16, v24, v0
-; RV32-NEXT: vsrl.vi v24, v24, 2
+; RV32-NEXT: vand.vv v24, v16, v0
+; RV32-NEXT: vsrl.vi v16, v16, 2
 ; RV32-NEXT: csrr a3, vlenb
 ; RV32-NEXT: slli a3, a3, 4
 ; RV32-NEXT: add a3, sp, a3
;
RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v0, (a3) # Unknown-size Folded Spill -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vadd.vv v24, v16, v24 -; RV32-NEXT: vsrl.vi v16, v24, 4 +; RV32-NEXT: vand.vv v16, v16, v0 ; RV32-NEXT: vadd.vv v16, v24, v16 +; RV32-NEXT: vsrl.vi v24, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v24 ; RV32-NEXT: lui a3, 61681 ; RV32-NEXT: addi a3, a3, -241 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma @@ -2437,16 +2412,16 @@ define @vp_ctpop_nxv16i64_unmasked( %va, ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vsub.vv v24, v8, v24 +; RV32-NEXT: vsub.vv v8, v8, v24 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v24, v0 -; RV32-NEXT: vsrl.vi v24, v24, 2 -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vadd.vv v8, v8, v24 +; RV32-NEXT: vand.vv v24, v8, v0 +; RV32-NEXT: vsrl.vi v8, v8, 2 +; RV32-NEXT: vand.vv v8, v8, v0 +; RV32-NEXT: vadd.vv v8, v24, v8 ; RV32-NEXT: vsrl.vi v24, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: csrr a0, vlenb diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll index 0ef0a431dabc4..d36240e493e41 100644 --- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll @@ -2266,7 +2266,7 @@ define @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 40 -; RV32-NEXT: mul a1, a1, a2 -; RV32-NEXT: sub sp, sp, a1 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb -; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 5 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: sub a2, a0, a1 ; RV32-NEXT: sltu a3, a0, a2 @@ -2624,22 +2615,22 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: and a3, a3, a2 ; RV32-NEXT: li a2, 1 ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v8, v16, a2 +; RV32-NEXT: vsub.vx v24, v16, a2 ; RV32-NEXT: vnot.v v16, v16 -; RV32-NEXT: vand.vv v16, v16, v8 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: vsrl.vi v24, v16, 1 ; RV32-NEXT: lui a4, 349525 ; RV32-NEXT: addi a4, a4, 1365 ; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a4 +; RV32-NEXT: vmv.v.x v0, a4 ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 24 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v0, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v24, v8 +; RV32-NEXT: vand.vv v24, v24, v0 ; RV32-NEXT: vsub.vv v16, v16, v24 ; RV32-NEXT: lui a4, 209715 ; RV32-NEXT: addi a4, a4, 819 @@ -2648,6 +2639,11 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma ; 
RV32-NEXT: vand.vv v24, v16, v0 ; RV32-NEXT: vsrl.vi v16, v16, 2 +; RV32-NEXT: csrr a4, vlenb +; RV32-NEXT: slli a4, a4, 4 +; RV32-NEXT: add a4, sp, a4 +; RV32-NEXT: addi a4, a4, 16 +; RV32-NEXT: vs8r.v v0, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vand.vv v16, v16, v0 ; RV32-NEXT: vadd.vv v16, v24, v16 ; RV32-NEXT: vsrl.vi v24, v16, 4 @@ -2655,50 +2651,46 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: lui a4, 61681 ; RV32-NEXT: addi a4, a4, -241 ; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a4 +; RV32-NEXT: vmv.v.x v24, a4 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 +; RV32-NEXT: slli a4, a4, 3 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v16, v8 +; RV32-NEXT: vand.vv v16, v16, v24 ; RV32-NEXT: lui a4, 4112 ; RV32-NEXT: addi a4, a4, 257 ; RV32-NEXT: vsetvli a5, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vmv.v.x v24, a4 +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs8r.v v24, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetvli zero, a3, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v16 +; RV32-NEXT: vmul.vv v16, v16, v24 ; RV32-NEXT: li a3, 56 -; RV32-NEXT: vsrl.vx v8, v8, a3 -; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsrl.vx v16, v16, a3 ; RV32-NEXT: bltu a0, a1, .LBB47_2 ; RV32-NEXT: # %bb.1: ; RV32-NEXT: mv a0, a1 ; RV32-NEXT: .LBB47_2: -; RV32-NEXT: slli a1, a1, 5 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsub.vx v8, v24, a2 -; RV32-NEXT: vnot.v v24, v24 -; RV32-NEXT: vand.vv v8, v24, v8 +; RV32-NEXT: vsub.vx v24, v8, a2 +; RV32-NEXT: vnot.v v8, v8 +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsrl.vi v24, v8, 1 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: li a1, 24 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v24, v24, v0 ; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload ; RV32-NEXT: vand.vv v24, v8, v0 ; RV32-NEXT: vsrl.vi v8, v8, 2 ; RV32-NEXT: vand.vv v8, v8, v0 @@ -2706,23 +2698,17 @@ define @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: vsrl.vi v24, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 3 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; RV32-NEXT: vmul.vv v8, v8, v16 -; RV32-NEXT: vsrl.vx v8, v8, a3 +; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: addi a0, sp, 16 -; RV32-NEXT: vl8r.v 
v16, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vmul.vv v8, v8, v24
+; RV32-NEXT: vsrl.vx v8, v8, a3
 ; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 40
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
 ; RV32-NEXT: add sp, sp, a0
 ; RV32-NEXT: addi sp, sp, 16
 ; RV32-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/remat.ll b/llvm/test/CodeGen/RISCV/rvv/remat.ll
index 2b12249378eb1..514612cd0525d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/remat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/remat.ll
@@ -171,3 +171,144 @@ define void @vmv.v.i(ptr %p) {
   store volatile <vscale x 8 x i64> %vmv.v.i, ptr %p
   ret void
 }
+
+; The live range of %x needs extended down to the use of vmv.v.x at the end of
+; the block.
+define void @vmv.v.x_needs_extended(ptr %p, i64 %x) {
+; POSTRA-LABEL: vmv.v.x_needs_extended:
+; POSTRA: # %bb.0:
+; POSTRA-NEXT: addi sp, sp, -16
+; POSTRA-NEXT: .cfi_def_cfa_offset 16
+; POSTRA-NEXT: csrr a2, vlenb
+; POSTRA-NEXT: slli a2, a2, 3
+; POSTRA-NEXT: sub sp, sp, a2
+; POSTRA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; POSTRA-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; POSTRA-NEXT: vmv.v.x v8, a1
+; POSTRA-NEXT: addi a1, sp, 16
+; POSTRA-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: vl8re64.v v16, (a0)
+; POSTRA-NEXT: vl8re64.v v24, (a0)
+; POSTRA-NEXT: vl8re64.v v0, (a0)
+; POSTRA-NEXT: vl8re64.v v8, (a0)
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: vs8r.v v0, (a0)
+; POSTRA-NEXT: vs8r.v v24, (a0)
+; POSTRA-NEXT: vs8r.v v16, (a0)
+; POSTRA-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: csrr a0, vlenb
+; POSTRA-NEXT: slli a0, a0, 3
+; POSTRA-NEXT: add sp, sp, a0
+; POSTRA-NEXT: addi sp, sp, 16
+; POSTRA-NEXT: ret
+;
+; PRERA-LABEL: vmv.v.x_needs_extended:
+; PRERA: # %bb.0:
+; PRERA-NEXT: addi sp, sp, -16
+; PRERA-NEXT: .cfi_def_cfa_offset 16
+; PRERA-NEXT: csrr a2, vlenb
+; PRERA-NEXT: slli a2, a2, 3
+; PRERA-NEXT: sub sp, sp, a2
+; PRERA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; PRERA-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; PRERA-NEXT: vmv.v.x v8, a1
+; PRERA-NEXT: vs8r.v v8, (a0)
+; PRERA-NEXT: vl8re64.v v16, (a0)
+; PRERA-NEXT: addi a1, sp, 16
+; PRERA-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; PRERA-NEXT: vl8re64.v v24, (a0)
+; PRERA-NEXT: vl8re64.v v0, (a0)
+; PRERA-NEXT: vl8re64.v v16, (a0)
+; PRERA-NEXT: vs8r.v v16, (a0)
+; PRERA-NEXT: vs8r.v v0, (a0)
+; PRERA-NEXT: vs8r.v v24, (a0)
+; PRERA-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; PRERA-NEXT: vs8r.v v16, (a0)
+; PRERA-NEXT: vs8r.v v8, (a0)
+; PRERA-NEXT: csrr a0, vlenb
+; PRERA-NEXT: slli a0, a0, 3
+; PRERA-NEXT: add sp, sp, a0
+; PRERA-NEXT: addi sp, sp, 16
+; PRERA-NEXT: ret
+  %vmv.v.x = call <vscale x 8 x i64> @llvm.riscv.vmv.v.x.nxv8i64(<vscale x 8 x i64> poison, i64 %x, i64 -1)
+  store volatile <vscale x 8 x i64> %vmv.v.x, ptr %p
+
+  %a = load volatile <vscale x 8 x i64>, ptr %p
+  %b = load volatile <vscale x 8 x i64>, ptr %p
+  %c = load volatile <vscale x 8 x i64>, ptr %p
+  %d = load volatile <vscale x 8 x i64>, ptr %p
+  store volatile <vscale x 8 x i64> %d, ptr %p
+  store volatile <vscale x 8 x i64> %c, ptr %p
+  store volatile <vscale x 8 x i64> %b, ptr %p
+  store volatile <vscale x 8 x i64> %a, ptr %p
+
+  store volatile <vscale x 8 x i64> %vmv.v.x, ptr %p
+  ret void
+}
+
+define void @vmv.v.x_live(ptr %p, i64 %x) {
+; POSTRA-LABEL: vmv.v.x_live:
+; POSTRA: # %bb.0:
+; POSTRA-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; POSTRA-NEXT: vmv.v.x v8, a1
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: vl8re64.v v16, (a0)
+; POSTRA-NEXT: vl8re64.v v24, (a0)
+; POSTRA-NEXT: vl8re64.v v0, (a0)
+; POSTRA-NEXT: vl8re64.v v8, (a0)
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: vs8r.v v0, (a0)
+; POSTRA-NEXT: vs8r.v v24, (a0)
+; POSTRA-NEXT: vs8r.v v16, (a0)
+; POSTRA-NEXT: vmv.v.x v8, a1
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: sd a1, 0(a0)
+; POSTRA-NEXT: ret
+;
+; PRERA-LABEL: vmv.v.x_live:
+; PRERA: # %bb.0:
+; PRERA-NEXT: addi sp, sp, -16
+; PRERA-NEXT: .cfi_def_cfa_offset 16
+; PRERA-NEXT: csrr a2, vlenb
+; PRERA-NEXT: slli a2, a2, 3
+; PRERA-NEXT: sub sp, sp, a2
+; PRERA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; PRERA-NEXT: vsetvli a2, zero, e64, m8, ta, ma
+; PRERA-NEXT: vmv.v.x v8, a1
+; PRERA-NEXT: vs8r.v v8, (a0)
+; PRERA-NEXT: vl8re64.v v16, (a0)
+; PRERA-NEXT: addi a2, sp, 16
+; PRERA-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill
+; PRERA-NEXT: vl8re64.v v24, (a0)
+; PRERA-NEXT: vl8re64.v v0, (a0)
+; PRERA-NEXT: vl8re64.v v16, (a0)
+; PRERA-NEXT: vs8r.v v16, (a0)
+; PRERA-NEXT: vs8r.v v0, (a0)
+; PRERA-NEXT: vs8r.v v24, (a0)
+; PRERA-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; PRERA-NEXT: vs8r.v v16, (a0)
+; PRERA-NEXT: vs8r.v v8, (a0)
+; PRERA-NEXT: sd a1, 0(a0)
+; PRERA-NEXT: csrr a0, vlenb
+; PRERA-NEXT: slli a0, a0, 3
+; PRERA-NEXT: add sp, sp, a0
+; PRERA-NEXT: addi sp, sp, 16
+; PRERA-NEXT: ret
+  %vmv.v.x = call <vscale x 8 x i64> @llvm.riscv.vmv.v.x.nxv8i64(<vscale x 8 x i64> poison, i64 %x, i64 -1)
+  store volatile <vscale x 8 x i64> %vmv.v.x, ptr %p
+
+  %a = load volatile <vscale x 8 x i64>, ptr %p
+  %b = load volatile <vscale x 8 x i64>, ptr %p
+  %c = load volatile <vscale x 8 x i64>, ptr %p
+  %d = load volatile <vscale x 8 x i64>, ptr %p
+  store volatile <vscale x 8 x i64> %d, ptr %p
+  store volatile <vscale x 8 x i64> %c, ptr %p
+  store volatile <vscale x 8 x i64> %b, ptr %p
+  store volatile <vscale x 8 x i64> %a, ptr %p
+
+  store volatile <vscale x 8 x i64> %vmv.v.x, ptr %p
+  store volatile i64 %x, ptr %p
+  ret void
+}
From 21a0176c584c47218f20322641af8a855b8ce5e2 Mon Sep 17 00:00:00 2001
From: Jim Lin
Date: Wed, 11 Sep 2024 09:17:05 +0800
Subject: [PATCH 042/114] [RISCV] Add testcase for -mcmodel= (#107816)

This is a pre-commit test for #107817.
---
 clang/test/Driver/riscv-mcmodel.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 clang/test/Driver/riscv-mcmodel.c

diff --git a/clang/test/Driver/riscv-mcmodel.c b/clang/test/Driver/riscv-mcmodel.c
new file mode 100644
index 0000000000000..4f5fa95f59b66
--- /dev/null
+++ b/clang/test/Driver/riscv-mcmodel.c
@@ -0,0 +1,14 @@
+// RUN: %clang --target=riscv32 -### -c -mcmodel=small %s 2>&1 | FileCheck --check-prefix=SMALL %s
+// RUN: %clang --target=riscv64 -### -c -mcmodel=small %s 2>&1 | FileCheck --check-prefix=SMALL %s
+
+// RUN: %clang --target=riscv32 -### -c -mcmodel=medlow %s 2>&1 | FileCheck --check-prefix=SMALL %s
+// RUN: %clang --target=riscv64 -### -c -mcmodel=medlow %s 2>&1 | FileCheck --check-prefix=SMALL %s
+
+// RUN: %clang --target=riscv32 -### -c -mcmodel=medium %s 2>&1 | FileCheck --check-prefix=MEDIUM %s
+// RUN: %clang --target=riscv64 -### -c -mcmodel=medium %s 2>&1 | FileCheck --check-prefix=MEDIUM %s
+
+// RUN: %clang --target=riscv32 -### -c -mcmodel=medany %s 2>&1 | FileCheck --check-prefix=MEDIUM %s
+// RUN: %clang --target=riscv64 -### -c -mcmodel=medany %s 2>&1 | FileCheck --check-prefix=MEDIUM %s
+
+// SMALL: "-mcmodel=small"
+// MEDIUM: "-mcmodel=medium"
From c641b611f86a846a51763a54a196375aba3e6e4e Mon Sep 17 00:00:00 2001
From: YunQiang Su
Date: Wed, 11 Sep 2024 09:37:12 +0800
Subject: [PATCH 043/114] MIPSr6: Add llvm.is.fpclass intrinsic support (#107857)

MIPSr6 has class.s/class.d instructions. Let's use them for the
llvm.is.fpclass intrinsic.
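The core of the change is the mapping from LLVM's FPClassTest bits to the mask
produced by the class.s/class.d instructions. As a standalone cross-check,
here is an editorial sketch, not part of the patch: the FPClassTest enumerator
values are hand-copied (nominally from llvm/ADT/FloatingPointMode.h) and the
MIPS-side bits mirror the Mips::FClassMask enum added below, so verify both
against the headers before reusing this.

#include <cstdint>

// LLVM-side llvm.is.fpclass test bits (assumed values).
enum FPClassTest : uint32_t {
  fcSNan = 0x0001, fcQNan = 0x0002, fcNegInf = 0x0004, fcNegNormal = 0x0008,
  fcNegSubnormal = 0x0010, fcNegZero = 0x0020, fcPosZero = 0x0040,
  fcPosSubnormal = 0x0080, fcPosNormal = 0x0100, fcPosInf = 0x0200,
};

// Same pairing as the to_fclass_mask SDNodeXForm in the diff below:
// one LLVM test bit maps to one MIPS class-instruction result bit.
constexpr uint32_t toFClassMask(uint32_t Check) {
  uint32_t Mask = 0;
  if (Check & fcSNan)         Mask |= 1u << 0;
  if (Check & fcQNan)         Mask |= 1u << 1;
  if (Check & fcNegInf)       Mask |= 1u << 2;
  if (Check & fcNegNormal)    Mask |= 1u << 3;
  if (Check & fcNegSubnormal) Mask |= 1u << 4;
  if (Check & fcNegZero)      Mask |= 1u << 5;
  if (Check & fcPosInf)       Mask |= 1u << 6;
  if (Check & fcPosNormal)    Mask |= 1u << 7;
  if (Check & fcPosSubnormal) Mask |= 1u << 8;
  if (Check & fcPosZero)      Mask |= 1u << 9;
  return Mask;
}

// These match the `andi` immediates in the new is_fpclass.ll tests.
static_assert(toFClassMask(0x003) == 3, "nan = snan|qnan");
static_assert(toFClassMask(0x204) == 68, "inf = -inf|+inf");
static_assert(toFClassMask(0x1f8) == 952, "finite = normal|subnormal|zero");
static_assert(toFClassMask(0x060) == 544, "zero = -0|+0");

For example, checking for "inf" (0x204) becomes class.s followed by
andi $1, $1, 68, exactly as the isinf_float test below expects.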
---
 llvm/lib/Target/Mips/Mips32r6InstrInfo.td |  37 ++++
 llvm/lib/Target/Mips/MipsISelLowering.cpp |   5 +-
 llvm/lib/Target/Mips/MipsInstrInfo.h      |  17 ++
 llvm/test/CodeGen/Mips/is_fpclass.ll      | 246 ++++++++++++++++++++++
 4 files changed, 303 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/Mips/is_fpclass.ll

diff --git a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
index 91ffbc4eb77dd..27b9ce60ba826 100644
--- a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -1139,6 +1139,43 @@ let AdditionalPredicates = [NotInMicroMips] in {
     ISA_MIPS32R6;
 }
 
+// llvm.is_fpclass operations.
+def to_fclass_mask: SDNodeXForm<timm, [{
+  uint64_t Check = N->getZExtValue();
+  unsigned Mask = 0;
+  if (Check & fcSNan)
+    Mask |= Mips::FClassMaskSignalingNaN;
+  if (Check & fcQNan)
+    Mask |= Mips::FClassMaskQuietNaN;
+  if (Check & fcPosInf)
+    Mask |= Mips::FClassMaskPositiveInfinity;
+  if (Check & fcNegInf)
+    Mask |= Mips::FClassMaskNegativeInfinity;
+  if (Check & fcPosNormal)
+    Mask |= Mips::FClassMaskPositiveNormal;
+  if (Check & fcNegNormal)
+    Mask |= Mips::FClassMaskNegativeNormal;
+  if (Check & fcPosSubnormal)
+    Mask |= Mips::FClassMaskPositiveSubnormal;
+  if (Check & fcNegSubnormal)
+    Mask |= Mips::FClassMaskNegativeSubnormal;
+  if (Check & fcPosZero)
+    Mask |= Mips::FClassMaskPositiveZero;
+  if (Check & fcNegZero)
+    Mask |= Mips::FClassMaskNegativeZero;
+  return CurDAG->getTargetConstant(Mask, SDLoc(N), MVT::i32);
+}]>;
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsPat<(is_fpclass f32:$lhs, i32:$imm),
+                (SLTu ZERO, (ANDi (MFC1 (CLASS_S f32:$lhs)),
+                                  (to_fclass_mask imm:$imm)))>,
+        ISA_MIPS32R6;
+  def : MipsPat<(is_fpclass f64:$lhs, i32:$imm),
+                (SLTu ZERO, (ANDi (MFC1_D64 (CLASS_D f64:$lhs)),
+                                  (to_fclass_mask imm:$imm)))>,
+        ISA_MIPS32R6;
+}
+
 // Pseudo instructions
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, hasDelaySlot = 1,
     hasExtraSrcRegAllocReq = 1, isCTI = 1, Defs = [AT], hasPostISelHook = 1 in {
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index fa57a3fa9b155..59f78a8ca306c 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -359,8 +359,7 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
   setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
 
-  // Lower fmin and fmax operations for MIPS R6.
-  // Instructions are defined but never used.
+  // Lower fmin/fmax/fclass operations for MIPS R6.
if (Subtarget.hasMips32r6()) { setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal); setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal); @@ -370,6 +369,8 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal); setOperationAction(ISD::FMINNUM, MVT::f64, Expand); setOperationAction(ISD::FMAXNUM, MVT::f64, Expand); + setOperationAction(ISD::IS_FPCLASS, MVT::f32, Legal); + setOperationAction(ISD::IS_FPCLASS, MVT::f64, Legal); } else { setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom); diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.h b/llvm/lib/Target/Mips/MipsInstrInfo.h index dc4b9d99b39d2..4e039e0e32aba 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.h +++ b/llvm/lib/Target/Mips/MipsInstrInfo.h @@ -213,6 +213,23 @@ class MipsInstrInfo : public MipsGenInstrInfo { const MipsInstrInfo *createMips16InstrInfo(const MipsSubtarget &STI); const MipsInstrInfo *createMipsSEInstrInfo(const MipsSubtarget &STI); +namespace Mips { +// Mask assignments for floating-point. +enum FClassMask { + FClassMaskSignalingNaN = 1 << 0, + FClassMaskQuietNaN = 1 << 1, + FClassMaskNegativeInfinity = 1 << 2, + FClassMaskNegativeNormal = 1 << 3, + FClassMaskNegativeSubnormal = 1 << 4, + FClassMaskNegativeZero = 1 << 5, + FClassMaskPositiveInfinity = 1 << 6, + FClassMaskPositiveNormal = 1 << 7, + FClassMaskPositiveSubnormal = 1 << 8, + FClassMaskPositiveZero = 1 << 9 +}; + +} // namespace Mips + } // end namespace llvm #endif // LLVM_LIB_TARGET_MIPS_MIPSINSTRINFO_H diff --git a/llvm/test/CodeGen/Mips/is_fpclass.ll b/llvm/test/CodeGen/Mips/is_fpclass.ll new file mode 100644 index 0000000000000..9454a064c5312 --- /dev/null +++ b/llvm/test/CodeGen/Mips/is_fpclass.ll @@ -0,0 +1,246 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=mipsisa32r6-unknown-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s + + +define i1 @isnan_float(float %x) nounwind { +; CHECK-LABEL: isnan_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 3 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 3) ; nan + ret i1 %1 +} + +define i1 @isnan_double(double %x) nounwind { +; CHECK-LABEL: isnan_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 3 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 3) ; nan + ret i1 %1 +} + +define i1 @isnan_float_strictfp(float %x) strictfp nounwind { +; CHECK-LABEL: isnan_float_strictfp: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 3 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 3) strictfp ; nan + ret i1 %1 +} + +define i1 @isnan_double_strictfp(double %x) strictfp nounwind { +; CHECK-LABEL: isnan_double_strictfp: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 3 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 3) strictfp ; nan + ret i1 %1 +} + +define i1 @isinf_float(float %x) nounwind { +; CHECK-LABEL: isinf_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 68 +; CHECK-NEXT: jr $ra +; 
CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 516) ; 0x204 = "inf" + ret i1 %1 +} + +define i1 @isfinite_float(float %x) nounwind { +; CHECK-LABEL: isfinite_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 952 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 504) ; 0x1f8 = "finite" + ret i1 %1 +} + +define i1 @isnormal_float(float %x) nounwind { +; CHECK-LABEL: isnormal_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 136 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 264) ; 0x108 = "normal" + ret i1 %1 +} + +define i1 @issubnormal_float(float %x) nounwind { +; CHECK-LABEL: issubnormal_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 272 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 144) ; 0x90 = "subnormal" + ret i1 %1 +} + +define i1 @iszero_float(float %x) nounwind { +; CHECK-LABEL: iszero_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 544 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 96) ; 0x60 = "zero" + ret i1 %1 +} + +define i1 @issnan_float(float %x) nounwind { +; CHECK-LABEL: issnan_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 1 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 1) + ret i1 %1 +} + +define i1 @issnan_double(double %x) nounwind { +; CHECK-LABEL: issnan_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 1 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 1) + ret i1 %1 +} + +define i1 @isqnan_float(float %x) nounwind { +; CHECK-LABEL: isqnan_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 2 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 2) + ret i1 %1 +} + +define i1 @isqnan_double(double %x) nounwind { +; CHECK-LABEL: isqnan_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 2 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 2) + ret i1 %1 +} + +define i1 @isposzero_double(double %x) nounwind { +; CHECK-LABEL: isposzero_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 512 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 64) + ret i1 %1 +} + +define i1 @isnegzero_double(double %x) nounwind { +; CHECK-LABEL: isnegzero_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 32 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 32) + ret i1 %1 +} + +define i1 @isposnormal_double(double %x) nounwind { +; CHECK-LABEL: isposnormal_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; 
CHECK-NEXT: andi $1, $1, 128 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 256) + ret i1 %1 +} + +define i1 @isnegnormal_double(double %x) nounwind { +; CHECK-LABEL: isnegnormal_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 8 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 8) + ret i1 %1 +} + +define i1 @isnormal_double(double %x) nounwind { +; CHECK-LABEL: isnormal_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 136 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 264) + ret i1 %1 +} + +define i1 @isclass_00d_double(double %x) nounwind { +; CHECK-LABEL: isclass_00d_double: +; CHECK: # %bb.0: +; CHECK-NEXT: class.d $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 13 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f64(double %x, i32 13) + ret i1 %1 +} + +define i1 @isclass_1c0_float(float %x) nounwind { +; CHECK-LABEL: isclass_1c0_float: +; CHECK: # %bb.0: +; CHECK-NEXT: class.s $f0, $f12 +; CHECK-NEXT: mfc1 $1, $f0 +; CHECK-NEXT: andi $1, $1, 896 +; CHECK-NEXT: jr $ra +; CHECK-NEXT: sltu $2, $zero, $1 + %1 = call i1 @llvm.is.fpclass.f32(float %x, i32 448) + ret i1 %1 +} + +declare i1 @llvm.is.fpclass.f32(float, i32) +declare i1 @llvm.is.fpclass.f64(double, i32) From 21a0176c584c47218f20322641af8a855b8ce5e2 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 11 Sep 2024 09:38:29 +0800 Subject: [PATCH 044/114] [RISCV] Rematerialize vfmv.v.f (#108007) This is the same principle as vmv.v.x in #107993, but for floats. --- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 1 + .../Target/RISCV/RISCVInstrInfoVPseudos.td | 1 + llvm/test/CodeGen/RISCV/rvv/remat.ll | 65 +++++++++++++++++++ 3 files changed, 67 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 2bb9df4ead0e9..a805c68e7795c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -170,6 +170,7 @@ bool RISCVInstrInfo::isReallyTriviallyReMaterializable( const MachineInstr &MI) const { switch (RISCV::getRVVMCOpcode(MI.getOpcode())) { case RISCV::VMV_V_X: + case RISCV::VFMV_V_F: case RISCV::VMV_V_I: case RISCV::VID_V: if (MI.getOperand(1).isUndef() && diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index c6cecb7d07182..2eceef5066f77 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -6558,6 +6558,7 @@ defm PseudoVFMERGE : VPseudoVMRG_FM; //===----------------------------------------------------------------------===// // 13.16. 
Vector Floating-Point Move Instruction
//===----------------------------------------------------------------------===//
+let isReMaterializable = 1 in
 defm PseudoVFMV_V : VPseudoVMV_F;
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/RISCV/rvv/remat.ll b/llvm/test/CodeGen/RISCV/rvv/remat.ll
index 514612cd0525d..343b086898c14 100644
--- a/llvm/test/CodeGen/RISCV/rvv/remat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/remat.ll
@@ -312,3 +312,68 @@ define void @vmv.v.x_live(ptr %p, i64 %x) {
   store volatile <vscale x 8 x i64> %vmv.v.x, ptr %p
   store volatile i64 %x, ptr %p
   ret void
 }
+
+define void @vfmv.v.f(ptr %p, double %x) {
+; POSTRA-LABEL: vfmv.v.f:
+; POSTRA: # %bb.0:
+; POSTRA-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; POSTRA-NEXT: vfmv.v.f v8, fa0
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: vl8re64.v v16, (a0)
+; POSTRA-NEXT: vl8re64.v v24, (a0)
+; POSTRA-NEXT: vl8re64.v v0, (a0)
+; POSTRA-NEXT: vl8re64.v v8, (a0)
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: vs8r.v v0, (a0)
+; POSTRA-NEXT: vs8r.v v24, (a0)
+; POSTRA-NEXT: vs8r.v v16, (a0)
+; POSTRA-NEXT: vfmv.v.f v8, fa0
+; POSTRA-NEXT: vs8r.v v8, (a0)
+; POSTRA-NEXT: fsd fa0, 0(a0)
+; POSTRA-NEXT: ret
+;
+; PRERA-LABEL: vfmv.v.f:
+; PRERA: # %bb.0:
+; PRERA-NEXT: addi sp, sp, -16
+; PRERA-NEXT: .cfi_def_cfa_offset 16
+; PRERA-NEXT: csrr a1, vlenb
+; PRERA-NEXT: slli a1, a1, 3
+; PRERA-NEXT: sub sp, sp, a1
+; PRERA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; PRERA-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; PRERA-NEXT: vfmv.v.f v8, fa0
+; PRERA-NEXT: vs8r.v v8, (a0)
+; PRERA-NEXT: vl8re64.v v16, (a0)
+; PRERA-NEXT: addi a1, sp, 16
+; PRERA-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; PRERA-NEXT: vl8re64.v v24, (a0)
+; PRERA-NEXT: vl8re64.v v0, (a0)
+; PRERA-NEXT: vl8re64.v v16, (a0)
+; PRERA-NEXT: vs8r.v v16, (a0)
+; PRERA-NEXT: vs8r.v v0, (a0)
+; PRERA-NEXT: vs8r.v v24, (a0)
+; PRERA-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; PRERA-NEXT: vs8r.v v16, (a0)
+; PRERA-NEXT: vs8r.v v8, (a0)
+; PRERA-NEXT: fsd fa0, 0(a0)
+; PRERA-NEXT: csrr a0, vlenb
+; PRERA-NEXT: slli a0, a0, 3
+; PRERA-NEXT: add sp, sp, a0
+; PRERA-NEXT: addi sp, sp, 16
+; PRERA-NEXT: ret
+  %vfmv.v.f = call <vscale x 8 x double> @llvm.riscv.vfmv.v.f.nxv8f64(<vscale x 8 x double> poison, double %x, i64 -1)
+  store volatile <vscale x 8 x double> %vfmv.v.f, ptr %p
+
+  %a = load volatile <vscale x 8 x double>, ptr %p
+  %b = load volatile <vscale x 8 x double>, ptr %p
+  %c = load volatile <vscale x 8 x double>, ptr %p
+  %d = load volatile <vscale x 8 x double>, ptr %p
+  store volatile <vscale x 8 x double> %d, ptr %p
+  store volatile <vscale x 8 x double> %c, ptr %p
+  store volatile <vscale x 8 x double> %b, ptr %p
+  store volatile <vscale x 8 x double> %a, ptr %p
+
+  store volatile <vscale x 8 x double> %vfmv.v.f, ptr %p
+  store volatile double %x, ptr %p
+  ret void
+}
From 933fc63a1d230896bc09a08cf08dde4ac5b51703 Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Wed, 11 Sep 2024 09:44:57 +0800
Subject: [PATCH 045/114] [RISCV] Rematerialize vmv.s.x and vfmv.s.f (#108012)

Continuing with #107993 and #108007, this handles the last of the main
rematerializable vector instructions. There's an extra spill in one of
the test cases, but it's likely noise from the spill weights and isn't
an issue in practice.
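The liveness condition that decides between rematerializing and spilling in
this patch (and in #107993 and #108007 before it) is the generic one in
LiveRangeEdit. The sketch below is an editorial paraphrase of what
LiveRangeEdit::allUsesAvailableAt does, simplified and not the verbatim
upstream code; treat the exact signature as an assumption.

#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineInstr.h"

using namespace llvm;

// Returns true if every register read of OrigMI (e.g. the scalar source of
// vmv.s.x or the FP source of vfmv.s.f) still carries the same value at
// UseIdx, the point where the def would be re-created. If any value differs
// or is dead there, InlineSpiller falls back to a stack spill, which is
// where the extra spill mentioned above comes from.
static bool allUsesAvailableAt(const MachineInstr *OrigMI, SlotIndex OrigIdx,
                               SlotIndex UseIdx, const LiveIntervals &LIS) {
  for (const MachineOperand &MO : OrigMI->operands()) {
    if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
      continue;
    if (!MO.getReg().isVirtual())
      continue; // Physical-reg implicits (vl/vtype) are checked elsewhere.
    const LiveInterval &LI = LIS.getInterval(MO.getReg());
    const VNInfo *OVNI = LI.getVNInfoAt(OrigIdx);
    if (!OVNI || OVNI != LI.getVNInfoAt(UseIdx))
      return false;
  }
  return true;
}

Because RISC-V allocates scalar and vector registers in separate passes, the
scalar assignment is already fixed when vector registers are spilled, so this
check never forces a scalar live-range extension; it simply declines to
rematerialize when the scalar is no longer live, as @vmv.v.x_needs_extended
demonstrated in the earlier patch.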
--- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 2 + .../Target/RISCV/RISCVInstrInfoVPseudos.td | 4 +- .../rvv/fixed-vectors-interleaved-access.ll | 640 +++++++++--------- llvm/test/CodeGen/RISCV/rvv/remat.ll | 130 ++++ 4 files changed, 464 insertions(+), 312 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index a805c68e7795c..13212c2aea5dd 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -172,6 +172,8 @@ bool RISCVInstrInfo::isReallyTriviallyReMaterializable( case RISCV::VMV_V_X: case RISCV::VFMV_V_F: case RISCV::VMV_V_I: + case RISCV::VMV_S_X: + case RISCV::VFMV_S_F: case RISCV::VID_V: if (MI.getOperand(1).isUndef() && /* After RISCVInsertVSETVLI most pseudos will have implicit uses on vl diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 2eceef5066f77..430e09fd834ba 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -6764,7 +6764,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { Pseudo<(outs GPR:$rd), (ins VR:$rs2, ixlenimm:$sew), []>, Sched<[WriteVMovXS, ReadVMovXS]>, RISCVVPseudo; - let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X, + let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VMV_S_X, isReMaterializable = 1, Constraints = "$rd = $rs1" in def PseudoVMV_S_X: Pseudo<(outs VR:$rd), (ins VR:$rs1, GPR:$rs2, AVL:$vl, ixlenimm:$sew), @@ -6787,7 +6787,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { (ins VR:$rs2, ixlenimm:$sew), []>, Sched<[WriteVMovFS, ReadVMovFS]>, RISCVVPseudo; - let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F, + let HasVLOp = 1, HasSEWOp = 1, BaseInstr = VFMV_S_F, isReMaterializable = 1, Constraints = "$rd = $rs1" in def "PseudoVFMV_S_" # f.FX : Pseudo<(outs VR:$rd), diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index bc3e135a588a6..eff56e408d6d5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -159,296 +159,308 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 80 +; RV32-NEXT: li a3, 84 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd0, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 80 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd4, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 84 * vlenb ; RV32-NEXT: addi a3, a1, 256 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v16, (a3) +; RV32-NEXT: vle32.v v8, (a3) ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: slli a3, a3, 6 +; RV32-NEXT: li a4, 76 +; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a1, 128 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vslideup.vi v8, v16, 4 +; RV32-NEXT: vslideup.vi v4, v8, 4 ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 40 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: 
vs4r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v4, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, 12 ; RV32-NEXT: vmv.s.x v0, a4 -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v16, v16, 16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 56 +; RV32-NEXT: li a5, 24 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v3, v0 -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vslideup.vi v8, v16, 10, v0.t +; RV32-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v8, v8, 16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 44 +; RV32-NEXT: li a5, 48 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vslideup.vi v4, v8, 10, v0.t ; RV32-NEXT: lui a4, %hi(.LCPI6_0) ; RV32-NEXT: addi a4, a4, %lo(.LCPI6_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle16.v v8, (a4) -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v0, (a4) ; RV32-NEXT: lui a4, %hi(.LCPI6_1) ; RV32-NEXT: addi a4, a4, %lo(.LCPI6_1) ; RV32-NEXT: lui a5, 1 ; RV32-NEXT: vle16.v v8, (a4) ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a6, 24 +; RV32-NEXT: li a6, 56 ; RV32-NEXT: mul a4, a4, a6 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 72 +; RV32-NEXT: li a4, 68 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v24, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 48 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: addi a1, a5, -64 -; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: vmv.s.x v16, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 36 +; RV32-NEXT: li a3, 44 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v16, v8, v4 +; RV32-NEXT: vs1r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vrgatherei16.vv v16, v8, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 24 +; RV32-NEXT: li a3, 44 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 44 +; RV32-NEXT: li a3, 56 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t ; RV32-NEXT: vsetivli zero, 
12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v8, v16 +; RV32-NEXT: vmv.v.v v4, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 44 +; RV32-NEXT: li a3, 36 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vslideup.vi v12, v8, 2 -; RV32-NEXT: vmv1r.v v8, v3 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: li a3, 24 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v3, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vl1r.v v1, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vslideup.vi v12, v16, 8, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 56 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_2) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_2) ; RV32-NEXT: lui a3, %hi(.LCPI6_3) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_3) ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v0, (a1) -; RV32-NEXT: vle16.v v4, (a3) +; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: vle16.v v8, (a3) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 28 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_4) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_4) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v10, (a1) +; RV32-NEXT: vle16.v v2, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vrgatherei16.vv v24, v16, v0 +; RV32-NEXT: vrgatherei16.vv v24, v16, v12 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 36 +; RV32-NEXT: li a3, 44 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 48 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v4, v0.t +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 28 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v8, v4, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 56 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded 
Reload ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v12, v24 +; RV32-NEXT: vmv.v.v v8, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 36 +; RV32-NEXT: li a3, 56 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v12, v24, v10 -; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: vrgatherei16.vv v8, v24, v2 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v12, v24, 6, v0.t +; RV32-NEXT: vslideup.vi v8, v24, 6, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: li a3, 44 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_5) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_5) ; RV32-NEXT: lui a3, %hi(.LCPI6_6) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_6) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle16.v v12, (a1) -; RV32-NEXT: vle16.v v8, (a3) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v24, (a1) +; RV32-NEXT: vle16.v v4, (a3) ; RV32-NEXT: li a1, 960 -; RV32-NEXT: vmv.s.x v8, a1 +; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v0, v12 -; RV32-NEXT: vmv1r.v v3, v8 -; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vrgatherei16.vv v8, v16, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 24 +; RV32-NEXT: li a3, 28 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_7) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_7) ; RV32-NEXT: lui a3, %hi(.LCPI6_8) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_8) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vle16.v v16, (a1) ; RV32-NEXT: lui a1, %hi(.LCPI6_9) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_9) ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v4, (a3) -; RV32-NEXT: vle16.v v12, (a1) +; RV32-NEXT: 
vle16.v v8, (a3) +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: slli a3, a3, 2 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs4r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v12, v24, v8 +; RV32-NEXT: vrgatherei16.vv v12, v8, v16 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv4r.v v24, v16 ; RV32-NEXT: vslideup.vi v12, v16, 4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vrgatherei16.vv v8, v16, v4 -; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vrgatherei16.vv v8, v0, v20 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 48 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t +; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v20, v0.t ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 @@ -461,48 +473,51 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a1, 15 ; RV32-NEXT: vmv.s.x v3, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v8, v16, 6 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vslideup.vi v8, v24, 6 ; RV32-NEXT: vmv1r.v v0, v3 -; RV32-NEXT: vrgatherei16.vv v8, v24, v12, v0.t +; RV32-NEXT: vrgatherei16.vv v8, v16, v12, v0.t ; 
RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: li a3, 76 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv4r.v v24, v16 ; RV32-NEXT: lui a1, %hi(.LCPI6_11) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_11) ; RV32-NEXT: lui a3, %hi(.LCPI6_12) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_12) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vle16.v v24, (a1) +; RV32-NEXT: vle16.v v28, (a1) ; RV32-NEXT: vle16.v v4, (a3) ; RV32-NEXT: li a1, 1008 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 68 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v24 +; RV32-NEXT: vrgatherei16.vv v8, v16, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 48 +; RV32-NEXT: li a3, 60 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill @@ -511,14 +526,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a3, %hi(.LCPI6_14) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_14) ; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vle16.v v20, (a1) +; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: lui a1, %hi(.LCPI6_15) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_15) ; RV32-NEXT: vsetvli zero, a2, e16, m4, ta, ma -; RV32-NEXT: vle16.v v24, (a3) -; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vle16.v v28, (a3) +; RV32-NEXT: vle16.v v12, (a1) ; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vmv1r.v v0, v3 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: li a3, 40 @@ -526,21 +541,16 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 56 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v16, v8, v20, v0.t +; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 5 +; RV32-NEXT: li a3, 44 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 24 +; RV32-NEXT: li a3, 28 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -548,20 +558,20 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v20, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 72 +; RV32-NEXT: li a3, 68 ; 
RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: vrgatherei16.vv v8, v0, v24 +; RV32-NEXT: vrgatherei16.vv v8, v0, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 48 +; RV32-NEXT: li a2, 60 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -570,56 +580,57 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t ; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v28, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 12 +; RV32-NEXT: li a2, 76 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v24, v0 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 6 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv.v.v v28, v0 +; RV32-NEXT: vmv.v.v v24, v0 ; RV32-NEXT: vmv.v.v v16, v8 ; RV32-NEXT: addi a1, a0, 320 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vse32.v v16, (a1) ; RV32-NEXT: addi a1, a0, 256 -; RV32-NEXT: vse32.v v28, (a1) -; RV32-NEXT: addi a1, a0, 192 ; RV32-NEXT: vse32.v v24, (a1) +; RV32-NEXT: addi a1, a0, 192 +; RV32-NEXT: vse32.v v28, (a1) ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: vse32.v v20, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 36 +; RV32-NEXT: li a3, 56 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 44 +; RV32-NEXT: li a2, 36 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 80 +; RV32-NEXT: li a1, 84 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 @@ -630,15 +641,15 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi sp, sp, -16 ; RV64-NEXT: .cfi_def_cfa_offset 16 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 74 +; RV64-NEXT: li a3, 66 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: sub sp, sp, a2 -; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xca, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 74 * vlenb +; 
RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc2, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 66 * vlenb ; RV64-NEXT: addi a2, a1, 256 ; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma ; RV64-NEXT: vle64.v v16, (a2) ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 25 +; RV64-NEXT: li a3, 21 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 @@ -646,76 +657,85 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi a2, a1, 128 ; RV64-NEXT: vle64.v v8, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a3, a1, 6 -; RV64-NEXT: add a1, a3, a1 +; RV64-NEXT: li a3, 57 +; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-NEXT: vrgather.vi v12, v16, 4 ; RV64-NEXT: li a1, 128 -; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 8, e64, m8, ta, ma ; RV64-NEXT: vslidedown.vi v16, v16, 8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a3, 49 +; RV64-NEXT: li a3, 37 ; RV64-NEXT: mul a1, a1, a3 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v8 ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu ; RV64-NEXT: vrgather.vi v12, v16, 2, v0.t ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vid.v v10 ; RV64-NEXT: li a1, 6 -; RV64-NEXT: vmul.vx v2, v10, a1 +; RV64-NEXT: vmul.vx v8, v10, a1 ; RV64-NEXT: li a1, 56 -; RV64-NEXT: vle64.v v16, (a2) +; RV64-NEXT: vle64.v v24, (a2) ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: li a3, 57 +; RV64-NEXT: li a3, 45 ; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 -; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill -; RV64-NEXT: vmv.s.x v7, a1 -; RV64-NEXT: vadd.vi v10, v2, -16 +; RV64-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vmv.s.x v10, a1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 53 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v16, v24, v2 -; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: vs1r.v v10, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vadd.vi v10, v8, -16 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 57 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vrgatherei16.vv v16, v0, v8 +; RV64-NEXT: vmv2r.v v4, v8 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 53 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl1r.v v6, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv1r.v v0, v6 ; RV64-NEXT: vrgatherei16.vv v16, v24, v10, v0.t ; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma ; RV64-NEXT: vmv.v.v v12, v16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 21 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 4 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) 
# Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 25 +; RV64-NEXT: li a2, 21 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v12, v16, 5 -; RV64-NEXT: vmv1r.v v0, v8 -; RV64-NEXT: vmv1r.v v6, v8 +; RV64-NEXT: vrgather.vi v12, v8, 5 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl1r.v v1, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 49 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -723,19 +743,19 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vrgather.vi v12, v16, 3, v0.t ; RV64-NEXT: vmv.v.v v28, v12 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v24, v2, 1 -; RV64-NEXT: vadd.vi v26, v2, -15 +; RV64-NEXT: vadd.vi v24, v4, 1 +; RV64-NEXT: vadd.vi v26, v4, -15 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 57 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu ; RV64-NEXT: vrgatherei16.vv v16, v8, v24 -; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: vmv1r.v v0, v6 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 57 +; RV64-NEXT: li a2, 45 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -744,8 +764,8 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma ; RV64-NEXT: vmv.v.v v28, v16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 4 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 13 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill @@ -755,7 +775,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vmv.v.i v9, 6 ; RV64-NEXT: vmv.v.x v10, a1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 25 +; RV64-NEXT: li a2, 21 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 @@ -763,259 +783,253 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma ; RV64-NEXT: vrgatherei16.vv v12, v16, v9 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 53 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vrgatherei16.vv v12, v16, v10 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 41 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vmv4r.v v8, v16 ; RV64-NEXT: vrgather.vi v12, v16, 2 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 37 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 5 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vrgather.vi v12, v16, 3 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli 
a2, a1, 5 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 29 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: li a1, 24 -; RV64-NEXT: vmv.s.x v1, a1 -; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v24, v2, 2 -; RV64-NEXT: vadd.vi v4, v2, -14 +; RV64-NEXT: vmv.s.x v0, a1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 21 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v8, v16, v24 -; RV64-NEXT: vmv1r.v v0, v1 +; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV64-NEXT: vadd.vi v16, v4, 2 +; RV64-NEXT: vadd.vi v2, v4, -14 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 57 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v24, v4, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vrgatherei16.vv v8, v24, v16 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 25 +; RV64-NEXT: li a2, 45 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v8, v16, v2, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v6 +; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 49 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 53 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v28, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v20, v16, 4, v0.t +; RV64-NEXT: vrgather.vi v28, v24, 4, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 53 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs4r.v v28, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vmv2r.v v8, v4 ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v4, v2, 3 -; RV64-NEXT: vadd.vi v8, v2, -13 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs2r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vadd.vi v4, v4, 3 +; RV64-NEXT: vadd.vi v6, v8, -13 +; RV64-NEXT: vmv2r.v v2, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 57 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v8, v16, v4 -; RV64-NEXT: vmv1r.v 
v0, v1 +; RV64-NEXT: vrgatherei16.vv v8, v24, v4 ; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 21 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl2r.v v16, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v8, v24, v16, v0.t +; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v8, v16, v6, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 21 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vmv1r.v v0, v6 +; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 49 +; RV64-NEXT: li a2, 37 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 41 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgather.vi v8, v24, 5, v0.t -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 41 -; RV64-NEXT: mul a1, a1, a2 -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vrgather.vi v4, v16, 5, v0.t ; RV64-NEXT: lui a1, 96 ; RV64-NEXT: li a2, 192 -; RV64-NEXT: vmv.s.x v28, a2 +; RV64-NEXT: vmv.s.x v1, a2 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vmv.v.x v8, a1 -; RV64-NEXT: vmv1r.v v0, v28 +; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 37 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 5 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgatherei16.vv v12, v24, v8, v0.t +; RV64-NEXT: vrgatherei16.vv v12, v16, v8, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 37 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 5 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: li a1, 28 ; RV64-NEXT: vmv.s.x v0, a1 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill -; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v30, v2, 4 -; RV64-NEXT: vadd.vi v6, v2, -12 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 +; RV64-NEXT: slli a2, a1, 3 ; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v16, v8, v30 +; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV64-NEXT: vadd.vi v22, v2, 4 +; RV64-NEXT: vadd.vi v20, v2, -12 ; RV64-NEXT: csrr a1, vlenb ; RV64-NEXT: li a2, 57 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vrgatherei16.vv v16, v8, v6, v0.t +; RV64-NEXT: vl8r.v 
v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vrgatherei16.vv v8, v24, v22 ; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 45 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgatherei16.vv v8, v24, v20, v0.t ; RV64-NEXT: lui a1, 112 ; RV64-NEXT: addi a1, a1, 1 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vmv.v.x v12, a1 -; RV64-NEXT: vmv1r.v v0, v28 +; RV64-NEXT: vmv1r.v v0, v1 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 5 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 29 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; RV64-NEXT: vrgatherei16.vv v16, v24, v12, v0.t +; RV64-NEXT: vrgatherei16.vv v20, v16, v12, v0.t ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 5 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 29 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 45 +; RV64-NEXT: li a2, 53 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 25 -; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma -; RV64-NEXT: vmv.v.v v16, v24 -; RV64-NEXT: vmv2r.v v8, v2 +; RV64-NEXT: vmv.v.v v12, v24 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 53 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill ; RV64-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; RV64-NEXT: vadd.vi v12, v2, 5 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 6 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 57 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64-NEXT: vrgatherei16.vv v24, v0, v12 +; RV64-NEXT: vrgatherei16.vv v24, v16, v12 ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64-NEXT: vadd.vi v2, v8, -11 -; RV64-NEXT: addi a1, sp, 16 -; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vadd.vi v12, v2, -11 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 57 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; RV64-NEXT: vrgatherei16.vv v24, v8, v2, v0.t +; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 41 +; RV64-NEXT: li a2, 45 ; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl4r.v 
v12, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vrgatherei16.vv v24, v16, v12, v0.t +; RV64-NEXT: vmv4r.v v12, v4 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 21 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vsetivli zero, 5, e64, m4, tu, ma ; RV64-NEXT: vmv.v.v v12, v0 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 37 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 5 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv.v.v v20, v8 ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: add a1, sp, a1 -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV64-NEXT: vmv.v.v v20, v0 -; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: slli a2, a1, 5 -; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: li a2, 29 +; RV64-NEXT: mul a1, a1, a2 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload @@ -1028,24 +1042,30 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV64-NEXT: addi a1, a0, 192 ; RV64-NEXT: vse64.v v12, (a1) ; RV64-NEXT: addi a1, a0, 128 -; RV64-NEXT: vse64.v v16, (a1) +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 53 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: addi a1, a0, 64 ; RV64-NEXT: csrr a2, vlenb -; RV64-NEXT: slli a3, a2, 4 -; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: li a3, 13 +; RV64-NEXT: mul a2, a2, a3 ; RV64-NEXT: add a2, sp, a2 ; RV64-NEXT: addi a2, a2, 16 ; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v8, (a1) ; RV64-NEXT: csrr a1, vlenb -; RV64-NEXT: li a2, 21 -; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: slli a2, a1, 4 +; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: add a1, sp, a1 ; RV64-NEXT: addi a1, a1, 16 ; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV64-NEXT: vse64.v v8, (a0) ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: li a1, 74 +; RV64-NEXT: li a1, 66 ; RV64-NEXT: mul a0, a0, a1 ; RV64-NEXT: add sp, sp, a0 ; RV64-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/remat.ll b/llvm/test/CodeGen/RISCV/rvv/remat.ll index 343b086898c14..4f58ccb5188d3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/remat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/remat.ll @@ -377,3 +377,133 @@ define void @vfmv.v.f(ptr %p, double %x) { store volatile double %x, ptr %p ret void } + +define void @vmv.s.x(ptr %p, i64 %x) { +; POSTRA-LABEL: vmv.s.x: +; POSTRA: # %bb.0: +; POSTRA-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; POSTRA-NEXT: vmv.s.x v8, a1 +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: vl8re64.v v16, (a0) +; POSTRA-NEXT: vl8re64.v v24, (a0) +; POSTRA-NEXT: vl8re64.v v0, (a0) +; POSTRA-NEXT: vl8re64.v v8, (a0) +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: vs8r.v v0, (a0) +; POSTRA-NEXT: vs8r.v v24, (a0) +; POSTRA-NEXT: vs8r.v v16, (a0) +; POSTRA-NEXT: vmv.s.x v8, a1 +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: sd a1, 0(a0) +; POSTRA-NEXT: ret +; +; PRERA-LABEL: vmv.s.x: +; PRERA: # %bb.0: +; PRERA-NEXT: addi sp, sp, -16 +; PRERA-NEXT: .cfi_def_cfa_offset 16 +; PRERA-NEXT: csrr a2, vlenb +; 
PRERA-NEXT: slli a2, a2, 3 +; PRERA-NEXT: sub sp, sp, a2 +; PRERA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; PRERA-NEXT: vsetvli a2, zero, e64, m1, ta, ma +; PRERA-NEXT: vmv.s.x v8, a1 +; PRERA-NEXT: vs8r.v v8, (a0) +; PRERA-NEXT: vl8re64.v v16, (a0) +; PRERA-NEXT: addi a2, sp, 16 +; PRERA-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; PRERA-NEXT: vl8re64.v v24, (a0) +; PRERA-NEXT: vl8re64.v v0, (a0) +; PRERA-NEXT: vl8re64.v v16, (a0) +; PRERA-NEXT: vs8r.v v16, (a0) +; PRERA-NEXT: vs8r.v v0, (a0) +; PRERA-NEXT: vs8r.v v24, (a0) +; PRERA-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; PRERA-NEXT: vs8r.v v16, (a0) +; PRERA-NEXT: vs8r.v v8, (a0) +; PRERA-NEXT: sd a1, 0(a0) +; PRERA-NEXT: csrr a0, vlenb +; PRERA-NEXT: slli a0, a0, 3 +; PRERA-NEXT: add sp, sp, a0 +; PRERA-NEXT: addi sp, sp, 16 +; PRERA-NEXT: ret + %vmv.s.x = call <vscale x 8 x i64> @llvm.riscv.vmv.s.x.nxv8i64(<vscale x 8 x i64> poison, i64 %x, i64 -1) + store volatile <vscale x 8 x i64> %vmv.s.x, ptr %p + + %a = load volatile <vscale x 8 x i64>, ptr %p + %b = load volatile <vscale x 8 x i64>, ptr %p + %c = load volatile <vscale x 8 x i64>, ptr %p + %d = load volatile <vscale x 8 x i64>, ptr %p + store volatile <vscale x 8 x i64> %d, ptr %p + store volatile <vscale x 8 x i64> %c, ptr %p + store volatile <vscale x 8 x i64> %b, ptr %p + store volatile <vscale x 8 x i64> %a, ptr %p + + store volatile <vscale x 8 x i64> %vmv.s.x, ptr %p + store volatile i64 %x, ptr %p + ret void +} + +define void @vfmv.s.f(ptr %p, double %x) { +; POSTRA-LABEL: vfmv.s.f: +; POSTRA: # %bb.0: +; POSTRA-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; POSTRA-NEXT: vfmv.s.f v8, fa0 +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: vl8re64.v v16, (a0) +; POSTRA-NEXT: vl8re64.v v24, (a0) +; POSTRA-NEXT: vl8re64.v v0, (a0) +; POSTRA-NEXT: vl8re64.v v8, (a0) +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: vs8r.v v0, (a0) +; POSTRA-NEXT: vs8r.v v24, (a0) +; POSTRA-NEXT: vs8r.v v16, (a0) +; POSTRA-NEXT: vfmv.s.f v8, fa0 +; POSTRA-NEXT: vs8r.v v8, (a0) +; POSTRA-NEXT: fsd fa0, 0(a0) +; POSTRA-NEXT: ret +; +; PRERA-LABEL: vfmv.s.f: +; PRERA: # %bb.0: +; PRERA-NEXT: addi sp, sp, -16 +; PRERA-NEXT: .cfi_def_cfa_offset 16 +; PRERA-NEXT: csrr a1, vlenb +; PRERA-NEXT: slli a1, a1, 3 +; PRERA-NEXT: sub sp, sp, a1 +; PRERA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; PRERA-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; PRERA-NEXT: vfmv.s.f v8, fa0 +; PRERA-NEXT: vs8r.v v8, (a0) +; PRERA-NEXT: vl8re64.v v16, (a0) +; PRERA-NEXT: addi a1, sp, 16 +; PRERA-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; PRERA-NEXT: vl8re64.v v24, (a0) +; PRERA-NEXT: vl8re64.v v0, (a0) +; PRERA-NEXT: vl8re64.v v16, (a0) +; PRERA-NEXT: vs8r.v v16, (a0) +; PRERA-NEXT: vs8r.v v0, (a0) +; PRERA-NEXT: vs8r.v v24, (a0) +; PRERA-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; PRERA-NEXT: vs8r.v v16, (a0) +; PRERA-NEXT: vs8r.v v8, (a0) +; PRERA-NEXT: fsd fa0, 0(a0) +; PRERA-NEXT: csrr a0, vlenb +; PRERA-NEXT: slli a0, a0, 3 +; PRERA-NEXT: add sp, sp, a0 +; PRERA-NEXT: addi sp, sp, 16 +; PRERA-NEXT: ret + %vfmv.s.f = call <vscale x 8 x double> @llvm.riscv.vfmv.s.f.nxv8f64(<vscale x 8 x double> poison, double %x, i64 -1) + store volatile <vscale x 8 x double> %vfmv.s.f, ptr %p + + %a = load volatile <vscale x 8 x double>, ptr %p + %b = load volatile <vscale x 8 x double>, ptr %p + %c = load volatile <vscale x 8 x double>, ptr %p + %d = load volatile <vscale x 8 x double>, ptr %p + store volatile <vscale x 8 x double> %d, ptr %p + store volatile <vscale x 8 x double> %c, ptr %p + store volatile <vscale x 8 x double> %b, ptr %p + store volatile <vscale x 8 x double> %a, ptr %p + + store volatile <vscale x 8 x double> %vfmv.s.f, ptr %p + store volatile double %x, ptr %p + ret void +} From 901006f238aae8dd7e75d173bf9429e8e44f6385 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 10
Sep 2024 18:45:06 -0700 Subject: [PATCH 046/114] =?UTF-8?q?[flang]=20Make=20flang=20module=20hidde?= =?UTF-8?q?n=20dependency=20explicit=20to=20correct=20build=E2=80=A6=20(#1?= =?UTF-8?q?08129)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … failure Any flang module with a derived type definition implicitly depends on flang/module/__fortran_type_info.f90. Make this dependency explicit so that an unlucky build order doesn't cause a crash. --- flang/tools/f18/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt index 344a781c41e95..4670362f7a103 100644 --- a/flang/tools/f18/CMakeLists.txt +++ b/flang/tools/f18/CMakeLists.txt @@ -72,9 +72,6 @@ if (NOT CMAKE_CROSSCOMPILING) set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__cuda_builtins.mod) else() set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_builtins.mod) - if(NOT ${filename} STREQUAL "__fortran_type_info") - set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_type_info.mod) - endif() if(${filename} STREQUAL "iso_fortran_env") set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/iso_fortran_env_impl.mod) endif() @@ -83,6 +80,9 @@ if (NOT CMAKE_CROSSCOMPILING) set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_ieee_exceptions.mod) endif() endif() + if(NOT ${filename} STREQUAL "__fortran_type_info") + set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_type_info.mod) + endif() # The module contains PPC vector types that needs the PPC target. if(${filename} STREQUAL "__ppc_intrinsics" OR From 12530015a45accd6cf9dd6d565c89b1d7e562be5 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 10 Sep 2024 18:47:20 -0700 Subject: [PATCH 047/114] [RISCV] Add reductions to list of roots in tryToReduceVL (#107595) This allows us to reduce VLs feeding reduction instructions. In particular, this means that <3 x Ty> reduce(load) like sequences no longer require a VL toggle. This was waiting on 3d72957; now that the latent correctness issue is fixed, we can expand this transform. --- llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 18 ++++++ .../redundant-copy-from-tail-duplicate.ll | 2 +- .../rvv/fixed-vectors-reduction-formation.ll | 49 ++++++---------- .../CodeGen/RISCV/rvv/vreductions-fp-vp.ll | 56 +++++++++---------- 4 files changed, 64 insertions(+), 61 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index 298f3317bf61a..026e5d653c38c 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -145,6 +145,24 @@ bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const { case RISCV::VMERGE_VVM: SrcIdx = 3; // TODO: We can also handle the false operand. 
break; + case RISCV::VREDSUM_VS: + case RISCV::VREDMAXU_VS: + case RISCV::VREDMAX_VS: + case RISCV::VREDMINU_VS: + case RISCV::VREDMIN_VS: + case RISCV::VREDAND_VS: + case RISCV::VREDOR_VS: + case RISCV::VREDXOR_VS: + case RISCV::VWREDSUM_VS: + case RISCV::VWREDSUMU_VS: + case RISCV::VFREDUSUM_VS: + case RISCV::VFREDOSUM_VS: + case RISCV::VFREDMAX_VS: + case RISCV::VFREDMIN_VS: + case RISCV::VFWREDUSUM_VS: + case RISCV::VFWREDOSUM_VS: + SrcIdx = 2; + break; } MachineOperand &VL = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc())); diff --git a/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll b/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll index 3d367ddc59bca..5d588ad66b9ca 100644 --- a/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll +++ b/llvm/test/CodeGen/RISCV/redundant-copy-from-tail-duplicate.ll @@ -19,7 +19,7 @@ define signext i32 @sum(ptr %a, i32 signext %n, i1 %prof.min.iters.check, %v, %v, %v, %v, %v, %v, %v, %v, %v, %v, %v, %v, Date: Wed, 11 Sep 2024 09:53:04 +0800 Subject: [PATCH 048/114] SelectionDAG: Remove unneeded getSelectCC in expandFMINIMUMNUM_FMAXIMUMNUM (#107416) ISD::FCANONICALIZE is enough, which can process NaN or non-NaN correctly, thus getSelectCC is not needed here. --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index b3307dc9b7730..03010c1df0014 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8616,10 +8616,7 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node, // If MinMax is NaN, let's quiet it. if (!Flags.hasNoNaNs() && !DAG.isKnownNeverNaN(LHS) && !DAG.isKnownNeverNaN(RHS)) { - SDValue MinMaxQuiet = - DAG.getNode(ISD::FCANONICALIZE, DL, VT, MinMax, Flags); - MinMax = - DAG.getSelectCC(DL, MinMax, MinMax, MinMaxQuiet, MinMax, ISD::SETUO); + MinMax = DAG.getNode(ISD::FCANONICALIZE, DL, VT, MinMax, Flags); } // Fixup signed zero behavior. From 76151c449080b7239c8b442291514a4300d51cba Mon Sep 17 00:00:00 2001 From: ChiaHungDuan Date: Tue, 10 Sep 2024 18:56:49 -0700 Subject: [PATCH 049/114] Revert "[scudo] Fix the logic of MaxAllowedFragmentedPages" (#108130) Reverts llvm/llvm-project#107927 We are supposed to check the MaxAllowedFragmentedPages instead. --- compiler-rt/lib/scudo/standalone/secondary.h | 25 +++++++------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h index c79ec1360b00a..1a232b9b9fb2d 100644 --- a/compiler-rt/lib/scudo/standalone/secondary.h +++ b/compiler-rt/lib/scudo/standalone/secondary.h @@ -72,16 +72,13 @@ namespace { struct CachedBlock { static constexpr u16 CacheIndexMax = UINT16_MAX; static constexpr u16 InvalidEntry = CacheIndexMax; - // We allow a certain amount of fragmentation and part of the fragmented bytes - // will be released by `releaseAndZeroPagesToOS()`. This increases the chance - // of cache hit rate and reduces the overhead to the RSS at the same time. See - // more details in the `MapAllocatorCache::retrieve()` section. - // - // We arrived at this default value after noticing that mapping in larger - // memory regions performs better than releasing memory and forcing a cache - // hit. According to the data, it suggests that beyond 4 pages, the release - // execution time is longer than the map execution time. 
In this way, - // the default is dependent on the platform. + // * MaxReleasedCachePages default is currently 4 + // - We arrived at this value after noticing that mapping + // in larger memory regions performs better than releasing + // memory and forcing a cache hit. According to the data, + // it suggests that beyond 4 pages, the release execution time is + // longer than the map execution time. In this way, the default + // is dependent on the platform. static constexpr uptr MaxReleasedCachePages = 4U; uptr CommitBase = 0; @@ -728,14 +725,8 @@ MapAllocator::tryAllocateFromCache(const Options &Options, uptr Size, uptr EntryHeaderPos; uptr MaxAllowedFragmentedPages = MaxUnreleasedCachePages; - if (LIKELY(!useMemoryTagging(Options))) { + if (UNLIKELY(useMemoryTagging(Options))) MaxAllowedFragmentedPages += CachedBlock::MaxReleasedCachePages; - } else { - // TODO: Enable MaxReleasedCachePages may result in pages for an entry being - // partially released and it erases the tag of those pages as well. To - // support this feature for MTE, we need to tag those pages again. - DCHECK_EQ(CachedBlock::MaxReleasedCachePages, 0U); - } Entry = Cache.retrieve(MaxAllowedFragmentedPages, Size, Alignment, getHeadersSize(), EntryHeaderPos); From c5711130be758ca82216abb81c6b870a830e8f82 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 10 Sep 2024 19:00:32 -0700 Subject: [PATCH 050/114] [flang] Fix cycle of build dependencies (#108132) While trying to fix one build problem, I made things worse. This should clear things up. --- flang/tools/f18/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt index 4670362f7a103..9d7b8633958cb 100644 --- a/flang/tools/f18/CMakeLists.txt +++ b/flang/tools/f18/CMakeLists.txt @@ -80,7 +80,7 @@ if (NOT CMAKE_CROSSCOMPILING) set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_ieee_exceptions.mod) endif() endif() - if(NOT ${filename} STREQUAL "__fortran_type_info") + if(NOT ${filename} STREQUAL "__fortran_type_info" AND NOT ${filename} STREQUAL "__fortran_builtins") set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_type_info.mod) endif() From 3b4e7c9c4502d41ece4ef3431bbc12f055adabb5 Mon Sep 17 00:00:00 2001 From: Sterling-Augustine <56981066+Sterling-Augustine@users.noreply.github.com> Date: Wed, 11 Sep 2024 02:11:14 +0000 Subject: [PATCH 051/114] [SandboxIR] Implement ScalableVectorType (#108124) As in the heading. --- llvm/include/llvm/SandboxIR/Type.h | 83 +++++++++++++++++++++----- llvm/lib/SandboxIR/Type.cpp | 6 ++ llvm/unittests/SandboxIR/TypesTest.cpp | 59 ++++++++++++++++++ 3 files changed, 133 insertions(+), 15 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h index ec141c249fb21..a2ac9e014b44a 100644 --- a/llvm/include/llvm/SandboxIR/Type.h +++ b/llvm/include/llvm/SandboxIR/Type.h @@ -26,6 +26,7 @@ class Context; class PointerType; class VectorType; class FixedVectorType; +class ScalableVectorType; class IntegerType; class FunctionType; class ArrayType; @@ -39,21 +40,22 @@ class StructType; class Type { protected: llvm::Type *LLVMTy; - friend class ArrayType; // For LLVMTy. - friend class StructType; // For LLVMTy. - friend class VectorType; // For LLVMTy. - friend class FixedVectorType; // For LLVMTy. - friend class PointerType; // For LLVMTy. - friend class FunctionType; // For LLVMTy. - friend class IntegerType; // For LLVMTy. - friend class Function; // For LLVMTy. 
- friend class CallBase; // For LLVMTy. - friend class ConstantInt; // For LLVMTy. - friend class ConstantArray; // For LLVMTy. - friend class ConstantStruct; // For LLVMTy. - friend class ConstantVector; // For LLVMTy. - friend class CmpInst; // For LLVMTy. TODO: Cleanup after - // sandboxir::VectorType is more complete. + friend class ArrayType; // For LLVMTy. + friend class StructType; // For LLVMTy. + friend class VectorType; // For LLVMTy. + friend class FixedVectorType; // For LLVMTy. + friend class ScalableVectorType; // For LLVMTy. + friend class PointerType; // For LLVMTy. + friend class FunctionType; // For LLVMTy. + friend class IntegerType; // For LLVMTy. + friend class Function; // For LLVMTy. + friend class CallBase; // For LLVMTy. + friend class ConstantInt; // For LLVMTy. + friend class ConstantArray; // For LLVMTy. + friend class ConstantStruct; // For LLVMTy. + friend class ConstantVector; // For LLVMTy. + friend class CmpInst; // For LLVMTy. TODO: Cleanup after + // sandboxir::VectorType is more complete. // Friend all instruction classes because `create()` functions use LLVMTy. #define DEF_INSTR(ID, OPCODE, CLASS) friend class CLASS; @@ -390,6 +392,57 @@ class FixedVectorType : public VectorType { } }; +class ScalableVectorType : public VectorType { +public: + static ScalableVectorType *get(Type *ElementType, unsigned MinNumElts); + + static ScalableVectorType *get(Type *ElementType, + const ScalableVectorType *SVTy) { + return get(ElementType, SVTy->getMinNumElements()); + } + + static ScalableVectorType *getInteger(ScalableVectorType *VTy) { + return cast<ScalableVectorType>(VectorType::getInteger(VTy)); + } + + static ScalableVectorType * + getExtendedElementVectorType(ScalableVectorType *VTy) { + return cast<ScalableVectorType>( + VectorType::getExtendedElementVectorType(VTy)); + } + + static ScalableVectorType * + getTruncatedElementVectorType(ScalableVectorType *VTy) { + return cast<ScalableVectorType>( + VectorType::getTruncatedElementVectorType(VTy)); + } + + static ScalableVectorType *getSubdividedVectorType(ScalableVectorType *VTy, + int NumSubdivs) { + return cast<ScalableVectorType>( + VectorType::getSubdividedVectorType(VTy, NumSubdivs)); + } + + static ScalableVectorType * + getHalfElementsVectorType(ScalableVectorType *VTy) { + return cast<ScalableVectorType>(VectorType::getHalfElementsVectorType(VTy)); + } + + static ScalableVectorType * + getDoubleElementsVectorType(ScalableVectorType *VTy) { + return cast<ScalableVectorType>( + VectorType::getDoubleElementsVectorType(VTy)); + } + + unsigned getMinNumElements() const { + return cast<llvm::ScalableVectorType>(LLVMTy)->getMinNumElements(); + } + + static bool classof(const Type *T) { + return isa<llvm::ScalableVectorType>(T->LLVMTy); + } +}; + class FunctionType : public Type { public: // TODO: add missing functions diff --git a/llvm/lib/SandboxIR/Type.cpp b/llvm/lib/SandboxIR/Type.cpp index 26aa8b3743084..87dcb726dde35 100644 --- a/llvm/lib/SandboxIR/Type.cpp +++ b/llvm/lib/SandboxIR/Type.cpp @@ -108,6 +108,12 @@ FixedVectorType *FixedVectorType::get(Type *ElementType, unsigned NumElts) { llvm::FixedVectorType::get(ElementType->LLVMTy, NumElts))); } +ScalableVectorType *ScalableVectorType::get(Type *ElementType, + unsigned NumElts) { + return cast<ScalableVectorType>(ElementType->getContext().getType( + llvm::ScalableVectorType::get(ElementType->LLVMTy, NumElts))); +} + IntegerType *IntegerType::get(Context &Ctx, unsigned NumBits) { return cast<IntegerType>( Ctx.getType(llvm::IntegerType::get(Ctx.LLVMCtx, NumBits))); } diff --git a/llvm/unittests/SandboxIR/TypesTest.cpp b/llvm/unittests/SandboxIR/TypesTest.cpp index 3564ae6683014..40aa32fb08ed0 100644 --- a/llvm/unittests/SandboxIR/TypesTest.cpp +++ 
b/llvm/unittests/SandboxIR/TypesTest.cpp @@ -381,6 +381,65 @@ define void @foo(<4 x i16> %vi0, <4 x float> %vf1, i8 %i0) { EXPECT_EQ(Vec8i16Ty->getElementCount(), ElementCount::getFixed(8)); } +TEST_F(SandboxTypeTest, ScalableVectorType) { + parseIR(C, R"IR( +define void @foo(<vscale x 4 x i16> %vi0, <vscale x 4 x float> %vf1, i8 %i0) { + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + // Check classof(), creation, accessors + auto *Vec4i16Ty = + cast<sandboxir::ScalableVectorType>(F->getArg(0)->getType()); + EXPECT_TRUE(Vec4i16Ty->getElementType()->isIntegerTy(16)); + EXPECT_EQ(Vec4i16Ty->getMinNumElements(), 4u); + + // get(ElementType, NumElements) + EXPECT_EQ( + sandboxir::ScalableVectorType::get(sandboxir::Type::getInt16Ty(Ctx), 4), + F->getArg(0)->getType()); + // get(ElementType, Other) + EXPECT_EQ(sandboxir::ScalableVectorType::get( + sandboxir::Type::getInt16Ty(Ctx), + cast<sandboxir::ScalableVectorType>(F->getArg(0)->getType())), + F->getArg(0)->getType()); + auto *Vec4FTy = cast<sandboxir::ScalableVectorType>(F->getArg(1)->getType()); + EXPECT_TRUE(Vec4FTy->getElementType()->isFloatTy()); + // getInteger + auto *Vec4i32Ty = sandboxir::ScalableVectorType::getInteger(Vec4FTy); + EXPECT_TRUE(Vec4i32Ty->getElementType()->isIntegerTy(32)); + EXPECT_EQ(Vec4i32Ty->getMinNumElements(), Vec4FTy->getMinNumElements()); + // getExtendedElementCountVectorType + auto *Vec4i64Ty = + sandboxir::ScalableVectorType::getExtendedElementVectorType(Vec4i16Ty); + EXPECT_TRUE(Vec4i64Ty->getElementType()->isIntegerTy(32)); + EXPECT_EQ(Vec4i64Ty->getMinNumElements(), Vec4i16Ty->getMinNumElements()); + // getTruncatedElementVectorType + auto *Vec4i8Ty = + sandboxir::ScalableVectorType::getTruncatedElementVectorType(Vec4i16Ty); + EXPECT_TRUE(Vec4i8Ty->getElementType()->isIntegerTy(8)); + EXPECT_EQ(Vec4i8Ty->getMinNumElements(), Vec4i8Ty->getMinNumElements()); + // getSubdividedVectorType + auto *Vec8i8Ty = + sandboxir::ScalableVectorType::getSubdividedVectorType(Vec4i16Ty, 1); + EXPECT_TRUE(Vec8i8Ty->getElementType()->isIntegerTy(8)); + EXPECT_EQ(Vec8i8Ty->getMinNumElements(), 8u); + // getMinNumElements + EXPECT_EQ(Vec8i8Ty->getMinNumElements(), 8u); + // getHalfElementsVectorType + auto *Vec2i16Ty = + sandboxir::ScalableVectorType::getHalfElementsVectorType(Vec4i16Ty); + EXPECT_TRUE(Vec2i16Ty->getElementType()->isIntegerTy(16)); + EXPECT_EQ(Vec2i16Ty->getMinNumElements(), 2u); + // getDoubleElementsVectorType + auto *Vec8i16Ty = + sandboxir::ScalableVectorType::getDoubleElementsVectorType(Vec4i16Ty); + EXPECT_TRUE(Vec8i16Ty->getElementType()->isIntegerTy(16)); + EXPECT_EQ(Vec8i16Ty->getMinNumElements(), 8u); +} + TEST_F(SandboxTypeTest, FunctionType) { parseIR(C, R"IR( define void @foo() { From e67a6667dc2b46ece983321af89ae40ca7986b16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 10 Sep 2024 19:33:33 -0700 Subject: [PATCH 052/114] [flang][cuda] Avoid extra load in c_f_pointer lowering with c_devptr (#108090) Remove unnecessary load of the `cptr` component when getting the `__address`. `fir.coordinate_of` operation can be chained so the load is not needed.
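For illustration, here is a rough before/after sketch of the lowering this change affects, distilled from the updated test below. The SSA value names are invented and the `c_ptr`/`c_devptr` types are abbreviated for readability; the exact output is what the CHECK lines in the test verify.

```
// Before (sketch): load the whole c_ptr component, then extract __address
// from the loaded value.
%cptr = fir.coordinate_of %devptr, %cptr_field : (!fir.ref<!fir.type<c_devptr>>, !fir.field) -> !fir.ref<!fir.type<c_ptr{__address:i64}>>
%tmp  = fir.load %cptr : !fir.ref<!fir.type<c_ptr{__address:i64}>>
%addr = fir.extract_value %tmp, [0 : index] : (!fir.type<c_ptr{__address:i64}>) -> i64

// After (sketch): chain a second fir.coordinate_of straight to the field
// and load only the i64, avoiding the wider load.
%cptr  = fir.coordinate_of %devptr, %cptr_field : (!fir.ref<!fir.type<c_devptr>>, !fir.field) -> !fir.ref<!fir.type<c_ptr{__address:i64}>>
%fld   = fir.field_index __address, !fir.type<c_ptr{__address:i64}>
%paddr = fir.coordinate_of %cptr, %fld : (!fir.ref<!fir.type<c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64>
%addr  = fir.load %paddr : !fir.ref<i64>
```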
--- flang/lib/Optimizer/Builder/FIRBuilder.cpp | 3 +-- flang/test/Lower/CUDA/cuda-devptr.cuf | 7 ++++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index c5a135a189e8d..d786d79ba8701 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -1594,8 +1594,7 @@ mlir::Value fir::factory::genCPtrOrCFunptrValue(fir::FirOpBuilder &builder, cPtrCoor = builder.create<fir::CoordinateOp>(loc, addrFieldTy, cPtr, arrayAttr); } - mlir::Value cptr = builder.create<fir::LoadOp>(loc, cPtrCoor); - return genCPtrOrCFunptrValue(builder, loc, cptr); + return genCPtrOrCFunptrValue(builder, loc, cPtrCoor); } if (fir::isa_ref_type(cPtr.getType())) { diff --git a/flang/test/Lower/CUDA/cuda-devptr.cuf b/flang/test/Lower/CUDA/cuda-devptr.cuf index 21c5088b640fc..2eac890970d52 100644 --- a/flang/test/Lower/CUDA/cuda-devptr.cuf +++ b/flang/test/Lower/CUDA/cuda-devptr.cuf @@ -40,8 +40,9 @@ end ! CHECK: %[[X:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFsub2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) ! CHECK: %[[CPTR:.*]] = fir.field_index cptr, !fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}> ! CHECK: %[[CPTR_COORD:.*]] = fir.coordinate_of %{{.*}}#1, %[[CPTR]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>, !fir.field) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> -! CHECK: %[[CPTR_LOAD:.*]] = fir.load %[[CPTR_COORD]] : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> -! CHECK: %[[ADDRESS:.*]] = fir.extract_value %[[CPTR_LOAD]], [0 : index] : (!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>) -> i64 +! CHECK: %[[ADDRESS:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> +! CHECK: %[[ADDRESS_COORD:.*]] = fir.coordinate_of %[[CPTR_COORD]], %[[ADDRESS]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64> +! CHECK: %[[ADDRESS_LOADED:.*]] = fir.load %[[ADDRESS_COORD]] : !fir.ref<i64> +! CHECK: %[[ADDRESS_IDX:.*]] = fir.convert %[[ADDRESS_LOADED]] : (i64) -> !fir.ptr<!fir.array<?xf32>> ! CHECK: %[[EMBOX:.*]] = fir.embox %[[ADDRESS_IDX]](%{{.*}}) : (!fir.ptr<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.ptr<!fir.array<?xf32>>> ! CHECK: fir.store %[[EMBOX]] to %[[X]]#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>> From 3dad29b677e427bf69c035605a16efd065576829 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 10 Sep 2024 19:36:04 -0700 Subject: [PATCH 053/114] [LTO] Remove unused includes (NFC) (#108110) clangd reports these as unused headers. My manual inspection agrees with the findings.
--- llvm/include/llvm/LTO/LTO.h | 1 - llvm/include/llvm/Transforms/IPO/FunctionImport.h | 2 -- llvm/lib/LTO/LTO.cpp | 1 - llvm/lib/Transforms/IPO/FunctionImport.cpp | 1 - 4 files changed, 5 deletions(-) diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h index fc6e93606de12..214aa4e1c562d 100644 --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -19,7 +19,6 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" -#include "llvm/ADT/StringMap.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/LTO/Config.h" diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h index 0a6cc5951b706..70739709a810a 100644 --- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h +++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h @@ -17,9 +17,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/Support/Error.h" #include -#include #include -#include #include #include diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 5d9a5cbd18f15..a88124dacfaef 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -38,7 +38,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" -#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/SHA1.h" diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp index 1aac8e0713587..ff0d78178bd18 100644 --- a/llvm/lib/Transforms/IPO/FunctionImport.cpp +++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp @@ -46,7 +46,6 @@ #include "llvm/Transforms/Utils/ValueMapper.h" #include #include -#include #include #include #include From 6bbf7f06d8e4e84bbda9027252b26a0d9ae10cde Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 10 Sep 2024 21:32:24 -0700 Subject: [PATCH 054/114] [WebAssembly] Add assembly support for final EH proposal (#107917) This adds the basic assembly generation support for the final EH proposal, which was newly adopted in Sep 2023 and advanced into Phase 4 in Jul 2024: https://github.com/WebAssembly/exception-handling/blob/main/proposals/exception-handling/Exceptions.md This adds support for the generation of the new `try_table` and `throw_ref` instructions in .s assembly format. This does NOT yet include - Block annotation comment generation for .s format - .o object file generation - .s assembly parsing - Type checking (AsmTypeCheck) - Disassembler - Fixing unwind mismatches in CFGStackify These will be added as follow-up PRs. --- The format for `TRY_TABLE`, both for `MachineInstr` and `MCInst`, is as follows: ``` TRY_TABLE type number_of_catches catch_clauses* ``` where `catch_clause` is ``` catch_opcode tag+ destination ``` `catch_opcode` should be one of 0/1/2/3, which denotes `CATCH`/`CATCH_REF`/`CATCH_ALL`/`CATCH_ALL_REF` respectively. (See `BinaryFormat/Wasm.h`) `tag` exists when the catch is one of `CATCH` or `CATCH_REF`. The MIR format is printed as just the list of raw operands. The (stack-based) assembly instructions support pretty-printing, including printing `catch` clauses by name, in InstPrinter. In addition to the new instructions `TRY_TABLE` and `THROW_REF`, this adds four pseudo instructions: `CATCH`, `CATCH_REF`, `CATCH_ALL`, and `CATCH_ALL_REF`.
These are pseudo instructions to simulate block return values of `catch`, `catch_ref`, `catch_all`, `catch_all_ref` clauses in `try_table` respectively, given that we don't support block return values except for one case (`fixEndsAtEndOfFunction` in CFGStackify). These will be omitted when we lower the instructions to `MCInst` at the end. LateEHPrepare now will have one more stage to convert `CATCH`/`CATCH_ALL`s to `CATCH_REF`/`CATCH_ALL_REF`s when there is a `RETHROW` to rethrow its exception. The pass also converts `RETHROW`s into `THROW_REF`. Note that we still use `RETHROW` as an interim pseudo instruction until we convert them to `THROW_REF` in LateEHPrepare. CFGStackify has a new `placeTryTableMarker` function, which places `try_table`/`end_try_table` markers with a necessary `catch` clause and also `block`/`end_block` markers for the destination of the `catch` clause. In MCInstLower, we now need to support one more case for the multivalue block signature (`catch_ref`'s destination's `(i32, exnref)` return type). InstPrinter has a new routine to print the `catch_list` type, which is used to print `try_table` instructions. The new test, `exception.ll`'s source is the same as `exception-legacy.ll`, with the FileCheck expectations changed. One difference is that the commands in this file have `-wasm-enable-exnref` to test the new format, and don't have `-wasm-disable-explicit-locals -wasm-keep-registers`, because the new custom InstPrinter routine to print `catch_list` only works for the stack-based instructions (`_S`), and we can't use `-wasm-keep-registers` for them. As in `exception-legacy.ll`, the FileCheck lines for the new tests do not contain the whole program; they mostly contain only the control flow instructions for readability. --- llvm/include/llvm/BinaryFormat/Wasm.h | 8 + .../AsmParser/WebAssemblyAsmParser.cpp | 10 +- .../MCTargetDesc/WebAssemblyInstPrinter.cpp | 41 ++ .../MCTargetDesc/WebAssemblyInstPrinter.h | 1 + .../MCTargetDesc/WebAssemblyMCTargetDesc.h | 30 ++ .../MCTargetDesc/WebAssemblyMCTypeUtilities.h | 14 +- .../WebAssembly/WebAssemblyAsmPrinter.cpp | 11 + .../WebAssembly/WebAssemblyCFGStackify.cpp | 322 +++++++++++- .../WebAssembly/WebAssemblyISelDAGToDAG.cpp | 5 +- .../WebAssembly/WebAssemblyInstrControl.td | 36 +- .../WebAssembly/WebAssemblyLateEHPrepare.cpp | 76 ++- .../WebAssembly/WebAssemblyMCInstLower.cpp | 22 +- .../WebAssembly/WebAssemblyUtilities.cpp | 2 + .../CodeGen/WebAssembly/exception-legacy.ll | 2 +- llvm/test/CodeGen/WebAssembly/exception.ll | 470 ++++++++++++++++++ 15 files changed, 1009 insertions(+), 41 deletions(-) create mode 100644 llvm/test/CodeGen/WebAssembly/exception.ll diff --git a/llvm/include/llvm/BinaryFormat/Wasm.h b/llvm/include/llvm/BinaryFormat/Wasm.h index acf89885af6fd..9b21d6d65c2a8 100644 --- a/llvm/include/llvm/BinaryFormat/Wasm.h +++ b/llvm/include/llvm/BinaryFormat/Wasm.h @@ -144,6 +144,14 @@ enum : unsigned { WASM_OPCODE_I32_RMW_CMPXCHG = 0x48, }; +// Sub-opcodes for catch clauses in a try_table instruction +enum : unsigned { + WASM_OPCODE_CATCH = 0x00, + WASM_OPCODE_CATCH_REF = 0x01, + WASM_OPCODE_CATCH_ALL = 0x02, + WASM_OPCODE_CATCH_ALL_REF = 0x03, +}; + enum : unsigned { WASM_LIMITS_FLAG_NONE = 0x0, WASM_LIMITS_FLAG_HAS_MAX = 0x1, diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 24a9ad67cfe04..5299e6ea06f0b 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ 
b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -45,7 +45,7 @@ namespace { /// WebAssemblyOperand - Instances of this class represent the operands in a /// parsed Wasm machine instruction. struct WebAssemblyOperand : public MCParsedAsmOperand { - enum KindTy { Token, Integer, Float, Symbol, BrList } Kind; + enum KindTy { Token, Integer, Float, Symbol, BrList, CatchList } Kind; SMLoc StartLoc, EndLoc; @@ -99,6 +99,7 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { bool isMem() const override { return false; } bool isReg() const override { return false; } bool isBrList() const { return Kind == BrList; } + bool isCatchList() const { return Kind == CatchList; } MCRegister getReg() const override { llvm_unreachable("Assembly inspects a register operand"); @@ -151,6 +152,10 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { Inst.addOperand(MCOperand::createImm(Br)); } + void addCatchListOperands(MCInst &Inst, unsigned N) const { + // TODO + } + void print(raw_ostream &OS) const override { switch (Kind) { case Token: @@ -168,6 +173,9 @@ struct WebAssemblyOperand : public MCParsedAsmOperand { case BrList: OS << "BrList:" << BrL.List.size(); break; + case CatchList: + // TODO + break; } } }; diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp index b85ed1d93593b..903dbcd21ea96 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp @@ -367,3 +367,44 @@ void WebAssemblyInstPrinter::printWebAssemblySignatureOperand(const MCInst *MI, } } } + +void WebAssemblyInstPrinter::printCatchList(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + unsigned OpIdx = OpNo; + const MCOperand &Op = MI->getOperand(OpIdx++); + unsigned NumCatches = Op.getImm(); + + auto PrintTagOp = [&](const MCOperand &Op) { + const MCSymbolRefExpr *TagExpr = nullptr; + const MCSymbolWasm *TagSym = nullptr; + assert(Op.isExpr()); + TagExpr = dyn_cast<MCSymbolRefExpr>(Op.getExpr()); + TagSym = cast<MCSymbolWasm>(&TagExpr->getSymbol()); + O << TagSym->getName() << " "; + }; + + for (unsigned I = 0; I < NumCatches; I++) { + const MCOperand &Op = MI->getOperand(OpIdx++); + O << "("; + switch (Op.getImm()) { + case wasm::WASM_OPCODE_CATCH: + O << "catch "; + PrintTagOp(MI->getOperand(OpIdx++)); + break; + case wasm::WASM_OPCODE_CATCH_REF: + O << "catch_ref "; + PrintTagOp(MI->getOperand(OpIdx++)); + break; + case wasm::WASM_OPCODE_CATCH_ALL: + O << "catch_all "; + break; + case wasm::WASM_OPCODE_CATCH_ALL_REF: + O << "catch_all_ref "; + break; + } + O << MI->getOperand(OpIdx++).getImm(); // destination + O << ")"; + if (I < NumCatches - 1) + O << " "; + } +} diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h index 8fd54d1640905..b499926ab8296 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h @@ -47,6 +47,7 @@ class WebAssemblyInstPrinter final : public MCInstPrinter { raw_ostream &O); void printWebAssemblySignatureOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printCatchList(const MCInst *MI, unsigned OpNo, raw_ostream &O); // Autogenerated by tblgen.
std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override; diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 00f15e1db5e13..e3a60fa4812d8 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -87,6 +87,8 @@ enum OperandType { OPERAND_BRLIST, /// 32-bit unsigned table number. OPERAND_TABLE, + /// A list of catch clauses for try_table. + OPERAND_CATCH_LIST, }; } // end namespace WebAssembly @@ -119,6 +121,10 @@ enum TOF { // address relative the __table_base wasm global. // Only applicable to function symbols. MO_TABLE_BASE_REL, + + // On a block signature operand this indicates that this is a destination + // block of a (catch_ref) clause in try_table. + MO_CATCH_BLOCK_SIG, }; } // end namespace WebAssemblyII @@ -462,6 +468,22 @@ inline bool isMarker(unsigned Opc) { case WebAssembly::TRY_S: case WebAssembly::END_TRY: case WebAssembly::END_TRY_S: + case WebAssembly::TRY_TABLE: + case WebAssembly::TRY_TABLE_S: + case WebAssembly::END_TRY_TABLE: + case WebAssembly::END_TRY_TABLE_S: + return true; + default: + return false; + } +} + +inline bool isTry(unsigned Opc) { + switch (Opc) { + case WebAssembly::TRY: + case WebAssembly::TRY_S: + case WebAssembly::TRY_TABLE: + case WebAssembly::TRY_TABLE_S: return true; default: return false; } } @@ -474,6 +496,14 @@ inline bool isCatch(unsigned Opc) { switch (Opc) { case WebAssembly::CATCH_LEGACY: case WebAssembly::CATCH_LEGACY_S: case WebAssembly::CATCH_ALL_LEGACY: case WebAssembly::CATCH_ALL_LEGACY_S: + case WebAssembly::CATCH: + case WebAssembly::CATCH_S: + case WebAssembly::CATCH_REF: + case WebAssembly::CATCH_REF_S: + case WebAssembly::CATCH_ALL: + case WebAssembly::CATCH_ALL_S: + case WebAssembly::CATCH_ALL_REF: + case WebAssembly::CATCH_ALL_REF_S: return true; default: return false; diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h index 063ee4dba9068..4aca092e0e4c4 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTypeUtilities.h @@ -33,11 +33,15 @@ enum class BlockType : unsigned { Externref = unsigned(wasm::ValType::EXTERNREF), Funcref = unsigned(wasm::ValType::FUNCREF), Exnref = unsigned(wasm::ValType::EXNREF), - // Multivalue blocks (and other non-void blocks) are only emitted when the - // blocks will never be exited and are at the ends of functions (see - // WebAssemblyCFGStackify::fixEndsAtEndOfFunction). They also are never made - // to pop values off the stack, so the exact multivalue signature can always - // be inferred from the return type of the parent function in MCInstLower. + // Multivalue blocks are emitted in two cases: + // 1. When the blocks will never be exited and are at the ends of functions + // (see WebAssemblyCFGStackify::fixEndsAtEndOfFunction). In this case the + // exact multivalue signature can always be inferred from the return type + // of the parent function. + // 2. (catch_ref ...) clause in try_table instruction. Currently all tags we + // support (cpp_exception and c_longjmp) throw a single i32, so the + // multivalue signature for this case will be (i32, exnref). + // The real multivalue signature will be added in MCInstLower.
   Multivalue = 0xffff,
 };
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 6dd6145ed0057..14c0eaac17daa 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -683,6 +683,17 @@ void WebAssemblyAsmPrinter::emitInstruction(const MachineInstr *MI) {
     // This is a compiler barrier that prevents instruction reordering during
     // backend compilation, and should not be emitted.
     break;
+  case WebAssembly::CATCH:
+  case WebAssembly::CATCH_S:
+  case WebAssembly::CATCH_REF:
+  case WebAssembly::CATCH_REF_S:
+  case WebAssembly::CATCH_ALL:
+  case WebAssembly::CATCH_ALL_S:
+  case WebAssembly::CATCH_ALL_REF:
+  case WebAssembly::CATCH_ALL_REF_S:
+    // These are pseudo instructions that represent catch clauses in a
+    // try_table instruction to simulate block return values.
+    break;
   default: {
     WebAssemblyMCInstLower MCInstLowering(OutContext, *this);
     MCInst TmpInst;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index 3cccc57e629fd..a5f73fabca354 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -9,9 +9,9 @@
 /// \file
 /// This file implements a CFG stacking pass.
 ///
-/// This pass inserts BLOCK, LOOP, and TRY markers to mark the start of scopes,
-/// since scope boundaries serve as the labels for WebAssembly's control
-/// transfers.
+/// This pass inserts BLOCK, LOOP, TRY, and TRY_TABLE markers to mark the start
+/// of scopes, since scope boundaries serve as the labels for WebAssembly's
+/// control transfers.
 ///
 /// This is sufficient to convert arbitrary CFGs into a form that works on
 /// WebAssembly, provided that all loops are single-entry.
@@ -21,6 +21,7 @@
 ///
 //===----------------------------------------------------------------------===//
 
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
 #include "Utils/WebAssemblyTypeUtilities.h"
 #include "WebAssembly.h"
 #include "WebAssemblyExceptionInfo.h"
@@ -29,6 +30,7 @@
 #include "WebAssemblySubtarget.h"
 #include "WebAssemblyUtilities.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/BinaryFormat/Wasm.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
@@ -74,6 +76,7 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass {
   void placeBlockMarker(MachineBasicBlock &MBB);
   void placeLoopMarker(MachineBasicBlock &MBB);
   void placeTryMarker(MachineBasicBlock &MBB);
+  void placeTryTableMarker(MachineBasicBlock &MBB);
 
   // Exception handling related functions
   bool fixCallUnwindMismatches(MachineFunction &MF);
@@ -97,11 +100,11 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass {
   void fixEndsAtEndOfFunction(MachineFunction &MF);
   void cleanupFunctionData(MachineFunction &MF);
 
-  // For each BLOCK|LOOP|TRY, the corresponding END_(BLOCK|LOOP|TRY) or DELEGATE
-  // (in case of TRY).
+  // For each BLOCK|LOOP|TRY|TRY_TABLE, the corresponding
+  // END_(BLOCK|LOOP|TRY|TRY_TABLE) or DELEGATE (in case of TRY).
   DenseMap<const MachineInstr *, MachineInstr *> BeginToEnd;
-  // For each END_(BLOCK|LOOP|TRY) or DELEGATE, the corresponding
-  // BLOCK|LOOP|TRY.
+  // For each END_(BLOCK|LOOP|TRY|TRY_TABLE) or DELEGATE, the corresponding
+  // BLOCK|LOOP|TRY|TRY_TABLE.
   DenseMap<const MachineInstr *, MachineInstr *> EndToBegin;
   // <TRY marker, EH pad> map
   DenseMap<const MachineInstr *, MachineBasicBlock *> TryToEHPad;
@@ -150,9 +153,10 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass {
 } // end anonymous namespace
 
 char WebAssemblyCFGStackify::ID = 0;
-INITIALIZE_PASS(WebAssemblyCFGStackify, DEBUG_TYPE,
-                "Insert BLOCK/LOOP/TRY markers for WebAssembly scopes", false,
-                false)
+INITIALIZE_PASS(
+    WebAssemblyCFGStackify, DEBUG_TYPE,
+    "Insert BLOCK/LOOP/TRY/TRY_TABLE markers for WebAssembly scopes", false,
+    false)
 
 FunctionPass *llvm::createWebAssemblyCFGStackify() {
   return new WebAssemblyCFGStackify();
@@ -314,12 +318,13 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
 #endif
     }
 
-    // If there is a previously placed BLOCK/TRY marker and its corresponding
-    // END marker is before the current BLOCK's END marker, that should be
-    // placed after this BLOCK. Otherwise it should be placed before this BLOCK
-    // marker.
+    // If there is a previously placed BLOCK/TRY/TRY_TABLE marker and its
+    // corresponding END marker is before the current BLOCK's END marker, that
+    // should be placed after this BLOCK. Otherwise it should be placed before
+    // this BLOCK marker.
     if (MI.getOpcode() == WebAssembly::BLOCK ||
-        MI.getOpcode() == WebAssembly::TRY) {
+        MI.getOpcode() == WebAssembly::TRY ||
+        MI.getOpcode() == WebAssembly::TRY_TABLE) {
       if (BeginToEnd[&MI]->getParent()->getNumber() <= MBB.getNumber())
         AfterSet.insert(&MI);
 #ifndef NDEBUG
@@ -329,10 +334,11 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
     }
 
 #ifndef NDEBUG
-    // All END_(BLOCK|LOOP|TRY) markers should be before the BLOCK.
+    // All END_(BLOCK|LOOP|TRY|TRY_TABLE) markers should be before the BLOCK.
     if (MI.getOpcode() == WebAssembly::END_BLOCK ||
         MI.getOpcode() == WebAssembly::END_LOOP ||
-        MI.getOpcode() == WebAssembly::END_TRY)
+        MI.getOpcode() == WebAssembly::END_TRY ||
+        MI.getOpcode() == WebAssembly::END_TRY_TABLE)
       BeforeSet.insert(&MI);
 #endif
 
@@ -374,6 +380,11 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
     // loop is above this block's header, the END_LOOP should be placed after
     // the END_BLOCK, because the loop contains this block. Otherwise the
     // END_LOOP should be placed before the END_BLOCK. The same for END_TRY.
+    //
+    // Note that while there can be existing END_TRYs, there can't be
+    // END_TRY_TABLEs; END_TRYs are placed when their corresponding EH pad is
+    // processed, so they are placed below MBB (EH pad) in placeTryMarker, but
+    // an END_TRY_TABLE is placed like an END_BLOCK, so it can't be here already.
     if (MI.getOpcode() == WebAssembly::END_LOOP ||
         MI.getOpcode() == WebAssembly::END_TRY) {
       if (EndToBegin[&MI]->getParent()->getNumber() >= Header->getNumber())
@@ -657,7 +668,251 @@ void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) {
   updateScopeTops(Header, End);
 }
 
+void WebAssemblyCFGStackify::placeTryTableMarker(MachineBasicBlock &MBB) {
+  assert(MBB.isEHPad());
+  MachineFunction &MF = *MBB.getParent();
+  auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+  const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  const auto &MLI = getAnalysis<MachineLoopInfoWrapperPass>().getLI();
+  const auto &WEI = getAnalysis<WebAssemblyExceptionInfo>();
+  SortRegionInfo SRI(MLI, WEI);
+  const auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+
+  // Compute the nearest common dominator of all unwind predecessors
+  MachineBasicBlock *Header = nullptr;
+  int MBBNumber = MBB.getNumber();
+  for (auto *Pred : MBB.predecessors()) {
+    if (Pred->getNumber() < MBBNumber) {
+      Header = Header ? MDT.findNearestCommonDominator(Header, Pred) : Pred;
+      assert(!explicitlyBranchesTo(Pred, &MBB) &&
+             "Explicit branch to an EH pad!");
+    }
+  }
+  if (!Header)
+    return;
+
+  assert(&MBB != &MF.front() && "Header blocks shouldn't have predecessors");
+  MachineBasicBlock *LayoutPred = MBB.getPrevNode();
+
+  // If the nearest common dominator is inside a more deeply nested context,
+  // walk out to the nearest scope which isn't more deeply nested.
+  for (MachineFunction::iterator I(LayoutPred), E(Header); I != E; --I) {
+    if (MachineBasicBlock *ScopeTop = ScopeTops[I->getNumber()]) {
+      if (ScopeTop->getNumber() > Header->getNumber()) {
+        // Skip over an intervening scope.
+        I = std::next(ScopeTop->getIterator());
+      } else {
+        // We found a scope level at an appropriate depth.
+        Header = ScopeTop;
+        break;
+      }
+    }
+  }
+
+  // Decide where in Header to put the TRY_TABLE.
+
+  // Instructions that should go before the TRY_TABLE.
+  SmallPtrSet<const MachineInstr *, 4> BeforeSet;
+  // Instructions that should go after the TRY_TABLE.
+  SmallPtrSet<const MachineInstr *, 4> AfterSet;
+  for (const auto &MI : *Header) {
+    // If there is a previously placed LOOP marker and the bottom block of the
+    // loop is above MBB, it should be after the TRY_TABLE, because the loop is
+    // nested in this TRY_TABLE. Otherwise it should be before the TRY_TABLE.
+    if (MI.getOpcode() == WebAssembly::LOOP) {
+      auto *LoopBottom = BeginToEnd[&MI]->getParent()->getPrevNode();
+      if (MBB.getNumber() > LoopBottom->getNumber())
+        AfterSet.insert(&MI);
+#ifndef NDEBUG
+      else
+        BeforeSet.insert(&MI);
+#endif
+    }
+
+    // All previously inserted BLOCK/TRY_TABLE markers should be after the
+    // TRY_TABLE because they are all nested blocks/try_tables.
+    if (MI.getOpcode() == WebAssembly::BLOCK ||
+        MI.getOpcode() == WebAssembly::TRY_TABLE)
+      AfterSet.insert(&MI);
+
+#ifndef NDEBUG
+    // All END_(BLOCK/LOOP/TRY_TABLE) markers should be before the TRY_TABLE.
+    if (MI.getOpcode() == WebAssembly::END_BLOCK ||
+        MI.getOpcode() == WebAssembly::END_LOOP ||
+        MI.getOpcode() == WebAssembly::END_TRY_TABLE)
+      BeforeSet.insert(&MI);
+#endif
+
+    // Terminators should go after the TRY_TABLE.
+    if (MI.isTerminator())
+      AfterSet.insert(&MI);
+  }
+
+  // If Header unwinds to MBB (= Header contains 'invoke'), the try_table block
+  // should contain the call within it. So the call should go after the
+  // TRY_TABLE. The exception is when the header's terminator is a rethrow
+  // instruction, in which case that instruction, not a call instruction before
+  // it, is going to throw.
+  MachineInstr *ThrowingCall = nullptr;
+  if (MBB.isPredecessor(Header)) {
+    auto TermPos = Header->getFirstTerminator();
+    if (TermPos == Header->end() ||
+        TermPos->getOpcode() != WebAssembly::RETHROW) {
+      for (auto &MI : reverse(*Header)) {
+        if (MI.isCall()) {
+          AfterSet.insert(&MI);
+          ThrowingCall = &MI;
+          // Possibly throwing calls are usually wrapped by EH_LABEL
+          // instructions. We don't want to split them and the call.
+          if (MI.getIterator() != Header->begin() &&
+              std::prev(MI.getIterator())->isEHLabel()) {
+            AfterSet.insert(&*std::prev(MI.getIterator()));
+            ThrowingCall = &*std::prev(MI.getIterator());
+          }
+          break;
+        }
+      }
+    }
+  }
+
+  // Local expression tree should go after the TRY_TABLE.
+  // For BLOCK placement, we start the search from the previous instruction of
+  // a BB's terminator, but in TRY_TABLE's case, we should start from the
+  // previous instruction of a call that can throw, or an EH_LABEL that
+  // precedes the call, because the return values of the call's previous
+  // instructions can be stackified and consumed by the throwing call.
+  auto SearchStartPt = ThrowingCall ? MachineBasicBlock::iterator(ThrowingCall)
+                                    : Header->getFirstTerminator();
+  for (auto I = SearchStartPt, E = Header->begin(); I != E; --I) {
+    if (std::prev(I)->isDebugInstr() || std::prev(I)->isPosition())
+      continue;
+    if (WebAssembly::isChild(*std::prev(I), MFI))
+      AfterSet.insert(&*std::prev(I));
+    else
+      break;
+  }
+
+  // Add the TRY_TABLE and a BLOCK for the catch destination. We currently
+  // generate only one CATCH clause for a TRY_TABLE, so we need one BLOCK for
+  // its destination.
+  //
+  // Header:
+  //   block
+  //   try_table (catch ... $MBB)
+  //   ...
+  //
+  // MBB:
+  //   end_try_table
+  //   end_block   ;; destination of (catch ...)
+  //   ... catch handler body ...
+  auto InsertPos = getLatestInsertPos(Header, BeforeSet, AfterSet);
+  MachineInstrBuilder BlockMIB =
+      BuildMI(*Header, InsertPos, Header->findDebugLoc(InsertPos),
+              TII.get(WebAssembly::BLOCK));
+  auto *Block = BlockMIB.getInstr();
+  MachineInstrBuilder TryTableMIB =
+      BuildMI(*Header, InsertPos, Header->findDebugLoc(InsertPos),
+              TII.get(WebAssembly::TRY_TABLE))
+          .addImm(int64_t(WebAssembly::BlockType::Void))
+          .addImm(1); // # of catch clauses
+  auto *TryTable = TryTableMIB.getInstr();
+
+  // Add a CATCH_*** clause to the TRY_TABLE. These are pseudo instructions
+  // following the destination END_BLOCK to simulate block return values,
+  // because we currently don't support them.
+  auto *Catch = WebAssembly::findCatch(&MBB);
+  switch (Catch->getOpcode()) {
+  case WebAssembly::CATCH:
+    // CATCH's destination block's return type is the extracted value type,
+    // which is currently i32 for all supported tags.
+    BlockMIB.addImm(int64_t(WebAssembly::BlockType::I32));
+    TryTableMIB.addImm(wasm::WASM_OPCODE_CATCH);
+    for (const auto &Use : Catch->uses()) {
+      // The only use operand a CATCH can have is the tag symbol.
+      TryTableMIB.addExternalSymbol(Use.getSymbolName());
+      break;
+    }
+    TryTableMIB.addMBB(&MBB);
+    break;
+  case WebAssembly::CATCH_REF:
+    // CATCH_REF's destination block's return type is the extracted value type
+    // followed by an exnref, which is (i32, exnref) in our case. We assign the
+    // actual multivalue signature in MCInstLower. MO_CATCH_BLOCK_SIG signals
+    // that this operand is used for catch_ref's multivalue destination.
+    BlockMIB.addImm(int64_t(WebAssembly::BlockType::Multivalue));
+    Block->getOperand(0).setTargetFlags(WebAssemblyII::MO_CATCH_BLOCK_SIG);
+    TryTableMIB.addImm(wasm::WASM_OPCODE_CATCH_REF);
+    for (const auto &Use : Catch->uses()) {
+      TryTableMIB.addExternalSymbol(Use.getSymbolName());
+      break;
+    }
+    TryTableMIB.addMBB(&MBB);
+    break;
+  case WebAssembly::CATCH_ALL:
+    // CATCH_ALL's destination block's return type is void.
+    BlockMIB.addImm(int64_t(WebAssembly::BlockType::Void));
+    TryTableMIB.addImm(wasm::WASM_OPCODE_CATCH_ALL);
+    TryTableMIB.addMBB(&MBB);
+    break;
+  case WebAssembly::CATCH_ALL_REF:
+    // CATCH_ALL_REF's destination block's return type is exnref.
+    BlockMIB.addImm(int64_t(WebAssembly::BlockType::Exnref));
+    TryTableMIB.addImm(wasm::WASM_OPCODE_CATCH_ALL_REF);
+    TryTableMIB.addMBB(&MBB);
+    break;
+  }
+
+  // Decide where in MBB to put the END_TRY_TABLE, and the END_BLOCK for the
+  // CATCH destination.
+  BeforeSet.clear();
+  AfterSet.clear();
+  for (const auto &MI : MBB) {
+#ifndef NDEBUG
+    // END_TRY_TABLE should precede existing LOOP markers.
+    if (MI.getOpcode() == WebAssembly::LOOP)
+      AfterSet.insert(&MI);
+#endif
+
+    // If there is a previously placed END_LOOP marker and the header of the
+    // loop is above this try_table's header, the END_LOOP should be placed
+    // after the END_TRY_TABLE, because the loop contains this block. Otherwise
+    // the END_LOOP should be placed before the END_TRY_TABLE.
+    if (MI.getOpcode() == WebAssembly::END_LOOP) {
+      if (EndToBegin[&MI]->getParent()->getNumber() >= Header->getNumber())
+        BeforeSet.insert(&MI);
+#ifndef NDEBUG
+      else
+        AfterSet.insert(&MI);
+#endif
+    }
+
+#ifndef NDEBUG
+    // CATCH, CATCH_REF, CATCH_ALL, and CATCH_ALL_REF are pseudo-instructions
+    // that simulate the block return value, so they should be placed after the
+    // END_TRY_TABLE.
+    if (WebAssembly::isCatch(MI.getOpcode()))
+      AfterSet.insert(&MI);
+#endif
+  }
+
+  // Mark the end of the TRY_TABLE and the BLOCK.
+  InsertPos = getEarliestInsertPos(&MBB, BeforeSet, AfterSet);
+  MachineInstr *EndTryTable =
+      BuildMI(MBB, InsertPos, MBB.findPrevDebugLoc(InsertPos),
+              TII.get(WebAssembly::END_TRY_TABLE));
+  registerTryScope(TryTable, EndTryTable, &MBB);
+  MachineInstr *EndBlock =
+      BuildMI(MBB, InsertPos, MBB.findPrevDebugLoc(InsertPos),
+              TII.get(WebAssembly::END_BLOCK));
+  registerScope(Block, EndBlock);
+  // Track the farthest-spanning scope that ends at this point.
+  updateScopeTops(Header, &MBB);
+}
+
 void WebAssemblyCFGStackify::removeUnnecessaryInstrs(MachineFunction &MF) {
+  if (WebAssembly::WasmEnableExnref)
+    return;
+
   const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
 
   // When there is an unconditional branch right before a catch instruction and
@@ -1445,6 +1700,7 @@ void WebAssemblyCFGStackify::recalculateScopeTops(MachineFunction &MF) {
       case WebAssembly::END_BLOCK:
       case WebAssembly::END_LOOP:
      case WebAssembly::END_TRY:
+      case WebAssembly::END_TRY_TABLE:
       case WebAssembly::DELEGATE:
         updateScopeTops(EndToBegin[&MI]->getParent(), &MBB);
         break;
@@ -1502,6 +1758,7 @@ void WebAssemblyCFGStackify::fixEndsAtEndOfFunction(MachineFunction &MF) {
       }
       case WebAssembly::END_BLOCK:
       case WebAssembly::END_LOOP:
+      case WebAssembly::END_TRY_TABLE:
       case WebAssembly::DELEGATE:
         EndToBegin[&MI]->getOperand(0).setImm(int32_t(RetType));
         continue;
@@ -1528,7 +1785,7 @@ static void appendEndToFunction(MachineFunction &MF,
                   TII.get(WebAssembly::END_FUNCTION));
 }
 
-/// Insert BLOCK/LOOP/TRY markers at appropriate places.
+/// Insert BLOCK/LOOP/TRY/TRY_TABLE markers at appropriate places.
 void WebAssemblyCFGStackify::placeMarkers(MachineFunction &MF) {
   // We allocate one more than the number of blocks in the function to
   // accommodate for the possible fake block we may insert at the end.
@@ -1540,15 +1797,25 @@ void WebAssemblyCFGStackify::placeMarkers(MachineFunction &MF) {
   const MCAsmInfo *MCAI = MF.getTarget().getMCAsmInfo();
   for (auto &MBB : MF) {
     if (MBB.isEHPad()) {
-      // Place the TRY for MBB if MBB is the EH pad of an exception.
+      // Place the TRY/TRY_TABLE for MBB if MBB is the EH pad of an exception.
       if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm &&
-          MF.getFunction().hasPersonalityFn())
-        placeTryMarker(MBB);
+          MF.getFunction().hasPersonalityFn()) {
+        if (WebAssembly::WasmEnableExnref)
+          placeTryTableMarker(MBB);
+        else
+          placeTryMarker(MBB);
+      }
     } else {
       // Place the BLOCK for MBB if MBB is branched to from above.
       placeBlockMarker(MBB);
     }
   }
+
+  // FIXME We return here temporarily until we implement fixing unwind
+  // mismatches for the new exnref proposal.
+  if (WebAssembly::WasmEnableExnref)
+    return;
+
   // Fix mismatches in unwind destinations induced by linearizing the code.
   if (MCAI->getExceptionHandlingType() == ExceptionHandling::Wasm &&
       MF.getFunction().hasPersonalityFn()) {
@@ -1668,11 +1935,14 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
   for (auto &MBB : reverse(MF)) {
     for (MachineInstr &MI : llvm::reverse(MBB)) {
       switch (MI.getOpcode()) {
+      case WebAssembly::TRY_TABLE:
+        RewriteOperands(MI);
+        [[fallthrough]];
       case WebAssembly::BLOCK:
       case WebAssembly::TRY:
         assert(ScopeTops[Stack.back().first->getNumber()]->getNumber() <=
                    MBB.getNumber() &&
-               "Block/try marker should be balanced");
+               "Block/try/try_table marker should be balanced");
         Stack.pop_back();
         break;
 
@@ -1687,6 +1957,7 @@ void WebAssemblyCFGStackify::rewriteDepthImmediates(MachineFunction &MF) {
         [[fallthrough]];
       }
       case WebAssembly::END_BLOCK:
+      case WebAssembly::END_TRY_TABLE:
         Stack.push_back(std::make_pair(&MBB, &MI));
         break;
 
@@ -1744,7 +2015,8 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
   // Liveness is not tracked for VALUE_STACK physreg.
   MF.getRegInfo().invalidateLiveness();
 
-  // Place the BLOCK/LOOP/TRY markers to indicate the beginnings of scopes.
+  // Place the BLOCK/LOOP/TRY/TRY_TABLE markers to indicate the beginnings of
+  // scopes.
   placeMarkers(MF);
 
   // Remove unnecessary instructions possibly introduced by try/end_trys.
@@ -1755,8 +2027,8 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
   // Convert MBB operands in terminators to relative depth immediates.
   rewriteDepthImmediates(MF);
 
-  // Fix up block/loop/try signatures at the end of the function to conform to
-  // WebAssembly's rules.
+  // Fix up block/loop/try/try_table signatures at the end of the function to
+  // conform to WebAssembly's rules.
   fixEndsAtEndOfFunction(MF);
 
   // Add an end instruction at the end of the function body.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index 60c5e18fbb0cd..b5b9cbeacfa18 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -211,8 +211,11 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
     case Intrinsic::wasm_catch: {
       int Tag = Node->getConstantOperandVal(2);
       SDValue SymNode = getTagSymNode(Tag, CurDAG);
+      unsigned CatchOpcode = WebAssembly::WasmEnableExnref
+                                 ? WebAssembly::CATCH
+                                 : WebAssembly::CATCH_LEGACY;
       MachineSDNode *Catch =
-          CurDAG->getMachineNode(WebAssembly::CATCH_LEGACY, DL,
+          CurDAG->getMachineNode(CatchOpcode, DL,
                                  {
                                      PtrVT,     // exception pointer
                                      MVT::Other // outchain type
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index 05880b89d1fbc..97ff6d77f54b1 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -125,15 +125,46 @@ defm DEBUG_UNREACHABLE : NRI<(outs), (ins), [(debugtrap)], "unreachable", 0x00>;
 // Exception handling instructions
 //===----------------------------------------------------------------------===//
 
+// A list of catch clauses attached to try_table.
+def CatchListAsmOperand : AsmOperandClass { let Name = "CatchList"; }
+let OperandNamespace = "WebAssembly", OperandType = "OPERAND_CATCH_LIST" in
+def catch_list : Operand<i32> {
+  let ParserMatchClass = CatchListAsmOperand;
+  let PrintMethod = "printCatchList";
+}
+
 let Predicates = [HasExceptionHandling] in {
 
-// Throwing an exception: throw
+// Throwing an exception: throw / throw_ref
 let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
 defm THROW : I<(outs), (ins tag_op:$tag, variable_ops),
                (outs), (ins tag_op:$tag), [],
               "throw \t$tag", "throw \t$tag", 0x08>;
+defm THROW_REF : I<(outs), (ins EXNREF:$exn), (outs), (ins), [],
+                   "throw_ref \t$exn", "throw_ref", 0x0a>;
 } // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
 
+// Region within which an exception is caught: try_table / end_try_table
+let Uses = [VALUE_STACK], Defs = [VALUE_STACK] in {
+defm TRY_TABLE : I<(outs), (ins Signature:$sig, variable_ops),
+                   (outs), (ins Signature:$sig, catch_list:$cal), [],
+                   "try_table \t$sig", "try_table \t$sig $cal", 0x1f>;
+defm END_TRY_TABLE : NRI<(outs), (ins), [], "end_try_table", 0x0b>;
+} // Uses = [VALUE_STACK], Defs = [VALUE_STACK]
+
+// Pseudo instructions that represent catch / catch_ref / catch_all /
+// catch_all_ref clauses in a try_table instruction.
+let hasCtrlDep = 1, hasSideEffects = 1, isCodeGenOnly = 1 in {
+let variadicOpsAreDefs = 1 in {
+defm CATCH : I<(outs), (ins tag_op:$tag, variable_ops),
+               (outs), (ins tag_op:$tag), []>;
+defm CATCH_REF : I<(outs), (ins tag_op:$tag, variable_ops),
+                   (outs), (ins tag_op:$tag), []>;
+}
+defm CATCH_ALL : NRI<(outs), (ins), []>;
+defm CATCH_ALL_REF : I<(outs EXNREF:$dst), (ins), (outs), (ins), []>;
+}
+
 // Pseudo instructions: cleanupret / catchret
 let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
     isPseudo = 1, isEHScopeReturn = 1 in {
@@ -147,9 +178,10 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
 //   usage gets low enough.
 
 // Rethrowing an exception: rethrow
+// The new exnref proposal also uses this instruction as an interim pseudo
+// instruction before we convert it to a THROW_REF.
 let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in
 defm RETHROW : NRI<(outs), (ins i32imm:$depth), [], "rethrow \t$depth", 0x09>;
-// isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
 
 // The depth argument will be computed in CFGStackify. We set it to 0 here for
 // now.
 def : Pat<(int_wasm_rethrow), (RETHROW 0)>;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
index f0c205cdb6aeb..70b406b6552bf 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
@@ -37,6 +37,7 @@ class WebAssemblyLateEHPrepare final : public MachineFunctionPass {
   void recordCatchRetBBs(MachineFunction &MF);
   bool hoistCatches(MachineFunction &MF);
   bool addCatchAlls(MachineFunction &MF);
+  bool addCatchRefsAndThrowRefs(MachineFunction &MF);
   bool replaceFuncletReturns(MachineFunction &MF);
   bool removeUnnecessaryUnreachables(MachineFunction &MF);
   bool restoreStackPointer(MachineFunction &MF);
@@ -127,6 +128,8 @@ bool WebAssemblyLateEHPrepare::runOnMachineFunction(MachineFunction &MF) {
     Changed |= hoistCatches(MF);
     Changed |= addCatchAlls(MF);
     Changed |= replaceFuncletReturns(MF);
+    if (WebAssembly::WasmEnableExnref)
+      Changed |= addCatchRefsAndThrowRefs(MF);
   }
   Changed |= removeUnnecessaryUnreachables(MF);
   if (MF.getFunction().hasPersonalityFn())
@@ -214,9 +217,12 @@ bool WebAssemblyLateEHPrepare::addCatchAlls(MachineFunction &MF) {
     if (InsertPos == MBB.end() ||
         !WebAssembly::isCatch(InsertPos->getOpcode())) {
       Changed = true;
+      unsigned CatchAllOpcode = WebAssembly::WasmEnableExnref
+                                    ? WebAssembly::CATCH_ALL
+                                    : WebAssembly::CATCH_ALL_LEGACY;
       BuildMI(MBB, InsertPos,
               InsertPos == MBB.end() ? DebugLoc() : InsertPos->getDebugLoc(),
-              TII.get(WebAssembly::CATCH_ALL_LEGACY));
+              TII.get(CatchAllOpcode));
     }
   }
   return Changed;
@@ -248,6 +254,10 @@ bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) {
     case WebAssembly::CLEANUPRET: {
       // Replace a cleanupret with a rethrow. For C++ support, currently
       // rethrow's immediate argument is always 0 (= the latest exception).
+      //
+      // Even when -wasm-enable-exnref is true, we use a RETHROW here for the
+      // moment. This will be converted to a THROW_REF in
+      // addCatchRefsAndThrowRefs.
       BuildMI(MBB, TI, TI->getDebugLoc(), TII.get(WebAssembly::RETHROW))
           .addImm(0);
       TI->eraseFromParent();
@@ -259,14 +269,74 @@ bool WebAssemblyLateEHPrepare::replaceFuncletReturns(MachineFunction &MF) {
   return Changed;
 }
 
-// Remove unnecessary unreachables after a throw or rethrow.
+// Add CATCH_REF and CATCH_ALL_REF pseudo instructions to EH pads, and convert
+// RETHROWs to THROW_REFs.
+bool WebAssemblyLateEHPrepare::addCatchRefsAndThrowRefs(MachineFunction &MF) {
+  bool Changed = false;
+  const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+  auto &MRI = MF.getRegInfo();
+  DenseMap<MachineBasicBlock *, SmallVector<MachineInstr *, 2>>
+      EHPadToRethrows;
+
+  // Create a map of <EH pad, a list of RETHROWs>
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      if (MI.getOpcode() == WebAssembly::RETHROW) {
+        Changed = true;
+        auto *EHPad = getMatchingEHPad(&MI);
+        EHPadToRethrows[EHPad].push_back(&MI);
+      }
+    }
+  }
+
+  // Convert CATCH into CATCH_REF and CATCH_ALL into CATCH_ALL_REF, when the
+  // caught exception is rethrown. And convert RETHROWs to THROW_REFs.
+  for (auto &[EHPad, Rethrows] : EHPadToRethrows) {
+    auto *Catch = WebAssembly::findCatch(EHPad);
+    auto *InsertPos = Catch->getIterator()->getNextNode();
+    auto ExnReg = MRI.createVirtualRegister(&WebAssembly::EXNREFRegClass);
+    if (Catch->getOpcode() == WebAssembly::CATCH) {
+      MachineInstrBuilder MIB = BuildMI(*EHPad, InsertPos, Catch->getDebugLoc(),
+                                        TII.get(WebAssembly::CATCH_REF));
+      // Copy defs (= extracted values) from the old CATCH to the new CATCH_REF
+      for (const auto &Def : Catch->defs())
+        MIB.addDef(Def.getReg());
+      MIB.addDef(ExnReg); // Attach the exnref def after extracted values
+      // Copy the tag symbol (The only use operand a CATCH can have is the tag
+      // symbol)
+      for (const auto &Use : Catch->uses()) {
+        MIB.addExternalSymbol(Use.getSymbolName());
+        break;
+      }
+    } else if (Catch->getOpcode() == WebAssembly::CATCH_ALL) {
+      BuildMI(*EHPad, InsertPos, Catch->getDebugLoc(),
+              TII.get(WebAssembly::CATCH_ALL_REF))
+          .addDef(ExnReg);
+    } else {
+      assert(false);
+    }
+    Catch->eraseFromParent();
+
+    for (auto *Rethrow : Rethrows) {
+      auto InsertPos = std::next(Rethrow->getIterator());
+      BuildMI(*Rethrow->getParent(), InsertPos, Rethrow->getDebugLoc(),
+              TII.get(WebAssembly::THROW_REF))
+          .addReg(ExnReg);
+      Rethrow->eraseFromParent();
+    }
+  }
+
+  return Changed;
+}
+
+// Remove unnecessary unreachables after a throw/rethrow/throw_ref.
 bool WebAssemblyLateEHPrepare::removeUnnecessaryUnreachables(
     MachineFunction &MF) {
   bool Changed = false;
   for (auto &MBB : MF) {
     for (auto &MI : MBB) {
       if (MI.getOpcode() != WebAssembly::THROW &&
-          MI.getOpcode() != WebAssembly::RETHROW)
+          MI.getOpcode() != WebAssembly::RETHROW &&
+          MI.getOpcode() != WebAssembly::THROW_REF)
         continue;
       Changed = true;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index 431dc7f33ac89..73ff50f39b020 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "WebAssemblyMCInstLower.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
 #include "TargetInfo/WebAssemblyTargetInfo.h"
 #include "Utils/WebAssemblyTypeUtilities.h"
 #include "WebAssemblyAsmPrinter.h"
@@ -220,12 +221,27 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI,
       MCOp = lowerTypeIndexOperand(std::move(Returns), std::move(Params));
       break;
-    } else if (Info.OperandType == WebAssembly::OPERAND_SIGNATURE) {
+    }
+    if (Info.OperandType == WebAssembly::OPERAND_SIGNATURE) {
       auto BT = static_cast<WebAssembly::BlockType>(MO.getImm());
       assert(BT != WebAssembly::BlockType::Invalid);
       if (BT == WebAssembly::BlockType::Multivalue) {
-        SmallVector<wasm::ValType, 1> Returns;
-        getFunctionReturns(MI, Returns);
+        SmallVector<wasm::ValType, 2> Returns;
+        // Multivalue blocks are emitted in two cases:
+        // 1. When the blocks will never be exited and are at the ends of
+        //    functions (see
+        //    WebAssemblyCFGStackify::fixEndsAtEndOfFunction). In this case
+        //    the exact multivalue signature can always be inferred from the
+        //    return type of the parent function.
+        // 2. (catch_ref ...) clause in try_table instruction. Currently all
+        //    tags we support (cpp_exception and c_longjmp) throw a single
+        //    i32, so the multivalue signature for this case will be (i32,
+        //    exnref). Having MO_CATCH_BLOCK_SIG target flags means this is
+        //    a destination of a catch_ref.
+        if (MO.getTargetFlags() == WebAssemblyII::MO_CATCH_BLOCK_SIG)
+          Returns = {wasm::ValType::I32, wasm::ValType::EXNREF};
+        else
+          getFunctionReturns(MI, Returns);
         MCOp = lowerTypeIndexOperand(std::move(Returns),
                                      SmallVector<wasm::ValType>());
         break;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index c5a047ee47d73..ed186e65a80cf 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -43,6 +43,8 @@ bool WebAssembly::mayThrow(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
   case WebAssembly::THROW:
   case WebAssembly::THROW_S:
+  case WebAssembly::THROW_REF:
+  case WebAssembly::THROW_REF_S:
   case WebAssembly::RETHROW:
   case WebAssembly::RETHROW_S:
     return true;
diff --git a/llvm/test/CodeGen/WebAssembly/exception-legacy.ll b/llvm/test/CodeGen/WebAssembly/exception-legacy.ll
index aa191209516f6..3327d8be894f0 100644
--- a/llvm/test/CodeGen/WebAssembly/exception-legacy.ll
+++ b/llvm/test/CodeGen/WebAssembly/exception-legacy.ll
@@ -109,7 +109,7 @@ ehcleanup:                                        ; preds = %entry
 }
 
 ; Calling a function that may throw within a 'catch (...)' generates a
-; temrinatepad, because __cxa_end_catch() also can throw within 'catch (...)'.
+; terminatepad, because __cxa_end_catch() also can throw within 'catch (...)'.
 ;
 ; void foo();
 ; void terminatepad() {
diff --git a/llvm/test/CodeGen/WebAssembly/exception.ll b/llvm/test/CodeGen/WebAssembly/exception.ll
new file mode 100644
index 0000000000000..7259761d6313b
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/exception.ll
@@ -0,0 +1,470 @@
+; RUN: llc < %s -asm-verbose=false -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling -wasm-enable-exnref -verify-machineinstrs | FileCheck --implicit-check-not=ehgcr -allow-deprecated-dag-overlap %s
+; RUN: llc < %s -asm-verbose=false -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling -wasm-enable-exnref -verify-machineinstrs -O0
+; RUN: llc < %s -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling -wasm-enable-exnref
+
+target triple = "wasm32-unknown-unknown"
+
+%struct.Temp = type { i8 }
+
+@_ZTIi = external dso_local constant ptr
+
+; CHECK: .tagtype __cpp_exception i32
+
+; CHECK-LABEL: throw:
+; CHECK: throw __cpp_exception
+; CHECK-NOT: unreachable
+define void @throw(ptr %p) {
+  call void @llvm.wasm.throw(i32 0, ptr %p)
+  ret void
+}
+
+; Simple test with a try-catch
+;
+; void foo();
+; void catch() {
+;   try {
+;     foo();
+;   } catch (int) {
+;   }
+; }
+
+; CHECK-LABEL: catch:
+; CHECK: global.get __stack_pointer
+; CHECK: local.set 0
+; CHECK: block
+; CHECK: block () -> (i32, exnref)
+; CHECK: try_table (catch_ref __cpp_exception 0)
+; CHECK: call foo
+; CHECK: br 2
+; CHECK: end_try_table
+; CHECK: end_block
+; CHECK: local.set 2
+; CHECK: local.get 0
+; CHECK: global.set __stack_pointer
+; CHECK: i32.store __wasm_lpad_context
+; CHECK: call _Unwind_CallPersonality
+; CHECK: block
+; CHECK: br_if 0
+; CHECK: call __cxa_begin_catch
+; CHECK: call __cxa_end_catch
+; CHECK: br 1
+; CHECK: end_block
+; CHECK: local.get 2
+; CHECK: throw_ref
+; CHECK: end_block
+define void @catch() personality ptr @__gxx_wasm_personality_v0 {
+entry:
+  invoke void @foo()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %catch.start] unwind to caller
+
+catch.start:                                      ; preds = %catch.dispatch
+  %1 = catchpad within %0 [ptr @_ZTIi]
+  %2 = call ptr @llvm.wasm.get.exception(token %1)
+  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+  %4 = call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+  %matches = icmp eq i32 %3, %4
+  br i1 %matches, label %catch, label %rethrow
+
+catch:                                            ; preds = %catch.start
+  %5 = call ptr @__cxa_begin_catch(ptr %2) [ "funclet"(token %1) ]
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+rethrow:                                          ; preds = %catch.start
+  call void @llvm.wasm.rethrow() [ "funclet"(token %1) ]
+  unreachable
+
+try.cont:                                         ; preds = %catch, %entry
+  ret void
+}
+
+; Destructor (cleanup) test
+;
+; void foo();
+; struct Temp {
+;   ~Temp() {}
+; };
+; void cleanup() {
+;   Temp t;
+;   foo();
+; }
+
+; CHECK-LABEL: cleanup:
+; CHECK: block
+; CHECK: block exnref
+; CHECK: try_table (catch_all_ref 0)
+; CHECK: call foo
+; CHECK: br 2
+; CHECK: end_try_table
+; CHECK: end_block
+; CHECK: local.set 1
+; CHECK: global.set __stack_pointer
+; CHECK: call _ZN4TempD2Ev
+; CHECK: local.get 1
+; CHECK: throw_ref
+; CHECK: end_block
+; CHECK: call _ZN4TempD2Ev
+define void @cleanup() personality ptr @__gxx_wasm_personality_v0 {
+entry:
+  %t = alloca %struct.Temp, align 1
+  invoke void @foo()
+          to label %invoke.cont unwind label %ehcleanup
+
+invoke.cont:                                      ; preds = %entry
+  %call = call ptr @_ZN4TempD2Ev(ptr %t)
+  ret void
+
+ehcleanup:                                        ; preds = %entry
+  %0 = cleanuppad within none []
+  %call1 = call ptr @_ZN4TempD2Ev(ptr %t) [ "funclet"(token %0) ]
+  cleanupret from %0 unwind to caller
+}
+
+; Calling a function that may throw within a 'catch (...)' generates a
+; terminatepad, because __cxa_end_catch() also can throw within 'catch (...)'.
+;
+; void foo();
+; void terminatepad() {
+;   try {
+;     foo();
+;   } catch (...) {
+;     foo();
+;   }
+; }
+
+; CHECK-LABEL: terminatepad
+; CHECK: block
+; CHECK: block i32
+; CHECK: try_table (catch __cpp_exception 0)
+; CHECK: call foo
+; CHECK: br 2
+; CHECK: end_try_table
+; CHECK: end_block
+; CHECK: call __cxa_begin_catch
+; CHECK: block
+; CHECK: block exnref
+; CHECK: try_table (catch_all_ref 0)
+; CHECK: call foo
+; CHECK: br 2
+; CHECK: end_try_table
+; CHECK: end_block
+; CHECK: local.set 2
+; CHECK: block
+; CHECK: block
+; CHECK: try_table (catch_all 0)
+; CHECK: call __cxa_end_catch
+; CHECK: br 2
+; CHECK: end_try_table
+; CHECK: end_block
+; CHECK: call _ZSt9terminatev
+; CHECK: unreachable
+; CHECK: end_block
+; CHECK: local.get 2
+; CHECK: throw_ref
+; CHECK: end_block
+; CHECK: call __cxa_end_catch
+; CHECK: end_block
+define void @terminatepad() personality ptr @__gxx_wasm_personality_v0 {
+entry:
+  invoke void @foo()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %catch.start] unwind to caller
+
+catch.start:                                      ; preds = %catch.dispatch
+  %1 = catchpad within %0 [ptr null]
+  %2 = call ptr @llvm.wasm.get.exception(token %1)
+  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+  %4 = call ptr @__cxa_begin_catch(ptr %2) [ "funclet"(token %1) ]
+  invoke void @foo() [ "funclet"(token %1) ]
+          to label %invoke.cont1 unwind label %ehcleanup
+
+invoke.cont1:                                     ; preds = %catch.start
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+try.cont:                                         ; preds = %invoke.cont1, %entry
+  ret void
+
+ehcleanup:                                        ; preds = %catch.start
+  %5 = cleanuppad within %1 []
+  invoke void @__cxa_end_catch() [ "funclet"(token %5) ]
+          to label %invoke.cont2 unwind label %terminate
+
+invoke.cont2:                                     ; preds = %ehcleanup
+  cleanupret from %5 unwind to caller
+
+terminate:                                        ; preds = %ehcleanup
+  %6 = cleanuppad within %5 []
+  call void @_ZSt9terminatev() [ "funclet"(token %6) ]
+  unreachable
+}
+
+; Tests prologues and epilogues are not generated within EH scopes.
+; They should not be treated as funclets; BBs starting with a catch instruction
+; should not have a prologue, and BBs ending with a catchret/cleanupret should
+; not have an epilogue. This is separate from __stack_pointer restoring
+; instructions after a catch instruction.
+;
+; void bar(int) noexcept;
+; void no_prolog_epilog_in_ehpad() {
+;   int stack_var = 0;
+;   bar(stack_var);
+;   try {
+;     foo();
+;   } catch (int) {
+;     foo();
+;   }
+; }
+
+; CHECK-LABEL: no_prolog_epilog_in_ehpad
+; CHECK: call bar
+; CHECK: block
+; CHECK: block () -> (i32, exnref)
+; CHECK: try_table (catch_ref __cpp_exception 0)
+; CHECK: call foo
+; CHECK: br 2
+; CHECK: end_try_table
+; CHECK: end_block
+; CHECK: local.set 2
+; CHECK-NOT: global.get __stack_pointer
+; CHECK: global.set __stack_pointer
+; CHECK: block
+; CHECK: block
+; CHECK: br_if 0
+; CHECK: call __cxa_begin_catch
+; CHECK: block exnref
+; CHECK: try_table (catch_all_ref 0)
+; CHECK: call foo
+; CHECK: br 3
+; CHECK: end_try_table
+; CHECK: end_block
+; CHECK: local.set 2
+; CHECK-NOT: global.get __stack_pointer
+; CHECK: global.set __stack_pointer
+; CHECK: call __cxa_end_catch
+; CHECK: local.get 2
+; CHECK: throw_ref
+; CHECK-NOT: global.set __stack_pointer
+; CHECK: end_block
+; CHECK: local.get 2
+; CHECK: throw_ref
+; CHECK: end_block
+; CHECK-NOT: global.set __stack_pointer
+; CHECK: call __cxa_end_catch
+; CHECK: end_block
+define void @no_prolog_epilog_in_ehpad() personality ptr @__gxx_wasm_personality_v0 {
+entry:
+  %stack_var = alloca i32, align 4
+  call void @bar(ptr %stack_var)
+  invoke void @foo()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %catch.start] unwind to caller
+
+catch.start:                                      ; preds = %catch.dispatch
+  %1 = catchpad within %0 [ptr @_ZTIi]
+  %2 = call ptr @llvm.wasm.get.exception(token %1)
+  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+  %4 = call i32 @llvm.eh.typeid.for(ptr @_ZTIi)
+  %matches = icmp eq i32 %3, %4
+  br i1 %matches, label %catch, label %rethrow
+
+catch:                                            ; preds = %catch.start
+  %5 = call ptr @__cxa_begin_catch(ptr %2) [ "funclet"(token %1) ]
+  %6 = load float, ptr %5, align 4
+  invoke void @foo() [ "funclet"(token %1) ]
+          to label %invoke.cont1 unwind label %ehcleanup
+
+invoke.cont1:                                     ; preds = %catch
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+rethrow:                                          ; preds = %catch.start
+  call void @llvm.wasm.rethrow() [ "funclet"(token %1) ]
+  unreachable
+
+try.cont:                                         ; preds = %invoke.cont1, %entry
+  ret void
+
+ehcleanup:                                        ; preds = %catch
+  %7 = cleanuppad within %1 []
+  call void @__cxa_end_catch() [ "funclet"(token %7) ]
+  cleanupret from %7 unwind to caller
+}
+
+; When a function does not have stack-allocated objects, it does not need to
+; store SP back to __stack_pointer global at the epilog.
+;
+; void foo();
+; void no_sp_writeback() {
+;   try {
+;     foo();
+;   } catch (...)
{
+;   }
+; }
+
+; CHECK-LABEL: no_sp_writeback
+; CHECK: block
+; CHECK: block i32
+; CHECK: try_table (catch __cpp_exception 0)
+; CHECK: call foo
+; CHECK: br 2
+; CHECK: end_try_table
+; CHECK: end_block
+; CHECK: call __cxa_begin_catch
+; CHECK: call __cxa_end_catch
+; CHECK: end_block
+; CHECK-NOT: global.set __stack_pointer
+; CHECK: end_function
+define void @no_sp_writeback() personality ptr @__gxx_wasm_personality_v0 {
+entry:
+  invoke void @foo()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %catch.start] unwind to caller
+
+catch.start:                                      ; preds = %catch.dispatch
+  %1 = catchpad within %0 [ptr null]
+  %2 = call ptr @llvm.wasm.get.exception(token %1)
+  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+  %4 = call ptr @__cxa_begin_catch(ptr %2) [ "funclet"(token %1) ]
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+try.cont:                                         ; preds = %catch.start, %entry
+  ret void
+}
+
+; When the result of @llvm.wasm.get.exception is not used. This is created to
+; fix a bug in LateEHPrepare and this should not crash.
+define void @get_exception_wo_use() personality ptr @__gxx_wasm_personality_v0 {
+entry:
+  invoke void @foo()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %catch.start] unwind to caller
+
+catch.start:                                      ; preds = %catch.dispatch
+  %1 = catchpad within %0 [ptr null]
+  %2 = call ptr @llvm.wasm.get.exception(token %1)
+  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+  catchret from %1 to label %try.cont
+
+try.cont:                                         ; preds = %catch.start, %entry
+  ret void
+}
+
+; Tests a case when a cleanup region (cleanuppad ~ cleanupret) contains another
+; catchpad
+define void @complex_cleanup_region() personality ptr @__gxx_wasm_personality_v0 {
+entry:
+  invoke void @foo()
+          to label %invoke.cont unwind label %ehcleanup
+
+invoke.cont:                                      ; preds = %entry
+  ret void
+
+ehcleanup:                                        ; preds = %entry
+  %0 = cleanuppad within none []
+  invoke void @foo() [ "funclet"(token %0) ]
+          to label %ehcleanupret unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %ehcleanup
+  %1 = catchswitch within %0 [label %catch.start] unwind label %ehcleanup.1
+
+catch.start:                                      ; preds = %catch.dispatch
+  %2 = catchpad within %1 [ptr null]
+  %3 = call ptr @llvm.wasm.get.exception(token %2)
+  %4 = call i32 @llvm.wasm.get.ehselector(token %2)
+  catchret from %2 to label %ehcleanupret
+
+ehcleanup.1:                                      ; preds = %catch.dispatch
+  %5 = cleanuppad within %0 []
+  unreachable
+
+ehcleanupret:                                     ; preds = %catch.start, %ehcleanup
+  cleanupret from %0 unwind to caller
+}
+
+; Regression test for the bug that 'rethrow' was not treated correctly as a
+; terminator in isel.
+define void @rethrow_terminator() personality ptr @__gxx_wasm_personality_v0 {
+entry:
+  invoke void @foo()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %0 = catchswitch within none [label %catch.start] unwind label %ehcleanup
+
+catch.start:                                      ; preds = %catch.dispatch
+  %1 = catchpad within %0 [ptr @_ZTIi]
+  %2 = call ptr @llvm.wasm.get.exception(token %1)
+  %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+  %4 = call i32 @llvm.eh.typeid.for.p0(ptr @_ZTIi)
+  %matches = icmp eq i32 %3, %4
+  br i1 %matches, label %catch, label %rethrow
+
+catch:                                            ; preds = %catch.start
+  %5 = call ptr @__cxa_begin_catch(ptr %2) [ "funclet"(token %1) ]
+  %6 = load i32, ptr %5, align 4
+  call void @__cxa_end_catch() [ "funclet"(token %1) ]
+  catchret from %1 to label %try.cont
+
+rethrow:                                          ; preds = %catch.start
+  invoke void @llvm.wasm.rethrow() #1 [ "funclet"(token %1) ]
+          to label %unreachable unwind label %ehcleanup
+
+try.cont:                                         ; preds = %entry, %catch
+  ret void
+
+ehcleanup:                                        ; preds = %rethrow, %catch.dispatch
+  ; 'rethrow' BB is this BB's predecessor, and its
+  ; 'invoke void @llvm.wasm.rethrow()' is lowered down to a 'RETHROW' in Wasm
+  ; MIR. And this 'phi' creates 'CONST_I32' instruction in the predecessor
+  ; 'rethrow' BB. If 'RETHROW' is not treated correctly as a terminator, it can
+  ; create a BB like
+  ; bb.3.rethrow:
+  ;   RETHROW 0
+  ;   %0 = CONST_I32 20
+  ;   BR ...
+  %tmp = phi i32 [ 10, %catch.dispatch ], [ 20, %rethrow ]
+  %7 = cleanuppad within none []
+  call void @take_i32(i32 %tmp) [ "funclet"(token %7) ]
+  cleanupret from %7 unwind to caller
+
+unreachable:                                      ; preds = %rethrow
+  unreachable
+}
+
+
+declare void @foo()
+declare void @bar(ptr)
+declare void @take_i32(i32)
+declare i32 @__gxx_wasm_personality_v0(...)
+; Function Attrs: noreturn
+declare void @llvm.wasm.throw(i32, ptr) #1
+; Function Attrs: nounwind
+declare ptr @llvm.wasm.get.exception(token) #0
+; Function Attrs: nounwind
+declare i32 @llvm.wasm.get.ehselector(token) #0
+; Function Attrs: noreturn
+declare void @llvm.wasm.rethrow() #1
+; Function Attrs: nounwind
+declare i32 @llvm.eh.typeid.for(ptr) #0
+declare ptr @__cxa_begin_catch(ptr)
+declare void @__cxa_end_catch()
+declare void @_ZSt9terminatev()
+declare ptr @_ZN4TempD2Ev(ptr returned)
+
+attributes #0 = { nounwind }
+attributes #1 = { noreturn }
+
+; CHECK: __cpp_exception:

From d03822d8887adc9312e65abf8d8ce1a16007f2a0 Mon Sep 17 00:00:00 2001
From: Timm Baeder
Date: Wed, 11 Sep 2024 07:21:49 +0200
Subject: [PATCH 055/114] [clang][bytecode] Fix lookup of source locations in
 implicit ctors (#107992)

Implicit functions may still have a body. The !hasBody() check is
enough.
---
 clang/lib/AST/ByteCode/InterpFrame.cpp        | 19 ++++++++++++++-----
 clang/test/AST/ByteCode/builtin-functions.cpp |  7 +++++++
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpFrame.cpp b/clang/lib/AST/ByteCode/InterpFrame.cpp
index 6830a7b37f1da..28e189bb339e6 100644
--- a/clang/lib/AST/ByteCode/InterpFrame.cpp
+++ b/clang/lib/AST/ByteCode/InterpFrame.cpp
@@ -207,31 +207,40 @@ Pointer InterpFrame::getParamPointer(unsigned Off) {
   return Pointer(B);
 }
 
+static bool funcHasUsableBody(const Function *F) {
+  assert(F);
+
+  if (F->isConstructor() || F->isDestructor())
+    return true;
+
+  return !F->getDecl()->isImplicit();
+}
+
 SourceInfo InterpFrame::getSource(CodePtr PC) const {
   // Implicitly created functions don't have any code we could point at,
   // so return the call site.
-  if (Func && (!Func->hasBody() || Func->getDecl()->isImplicit()) && Caller)
+  if (Func && !funcHasUsableBody(Func) && Caller)
     return Caller->getSource(RetPC);
 
   return S.getSource(Func, PC);
 }
 
 const Expr *InterpFrame::getExpr(CodePtr PC) const {
-  if (Func && (!Func->hasBody() || Func->getDecl()->isImplicit()) && Caller)
-    return Caller->getExpr(RetPC);
+  if (Func && !funcHasUsableBody(Func) && Caller)
+    return Caller->getExpr(PC);
 
   return S.getExpr(Func, PC);
 }
 
 SourceLocation InterpFrame::getLocation(CodePtr PC) const {
-  if (Func && (!Func->hasBody() || Func->getDecl()->isImplicit()) && Caller)
+  if (Func && !funcHasUsableBody(Func) && Caller)
     return Caller->getLocation(RetPC);
 
   return S.getLocation(Func, PC);
 }
 
 SourceRange InterpFrame::getRange(CodePtr PC) const {
-  if (Func && (!Func->hasBody() || Func->getDecl()->isImplicit()) && Caller)
+  if (Func && !funcHasUsableBody(Func) && Caller)
     return Caller->getRange(RetPC);
 
   return S.getRange(Func, PC);
diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp
index 9c9ca23e0a6a6..9fd5eae67a21f 100644
--- a/clang/test/AST/ByteCode/builtin-functions.cpp
+++ b/clang/test/AST/ByteCode/builtin-functions.cpp
@@ -968,3 +968,10 @@ namespace FunctionStart {
   static_assert(__builtin_function_start(a) == a, ""); // both-error {{not an integral constant expression}} \
                                                        // both-note {{comparison of addresses of literals has unspecified value}}
 }
+
+namespace BuiltinInImplicitCtor {
+  constexpr struct {
+    int a = __builtin_isnan(1.0);
+  } Foo;
+  static_assert(Foo.a == 0, "");
+}

From 323911de277087b4898a96760c065a28f5d1bfa7 Mon Sep 17 00:00:00 2001
From: ChiaHungDuan
Date: Tue, 10 Sep 2024 22:24:06 -0700
Subject: [PATCH 056/114] Reapply "[scudo] Fix the logic of
 MaxAllowedFragmentedPages" (#108130) (#108134)

This reverts commit 76151c449080b7239c8b442291514a4300d51cba.

Also changed to check MaxAllowedFragmentedPages.
---
 compiler-rt/lib/scudo/standalone/secondary.h | 25 +++++++++++++-------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h
index 1a232b9b9fb2d..2fae29e5a2168 100644
--- a/compiler-rt/lib/scudo/standalone/secondary.h
+++ b/compiler-rt/lib/scudo/standalone/secondary.h
@@ -72,13 +72,16 @@ namespace {
 struct CachedBlock {
   static constexpr u16 CacheIndexMax = UINT16_MAX;
   static constexpr u16 InvalidEntry = CacheIndexMax;
-  // * MaxReleasedCachePages default is currently 4
-  //   - We arrived at this value after noticing that mapping
-  //   in larger memory regions performs better than releasing
-  //   memory and forcing a cache hit. According to the data,
-  //   it suggests that beyond 4 pages, the release execution time is
-  //   longer than the map execution time. In this way, the default
-  //   is dependent on the platform.
+  // We allow a certain amount of fragmentation, and part of the fragmented
+  // bytes will be released by `releaseAndZeroPagesToOS()`. This increases the
+  // chance of a cache hit and reduces RSS overhead at the same time. See
+  // more details in the `MapAllocatorCache::retrieve()` section.
+  //
+  // We arrived at this default value after noticing that mapping in larger
+  // memory regions performs better than releasing memory and forcing a cache
+  // hit. The data suggests that beyond 4 pages, the release execution time is
+  // longer than the map execution time. In this way, the default is dependent
+  // on the platform.
   static constexpr uptr MaxReleasedCachePages = 4U;
 
   uptr CommitBase = 0;
@@ -725,8 +728,14 @@ MapAllocator<Config>::tryAllocateFromCache(const Options &Options, uptr Size,
   uptr EntryHeaderPos;
   uptr MaxAllowedFragmentedPages = MaxUnreleasedCachePages;
 
-  if (UNLIKELY(useMemoryTagging<Config>(Options)))
+  if (LIKELY(!useMemoryTagging<Config>(Options))) {
     MaxAllowedFragmentedPages += CachedBlock::MaxReleasedCachePages;
+  } else {
+    // TODO: Enabling MaxReleasedCachePages may result in pages for an entry
+    // being partially released, and it erases the tag of those pages as well.
+    // To support this feature for MTE, we need to tag those pages again.
+    DCHECK_EQ(MaxAllowedFragmentedPages, MaxUnreleasedCachePages);
+  }
 
   Entry = Cache.retrieve(MaxAllowedFragmentedPages, Size, Alignment,
                          getHeadersSize(), EntryHeaderPos);

From 203a2ca8cd6af505e11a38aebceeaf864271042c Mon Sep 17 00:00:00 2001
From: Ryosuke Niwa
Date: Tue, 10 Sep 2024 22:25:03 -0700
Subject: [PATCH 057/114] [webkit.RefCntblBaseVirtualDtor] Make
 ThreadSafeRefCounted not generate warnings (#107676)

This PR makes WebKit's RefCntblBaseVirtualDtor checker not generate a
warning for ThreadSafeRefCounted when the destruction thread is a
specific thread.

Prior to this PR, we only allowed CRTP classes without a virtual
destructor if their deref function had an explicit cast to the derived
type, skipping any lambda declarations which aren't invoked. This ends
up generating a warning for ThreadSafeRefCounted when a specific thread
is used to destroy the object: there is no inline body / definition for
ensureOnMainThread and ensureOnMainRunLoop, so DerefFuncDeleteExprVisitor
concludes that there is no explicit delete of the derived type.

This PR relaxes the condition DerefFuncDeleteExprVisitor checks by
allowing a delete expression to appear within a lambda declaration if
it's an argument to an "opaque" function, i.e. a function without a
definition / body.
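To illustrate, here is a reduced sketch of the pattern this change is
meant to accept (adapted from the test added below; this is an
illustrative simplification, not the full test):

    // ensureOnMainThread is declared but has no visible definition, so
    // the checker treats it as "opaque" and now inspects the lambda
    // argument for the explicit delete of the derived type.
    template <class T, DestructionThread destructionThread>
    class ThreadSafeRefCounted : public ThreadSafeRefCountedBase {
    public:
        void deref() const
        {
            if (!derefBase())
                return;
            if constexpr (destructionThread == DestructionThread::Main) {
                ensureOnMainThread([this] {
                    delete static_cast<const T*>(this); // found by the relaxed visitor
                });
            }
        }
    };

Note that a lambda that is merely declared but never passed to such an
opaque function (see BadThreadSafeRefCounted in the test) still does not
count as a delete of the derived type.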
---
 .../WebKit/RefCntblBaseVirtualDtorChecker.cpp |  26 ++
 .../ref-cntbl-crtp-base-no-virtual-dtor.cpp   | 232 ++++++++++++++++++
 2 files changed, 258 insertions(+)
 create mode 100644 clang/test/Analysis/Checkers/WebKit/ref-cntbl-crtp-base-no-virtual-dtor.cpp

diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp
index 9df108e28ecdb..ecba5f9aa23ee 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RefCntblBaseVirtualDtorChecker.cpp
@@ -67,6 +67,32 @@ class DerefFuncDeleteExprVisitor
     const Decl *D = CE->getCalleeDecl();
     if (D && D->hasBody())
       return VisitBody(D->getBody());
+    else {
+      auto name = safeGetName(D);
+      if (name == "ensureOnMainThread" || name == "ensureOnMainRunLoop") {
+        for (unsigned i = 0; i < CE->getNumArgs(); ++i) {
+          auto *Arg = CE->getArg(i);
+          if (VisitLambdaArgument(Arg))
+            return true;
+        }
+      }
+    }
     return false;
   }
 
+  bool VisitLambdaArgument(const Expr *E) {
+    E = E->IgnoreParenCasts();
+    if (auto *TempE = dyn_cast<MaterializeTemporaryExpr>(E))
+      E = TempE->getSubExpr();
+    if (auto *ConstructE = dyn_cast<CXXConstructExpr>(E)) {
+      for (unsigned i = 0; i < ConstructE->getNumArgs(); ++i) {
+        auto *Arg = ConstructE->getArg(i);
+        if (auto *Lambda = dyn_cast<LambdaExpr>(Arg)) {
+          if (VisitBody(Lambda->getBody()))
+            return true;
+        }
+      }
+    }
+    return false;
+  }
+
diff --git a/clang/test/Analysis/Checkers/WebKit/ref-cntbl-crtp-base-no-virtual-dtor.cpp b/clang/test/Analysis/Checkers/WebKit/ref-cntbl-crtp-base-no-virtual-dtor.cpp
new file mode 100644
index 0000000000000..01527addb5299
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/ref-cntbl-crtp-base-no-virtual-dtor.cpp
@@ -0,0 +1,232 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=webkit.RefCntblBaseVirtualDtor -verify %s
+
+#include "mock-types.h"
+
+namespace Detail {
+
+template <typename Out, typename... In>
+class CallableWrapperBase {
+public:
+    virtual ~CallableWrapperBase() { }
+    virtual Out call(In...) = 0;
+};
+
+template <typename, typename, typename...> class CallableWrapper;
+
+template <typename CallableType, typename Out, typename... In>
+class CallableWrapper : public CallableWrapperBase<Out, In...> {
+public:
+    explicit CallableWrapper(CallableType&& callable)
+        : m_callable(WTFMove(callable)) { }
+    CallableWrapper(const CallableWrapper&) = delete;
+    CallableWrapper& operator=(const CallableWrapper&) = delete;
+    Out call(In... in) final;
+private:
+    CallableType m_callable;
+};
+
+} // namespace Detail
+
+template <typename> class Function;
+
+template <typename Out, typename... In>
+Function<Out(In...)> adopt(Detail::CallableWrapperBase<Out, In...>*);
+
+template <typename Out, typename... In>
+class Function<Out(In...)> {
+public:
+    using Impl = Detail::CallableWrapperBase<Out, In...>;
+
+    Function() = default;
+
+    template <typename FunctionType>
+    Function(FunctionType f);
+
+    Out operator()(In... in) const;
+    explicit operator bool() const { return !!m_callableWrapper; }
+
+private:
+    enum AdoptTag { Adopt };
+    Function(Impl* impl, AdoptTag)
+        : m_callableWrapper(impl)
+    {
+    }
+
+    friend Function adopt<Out, In...>(Impl*);
+
+    Impl* m_callableWrapper;
+};
+
+template <typename Out, typename... In>
+Function<Out(In...)> adopt(Detail::CallableWrapperBase<Out, In...>* impl)
+{
+    return Function<Out(In...)>(impl, Function<Out(In...)>::Adopt);
+}
+
+template <typename T, typename PtrTraits = RawPtrTraits<T>, typename RefDerefTraits = DefaultRefDerefTraits<T>> Ref<T, PtrTraits, RefDerefTraits> adoptRef(T&);
+
+template <typename T, typename PtrTraits, typename RefDerefTraits>
+inline Ref<T, PtrTraits, RefDerefTraits> adoptRef(T& reference)
+{
+    return Ref<T, PtrTraits, RefDerefTraits>(reference);
+}
+
+enum class DestructionThread : unsigned char { Any, Main, MainRunLoop };
+void ensureOnMainThread(Function<void()>&&); // Sync if called on main thread, async otherwise.
+void ensureOnMainRunLoop(Function<void()>&&); // Sync if called on main run loop, async otherwise.
+
+class ThreadSafeRefCountedBase {
+public:
+    ThreadSafeRefCountedBase() = default;
+
+    void ref() const
+    {
+        ++m_refCount;
+    }
+
+    bool hasOneRef() const
+    {
+        return refCount() == 1;
+    }
+
+    unsigned refCount() const
+    {
+        return m_refCount;
+    }
+
+protected:
+    bool derefBase() const
+    {
+        if (!--m_refCount) {
+            m_refCount = 1;
+            return true;
+        }
+        return false;
+    }
+
+private:
+    mutable unsigned m_refCount { 1 };
+};
+
+template <class T, DestructionThread destructionThread = DestructionThread::Any>
+class ThreadSafeRefCounted : public ThreadSafeRefCountedBase {
+public:
+    void deref() const
+    {
+        if (!derefBase())
+            return;
+
+        if constexpr (destructionThread == DestructionThread::Any) {
+            delete static_cast<const T*>(this);
+        } else if constexpr (destructionThread == DestructionThread::Main) {
+            ensureOnMainThread([this] {
+                delete static_cast<const T*>(this);
+            });
+        }
+    }
+
+protected:
+    ThreadSafeRefCounted() = default;
+};
+
+class FancyRefCountedClass final : public ThreadSafeRefCounted<FancyRefCountedClass, DestructionThread::Main> {
+public:
+    static Ref<FancyRefCountedClass> create()
+    {
+        return adoptRef(*new FancyRefCountedClass());
+    }
+
+    virtual ~FancyRefCountedClass();
+
+private:
+    FancyRefCountedClass();
+};
+
+template <class T>
+class BadThreadSafeRefCounted : public ThreadSafeRefCountedBase {
+public:
+    void deref() const
+    {
+        if (!derefBase())
+            return;
+
+        [this] {
+            delete static_cast<const T*>(this);
+        };
+    }
+
+protected:
+    BadThreadSafeRefCounted() = default;
+};
+
+class FancyRefCountedClass2 final : public ThreadSafeRefCounted<FancyRefCountedClass2, DestructionThread::MainRunLoop> {
+// expected-warning@-1{{Class 'ThreadSafeRefCounted' is used as a base of class 'FancyRefCountedClass2' but doesn't have virtual destructor}}
+public:
+    static Ref<FancyRefCountedClass2> create()
+    {
+        return adoptRef(*new FancyRefCountedClass2());
+    }
+
+    virtual ~FancyRefCountedClass2();
+
+private:
+    FancyRefCountedClass2();
+};
+
+template <class T>
+class NestedThreadSafeRefCounted : public ThreadSafeRefCountedBase {
+public:
+    void deref() const
+    {
+        if (!derefBase())
+            return;
+        ensureOnMainRunLoop([&] {
+            auto destroyThis = [&] {
+                delete static_cast<const T*>(this);
+            };
+            destroyThis();
+        });
+    }
+
+protected:
+    NestedThreadSafeRefCounted() = default;
+};
+
+class FancyRefCountedClass3 final : public NestedThreadSafeRefCounted<FancyRefCountedClass3> {
+public:
+    static Ref<FancyRefCountedClass3> create()
+    {
+        return adoptRef(*new FancyRefCountedClass3());
+    }
+
+    virtual ~FancyRefCountedClass3();
+
+private:
+    FancyRefCountedClass3();
+};
+
+template <class T>
+class BadNestedThreadSafeRefCounted : public ThreadSafeRefCountedBase {
+public:
+    void deref() const
+    {
+        if (!derefBase())
+            return;
+        ensureOnMainThread([&] {
+            auto destroyThis = [&] {
+                delete static_cast<const T*>(this);
+            };
+        });
+    }
+
+protected:
+    BadNestedThreadSafeRefCounted() = default;
+};
+
+class FancyRefCountedClass4 final : public BadNestedThreadSafeRefCounted<FancyRefCountedClass4> {
+// expected-warning@-1{{Class 'BadNestedThreadSafeRefCounted' is used as a base of class 'FancyRefCountedClass4' but doesn't have virtual destructor}}
+public:
+    static Ref<FancyRefCountedClass4> create()
+    {
+        return adoptRef(*new FancyRefCountedClass4());
+    }
+
+    virtual ~FancyRefCountedClass4();
+
+private:
+    FancyRefCountedClass4();
+};

From 3c9022c965b85951f30af140da591f819acef8a0 Mon Sep 17 00:00:00 2001
From: AdityaK
Date: Tue, 10 Sep 2024 22:39:02 -0700
Subject: [PATCH 058/114] Bail out jump threading on indirect branches (#103688)

The bug was introduced by https://github.com/llvm/llvm-project/pull/68473

Fixes: #102351
---
 llvm/lib/Transforms/Utils/Local.cpp           |  11 +-
 .../switch-branch-fold-indirectbr-102351.ll   | 104 ++++++++++++++++++
 2 files changed, 113 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/SimplifyCFG/switch-branch-fold-indirectbr-102351.ll

diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index d0669e44f821b..c85c819263e2a 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -1028,7 +1028,14 @@ CanRedirectPredsOfEmptyBBToSucc(BasicBlock *BB, BasicBlock *Succ, if (!BB->hasNPredecessorsOrMore(2)) return false; - // Get single common predecessors of both BB and Succ + if (any_of(BBPreds, [](const BasicBlock *Pred) { + return isa<IndirectBrInst>(Pred->begin()) && + isa<IndirectBrInst>(Pred->getTerminator()); + })) + return false; + + // Get the single common predecessor of both BB and Succ. Return false + // when there is more than one common predecessor. for (BasicBlock *SuccPred : SuccPreds) { if (BBPreds.count(SuccPred)) { if (CommonPred) @@ -1133,7 +1140,7 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB, bool BBKillable = CanPropagatePredecessorsForPHIs(BB, Succ, BBPreds); - // Even if we can not fold bB into Succ, we may be able to redirect the + // Even if we can not fold BB into Succ, we may be able to redirect the // predecessors of BB to Succ. bool BBPhisMergeable = BBKillable || diff --git a/llvm/test/Transforms/SimplifyCFG/switch-branch-fold-indirectbr-102351.ll b/llvm/test/Transforms/SimplifyCFG/switch-branch-fold-indirectbr-102351.ll new file mode 100644 index 0000000000000..03aee68fa4248 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/switch-branch-fold-indirectbr-102351.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=simplifycfg -S | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local noundef i32 @main() { +; CHECK-LABEL: define dso_local noundef i32 @main() { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x ptr], align 16 +; CHECK-NEXT: store ptr blockaddress(@main, %[[BB4:.*]]), ptr [[ALLOCA]], align 16, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr inbounds [2 x ptr], ptr [[ALLOCA]], i64 0, i64 1 +; CHECK-NEXT: store ptr blockaddress(@main, %[[BB10:.*]]), ptr [[GETELEMENTPTR]], align 8, !tbaa [[TBAA0]] +; CHECK-NEXT: br label %[[BB1:.*]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[PHI8:%.*]], %[[BB7:.*]] ] +; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[PHI9:%.*]], %[[BB7]] ] +; CHECK-NEXT: switch i32 [[PHI]], label %[[BB7]] [ +; CHECK-NEXT: i32 0, label %[[BB12:.*]] +; CHECK-NEXT: i32 1, label %[[BB4]] +; CHECK-NEXT: i32 2, label %[[BB6:.*]] +; CHECK-NEXT: ] +; CHECK: [[BB4]]: +; CHECK-NEXT: [[PHI5:%.*]] = phi i32 [ [[PHI13:%.*]], %[[BB12]] ], [ [[PHI2]], %[[BB1]] ] +; CHECK-NEXT: br label %[[BB7]] +; CHECK: [[BB6]]: +; CHECK-NEXT: [[CALL:%.*]] = call i32 @foo(i32 noundef [[PHI2]]) +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[PHI2]], 1 +; CHECK-NEXT: br label %[[BB12]] +; CHECK: [[BB7]]: +; CHECK-NEXT: [[PHI8]] = phi i32 [ [[PHI]], %[[BB1]] ], [ 2, %[[BB4]] ] +; CHECK-NEXT: [[PHI9]] = phi i32 [ [[PHI2]], %[[BB1]] ], [ [[PHI5]], %[[BB4]] ] +; CHECK-NEXT: br label %[[BB1]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[BB10]]: +; CHECK-NEXT: [[CALL11:%.*]] = call i32 @foo(i32 noundef [[PHI13]]) +; CHECK-NEXT: ret i32 0 +; CHECK: [[BB12]]: +; CHECK-NEXT: [[PHI13]] = phi i32 [ [[ADD]], %[[BB6]] ], [ [[PHI2]], %[[BB1]] ] +; CHECK-NEXT: [[SEXT:%.*]] = sext i32 [[PHI13]] to i64 +; CHECK-NEXT: [[GETELEMENTPTR14:%.*]] = getelementptr inbounds [2 x 
ptr], ptr [[ALLOCA]], i64 0, i64 [[SEXT]] +; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr [[GETELEMENTPTR14]], align 8, !tbaa [[TBAA0]] +; CHECK-NEXT: indirectbr ptr [[LOAD]], [label %[[BB4]], label %bb10] +; +bb: + %alloca = alloca [2 x ptr], align 16 + store ptr blockaddress(@main, %bb4), ptr %alloca, align 16, !tbaa !0 + %getelementptr = getelementptr inbounds [2 x ptr], ptr %alloca, i64 0, i64 1 + store ptr blockaddress(@main, %bb10), ptr %getelementptr, align 8, !tbaa !0 + br label %bb1 + +bb1: ; preds = %bb7, %bb + %phi = phi i32 [ 0, %bb ], [ %phi8, %bb7 ] + %phi2 = phi i32 [ 0, %bb ], [ %phi9, %bb7 ] + switch i32 %phi, label %bb7 [ + i32 0, label %bb3 + i32 1, label %bb4 + i32 2, label %bb6 + ] + +bb3: ; preds = %bb1 + br label %bb12 + +bb4: ; preds = %bb12, %bb1 + %phi5 = phi i32 [ %phi13, %bb12 ], [ %phi2, %bb1 ] + br label %bb7 + +bb6: ; preds = %bb1 + %call = call i32 @foo(i32 noundef %phi2) + %add = add nsw i32 %phi2, 1 + br label %bb12 + +bb7: ; preds = %bb4, %bb1 + %phi8 = phi i32 [ %phi, %bb1 ], [ 2, %bb4 ] + %phi9 = phi i32 [ %phi2, %bb1 ], [ %phi5, %bb4 ] + br label %bb1, !llvm.loop !4 + +bb10: ; preds = %bb12 + %call11 = call i32 @foo(i32 noundef %phi13) + ret i32 0 + +bb12: ; preds = %bb6, %bb3 + %phi13 = phi i32 [ %add, %bb6 ], [ %phi2, %bb3 ] + %sext = sext i32 %phi13 to i64 + %getelementptr14 = getelementptr inbounds [2 x ptr], ptr %alloca, i64 0, i64 %sext + %load = load ptr, ptr %getelementptr14, align 8, !tbaa !0 + indirectbr ptr %load, [label %bb4, label %bb10] +} + +declare i32 @foo(i32) + +!0 = !{!1, !1, i64 0} +!1 = !{!"any pointer", !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C++ TBAA"} +!4 = !{!5, !5, i64 0} +!5 = !{!"int", !2, i64 0} +;. +; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"any pointer", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"Simple C++ TBAA"} +; CHECK: [[LOOP4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} +; CHECK: [[META5]] = !{!"int", [[META2]], i64 0} +;. From bc152fbf43157659f8b6817e8510e1fbe6e175b5 Mon Sep 17 00:00:00 2001 From: Prabhuk Date: Tue, 10 Sep 2024 23:04:05 -0700 Subject: [PATCH 059/114] [llvm-debuginfod-find] Enable multicall driver (#108082) Migrate llvm-debuginfod-find tool to use GenericOptTable. Enable multicall driver. 
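For readers unfamiliar with the multicall mechanism this patch adopts: GENERATE_DRIVER replaces the tool's main() with a uniquely named entry point, which either a generated standalone main() or the combined llvm driver binary invokes, selecting the tool by the name it was invoked under. A minimal sketch of that shape, using a simplified stand-in for llvm::ToolContext (the real forwarding main and dispatch table are generated by the build system):

  struct ToolContext {            // stand-in for llvm::ToolContext (LLVMDriver.h)
    const char *Path = nullptr;   // how the binary was invoked
  };

  // Entry point exported by the tool; the diff below renames main() to this.
  int llvm_debuginfod_find_main(int argc, char **argv, const ToolContext &) {
    return 0; // the real implementation lives in llvm-debuginfod-find.cpp
  }

  // A standalone build simply forwards; a multicall build would instead pick
  // the entry point that matches argv[0].
  int main(int argc, char **argv) {
    return llvm_debuginfod_find_main(argc, argv, ToolContext{argv[0]});
  }
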
--- .../tools/llvm-debuginfod-find/CMakeLists.txt | 12 +- llvm/tools/llvm-debuginfod-find/Opts.td | 17 +++ .../llvm-debuginfod-find.cpp | 109 ++++++++++++++---- 3 files changed, 115 insertions(+), 23 deletions(-) create mode 100644 llvm/tools/llvm-debuginfod-find/Opts.td diff --git a/llvm/tools/llvm-debuginfod-find/CMakeLists.txt b/llvm/tools/llvm-debuginfod-find/CMakeLists.txt index b98c431c1839b..39da11fcd9599 100644 --- a/llvm/tools/llvm-debuginfod-find/CMakeLists.txt +++ b/llvm/tools/llvm-debuginfod-find/CMakeLists.txt @@ -1,11 +1,21 @@ set(LLVM_LINK_COMPONENTS + Option Object Support ) +set(LLVM_TARGET_DEFINITIONS Opts.td) +tablegen(LLVM Opts.inc -gen-opt-parser-defs) +add_public_tablegen_target(DebugInfodFindOptsTableGen) + add_llvm_tool(llvm-debuginfod-find llvm-debuginfod-find.cpp + DEPENDS + DebugInfodFindOptsTableGen + GENERATE_DRIVER ) -target_link_libraries(llvm-debuginfod-find PRIVATE LLVMDebuginfod) +if(NOT LLVM_TOOL_LLVM_DRIVER_BUILD) + target_link_libraries(llvm-debuginfod-find PRIVATE LLVMDebuginfod) +endif() if(LLVM_INSTALL_BINUTILS_SYMLINKS) add_llvm_tool_symlink(debuginfod-find llvm-debuginfod-find) endif() diff --git a/llvm/tools/llvm-debuginfod-find/Opts.td b/llvm/tools/llvm-debuginfod-find/Opts.td new file mode 100644 index 0000000000000..a770f50d241a2 --- /dev/null +++ b/llvm/tools/llvm-debuginfod-find/Opts.td @@ -0,0 +1,17 @@ +include "llvm/Option/OptParser.td" + +class F<string name, string help> : Flag<["-"], name>, HelpText<help>; +class FF<string name, string help>: Flag<["--"], name>, HelpText<help>; +class S<string name, string help, string meta>: Separate<["--"], name>, HelpText<help>, MetaVarName<meta>; + +def help : FF<"help", "Display available options">; +def : F<"h", "Alias for --help">, Alias<help>; + +def fetch_executable : FF<"executable", "If set, fetch a binary file associated with this build id, containing the executable sections.">; +def fetch_debuginfo : FF<"debuginfo", "If set, fetch a binary file associated with this build id, containing the debuginfo sections.">; +def fetch_source : S<"source", "", "Fetch a source file associated with this build id, which is at this relative path relative to the compilation directory.">; +def dump_to_stdout : FF<"dump", "If set, dumps the contents of the fetched artifact " + "to standard output. Otherwise, dumps the absolute " + "path to the cached artifact on disk.">; +def debug_file_directory : S<"debug-file-directory", "", "Path to directory where to look for debug files.">; + diff --git a/llvm/tools/llvm-debuginfod-find/llvm-debuginfod-find.cpp b/llvm/tools/llvm-debuginfod-find/llvm-debuginfod-find.cpp index 425ee8d986a82..1f4404aaa391f 100644 --- a/llvm/tools/llvm-debuginfod-find/llvm-debuginfod-find.cpp +++ b/llvm/tools/llvm-debuginfod-find/llvm-debuginfod-find.cpp @@ -16,14 +16,89 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Debuginfod/BuildIDFetcher.h" #include "llvm/Debuginfod/Debuginfod.h" #include "llvm/Debuginfod/HTTPClient.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Option/Option.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" +#include "llvm/Support/LLVMDriver.h" using namespace llvm; +// Command-line option boilerplate. +namespace { +enum ID { + OPT_INVALID = 0, // This is not an option ID. +#define OPTION(...) 
LLVM_MAKE_OPT_ID(__VA_ARGS__), +#include "Opts.inc" +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) \ + static constexpr StringLiteral NAME##_init[] = VALUE; \ + static constexpr ArrayRef<StringLiteral> NAME(NAME##_init, \ + std::size(NAME##_init) - 1); +#include "Opts.inc" +#undef PREFIX + +using namespace llvm::opt; +static constexpr opt::OptTable::Info InfoTable[] = { +#define OPTION(...) LLVM_CONSTRUCT_OPT_INFO(__VA_ARGS__), +#include "Opts.inc" +#undef OPTION +}; + +class DebuginfodFindOptTable : public opt::GenericOptTable { +public: + DebuginfodFindOptTable() : GenericOptTable(InfoTable) {} +}; + +} // end anonymous namespace + +static std::string InputBuildID; +static bool FetchExecutable; +static bool FetchDebuginfo; +static std::string FetchSource; +static bool DumpToStdout; +static std::vector<std::string> DebugFileDirectory; + +static void parseArgs(int argc, char **argv) { + DebuginfodFindOptTable Tbl; + llvm::StringRef ToolName = argv[0]; + llvm::BumpPtrAllocator A; + llvm::StringSaver Saver{A}; + opt::InputArgList Args = + Tbl.parseArgs(argc, argv, OPT_UNKNOWN, Saver, [&](StringRef Msg) { + llvm::errs() << Msg << '\n'; + std::exit(1); + }); + + if (Args.hasArg(OPT_help)) { + Tbl.printHelp(llvm::outs(), + "llvm-debuginfod-find [options] <input build ID>", + ToolName.str().c_str()); + std::exit(0); + } + + InputBuildID = Args.getLastArgValue(OPT_INPUT); + + FetchExecutable = Args.hasArg(OPT_fetch_executable); + FetchDebuginfo = Args.hasArg(OPT_fetch_debuginfo); + DumpToStdout = Args.hasArg(OPT_dump_to_stdout); + FetchSource = Args.getLastArgValue(OPT_fetch_source, ""); + DebugFileDirectory = Args.getAllArgValues(OPT_debug_file_directory); +} + +[[noreturn]] static void helpExit() { + errs() << "Must specify exactly one of --executable, " + "--source=/path/to/file, or --debuginfo.\n"; + exit(1); +} + +/* cl::OptionCategory DebuginfodFindCategory("llvm-debuginfod-find Options"); cl::opt<std::string> InputBuildID(cl::Positional, cl::Required, @@ -60,30 +135,17 @@ static cl::list<std::string> DebugFileDirectory( cl::desc("Path to directory where to look for debug files."), cl::cat(DebuginfodFindCategory)); -[[noreturn]] static void helpExit() { - errs() << "Must specify exactly one of --executable, " - "--source=/path/to/file, or --debuginfo."; - exit(1); -} +*/ -ExitOnError ExitOnErr; +ExitOnError ExitOnDebuginfodFindError; static std::string fetchDebugInfo(object::BuildIDRef BuildID); -int main(int argc, char **argv) { - InitLLVM X(argc, argv); +int llvm_debuginfod_find_main(int argc, char **argv, + const llvm::ToolContext &) { + // InitLLVM X(argc, argv); HTTPClient::initialize(); - - cl::HideUnrelatedOptions({&DebuginfodFindCategory}); - cl::ParseCommandLineOptions( - argc, argv, - "llvm-debuginfod-find: Fetch debuginfod artifacts\n\n" - "This program is a frontend to the debuginfod client library. 
The cache " - "directory, request timeout (in seconds), and debuginfod server urls are " - "set by these environment variables:\n" - "DEBUGINFOD_CACHE_PATH (default set by sys::path::cache_directory)\n" - "DEBUGINFOD_TIMEOUT (defaults to 90s)\n" - "DEBUGINFOD_URLS=[comma separated URLs] (defaults to empty)\n"); + parseArgs(argc, argv); if (FetchExecutable + FetchDebuginfo + (FetchSource != "") != 1) helpExit(); @@ -97,9 +159,10 @@ int main(int argc, char **argv) { std::string Path; if (FetchSource != "") - Path = ExitOnErr(getCachedOrDownloadSource(ID, FetchSource)); + Path = + ExitOnDebuginfodFindError(getCachedOrDownloadSource(ID, FetchSource)); else if (FetchExecutable) - Path = ExitOnErr(getCachedOrDownloadExecutable(ID)); + Path = ExitOnDebuginfodFindError(getCachedOrDownloadExecutable(ID)); else if (FetchDebuginfo) Path = fetchDebugInfo(ID); else @@ -110,11 +173,13 @@ int main(int argc, char **argv) { // Print the contents of the artifact. ErrorOr> Buf = MemoryBuffer::getFile( Path, /*IsText=*/false, /*RequiresNullTerminator=*/false); - ExitOnErr(errorCodeToError(Buf.getError())); + ExitOnDebuginfodFindError(errorCodeToError(Buf.getError())); outs() << Buf.get()->getBuffer(); } else // Print the path to the cached artifact file. outs() << Path << "\n"; + + return 0; } // Find a debug file in local build ID directories and via debuginfod. From 6dbdb8430b492959c399a7809247424c6962902f Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 11 Sep 2024 08:47:24 +0200 Subject: [PATCH 060/114] [Clang] Fix crash due to invalid source location in __is_trivially_equality_comparable (#107815) Fixes #107777 --- clang/lib/Sema/SemaExprCXX.cpp | 3 ++- clang/test/SemaCXX/type-traits.cpp | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 14feafd1e6b17..a14a086731c13 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -5171,7 +5171,8 @@ static bool HasNonDeletedDefaultedEqualityComparison(Sema &S, // const ClassT& obj; OpaqueValueExpr Operand( - {}, Decl->getTypeForDecl()->getCanonicalTypeUnqualified().withConst(), + KeyLoc, + Decl->getTypeForDecl()->getCanonicalTypeUnqualified().withConst(), ExprValueKind::VK_LValue); UnresolvedSet<16> Functions; // obj == obj; diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp index b8a9db103782c..91ef7786f11bb 100644 --- a/clang/test/SemaCXX/type-traits.cpp +++ b/clang/test/SemaCXX/type-traits.cpp @@ -4147,6 +4147,24 @@ class Template {}; // Make sure we don't crash when instantiating a type static_assert(!__is_trivially_equality_comparable(Template>)); + +struct S operator==(S, S); + +template struct basic_string_view {}; + +struct basic_string { + operator basic_string_view() const; +}; + +template +const bool is_trivially_equality_comparable = __is_trivially_equality_comparable(T); + +template > +void find(); + +void func() { find(); } + + namespace hidden_friend { struct TriviallyEqualityComparable { From cd0e867756dbab6184d2f250f768a60bc60a0849 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 11 Sep 2024 07:50:39 +0100 Subject: [PATCH 061/114] [AArch64] Update and cleanup arm64-vector-imm.ll test. 
NFC --- llvm/test/CodeGen/AArch64/arm64-vector-imm.ll | 147 +++++++++++------- 1 file changed, 89 insertions(+), 58 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-imm.ll b/llvm/test/CodeGen/AArch64/arm64-vector-imm.ll index 08bceb850df40..a3efa6b961e63 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vector-imm.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vector-imm.ll @@ -1,134 +1,165 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s define <8 x i8> @v_orrimm(ptr %A) nounwind { ; CHECK-LABEL: v_orrimm: -; CHECK-NOT: mov -; CHECK-NOT: mvn -; CHECK: orr - %tmp1 = load <8 x i8>, ptr %A - %tmp3 = or <8 x i8> %tmp1, - ret <8 x i8> %tmp3 +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: orr.2s v0, #1, lsl #24 +; CHECK-NEXT: ret + %tmp1 = load <8 x i8>, ptr %A + %tmp3 = or <8 x i8> %tmp1, + ret <8 x i8> %tmp3 } define <16 x i8> @v_orrimmQ(ptr %A) nounwind { -; CHECK: v_orrimmQ -; CHECK-NOT: mov -; CHECK-NOT: mvn -; CHECK: orr - %tmp1 = load <16 x i8>, ptr %A - %tmp3 = or <16 x i8> %tmp1, - ret <16 x i8> %tmp3 +; CHECK-LABEL: v_orrimmQ: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: orr.4s v0, #1, lsl #24 +; CHECK-NEXT: ret + %tmp1 = load <16 x i8>, ptr %A + %tmp3 = or <16 x i8> %tmp1, + ret <16 x i8> %tmp3 } define <8 x i8> @v_bicimm(ptr %A) nounwind { ; CHECK-LABEL: v_bicimm: -; CHECK-NOT: mov -; CHECK-NOT: mvn -; CHECK: bic - %tmp1 = load <8 x i8>, ptr %A - %tmp3 = and <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0 > - ret <8 x i8> %tmp3 +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: bic.2s v0, #255, lsl #24 +; CHECK-NEXT: ret + %tmp1 = load <8 x i8>, ptr %A + %tmp3 = and <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0 > + ret <8 x i8> %tmp3 } define <16 x i8> @v_bicimmQ(ptr %A) nounwind { ; CHECK-LABEL: v_bicimmQ: -; CHECK-NOT: mov -; CHECK-NOT: mvn -; CHECK: bic - %tmp1 = load <16 x i8>, ptr %A - %tmp3 = and <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0 > - ret <16 x i8> %tmp3 +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: bic.4s v0, #255, lsl #24 +; CHECK-NEXT: ret + %tmp1 = load <16 x i8>, ptr %A + %tmp3 = and <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0 > + ret <16 x i8> %tmp3 } define <2 x double> @foo(<2 x double> %bar) nounwind { -; CHECK: foo -; CHECK: fmov.2d v1, #1.0000000 +; CHECK-LABEL: foo: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov.2d v1, #1.00000000 +; CHECK-NEXT: fadd.2d v0, v0, v1 +; CHECK-NEXT: ret %add = fadd <2 x double> %bar, ret <2 x double> %add } define <4 x i32> @movi_4s_imm_t1() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_4s_imm_t1: -; CHECK: movi.4s v0, #75 +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi.4s v0, #75 +; CHECK-NEXT: ret +entry: ret <4 x i32> } define <4 x i32> @movi_4s_imm_t2() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_4s_imm_t2: -; CHECK: movi.4s v0, #75, lsl #8 +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi.4s v0, #75, lsl #8 +; CHECK-NEXT: ret +entry: ret <4 x i32> } define <4 x i32> @movi_4s_imm_t3() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_4s_imm_t3: -; CHECK: movi.4s v0, #75, lsl #16 +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi.4s v0, #75, lsl #16 +; CHECK-NEXT: ret +entry: ret <4 x i32> } define <4 x i32> 
@movi_4s_imm_t4() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_4s_imm_t4: -; CHECK: movi.4s v0, #75, lsl #24 +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi.4s v0, #75, lsl #24 +; CHECK-NEXT: ret +entry: ret <4 x i32> <i32 1258291200, i32 1258291200, i32 1258291200, i32 1258291200> } define <8 x i16> @movi_8h_imm_t5() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_8h_imm_t5: -; CHECK: movi.8h v0, #75 +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi.8h v0, #75 +; CHECK-NEXT: ret +entry: ret <8 x i16> <i16 75, i16 75, i16 75, i16 75, i16 75, i16 75, i16 75, i16 75> } ; rdar://11989841 define <8 x i16> @movi_8h_imm_t6() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_8h_imm_t6: -; CHECK: movi.8h v0, #75, lsl #8 +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi.8h v0, #75, lsl #8 +; CHECK-NEXT: ret +entry: ret <8 x i16> <i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200, i16 19200> } define <4 x i32> @movi_4s_imm_t7() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_4s_imm_t7: -; CHECK: movi.4s v0, #75, msl #8 -ret <4 x i32> <i32 19455, i32 19455, i32 19455, i32 19455> +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi.4s v0, #75, msl #8 +; CHECK-NEXT: ret +entry: + ret <4 x i32> <i32 19455, i32 19455, i32 19455, i32 19455> } define <4 x i32> @movi_4s_imm_t8() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_4s_imm_t8: -; CHECK: movi.4s v0, #75, msl #16 -ret <4 x i32> <i32 4980735, i32 4980735, i32 4980735, i32 4980735> +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi.4s v0, #75, msl #16 +; CHECK-NEXT: ret +entry: + ret <4 x i32> <i32 4980735, i32 4980735, i32 4980735, i32 4980735> } define <16 x i8> @movi_16b_imm_t9() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_16b_imm_t9: -; CHECK: movi.16b v0, #75 -ret <16 x i8> <i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75> +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi.16b v0, #75 +; CHECK-NEXT: ret +entry: + ret <16 x i8> <i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75, i8 75> } define <2 x i64> @movi_2d_imm_t10() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_2d_imm_t10: -; CHECK: movi.2d v0, #0xff00ff00ff00ff -ret <2 x i64> <i64 71777214294589695, i64 71777214294589695> +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi.2d v0, #0xff00ff00ff00ff +; CHECK-NEXT: ret +entry: + ret <2 x i64> <i64 71777214294589695, i64 71777214294589695> } define <4 x i32> @movi_4s_imm_t11() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_4s_imm_t11: -; CHECK: fmov.4s v0, #-0.32812500 -ret <4 x i32> <i32 -1096286208, i32 -1096286208, i32 -1096286208, i32 -1096286208> +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov.4s v0, #-0.32812500 +; CHECK-NEXT: ret +entry: + ret <4 x i32> <i32 -1096286208, i32 -1096286208, i32 -1096286208, i32 -1096286208> } define <2 x i64> @movi_2d_imm_t12() nounwind readnone ssp { -entry: ; CHECK-LABEL: movi_2d_imm_t12: -; CHECK: fmov.2d v0, #-0.17187500 -ret <2 x i64> <i64 -4628011567076605952, i64 -4628011567076605952> +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov.2d v0, #-0.17187500 +; CHECK-NEXT: ret +entry: + ret <2 x i64> <i64 -4628011567076605952, i64 -4628011567076605952> } From 748023dc3210533df2c1c6efc8af1b5954493701 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 11 Sep 2024 08:59:46 +0200 Subject: [PATCH 062/114] [libc++][NFC] Replace _LIBCPP_NORETURN and TEST_NORETURN with [[noreturn]] (#80455) `[[__noreturn__]]` is now always available, so we can simply use the attribute directly instead of through a macro. 
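As background for this mechanical change: [[noreturn]] is a standard C++11 attribute, and [[__noreturn__]] is the same attribute spelled with reserved identifiers, which library headers prefer so that a user-defined macro named noreturn cannot interfere. A small illustrative snippet, not taken from the patch:

  #include <cstdlib>

  // Both declarations carry the same standard attribute; the reserved
  // spelling cannot collide with a user-defined macro named "noreturn".
  [[noreturn]] void die() { std::abort(); }
  [[__noreturn__]] void die_reserved() { std::abort(); }

  int main() { die(); }
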
--- libcxx/.clang-format | 1 - libcxx/include/__chrono/exception.h | 4 ++-- libcxx/include/__config | 2 -- libcxx/include/__exception/exception_ptr.h | 2 +- libcxx/include/__exception/nested_exception.h | 8 ++++---- libcxx/include/__exception/operations.h | 4 ++-- libcxx/include/__exception/terminate.h | 2 +- libcxx/include/__filesystem/filesystem_error.h | 4 ++-- libcxx/include/__format/format_error.h | 2 +- .../include/__format/parser_std_format_spec.h | 4 ++-- libcxx/include/__functional/function.h | 2 +- libcxx/include/__memory/shared_ptr.h | 2 +- libcxx/include/__system_error/system_error.h | 4 ++-- libcxx/include/__utility/unreachable.h | 2 +- libcxx/include/__verbose_abort | 2 +- libcxx/include/any | 2 +- libcxx/include/future | 2 +- libcxx/include/ios | 2 +- libcxx/include/new | 4 ++-- libcxx/include/optional | 2 +- libcxx/include/regex | 2 +- libcxx/include/stdexcept | 18 +++++++++--------- libcxx/include/string | 4 ++-- libcxx/include/typeinfo | 2 +- libcxx/include/variant | 2 +- libcxx/include/vector | 8 ++++---- libcxx/src/stdexcept.cpp | 2 +- libcxx/src/string.cpp | 4 ++-- .../src/support/runtime/exception_fallback.ipp | 4 ++-- libcxx/src/support/runtime/exception_msvc.ipp | 4 ++-- .../runtime/exception_pointer_cxxabi.ipp | 4 ++-- .../runtime/exception_pointer_glibcxx.ipp | 6 +++--- .../support/runtime/exception_pointer_msvc.ipp | 4 ++-- .../exception_pointer_unimplemented.ipp | 4 ++-- libcxx/src/vector.cpp | 4 ++-- libcxx/test/support/assert_macros.h | 2 +- libcxx/test/support/check_assertion.h | 4 ++-- libcxx/test/support/count_new.h | 7 +++---- libcxx/test/support/test_macros.h | 6 ------ 39 files changed, 69 insertions(+), 79 deletions(-) diff --git a/libcxx/.clang-format b/libcxx/.clang-format index b2ca452931fec..c37b234e857de 100644 --- a/libcxx/.clang-format +++ b/libcxx/.clang-format @@ -44,7 +44,6 @@ AttributeMacros: [ '_LIBCPP_NO_UNIQUE_ADDRESS', '_LIBCPP_NOALIAS', '_LIBCPP_NODISCARD', - '_LIBCPP_NORETURN', '_LIBCPP_OVERRIDABLE_FUNC_VIS', '_LIBCPP_STANDALONE_DEBUG', '_LIBCPP_TEMPLATE_DATA_VIS', diff --git a/libcxx/include/__chrono/exception.h b/libcxx/include/__chrono/exception.h index 266f8fac44176..cc408d78a36da 100644 --- a/libcxx/include/__chrono/exception.h +++ b/libcxx/include/__chrono/exception.h @@ -71,7 +71,7 @@ class nonexistent_local_time : public runtime_error { }; template <class _Duration> -_LIBCPP_NORETURN _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI void __throw_nonexistent_local_time( +[[noreturn]] _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI void __throw_nonexistent_local_time( [[maybe_unused]] const local_time<_Duration>& __time, [[maybe_unused]] const local_info& __info) { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw nonexistent_local_time(__time, __info); @@ -115,7 +115,7 @@ class ambiguous_local_time : public runtime_error { }; template <class _Duration> -_LIBCPP_NORETURN _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI void __throw_ambiguous_local_time( +[[noreturn]] _LIBCPP_AVAILABILITY_TZDB _LIBCPP_HIDE_FROM_ABI void __throw_ambiguous_local_time( [[maybe_unused]] const local_time<_Duration>& __time, [[maybe_unused]] const local_info& __info) { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw ambiguous_local_time(__time, __info); diff --git a/libcxx/include/__config b/libcxx/include/__config index bccf90d1dbacd..b0a5dda147a6a 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -312,7 +312,6 @@ _LIBCPP_HARDENING_MODE_DEBUG # define _LIBCPP_ALIGNOF(_Tp) alignof(_Tp) # define _ALIGNAS_TYPE(x) alignas(x) # define _ALIGNAS(x) alignas(x) -# define _LIBCPP_NORETURN 
[[noreturn]] # define _NOEXCEPT noexcept # define _NOEXCEPT_(...) noexcept(__VA_ARGS__) # define _LIBCPP_CONSTEXPR constexpr @@ -322,7 +321,6 @@ _LIBCPP_HARDENING_MODE_DEBUG # define _LIBCPP_ALIGNOF(_Tp) _Alignof(_Tp) # define _ALIGNAS_TYPE(x) __attribute__((__aligned__(_LIBCPP_ALIGNOF(x)))) # define _ALIGNAS(x) __attribute__((__aligned__(x))) -# define _LIBCPP_NORETURN __attribute__((__noreturn__)) # define _LIBCPP_HAS_NO_NOEXCEPT # define nullptr __nullptr # define _NOEXCEPT throw() diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h index beadd9212abd1..9e5351f534a1c 100644 --- a/libcxx/include/__exception/exception_ptr.h +++ b/libcxx/include/__exception/exception_ptr.h @@ -159,7 +159,7 @@ _LIBCPP_EXPORTED_FROM_ABI void swap(exception_ptr&, exception_ptr&) _NOEXCEPT; _LIBCPP_EXPORTED_FROM_ABI exception_ptr __copy_exception_ptr(void* __except, const void* __ptr); _LIBCPP_EXPORTED_FROM_ABI exception_ptr current_exception() _NOEXCEPT; -_LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void rethrow_exception(exception_ptr); +[[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void rethrow_exception(exception_ptr); // This is a built-in template function which automagically extracts the required // information. diff --git a/libcxx/include/__exception/nested_exception.h b/libcxx/include/__exception/nested_exception.h index 4c7970d167ffa..8e817e1c06978 100644 --- a/libcxx/include/__exception/nested_exception.h +++ b/libcxx/include/__exception/nested_exception.h @@ -40,7 +40,7 @@ class _LIBCPP_EXPORTED_FROM_ABI nested_exception { virtual ~nested_exception() _NOEXCEPT; // access functions - _LIBCPP_NORETURN void rethrow_nested() const; + [[__noreturn__]] void rethrow_nested() const; _LIBCPP_HIDE_FROM_ABI exception_ptr nested_ptr() const _NOEXCEPT { return __ptr_; } }; @@ -55,19 +55,19 @@ struct __throw_with_nested; template <class _Tp, class _Up> struct __throw_with_nested<_Tp, _Up, true> { - _LIBCPP_NORETURN static inline _LIBCPP_HIDE_FROM_ABI void __do_throw(_Tp&& __t) { + [[__noreturn__]] static inline _LIBCPP_HIDE_FROM_ABI void __do_throw(_Tp&& __t) { throw __nested<_Up>(std::forward<_Tp>(__t)); } }; template <class _Tp, class _Up> struct __throw_with_nested<_Tp, _Up, false> { - _LIBCPP_NORETURN static inline _LIBCPP_HIDE_FROM_ABI void __do_throw(_Tp&& __t) { throw std::forward<_Tp>(__t); } + [[__noreturn__]] static inline _LIBCPP_HIDE_FROM_ABI void __do_throw(_Tp&& __t) { throw std::forward<_Tp>(__t); } }; #endif template <class _Tp> -_LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void throw_with_nested(_Tp&& __t) { +[[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void throw_with_nested(_Tp&& __t) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS using _Up = __decay_t<_Tp>; static_assert(is_copy_constructible<_Up>::value, "type thrown must be CopyConstructible"); diff --git 
exception_ptr; _LIBCPP_EXPORTED_FROM_ABI exception_ptr current_exception() _NOEXCEPT; -_LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void rethrow_exception(exception_ptr); +[[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void rethrow_exception(exception_ptr); } // namespace std #endif // _LIBCPP___EXCEPTION_OPERATIONS_H diff --git a/libcxx/include/__exception/terminate.h b/libcxx/include/__exception/terminate.h index e672471dc5263..0bfc3506d3791 100644 --- a/libcxx/include/__exception/terminate.h +++ b/libcxx/include/__exception/terminate.h @@ -16,7 +16,7 @@ #endif namespace std { // purposefully not using versioning namespace -_LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void terminate() _NOEXCEPT; +[[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void terminate() _NOEXCEPT; } // namespace std #endif // _LIBCPP___EXCEPTION_TERMINATE_H diff --git a/libcxx/include/__filesystem/filesystem_error.h b/libcxx/include/__filesystem/filesystem_error.h index 80a11e3b1932c..f43568c2004d2 100644 --- a/libcxx/include/__filesystem/filesystem_error.h +++ b/libcxx/include/__filesystem/filesystem_error.h @@ -69,13 +69,13 @@ class _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY _LIBCPP_EXPORTED_FROM_ABI filesyst # ifndef _LIBCPP_HAS_NO_EXCEPTIONS template -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY void +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY void __throw_filesystem_error(_Args&&... __args) { throw filesystem_error(std::forward<_Args>(__args)...); } # else template -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY void +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_FILESYSTEM_LIBRARY void __throw_filesystem_error(_Args&&...) { _LIBCPP_VERBOSE_ABORT("filesystem_error was thrown in -fno-exceptions mode"); } diff --git a/libcxx/include/__format/format_error.h b/libcxx/include/__format/format_error.h index 35a39ee82f3da..1df7dbff2b7df 100644 --- a/libcxx/include/__format/format_error.h +++ b/libcxx/include/__format/format_error.h @@ -35,7 +35,7 @@ class _LIBCPP_EXPORTED_FROM_ABI format_error : public runtime_error { }; _LIBCPP_DIAGNOSTIC_POP -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_format_error(const char* __s) { +[[noreturn]] inline _LIBCPP_HIDE_FROM_ABI void __throw_format_error(const char* __s) { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw format_error(__s); # else diff --git a/libcxx/include/__format/parser_std_format_spec.h b/libcxx/include/__format/parser_std_format_spec.h index 28891e5d2876c..6bdf8e319ba44 100644 --- a/libcxx/include/__format/parser_std_format_spec.h +++ b/libcxx/include/__format/parser_std_format_spec.h @@ -52,13 +52,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD namespace __format_spec { -_LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI inline void +[[noreturn]] _LIBCPP_HIDE_FROM_ABI inline void __throw_invalid_option_format_error(const char* __id, const char* __option) { std::__throw_format_error( (string("The format specifier for ") + __id + " does not allow the " + __option + " option").c_str()); } -_LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI inline void __throw_invalid_type_format_error(const char* __id) { +[[noreturn]] _LIBCPP_HIDE_FROM_ABI inline void __throw_invalid_type_format_error(const char* __id) { std::__throw_format_error( (string("The type option contains an invalid value for ") + __id + " formatting argument").c_str()); } diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h index c7b98035e34bf..ff31011caa329 100644 --- 
a/libcxx/include/__functional/function.h +++ b/libcxx/include/__functional/function.h @@ -78,7 +78,7 @@ class _LIBCPP_EXPORTED_FROM_ABI bad_function_call : public exception { }; _LIBCPP_DIAGNOSTIC_POP -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_function_call() { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_function_call() { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw bad_function_call(); # else diff --git a/libcxx/include/__memory/shared_ptr.h b/libcxx/include/__memory/shared_ptr.h index 4dd8022822d22..5dcd475e2c9f9 100644 --- a/libcxx/include/__memory/shared_ptr.h +++ b/libcxx/include/__memory/shared_ptr.h @@ -123,7 +123,7 @@ class _LIBCPP_EXPORTED_FROM_ABI bad_weak_ptr : public std::exception { const char* what() const _NOEXCEPT override; }; -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_weak_ptr() { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_weak_ptr() { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw bad_weak_ptr(); #else diff --git a/libcxx/include/__system_error/system_error.h b/libcxx/include/__system_error/system_error.h index 362e67505658c..3ffa1029ca5c2 100644 --- a/libcxx/include/__system_error/system_error.h +++ b/libcxx/include/__system_error/system_error.h @@ -39,8 +39,8 @@ class _LIBCPP_EXPORTED_FROM_ABI system_error : public runtime_error { _LIBCPP_HIDE_FROM_ABI const error_code& code() const _NOEXCEPT { return __ec_; } }; -_LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void __throw_system_error(int __ev, const char* __what_arg); -_LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI inline void __throw_system_error(error_code __ec, const char* __what_arg) { +[[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void __throw_system_error(int __ev, const char* __what_arg); +[[__noreturn__]] _LIBCPP_HIDE_FROM_ABI inline void __throw_system_error(error_code __ec, const char* __what_arg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw system_error(__ec, __what_arg); #else diff --git a/libcxx/include/__utility/unreachable.h b/libcxx/include/__utility/unreachable.h index d833f74c2e4f1..5525452aa55ef 100644 --- a/libcxx/include/__utility/unreachable.h +++ b/libcxx/include/__utility/unreachable.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -_LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI inline void __libcpp_unreachable() { +[[__noreturn__]] _LIBCPP_HIDE_FROM_ABI inline void __libcpp_unreachable() { _LIBCPP_ASSERT_INTERNAL(false, "std::unreachable() was reached"); __builtin_unreachable(); } diff --git a/libcxx/include/__verbose_abort b/libcxx/include/__verbose_abort index 195ce65b721ff..244278aec652d 100644 --- a/libcxx/include/__verbose_abort +++ b/libcxx/include/__verbose_abort @@ -20,7 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD // This function should never be called directly from the code -- it should only be called through // the _LIBCPP_VERBOSE_ABORT macro. -_LIBCPP_NORETURN _LIBCPP_AVAILABILITY_VERBOSE_ABORT _LIBCPP_OVERRIDABLE_FUNC_VIS +[[__noreturn__]] _LIBCPP_AVAILABILITY_VERBOSE_ABORT _LIBCPP_OVERRIDABLE_FUNC_VIS _LIBCPP_ATTRIBUTE_FORMAT(__printf__, 1, 2) void __libcpp_verbose_abort(const char* __format, ...); // _LIBCPP_VERBOSE_ABORT(format, args...) 
diff --git a/libcxx/include/any b/libcxx/include/any index 7630e8a057d05..6e4ff31ff9b62 100644 --- a/libcxx/include/any +++ b/libcxx/include/any @@ -127,7 +127,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 17 -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_ANY_CAST void __throw_bad_any_cast() { +[[noreturn]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_ANY_CAST void __throw_bad_any_cast() { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw bad_any_cast(); # else diff --git a/libcxx/include/future b/libcxx/include/future index 01c0b10172cd3..9a0eb7971a313 100644 --- a/libcxx/include/future +++ b/libcxx/include/future @@ -472,7 +472,7 @@ inline _LIBCPP_HIDE_FROM_ABI error_condition make_error_condition(future_errc __ return error_condition(static_cast(__e), future_category()); } -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_future_error(future_errc __ev); +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_future_error(future_errc __ev); class _LIBCPP_EXPORTED_FROM_ABI future_error : public logic_error { error_code __ec_; diff --git a/libcxx/include/ios b/libcxx/include/ios index 426838b91e5dc..61a05fadd29a1 100644 --- a/libcxx/include/ios +++ b/libcxx/include/ios @@ -440,7 +440,7 @@ public: ~failure() _NOEXCEPT override; }; -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_failure(char const* __msg) { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_failure(char const* __msg) { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw ios_base::failure(__msg); # else diff --git a/libcxx/include/new b/libcxx/include/new index 9015c4e712763..207e4b46e0ca6 100644 --- a/libcxx/include/new +++ b/libcxx/include/new @@ -166,9 +166,9 @@ public: }; #endif // defined(_LIBCPP_ABI_VCRUNTIME) && defined(_HAS_EXCEPTIONS) && _HAS_EXCEPTIONS == 0 -_LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void __throw_bad_alloc(); // not in C++ spec +[[__noreturn__]] _LIBCPP_EXPORTED_FROM_ABI void __throw_bad_alloc(); // not in C++ spec -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_array_new_length() { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_array_new_length() { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw bad_array_new_length(); #else diff --git a/libcxx/include/optional b/libcxx/include/optional index 41d7515a2b689..b0933b59b25d2 100644 --- a/libcxx/include/optional +++ b/libcxx/include/optional @@ -255,7 +255,7 @@ public: _LIBCPP_BEGIN_NAMESPACE_STD -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS void +[[noreturn]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_OPTIONAL_ACCESS void __throw_bad_optional_access() { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw bad_optional_access(); diff --git a/libcxx/include/regex b/libcxx/include/regex index 08aebc2266f5d..d59abb8daf8ec 100644 --- a/libcxx/include/regex +++ b/libcxx/include/regex @@ -983,7 +983,7 @@ public: }; template <regex_constants::error_type _Ev> -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_regex_error() { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_regex_error() { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw regex_error(_Ev); #else diff --git a/libcxx/include/stdexcept b/libcxx/include/stdexcept index 853c185187c77..bdfc27aeac374 100644 --- a/libcxx/include/stdexcept +++ b/libcxx/include/stdexcept @@ -209,9 +209,9 @@ public: _LIBCPP_BEGIN_NAMESPACE_STD // in the dylib -_LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void 
__throw_runtime_error(const char*); -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_logic_error(const char* __msg) { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_logic_error(const char* __msg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw logic_error(__msg); #else @@ -219,7 +219,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_logic_error(const cha #endif } -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_domain_error(const char* __msg) { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_domain_error(const char* __msg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw domain_error(__msg); #else @@ -227,7 +227,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_domain_error(const ch #endif } -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_invalid_argument(const char* __msg) { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_invalid_argument(const char* __msg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw invalid_argument(__msg); #else @@ -235,7 +235,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_invalid_argument(cons #endif } -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_length_error(const char* __msg) { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_length_error(const char* __msg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw length_error(__msg); #else @@ -243,7 +243,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_length_error(const ch #endif } -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range(const char* __msg) { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range(const char* __msg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw out_of_range(__msg); #else @@ -251,7 +251,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range(const ch #endif } -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_range_error(const char* __msg) { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_range_error(const char* __msg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw range_error(__msg); #else @@ -259,7 +259,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_range_error(const cha #endif } -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_overflow_error(const char* __msg) { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_overflow_error(const char* __msg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw overflow_error(__msg); #else @@ -267,7 +267,7 @@ _LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_overflow_error(const #endif } -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_underflow_error(const char* __msg) { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_underflow_error(const char* __msg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw underflow_error(__msg); #else diff --git a/libcxx/include/string b/libcxx/include/string index 3480b57375c11..46c5a5ac6de60 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -2230,11 +2230,11 @@ private: return std::__is_pointer_in_range(data(), data() + size() + 1, std::addressof(__v)); } - _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_length_error() const { + [[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void __throw_length_error() const { std::__throw_length_error("basic_string"); } - _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range() const { + [[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range() const { std::__throw_out_of_range("basic_string"); } diff --git 
a/libcxx/include/typeinfo b/libcxx/include/typeinfo index 54e0b4cf5d634..a44fa4d73ee58 100644 --- a/libcxx/include/typeinfo +++ b/libcxx/include/typeinfo @@ -373,7 +373,7 @@ private: #endif // defined(_LIBCPP_ABI_VCRUNTIME) && _HAS_EXCEPTIONS == 0 _LIBCPP_BEGIN_NAMESPACE_STD -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_cast() { +[[__noreturn__]] inline _LIBCPP_HIDE_FROM_ABI void __throw_bad_cast() { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw bad_cast(); #else diff --git a/libcxx/include/variant b/libcxx/include/variant index 1367cd66f3701..1cac603c27c24 100644 --- a/libcxx/include/variant +++ b/libcxx/include/variant @@ -298,7 +298,7 @@ struct __farray { _LIBCPP_HIDE_FROM_ABI constexpr const _Tp& operator[](size_t __n) const noexcept { return __buf_[__n]; } }; -_LIBCPP_NORETURN inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_VARIANT_ACCESS void +[[noreturn]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_VARIANT_ACCESS void __throw_bad_variant_access() { # ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw bad_variant_access(); diff --git a/libcxx/include/vector b/libcxx/include/vector index 2442852c764a6..fc0a48669fe53 100644 --- a/libcxx/include/vector +++ b/libcxx/include/vector @@ -995,9 +995,9 @@ private: __move_assign_alloc(__c, integral_constant<bool, __alloc_traits::propagate_on_container_move_assignment::value>()); } - _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_length_error() const { std::__throw_length_error("vector"); } + [[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void __throw_length_error() const { std::__throw_length_error("vector"); } - _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range() const { std::__throw_out_of_range("vector"); } + [[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range() const { std::__throw_out_of_range("vector"); } _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const vector& __c, true_type) { if (__alloc() != __c.__alloc()) { @@ -2163,9 +2163,9 @@ public: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __invariants() const; private: - _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_length_error() const { std::__throw_length_error("vector"); } + [[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void __throw_length_error() const { std::__throw_length_error("vector"); } - _LIBCPP_NORETURN _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range() const { std::__throw_out_of_range("vector"); } + [[__noreturn__]] _LIBCPP_HIDE_FROM_ABI void __throw_out_of_range() const { std::__throw_out_of_range("vector"); } template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void diff --git a/libcxx/src/stdexcept.cpp b/libcxx/src/stdexcept.cpp index bc25c0f9e6ef6..134d28efb750f 100644 --- a/libcxx/src/stdexcept.cpp +++ b/libcxx/src/stdexcept.cpp @@ -19,7 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -_LIBCPP_NORETURN void __throw_runtime_error(const char* msg) { +void __throw_runtime_error(const char* msg) { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS throw runtime_error(msg); #else diff --git a/libcxx/src/string.cpp b/libcxx/src/string.cpp index cf07b3ef1ef27..12db5381a7b1b 100644 --- a/libcxx/src/string.cpp +++ b/libcxx/src/string.cpp @@ -28,8 +28,8 @@ struct __basic_string_common; // The struct isn't declared anymore in the headers. It's only here for ABI compatibility. 
template <> struct __basic_string_common<true> { - _LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void __throw_length_error() const; - _LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void __throw_out_of_range() const; + [[noreturn]] _LIBCPP_EXPORTED_FROM_ABI void __throw_length_error() const; + [[noreturn]] _LIBCPP_EXPORTED_FROM_ABI void __throw_out_of_range() const; }; void __basic_string_common<true>::__throw_length_error() const { std::__throw_length_error("basic_string"); } diff --git a/libcxx/src/support/runtime/exception_fallback.ipp b/libcxx/src/support/runtime/exception_fallback.ipp index 18ff4b83d8765..ca542c9497214 100644 --- a/libcxx/src/support/runtime/exception_fallback.ipp +++ b/libcxx/src/support/runtime/exception_fallback.ipp @@ -21,7 +21,7 @@ unexpected_handler set_unexpected(unexpected_handler func) noexcept { unexpected_handler get_unexpected() noexcept { return __libcpp_atomic_load(&__unexpected_handler); } -_LIBCPP_NORETURN void unexpected() { +[[noreturn]] void unexpected() { (*get_unexpected())(); // unexpected handler should not return terminate(); @@ -33,7 +33,7 @@ terminate_handler set_terminate(terminate_handler func) noexcept { terminate_handler get_terminate() noexcept { return __libcpp_atomic_load(&__terminate_handler); } -_LIBCPP_NORETURN void terminate() noexcept { +[[noreturn]] void terminate() noexcept { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { #endif // _LIBCPP_HAS_NO_EXCEPTIONS diff --git a/libcxx/src/support/runtime/exception_msvc.ipp b/libcxx/src/support/runtime/exception_msvc.ipp index 323cd9d180057..163aec057d9b5 100644 --- a/libcxx/src/support/runtime/exception_msvc.ipp +++ b/libcxx/src/support/runtime/exception_msvc.ipp @@ -32,7 +32,7 @@ unexpected_handler set_unexpected(unexpected_handler func) noexcept { return ::s unexpected_handler get_unexpected() noexcept { return ::_get_unexpected(); } -_LIBCPP_NORETURN void unexpected() { +[[noreturn]] void unexpected() { (*get_unexpected())(); // unexpected handler should not return terminate(); @@ -42,7 +42,7 @@ terminate_handler set_terminate(terminate_handler func) noexcept { return ::set_ terminate_handler get_terminate() noexcept { return ::_get_terminate(); } -_LIBCPP_NORETURN void terminate() noexcept { +[[noreturn]] void terminate() noexcept { #ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { #endif // _LIBCPP_HAS_NO_EXCEPTIONS diff --git a/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp b/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp index bdb17b9996b7e..8f5c2060bb06c 100644 --- a/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp +++ b/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp @@ -40,7 +40,7 @@ nested_exception::nested_exception() noexcept : __ptr_(current_exception()) {} nested_exception::~nested_exception() noexcept {} -_LIBCPP_NORETURN void nested_exception::rethrow_nested() const { +void nested_exception::rethrow_nested() const { if (__ptr_ == nullptr) terminate(); rethrow_exception(__ptr_); @@ -55,7 +55,7 @@ exception_ptr current_exception() noexcept { return ptr; } -_LIBCPP_NORETURN void rethrow_exception(exception_ptr p) { +void rethrow_exception(exception_ptr p) { __cxa_rethrow_primary_exception(p.__ptr_); // if p.__ptr_ is NULL, above returns so we terminate terminate(); diff --git a/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp b/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp index 6dad248f9e1fd..174b44ce0e6f7 100644 --- a/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp +++ b/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp @@ -31,7 +31,7 @@ 
struct exception_ptr { } // namespace __exception_ptr -_LIBCPP_NORETURN void rethrow_exception(__exception_ptr::exception_ptr); +[[noreturn]] void rethrow_exception(__exception_ptr::exception_ptr); exception_ptr::~exception_ptr() noexcept { reinterpret_cast<__exception_ptr::exception_ptr*>(this)->~exception_ptr(); } @@ -55,13 +55,13 @@ exception_ptr exception_ptr::__from_native_exception_pointer(void* __e) noexcept nested_exception::nested_exception() noexcept : __ptr_(current_exception()) {} -_LIBCPP_NORETURN void nested_exception::rethrow_nested() const { +[[noreturn]] void nested_exception::rethrow_nested() const { if (__ptr_ == nullptr) terminate(); rethrow_exception(__ptr_); } -_LIBCPP_NORETURN void rethrow_exception(exception_ptr p) { +[[noreturn]] void rethrow_exception(exception_ptr p) { rethrow_exception(reinterpret_cast<__exception_ptr::exception_ptr&>(p)); } diff --git a/libcxx/src/support/runtime/exception_pointer_msvc.ipp b/libcxx/src/support/runtime/exception_pointer_msvc.ipp index b87742b32ded6..2be5136176e32 100644 --- a/libcxx/src/support/runtime/exception_pointer_msvc.ipp +++ b/libcxx/src/support/runtime/exception_pointer_msvc.ipp @@ -61,13 +61,13 @@ exception_ptr current_exception() noexcept { return __ret; } -_LIBCPP_NORETURN void rethrow_exception(exception_ptr p) { __ExceptionPtrRethrow(&p); } +[[noreturn]] void rethrow_exception(exception_ptr p) { __ExceptionPtrRethrow(&p); } nested_exception::nested_exception() noexcept : __ptr_(current_exception()) {} nested_exception::~nested_exception() noexcept {} -_LIBCPP_NORETURN void nested_exception::rethrow_nested() const { +[[noreturn]] void nested_exception::rethrow_nested() const { if (__ptr_ == nullptr) terminate(); rethrow_exception(__ptr_); diff --git a/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp b/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp index e12b0caf419d2..1fe3127f18b0b 100644 --- a/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp +++ b/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp @@ -44,7 +44,7 @@ nested_exception::~nested_exception() noexcept {} #endif -_LIBCPP_NORETURN void nested_exception::rethrow_nested() const { +[[noreturn]] void nested_exception::rethrow_nested() const { #warning exception_ptr not yet implemented fprintf(stderr, "exception_ptr not yet implemented\n"); ::abort(); @@ -61,7 +61,7 @@ exception_ptr current_exception() noexcept { ::abort(); } -_LIBCPP_NORETURN void rethrow_exception(exception_ptr p) { +[[noreturn]] void rethrow_exception(exception_ptr p) { #warning exception_ptr not yet implemented fprintf(stderr, "exception_ptr not yet implemented\n"); ::abort(); diff --git a/libcxx/src/vector.cpp b/libcxx/src/vector.cpp index b6153b0e9bf99..3f3a906d6421f 100644 --- a/libcxx/src/vector.cpp +++ b/libcxx/src/vector.cpp @@ -17,8 +17,8 @@ struct __vector_base_common; template <> struct __vector_base_common<true> { - _LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void __throw_length_error() const; - _LIBCPP_NORETURN _LIBCPP_EXPORTED_FROM_ABI void __throw_out_of_range() const; + [[noreturn]] _LIBCPP_EXPORTED_FROM_ABI void __throw_length_error() const; + [[noreturn]] _LIBCPP_EXPORTED_FROM_ABI void __throw_out_of_range() const; }; void __vector_base_common<true>::__throw_length_error() const { std::__throw_length_error("vector"); } diff --git a/libcxx/test/support/assert_macros.h b/libcxx/test/support/assert_macros.h index 1059823dcb246..b7011794025bf 100644 --- a/libcxx/test/support/assert_macros.h +++ b/libcxx/test/support/assert_macros.h @@ 
-50,7 +50,7 @@ void test_log(const char* condition, const char* file, int line, const F& functo } template <class Arg> -TEST_NORETURN void test_fail(const char* file, int line, const Arg& arg) { +[[noreturn]] void test_fail(const char* file, int line, const Arg& arg) { test_log("", file, line, arg); std::abort(); } diff --git a/libcxx/test/support/check_assertion.h b/libcxx/test/support/check_assertion.h index 329ce819a6c8d..47ebfeeeefc0f 100644 --- a/libcxx/test/support/check_assertion.h +++ b/libcxx/test/support/check_assertion.h @@ -142,7 +142,7 @@ std::string ToString(std::array<DeathCause, N> const& causes) { return ss.str(); } -TEST_NORETURN void StopChildProcess(DeathCause cause) { std::exit(static_cast<int>(cause)); } +[[noreturn]] void StopChildProcess(DeathCause cause) { std::exit(static_cast<int>(cause)); } DeathCause ConvertToDeathCause(int val) { if (val < static_cast<int>(DeathCause::VerboseAbort) || val > static_cast<int>(DeathCause::Unknown)) { @@ -260,7 +260,7 @@ class DeathTest { } template <class Func> - TEST_NORETURN void RunForChild(Func&& f) { + [[noreturn]] void RunForChild(Func&& f) { close(GetStdOutReadFD()); // don't need to read from the pipe in the child. close(GetStdErrReadFD()); auto DupFD = [](int DestFD, int TargetFD) { diff --git a/libcxx/test/support/count_new.h b/libcxx/test/support/count_new.h index 61c8ca16ab0d0..c8169d3acceab 100644 --- a/libcxx/test/support/count_new.h +++ b/libcxx/test/support/count_new.h @@ -24,14 +24,13 @@ namespace detail { - TEST_NORETURN - inline void throw_bad_alloc_helper() { +[[noreturn]] inline void throw_bad_alloc_helper() { #ifndef TEST_HAS_NO_EXCEPTIONS - throw std::bad_alloc(); + throw std::bad_alloc(); #else std::abort(); #endif - } +} } class MemCounter diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h index 5d4c1a65cfafb..3aa818af1d269 100644 --- a/libcxx/test/support/test_macros.h +++ b/libcxx/test/support/test_macros.h @@ -214,12 +214,6 @@ #define TEST_IS_EXECUTED_IN_A_SLOW_ENVIRONMENT #endif -#if defined(_LIBCPP_NORETURN) -#define TEST_NORETURN _LIBCPP_NORETURN -#else -#define TEST_NORETURN [[noreturn]] -#endif - #if defined(_LIBCPP_HAS_NO_ALIGNED_ALLOCATION) || \ (!(TEST_STD_VER > 14 || \ (defined(__cpp_aligned_new) && __cpp_aligned_new >= 201606L))) From 1e3a24d2e4eb63c17b962161ae6588d1b2c178f8 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 11 Sep 2024 09:36:20 +0200 Subject: [PATCH 063/114] [InitUndef] Don't use largest super class (#107885) The InitUndef pass currently uses the getLargestSuperClass() hook (which is only used by that pass) to choose the register to initialize. This was done to reduce the number of undef init pseudos needed, e.g. so that the vrnov0 regclass would use the same pseudo as v0. After #106744 we use a single generic pseudo, so this is no longer necessary. --- .../include/llvm/CodeGen/TargetRegisterInfo.h | 9 --- llvm/lib/CodeGen/InitUndef.cpp | 10 ++-- llvm/lib/Target/ARM/ARMBaseRegisterInfo.h | 13 ----- llvm/lib/Target/RISCV/RISCVRegisterInfo.h | 13 ----- .../rvv/subregister-undef-early-clobber.mir | 58 +++++++++---------- 5 files changed, 33 insertions(+), 70 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 197f66e8659d5..ebf06bc57948f 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -1204,15 +1204,6 @@ class TargetRegisterInfo : public MCRegisterInfo { return false; } - /// Returns the Largest Super Class that is being initialized. 
There - /// should be a Pseudo Instruction implemented for the super class - /// that is being returned to ensure that Init Undef can apply the - /// initialization correctly. - virtual const TargetRegisterClass * - getLargestSuperClass(const TargetRegisterClass *RC) const { - llvm_unreachable("Unexpected target register class."); - } - /// Returns if the architecture being targeted has the required Pseudo /// Instructions for initializing the register. By default this returns false, /// but where it is overriden for an architecture, the behaviour will be diff --git a/llvm/lib/CodeGen/InitUndef.cpp b/llvm/lib/CodeGen/InitUndef.cpp index 8d20f2668de6b..1613e413712d2 100644 --- a/llvm/lib/CodeGen/InitUndef.cpp +++ b/llvm/lib/CodeGen/InitUndef.cpp @@ -152,8 +152,7 @@ bool InitUndef::handleSubReg(MachineFunction &MF, MachineInstr &MI, if (Info.UsedLanes == Info.DefinedLanes) continue; - const TargetRegisterClass *TargetRegClass = - TRI->getLargestSuperClass(MRI->getRegClass(Reg)); + const TargetRegisterClass *TargetRegClass = MRI->getRegClass(Reg); LaneBitmask NeedDef = Info.UsedLanes & ~Info.DefinedLanes; @@ -172,8 +171,8 @@ bool InitUndef::handleSubReg(MachineFunction &MF, MachineInstr &MI, Register LatestReg = Reg; for (auto ind : SubRegIndexNeedInsert) { Changed = true; - const TargetRegisterClass *SubRegClass = TRI->getLargestSuperClass( - TRI->getSubRegisterClass(TargetRegClass, ind)); + const TargetRegisterClass *SubRegClass = + TRI->getSubRegisterClass(TargetRegClass, ind); Register TmpInitSubReg = MRI->createVirtualRegister(SubRegClass); LLVM_DEBUG(dbgs() << "Register Class ID" << SubRegClass->getID() << "\n"); BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), @@ -199,8 +198,7 @@ bool InitUndef::fixupIllOperand(MachineInstr *MI, MachineOperand &MO) { dbgs() << "Emitting PseudoInitUndef Instruction for implicit register " << printReg(MO.getReg()) << '\n'); - const TargetRegisterClass *TargetRegClass = - TRI->getLargestSuperClass(MRI->getRegClass(MO.getReg())); + const TargetRegisterClass *TargetRegClass = MRI->getRegClass(MO.getReg()); LLVM_DEBUG(dbgs() << "Register Class ID" << TargetRegClass->getID() << "\n"); Register NewReg = MRI->createVirtualRegister(TargetRegClass); BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h index 53803cff8b90a..58b5e98fd30b1 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -241,19 +241,6 @@ class ARMBaseRegisterInfo : public ARMGenRegisterInfo { int getSEHRegNum(unsigned i) const { return getEncodingValue(i); } - const TargetRegisterClass * - getLargestSuperClass(const TargetRegisterClass *RC) const override { - if (ARM::MQPRRegClass.hasSubClassEq(RC)) - return &ARM::MQPRRegClass; - if (ARM::SPRRegClass.hasSubClassEq(RC)) - return &ARM::SPRRegClass; - if (ARM::DPR_VFP2RegClass.hasSubClassEq(RC)) - return &ARM::DPR_VFP2RegClass; - if (ARM::GPRRegClass.hasSubClassEq(RC)) - return &ARM::GPRRegClass; - return RC; - } - bool doesRegClassHavePseudoInitUndef( const TargetRegisterClass *RC) const override { (void)RC; diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h index 7e04e9154b524..98a712af08539 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h @@ -130,19 +130,6 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo { const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const 
override; - const TargetRegisterClass * - getLargestSuperClass(const TargetRegisterClass *RC) const override { - if (RISCV::VRM8RegClass.hasSubClassEq(RC)) - return &RISCV::VRM8RegClass; - if (RISCV::VRM4RegClass.hasSubClassEq(RC)) - return &RISCV::VRM4RegClass; - if (RISCV::VRM2RegClass.hasSubClassEq(RC)) - return &RISCV::VRM2RegClass; - if (RISCV::VRRegClass.hasSubClassEq(RC)) - return &RISCV::VRRegClass; - return RC; - } - bool doesRegClassHavePseudoInitUndef( const TargetRegisterClass *RC) const override { return isVRRegClass(RC); diff --git a/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir b/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir index be6ed4d2a6aa1..ed274cf49fa9b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir +++ b/llvm/test/CodeGen/RISCV/rvv/subregister-undef-early-clobber.mir @@ -14,9 +14,9 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_1 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm1_1 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -52,7 +52,7 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_1 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_1 ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vr = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm1_0 @@ -92,7 +92,7 @@ body: | ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm1_3 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -130,7 +130,7 @@ body: | ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrnov0 = INIT_UNDEF ; 
CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm1_2 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -166,7 +166,7 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm4 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm4 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm4 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm2_1 ; CHECK-NEXT: early-clobber %6:vrm4 = PseudoVRGATHER_VI_M4 %pt2, killed [[INSERT_SUBREG1]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -239,11 +239,11 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_1 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_1 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -279,9 +279,9 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_1 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_1 ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_0 @@ -319,11 +319,11 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_2 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: 
[[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_0 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_3 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -359,11 +359,11 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M1_]], %subreg.sub_vrm1_3 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_0 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_2 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -401,9 +401,9 @@ body: | ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_3 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_5 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -441,9 +441,9 @@ body: | ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_3 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_4 ; CHECK-NEXT: 
early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -481,9 +481,9 @@ body: | ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_2 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_7 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -521,9 +521,9 @@ body: | ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_2 - ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vr = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF2:%[0-9]+]]:vrnov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG2]], [[INIT_UNDEF2]], %subreg.sub_vrm1_6 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG3]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -559,9 +559,9 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_1 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -597,7 +597,7 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M2_]], %subreg.sub_vrm2_1 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 ; CHECK-NEXT: 
[[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_0 @@ -637,7 +637,7 @@ body: | ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_3 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -675,7 +675,7 @@ body: | ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_0 - ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF1:%[0-9]+]]:vrm2nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG1]], [[INIT_UNDEF1]], %subreg.sub_vrm2_2 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG2]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 @@ -711,7 +711,7 @@ body: | ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:vrm8 = INSERT_SUBREG [[DEF]], [[PseudoVLE32_V_M4_]], %subreg.sub_vrm4_0 ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 0, 210 /* e32, m4, ta, ma */, implicit-def $vl, implicit-def $vtype ; CHECK-NEXT: %pt2:vrm8 = IMPLICIT_DEF - ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4 = INIT_UNDEF + ; CHECK-NEXT: [[INIT_UNDEF:%[0-9]+]]:vrm4nov0 = INIT_UNDEF ; CHECK-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:vrm8 = INSERT_SUBREG [[INSERT_SUBREG]], [[INIT_UNDEF]], %subreg.sub_vrm4_1 ; CHECK-NEXT: early-clobber %6:vrm8 = PseudoVRGATHER_VI_M8 %pt2, killed [[INSERT_SUBREG1]], 0, 0, 5 /* e32 */, 0 /* tu, mu */, implicit $vl, implicit $vtype ; CHECK-NEXT: [[ADDI1:%[0-9]+]]:gpr = ADDI $x0, 0 From 19f604edfc015b35999b3b95e94f18389e4b392d Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Wed, 11 Sep 2024 08:54:45 +0100 Subject: [PATCH 064/114] [lldb][test] Add test for printing std::string through expression evaluator This would've caught the failures in https://github.com/llvm/llvm-project/pull/105865 in the libc++ data-formatter CI. --- .../libcxx/string/TestDataFormatterLibcxxString.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py index 98438742a11ca..6b5bcf8a7df2f 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/string/TestDataFormatterLibcxxString.py @@ -100,6 +100,16 @@ def cleanup(): "s", result_type=ns + "::wstring", result_summary='L"hello world! 
מזל טוב!"' ) + self.expect_expr( + "q", result_type=ns + "::string", result_summary='"hello world"' + ) + + self.expect_expr( + "Q", + result_type=ns + "::string", + result_summary='"quite a long std::strin with lots of info inside it"', + ) + self.expect( "frame variable", substrs=[ From 2afe678f0a246387977a8ca694d4489e2c868991 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 11 Sep 2024 10:04:37 +0200 Subject: [PATCH 065/114] [MemCpyOpt] Allow memcpy elision for non-noalias arguments (#107860) We currently elide memcpys for readonly nocapture noalias arguments. noalias is checked to make sure that there are no other ways to write the memory, e.g. through a different argument or an escaped pointer. In addition to the current noalias check, also query alias analysis, in case it can prove that modification is not possible through other means. This fixes the problem reported in https://discourse.llvm.org/t/problem-about-memcpy-elimination/81121. --- .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 19 ++++++++++++++----- llvm/test/Transforms/MemCpyOpt/memcpy.ll | 7 ++----- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 3f15fa2163d27..d81665622809c 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1950,7 +1950,7 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) { /// during call. Try to use memcpy source directly if all of the following /// conditions are satisfied. /// 1. The memcpy dst is neither modified during the call nor captured by the -/// call. (if readonly, noalias, nocapture attributes on call-site.) +/// call. /// 2. The memcpy dst is an alloca with known alignment & size. /// 2-1. The memcpy length == the alloca size which ensures that the new /// pointer is dereferenceable for the required range @@ -1961,12 +1961,22 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) { /// 4. The memcpy src is not modified during the call. (ModRef check shows no /// Mod.) bool MemCpyOptPass::processImmutArgument(CallBase &CB, unsigned ArgNo) { + BatchAAResults BAA(*AA); + Value *ImmutArg = CB.getArgOperand(ArgNo); + // 1. Ensure passed argument is immutable during call. - if (!(CB.paramHasAttr(ArgNo, Attribute::NoAlias) && - CB.paramHasAttr(ArgNo, Attribute::NoCapture))) + if (!CB.paramHasAttr(ArgNo, Attribute::NoCapture)) + return false; + + // We know that the argument is readonly at this point, but the function + // might still modify the same memory through a different pointer. Exclude + // this either via noalias, or alias analysis. + if (!CB.paramHasAttr(ArgNo, Attribute::NoAlias) && + isModSet( + BAA.getModRefInfo(&CB, MemoryLocation::getBeforeOrAfter(ImmutArg)))) return false; + const DataLayout &DL = CB.getDataLayout(); - Value *ImmutArg = CB.getArgOperand(ArgNo); // 2. 
Check that arg is alloca // TODO: Even if the arg gets back to branches, we can remove memcpy if all @@ -1986,7 +1996,6 @@ bool MemCpyOptPass::processImmutArgument(CallBase &CB, unsigned ArgNo) { return false; MemCpyInst *MDep = nullptr; - BatchAAResults BAA(*AA); MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( CallAccess->getDefiningAccess(), Loc, BAA); if (auto *MD = dyn_cast(Clobber)) diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy.ll index a28b0542a7c59..ba260752ce4b5 100644 --- a/llvm/test/Transforms/MemCpyOpt/memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll @@ -472,9 +472,7 @@ define void @immut_param_mayalias(ptr align 4 noalias %val) { ; argument doesn't matter. define void @immut_param_unescaped_alloca(ptr align 4 noalias %val) { ; CHECK-LABEL: @immut_param_unescaped_alloca( -; CHECK-NEXT: [[VAL1:%.*]] = alloca i8, align 4 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VAL1]], ptr align 4 [[VAL:%.*]], i64 1, i1 false) -; CHECK-NEXT: call void @f(ptr nocapture readonly align 4 [[VAL1]]) +; CHECK-NEXT: call void @f(ptr nocapture readonly align 4 [[VAL:%.*]]) ; CHECK-NEXT: ret void ; %val1 = alloca i8, align 4 @@ -489,8 +487,7 @@ define void @immut_param_memory_argmem_read(ptr align 4 noalias %val) { ; CHECK-LABEL: @immut_param_memory_argmem_read( ; CHECK-NEXT: [[VAL1:%.*]] = alloca i8, align 4 ; CHECK-NEXT: call void @f(ptr [[VAL1]]) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VAL1]], ptr align 4 [[VAL:%.*]], i64 1, i1 false) -; CHECK-NEXT: call void @f(ptr nocapture readonly align 4 [[VAL1]]) #[[ATTR6:[0-9]+]] +; CHECK-NEXT: call void @f(ptr nocapture readonly align 4 [[VAL:%.*]]) #[[ATTR6:[0-9]+]] ; CHECK-NEXT: ret void ; %val1 = alloca i8, align 4 From 34cab2ed82a63ecf3d0ebf790def6d21bd4b87af Mon Sep 17 00:00:00 2001 From: Weining Lu Date: Wed, 11 Sep 2024 16:10:38 +0800 Subject: [PATCH 066/114] [Driver][test] Remove useless LoongArch test checks in mcmodel.c --- clang/test/Driver/mcmodel.c | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/test/Driver/mcmodel.c b/clang/test/Driver/mcmodel.c index 9681c32579d71..c6c8b5433d23b 100644 --- a/clang/test/Driver/mcmodel.c +++ b/clang/test/Driver/mcmodel.c @@ -43,5 +43,4 @@ // AARCH64-PIC-LARGE: error: invalid argument '-mcmodel=large' only allowed with '-fno-pic' // ERR-AARCH64_32: error: unsupported argument 'small' to option '-mcmodel=' for target 'aarch64_32-unknown-linux' -// ERR-LOONGARCH64-PLT-LARGE: error: invalid argument '-mcmodel=large' not allowed with '-fplt' // ERR-LOONGARCH64-PLT-EXTREME: error: invalid argument '-mcmodel=extreme' not allowed with '-fplt' From c9aa55da62b2a9e482c1877897152fb3c47719d2 Mon Sep 17 00:00:00 2001 From: Kunwar Grover Date: Wed, 11 Sep 2024 09:30:05 +0100 Subject: [PATCH 067/114] [mlir][Linalg] Add speculation for LinalgStructuredOps (#108032) This patch adds speculation behavior for linalg structured ops, allowing them to be hoisted out of loops using LICM. 
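
For illustration, a minimal sketch of the effect (the op, shapes, and SSA
names below are hypothetical, not taken from this patch's tests): after this
change, -loop-invariant-code-motion can hoist a loop-invariant structured op
whose operands all have tensor (value) semantics.

  // %a, %b, %c, %init and the loop bounds are assumed to be defined above
  // the loop, so the matmul is loop-invariant.
  %r = scf.for %i = %lb to %ub step %c1 iter_args(%acc = %init) -> (tensor<8x8xf32>) {
    // Speculatable after this patch (pure tensor semantics), so LICM can
    // move the matmul above the scf.for.
    %mm = linalg.matmul ins(%a, %b : tensor<8x8xf32>, tensor<8x8xf32>)
                        outs(%c : tensor<8x8xf32>) -> tensor<8x8xf32>
    %next = linalg.add ins(%acc, %mm : tensor<8x8xf32>, tensor<8x8xf32>)
                       outs(%acc : tensor<8x8xf32>) -> tensor<8x8xf32>
    scf.yield %next : tensor<8x8xf32>
  }

The same op with memref operands stays inside the loop: speculating an op
that writes to memory past the loop guard could introduce a write that never
happened in the original program.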
---
 .../Dialect/Linalg/IR/LinalgStructuredOps.td  |  1 +
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp      | 31 +++++++
 .../loop-invariant-code-motion.mlir           | 91 +++++++++++++++++++
 .../mlir-linalg-ods-yaml-gen.cpp              |  5 +-
 4 files changed, 127 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
index ac61117c3d6e3..31f2913924726 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
@@ -29,6 +29,7 @@ class LinalgStructuredBase_Op<string mnemonic, list<Trait> props>
   : Op<Linalg_Dialect, mnemonic, !listconcat([
        SingleBlockImplicitTerminator<"YieldOp">,
        DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+       DeclareOpInterfaceMethods<ConditionallySpeculatable>,
        DestinationStyleOpInterface,
        LinalgStructuredInterface,
        ReifyRankedShapedTypeOpInterface], props)> {
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 76df3ecf2d2bd..630985d76a0eb 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -34,6 +34,7 @@
 #include "mlir/IR/OperationSupport.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Interfaces/InferTypeOpInterface.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallSet.h"
@@ -1202,6 +1203,20 @@ void GenericOp::getEffects(
   getGenericEffectsImpl(effects, cast<LinalgOp>(getOperation()));
 }
 
+static Speculation::Speculatability
+getGenericSpeculatabilityImpl(LinalgOp linalgOp) {
+  // Operands with value semantics are speculatable, while operands with memory
+  // semantics are not.
+  if (!linalgOp.hasPureTensorSemantics())
+    return Speculation::NotSpeculatable;
+  // The body of the op can still have speculation in its region.
+  return Speculation::RecursivelySpeculatable;
+}
+
+Speculation::Speculatability GenericOp::getSpeculatability() {
+  return getGenericSpeculatabilityImpl(cast<LinalgOp>(getOperation()));
+}
+
 LogicalResult GenericOp::verify() { return success(); }
 
 namespace {
@@ -1553,6 +1568,10 @@ void MapOp::getEffects(
   getGenericEffectsImpl(effects, cast<LinalgOp>(getOperation()));
 }
 
+Speculation::Speculatability MapOp::getSpeculatability() {
+  return getGenericSpeculatabilityImpl(cast<LinalgOp>(getOperation()));
+}
+
 //===----------------------------------------------------------------------===//
 // ReduceOp
 //===----------------------------------------------------------------------===//
@@ -1621,6 +1640,10 @@ void ReduceOp::getEffects(
   getGenericEffectsImpl(effects, cast<LinalgOp>(getOperation()));
 }
 
+Speculation::Speculatability ReduceOp::getSpeculatability() {
+  return getGenericSpeculatabilityImpl(cast<LinalgOp>(getOperation()));
+}
+
 static ParseResult parseDenseI64ArrayAttr(OpAsmParser &parser,
                                           NamedAttrList &attributes,
                                           StringRef attributeName) {
@@ -1906,6 +1929,10 @@ void TransposeOp::getEffects(
   getGenericEffectsImpl(effects, cast<LinalgOp>(getOperation()));
 }
 
+Speculation::Speculatability TransposeOp::getSpeculatability() {
+  return getGenericSpeculatabilityImpl(cast<LinalgOp>(getOperation()));
+}
+
 LogicalResult TransposeOp::fold(FoldAdaptor adaptor,
                                 SmallVectorImpl<OpFoldResult> &result) {
   // Only the tensor type is supported.
@@ -2134,6 +2161,10 @@ void BroadcastOp::getEffects(
   getGenericEffectsImpl(effects, cast<LinalgOp>(getOperation()));
 }
 
+Speculation::Speculatability BroadcastOp::getSpeculatability() {
+  return getGenericSpeculatabilityImpl(cast<LinalgOp>(getOperation()));
+}
+
 void BroadcastOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                               MLIRContext *context) {
   results.add<EraseIdentityLinalgOp<BroadcastOp>>(context);
diff --git a/mlir/test/Transforms/loop-invariant-code-motion.mlir b/mlir/test/Transforms/loop-invariant-code-motion.mlir
index 47a49465e8a7c..57f4ece9c9f2a 100644
--- a/mlir/test/Transforms/loop-invariant-code-motion.mlir
+++ b/mlir/test/Transforms/loop-invariant-code-motion.mlir
@@ -1118,3 +1118,94 @@ func.func @hoist_from_scf_while(%arg0: i32, %arg1: i32) -> i32 {
   }
   return %0 : i32
 }
+
+// -----
+
+#trait = {
+  indexing_maps = [
+    affine_map<(m, n, k) -> (m, k)>,
+    affine_map<(m, n, k) -> (k, n)>,
+    affine_map<(m, n, k) -> (m, n)>
+  ],
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+// CHECK-LABEL: func @hoist_linalg_ops
+// CHECK: linalg.generic
+// CHECK: scf.for
+// CHECK-NOT: linalg.generic
+// CHECK: tensor.insert_slice
+// CHECK: scf.yield
+func.func @hoist_linalg_ops(%a : tensor<128x128xf32>,
+                            %b : tensor<128x128xf32>,
+                            %c: tensor<128x128xf32>,
+                            %lb : index,
+                            %ub : index,
+                            %step : index,
+                            %output : tensor<?x128xf32>) -> tensor<?x128xf32> {
+  %final =
+  scf.for %i = %lb to %ub step %step iter_args(%acc = %output)
+      -> tensor<?x128xf32> {
+    %compute = linalg.generic #trait
+               ins(%a, %b : tensor<128x128xf32>, tensor<128x128xf32>)
+               outs(%c : tensor<128x128xf32>) {
+    ^bb0(%in : f32, %in2 : f32, %in3 : f32):
+      %mul = arith.mulf %in, %in2 : f32
+      %add = arith.addf %mul, %in3 : f32
+      linalg.yield %in3 : f32
+    } -> tensor<128x128xf32>
+
+    %newacc = tensor.insert_slice %compute into
+              %output[%i, 0][128, 128][1, 1]
+              : tensor<128x128xf32> into tensor<?x128xf32>
+    scf.yield %newacc : tensor<?x128xf32>
+  }
+
+  func.return %final : tensor<?x128xf32>
+}
+
+// -----
+
+#trait = {
+  indexing_maps = [
+    affine_map<(m, n, k) -> (m, k)>,
+    affine_map<(m, n, k) -> (k, n)>,
+    affine_map<(m, n, k) -> (m, n)>
+  ],
+  iterator_types = ["parallel", "parallel", "reduction"]
+}
+
+// CHECK-LABEL: func @hoist_linalg_ops_div_by_zero
+// CHECK-NOT: linalg.generic
+// CHECK: scf.for
+// CHECK: linalg.generic
+// CHECK: tensor.insert_slice
+// CHECK: scf.yield
+func.func @hoist_linalg_ops_div_by_zero(%a : tensor<128x128xi32>,
+                                        %b : tensor<128x128xi32>,
+                                        %c: tensor<128x128xi32>,
+                                        %lb : index,
+                                        %ub : index,
+                                        %step : index,
+                                        %output : tensor<?x128xi32>) -> tensor<?x128xi32> {
+  %cst0 = arith.constant 0 : i32
+  %final =
+  scf.for %i = %lb to %ub step %step iter_args(%acc = %output)
+      -> tensor<?x128xi32> {
+    %compute = linalg.generic #trait
+               ins(%a, %b : tensor<128x128xi32>, tensor<128x128xi32>)
+               outs(%c : tensor<128x128xi32>) {
+    ^bb0(%in : i32, %in2 : i32, %in3 : i32):
+      %div = arith.divui %in, %in2 : i32
+      %add = arith.addi %div, %in3 : i32
+      linalg.yield %in3 : i32
+    } -> tensor<128x128xi32>
+
+    %newacc = tensor.insert_slice %compute into
+              %output[%i, 0][128, 128][1, 1]
+              : tensor<128x128xi32> into tensor<?x128xi32>
+    scf.yield %newacc : tensor<?x128xi32>
+  }
+
+  func.return %final : tensor<?x128xi32>
+}
diff --git a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp
index a00f12661f712..7d42c03469dc9 100644
--- a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp
+++ b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp
@@ -656,7 +656,7 @@ ArrayAttr {0}::getIndexingMaps() {{
 }
 )FMT";
 
-// Implementations of fold and getEffects.
+// Implementations of fold, getEffects and getSpeculatability.
 // Parameters:
 // {0}: Class name
 const char structuredOpFoldersFormat[] = R"FMT(
@@ -669,6 +669,9 @@ void {0}::getEffects(SmallVectorImpl<
   if (hasPureTensorSemantics()) return;
   getGenericEffectsImpl(effects, cast<LinalgOp>(getOperation()));
 }
+Speculation::Speculatability {0}::getSpeculatability() {{
+  return getGenericSpeculatabilityImpl(cast<LinalgOp>(getOperation()));
+}
 )FMT";
 
 // Implementation of parse/print.

From db64e69fa250ea3a8d7a761220a7922fbdad0f2c Mon Sep 17 00:00:00 2001
From: Abid Qadeer
Date: Wed, 11 Sep 2024 09:31:53 +0100
Subject: [PATCH 068/114] [flang][debug] Handle 'used' module. (#107626)

As described in #98883, we have to qualify a module variable name in the
debugger to get its value. This PR removes that limitation.

LLVM provides `DIImportedEntity` to handle such cases, but the PR is made
more complicated by the following two issues.

1. The MLIR attributes are readonly and we have a circular dependency here.
This has to be handled using the recursive interface provided by MLIR. This
requires us to first create a placeholder `DISubprogramAttr` which is used
in creating the `DIImportedEntityAttr`. Later another `DISubprogramAttr` is
created which replaces the placeholder.

2. The flang IR does not provide any information about the 'used' module, so
this has to be extracted by doing a pass over the `DeclareOp`s in the
function. This presents certain limitations, as 'only' clauses and module
variable renaming may not be handled properly.

Due to the change in `DISubprogramAttr`, some tests also needed to be
adjusted.

Fixes #98883.
---
 .../lib/Optimizer/Transforms/AddDebugInfo.cpp | 145 ++++++++++++++----
 flang/test/Integration/debug-module-2.f90     |   2 +-
 flang/test/Transforms/debug-90683.fir         |   2 +-
 flang/test/Transforms/debug-fn-info.fir       |   6 +-
 .../test/Transforms/debug-imported-entity.fir |  30 ++++
 .../Transforms/debug-line-table-inc-file.fir  |   4 +-
 .../debug-local-global-storage-1.fir          |   2 +-
 7 files changed, 150 insertions(+), 41 deletions(-)

diff --git a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
index 576e65ba6ecc5..46e70d7ef9180 100644
--- a/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
+++ b/flang/lib/Optimizer/Transforms/AddDebugInfo.cpp
@@ -72,6 +72,10 @@ class AddDebugInfoPass : public fir::impl::AddDebugInfoBase<AddDebugInfoPass> {
                     mlir::LLVM::DICompileUnitAttr cuAttr,
                     fir::DebugTypeGenerator &typeGen,
                     mlir::SymbolTable *symbolTable);
+  std::optional<mlir::LLVM::DIModuleAttr>
+  getModuleAttrFromGlobalOp(fir::GlobalOp globalOp,
+                            mlir::LLVM::DIFileAttr fileAttr,
+                            mlir::LLVM::DIScopeAttr scope);
 };
 
 bool debugInfoIsAlreadySet(mlir::Location loc) {
@@ -152,6 +156,45 @@ mlir::LLVM::DIModuleAttr AddDebugInfoPass::getOrCreateModuleAttr(
   return modAttr;
 }
 
+/// If globalOp represents a module variable, return a ModuleAttr that
+/// represents that module.
+std::optional<mlir::LLVM::DIModuleAttr>
+AddDebugInfoPass::getModuleAttrFromGlobalOp(fir::GlobalOp globalOp,
+                                            mlir::LLVM::DIFileAttr fileAttr,
+                                            mlir::LLVM::DIScopeAttr scope) {
+  mlir::MLIRContext *context = &getContext();
+  mlir::OpBuilder builder(context);
+
+  std::pair<fir::NameUniquer::NameKind, fir::NameUniquer::DeconstructedName>
+      result = fir::NameUniquer::deconstruct(globalOp.getSymName());
+  // Only look for a module if this variable is not part of a function.
+  if (!result.second.procs.empty() || result.second.modules.empty())
+    return std::nullopt;
+
+  // DWARF5 says the following about Fortran modules:
+  // A Fortran 90 module may also be represented by a module entry
+  // (but no declaration attribute is warranted because Fortran has no concept
+  // of a corresponding module body).
+  // But in practice, compilers use the declaration attribute with a module in
+  // cases where the module was defined in another source file (and is only
+  // being used in this one). The isInitialized() flag seems to provide the
+  // right information, but inverted. It is true where the module is actually
+  // defined but false where it is used.
+  // FIXME: Currently we don't have the line number on which a module was
+  // declared. We are using a best guess of line - 1 where line is the source
+  // line of the first member of the module that we encounter.
+  unsigned line = getLineFromLoc(globalOp.getLoc());
+
+  mlir::LLVM::DISubprogramAttr sp =
+      mlir::dyn_cast_if_present<mlir::LLVM::DISubprogramAttr>(scope);
+  // Modules are generated at compile unit scope
+  if (sp)
+    scope = sp.getCompileUnit();
+
+  return getOrCreateModuleAttr(result.second.modules[0], fileAttr, scope,
+                               std::max(line - 1, (unsigned)1),
+                               !globalOp.isInitialized());
+}
+
 void AddDebugInfoPass::handleGlobalOp(fir::GlobalOp globalOp,
                                       mlir::LLVM::DIFileAttr fileAttr,
                                       mlir::LLVM::DIScopeAttr scope,
@@ -174,33 +217,11 @@ void AddDebugInfoPass::handleGlobalOp(fir::GlobalOp globalOp,
     return;
 
   unsigned line = getLineFromLoc(globalOp.getLoc());
+  std::optional<mlir::LLVM::DIModuleAttr> modOpt =
+      getModuleAttrFromGlobalOp(globalOp, fileAttr, scope);
+  if (modOpt)
+    scope = *modOpt;
 
-  // DWARF5 says following about the fortran modules:
-  // A Fortran 90 module may also be represented by a module entry
-  // (but no declaration attribute is warranted because Fortran has no concept
-  // of a corresponding module body).
-  // But in practice, compilers use declaration attribute with a module in cases
-  // where module was defined in another source file (only being used in this
-  // one). The isInitialized() seems to provide the right information
-  // but inverted. It is true where module is actually defined but false where
-  // it is used.
-  // FIXME: Currently we don't have the line number on which a module was
-  // declared. We are using a best guess of line - 1 where line is the source
-  // line of the first member of the module that we encounter.
-
-  if (result.second.procs.empty()) {
-    // Only look for module if this variable is not part of a function.
-    if (result.second.modules.empty())
-      return;
-
-    // Modules are generated at compile unit scope
-    if (mlir::LLVM::DISubprogramAttr sp =
-            mlir::dyn_cast_if_present<mlir::LLVM::DISubprogramAttr>(scope))
-      scope = sp.getCompileUnit();
-
-    scope = getOrCreateModuleAttr(result.second.modules[0], fileAttr, scope,
-                                  line - 1, !globalOp.isInitialized());
-  }
   mlir::LLVM::DITypeAttr diType =
       typeGen.convertType(globalOp.getType(), fileAttr, scope, declOp);
   auto gvAttr = mlir::LLVM::DIGlobalVariableAttr::get(
@@ -262,7 +283,7 @@ void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp,
       mlir::LLVM::DIFileAttr::get(context, fileName, filePath);
 
   // Only definitions need a distinct identifier and a compilation unit.
-  mlir::DistinctAttr id;
+  mlir::DistinctAttr id, id2;
   mlir::LLVM::DIScopeAttr Scope = fileAttr;
   mlir::LLVM::DICompileUnitAttr compilationUnit;
   mlir::LLVM::DISubprogramFlags subprogramFlags =
@@ -270,7 +291,10 @@ void AddDebugInfoPass::handleFuncOp(mlir::func::FuncOp funcOp,
   if (isOptimized)
     subprogramFlags = mlir::LLVM::DISubprogramFlags::Optimized;
   if (!funcOp.isExternal()) {
+    // The placeholder and the final function have to have different IDs,
+    // otherwise the translation code will reject one of them.
    id = mlir::DistinctAttr::create(mlir::UnitAttr::get(context));
+    id2 = mlir::DistinctAttr::create(mlir::UnitAttr::get(context));
    compilationUnit = cuAttr;
    subprogramFlags =
        subprogramFlags | mlir::LLVM::DISubprogramFlags::Definition;
@@ -299,14 +323,69 @@
                                         line - 1, false);
   }
 
-  auto spAttr = mlir::LLVM::DISubprogramAttr::get(
-      context, id, compilationUnit, Scope, funcName, fullName, funcFileAttr,
-      line, line, subprogramFlags, subTypeAttr, /*retainedNodes=*/{});
-  funcOp->setLoc(builder.getFusedLoc({funcOp->getLoc()}, spAttr));
-
-  // Don't process variables if user asked for line tables only.
-  if (debugLevel == mlir::LLVM::DIEmissionKind::LineTablesOnly)
+  if (debugLevel == mlir::LLVM::DIEmissionKind::LineTablesOnly) {
+    auto spAttr = mlir::LLVM::DISubprogramAttr::get(
+        context, id, compilationUnit, Scope, funcName, fullName, funcFileAttr,
+        line, line, subprogramFlags, subTypeAttr, /*retainedNodes=*/{});
+    funcOp->setLoc(builder.getFusedLoc({l}, spAttr));
     return;
+  }
+
+  mlir::DistinctAttr recId =
+      mlir::DistinctAttr::create(mlir::UnitAttr::get(context));
+
+  // The debug attributes in MLIR are readonly once created. But in the case of
+  // imported entities, we have a circular dependency. The
+  // DIImportedEntityAttr requires scope information (DISubprogramAttr in this
+  // case) and DISubprogramAttr requires the list of imported entities. The
+  // MLIR provides a way where a DISubprogramAttr can be created with a certain
+  // recID and be used in places like DIImportedEntityAttr. After that another
+  // DISubprogramAttr can be created with the same recID but with the list of
+  // entities now available. The MLIR translation code takes care of updating
+  // the references. Note that references will be updated only in the things
+  // that are part of DISubprogramAttr (like DIImportedEntityAttr), so we have
+  // to create the final DISubprogramAttr before we process local variables.
+  // Look at DIRecursiveTypeAttrInterface for more details.
+
+  auto spAttr = mlir::LLVM::DISubprogramAttr::get(
+      context, recId, /*isRecSelf=*/true, id, compilationUnit, Scope, funcName,
+      fullName, funcFileAttr, line, line, subprogramFlags, subTypeAttr,
+      /*retainedNodes=*/{});
+
+  // There is no direct information in the IR for any 'use' statement in the
+  // function. We have to extract that information from the DeclareOp. We do
+  // a pass on the DeclareOp and generate ModuleAttr and corresponding
+  // DIImportedEntityAttr for that module.
+  // FIXME: As we are depending on the variables to see which module is being
+  // 'used' in the function, there are certain limitations.
+  // For things like 'use mod1, only: v1', the whole module will be brought
+  // into the namespace in the debug info. It is not a problem as such unless
+  // there is a clash of names.
+ // There is no information about module variable renaming + llvm::DenseSet importedModules; + funcOp.walk([&](fir::cg::XDeclareOp declOp) { + if (&funcOp.front() == declOp->getBlock()) + if (auto global = + symbolTable->lookup(declOp.getUniqName())) { + std::optional modOpt = + getModuleAttrFromGlobalOp(global, fileAttr, cuAttr); + if (modOpt) { + auto importedEntity = mlir::LLVM::DIImportedEntityAttr::get( + context, llvm::dwarf::DW_TAG_imported_module, spAttr, *modOpt, + fileAttr, /*line=*/1, /*name=*/nullptr, /*elements*/ {}); + importedModules.insert(importedEntity); + } + } + }); + llvm::SmallVector entities(importedModules.begin(), + importedModules.end()); + // We have the imported entities now. Generate the final DISubprogramAttr. + spAttr = mlir::LLVM::DISubprogramAttr::get( + context, recId, /*isRecSelf=*/false, id2, compilationUnit, Scope, + funcName, fullName, funcFileAttr, line, line, subprogramFlags, + subTypeAttr, entities); + funcOp->setLoc(builder.getFusedLoc({l}, spAttr)); funcOp.walk([&](fir::cg::XDeclareOp declOp) { // FIXME: We currently dont handle variables that are not in the entry diff --git a/flang/test/Integration/debug-module-2.f90 b/flang/test/Integration/debug-module-2.f90 index 60fccaa2a6c1f..f07416c3ef3cc 100644 --- a/flang/test/Integration/debug-module-2.f90 +++ b/flang/test/Integration/debug-module-2.f90 @@ -17,7 +17,7 @@ module helper integer gli contains -!CHECK-DAG: !DISubprogram(name: "test", linkageName: "_QMhelperPtest", scope: ![[MOD]], file: ![[FILE2]], line: [[@LINE+1]]{{.*}}unit: ![[CU]]) +!CHECK-DAG: !DISubprogram(name: "test", linkageName: "_QMhelperPtest", scope: ![[MOD]], file: ![[FILE2]], line: [[@LINE+1]]{{.*}}unit: ![[CU]]{{.*}}) subroutine test() glr = 12.34 gli = 67 diff --git a/flang/test/Transforms/debug-90683.fir b/flang/test/Transforms/debug-90683.fir index cc6929c10411f..a21332e3968a7 100644 --- a/flang/test/Transforms/debug-90683.fir +++ b/flang/test/Transforms/debug-90683.fir @@ -22,4 +22,4 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<>} { // CHECK-DAG: #[[TY:.*]] = #llvm.di_basic_type // CHECK-DAG: #[[TY1:.*]] = #llvm.di_subroutine_type -// CHECK-DAG: #{{.*}} = #llvm.di_subprogram +// CHECK-DAG: #{{.*}} = #llvm.di_subprogram<{{.*}}name = "cabs", linkageName = "cabs"{{.*}}, type = #[[TY1]]> diff --git a/flang/test/Transforms/debug-fn-info.fir b/flang/test/Transforms/debug-fn-info.fir index f456e35d3dd70..5433e088a648d 100644 --- a/flang/test/Transforms/debug-fn-info.fir +++ b/flang/test/Transforms/debug-fn-info.fir @@ -69,7 +69,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<>} { // CHECK: #[[TY2:.*]] = #llvm.di_subroutine_type // Line numbers should match the number in corresponding loc entry. 
-// CHECK: #llvm.di_subprogram -// CHECK: #llvm.di_subprogram -// CHECK: #llvm.di_subprogram +// CHECK: #llvm.di_subprogram<{{.*}}name = "_QQmain", linkageName = "_QQmain", file = {{.*}}, line = 15, scopeLine = 15, subprogramFlags = Definition, type = #[[TY0]]> +// CHECK: #llvm.di_subprogram<{{.*}}name = "fn1", linkageName = "_QFPfn1", file = {{.*}}, line = 26, scopeLine = 26, subprogramFlags = Definition, type = #[[TY1]]> +// CHECK: #llvm.di_subprogram<{{.*}}name = "fn2", linkageName = "_QFPfn2", file = {{.*}}, line = 43, scopeLine = 43, subprogramFlags = Definition, type = #[[TY2]]> diff --git a/flang/test/Transforms/debug-imported-entity.fir b/flang/test/Transforms/debug-imported-entity.fir new file mode 100644 index 0000000000000..7be6531a703a8 --- /dev/null +++ b/flang/test/Transforms/debug-imported-entity.fir @@ -0,0 +1,30 @@ +// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s + + +module attributes {dlti.dl_spec = #dlti.dl_spec<>} { + fir.global @_QMfooEv1 : i32 { + %0 = fir.zero_bits i32 + fir.has_value %0 : i32 + } + fir.global internal @_QFtestExyz : i32 { + %c12_i32 = arith.constant 12 : i32 + fir.has_value %c12_i32 : i32 + } loc(#loc4) + func.func @test() attributes {fir.bindc_name = "test"} { + %0 = fir.address_of(@_QMfooEv1) : !fir.ref + %1 = fircg.ext_declare %0 {uniq_name = "_QMfooEv1"} : (!fir.ref) -> !fir.ref loc(#loc1) + %4 = fir.address_of(@_QFtestExyz) : !fir.ref + %5 = fircg.ext_declare %4 {uniq_name = "_QFtestExyz"} : (!fir.ref) -> !fir.ref loc(#loc4) + return + } loc(#loc3) +} +#loc1 = loc("test.f90":2:14) +#loc2 = loc("test.f90":6:1) +#loc3 = loc("test.f90":10:1) +#loc4 = loc("test.f90":13:1) + +// CHECK: #[[MOD:.+]] = #llvm.di_module<{{.*}}name = "foo"{{.*}}> +// CHECK: #[[SP_REC:.+]] = #llvm.di_subprogram, isRecSelf = true{{.*}}> +// CHECK: #[[IMP_ENTITY:.+]] = #llvm.di_imported_entity +// CHECK: #[[SP:.+]] = #llvm.di_subprogram{{.*}}retainedNodes = #[[IMP_ENTITY]]> +// CHECK: #llvm.di_global_variable diff --git a/flang/test/Transforms/debug-line-table-inc-file.fir b/flang/test/Transforms/debug-line-table-inc-file.fir index 065039b59c5ae..216cd5e016f2f 100644 --- a/flang/test/Transforms/debug-line-table-inc-file.fir +++ b/flang/test/Transforms/debug-line-table-inc-file.fir @@ -31,7 +31,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<>} { // CHECK: #[[LOC_INC_FILE:.*]] = loc("{{.*}}inc.f90":1:1) // CHECK: #[[LOC_FILE:.*]] = loc("{{.*}}simple.f90":3:1) // CHECK: #[[DI_CU:.*]] = #llvm.di_compile_unit, sourceLanguage = DW_LANG_Fortran95, file = #[[DI_FILE]], producer = "{{.*}}flang{{.*}}", isOptimized = false, emissionKind = LineTablesOnly> -// CHECK: #[[DI_SP_INC:.*]] = #llvm.di_subprogram, compileUnit = #[[DI_CU]], scope = #[[DI_FILE]], name = "sinc", linkageName = "_QPsinc", file = #[[DI_INC_FILE]], {{.*}}> -// CHECK: #[[DI_SP:.*]] = #llvm.di_subprogram, compileUnit = #[[DI_CU]], scope = #[[DI_FILE]], name = "_QQmain", linkageName = "_QQmain", file = #[[DI_FILE]], {{.*}}> +// CHECK: #[[DI_SP_INC:.*]] = #llvm.di_subprogram<{{.*}}id = distinct[{{.*}}]<>, compileUnit = #[[DI_CU]], scope = #[[DI_FILE]], name = "sinc", linkageName = "_QPsinc", file = #[[DI_INC_FILE]], {{.*}}> +// CHECK: #[[DI_SP:.*]] = #llvm.di_subprogram<{{.*}}id = distinct[{{.*}}]<>, compileUnit = #[[DI_CU]], scope = #[[DI_FILE]], name = "_QQmain", linkageName = "_QQmain", file = #[[DI_FILE]], {{.*}}> // CHECK: #[[FUSED_LOC_INC_FILE]] = loc(fused<#[[DI_SP_INC]]>[#[[LOC_INC_FILE]]]) // CHECK: #[[FUSED_LOC_FILE]] = loc(fused<#[[DI_SP]]>[#[[LOC_FILE]]]) diff --git 
a/flang/test/Transforms/debug-local-global-storage-1.fir b/flang/test/Transforms/debug-local-global-storage-1.fir index d9d8083a14709..83a9055a6b8dc 100644 --- a/flang/test/Transforms/debug-local-global-storage-1.fir +++ b/flang/test/Transforms/debug-local-global-storage-1.fir @@ -45,7 +45,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : // CHECK-DAG: #[[CU:.*]] = #llvm.di_compile_unit<{{.*}}> // CHECK-DAG: #[[MOD:.*]] = #llvm.di_module<{{.*}}scope = #[[CU]]{{.*}}name = "example"{{.*}}> // CHECK-DAG: #[[SP:.*]] = #llvm.di_subprogram<{{.*}}name = "_QQmain"{{.*}}> -// CHECK-DAG: #[[MOD_SP:.*]] = #llvm.di_subprogram<{{.*}}name = "mod_sub"{{.*}}> +// CHECK-DAG: #[[MOD_SP:.*]] = #llvm.di_subprogram<{{.*}}name = "mod_sub"{{.*}}retainedNodes = {{.*}}> // CHECK-DAG: #llvm.di_global_variable // CHECK-DAG: #llvm.di_global_variable // CHECK-DAG: #llvm.di_global_variable From 300161761df54f5f85630a8ad0e170d09d119ee3 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 11 Sep 2024 09:34:15 +0100 Subject: [PATCH 069/114] [AArch64] Add tests for scalar_to_vector(load) and extend load into zero tests. NFC --- .../test/CodeGen/AArch64/load-insert-undef.ll | 1098 +++++++++++++++++ llvm/test/CodeGen/AArch64/load-insert-zero.ll | 323 ++++- 2 files changed, 1420 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AArch64/load-insert-undef.ll diff --git a/llvm/test/CodeGen/AArch64/load-insert-undef.ll b/llvm/test/CodeGen/AArch64/load-insert-undef.ll new file mode 100644 index 0000000000000..1e776d1c06fcb --- /dev/null +++ b/llvm/test/CodeGen/AArch64/load-insert-undef.ll @@ -0,0 +1,1098 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+bf16,+sve | FileCheck %s + +define <8 x i8> @loadv8i8(ptr %p) { +; CHECK-LABEL: loadv8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0] +; CHECK-NEXT: ret + %l = load i8, ptr %p + %v = insertelement <8 x i8> poison, i8 %l, i32 0 + ret <8 x i8> %v +} + +define <16 x i8> @loadv16i8(ptr %p) { +; CHECK-LABEL: loadv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0] +; CHECK-NEXT: ret + %l = load i8, ptr %p + %v = insertelement <16 x i8> poison, i8 %l, i32 0 + ret <16 x i8> %v +} + +define <4 x i16> @loadv4i16(ptr %p) { +; CHECK-LABEL: loadv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret + %l = load i16, ptr %p + %v = insertelement <4 x i16> poison, i16 %l, i32 0 + ret <4 x i16> %v +} + +define <8 x i16> @loadv8i16(ptr %p) { +; CHECK-LABEL: loadv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret + %l = load i16, ptr %p + %v = insertelement <8 x i16> poison, i16 %l, i32 0 + ret <8 x i16> %v +} + +define <2 x i32> @loadv2i32(ptr %p) { +; CHECK-LABEL: loadv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ret + %l = load i32, ptr %p + %v = insertelement <2 x i32> poison, i32 %l, i32 0 + ret <2 x i32> %v +} + +define <4 x i32> @loadv4i32(ptr %p) { +; CHECK-LABEL: loadv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ret + %l = load i32, ptr %p + %v = insertelement <4 x i32> poison, i32 %l, i32 0 + ret <4 x i32> %v +} + +define <2 x i64> @loadv2i64(ptr %p) { +; CHECK-LABEL: loadv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %l = load i64, ptr %p + %v = insertelement <2 x i64> poison, i64 %l, i32 0 + ret <2 x i64> %v +} + + +define <4 x half> @loadv4f16(ptr %p) { +; CHECK-LABEL: loadv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret + 
%l = load half, ptr %p + %v = insertelement <4 x half> poison, half %l, i32 0 + ret <4 x half> %v +} + +define <8 x half> @loadv8f16(ptr %p) { +; CHECK-LABEL: loadv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret + %l = load half, ptr %p + %v = insertelement <8 x half> poison, half %l, i32 0 + ret <8 x half> %v +} + +define <4 x bfloat> @loadv4bf16(ptr %p) { +; CHECK-LABEL: loadv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret + %l = load bfloat, ptr %p + %v = insertelement <4 x bfloat> poison, bfloat %l, i32 0 + ret <4 x bfloat> %v +} + +define <8 x bfloat> @loadv8bf16(ptr %p) { +; CHECK-LABEL: loadv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret + %l = load bfloat, ptr %p + %v = insertelement <8 x bfloat> poison, bfloat %l, i32 0 + ret <8 x bfloat> %v +} + +define <2 x float> @loadv2f32(ptr %p) { +; CHECK-LABEL: loadv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ret + %l = load float, ptr %p + %v = insertelement <2 x float> poison, float %l, i32 0 + ret <2 x float> %v +} + +define <4 x float> @loadv4f32(ptr %p) { +; CHECK-LABEL: loadv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ret + %l = load float, ptr %p + %v = insertelement <4 x float> poison, float %l, i32 0 + ret <4 x float> %v +} + +define <2 x double> @loadv2f64(ptr %p) { +; CHECK-LABEL: loadv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %l = load double, ptr %p + %v = insertelement <2 x double> poison, double %l, i32 0 + ret <2 x double> %v +} + + +; Unscaled + +define <8 x i8> @loadv8i8_offset(ptr %p) { +; CHECK-LABEL: loadv8i8_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i8, ptr %g + %v = insertelement <8 x i8> poison, i8 %l, i32 0 + ret <8 x i8> %v +} + +define <16 x i8> @loadv16i8_offset(ptr %p) { +; CHECK-LABEL: loadv16i8_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i8, ptr %g + %v = insertelement <16 x i8> poison, i8 %l, i32 0 + ret <16 x i8> %v +} + +define <4 x i16> @loadv4i16_offset(ptr %p) { +; CHECK-LABEL: loadv4i16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i16, ptr %g + %v = insertelement <4 x i16> poison, i16 %l, i32 0 + ret <4 x i16> %v +} + +define <8 x i16> @loadv8i16_offset(ptr %p) { +; CHECK-LABEL: loadv8i16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i16, ptr %g + %v = insertelement <8 x i16> poison, i16 %l, i32 0 + ret <8 x i16> %v +} + +define <2 x i32> @loadv2i32_offset(ptr %p) { +; CHECK-LABEL: loadv2i32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i32, ptr %g + %v = insertelement <2 x i32> poison, i32 %l, i32 0 + ret <2 x i32> %v +} + +define <4 x i32> @loadv4i32_offset(ptr %p) { +; CHECK-LABEL: loadv4i32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i32, ptr %g + %v = insertelement <4 x i32> poison, i32 %l, i32 0 + ret <4 x i32> %v +} + +define <2 x i64> @loadv2i64_offset(ptr %p) { +; 
CHECK-LABEL: loadv2i64_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur x8, [x0, #1] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i64, ptr %g + %v = insertelement <2 x i64> poison, i64 %l, i32 0 + ret <2 x i64> %v +} + + +define <4 x half> @loadv4f16_offset(ptr %p) { +; CHECK-LABEL: loadv4f16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load half, ptr %g + %v = insertelement <4 x half> poison, half %l, i32 0 + ret <4 x half> %v +} + +define <8 x half> @loadv8f16_offset(ptr %p) { +; CHECK-LABEL: loadv8f16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load half, ptr %g + %v = insertelement <8 x half> poison, half %l, i32 0 + ret <8 x half> %v +} + +define <4 x bfloat> @loadv4bf16_offset(ptr %p) { +; CHECK-LABEL: loadv4bf16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load bfloat, ptr %g + %v = insertelement <4 x bfloat> poison, bfloat %l, i32 0 + ret <4 x bfloat> %v +} + +define <8 x bfloat> @loadv8bf16_offset(ptr %p) { +; CHECK-LABEL: loadv8bf16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load bfloat, ptr %g + %v = insertelement <8 x bfloat> poison, bfloat %l, i32 0 + ret <8 x bfloat> %v +} + +define <2 x float> @loadv2f32_offset(ptr %p) { +; CHECK-LABEL: loadv2f32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur s0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load float, ptr %g + %v = insertelement <2 x float> poison, float %l, i32 0 + ret <2 x float> %v +} + +define <4 x float> @loadv4f32_offset(ptr %p) { +; CHECK-LABEL: loadv4f32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur s0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load float, ptr %g + %v = insertelement <4 x float> poison, float %l, i32 0 + ret <4 x float> %v +} + +define <2 x double> @loadv2f64_offset(ptr %p) { +; CHECK-LABEL: loadv2f64_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load double, ptr %g + %v = insertelement <2 x double> poison, double %l, i32 0 + ret <2 x double> %v +} + + +define <8 x i8> @loadv8i8_noffset(ptr %p) { +; CHECK-LABEL: loadv8i8_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurb w8, [x0, #-1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i8, ptr %g + %v = insertelement <8 x i8> poison, i8 %l, i32 0 + ret <8 x i8> %v +} + +define <16 x i8> @loadv16i8_noffset(ptr %p) { +; CHECK-LABEL: loadv16i8_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurb w8, [x0, #-1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i8, ptr %g + %v = insertelement <16 x i8> poison, i8 %l, i32 0 + ret <16 x i8> %v +} + +define <4 x i16> @loadv4i16_noffset(ptr %p) { +; CHECK-LABEL: loadv4i16_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurh w8, [x0, #-1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i16, ptr %g + %v = insertelement <4 x i16> poison, i16 %l, i32 0 + ret <4 x i16> %v +} + +define <8 x i16> @loadv8i16_noffset(ptr %p) { +; CHECK-LABEL: loadv8i16_noffset: +; CHECK: // 
%bb.0: +; CHECK-NEXT: ldurh w8, [x0, #-1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i16, ptr %g + %v = insertelement <8 x i16> poison, i16 %l, i32 0 + ret <8 x i16> %v +} + +define <2 x i32> @loadv2i32_noffset(ptr %p) { +; CHECK-LABEL: loadv2i32_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #-1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i32, ptr %g + %v = insertelement <2 x i32> poison, i32 %l, i32 0 + ret <2 x i32> %v +} + +define <4 x i32> @loadv4i32_noffset(ptr %p) { +; CHECK-LABEL: loadv4i32_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #-1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i32, ptr %g + %v = insertelement <4 x i32> poison, i32 %l, i32 0 + ret <4 x i32> %v +} + +define <2 x i64> @loadv2i64_noffset(ptr %p) { +; CHECK-LABEL: loadv2i64_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur x8, [x0, #-1] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load i64, ptr %g + %v = insertelement <2 x i64> poison, i64 %l, i32 0 + ret <2 x i64> %v +} + + +define <4 x half> @loadv4f16_noffset(ptr %p) { +; CHECK-LABEL: loadv4f16_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load half, ptr %g + %v = insertelement <4 x half> poison, half %l, i32 0 + ret <4 x half> %v +} + +define <8 x half> @loadv8f16_noffset(ptr %p) { +; CHECK-LABEL: loadv8f16_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load half, ptr %g + %v = insertelement <8 x half> poison, half %l, i32 0 + ret <8 x half> %v +} + +define <4 x bfloat> @loadv4bf16_noffset(ptr %p) { +; CHECK-LABEL: loadv4bf16_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load bfloat, ptr %g + %v = insertelement <4 x bfloat> poison, bfloat %l, i32 0 + ret <4 x bfloat> %v +} + +define <8 x bfloat> @loadv8bf16_noffset(ptr %p) { +; CHECK-LABEL: loadv8bf16_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load bfloat, ptr %g + %v = insertelement <8 x bfloat> poison, bfloat %l, i32 0 + ret <8 x bfloat> %v +} + +define <2 x float> @loadv2f32_noffset(ptr %p) { +; CHECK-LABEL: loadv2f32_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur s0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load float, ptr %g + %v = insertelement <2 x float> poison, float %l, i32 0 + ret <2 x float> %v +} + +define <4 x float> @loadv4f32_noffset(ptr %p) { +; CHECK-LABEL: loadv4f32_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur s0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load float, ptr %g + %v = insertelement <4 x float> poison, float %l, i32 0 + ret <4 x float> %v +} + +define <2 x double> @loadv2f64_noffset(ptr %p) { +; CHECK-LABEL: loadv2f64_noffset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur d0, [x0, #-1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 -1 + %l = load double, ptr %g + %v = insertelement <2 x double> poison, double %l, i32 0 + ret <2 x double> %v +} + + +; ROW addressing modes + +define <8 x i8> @loadv8i8_roW(ptr %p, i32 %o) { +; CHECK-LABEL: 
loadv8i8_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, w1, sxtw] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i32 %o + %l = load i8, ptr %g + %v = insertelement <8 x i8> poison, i8 %l, i32 0 + ret <8 x i8> %v +} + +define <16 x i8> @loadv16i8_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv16i8_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, w1, sxtw] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i32 %o + %l = load i8, ptr %g + %v = insertelement <16 x i8> poison, i8 %l, i32 0 + ret <16 x i8> %v +} + +define <4 x i16> @loadv4i16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4i16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr %p, i32 %o + %l = load i16, ptr %g + %v = insertelement <4 x i16> poison, i16 %l, i32 0 + ret <4 x i16> %v +} + +define <8 x i16> @loadv8i16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv8i16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr %p, i32 %o + %l = load i16, ptr %g + %v = insertelement <8 x i16> poison, i16 %l, i32 0 + ret <8 x i16> %v +} + +define <2 x i32> @loadv2i32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2i32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, w1, sxtw #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i32 %o + %l = load i32, ptr %g + %v = insertelement <2 x i32> poison, i32 %l, i32 0 + ret <2 x i32> %v +} + +define <4 x i32> @loadv4i32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4i32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, w1, sxtw #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i32 %o + %l = load i32, ptr %g + %v = insertelement <4 x i32> poison, i32 %l, i32 0 + ret <4 x i32> %v +} + +define <2 x i64> @loadv2i64_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2i64_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, w1, sxtw #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds i64, ptr %p, i32 %o + %l = load i64, ptr %g + %v = insertelement <2 x i64> poison, i64 %l, i32 0 + ret <2 x i64> %v +} + +define <4 x half> @loadv4f16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4f16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i32 %o + %l = load half, ptr %g + %v = insertelement <4 x half> poison, half %l, i32 0 + ret <4 x half> %v +} + +define <8 x half> @loadv8f16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv8f16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i32 %o + %l = load half, ptr %g + %v = insertelement <8 x half> poison, half %l, i32 0 + ret <8 x half> %v +} + +define <4 x bfloat> @loadv4bf16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4bf16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i32 %o + %l = load bfloat, ptr %g + %v = insertelement <4 x bfloat> poison, bfloat %l, i32 0 + ret <4 x bfloat> %v +} + +define <8 x bfloat> @loadv8bf16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv8bf16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i32 %o + %l = load bfloat, ptr %g + %v = insertelement <8 x bfloat> poison, bfloat %l, i32 0 + ret <8 x bfloat> %v +} + +define <2 x float> @loadv2f32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2f32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, w1, sxtw 
#2] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i32 %o + %l = load float, ptr %g + %v = insertelement <2 x float> poison, float %l, i32 0 + ret <2 x float> %v +} + +define <4 x float> @loadv4f32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4f32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, w1, sxtw #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i32 %o + %l = load float, ptr %g + %v = insertelement <4 x float> poison, float %l, i32 0 + ret <4 x float> %v +} + +define <2 x double> @loadv2f64_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2f64_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, w1, sxtw #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds double, ptr %p, i32 %o + %l = load double, ptr %g + %v = insertelement <2 x double> poison, double %l, i32 0 + ret <2 x double> %v +} + +; roX + +define <8 x i8> @loadv8i8_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8i8_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, x1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 %o + %l = load i8, ptr %g + %v = insertelement <8 x i8> poison, i8 %l, i32 0 + ret <8 x i8> %v +} + +define <16 x i8> @loadv16i8_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv16i8_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, x1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 %o + %l = load i8, ptr %g + %v = insertelement <16 x i8> poison, i8 %l, i32 0 + ret <16 x i8> %v +} + +define <4 x i16> @loadv4i16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4i16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr %p, i64 %o + %l = load i16, ptr %g + %v = insertelement <4 x i16> poison, i16 %l, i32 0 + ret <4 x i16> %v +} + +define <8 x i16> @loadv8i16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8i16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr %p, i64 %o + %l = load i16, ptr %g + %v = insertelement <8 x i16> poison, i16 %l, i32 0 + ret <8 x i16> %v +} + +define <2 x i32> @loadv2i32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2i32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i64 %o + %l = load i32, ptr %g + %v = insertelement <2 x i32> poison, i32 %l, i32 0 + ret <2 x i32> %v +} + +define <4 x i32> @loadv4i32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4i32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i64 %o + %l = load i32, ptr %g + %v = insertelement <4 x i32> poison, i32 %l, i32 0 + ret <4 x i32> %v +} + +define <2 x i64> @loadv2i64_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2i64_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds i64, ptr %p, i64 %o + %l = load i64, ptr %g + %v = insertelement <2 x i64> poison, i64 %l, i32 0 + ret <2 x i64> %v +} + +define <4 x half> @loadv4f16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4f16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i64 %o + %l = load half, ptr %g + %v = insertelement <4 x half> poison, half %l, i32 0 + ret <4 x half> %v +} + +define <8 x half> @loadv8f16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8f16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i64 %o + %l = load half, 
ptr %g + %v = insertelement <8 x half> poison, half %l, i32 0 + ret <8 x half> %v +} + +define <4 x bfloat> @loadv4bf16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4bf16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i64 %o + %l = load bfloat, ptr %g + %v = insertelement <4 x bfloat> poison, bfloat %l, i32 0 + ret <4 x bfloat> %v +} + +define <8 x bfloat> @loadv8bf16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8bf16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i64 %o + %l = load bfloat, ptr %g + %v = insertelement <8 x bfloat> poison, bfloat %l, i32 0 + ret <8 x bfloat> %v +} + +define <2 x float> @loadv2f32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2f32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i64 %o + %l = load float, ptr %g + %v = insertelement <2 x float> poison, float %l, i32 0 + ret <2 x float> %v +} + +define <4 x float> @loadv4f32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4f32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0, x1, lsl #2] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i64 %o + %l = load float, ptr %g + %v = insertelement <4 x float> poison, float %l, i32 0 + ret <4 x float> %v +} + +define <2 x double> @loadv2f64_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2f64_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds double, ptr %p, i64 %o + %l = load double, ptr %g + %v = insertelement <2 x double> poison, double %l, i32 0 + ret <2 x double> %v +} + + +; SVE + +define <vscale x 8 x i8> @loadnxv8i8(ptr %p) { +; CHECK-LABEL: loadnxv8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %l = load i8, ptr %p + %v = insertelement <vscale x 8 x i8> poison, i8 %l, i32 0 + ret <vscale x 8 x i8> %v +} + +define <vscale x 16 x i8> @loadnxv16i8(ptr %p) { +; CHECK-LABEL: loadnxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %l = load i8, ptr %p + %v = insertelement <vscale x 16 x i8> poison, i8 %l, i32 0 + ret <vscale x 16 x i8> %v +} + +define <vscale x 4 x i16> @loadnxv4i16(ptr %p) { +; CHECK-LABEL: loadnxv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %l = load i16, ptr %p + %v = insertelement <vscale x 4 x i16> poison, i16 %l, i32 0 + ret <vscale x 4 x i16> %v +} + +define <vscale x 8 x i16> @loadnxv8i16(ptr %p) { +; CHECK-LABEL: loadnxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %l = load i16, ptr %p + %v = insertelement <vscale x 8 x i16> poison, i16 %l, i32 0 + ret <vscale x 8 x i16> %v +} + +define <vscale x 2 x i32> @loadnxv2i32(ptr %p) { +; CHECK-LABEL: loadnxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret + %l = load i32, ptr %p + %v = insertelement <vscale x 2 x i32> poison, i32 %l, i32 0 + ret <vscale x 2 x i32> %v +} + +define <vscale x 4 x i32> @loadnxv4i32(ptr %p) { +; CHECK-LABEL: loadnxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %l = load i32, ptr %p + %v = insertelement <vscale x 4 x i32> poison, i32 %l, i32 0 + ret <vscale x 4 x i32> %v +} + +define <vscale x 2 x i64> @loadnxv2i64(ptr %p) { +; CHECK-LABEL: loadnxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x8, [x0] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret + %l = load i64, ptr %p + %v = insertelement <vscale x 2 x i64> poison, i64 %l, i32 0 + ret <vscale x 2 x i64> %v +} + + +define <vscale x 4 x half> @loadnxv4f16(ptr %p) { +; CHECK-LABEL: loadnxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret + %l = load half, ptr %p + %v = 
insertelement <vscale x 4 x half> poison, half %l, i32 0 + ret <vscale x 4 x half> %v +} + +define <vscale x 8 x half> @loadnxv8f16(ptr %p) { +; CHECK-LABEL: loadnxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret + %l = load half, ptr %p + %v = insertelement <vscale x 8 x half> poison, half %l, i32 0 + ret <vscale x 8 x half> %v +} + +define <vscale x 4 x bfloat> @loadnxv4bf16(ptr %p) { +; CHECK-LABEL: loadnxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret + %l = load bfloat, ptr %p + %v = insertelement <vscale x 4 x bfloat> poison, bfloat %l, i32 0 + ret <vscale x 4 x bfloat> %v +} + +define <vscale x 8 x bfloat> @loadnxv8bf16(ptr %p) { +; CHECK-LABEL: loadnxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0] +; CHECK-NEXT: ret + %l = load bfloat, ptr %p + %v = insertelement <vscale x 8 x bfloat> poison, bfloat %l, i32 0 + ret <vscale x 8 x bfloat> %v +} + +define <vscale x 2 x float> @loadnxv2f32(ptr %p) { +; CHECK-LABEL: loadnxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ret + %l = load float, ptr %p + %v = insertelement <vscale x 2 x float> poison, float %l, i32 0 + ret <vscale x 2 x float> %v +} + +define <vscale x 4 x float> @loadnxv4f32(ptr %p) { +; CHECK-LABEL: loadnxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ret + %l = load float, ptr %p + %v = insertelement <vscale x 4 x float> poison, float %l, i32 0 + ret <vscale x 4 x float> %v +} + +define <vscale x 2 x double> @loadnxv2f64(ptr %p) { +; CHECK-LABEL: loadnxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %l = load double, ptr %p + %v = insertelement <vscale x 2 x double> poison, double %l, i32 0 + ret <vscale x 2 x double> %v +} + + +; Unscaled + +define <vscale x 8 x i8> @loadnxv8i8_offset(ptr %p) { +; CHECK-LABEL: loadnxv8i8_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i8, ptr %g + %v = insertelement <vscale x 8 x i8> poison, i8 %l, i32 0 + ret <vscale x 8 x i8> %v +} + +define <vscale x 16 x i8> @loadnxv16i8_offset(ptr %p) { +; CHECK-LABEL: loadnxv16i8_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i8, ptr %g + %v = insertelement <vscale x 16 x i8> poison, i8 %l, i32 0 + ret <vscale x 16 x i8> %v +} + +define <vscale x 4 x i16> @loadnxv4i16_offset(ptr %p) { +; CHECK-LABEL: loadnxv4i16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i16, ptr %g + %v = insertelement <vscale x 4 x i16> poison, i16 %l, i32 0 + ret <vscale x 4 x i16> %v +} + +define <vscale x 8 x i16> @loadnxv8i16_offset(ptr %p) { +; CHECK-LABEL: loadnxv8i16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i16, ptr %g + %v = insertelement <vscale x 8 x i16> poison, i16 %l, i32 0 + ret <vscale x 8 x i16> %v +} + +define <vscale x 2 x i32> @loadnxv2i32_offset(ptr %p) { +; CHECK-LABEL: loadnxv2i32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #1] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i32, ptr %g + %v = insertelement <vscale x 2 x i32> poison, i32 %l, i32 0 + ret <vscale x 2 x i32> %v +} + +define <vscale x 4 x i32> @loadnxv4i32_offset(ptr %p) { +; CHECK-LABEL: loadnxv4i32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur w8, [x0, #1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i32, ptr %g + %v = insertelement <vscale x 4 x i32> poison, i32 %l, i32 0 + ret <vscale x 4 x i32> %v +} + +define <vscale x 2 x i64> @loadnxv2i64_offset(ptr %p) { +; CHECK-LABEL: loadnxv2i64_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur x8, [x0, #1] +; CHECK-NEXT: fmov d0, x8 +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load i64, ptr %g + %v = insertelement <vscale x 2 x i64> poison, i64 %l, i32 0 + ret <vscale x 2 x i64> %v +} + + +define <vscale x 4 x half> @loadnxv4f16_offset(ptr %p) { +; CHECK-LABEL: loadnxv4f16_offset: +; CHECK: // %bb.0: 
+; CHECK-NEXT: ldur h0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load half, ptr %g + %v = insertelement <vscale x 4 x half> poison, half %l, i32 0 + ret <vscale x 4 x half> %v +} + +define <vscale x 8 x half> @loadnxv8f16_offset(ptr %p) { +; CHECK-LABEL: loadnxv8f16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load half, ptr %g + %v = insertelement <vscale x 8 x half> poison, half %l, i32 0 + ret <vscale x 8 x half> %v +} + +define <vscale x 4 x bfloat> @loadnxv4bf16_offset(ptr %p) { +; CHECK-LABEL: loadnxv4bf16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load bfloat, ptr %g + %v = insertelement <vscale x 4 x bfloat> poison, bfloat %l, i32 0 + ret <vscale x 4 x bfloat> %v +} + +define <vscale x 8 x bfloat> @loadnxv8bf16_offset(ptr %p) { +; CHECK-LABEL: loadnxv8bf16_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur h0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load bfloat, ptr %g + %v = insertelement <vscale x 8 x bfloat> poison, bfloat %l, i32 0 + ret <vscale x 8 x bfloat> %v +} + +define <vscale x 2 x float> @loadnxv2f32_offset(ptr %p) { +; CHECK-LABEL: loadnxv2f32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur s0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load float, ptr %g + %v = insertelement <vscale x 2 x float> poison, float %l, i32 0 + ret <vscale x 2 x float> %v +} + +define <vscale x 4 x float> @loadnxv4f32_offset(ptr %p) { +; CHECK-LABEL: loadnxv4f32_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur s0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load float, ptr %g + %v = insertelement <vscale x 4 x float> poison, float %l, i32 0 + ret <vscale x 4 x float> %v +} + +define <vscale x 2 x double> @loadnxv2f64_offset(ptr %p) { +; CHECK-LABEL: loadnxv2f64_offset: +; CHECK: // %bb.0: +; CHECK-NEXT: ldur d0, [x0, #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 1 + %l = load double, ptr %g + %v = insertelement <vscale x 2 x double> poison, double %l, i32 0 + ret <vscale x 2 x double> %v +} diff --git a/llvm/test/CodeGen/AArch64/load-insert-zero.ll b/llvm/test/CodeGen/AArch64/load-insert-zero.ll index 23d545459295f..ccbd6f03fbcc3 100644 --- a/llvm/test/CodeGen/AArch64/load-insert-zero.ll +++ b/llvm/test/CodeGen/AArch64/load-insert-zero.ll @@ -378,7 +378,6 @@ define <2 x i64> @loadv2i64_noffset(ptr %p) { ret <2 x i64> %v } - define <4 x half> @loadv4f16_noffset(ptr %p) { ; CHECK-LABEL: loadv4f16_noffset: ; CHECK: // %bb.0: @@ -457,6 +456,328 @@ define <2 x double> @loadv2f64_noffset(ptr %p) { } + +; ROW addressing modes + +define <8 x i8> @loadv8i8_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv8i8_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, w1, sxtw] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i32 %o + %l = load i8, ptr %g + %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0 + ret <8 x i8> %v +} + +define <16 x i8> @loadv16i8_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv16i8_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, w1, sxtw] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i32 %o + %l = load i8, ptr %g + %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0 + ret <16 x i8> %v +} + +define <4 x i16> @loadv4i16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4i16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr %p, i32 %o + %l = load i16, ptr %g + %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0 + ret <4 x i16> %v +} + +define <8 x i16> @loadv8i16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv8i16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr 
%p, i32 %o + %l = load i16, ptr %g + %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0 + ret <8 x i16> %v +} + +define <2 x i32> @loadv2i32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2i32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, w1, sxtw #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i32 %o + %l = load i32, ptr %g + %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0 + ret <2 x i32> %v +} + +define <4 x i32> @loadv4i32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4i32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, w1, sxtw #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i32 %o + %l = load i32, ptr %g + %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0 + ret <4 x i32> %v +} + +define <2 x i64> @loadv2i64_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2i64_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, w1, sxtw #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds i64, ptr %p, i32 %o + %l = load i64, ptr %g + %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0 + ret <2 x i64> %v +} + +define <4 x half> @loadv4f16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4f16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i32 %o + %l = load half, ptr %g + %v = insertelement <4 x half> zeroinitializer, half %l, i32 0 + ret <4 x half> %v +} + +define <8 x half> @loadv8f16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv8f16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i32 %o + %l = load half, ptr %g + %v = insertelement <8 x half> zeroinitializer, half %l, i32 0 + ret <8 x half> %v +} + +define <4 x bfloat> @loadv4bf16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4bf16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i32 %o + %l = load bfloat, ptr %g + %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0 + ret <4 x bfloat> %v +} + +define <8 x bfloat> @loadv8bf16_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv8bf16_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i32 %o + %l = load bfloat, ptr %g + %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0 + ret <8 x bfloat> %v +} + +define <2 x float> @loadv2f32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2f32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, w1, sxtw #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i32 %o + %l = load float, ptr %g + %v = insertelement <2 x float> zeroinitializer, float %l, i32 0 + ret <2 x float> %v +} + +define <4 x float> @loadv4f32_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv4f32_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, w1, sxtw #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i32 %o + %l = load float, ptr %g + %v = insertelement <4 x float> zeroinitializer, float %l, i32 0 + ret <4 x float> %v +} + +define <2 x double> @loadv2f64_roW(ptr %p, i32 %o) { +; CHECK-LABEL: loadv2f64_roW: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, w1, sxtw #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds double, ptr %p, i32 %o + %l = load double, ptr %g + %v = insertelement <2 x double> zeroinitializer, double %l, i32 0 + ret <2 x double> %v +} + + +; roX + 
+define <8 x i8> @loadv8i8_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8i8_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, x1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 %o + %l = load i8, ptr %g + %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0 + ret <8 x i8> %v +} + +define <16 x i8> @loadv16i8_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv16i8_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr b0, [x0, x1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i8, ptr %p, i64 %o + %l = load i8, ptr %g + %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0 + ret <16 x i8> %v +} + +define <4 x i16> @loadv4i16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4i16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr %p, i64 %o + %l = load i16, ptr %g + %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0 + ret <4 x i16> %v +} + +define <8 x i16> @loadv8i16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8i16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds i16, ptr %p, i64 %o + %l = load i16, ptr %g + %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0 + ret <8 x i16> %v +} + +define <2 x i32> @loadv2i32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2i32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, x1, lsl #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i64 %o + %l = load i32, ptr %g + %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0 + ret <2 x i32> %v +} + +define <4 x i32> @loadv4i32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4i32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, x1, lsl #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds i32, ptr %p, i64 %o + %l = load i32, ptr %g + %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0 + ret <4 x i32> %v +} + +define <2 x i64> @loadv2i64_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2i64_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds i64, ptr %p, i64 %o + %l = load i64, ptr %g + %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0 + ret <2 x i64> %v +} + +define <4 x half> @loadv4f16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4f16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i64 %o + %l = load half, ptr %g + %v = insertelement <4 x half> zeroinitializer, half %l, i32 0 + ret <4 x half> %v +} + +define <8 x half> @loadv8f16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8f16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds half, ptr %p, i64 %o + %l = load half, ptr %g + %v = insertelement <8 x half> zeroinitializer, half %l, i32 0 + ret <8 x half> %v +} + +define <4 x bfloat> @loadv4bf16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4bf16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i64 %o + %l = load bfloat, ptr %g + %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0 + ret <4 x bfloat> %v +} + +define <8 x bfloat> @loadv8bf16_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv8bf16_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr h0, [x0, x1, lsl #1] +; CHECK-NEXT: ret + %g = getelementptr inbounds bfloat, ptr %p, i64 %o + %l = load bfloat, ptr %g + %v = insertelement <8 x bfloat> 
zeroinitializer, bfloat %l, i32 0 + ret <8 x bfloat> %v +} + +define <2 x float> @loadv2f32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2f32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, x1, lsl #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i64 %o + %l = load float, ptr %g + %v = insertelement <2 x float> zeroinitializer, float %l, i32 0 + ret <2 x float> %v +} + +define <4 x float> @loadv4f32_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv4f32_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: add x8, x0, x1, lsl #2 +; CHECK-NEXT: ldr s0, [x8] +; CHECK-NEXT: ret + %g = getelementptr inbounds float, ptr %p, i64 %o + %l = load float, ptr %g + %v = insertelement <4 x float> zeroinitializer, float %l, i32 0 + ret <4 x float> %v +} + +define <2 x double> @loadv2f64_roX(ptr %p, i64 %o) { +; CHECK-LABEL: loadv2f64_roX: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0, x1, lsl #3] +; CHECK-NEXT: ret + %g = getelementptr inbounds double, ptr %p, i64 %o + %l = load double, ptr %g + %v = insertelement <2 x double> zeroinitializer, double %l, i32 0 + ret <2 x double> %v +} + + define void @predictor_4x4_neon(ptr nocapture noundef writeonly %0, i64 noundef %1, ptr nocapture noundef readonly %2, ptr nocapture noundef readnone %3) { ; CHECK-LABEL: predictor_4x4_neon: ; CHECK: // %bb.0: From b4bb2f8aef01aeab8bf4fd164ed14a2c083d2858 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Wed, 11 Sep 2024 16:46:12 +0800 Subject: [PATCH 070/114] [LoopDeletion] Unblock loop deletion with `llvm.experimental.noalias.scope.decl` (#108144) Since `llvm.experimental.noalias.scope.decl` is marked as `memory(inaccessiblemem: readwrite)`, we cannot treat this annotation intrinsic as having no side effects. It will block loop deletion when this intrinsic exists inside a dead loop: https://github.com/llvm/llvm-project/blob/3dad29b677e427bf69c035605a16efd065576829/llvm/lib/Transforms/Scalar/LoopDeletion.cpp#L103-L110 This patch marks `llvm.experimental.noalias.scope.decl` as droppable to address the issue. Fixes https://github.com/llvm/llvm-project/issues/108052. 
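For context, a minimal sketch of the check this refers to (paraphrased, not the verbatim LoopDeletion.cpp source; `mayHaveSideEffects()` and `isDroppable()` are the actual Instruction APIs):

  // A loop only counts as "dead" if no instruction in it has observable
  // effects. Droppable instructions (now including noalias.scope.decl)
  // are exempt from this test.
  for (BasicBlock *BB : L->blocks())
    for (Instruction &I : *BB)
      if (I.mayHaveSideEffects() && !I.isDroppable())
        return false; // keeps the otherwise-dead loop alive

With the intrinsic marked droppable, the annotation no longer trips this check and the loop can be deleted as expected.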
--- llvm/lib/IR/User.cpp | 12 ++++++++- llvm/test/Transforms/LoopDeletion/noalias.ll | 28 ++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/LoopDeletion/noalias.ll diff --git a/llvm/lib/IR/User.cpp b/llvm/lib/IR/User.cpp index 637af7aaa2453..00dd9c72c469c 100644 --- a/llvm/lib/IR/User.cpp +++ b/llvm/lib/IR/User.cpp @@ -113,7 +113,17 @@ MutableArrayRef<uint8_t> User::getDescriptor() { } bool User::isDroppable() const { - return isa<AssumeInst>(this) || isa<PseudoProbeInst>(this); + if (auto *II = dyn_cast<IntrinsicInst>(this)) { + switch (II->getIntrinsicID()) { + default: + return false; + case Intrinsic::assume: + case Intrinsic::pseudoprobe: + case Intrinsic::experimental_noalias_scope_decl: + return true; + } + } + return false; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/Transforms/LoopDeletion/noalias.ll b/llvm/test/Transforms/LoopDeletion/noalias.ll new file mode 100644 index 0000000000000..0f3b71df94270 --- /dev/null +++ b/llvm/test/Transforms/LoopDeletion/noalias.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=loop-deletion -S | FileCheck %s + +define void @pr108052(i64 %n) { +; CHECK-LABEL: define void @pr108052( +; CHECK-SAME: i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK: [[FOR_EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.exit: + ret void + +for.body: + %indvar = phi i64 [ 0, %entry ], [ %inc, %for.body ] + call void @llvm.experimental.noalias.scope.decl(metadata !0) + %inc = add nuw i64 %indvar, 1 + %exitcond.not = icmp eq i64 %inc, %n + br i1 %exitcond.not, label %for.exit, label %for.body +} + +!0 = !{!1} +!1 = distinct !{!1, !2, !"x: %a"} +!2 = distinct !{!2, !"x"} From 596e7ccd30655faae9c4f35a1913dc23d08f3857 Mon Sep 17 00:00:00 2001 From: Alex Bradbury Date: Wed, 11 Sep 2024 10:04:56 +0100 Subject: [PATCH 071/114] [RISCV][doc] Add note to RISCVUsage about supported atomics ABIs (#103879) I've tried to avoid giving too much detailed explanation as the psABI docs are the better source for this. --- llvm/docs/RISCVUsage.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 8846b82fcaea5..a15af9adfa945 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -281,6 +281,13 @@ Supported ``Za128rs``, ``Za64rs``, ``Zama16b``, ``Zic64b``, ``Ziccamoa``, ``Ziccif``, ``Zicclsm``, ``Ziccrse``, ``Shcounterenvw``, ``Shgatpa``, ``Shtvala``, ``Shvsatpa``, ``Shvstvala``, ``Shvstvecd``, ``Ssccptr``, ``Sscounterenw``, ``Ssstateen``, ``Ssstrict``, ``Sstvala``, ``Sstvecd``, ``Ssu64xl``, ``Svade``, ``Svbare`` These extensions are defined as part of the `RISC-V Profiles specification `__. They do not introduce any new features themselves, but instead describe existing hardware features. +Atomics ABIs +============ + +At the time of writing there are three atomics mappings (ABIs) `defined for RISC-V `__. As of LLVM 19, LLVM defaults to "A6S", which is compatible with both the original "A6" and the future "A7" ABI. See `the psABI atomics document `__ for more information on these mappings. + +Note that although the "A6S" mapping is used, the ELF attribute recording the mapping isn't currently emitted by default due to a bug causing a crash in older versions of binutils when processing files containing this attribute. 
+ Experimental Extensions ======================= From a8f3d303122d049e65b699870615d464b77b489f Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Wed, 11 Sep 2024 17:08:44 +0800 Subject: [PATCH 072/114] [mlir] Add dependent TensorDialect to ConvertVectorToLLVM pass (#108045) This patch registers the tensor dialect as a dependency of the ConvertVectorToLLVM pass, which fixes a crash when `vector.transfer_write` is used with a dynamic tensor type. The MaterializeTransferMask pattern would call `vector::createOrFoldDimOp`, which creates a `tensor.dim` operation. Fixes #107805. --- .../VectorToLLVM/ConvertVectorToLLVMPass.cpp | 2 ++ mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp index 842d239cf6a51..4623b9667998c 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp @@ -19,6 +19,7 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h" #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h" #include "mlir/Dialect/X86Vector/Transforms.h" @@ -45,6 +46,7 @@ struct ConvertVectorToLLVMPass registry.insert<LLVM::LLVMDialect>(); registry.insert<arith::ArithDialect>(); registry.insert<memref::MemRefDialect>(); + registry.insert<tensor::TensorDialect>(); if (armNeon) registry.insert<arm_neon::ArmNeonDialect>(); if (armSVE) diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir index 7ac49c5f02347..bd14823cea50a 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir @@ -2521,6 +2521,16 @@ func.func @transfer_write_1d_scalable_mask(%arg0: memref<1x?xf32>, %vec: vector< // ----- +// CHECK-LABEL: func @transfer_write_tensor +// CHECK: vector.transfer_write +func.func @transfer_write_tensor(%arg0: vector<4xf32>,%arg1: tensor<?xf32>) -> tensor<?xf32> { + %c0 = arith.constant 0 : index + %0 = vector.transfer_write %arg0, %arg1[%c0] : vector<4xf32>, tensor<?xf32> + return %0 : tensor<?xf32> +} + +// ----- + func.func @genbool_0d_f() -> vector<i1> { %0 = vector.constant_mask [0] : vector<i1> return %0 : vector<i1> From a4b0153c4f5f0d6bcf42fb2cb97dbdfad9c59e2c Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Wed, 11 Sep 2024 17:10:58 +0800 Subject: [PATCH 073/114] [mlir][vector] Support for extracting 1-element vectors in VectorExtractOpConversion (#107549) This patch adds support for converting `vector.extract` ops that extract 1-element vectors to LLVM, fixing a crash in such cases. E.g., `vector.extract %1[0]: vector<1xf32> from vector<2xf32>`. Fixes #61372. --- .../VectorToLLVM/ConvertVectorToLLVM.cpp | 5 +++- .../VectorToLLVM/vector-to-llvm.mlir | 24 +++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index 53e18a2e9d299..687061e9988f8 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -1104,7 +1104,10 @@ class VectorExtractOpConversion } // One-shot extraction of vector from array (only requires extractvalue). - if (isa<VectorType>(resultType)) {
+ // Except for extracting 1-element vectors. 
+ if (isa<VectorType>(resultType) && + position.size() != + static_cast<size_t>(extractOp.getSourceVectorType().getRank())) { if (extractOp.hasDynamicPosition()) return failure(); diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir index bd14823cea50a..2fe9ba8fead17 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir @@ -1130,6 +1130,30 @@ func.func @extract_scalar_from_vec_1d_f32_scalable(%arg0: vector<[16]xf32>) -> f // ----- +func.func @extract_vec_1e_from_vec_1d_f32(%arg0: vector<16xf32>) -> vector<1xf32> { + %0 = vector.extract %arg0[15]: vector<1xf32> from vector<16xf32> + return %0 : vector<1xf32> +} +// CHECK-LABEL: @extract_vec_1e_from_vec_1d_f32( +// CHECK-SAME: %[[A:.*]]: vector<16xf32>) +// CHECK: %[[T0:.*]] = llvm.mlir.constant(15 : i64) : i64 +// CHECK: %[[T1:.*]] = llvm.extractelement %[[A]][%[[T0]] : i64] : vector<16xf32> +// CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T1]] : f32 to vector<1xf32> +// CHECK: return %[[T2]] : vector<1xf32> + +func.func @extract_vec_1e_from_vec_1d_f32_scalable(%arg0: vector<[16]xf32>) -> vector<1xf32> { + %0 = vector.extract %arg0[15]: vector<1xf32> from vector<[16]xf32> + return %0 : vector<1xf32> +} +// CHECK-LABEL: @extract_vec_1e_from_vec_1d_f32_scalable( +// CHECK-SAME: %[[A:.*]]: vector<[16]xf32>) +// CHECK: %[[T0:.*]] = llvm.mlir.constant(15 : i64) : i64 +// CHECK: %[[T1:.*]] = llvm.extractelement %[[A]][%[[T0]] : i64] : vector<[16]xf32> +// CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T1]] : f32 to vector<1xf32> +// CHECK: return %[[T2]] : vector<1xf32> + +// ----- + func.func @extract_scalar_from_vec_1d_index(%arg0: vector<16xindex>) -> index { %0 = vector.extract %arg0[15]: index from vector<16xindex> return %0 : index From f4dd1bc8fc625d3938f95b9d06aaaeebd2e90dca Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Wed, 11 Sep 2024 10:23:41 +0100 Subject: [PATCH 074/114] [AMDGPU] Fix leak and self-assignment in copy assignment operator (#107847) A static analyzer identified that this operator was unsafe in the case of self-assignment. In the placement new statement, StringValue's copy constructor was being implicitly called, which received a reference to "itself". In fact, it was being passed an old StringValue at the same address - one whose lifetime had already ended. The copy constructor was thus copying fields from a dead object. We need to be careful when switching active union members, and calling the destructor on the old StringValue will avoid memory leaks which I believe the old code exhibited. 
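To make the idiom concrete, here is a standalone sketch of the same pattern (a hypothetical `Tagged` type for illustration only, not the LLVM code):

  #include <new>
  #include <string>

  struct Tagged {
    bool IsStr;
    union {
      std::string Str; // non-trivial member: lifetime managed by hand
      int Num;
    };
    Tagged() : IsStr(false), Num(0) {}
    Tagged(const Tagged &O) : IsStr(O.IsStr) {
      if (IsStr)
        new (&Str) std::string(O.Str);
      else
        Num = O.Num;
    }
    Tagged &operator=(const Tagged &O) {
      if (IsStr != O.IsStr) {
        // Switching active members: end the old lifetime or begin the new one
        // before any plain assignment takes place.
        if (O.IsStr)
          new (&Str) std::string();
        else
          Str.~basic_string();
      }
      IsStr = O.IsStr;
      if (IsStr)
        Str = O.Str; // ordinary assignment; safe even when &O == this
      else
        Num = O.Num;
      return *this;
    }
    ~Tagged() {
      if (IsStr)
        Str.~basic_string();
    }
  };

Self-assignment can never change the active member, so the switching branch is skipped and the subsequent member assignment falls into the usual self-assignment-safe case.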
--- .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 7af5e7388f841..4cc60f5097899 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -100,19 +100,25 @@ struct SIArgument { SIArgument() : IsRegister(false), StackOffset(0) {} SIArgument(const SIArgument &Other) { IsRegister = Other.IsRegister; - if (IsRegister) { - ::new ((void *)std::addressof(RegisterName)) - StringValue(Other.RegisterName); - } else + if (IsRegister) + new (&RegisterName) StringValue(Other.RegisterName); + else StackOffset = Other.StackOffset; Mask = Other.Mask; } SIArgument &operator=(const SIArgument &Other) { + // Default-construct or destruct the old RegisterName in case of switching + // union members + if (IsRegister != Other.IsRegister) { + if (Other.IsRegister) + new (&RegisterName) StringValue(); + else + RegisterName.~StringValue(); + } IsRegister = Other.IsRegister; - if (IsRegister) { - ::new ((void *)std::addressof(RegisterName)) - StringValue(Other.RegisterName); - } else + if (IsRegister) + RegisterName = Other.RegisterName; + else StackOffset = Other.StackOffset; Mask = Other.Mask; return *this; From 2e4e918bf03868bb4cd0d0415766cfba8dc1d899 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Wed, 11 Sep 2024 11:27:30 +0200 Subject: [PATCH 075/114] [bazel] Port bc152fbf43157659f8b6817e8510e1fbe6e175b5 --- .../llvm-project-overlay/llvm/BUILD.bazel | 25 ++++++++++++++++--- .../llvm-project-overlay/llvm/driver.bzl | 1 + 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 4df7954ea3440..2af3fb40507ed 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -3690,22 +3690,41 @@ cc_binary( ], ) -cc_binary( - name = "llvm-debuginfod-find", +gentbl( + name = "DebugInfodFindOptsTableGen", + strip_include_prefix = "tools/llvm-debuginfod-find", + tbl_outs = [( + "-gen-opt-parser-defs", + "tools/llvm-debuginfod-find/Opts.inc", + )], + tblgen = ":llvm-tblgen", + td_file = "tools/llvm-debuginfod-find/Opts.td", + td_srcs = ["include/llvm/Option/OptParser.td"], +) + +cc_library( + name = "llvm-debuginfod-find-lib", srcs = glob([ "tools/llvm-debuginfod-find/*.cpp", ]), copts = llvm_copts, - stamp = 0, deps = [ ":BitReader", ":Core", + ":DebugInfodFindOptsTableGen", ":Debuginfod", + ":Option", ":Support", ":Symbolize", ], ) +llvm_driver_cc_binary( + name = "llvm-debuginfod-find", + stamp = 0, + deps = [":llvm-debuginfod-find-lib"], +) + cc_binary( name = "llvm-dis", srcs = glob([ diff --git a/utils/bazel/llvm-project-overlay/llvm/driver.bzl b/utils/bazel/llvm-project-overlay/llvm/driver.bzl index b3d3b2eed9f06..66e8af7db7d0e 100644 --- a/utils/bazel/llvm-project-overlay/llvm/driver.bzl +++ b/utils/bazel/llvm-project-overlay/llvm/driver.bzl @@ -16,6 +16,7 @@ _TOOLS = { "llvm-ar": "//llvm:llvm-ar-lib", "llvm-cgdata": "//llvm:llvm-cgdata-lib", "llvm-cxxfilt": "//llvm:llvm-cxxfilt-lib", + "llvm-debuginfod-find": "//llvm:llvm-debuginfod-find-lib", "llvm-dwp": "//llvm:llvm-dwp-lib", "llvm-gsymutil": "//llvm:llvm-gsymutil-lib", "llvm-ifs": "//llvm:llvm-ifs-lib", From c4a00be08aa10b5e51ee5db426d61ac87d9dc6fd Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Wed, 11 Sep 2024 11:36:59 +0200 Subject: 
[PATCH 076/114] [bazel] Add missing dependency for a8f3d303122d049e65b699870615d464b77b489f --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 5ee0ee5108276..b43bdb7b5f471 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -11780,6 +11780,7 @@ cc_library( ":MaskableOpInterface", ":MemRefDialect", ":Pass", + ":TensorDialect", ":ToLLVMIRTranslation", ":TransformUtils", ":VectorDialect", From 935b9f6274b39b35f6b391aaf4c87c0605421fb3 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 11 Sep 2024 10:39:34 +0100 Subject: [PATCH 077/114] [AMDGPU] Make use of multiclass inheritance. NFC. --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 16 ++++++---------- llvm/lib/Target/AMDGPU/BUFInstructions.td | 5 ++--- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 5 ++--- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 5 ++--- 4 files changed, 12 insertions(+), 19 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 2085113992ad1..e20c26eb83787 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1026,18 +1026,14 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = { } } - multiclass AMDGPUImageDimAtomic<string opmod, list<LLVMType> rettype = [llvm_anyint_ty]> { - defm "" - : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">], rettype>; - } + multiclass AMDGPUImageDimAtomic<string opmod, list<LLVMType> rettype = [llvm_anyint_ty]> : + AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">], rettype>; - multiclass AMDGPUImageDimFloatAtomic<string opmod> { - defm "" : AMDGPUImageDimAtomic<opmod, [llvm_anyfloat_ty]>; - } + multiclass AMDGPUImageDimFloatAtomic<string opmod> : + AMDGPUImageDimAtomic<opmod, [llvm_anyfloat_ty]>; - multiclass AMDGPUImageDimAnyAtomic<string opmod> { - defm "" : AMDGPUImageDimAtomic<opmod, [llvm_any_ty]>; - } + multiclass AMDGPUImageDimAnyAtomic<string opmod> : + AMDGPUImageDimAtomic<opmod, [llvm_any_ty]>; defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAnyAtomic<"ATOMIC_SWAP">; defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">; diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index c6668b24f4ef6..532ece8b16c5e 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1583,9 +1583,8 @@ multiclass BufferAtomicPat; } -multiclass BufferAtomicIntrPat<string OpPrefix, ValueType vt, string Inst> { - defm : BufferAtomicPat<OpPrefix, vt, Inst, /* isIntr */ 1>; -} +multiclass BufferAtomicIntrPat<string OpPrefix, ValueType vt, string Inst> : + BufferAtomicPat<OpPrefix, vt, Inst, /* isIntr */ 1>; multiclass BufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string Inst> { foreach RtnMode = ["ret", "noret"] in { diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 03e4cb9fcf49b..8e5b61e8e492e 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -882,9 +882,8 @@ multiclass VOP1_Real_dpp8_with_name<GFXGen Gen, bits<9> op, string opName, } } -multiclass VOP1_Realtriple_e64<GFXGen Gen, bits<9> op> { - defm NAME : VOP3_Realtriple<Gen, {0, 1, 1, op{6-0}}>; -} +multiclass VOP1_Realtriple_e64<GFXGen Gen, bits<9> op> : + VOP3_Realtriple<Gen, {0, 1, 1, op{6-0}}>; multiclass VOP1_Realtriple_e64_with_name<GFXGen Gen, bits<9> op, string opName, string asmName> { diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index fccaa27f36138..afae7a886288c 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1513,9 +1513,8 @@ multiclass VOP2be_Real_dpp8<GFXGen Gen, bits<6> op, string opName, string asmNam } // We don't want to override separate decoderNamespaces within these -multiclass VOP2_Realtriple_e64<GFXGen Gen, bits<6> op> { - defm NAME : VOP3_Realtriple <Gen, {0, 1, 0, 0, op{5-0}}>; -} +multiclass VOP2_Realtriple_e64<GFXGen Gen, bits<6> op> : + VOP3_Realtriple<Gen, {0, 1, 0, 0, op{5-0}}>; 
multiclass VOP2_Realtriple_e64_with_name op, string opName, string asmName> { From 704116373ae91a1b829dc3d3d269874fb27b579c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 11 Sep 2024 11:06:55 +0100 Subject: [PATCH 078/114] [AMDGPU] Regenerate buffer intrinsic tests with update_llc_test_checks. NFC. --- ...m.amdgcn.raw.ptr.buffer.load.format.d16.ll | 76 +++-- ...mdgcn.struct.ptr.buffer.load.format.d16.ll | 100 +++++-- .../llvm.amdgcn.struct.ptr.buffer.load.ll | 282 +++++++++++------- ...dgcn.struct.ptr.buffer.store.format.d16.ll | 152 ++++++---- ...m.amdgcn.struct.ptr.buffer.store.format.ll | 102 ++++--- .../llvm.amdgcn.struct.ptr.buffer.store.ll | 209 ++++++++----- 6 files changed, 611 insertions(+), 310 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.d16.ll index c27118446cc2f..cafd903df2d56 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.format.d16.ll @@ -1,47 +1,79 @@ -; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefixes=UNPACKED %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PACKED %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PACKED %s -; GCN-LABEL: {{^}}buffer_load_format_d16_x: -; GCN: buffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 define amdgpu_ps half @buffer_load_format_d16_x(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_x: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: buffer_load_format_d16_x v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x20,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_x: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: buffer_load_format_d16_x v0, off, s[0:3], 0 +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call half @llvm.amdgcn.raw.ptr.buffer.load.format.f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) ret half %data } -; GCN-LABEL: {{^}}buffer_load_format_d16_xy: -; UNPACKED: buffer_load_format_d16_xy v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 -; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] define amdgpu_ps half @buffer_load_format_d16_xy(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_xy: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: buffer_load_format_d16_xy v[0:1], off, s[0:3], 0 ; encoding: [0x00,0x00,0x24,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt 
vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: v_mov_b32_e32 v0, v1 ; encoding: [0x01,0x03,0x00,0x7e] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_xy: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: buffer_load_format_d16_xy v0, off, s[0:3], 0 +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call <2 x half> @llvm.amdgcn.raw.ptr.buffer.load.format.v2f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %elt = extractelement <2 x half> %data, i32 1 ret half %elt } -; GCN-LABEL: {{^}}buffer_load_format_d16_xyz: -; UNPACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 define amdgpu_ps half @buffer_load_format_d16_xyz(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_xyz: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: buffer_load_format_d16_xyz v[0:2], off, s[0:3], 0 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: v_mov_b32_e32 v0, v2 ; encoding: [0x02,0x03,0x00,0x7e] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_xyz: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: buffer_load_format_d16_xyz v[0:1], off, s[0:3], 0 +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_mov_b32_e32 v0, v1 +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call <3 x half> @llvm.amdgcn.raw.ptr.buffer.load.format.v3f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %elt = extractelement <3 x half> %data, i32 2 ret half %elt } -; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw: -; UNPACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 -; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] define amdgpu_ps half @buffer_load_format_d16_xyzw(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_xyzw: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: buffer_load_format_d16_xyzw v[0:3], off, s[0:3], 0 ; encoding: [0x00,0x00,0x2c,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: v_mov_b32_e32 v0, v3 ; encoding: [0x03,0x03,0x00,0x7e] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_xyzw: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: buffer_load_format_d16_xyzw v[0:1], off, s[0:3], 0 +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call <4 x half> @llvm.amdgcn.raw.ptr.buffer.load.format.v4f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %elt = extractelement <4 x half> %data, i32 3 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.d16.ll index 3a396b54f89ab..39df6ec679e88 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.d16.ll @@ -1,57 +1,107 @@ -; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga 
-verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefixes=UNPACKED %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PACKED %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=PACKED %s -; GCN-LABEL: {{^}}buffer_load_format_d16_x: -; GCN: buffer_load_format_d16_x v{{[0-9]+}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_ps half @buffer_load_format_d16_x(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_x: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; UNPACKED-NEXT: buffer_load_format_d16_x v0, v0, s[0:3], 0 idxen ; encoding: [0x00,0x20,0x20,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_x: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: v_mov_b32_e32 v0, 0 +; PACKED-NEXT: buffer_load_format_d16_x v0, v0, s[0:3], 0 idxen +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call half @llvm.amdgcn.struct.ptr.buffer.load.format.f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret half %data } -; GCN-LABEL: {{^}}buffer_load_format_d16_xy: -; UNPACKED: buffer_load_format_d16_xy v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]] define amdgpu_ps half @buffer_load_format_d16_xy(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_xy: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; UNPACKED-NEXT: buffer_load_format_d16_xy v[0:1], v0, s[0:3], 0 idxen ; encoding: [0x00,0x20,0x24,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: v_mov_b32_e32 v0, v1 ; encoding: [0x01,0x03,0x00,0x7e] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_xy: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: v_mov_b32_e32 v0, 0 +; PACKED-NEXT: buffer_load_format_d16_xy v0, v0, s[0:3], 0 idxen +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.load.format.v2f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %elt = extractelement <2 x half> %data, i32 1 ret half %elt } -; GCN-LABEL: {{^}}buffer_load_format_d16_xyz: -; UNPACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: 
buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] define amdgpu_ps half @buffer_load_format_d16_xyz(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_xyz: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; UNPACKED-NEXT: buffer_load_format_d16_xyz v[0:2], v0, s[0:3], 0 idxen ; encoding: [0x00,0x20,0x28,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: v_mov_b32_e32 v0, v2 ; encoding: [0x02,0x03,0x00,0x7e] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_xyz: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: v_mov_b32_e32 v0, 0 +; PACKED-NEXT: buffer_load_format_d16_xyz v[0:1], v0, s[0:3], 0 idxen +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_mov_b32_e32 v0, v1 +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call <3 x half> @llvm.amdgcn.struct.ptr.buffer.load.format.v3f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %elt = extractelement <3 x half> %data, i32 2 ret half %elt } -; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw: -; UNPACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]] - -; PACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen -; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]] define amdgpu_ps half @buffer_load_format_d16_xyzw(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_d16_xyzw: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; UNPACKED-NEXT: buffer_load_format_d16_xyzw v[0:3], v0, s[0:3], 0 idxen ; encoding: [0x00,0x20,0x2c,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: v_mov_b32_e32 v0, v3 ; encoding: [0x03,0x03,0x00,0x7e] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_d16_xyzw: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: v_mov_b32_e32 v0, 0 +; PACKED-NEXT: buffer_load_format_d16_xyzw v[0:1], v0, s[0:3], 0 idxen +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call <4 x half> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %elt = extractelement <4 x half> %data, i32 3 ret half %elt } -; GCN-LABEL: {{^}}buffer_load_format_i16_x: -; GCN: buffer_load_format_d16_x v{{[0-9]+}}, {{v[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_ps half @buffer_load_format_i16_x(ptr addrspace(8) inreg %rsrc) { +; UNPACKED-LABEL: buffer_load_format_i16_x: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e] +; UNPACKED-NEXT: buffer_load_format_d16_x v0, v0, s[0:3], 0 idxen ; encoding: [0x00,0x20,0x20,0xe0,0x00,0x00,0x00,0x80] +; UNPACKED-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf] +; UNPACKED-NEXT: ; return to shader part epilog +; +; PACKED-LABEL: buffer_load_format_i16_x: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: v_mov_b32_e32 v0, 0 +; PACKED-NEXT: buffer_load_format_d16_x v0, v0, s[0:3], 0 idxen +; PACKED-NEXT: s_waitcnt vmcnt(0) +; PACKED-NEXT: ; return to shader part epilog main_body: %data = call i16 
@llvm.amdgcn.struct.ptr.buffer.load.format.i16(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %fdata = bitcast i16 %data to half diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll index 2f9e6b0a1cf52..55600cab8432b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.ll @@ -1,12 +1,16 @@ -;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,SI ;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,VI -;CHECK-LABEL: {{^}}buffer_load: -;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: buffer_load_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc -;CHECK: buffer_load_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc -;CHECK: s_waitcnt define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrspace(8) inreg) { +; CHECK-LABEL: buffer_load: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v8, 0 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v8, s[0:3], 0 idxen +; CHECK-NEXT: buffer_load_dwordx4 v[4:7], v8, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_load_dwordx4 v[8:11], v8, s[0:3], 0 idxen slc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) %data_glc = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) @@ -17,106 +21,165 @@ main_body: ret {<4 x float>, <4 x float>, <4 x float>} %r2 } -;CHECK-LABEL: {{^}}buffer_load_immoffs: -;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:40 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) { +; CHECK-LABEL: buffer_load_immoffs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen offset:40 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 40, i32 0, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_immoffs_large: -;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc -;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], [[OFFSET]] idxen offset:4 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs_large(ptr addrspace(8) inreg) { +; CHECK-LABEL: buffer_load_immoffs_large: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_movk_i32 s4, 0x1ffc +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v0, s[0:3], s4 idxen offset:4 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 4, i32 8188, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_idx: -;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_idx(ptr addrspace(8) inreg, i32) { +; CHECK-LABEL: buffer_load_idx: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: 
buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %1, i32 0, i32 0, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_ofs: -;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs(ptr addrspace(8) inreg, i32) { +; CHECK-LABEL: buffer_load_ofs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %1, i32 0, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_ofs_imm: -;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_ofs_imm(ptr addrspace(8) inreg, i32) { +; CHECK-LABEL: buffer_load_ofs_imm: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %ofs = add i32 %1, 60 %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs, i32 0, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_both: -;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_both(ptr addrspace(8) inreg, i32, i32) { +; CHECK-LABEL: buffer_load_both: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %1, i32 %2, i32 0, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_both_reversed: -;CHECK: v_mov_b32_e32 v2, v0 -;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_both_reversed(ptr addrspace(8) inreg, i32, i32) { +; CHECK-LABEL: buffer_load_both_reversed: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v2, v0 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %2, i32 %1, i32 0, i32 0) ret <4 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_x1: -;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps float @buffer_load_x1(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { +; CHECK-LABEL: buffer_load_x1: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) ret float %data } -;CHECK-LABEL: {{^}}buffer_load_x2: -;CHECK: 
buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt define amdgpu_ps <2 x float> @buffer_load_x2(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { +; CHECK-LABEL: buffer_load_x2: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <2 x float> @llvm.amdgcn.struct.ptr.buffer.load.v2f32(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) ret <2 x float> %data } -;CHECK-LABEL: {{^}}buffer_load_negative_offset: -;CHECK: v_add_{{[iu]}}32_e32 {{v[0-9]+}}, vcc, -16, v0 -;CHECK: buffer_load_dwordx4 v[0:3], {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen define amdgpu_ps <4 x float> @buffer_load_negative_offset(ptr addrspace(8) inreg, i32 %ofs) { +; SI-LABEL: buffer_load_negative_offset: +; SI: ; %bb.0: ; %main_body +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: v_add_i32_e32 v1, vcc, -16, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: buffer_load_negative_offset: +; VI: ; %bb.0: ; %main_body +; VI-NEXT: s_mov_b32 s4, 0 +; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: ; return to shader part epilog main_body: %ofs.1 = add i32 %ofs, -16 %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs.1, i32 0, i32 0) ret <4 x float> %data } -; SI won't merge ds memory operations, because of the signed offset bug, so -; we only have check lines for VI. -; CHECK-LABEL: buffer_load_mmo: -; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 +; SI won't merge ds memory operations, because of the signed offset bug. 
define amdgpu_ps float @buffer_load_mmo(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %lds) { +; SI-LABEL: buffer_load_mmo: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_mov_b32_e32 v2, 0 +; SI-NEXT: buffer_load_dword v1, v2, s[0:3], 0 idxen +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_write_b32 v0, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 16, v0 +; SI-NEXT: ds_write_b32 v0, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: buffer_load_mmo: +; VI: ; %bb.0: ; %entry +; VI-NEXT: v_mov_b32_e32 v2, 0 +; VI-NEXT: buffer_load_dword v1, v2, s[0:3], 0 idxen +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_write2_b32 v0, v2, v2 offset1:4 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, v1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: ; return to shader part epilog entry: store float 0.0, ptr addrspace(3) %lds %val = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) @@ -125,12 +188,15 @@ entry: ret float %val } -;CHECK-LABEL: {{^}}buffer_load_int: -;CHECK: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: buffer_load_dwordx2 v[4:5], {{v[0-9]+}}, s[0:3], 0 idxen glc -;CHECK: buffer_load_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc -;CHECK: s_waitcnt define amdgpu_ps {<4 x float>, <2 x float>, float} @buffer_load_int(ptr addrspace(8) inreg) { +; CHECK-LABEL: buffer_load_int: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v6, 0 +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v6, s[0:3], 0 idxen +; CHECK-NEXT: buffer_load_dwordx2 v[4:5], v6, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_load_dword v6, v6, s[0:3], 0 idxen slc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog main_body: %data = call <4 x i32> @llvm.amdgcn.struct.ptr.buffer.load.v4i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) %data_glc = call <2 x i32> @llvm.amdgcn.struct.ptr.buffer.load.v2i32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) @@ -144,13 +210,13 @@ main_body: ret {<4 x float>, <2 x float>, float} %r2 } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_ubyte: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen -;CHECK: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -;CHECK-NEXT: ; return to shader part epilog define amdgpu_ps float @struct_ptr_buffer_load_ubyte(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { +; CHECK-LABEL: struct_ptr_buffer_load_ubyte: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; CHECK-NEXT: ; return to shader part epilog main_body: %tmp = call i8 @llvm.amdgcn.struct.ptr.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) %tmp2 = zext i8 %tmp to i32 @@ -158,13 +224,13 @@ main_body: ret float %val } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_ushort: -;CHECK-NEXT: %bb. 
-;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0 -;CHECK-NEXT: ; return to shader part epilog define amdgpu_ps float @struct_ptr_buffer_load_ushort(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { +; CHECK-LABEL: struct_ptr_buffer_load_ushort: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0 +; CHECK-NEXT: ; return to shader part epilog main_body: %tmp = call i16 @llvm.amdgcn.struct.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) %tmp2 = zext i16 %tmp to i32 @@ -172,13 +238,13 @@ main_body: ret float %val } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_sbyte: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 -;CHECK-NEXT: ; return to shader part epilog define amdgpu_ps float @struct_ptr_buffer_load_sbyte(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { +; CHECK-LABEL: struct_ptr_buffer_load_sbyte: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_sbyte v0, v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 +; CHECK-NEXT: ; return to shader part epilog main_body: %tmp = call i8 @llvm.amdgcn.struct.ptr.buffer.load.i8(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) %tmp2 = sext i8 %tmp to i32 @@ -186,13 +252,13 @@ main_body: ret float %val } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_sshort: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, v[0:1], s[0:3], 0 idxen offen -;CHECK-NEXT: s_waitcnt vmcnt(0) -;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 -;CHECK-NEXT: ; return to shader part epilog define amdgpu_ps float @struct_ptr_buffer_load_sshort(ptr addrspace(8) inreg %rsrc, i32 %idx, i32 %ofs) { +; CHECK-LABEL: struct_ptr_buffer_load_sshort: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_sshort v0, v[0:1], s[0:3], 0 idxen offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0 +; CHECK-NEXT: ; return to shader part epilog main_body: %tmp = call i16 @llvm.amdgcn.struct.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %idx, i32 %ofs, i32 0, i32 0) %tmp2 = sext i16 %tmp to i32 @@ -200,72 +266,84 @@ main_body: ret float %val } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_f16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: ds_write_b16 v0, [[VAL]] define amdgpu_ps void @struct_ptr_buffer_load_f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +; CHECK-LABEL: struct_ptr_buffer_load_f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: s_endpgm main_body: %val = call half @llvm.amdgcn.struct.ptr.buffer.load.f16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store half %val, ptr addrspace(3) %ptr ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_v2f16: -;CHECK-NEXT: %bb. 
-;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: ds_write_b32 v0, [[VAL]] define amdgpu_ps void @struct_ptr_buffer_load_v2f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +; CHECK-LABEL: struct_ptr_buffer_load_v2f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b32 v0, v1 +; CHECK-NEXT: s_endpgm main_body: %val = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.load.v2f16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store <2 x half> %val, ptr addrspace(3) %ptr ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_v4f16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v1, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: ds_write_b64 v0, [[VAL]] define amdgpu_ps void @struct_ptr_buffer_load_v4f16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +; CHECK-LABEL: struct_ptr_buffer_load_v4f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b64 v0, v[1:2] +; CHECK-NEXT: s_endpgm main_body: %val = call <4 x half> @llvm.amdgcn.struct.ptr.buffer.load.v4f16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store <4 x half> %val, ptr addrspace(3) %ptr ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_i16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_ushort [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: ds_write_b16 v0, [[VAL]] define amdgpu_ps void @struct_ptr_buffer_load_i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +; CHECK-LABEL: struct_ptr_buffer_load_i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b16 v0, v1 +; CHECK-NEXT: s_endpgm main_body: %val = call i16 @llvm.amdgcn.struct.ptr.buffer.load.i16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store i16 %val, ptr addrspace(3) %ptr ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_v2i16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: buffer_load_dword [[VAL:v[0-9]+]], v1, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: ds_write_b32 v0, [[VAL]] define amdgpu_ps void @struct_ptr_buffer_load_v2i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +; CHECK-LABEL: struct_ptr_buffer_load_v2i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b32 v0, v1 +; CHECK-NEXT: s_endpgm main_body: %val = call <2 x i16> @llvm.amdgcn.struct.ptr.buffer.load.v2i16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store <2 x i16> %val, ptr addrspace(3) %ptr ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_load_v4i16: -;CHECK-NEXT: %bb. 
-;CHECK-NEXT: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]], v1, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: ds_write_b64 v0, [[VAL]] define amdgpu_ps void @struct_ptr_buffer_load_v4i16(ptr addrspace(8) inreg %rsrc, ptr addrspace(3) %ptr, i32 %idx) { +; CHECK-LABEL: struct_ptr_buffer_load_v4i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_load_dwordx2 v[1:2], v1, s[0:3], 0 idxen +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b64 v0, v[1:2] +; CHECK-NEXT: s_endpgm main_body: %val = call <4 x i16> @llvm.amdgcn.struct.ptr.buffer.load.v4i16(ptr addrspace(8) %rsrc, i32 %idx, i32 0, i32 0, i32 0) store <4 x i16> %val, ptr addrspace(3) %ptr diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll index 8109fca4a043a..58b422dd6a751 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.d16.ll @@ -1,85 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s -; GCN-LABEL: {{^}}buffer_store_format_d16_x: -; GCN: s_load_dword s[[LO:[0-9]+]] -; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] -; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_x(ptr addrspace(8) %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) { +; GCN-LABEL: buffer_store_format_d16_x: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_load_dword s4, s[6:7], 0x30 +; GCN-NEXT: s_load_dword s5, s[6:7], 0x54 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_format_d16_x v0, v1, s[0:3], 0 idxen +; GCN-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.f16(half %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -; GCN-LABEL: {{^}}buffer_store_format_d16_xy: - -; UNPACKED: s_load_dwordx2 s[[[S_DATA:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], s[[S_DATA]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], s[[S_DATA]], 0xffff{{$}} -; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]] -; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]] -; UNPACKED: buffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen - -; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data, i32 %index) { +; UNPACKED-LABEL: buffer_store_format_d16_xy: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; UNPACKED-NEXT: s_waitcnt lgkmcnt(0) +; UNPACKED-NEXT: s_lshr_b32 s6, s4, 16 +; UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff +; UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; 
UNPACKED-NEXT: v_mov_b32_e32 v1, s6
+; UNPACKED-NEXT: v_mov_b32_e32 v2, s5
+; UNPACKED-NEXT: buffer_store_format_d16_xy v[0:1], v2, s[0:3], 0 idxen
+; UNPACKED-NEXT: s_endpgm
+;
+; PACKED-LABEL: buffer_store_format_d16_xy:
+; PACKED: ; %bb.0: ; %main_body
+; PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
+; PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; PACKED-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-NEXT: v_mov_b32_e32 v0, s4
+; PACKED-NEXT: v_mov_b32_e32 v1, s5
+; PACKED-NEXT: buffer_store_format_d16_xy v0, v1, s[0:3], 0 idxen
+; PACKED-NEXT: s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.ptr.buffer.store.format.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
 }
 
-; GCN-LABEL: {{^}}buffer_store_format_d16_xyz:
-; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
-
-; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
-; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
-
-; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
-; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]]
-
-; UNPACKED: buffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
-
-; PACKED: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
-; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
-; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED0]]
-
-; PACKED: buffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
 define amdgpu_kernel void @buffer_store_format_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %index) {
+; UNPACKED-LABEL: buffer_store_format_d16_xyz:
+; UNPACKED: ; %bb.0: ; %main_body
+; UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
+; UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18
+; UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
+; UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff
+; UNPACKED-NEXT: s_lshr_b32 s7, s4, 16
+; UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
+; UNPACKED-NEXT: v_mov_b32_e32 v0, s4
+; UNPACKED-NEXT: v_mov_b32_e32 v1, s7
+; UNPACKED-NEXT: v_mov_b32_e32 v2, s5
+; UNPACKED-NEXT: v_mov_b32_e32 v3, s6
+; UNPACKED-NEXT: buffer_store_format_d16_xyz v[0:2], v3, s[0:3], 0 idxen
+; UNPACKED-NEXT: s_endpgm
+;
+; PACKED-LABEL: buffer_store_format_d16_xyz:
+; PACKED: ; %bb.0: ; %main_body
+; PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10
+; PACKED-NEXT: s_load_dword s8, s[6:7], 0x18
+; PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; PACKED-NEXT: s_waitcnt lgkmcnt(0)
+; PACKED-NEXT: s_and_b32 s5, s5, 0xffff
+; PACKED-NEXT: v_mov_b32_e32 v0, s4
+; PACKED-NEXT: v_mov_b32_e32 v1, s5
+; PACKED-NEXT: v_mov_b32_e32 v2, s8
+; PACKED-NEXT: buffer_store_format_d16_xyz v[0:1], v2, s[0:3], 0 idxen
+; PACKED-NEXT: s_endpgm
 main_body:
   %data_subvec = shufflevector <4 x half> %data, <4 x half> undef, <3 x i32> <i32 0, i32 1, i32 2>
   call void @llvm.amdgcn.struct.ptr.buffer.store.format.v3f16(<3 x half> %data_subvec, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
 }
 
-; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw:
-; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
-
-; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
-; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
-
-; 
UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] -; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] - -; UNPACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen - -; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]] -; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]] - -; PACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data, i32 %index) { +; UNPACKED-LABEL: buffer_store_format_d16_xyzw: +; UNPACKED: ; %bb.0: ; %main_body +; UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; UNPACKED-NEXT: s_load_dword s6, s[6:7], 0x18 +; UNPACKED-NEXT: s_waitcnt lgkmcnt(0) +; UNPACKED-NEXT: s_lshr_b32 s7, s5, 16 +; UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff +; UNPACKED-NEXT: s_lshr_b32 s8, s4, 16 +; UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff +; UNPACKED-NEXT: v_mov_b32_e32 v0, s4 +; UNPACKED-NEXT: v_mov_b32_e32 v1, s8 +; UNPACKED-NEXT: v_mov_b32_e32 v2, s5 +; UNPACKED-NEXT: v_mov_b32_e32 v3, s7 +; UNPACKED-NEXT: v_mov_b32_e32 v4, s6 +; UNPACKED-NEXT: buffer_store_format_d16_xyzw v[0:3], v4, s[0:3], 0 idxen +; UNPACKED-NEXT: s_endpgm +; +; PACKED-LABEL: buffer_store_format_d16_xyzw: +; PACKED: ; %bb.0: ; %main_body +; PACKED-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x10 +; PACKED-NEXT: s_load_dword s8, s[6:7], 0x18 +; PACKED-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; PACKED-NEXT: s_waitcnt lgkmcnt(0) +; PACKED-NEXT: v_mov_b32_e32 v0, s4 +; PACKED-NEXT: v_mov_b32_e32 v1, s5 +; PACKED-NEXT: v_mov_b32_e32 v2, s8 +; PACKED-NEXT: buffer_store_format_d16_xyzw v[0:1], v2, s[0:3], 0 idxen +; PACKED-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f16(<4 x half> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -; GCN-LABEL: {{^}}buffer_store_format_i16_x: -; GCN: s_load_dword s[[LO:[0-9]+]] -; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]] -; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen define amdgpu_kernel void @buffer_store_format_i16_x(ptr addrspace(8) %rsrc, [8 x i32], i16 %data, [8 x i32], i32 %index) { +; GCN-LABEL: buffer_store_format_i16_x: +; GCN: ; %bb.0: ; %main_body +; GCN-NEXT: s_load_dword s4, s[6:7], 0x30 +; GCN-NEXT: s_load_dword s5, s[6:7], 0x54 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_format_d16_x v0, v1, s[0:3], 0 idxen +; GCN-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.i16(i16 %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.ll index 13217b24dcd4b..61a08d96986b0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.format.ll @@ -1,12 +1,15 @@ -;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s -;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck 
-check-prefixes=CHECK,SI %s +;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VI %s -;CHECK-LABEL: {{^}}buffer_store: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: buffer_store_format_xyzw v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc -;CHECK: buffer_store_format_xyzw v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc define amdgpu_ps void @buffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) { +; CHECK-LABEL: buffer_store: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v12, 0 +; CHECK-NEXT: buffer_store_format_xyzw v[0:3], v12, s[0:3], 0 idxen +; CHECK-NEXT: buffer_store_format_xyzw v[4:7], v12, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_store_format_xyzw v[8:11], v12, s[0:3], 0 idxen slc +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) @@ -14,47 +17,56 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_immoffs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 define amdgpu_ps void @buffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>) { +; CHECK-LABEL: buffer_store_immoffs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen offset:42 +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_idx: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_idx(ptr addrspace(8) inreg, <4 x float>, i32) { +; CHECK-LABEL: buffer_store_idx: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_ofs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float>, i32) { +; CHECK-LABEL: buffer_store_ofs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: v_mov_b32_e32 v5, v4 +; CHECK-NEXT: v_mov_b32_e32 v4, s4 +; CHECK-NEXT: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 %2, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_both: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_both(ptr addrspace(8) inreg, <4 x float>, i32, i32) { +; CHECK-LABEL: buffer_store_both: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 %3, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_both_reversed: -;CHECK: v_mov_b32_e32 v6, v4 -;CHECK-NOT: s_waitcnt -;CHECK: 
buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_both_reversed(ptr addrspace(8) inreg, <4 x float>, i32, i32) { +; CHECK-LABEL: buffer_store_both_reversed: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %3, i32 %2, i32 0, i32 0) ret void @@ -62,14 +74,23 @@ main_body: ; Ideally, the register allocator would avoid the wait here ; -;CHECK-LABEL: {{^}}buffer_store_wait: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen -;VERDE: s_waitcnt expcnt(0) -;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float>, i32, i32, i32) { +; SI-LABEL: buffer_store_wait: +; SI: ; %bb.0: ; %main_body +; SI-NEXT: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen +; SI-NEXT: s_endpgm +; +; VI-LABEL: buffer_store_wait: +; VI: ; %bb.0: ; %main_body +; VI-NEXT: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen +; VI-NEXT: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen +; VI-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0, i32 0) %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %3, i32 0, i32 0, i32 0) @@ -77,28 +98,31 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_x1: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data, i32 %index) { +; CHECK-LABEL: buffer_store_x1: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_format_x v0, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.f32(float %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x1_i32: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x1_i32(ptr addrspace(8) inreg %rsrc, i32 %data, i32 %index) { +; CHECK-LABEL: buffer_store_x1_i32: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_format_x v0, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.i32(i32 %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x2: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> %data, i32 %index) { +; CHECK-LABEL: buffer_store_x2: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.format.v2f32(<2 x float> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll index e52af31360764..d08623f685e85 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.store.ll @@ -1,12 +1,15 @@ -;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s -;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s - -;CHECK-LABEL: {{^}}buffer_store: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: buffer_store_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc -;CHECK: buffer_store_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=CHECK,SI %s +;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VI %s + define amdgpu_ps void @buffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) { +; CHECK-LABEL: buffer_store: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v12, 0 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v12, s[0:3], 0 idxen +; CHECK-NEXT: buffer_store_dwordx4 v[4:7], v12, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_store_dwordx4 v[8:11], v12, s[0:3], 0 idxen slc +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) @@ -14,62 +17,79 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_immoffs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42 define amdgpu_ps void @buffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>) { +; CHECK-LABEL: buffer_store_immoffs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen offset:42 +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_idx: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_idx(ptr addrspace(8) inreg, <4 x float>, i32) { +; CHECK-LABEL: buffer_store_idx: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_ofs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float>, i32) { +; CHECK-LABEL: buffer_store_ofs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: v_mov_b32_e32 v5, v4 +; CHECK-NEXT: v_mov_b32_e32 v4, s4 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 %2, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_both: -;CHECK-NOT: 
s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_both(ptr addrspace(8) inreg, <4 x float>, i32, i32) { +; CHECK-LABEL: buffer_store_both: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 %3, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_both_reversed: -;CHECK: v_mov_b32_e32 v6, v4 -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen define amdgpu_ps void @buffer_store_both_reversed(ptr addrspace(8) inreg, <4 x float>, i32, i32) { +; CHECK-LABEL: buffer_store_both_reversed: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v6, v4 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %3, i32 %2, i32 0, i32 0) ret void } ; Ideally, the register allocator would avoid the wait here -; -;CHECK-LABEL: {{^}}buffer_store_wait: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen -;VERDE: s_waitcnt expcnt(0) -;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float>, i32, i32, i32) { +; SI-LABEL: buffer_store_wait: +; SI: ; %bb.0: ; %main_body +; SI-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen +; SI-NEXT: s_endpgm +; +; VI-LABEL: buffer_store_wait: +; VI: ; %bb.0: ; %main_body +; VI-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen +; VI-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen +; VI-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0, i32 0) %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %3, i32 0, i32 0, i32 0) @@ -77,30 +97,34 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_x1: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data, i32 %index) { +; CHECK-LABEL: buffer_store_x1: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x2: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> %data, i32 %index) #0 { +; CHECK-LABEL: buffer_store_x2: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v2f32(<2 x float> %data, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_int: -;CHECK-NOT: s_waitcnt -;CHECK: 
buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen -;CHECK: buffer_store_dwordx2 v[4:5], {{v[0-9]+}}, s[0:3], 0 idxen glc -;CHECK: buffer_store_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc define amdgpu_ps void @buffer_store_int(ptr addrspace(8) inreg, <4 x i32>, <2 x i32>, i32) { +; CHECK-LABEL: buffer_store_int: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_mov_b32_e32 v7, 0 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v7, s[0:3], 0 idxen +; CHECK-NEXT: buffer_store_dwordx2 v[4:5], v7, s[0:3], 0 idxen glc +; CHECK-NEXT: buffer_store_dword v6, v7, s[0:3], 0 idxen slc +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.struct.ptr.buffer.store.v4i32(<4 x i32> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.struct.ptr.buffer.store.v2i32(<2 x i32> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) @@ -108,12 +132,12 @@ main_body: ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_byte: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen -;CHECK-NEXT: s_endpgm define amdgpu_ps void @struct_ptr_buffer_store_byte(ptr addrspace(8) inreg %rsrc, float %v1, i32 %index) { +; CHECK-LABEL: struct_ptr_buffer_store_byte: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: buffer_store_byte v0, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i8 @@ -121,39 +145,63 @@ main_body: ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_f16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_f16_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen -;CHECK-NEXT: s_endpgm define amdgpu_ps void @struct_ptr_buffer_store_f16(ptr addrspace(8) inreg %rsrc, float %v1, i32 %index) { +; CHECK-LABEL: struct_ptr_buffer_store_f16: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cvt_f16_f32_e32 v0, v0 +; CHECK-NEXT: buffer_store_short v0, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm %v2 = fptrunc float %v1 to half call void @llvm.amdgcn.struct.ptr.buffer.store.f16(half %v2, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_v2f16: -;CHECK-NEXT: %bb. -;CHECK: buffer_store_dword v0, {{v[0-9]+}}, s[0:3], 0 idxen define amdgpu_ps void @struct_ptr_buffer_store_v2f16(ptr addrspace(8) inreg %rsrc, <2 x half> %v1, i32 %index) { +; SI-LABEL: struct_ptr_buffer_store_v2f16: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 idxen +; SI-NEXT: s_endpgm +; +; VI-LABEL: struct_ptr_buffer_store_v2f16: +; VI: ; %bb.0: +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; VI-NEXT: s_endpgm call void @llvm.amdgcn.struct.ptr.buffer.store.v2f16(<2 x half> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_v4f16: -;CHECK-NEXT: %bb. 
-;CHECK: buffer_store_dwordx2 v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen define amdgpu_ps void @struct_ptr_buffer_store_v4f16(ptr addrspace(8) inreg %rsrc, <4 x half> %v1, i32 %index) { +; SI-LABEL: struct_ptr_buffer_store_v4f16: +; SI: ; %bb.0: +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 idxen +; SI-NEXT: s_endpgm +; +; VI-LABEL: struct_ptr_buffer_store_v4f16: +; VI: ; %bb.0: +; VI-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +; VI-NEXT: s_endpgm call void @llvm.amdgcn.struct.ptr.buffer.store.v4f16(<4 x half> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_i16: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen -;CHECK-NEXT: s_endpgm define amdgpu_ps void @struct_ptr_buffer_store_i16(ptr addrspace(8) inreg %rsrc, float %v1, i32 %index) { +; CHECK-LABEL: struct_ptr_buffer_store_i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: buffer_store_short v0, v1, s[0:3], 0 idxen +; CHECK-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i16 @@ -161,18 +209,39 @@ main_body: ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_vif16: -;CHECK-NEXT: %bb. -;CHECK: buffer_store_dword v0, {{v[0-9]+}}, s[0:3], 0 idxen define amdgpu_ps void @struct_ptr_buffer_store_vif16(ptr addrspace(8) inreg %rsrc, <2 x i16> %v1, i32 %index) { +; SI-LABEL: struct_ptr_buffer_store_vif16: +; SI: ; %bb.0: +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 idxen +; SI-NEXT: s_endpgm +; +; VI-LABEL: struct_ptr_buffer_store_vif16: +; VI: ; %bb.0: +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen +; VI-NEXT: s_endpgm call void @llvm.amdgcn.struct.ptr.buffer.store.v2i16(<2 x i16> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}struct_ptr_buffer_store_v4i16: -;CHECK-NEXT: %bb. -;CHECK: buffer_store_dwordx2 v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen define amdgpu_ps void @struct_ptr_buffer_store_v4i16(ptr addrspace(8) inreg %rsrc, <4 x i16> %v1, i32 %index) { +; SI-LABEL: struct_ptr_buffer_store_v4i16: +; SI: ; %bb.0: +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v1, v0, v1 +; SI-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 idxen +; SI-NEXT: s_endpgm +; +; VI-LABEL: struct_ptr_buffer_store_v4i16: +; VI: ; %bb.0: +; VI-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen +; VI-NEXT: s_endpgm call void @llvm.amdgcn.struct.ptr.buffer.store.v4i16(<4 x i16> %v1, ptr addrspace(8) %rsrc, i32 %index, i32 0, i32 0, i32 0) ret void } From 7e0008d5ad5ea7df0b9586f07e5af4a7225dac96 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Wed, 11 Sep 2024 12:22:36 +0200 Subject: [PATCH 079/114] [LLD][COFF][NFC] Create import thunks in ImportFile::parse. 
 (#107929)

---
 lld/COFF/InputFiles.cpp  | 19 +++++++++++++++++--
 lld/COFF/InputFiles.h    |  2 ++
 lld/COFF/SymbolTable.cpp |  4 ++--
 lld/COFF/SymbolTable.h   |  3 ++-
 lld/COFF/Symbols.cpp     | 18 +++---------------
 lld/COFF/Symbols.h       |  2 +-
 6 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp
index fa2d230075d9d..c7956baf73cf4 100644
--- a/lld/COFF/InputFiles.cpp
+++ b/lld/COFF/InputFiles.cpp
@@ -1009,6 +1009,20 @@ MachineTypes ImportFile::getMachineType() const {
   return MachineTypes(machine);
 }
 
+ImportThunkChunk *ImportFile::makeImportThunk() {
+  switch (hdr->Machine) {
+  case AMD64:
+    return make<ImportThunkChunkX64>(ctx, impSym);
+  case I386:
+    return make<ImportThunkChunkX86>(ctx, impSym);
+  case ARM64:
+    return make<ImportThunkChunkARM64>(ctx, impSym);
+  case ARMNT:
+    return make<ImportThunkChunkARM>(ctx, impSym);
+  }
+  llvm_unreachable("unknown machine type");
+}
+
 void ImportFile::parse() {
   const auto *hdr =
       reinterpret_cast<const coff_import_header *>(mb.getBufferStart());
@@ -1069,9 +1083,10 @@ void ImportFile::parse() {
   // DLL functions just like regular non-DLL functions.)
   if (hdr->getType() == llvm::COFF::IMPORT_CODE) {
     if (ctx.config.machine != ARM64EC) {
-      thunkSym = ctx.symtab.addImportThunk(name, impSym, hdr->Machine);
+      thunkSym = ctx.symtab.addImportThunk(name, impSym, makeImportThunk());
    } else {
-      thunkSym = ctx.symtab.addImportThunk(name, impSym, AMD64);
+      thunkSym = ctx.symtab.addImportThunk(
+          name, impSym, make<ImportThunkChunkX64>(ctx, impSym));
       // FIXME: Add aux IAT symbols.
     }
   }
diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h
index 8b3303a8d87f4..1d55b4f34f754 100644
--- a/lld/COFF/InputFiles.h
+++ b/lld/COFF/InputFiles.h
@@ -55,6 +55,7 @@ class Defined;
 class DefinedImportData;
 class DefinedImportThunk;
 class DefinedRegular;
+class ImportThunkChunk;
 class SectionChunk;
 class Symbol;
 class Undefined;
@@ -352,6 +353,7 @@ class ImportFile : public InputFile {
 
 private:
   void parse() override;
+  ImportThunkChunk *makeImportThunk();
 
 public:
   StringRef externalName;
diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp
index bb7583bb9a7df..c9b3d78e3de17 100644
--- a/lld/COFF/SymbolTable.cpp
+++ b/lld/COFF/SymbolTable.cpp
@@ -784,11 +784,11 @@ DefinedImportData *SymbolTable::addImportData(StringRef n, ImportFile *f) {
 }
 
 Symbol *SymbolTable::addImportThunk(StringRef name, DefinedImportData *id,
-                                    uint16_t machine) {
+                                    ImportThunkChunk *chunk) {
   auto [s, wasInserted] = insert(name, nullptr);
   s->isUsedInRegularObj = true;
   if (wasInserted || isa<Undefined>(s) || s->isLazy()) {
-    replaceSymbol<DefinedImportThunk>(s, ctx, name, id, machine);
+    replaceSymbol<DefinedImportThunk>(s, ctx, name, id, chunk);
     return s;
   }
 
diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h
index 51c6c79ec1446..3a277fc700e86 100644
--- a/lld/COFF/SymbolTable.h
+++ b/lld/COFF/SymbolTable.h
@@ -28,6 +28,7 @@ class COFFLinkerContext;
 class Defined;
 class DefinedAbsolute;
 class DefinedRegular;
+class ImportThunkChunk;
 class LazyArchive;
 class SectionChunk;
 class Symbol;
@@ -104,7 +105,7 @@ class SymbolTable {
                                 CommonChunk *c = nullptr);
   DefinedImportData *addImportData(StringRef n, ImportFile *f);
   Symbol *addImportThunk(StringRef name, DefinedImportData *s,
-                         uint16_t machine);
+                         ImportThunkChunk *chunk);
   void addLibcall(StringRef name);
   void addEntryThunk(Symbol *from, Symbol *to);
   void initializeEntryThunks();
diff --git a/lld/COFF/Symbols.cpp b/lld/COFF/Symbols.cpp
index b098abb80d6f1..5f4d797f74a2d 100644
--- a/lld/COFF/Symbols.cpp
+++ b/lld/COFF/Symbols.cpp
@@ -107,22 +107,10 @@ COFFSymbolRef DefinedCOFF::getCOFFSymbol() {
 
 uint64_t DefinedAbsolute::getRVA() { return va - ctx.config.imageBase; }
 
-static Chunk
*makeImportThunk(COFFLinkerContext &ctx, DefinedImportData *s,
-                         uint16_t machine) {
-  if (machine == AMD64)
-    return make<ImportThunkChunkX64>(ctx, s);
-  if (machine == I386)
-    return make<ImportThunkChunkX86>(ctx, s);
-  if (machine == ARM64)
-    return make<ImportThunkChunkARM64>(ctx, s);
-  assert(machine == ARMNT);
-  return make<ImportThunkChunkARM>(ctx, s);
-}
-
 DefinedImportThunk::DefinedImportThunk(COFFLinkerContext &ctx, StringRef name,
-                                       DefinedImportData *s, uint16_t machine)
-    : Defined(DefinedImportThunkKind, name), wrappedSym(s),
-      data(makeImportThunk(ctx, s, machine)) {}
+                                       DefinedImportData *s,
+                                       ImportThunkChunk *chunk)
+    : Defined(DefinedImportThunkKind, name), wrappedSym(s), data(chunk) {}
 
 Defined *Undefined::getWeakAlias() {
   // A weak alias may be a weak alias to another symbol, so check recursively.
diff --git a/lld/COFF/Symbols.h b/lld/COFF/Symbols.h
index c427a062dc82b..724330e4bab95 100644
--- a/lld/COFF/Symbols.h
+++ b/lld/COFF/Symbols.h
@@ -388,7 +388,7 @@ class DefinedImportData : public Defined {
 class DefinedImportThunk : public Defined {
 public:
   DefinedImportThunk(COFFLinkerContext &ctx, StringRef name,
-                     DefinedImportData *s, uint16_t machine);
+                     DefinedImportData *s, ImportThunkChunk *chunk);
 
   static bool classof(const Symbol *s) {
     return s->kind() == DefinedImportThunkKind;

From e1ee07d0ff7a37bf5f52d560a52925c0507471e1 Mon Sep 17 00:00:00 2001
From: Akshat Oke <76596238+Akshat-Oke@users.noreply.github.com>
Date: Wed, 11 Sep 2024 16:00:16 +0530
Subject: [PATCH 080/114] [AMDGPU][NewPM] Port SIPeepholeSDWA pass to NPM
 (#107049)

---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |  6 +-
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |  1 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  5 +-
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp     | 56 +++++++++++++------
 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.h       | 24 ++++++++
 llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir        |  3 +
 llvm/test/CodeGen/AMDGPU/sdwa-ops.mir         |  2 +
 llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir    |  2 +
 8 files changed, 77 insertions(+), 22 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/SIPeepholeSDWA.h

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 8d6e022e1e4d4..399aa9c633564 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -36,7 +36,7 @@ void initializeAMDGPURegBankSelectPass(PassRegistry &);
 FunctionPass *createGCNDPPCombinePass();
 FunctionPass *createSIAnnotateControlFlowLegacyPass();
 FunctionPass *createSIFoldOperandsLegacyPass();
-FunctionPass *createSIPeepholeSDWAPass();
+FunctionPass *createSIPeepholeSDWALegacyPass();
 FunctionPass *createSILowerI1CopiesLegacyPass();
 FunctionPass *createAMDGPUGlobalISelDivergenceLoweringPass();
 FunctionPass *createSIShrinkInstructionsLegacyPass();
@@ -163,8 +163,8 @@ extern char &GCNDPPCombineLegacyID;
 void initializeSIFoldOperandsLegacyPass(PassRegistry &);
 extern char &SIFoldOperandsLegacyID;
 
-void initializeSIPeepholeSDWAPass(PassRegistry &);
-extern char &SIPeepholeSDWAID;
+void initializeSIPeepholeSDWALegacyPass(PassRegistry &);
+extern char &SIPeepholeSDWALegacyID;
 
 void initializeSIShrinkInstructionsLegacyPass(PassRegistry &);
 extern char &SIShrinkInstructionsLegacyID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 58481fe9df239..97661bf9837f9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -100,5 +100,6 @@ MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass())
 MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass());
MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass()) MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass()) +MACHINE_FUNCTION_PASS("si-peephole-sdwa", SIPeepholeSDWAPass()) MACHINE_FUNCTION_PASS("si-shrink-instructions", SIShrinkInstructionsPass()) #undef MACHINE_FUNCTION_PASS diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 9c9c505139373..55d0de59bc49a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -39,6 +39,7 @@ #include "SILoadStoreOptimizer.h" #include "SIMachineFunctionInfo.h" #include "SIMachineScheduler.h" +#include "SIPeepholeSDWA.h" #include "SIShrinkInstructions.h" #include "TargetInfo/AMDGPUTargetInfo.h" #include "Utils/AMDGPUBaseInfo.h" @@ -415,7 +416,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIFixSGPRCopiesLegacyPass(*PR); initializeSIFixVGPRCopiesPass(*PR); initializeSIFoldOperandsLegacyPass(*PR); - initializeSIPeepholeSDWAPass(*PR); + initializeSIPeepholeSDWALegacyPass(*PR); initializeSIShrinkInstructionsLegacyPass(*PR); initializeSIOptimizeExecMaskingPreRAPass(*PR); initializeSIOptimizeVGPRLiveRangePass(*PR); @@ -1275,7 +1276,7 @@ void GCNPassConfig::addMachineSSAOptimization() { addPass(&GCNDPPCombineLegacyID); addPass(&SILoadStoreOptimizerLegacyID); if (isPassEnabled(EnableSDWAPeephole)) { - addPass(&SIPeepholeSDWAID); + addPass(&SIPeepholeSDWALegacyID); addPass(&EarlyMachineLICMID); addPass(&MachineCSELegacyID); addPass(&SIFoldOperandsLegacyID); diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index d80e1277b2a8a..86cb0e6944ed7 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -19,6 +19,7 @@ /// //===----------------------------------------------------------------------===// +#include "SIPeepholeSDWA.h" #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -45,7 +46,7 @@ class SDWADstOperand; using SDWAOperandsVector = SmallVector; using SDWAOperandsMap = MapVector; -class SIPeepholeSDWA : public MachineFunctionPass { +class SIPeepholeSDWA { private: MachineRegisterInfo *MRI; const SIRegisterInfo *TRI; @@ -57,14 +58,6 @@ class SIPeepholeSDWA : public MachineFunctionPass { std::optional foldToImm(const MachineOperand &Op) const; -public: - static char ID; - - SIPeepholeSDWA() : MachineFunctionPass(ID) { - initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; void matchSDWAOperands(MachineBasicBlock &MBB); std::unique_ptr matchSDWAOperand(MachineInstr &MI); void pseudoOpConvertToVOP2(MachineInstr &MI, @@ -72,8 +65,20 @@ class SIPeepholeSDWA : public MachineFunctionPass { bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const; +public: + bool run(MachineFunction &MF); +}; + +class SIPeepholeSDWALegacy : public MachineFunctionPass { +public: + static char ID; + + SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {} + StringRef getPassName() const override { return "SI Peephole SDWA"; } + bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); @@ -192,17 +197,17 @@ class SDWADstPreserveOperand : public SDWADstOperand { } // end anonymous 
namespace

-INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)
+INITIALIZE_PASS(SIPeepholeSDWALegacy, DEBUG_TYPE, "SI Peephole SDWA", false,
+                false)

-char SIPeepholeSDWA::ID = 0;
+char SIPeepholeSDWALegacy::ID = 0;

-char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;
+char &llvm::SIPeepholeSDWALegacyID = SIPeepholeSDWALegacy::ID;

-FunctionPass *llvm::createSIPeepholeSDWAPass() {
-  return new SIPeepholeSDWA();
+FunctionPass *llvm::createSIPeepholeSDWALegacyPass() {
+  return new SIPeepholeSDWALegacy();
 }
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
   switch(Sel) {
@@ -1235,10 +1240,17 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
   }
 }
 
-bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
+bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  return SIPeepholeSDWA().run(MF);
+}
+
+bool SIPeepholeSDWA::run(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
 
-  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
+  if (!ST.hasSDWA())
     return false;
 
   MRI = &MF.getRegInfo();
@@ -1295,3 +1307,13 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
 
   return Ret;
 }
+
+PreservedAnalyses SIPeepholeSDWAPass::run(MachineFunction &MF,
+                                          MachineFunctionAnalysisManager &) {
+  if (MF.getFunction().hasOptNone() || !SIPeepholeSDWA().run(MF))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.h b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.h
new file mode 100644
index 0000000000000..217867220f7d8
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.h
@@ -0,0 +1,24 @@
+//===--------- SIPeepholeSDWA.h -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIPEEPHOLESDWA_H
+#define LLVM_LIB_TARGET_AMDGPU_SIPEEPHOLESDWA_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class SIPeepholeSDWAPass : public PassInfoMixin<SIPeepholeSDWAPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_SIPEEPHOLESDWA_H
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir b/llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir
index 4ff43024ae8cc..0c9f628dfbb2c 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-gfx9.mir
@@ -1,6 +1,9 @@
 # RUN: llc -mtriple=amdgcn -mcpu=kaveri -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
 # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=kaveri -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
 
 # GCN-LABEL: {{^}}name: add_shr_i32
 # GCN: [[SMOV:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 123
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-ops.mir b/llvm/test/CodeGen/AMDGPU/sdwa-ops.mir
index ef986f8c9d2a3..0ad1b5527c854 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-ops.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-ops.mir
@@ -1,5 +1,7 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 %s
 # RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 %s
 
 # test for 3 consecutive _sdwa's
 # GFX9-LABEL: name: test1_add_co_sdwa
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
index 4ca39ecc7a0ae..ffbd2d092b5d8 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
@@ -1,6 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
 # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=SDWA %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=SDWA %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=SDWA %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes=si-peephole-sdwa -o - %s | FileCheck -check-prefix=SDWA %s
 ---
 name: add_f16_u32_preserve
 tracksRegLiveness: true

From da6944912baffa430468078c38f65d55fb83dd43 Mon Sep 17 00:00:00 2001
From: Michael Buch
Date: Wed, 11 Sep 2024 11:36:27 +0100
Subject: [PATCH 081/114] [lldb][test] Add test for no_unique_address when mixed with bitfields (#108155)

This is the root cause for the LLDB
failures that started occurring after
https://github.com/llvm/llvm-project/pull/105865.

The DWARFASTParserClang has logic to try to derive unnamed bitfields
from DWARF offsets. In this case, we treat `padding` as a 1-byte-sized
field that would overlap with `flag`, and decide we need to introduce
an unnamed bitfield into the AST, which is incorrect.
---
 .../no_unique_address-with-bitfields.cpp      | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 lldb/test/Shell/SymbolFile/DWARF/no_unique_address-with-bitfields.cpp

diff --git a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-with-bitfields.cpp b/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-with-bitfields.cpp
new file mode 100644
index 0000000000000..1c9cc36a711b4
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-with-bitfields.cpp
@@ -0,0 +1,28 @@
+// LLDB currently erroneously adds an unnamed bitfield
+// into the AST when an overlapping no_unique_address
+// field precedes a bitfield.
+
+// RUN: %clang --target=x86_64-apple-macosx -c -gdwarf -o %t %s
+// RUN: %lldb %t \
+// RUN:   -o "target var global" \
+// RUN:   -o "image dump ast" \
+// RUN:   -o exit | FileCheck %s
+
+// CHECK: (lldb) image dump ast
+// CHECK: CXXRecordDecl {{.*}} struct Foo definition
+// CHECK: |-FieldDecl {{.*}} data 'char[5]'
+// CHECK-NEXT: |-FieldDecl {{.*}} padding 'Empty'
+// CHECK-NEXT: |-FieldDecl {{.*}} 'int'
+// CHECK-NEXT: | `-IntegerLiteral {{.*}} 'int' 8
+// CHECK-NEXT: `-FieldDecl {{.*}} <invalid sloc> flag 'unsigned long'
+// CHECK-NEXT:   `-IntegerLiteral {{.*}} 'int' 1
+
+struct Empty {};
+
+struct Foo {
+  char data[5];
+  [[no_unique_address]] Empty padding;
+  unsigned long flag : 1;
+};
+
+Foo global;

From 2f3d061918ece414d6db544a34b2e44a9950bc23 Mon Sep 17 00:00:00 2001
From: Sergio Afonso
Date: Wed, 11 Sep 2024 12:16:34 +0100
Subject: [PATCH 082/114] [MLIR][OpenMP] Automate operand structure definition (#99508)

This patch adds the "gen-openmp-clause-ops" `mlir-tblgen` generator to
produce the structure definitions previously in OpenMPClauseOperands.h
automatically from the information contained in OpenMPOps.td and
OpenMPClauses.td.

The original header is maintained to enable the definition of similar
structures that are not directly related to any single `OpenMP_Clause`
or `OpenMP_Op` tablegen definition.
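To illustrate the mapping the generator implements, a clause defined roughly
like this (a condensed sketch in the spirit of the openmp-clause-ops.td test
added below; `OpenMP_MyClause` and its arguments are made-up names):

```tablegen
def OpenMP_MyClause : OpenMP_Clause<
    /*isRequired=*/false, /*skipTraits=*/false, /*skipArguments=*/false,
    /*skipAssemblyFormat=*/false, /*skipDescription=*/false,
    /*skipExtraClassDeclaration=*/false> {
  let arguments = (ins
    I32:$chunk_size,      // value operand
    UnitAttr:$nowait_flag // attribute operand
  );
}
```

now gets its operand structure generated automatically, with values
represented as `mlir::Value` and attributes by their `storageType`:

```cpp
struct MyClauseOps {
  ::mlir::Value chunkSize;
  ::mlir::UnitAttr nowaitFlag;
};
```

Operation-level structures are aggregated the same way, e.g.
`def OpenMP_MyOp : OpenMP_Op<"my_op", clauses=[OpenMP_MyClause]>;` yields
`using MyOperands = detail::Clauses<MyClauseOps>;`.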
--- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 6 +- flang/lib/Lower/OpenMP/ClauseProcessor.h | 2 +- .../mlir/Dialect/OpenMP/CMakeLists.txt | 1 + .../Dialect/OpenMP/OpenMPClauseOperands.h | 292 +----------------- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 34 +- mlir/test/mlir-tblgen/openmp-clause-ops.td | 86 ++++++ mlir/tools/mlir-tblgen/OmpOpGen.cpp | 215 ++++++++++++- 7 files changed, 319 insertions(+), 317 deletions(-) create mode 100644 mlir/test/mlir-tblgen/openmp-clause-ops.td diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index 3f54234b176e3..f336d213cc862 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -181,7 +181,7 @@ static void addUseDeviceClause( static void convertLoopBounds(lower::AbstractConverter &converter, mlir::Location loc, - mlir::omp::LoopRelatedOps &result, + mlir::omp::LoopRelatedClauseOps &result, std::size_t loopVarTypeSize) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); // The types of lower bound, upper bound, and step are converted into the @@ -203,7 +203,7 @@ static void convertLoopBounds(lower::AbstractConverter &converter, bool ClauseProcessor::processCollapse( mlir::Location currentLocation, lower::pft::Evaluation &eval, - mlir::omp::LoopRelatedOps &result, + mlir::omp::LoopRelatedClauseOps &result, llvm::SmallVectorImpl &iv) const { bool found = false; fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); @@ -855,7 +855,7 @@ bool ClauseProcessor::processIf( // Assume that, at most, a single 'if' clause will be applicable to the // given directive. if (operand) { - result.ifVar = operand; + result.ifExpr = operand; found = true; } }); diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index f6b319c726a2d..8d02d368f4ee0 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -55,7 +55,7 @@ class ClauseProcessor { // 'Unique' clauses: They can appear at most once in the clause list. 
bool processCollapse(mlir::Location currentLocation, lower::pft::Evaluation &eval, - mlir::omp::LoopRelatedOps &result, + mlir::omp::LoopRelatedClauseOps &result, llvm::SmallVectorImpl &iv) const; bool processDevice(lower::StatementContext &stmtCtx, mlir::omp::DeviceClauseOps &result) const; diff --git a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt index dd349d1392e7b..a65c6b1d3c96b 100644 --- a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt @@ -17,6 +17,7 @@ mlir_tablegen(OpenMPOpsDialect.h.inc -gen-dialect-decls -dialect=omp) mlir_tablegen(OpenMPOpsDialect.cpp.inc -gen-dialect-defs -dialect=omp) mlir_tablegen(OpenMPOps.h.inc -gen-op-decls) mlir_tablegen(OpenMPOps.cpp.inc -gen-op-defs) +mlir_tablegen(OpenMPClauseOps.h.inc -gen-openmp-clause-ops) mlir_tablegen(OpenMPOpsTypes.h.inc -gen-typedef-decls -typedefs-dialect=omp) mlir_tablegen(OpenMPOpsTypes.cpp.inc -gen-typedef-defs -typedefs-dialect=omp) mlir_tablegen(OpenMPOpsEnums.h.inc -gen-enum-decls) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h index 38e4d8f245e4f..1247a871f93c6 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h @@ -23,303 +23,31 @@ #define GET_ATTRDEF_CLASSES #include "mlir/Dialect/OpenMP/OpenMPOpsAttributes.h.inc" +#include "mlir/Dialect/OpenMP/OpenMPClauseOps.h.inc" + namespace mlir { namespace omp { //===----------------------------------------------------------------------===// -// Mixin structures defining MLIR operands associated with each OpenMP clause. +// Extra clause operand structures. //===----------------------------------------------------------------------===// -struct AlignedClauseOps { - llvm::SmallVector alignedVars; - llvm::SmallVector alignments; -}; - -struct AllocateClauseOps { - llvm::SmallVector allocateVars, allocatorVars; -}; - -struct CancelDirectiveNameClauseOps { - ClauseCancellationConstructTypeAttr cancelDirective; -}; - -struct CopyprivateClauseOps { - llvm::SmallVector copyprivateVars; - llvm::SmallVector copyprivateSyms; -}; - -struct CriticalNameClauseOps { - /// This field has a generic name because it's mirroring the `sym_name` - /// argument of the `OpenMP_CriticalNameClause` tablegen definition. That one - /// can't be renamed to anything more specific because the `sym_name` name is - /// a requirement of the `Symbol` MLIR trait associated with that clause. - StringAttr symName; -}; - -struct DependClauseOps { - llvm::SmallVector dependKinds; - llvm::SmallVector dependVars; -}; - -struct DeviceClauseOps { - Value device; -}; - struct DeviceTypeClauseOps { - // The default capture type. + /// The default capture type. 
DeclareTargetDeviceType deviceType = DeclareTargetDeviceType::any; }; -struct DistScheduleClauseOps { - UnitAttr distScheduleStatic; - Value distScheduleChunkSize; -}; - -struct DoacrossClauseOps { - ClauseDependAttr doacrossDependType; - IntegerAttr doacrossNumLoops; - llvm::SmallVector doacrossDependVars; -}; - -struct FilterClauseOps { - Value filteredThreadId; -}; - -struct FinalClauseOps { - Value final; -}; - -struct GrainsizeClauseOps { - Value grainsize; -}; - -struct HasDeviceAddrClauseOps { - llvm::SmallVector hasDeviceAddrVars; -}; - -struct HintClauseOps { - IntegerAttr hint; -}; - -struct IfClauseOps { - Value ifVar; -}; - -struct InReductionClauseOps { - llvm::SmallVector inReductionVars; - llvm::SmallVector inReductionByref; - llvm::SmallVector inReductionSyms; -}; - -struct IsDevicePtrClauseOps { - llvm::SmallVector isDevicePtrVars; -}; - -struct LinearClauseOps { - llvm::SmallVector linearVars, linearStepVars; -}; - -struct LoopRelatedOps { - llvm::SmallVector loopLowerBounds, loopUpperBounds, loopSteps; - UnitAttr loopInclusive; -}; - -struct MapClauseOps { - llvm::SmallVector mapVars; -}; - -struct MergeableClauseOps { - UnitAttr mergeable; -}; - -struct NogroupClauseOps { - UnitAttr nogroup; -}; - -struct NontemporalClauseOps { - llvm::SmallVector nontemporalVars; -}; - -struct NowaitClauseOps { - UnitAttr nowait; -}; - -struct NumTasksClauseOps { - Value numTasks; -}; - -struct NumTeamsClauseOps { - Value numTeamsLower, numTeamsUpper; -}; - -struct NumThreadsClauseOps { - Value numThreads; -}; - -struct OrderClauseOps { - ClauseOrderKindAttr order; - OrderModifierAttr orderMod; -}; - -struct OrderedClauseOps { - IntegerAttr ordered; -}; - -struct ParallelizationLevelClauseOps { - UnitAttr parLevelSimd; -}; - -struct PriorityClauseOps { - Value priority; -}; - -struct PrivateClauseOps { - // SSA values that correspond to "original" values being privatized. - // They refer to the SSA value outside the OpenMP region from which a clone is - // created inside the region. - llvm::SmallVector privateVars; - // The list of symbols referring to delayed privatizer ops (i.e. `omp.private` - // ops). - llvm::SmallVector privateSyms; -}; - -struct ProcBindClauseOps { - ClauseProcBindKindAttr procBindKind; -}; - -struct ReductionClauseOps { - llvm::SmallVector reductionVars; - llvm::SmallVector reductionByref; - llvm::SmallVector reductionSyms; -}; - -struct SafelenClauseOps { - IntegerAttr safelen; -}; - -struct ScheduleClauseOps { - ClauseScheduleKindAttr scheduleKind; - Value scheduleChunk; - ScheduleModifierAttr scheduleMod; - UnitAttr scheduleSimd; -}; - -struct SimdlenClauseOps { - IntegerAttr simdlen; -}; - -struct TaskReductionClauseOps { - llvm::SmallVector taskReductionVars; - llvm::SmallVector taskReductionByref; - llvm::SmallVector taskReductionSyms; -}; - -struct ThreadLimitClauseOps { - Value threadLimit; -}; - -struct UntiedClauseOps { - UnitAttr untied; -}; - -struct UseDeviceAddrClauseOps { - llvm::SmallVector useDeviceAddrVars; -}; - -struct UseDevicePtrClauseOps { - llvm::SmallVector useDevicePtrVars; -}; - //===----------------------------------------------------------------------===// -// Structures defining clause operands associated with each OpenMP leaf -// construct. -// -// These mirror the arguments expected by the corresponding OpenMP MLIR ops. +// Extra operation operand structures. //===----------------------------------------------------------------------===// -namespace detail { -template -struct Clauses : public Mixins... 
{}; -} // namespace detail - -using CancelOperands = - detail::Clauses; - -using CancellationPointOperands = detail::Clauses; - -using CriticalDeclareOperands = - detail::Clauses; - -// TODO `indirect` clause. +// TODO: Add `indirect` clause. using DeclareTargetOperands = detail::Clauses; -using DistributeOperands = - detail::Clauses; - -using LoopNestOperands = detail::Clauses; - -using MaskedOperands = detail::Clauses; - -using OrderedOperands = detail::Clauses; - -using OrderedRegionOperands = detail::Clauses; - -using ParallelOperands = - detail::Clauses; - -using SectionsOperands = detail::Clauses; - -using SimdOperands = - detail::Clauses; - -using SingleOperands = detail::Clauses; - -// TODO `defaultmap`, `uses_allocators` clauses. -using TargetOperands = - detail::Clauses; - -using TargetDataOperands = - detail::Clauses; - -using TargetEnterExitUpdateDataOperands = - detail::Clauses; - -// TODO `affinity`, `detach` clauses. -using TaskOperands = - detail::Clauses; - -using TaskgroupOperands = - detail::Clauses; - -using TaskloopOperands = - detail::Clauses; - -using TaskwaitOperands = detail::Clauses; - -using TeamsOperands = - detail::Clauses; - -using WsloopOperands = - detail::Clauses; +/// omp.target_enter_data, omp.target_exit_data and omp.target_update take the +/// same clauses, so we give the structure to be shared by all of them a +/// representative name. +using TargetEnterExitUpdateDataOperands = TargetEnterDataOperands; } // namespace omp } // namespace mlir diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 1a9b87f0d68c9..e4ed58f26016a 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1370,7 +1370,7 @@ static LogicalResult verifyMapClause(Operation *op, OperandRange mapVars) { void TargetDataOp::build(OpBuilder &builder, OperationState &state, const TargetDataOperands &clauses) { - TargetDataOp::build(builder, state, clauses.device, clauses.ifVar, + TargetDataOp::build(builder, state, clauses.device, clauses.ifExpr, clauses.mapVars, clauses.useDeviceAddrVars, clauses.useDevicePtrVars); } @@ -1395,7 +1395,7 @@ void TargetEnterDataOp::build( MLIRContext *ctx = builder.getContext(); TargetEnterDataOp::build(builder, state, makeArrayAttr(ctx, clauses.dependKinds), - clauses.dependVars, clauses.device, clauses.ifVar, + clauses.dependVars, clauses.device, clauses.ifExpr, clauses.mapVars, clauses.nowait); } @@ -1415,7 +1415,7 @@ void TargetExitDataOp::build(OpBuilder &builder, OperationState &state, MLIRContext *ctx = builder.getContext(); TargetExitDataOp::build(builder, state, makeArrayAttr(ctx, clauses.dependKinds), - clauses.dependVars, clauses.device, clauses.ifVar, + clauses.dependVars, clauses.device, clauses.ifExpr, clauses.mapVars, clauses.nowait); } @@ -1434,7 +1434,7 @@ void TargetUpdateOp::build(OpBuilder &builder, OperationState &state, const TargetEnterExitUpdateDataOperands &clauses) { MLIRContext *ctx = builder.getContext(); TargetUpdateOp::build(builder, state, makeArrayAttr(ctx, clauses.dependKinds), - clauses.dependVars, clauses.device, clauses.ifVar, + clauses.dependVars, clauses.device, clauses.ifExpr, clauses.mapVars, clauses.nowait); } @@ -1456,7 +1456,7 @@ void TargetOp::build(OpBuilder &builder, OperationState &state, // inReductionByref, inReductionSyms. 
TargetOp::build(builder, state, /*allocate_vars=*/{}, /*allocator_vars=*/{}, makeArrayAttr(ctx, clauses.dependKinds), clauses.dependVars, - clauses.device, clauses.hasDeviceAddrVars, clauses.ifVar, + clauses.device, clauses.hasDeviceAddrVars, clauses.ifExpr, /*in_reduction_vars=*/{}, /*in_reduction_byref=*/nullptr, /*in_reduction_syms=*/nullptr, clauses.isDevicePtrVars, clauses.mapVars, clauses.nowait, clauses.privateVars, @@ -1488,9 +1488,8 @@ void ParallelOp::build(OpBuilder &builder, OperationState &state, void ParallelOp::build(OpBuilder &builder, OperationState &state, const ParallelOperands &clauses) { MLIRContext *ctx = builder.getContext(); - ParallelOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars, - clauses.ifVar, clauses.numThreads, clauses.privateVars, + clauses.ifExpr, clauses.numThreads, clauses.privateVars, makeArrayAttr(ctx, clauses.privateSyms), clauses.procBindKind, clauses.reductionVars, makeDenseBoolArrayAttr(ctx, clauses.reductionByref), @@ -1588,13 +1587,12 @@ void TeamsOp::build(OpBuilder &builder, OperationState &state, const TeamsOperands &clauses) { MLIRContext *ctx = builder.getContext(); // TODO Store clauses in op: privateVars, privateSyms. - TeamsOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars, - clauses.ifVar, clauses.numTeamsLower, clauses.numTeamsUpper, - /*private_vars=*/{}, - /*private_syms=*/nullptr, clauses.reductionVars, - makeDenseBoolArrayAttr(ctx, clauses.reductionByref), - makeArrayAttr(ctx, clauses.reductionSyms), - clauses.threadLimit); + TeamsOp::build( + builder, state, clauses.allocateVars, clauses.allocatorVars, + clauses.ifExpr, clauses.numTeamsLower, clauses.numTeamsUpper, + /*private_vars=*/{}, /*private_syms=*/nullptr, clauses.reductionVars, + makeDenseBoolArrayAttr(ctx, clauses.reductionByref), + makeArrayAttr(ctx, clauses.reductionSyms), clauses.threadLimit); } LogicalResult TeamsOp::verify() { @@ -1814,7 +1812,7 @@ void SimdOp::build(OpBuilder &builder, OperationState &state, // TODO Store clauses in op: linearVars, linearStepVars, privateVars, // privateSyms, reductionVars, reductionByref, reductionSyms. SimdOp::build(builder, state, clauses.alignedVars, - makeArrayAttr(ctx, clauses.alignments), clauses.ifVar, + makeArrayAttr(ctx, clauses.alignments), clauses.ifExpr, /*linear_vars=*/{}, /*linear_step_vars=*/{}, clauses.nontemporalVars, clauses.order, clauses.orderMod, /*private_vars=*/{}, /*private_syms=*/nullptr, @@ -1996,7 +1994,7 @@ void TaskOp::build(OpBuilder &builder, OperationState &state, // TODO Store clauses in op: privateVars, privateSyms. TaskOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars, makeArrayAttr(ctx, clauses.dependKinds), clauses.dependVars, - clauses.final, clauses.ifVar, clauses.inReductionVars, + clauses.final, clauses.ifExpr, clauses.inReductionVars, makeDenseBoolArrayAttr(ctx, clauses.inReductionByref), makeArrayAttr(ctx, clauses.inReductionSyms), clauses.mergeable, clauses.priority, /*private_vars=*/{}, /*private_syms=*/nullptr, @@ -2042,7 +2040,7 @@ void TaskloopOp::build(OpBuilder &builder, OperationState &state, // TODO Store clauses in op: privateVars, privateSyms. 
TaskloopOp::build( builder, state, clauses.allocateVars, clauses.allocatorVars, - clauses.final, clauses.grainsize, clauses.ifVar, clauses.inReductionVars, + clauses.final, clauses.grainsize, clauses.ifExpr, clauses.inReductionVars, makeDenseBoolArrayAttr(ctx, clauses.inReductionByref), makeArrayAttr(ctx, clauses.inReductionSyms), clauses.mergeable, clauses.nogroup, clauses.numTasks, clauses.priority, /*private_vars=*/{}, @@ -2424,7 +2422,7 @@ LogicalResult AtomicCaptureOp::verifyRegions() { void CancelOp::build(OpBuilder &builder, OperationState &state, const CancelOperands &clauses) { - CancelOp::build(builder, state, clauses.cancelDirective, clauses.ifVar); + CancelOp::build(builder, state, clauses.cancelDirective, clauses.ifExpr); } LogicalResult CancelOp::verify() { diff --git a/mlir/test/mlir-tblgen/openmp-clause-ops.td b/mlir/test/mlir-tblgen/openmp-clause-ops.td new file mode 100644 index 0000000000000..cee3f2a693bf8 --- /dev/null +++ b/mlir/test/mlir-tblgen/openmp-clause-ops.td @@ -0,0 +1,86 @@ +// Tablegen tests for the automatic generation of OpenMP clause operand +// structure definitions. + +// Run tablegen to generate OmpCommon.td in temp directory first. +// RUN: mkdir -p %t/mlir/Dialect/OpenMP +// RUN: mlir-tblgen --gen-directive-decl --directives-dialect=OpenMP \ +// RUN: %S/../../../llvm/include/llvm/Frontend/OpenMP/OMP.td \ +// RUN: -I %S/../../../llvm/include > %t/mlir/Dialect/OpenMP/OmpCommon.td + +// RUN: mlir-tblgen -gen-openmp-clause-ops -I %S/../../include -I %t %s 2>&1 | FileCheck %s + +include "mlir/Dialect/OpenMP/OpenMPOpBase.td" + + +def OpenMP_MyFirstClause : OpenMP_Clause< + /*isRequired=*/false, /*skipTraits=*/false, /*skipArguments=*/false, + /*skipAssemblyFormat=*/false, /*skipDescription=*/false, + /*skipExtraClassDeclaration=*/false> { + let arguments = (ins + // Simple attributes + I32Attr:$int_attr, + TypeAttr:$type_attr, + DeclareTargetAttr:$omp_attr, + + // Array attributes + F32ArrayAttr:$float_array_attr, + StrArrayAttr:$str_array_attr, + AnyIntElementsAttr:$anyint_elems_attr, + RankedF32ElementsAttr<[3, 4, 5]>:$float_nd_elems_attr, + + // Optional attributes + OptionalAttr:$opt_bool_attr, + OptionalAttr:$opt_int_array_attr, + OptionalAttr:$opt_int_elems_attr, + + // Multi-level composition + ConfinedAttr, [IntMinValue<0>]>:$complex_opt_int_attr, + + // ElementsAttrBase-related edge cases. 
+ // CHECK: warning: could not infer array-like attribute element type for argument 'elements_attr', will use bare `storageType` + ElementsAttr:$elements_attr, + // CHECK: warning: could not infer array-like attribute element type for argument 'string_elements_attr', will use bare `storageType` + StringElementsAttr:$string_elements_attr + ); +} +// CHECK: struct MyFirstClauseOps { +// CHECK-NEXT: ::mlir::IntegerAttr intAttr; +// CHECK-NEXT: ::mlir::TypeAttr typeAttr; +// CHECK-NEXT: ::mlir::omp::DeclareTargetAttr ompAttr; + +// CHECK-NEXT: ::llvm::SmallVector<::mlir::Attribute> floatArrayAttr; +// CHECK-NEXT: ::llvm::SmallVector<::mlir::Attribute> strArrayAttr; +// CHECK-NEXT: ::llvm::SmallVector<::llvm::APInt> anyintElemsAttr; +// CHECK-NEXT: ::llvm::SmallVector<::llvm::APFloat> floatNdElemsAttr; + +// CHECK-NEXT: ::mlir::BoolAttr optBoolAttr; +// CHECK-NEXT: ::llvm::SmallVector<::mlir::Attribute> optIntArrayAttr; +// CHECK-NEXT: ::llvm::SmallVector optIntElemsAttr; + +// CHECK-NEXT: ::mlir::IntegerAttr complexOptIntAttr; + +// CHECK-NEXT: ::mlir::ElementsAttr elementsAttr; +// CHECK-NEXT: ::mlir::DenseElementsAttr stringElementsAttr; +// CHECK-NEXT: } + +def OpenMP_MySecondClause : OpenMP_Clause< + /*isRequired=*/false, /*skipTraits=*/false, /*skipArguments=*/false, + /*skipAssemblyFormat=*/false, /*skipDescription=*/false, + /*skipExtraClassDeclaration=*/false> { + let arguments = (ins + I32:$int_val, + Optional:$opt_any_val, + Variadic:$variadic_index_val + ); +} +// CHECK: struct MySecondClauseOps { +// CHECK-NEXT: ::mlir::Value intVal; +// CHECK-NEXT: ::mlir::Value optAnyVal; +// CHECK-NEXT: ::llvm::SmallVector<::mlir::Value> variadicIndexVal; +// CHECK-NEXT: } + +def OpenMP_MyFirstOp : OpenMP_Op<"op", clauses=[OpenMP_MyFirstClause]>; +// CHECK: using MyFirstOperands = detail::Clauses; + +def OpenMP_MySecondOp : OpenMP_Op<"op", clauses=[OpenMP_MyFirstClause, OpenMP_MySecondClause]>; +// CHECK: using MySecondOperands = detail::Clauses; diff --git a/mlir/tools/mlir-tblgen/OmpOpGen.cpp b/mlir/tools/mlir-tblgen/OmpOpGen.cpp index 1545821263788..f546e1b1b6691 100644 --- a/mlir/tools/mlir-tblgen/OmpOpGen.cpp +++ b/mlir/tools/mlir-tblgen/OmpOpGen.cpp @@ -12,11 +12,54 @@ #include "mlir/TableGen/GenInfo.h" +#include "mlir/TableGen/CodeGenHelpers.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/FormatAdapters.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" using namespace llvm; +/// The code block defining the base mixin class for combining clause operand +/// structures. +static const char *const baseMixinClass = R"( +namespace detail { +template +struct Clauses : public Mixins... {}; +} // namespace detail +)"; + +/// The code block defining operation argument structures. +static const char *const operationArgStruct = R"( +using {0}Operands = detail::Clauses<{1}>; +)"; + +/// Remove multiple optional prefixes and suffixes from \c str. +/// +/// Prefixes and suffixes are attempted to be removed once in the order they +/// appear in the \c prefixes and \c suffixes arguments. All prefixes are +/// processed before suffixes are. 
This means it will behave as shown in the +/// following example: +/// - str: "PrePreNameSuf1Suf2" +/// - prefixes: ["Pre"] +/// - suffixes: ["Suf1", "Suf2"] +/// - return: "PreNameSuf1" +static StringRef stripPrefixAndSuffix(StringRef str, + llvm::ArrayRef prefixes, + llvm::ArrayRef suffixes) { + for (StringRef prefix : prefixes) + if (str.starts_with(prefix)) + str = str.drop_front(prefix.size()); + + for (StringRef suffix : suffixes) + if (str.ends_with(suffix)) + str = str.drop_back(suffix.size()); + + return str; +} + /// Obtain the name of the OpenMP clause a given record inheriting /// `OpenMP_Clause` refers to. /// @@ -53,19 +96,8 @@ static StringRef extractOmpClauseName(const Record *clause) { assert(!clauseClassName.empty() && "clause name must be found"); // Keep only the OpenMP clause name itself for reporting purposes. - StringRef prefix = "OpenMP_"; - StringRef suffixes[] = {"Skip", "Clause"}; - - if (clauseClassName.starts_with(prefix)) - clauseClassName = clauseClassName.substr(prefix.size()); - - for (StringRef suffix : suffixes) { - if (clauseClassName.ends_with(suffix)) - clauseClassName = - clauseClassName.substr(0, clauseClassName.size() - suffix.size()); - } - - return clauseClassName; + return stripPrefixAndSuffix(clauseClassName, /*prefixes=*/{"OpenMP_"}, + /*suffixes=*/{"Skip", "Clause"}); } /// Check that the given argument, identified by its name and initialization @@ -148,6 +180,139 @@ static void verifyClause(const Record *op, const Record *clause) { "or explicitly skipping this field."); } +/// Translate the type of an OpenMP clause's argument to its corresponding +/// representation for clause operand structures. +/// +/// All kinds of values are represented as `mlir::Value` fields, whereas +/// attributes are represented based on their `storageType`. +/// +/// \param[in] name The name of the argument. +/// \param[in] init The `DefInit` object representing the argument. +/// \param[out] nest Number of levels of array nesting associated with the +/// type. Must be initially set to 0. +/// \param[out] rank Rank (number of dimensions, if an array type) of the base +/// type. Must be initially set to 1. +/// +/// \return the name of the base type to represent elements of the argument +/// type. +static StringRef translateArgumentType(ArrayRef loc, StringInit *name, + Init *init, int &nest, int &rank) { + Record *def = cast(init)->getDef(); + + llvm::StringSet superClasses; + for (auto [sc, _] : def->getSuperClasses()) + superClasses.insert(sc->getNameInitAsString()); + + // Handle wrapper-style superclasses. + if (superClasses.contains("OptionalAttr")) + return translateArgumentType( + loc, name, def->getValue("baseAttr")->getValue(), nest, rank); + + if (superClasses.contains("TypedArrayAttrBase")) + return translateArgumentType( + loc, name, def->getValue("elementAttr")->getValue(), ++nest, rank); + + // Handle ElementsAttrBase superclasses. + if (superClasses.contains("ElementsAttrBase")) { + // TODO: Obtain the rank from ranked types. + ++nest; + + if (superClasses.contains("IntElementsAttrBase")) + return "::llvm::APInt"; + if (superClasses.contains("FloatElementsAttr") || + superClasses.contains("RankedFloatElementsAttr")) + return "::llvm::APFloat"; + if (superClasses.contains("DenseArrayAttrBase")) + return stripPrefixAndSuffix(def->getValueAsString("returnType"), + {"::llvm::ArrayRef<"}, {">"}); + + // Decrease the nesting depth in the case where the base type cannot be + // inferred, so that the bare storageType is used instead of a vector. 
+ --nest; + PrintWarning( + loc, + "could not infer array-like attribute element type for argument '" + + name->getAsUnquotedString() + "', will use bare `storageType`"); + } + + // Handle simple attribute and value types. + bool isAttr = superClasses.contains("Attr"); + bool isValue = superClasses.contains("TypeConstraint"); + if (superClasses.contains("Variadic")) + ++nest; + + if (isValue) { + assert(!isAttr && + "argument can't be simultaneously a value and an attribute"); + return "::mlir::Value"; + } + + assert(isAttr && "argument must be an attribute if it's not a value"); + return nest > 0 ? "::mlir::Attribute" + : def->getValueAsString("storageType").trim(); +} + +/// Generate the structure that represents the arguments of the given \c clause +/// record of type \c OpenMP_Clause. +/// +/// It will contain a field for each argument, using the same name translated to +/// camel case and the corresponding base type as returned by +/// translateArgumentType() optionally wrapped in one or more llvm::SmallVector. +/// +/// An additional field containing a tuple of integers to hold the size of each +/// dimension will also be created for multi-rank types. This is not yet +/// supported. +static void genClauseOpsStruct(const Record *clause, raw_ostream &os) { + if (clause->isAnonymous()) + return; + + StringRef clauseName = extractOmpClauseName(clause); + os << "struct " << clauseName << "ClauseOps {\n"; + + DagInit *arguments = clause->getValueAsDag("arguments"); + for (auto [name, arg] : + zip_equal(arguments->getArgNames(), arguments->getArgs())) { + int nest = 0, rank = 1; + StringRef baseType = + translateArgumentType(clause->getLoc(), name, arg, nest, rank); + std::string fieldName = + convertToCamelFromSnakeCase(name->getAsUnquotedString(), + /*capitalizeFirst=*/false); + + os << formatv(" {0}{1}{2} {3};\n", + fmt_repeat("::llvm::SmallVector<", nest), baseType, + fmt_repeat(">", nest), fieldName); + + if (rank > 1) { + assert(nest >= 1 && "must be nested if it's a ranked type"); + os << formatv(" {0}::std::tuple<{1}int>{2} {3}Dims;\n", + fmt_repeat("::llvm::SmallVector<", nest - 1), + fmt_repeat("int, ", rank - 1), fmt_repeat(">", nest - 1), + fieldName); + } + } + + os << "};\n"; +} + +/// Generate the structure that represents the clause-related arguments of the +/// given \c op record of type \c OpenMP_Op. +/// +/// This structure will be defined in terms of the clause operand structures +/// associated to the clauses of the operation. +static void genOperandsDef(const Record *op, raw_ostream &os) { + if (op->isAnonymous()) + return; + + SmallVector clauseNames; + for (Record *clause : op->getValueAsListOfDefs("clauseList")) + clauseNames.push_back((extractOmpClauseName(clause) + "ClauseOps").str()); + + StringRef opName = stripPrefixAndSuffix( + op->getName(), /*prefixes=*/{"OpenMP_"}, /*suffixes=*/{"Op"}); + os << formatv(operationArgStruct, opName, join(clauseNames, ", ")); +} + /// Verify that all properties of `OpenMP_Clause`s of records deriving from /// `OpenMP_Op`s have been inherited by the latter. static bool verifyDecls(const RecordKeeper &recordKeeper, raw_ostream &) { @@ -159,8 +324,32 @@ static bool verifyDecls(const RecordKeeper &recordKeeper, raw_ostream &) { return false; } +/// Generate structures to represent clause-related operands, based on existing +/// `OpenMP_Clause` definitions and aggregate them into operation-specific +/// structures according to the `clauses` argument of each definition deriving +/// from `OpenMP_Op`. 
+static bool genClauseOps(const RecordKeeper &recordKeeper, raw_ostream &os) { + mlir::tblgen::NamespaceEmitter ns(os, "mlir::omp"); + for (const Record *clause : + recordKeeper.getAllDerivedDefinitions("OpenMP_Clause")) + genClauseOpsStruct(clause, os); + + // Produce base mixin class. + os << baseMixinClass; + + for (const Record *op : recordKeeper.getAllDerivedDefinitions("OpenMP_Op")) + genOperandsDef(op, os); + + return false; +} + // Registers the generator to mlir-tblgen. static mlir::GenRegistration verifyOpenmpOps("verify-openmp-ops", "Verify OpenMP operations (produce no output file)", verifyDecls); + +static mlir::GenRegistration + genOpenmpClauseOps("gen-openmp-clause-ops", + "Generate OpenMP clause operand structures", + genClauseOps); From e50131aa068f74daa70d4135c92020aadae3af33 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Wed, 11 Sep 2024 13:20:59 +0200 Subject: [PATCH 083/114] [clang] Diagnose dangling issues for the "Container" case. (#107213) This pull request enhances the GSL lifetime analysis to detect situations where a dangling `Container` object is constructed: ```cpp std::vector bad = {std::string()}; // dangling ``` The assignment case is not yet supported, but they will be addressed in a follow-up. Fixes #100526 (excluding the `push_back` case). --- clang/docs/ReleaseNotes.rst | 2 + clang/include/clang/Basic/AttrDocs.td | 14 ++++ clang/lib/Sema/CheckExprLifetime.cpp | 42 +++++++--- .../Sema/warn-lifetime-analysis-nocfg.cpp | 77 +++++++++++++++++++ 4 files changed, 126 insertions(+), 9 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 250821a9f9c45..59ccdf1e15cd8 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -298,6 +298,8 @@ Improvements to Clang's diagnostics - Clang now warns for u8 character literals used in C23 with ``-Wpre-c23-compat`` instead of ``-Wpre-c++17-compat``. +- Clang now diagnoses cases where a dangling ``GSLOwner`` object is constructed, e.g. ``std::vector v = {std::string()};`` (#GH100526). + Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 546e5100b79dd..9f72456d2da67 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -6690,6 +6690,20 @@ When the Owner's lifetime ends, it will consider the Pointer to be dangling. P.getInt(); // P is dangling } +If a template class is annotated with ``[[gsl::Owner]]``, and the first +instantiated template argument is a pointer type (raw pointer, or ``[[gsl::Pointer]]``), +the analysis will consider the instantiated class as a container of the pointer. +When constructing such an object from a GSL owner object, the analysis will +assume that the container holds a pointer to the owner object. Consequently, +when the owner object is destroyed, the pointer will be considered dangling. + +.. code-block:: c++ + + int f() { + std::vector v = {std::string()}; // v holds a dangling pointer. + std::optional o = std::string(); // o holds a dangling pointer. + } + }]; } diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index f1507ebb9a506..c8e703036c132 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -267,6 +267,26 @@ static bool isInStlNamespace(const Decl *D) { return DC->isStdNamespace(); } +// Returns true if the given Record decl is a form of `GSLOwner` +// type, e.g. std::vector, std::optional. 
+static bool isContainerOfPointer(const RecordDecl *Container) { + if (const auto *CTSD = + dyn_cast_if_present(Container)) { + if (!CTSD->hasAttr()) // Container must be a GSL owner type. + return false; + const auto &TAs = CTSD->getTemplateArgs(); + return TAs.size() > 0 && TAs[0].getKind() == TemplateArgument::Type && + (isRecordWithAttr(TAs[0].getAsType()) || + TAs[0].getAsType()->isPointerType()); + } + return false; +} + +static bool isGSLOwner(QualType T) { + return isRecordWithAttr(T) && + !isContainerOfPointer(T->getAsRecordDecl()); +} + static bool shouldTrackImplicitObjectArg(const CXXMethodDecl *Callee) { if (auto *Conv = dyn_cast_or_null(Callee)) if (isRecordWithAttr(Conv->getConversionType())) @@ -275,7 +295,7 @@ static bool shouldTrackImplicitObjectArg(const CXXMethodDecl *Callee) { return false; if (!isRecordWithAttr( Callee->getFunctionObjectParameterType()) && - !isRecordWithAttr(Callee->getFunctionObjectParameterType())) + !isGSLOwner(Callee->getFunctionObjectParameterType())) return false; if (Callee->getReturnType()->isPointerType() || isRecordWithAttr(Callee->getReturnType())) { @@ -413,7 +433,7 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, // Once we initialized a value with a non gsl-owner reference, it can no // longer dangle. if (ReturnType->isReferenceType() && - !isRecordWithAttr(ReturnType->getPointeeType())) { + !isGSLOwner(ReturnType->getPointeeType())) { for (const IndirectLocalPathEntry &PE : llvm::reverse(Path)) { if (PE.Kind == IndirectLocalPathEntry::GslReferenceInit || PE.Kind == IndirectLocalPathEntry::LifetimeBoundCall) @@ -468,12 +488,17 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, if (CheckCoroCall || Callee->getParamDecl(I)->hasAttr()) VisitLifetimeBoundArg(Callee->getParamDecl(I), Args[I]); else if (EnableGSLAnalysis && I == 0) { + // Perform GSL analysis for the first argument if (shouldTrackFirstArgument(Callee)) { VisitGSLPointerArg(Callee, Args[0]); - } else if (auto *CCE = dyn_cast(Call); - CCE && - CCE->getConstructor()->getParent()->hasAttr()) { - VisitGSLPointerArg(CCE->getConstructor(), Args[0]); + } else if (auto *Ctor = dyn_cast(Call)) { + const auto *ClassD = Ctor->getConstructor()->getParent(); + // Two cases: + // a GSL pointer, e.g. std::string_view + // a container of GSL pointer, e.g. std::vector + if (ClassD->hasAttr() || + (isContainerOfPointer(ClassD) && Callee->getNumParams() == 1)) + VisitGSLPointerArg(Ctor->getConstructor(), Args[0]); } } } @@ -990,13 +1015,12 @@ static void checkExprLifetimeImpl(Sema &SemaRef, // int &p = *localUniquePtr; // someContainer.add(std::move(localUniquePtr)); // return p; - IsLocalGslOwner = isRecordWithAttr(L->getType()); + IsLocalGslOwner = isGSLOwner(L->getType()); if (pathContainsInit(Path) || !IsLocalGslOwner) return false; } else { IsGslPtrValueFromGslTempOwner = - MTE && !MTE->getExtendingDecl() && - isRecordWithAttr(MTE->getType()); + MTE && !MTE->getExtendingDecl() && isGSLOwner(MTE->getType()); // Skipping a chain of initializing gsl::Pointer annotated objects. // We are looking only for the final source to find out if it was // a local or temporary owner or the address of a local variable/param. 
diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp index 59357d0730a7d..234e06f069074 100644 --- a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp +++ b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp @@ -158,17 +158,30 @@ auto begin(C &c) -> decltype(c.begin()); template T *begin(T (&array)[N]); +using size_t = decltype(sizeof(0)); + +template +struct initializer_list { + const T* ptr; size_t sz; +}; template struct vector { typedef __gnu_cxx::basic_iterator iterator; iterator begin(); iterator end(); const T *data() const; + vector(); + vector(initializer_list __l); + + template + vector(InputIterator first, InputIterator __last); + T &at(int n); }; template struct basic_string_view { + basic_string_view(); basic_string_view(const T *); const T *begin() const; }; @@ -203,11 +216,21 @@ template struct optional { optional(); optional(const T&); + + template + optional(U&& t); + + template + optional(optional&& __t); + T &operator*() &; T &&operator*() &&; T &value() &; T &&value() &&; }; +template +optional<__decay(T)> make_optional(T&&); + template struct stack { @@ -553,3 +576,57 @@ void test() { std::string_view svjkk1 = ReturnStringView(StrCat("bar", "x")); // expected-warning {{object backing the pointer will be destroyed at the end of the full-expression}} } } // namespace GH100549 + +namespace GH100526 { +void test() { + std::vector v1({std::string()}); // expected-warning {{object backing the pointer will be destroyed at the end}} + std::vector v2({ + std::string(), // expected-warning {{object backing the pointer will be destroyed at the end}} + std::string_view() + }); + std::vector v3({ + std::string_view(), + std::string() // expected-warning {{object backing the pointer will be destroyed at the end}} + }); + + std::optional o1 = std::string(); // expected-warning {{object backing the pointer}} + + std::string s; + // This is a tricky use-after-free case, what it does: + // 1. make_optional creates a temporary "optional"" object + // 2. the temporary object owns the underlying string which is copied from s. + // 3. the t3 object holds the view to the underlying string of the temporary object. + std::optional o2 = std::make_optional(s); // expected-warning {{object backing the pointer}} + std::optional o3 = std::optional(s); // expected-warning {{object backing the pointer}} + std::optional o4 = std::optional(s); + + // FIXME: should work for assignment cases + v1 = {std::string()}; + o1 = std::string(); + + // no warning on copying pointers. + std::vector n1 = {std::string_view()}; + std::optional n2 = {std::string_view()}; + std::optional n3 = std::string_view(); + std::optional n4 = std::make_optional(std::string_view()); + const char* b = ""; + std::optional n5 = std::make_optional(b); + std::optional n6 = std::make_optional("test"); +} + +std::vector test2(int i) { + std::vector t; + if (i) + return t; // this is fine, no dangling + return std::vector(t.begin(), t.end()); +} + +std::optional test3(int i) { + std::string s; + std::string_view sv; + if (i) + return s; // expected-warning {{address of stack memory associated}} + return sv; // fine +} + +} // namespace GH100526 From 334873fe2df27a4fa613e8744f29e502d3358397 Mon Sep 17 00:00:00 2001 From: Amy Wang Date: Wed, 11 Sep 2024 07:37:35 -0400 Subject: [PATCH 084/114] [MLIR][Python] Python binding support for IntegerSet attribute (#107640) Support IntegerSet attribute python binding. 
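A minimal usage sketch (mirroring the `testIntegerSetAttr` case added below;
the particular set is illustrative):

```python
from mlir.ir import AffineDimExpr, Attribute, Context, IntegerSet, IntegerSetAttr

with Context():
    d0 = AffineDimExpr.get(0)
    d1 = AffineDimExpr.get(1)
    # Build the set (d0, d1) : (d0 - d1 == 0).
    s = IntegerSet.get(2, 0, [d0 - d1], [True])
    attr = IntegerSetAttr.get(s)  # wrap the IntegerSet in an attribute
    assert IntegerSetAttr.isinstance(attr)
    assert Attribute.parse(str(attr)) == attr  # round-trips through the parser
```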
--- mlir/include/mlir-c/BuiltinAttributes.h | 9 +++++++++ mlir/lib/Bindings/Python/IRAttributes.cpp | 22 +++++++++++++++++++++- mlir/lib/CAPI/IR/BuiltinAttributes.cpp | 9 +++++++++ mlir/python/mlir/_mlir_libs/_mlir/ir.pyi | 16 ++++++++++++++++ mlir/python/mlir/ir.py | 5 +++++ mlir/test/python/ir/attributes.py | 18 ++++++++++++++++++ 6 files changed, 78 insertions(+), 1 deletion(-) diff --git a/mlir/include/mlir-c/BuiltinAttributes.h b/mlir/include/mlir-c/BuiltinAttributes.h index 231eb83b5e269..7c8c84e55b962 100644 --- a/mlir/include/mlir-c/BuiltinAttributes.h +++ b/mlir/include/mlir-c/BuiltinAttributes.h @@ -16,6 +16,7 @@ #include "mlir-c/AffineMap.h" #include "mlir-c/IR.h" +#include "mlir-c/IntegerSet.h" #include "mlir-c/Support.h" #ifdef __cplusplus @@ -177,6 +178,14 @@ MLIR_CAPI_EXPORTED bool mlirBoolAttrGetValue(MlirAttribute attr); /// Checks whether the given attribute is an integer set attribute. MLIR_CAPI_EXPORTED bool mlirAttributeIsAIntegerSet(MlirAttribute attr); +/// Creates an integer set attribute wrapping the given set. The attribute +/// belongs to the same context as the integer set. +MLIR_CAPI_EXPORTED MlirAttribute mlirIntegerSetAttrGet(MlirIntegerSet set); + +/// Returns the integer set wrapped in the given integer set attribute. +MLIR_CAPI_EXPORTED MlirIntegerSet +mlirIntegerSetAttrGetValue(MlirAttribute attr); + /// Returns the typeID of an IntegerSet attribute. MLIR_CAPI_EXPORTED MlirTypeID mlirIntegerSetAttrGetTypeID(void); diff --git a/mlir/lib/Bindings/Python/IRAttributes.cpp b/mlir/lib/Bindings/Python/IRAttributes.cpp index b4049bd7972d4..bfdd4a520af27 100644 --- a/mlir/lib/Bindings/Python/IRAttributes.cpp +++ b/mlir/lib/Bindings/Python/IRAttributes.cpp @@ -147,6 +147,26 @@ class PyAffineMapAttribute : public PyConcreteAttribute { } }; +class PyIntegerSetAttribute + : public PyConcreteAttribute { +public: + static constexpr IsAFunctionTy isaFunction = mlirAttributeIsAIntegerSet; + static constexpr const char *pyClassName = "IntegerSetAttr"; + using PyConcreteAttribute::PyConcreteAttribute; + static constexpr GetTypeIDFunctionTy getTypeIdFunction = + mlirIntegerSetAttrGetTypeID; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](PyIntegerSet &integerSet) { + MlirAttribute attr = mlirIntegerSetAttrGet(integerSet.get()); + return PyIntegerSetAttribute(integerSet.getContext(), attr); + }, + py::arg("integer_set"), "Gets an attribute wrapping an IntegerSet."); + } +}; + template static T pyTryCast(py::handle object) { try { @@ -1426,7 +1446,6 @@ py::object symbolRefOrFlatSymbolRefAttributeCaster(PyAttribute &pyAttribute) { void mlir::python::populateIRAttributes(py::module &m) { PyAffineMapAttribute::bind(m); - PyDenseBoolArrayAttribute::bind(m); PyDenseBoolArrayAttribute::PyDenseArrayIterator::bind(m); PyDenseI8ArrayAttribute::bind(m); @@ -1466,6 +1485,7 @@ void mlir::python::populateIRAttributes(py::module &m) { PyOpaqueAttribute::bind(m); PyFloatAttribute::bind(m); PyIntegerAttribute::bind(m); + PyIntegerSetAttribute::bind(m); PyStringAttribute::bind(m); PyTypeAttribute::bind(m); PyGlobals::get().registerTypeCaster( diff --git a/mlir/lib/CAPI/IR/BuiltinAttributes.cpp b/mlir/lib/CAPI/IR/BuiltinAttributes.cpp index 726af884668b2..11d1ade552f5a 100644 --- a/mlir/lib/CAPI/IR/BuiltinAttributes.cpp +++ b/mlir/lib/CAPI/IR/BuiltinAttributes.cpp @@ -10,6 +10,7 @@ #include "mlir-c/Support.h" #include "mlir/CAPI/AffineMap.h" #include "mlir/CAPI/IR.h" +#include "mlir/CAPI/IntegerSet.h" #include "mlir/CAPI/Support.h" #include "mlir/IR/AsmState.h" #include 
"mlir/IR/Attributes.h" @@ -192,6 +193,14 @@ MlirTypeID mlirIntegerSetAttrGetTypeID(void) { return wrap(IntegerSetAttr::getTypeID()); } +MlirAttribute mlirIntegerSetAttrGet(MlirIntegerSet set) { + return wrap(IntegerSetAttr::get(unwrap(set))); +} + +MlirIntegerSet mlirIntegerSetAttrGetValue(MlirAttribute attr) { + return wrap(llvm::cast(unwrap(attr)).getValue()); +} + //===----------------------------------------------------------------------===// // Opaque attribute. //===----------------------------------------------------------------------===// diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi index 7b4fac7275bfc..a3d3a92618696 100644 --- a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi +++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi @@ -138,6 +138,7 @@ __all__ = [ "InsertionPoint", "IntegerAttr", "IntegerSet", + "IntegerSetAttr", "IntegerSetConstraint", "IntegerSetConstraintList", "IntegerType", @@ -1905,6 +1906,21 @@ class IntegerSet: @property def n_symbols(self) -> int: ... +class IntegerSetAttr(Attribute): + static_typeid: ClassVar[TypeID] + @staticmethod + def get(integer_set) -> IntegerSetAttr: + """ + Gets an attribute wrapping an IntegerSet. + """ + @staticmethod + def isinstance(other: Attribute) -> bool: ... + def __init__(self, cast_from_attr: Attribute) -> None: ... + @property + def type(self) -> Type: ... + @property + def typeid(self) -> TypeID: ... + class IntegerSetConstraint: def __init__(self, *args, **kwargs) -> None: ... @property diff --git a/mlir/python/mlir/ir.py b/mlir/python/mlir/ir.py index a9ac765fe1c17..9a6ce462047ad 100644 --- a/mlir/python/mlir/ir.py +++ b/mlir/python/mlir/ir.py @@ -22,6 +22,11 @@ def _affineMapAttr(x, context): return AffineMapAttr.get(x) +@register_attribute_builder("IntegerSetAttr") +def _integerSetAttr(x, context): + return IntegerSetAttr.get(x) + + @register_attribute_builder("BoolAttr") def _boolAttr(x, context): return BoolAttr.get(x, context=context) diff --git a/mlir/test/python/ir/attributes.py b/mlir/test/python/ir/attributes.py index 4b475db634645..00c3e1b4decdb 100644 --- a/mlir/test/python/ir/attributes.py +++ b/mlir/test/python/ir/attributes.py @@ -162,6 +162,24 @@ def testAffineMapAttr(): assert attr_built == attr_parsed +# CHECK-LABEL: TEST: testIntegerSetAttr +@run +def testIntegerSetAttr(): + with Context() as ctx: + d0 = AffineDimExpr.get(0) + d1 = AffineDimExpr.get(1) + s0 = AffineSymbolExpr.get(0) + c42 = AffineConstantExpr.get(42) + set0 = IntegerSet.get(2, 1, [d0 - d1, s0 - c42], [True, False]) + + # CHECK: affine_set<(d0, d1)[s0] : (d0 - d1 == 0, s0 - 42 >= 0)> + attr_built = IntegerSetAttr.get(set0) + print(str(attr_built)) + + attr_parsed = Attribute.parse(str(attr_built)) + assert attr_built == attr_parsed + + # CHECK-LABEL: TEST: testFloatAttr @run def testFloatAttr(): From 7c25ae87f7378f38aa49a92b9cf8092deb95a1f4 Mon Sep 17 00:00:00 2001 From: Frederik Carlier Date: Wed, 11 Sep 2024 13:38:00 +0200 Subject: [PATCH 085/114] Set dllimport on Objective C ivar offsets (#107604) Ensures that offsets for instance variables are marked with `dllimport` if the interface to which they belong has this attribute. 
--- clang/lib/CodeGen/CGObjCGNU.cpp | 11 +++++++++-- clang/test/CodeGenObjC/dllstorage.m | 4 ++-- clang/test/SemaObjC/ivar-access-tests.m | 10 ++++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/clang/lib/CodeGen/CGObjCGNU.cpp b/clang/lib/CodeGen/CGObjCGNU.cpp index adc7cdbfded88..6280e9465ecba 100644 --- a/clang/lib/CodeGen/CGObjCGNU.cpp +++ b/clang/lib/CodeGen/CGObjCGNU.cpp @@ -1699,11 +1699,18 @@ class CGObjCGNUstep2 : public CGObjCGNUstep { llvm::Value *EmitIvarOffset(CodeGenFunction &CGF, const ObjCInterfaceDecl *Interface, const ObjCIvarDecl *Ivar) override { - const std::string Name = GetIVarOffsetVariableName(Ivar->getContainingInterface(), Ivar); + const ObjCInterfaceDecl *ContainingInterface = + Ivar->getContainingInterface(); + const std::string Name = + GetIVarOffsetVariableName(ContainingInterface, Ivar); llvm::GlobalVariable *IvarOffsetPointer = TheModule.getNamedGlobal(Name); - if (!IvarOffsetPointer) + if (!IvarOffsetPointer) { IvarOffsetPointer = new llvm::GlobalVariable(TheModule, IntTy, false, llvm::GlobalValue::ExternalLinkage, nullptr, Name); + if (Ivar->getAccessControl() != ObjCIvarDecl::Private && + Ivar->getAccessControl() != ObjCIvarDecl::Package) + CGM.setGVProperties(IvarOffsetPointer, ContainingInterface); + } CharUnits Align = CGM.getIntAlign(); llvm::Value *Offset = CGF.Builder.CreateAlignedLoad(IntTy, IvarOffsetPointer, Align); diff --git a/clang/test/CodeGenObjC/dllstorage.m b/clang/test/CodeGenObjC/dllstorage.m index c94f4c9b5804d..a6c591b2d7930 100644 --- a/clang/test/CodeGenObjC/dllstorage.m +++ b/clang/test/CodeGenObjC/dllstorage.m @@ -112,7 +112,7 @@ @interface M : I { // CHECK-IR-DAG: @"OBJC_IVAR_$_M._ivar" = external dllimport global i32 // CHECK-NF-DAG: @"$_OBJC_REF_CLASS_M" = external dllimport global ptr -// CHECK-NF-DAG: @"__objc_ivar_offset_M._ivar.@" = external global i32 +// CHECK-NF-DAG: @"__objc_ivar_offset_M._ivar.@" = external dllimport global i32 __declspec(dllexport) __attribute__((__objc_exception__)) @@ -151,7 +151,7 @@ id f(Q *q) { // CHECK-IR-DAG: @"OBJC_IVAR_$_M._ivar" = external dllimport global i32 -// CHECK-NF-DAG: @"__objc_ivar_offset_M._ivar.@" = external global i32 +// CHECK-NF-DAG: @"__objc_ivar_offset_M._ivar.@" = external dllimport global i32 int g(void) { @autoreleasepool { diff --git a/clang/test/SemaObjC/ivar-access-tests.m b/clang/test/SemaObjC/ivar-access-tests.m index cd7e09d406ada..6060dea5ab0f0 100644 --- a/clang/test/SemaObjC/ivar-access-tests.m +++ b/clang/test/SemaObjC/ivar-access-tests.m @@ -2,6 +2,8 @@ @interface MySuperClass { + int unmarked; + @private int private; @@ -17,6 +19,7 @@ @implementation MySuperClass - (void) test { int access; MySuperClass *s = 0; + access = s->unmarked; access = s->private; access = s->protected; } @@ -30,9 +33,11 @@ @implementation MyClass - (void) test { int access; MySuperClass *s = 0; + access = s->unmarked; access = s->private; // expected-error {{instance variable 'private' is private}} access = s->protected; MyClass *m=0; + access = m->unmarked; access = m->private; // expected-error {{instance variable 'private' is private}} access = m->protected; } @@ -46,9 +51,11 @@ @implementation Deeper - (void) test { int access; MySuperClass *s = 0; + access = s->unmarked; access = s->private; // expected-error {{instance variable 'private' is private}} access = s->protected; MyClass *m=0; + access = m->unmarked; access = m->private; // expected-error {{instance variable 'private' is private}} access = m->protected; } @@ -61,9 +68,11 @@ @implementation 
Unrelated - (void) test { int access; MySuperClass *s = 0; + access = s->unmarked; // expected-error {{instance variable 'unmarked' is protected}} access = s->private; // expected-error {{instance variable 'private' is private}} access = s->protected; // expected-error {{instance variable 'protected' is protected}} MyClass *m=0; + access = m->unmarked; // expected-error {{instance variable 'unmarked' is protected}} access = m->private; // expected-error {{instance variable 'private' is private}} access = m->protected; // expected-error {{instance variable 'protected' is protected}} } @@ -73,6 +82,7 @@ int main (void) { MySuperClass *s = 0; int access; + access = s->unmarked; // expected-error {{instance variable 'unmarked' is protected}} access = s->private; // expected-error {{instance variable 'private' is private}} access = s->protected; // expected-error {{instance variable 'protected' is protected}} return 0; From b35bb7b797e81e1d972c8e6d60e20e39c1917b99 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Wed, 11 Sep 2024 19:46:26 +0800 Subject: [PATCH 086/114] [mlir] Fix 'StringSet' may not intend to support class template argument deduction (NFC) /llvm-project/mlir/tools/mlir-tblgen/OmpOpGen.cpp:202:3: error: 'StringSet' may not intend to support class template argument deduction [-Werror,-Wctad-maybe-unsupported] llvm::StringSet superClasses; ^ /llvm-project/llvm/include/llvm/ADT/StringSet.h:23:7: note: add a deduction guide to suppress this warning class StringSet : public StringMap { ^ --- mlir/tools/mlir-tblgen/OmpOpGen.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/tools/mlir-tblgen/OmpOpGen.cpp b/mlir/tools/mlir-tblgen/OmpOpGen.cpp index f546e1b1b6691..c9d25a5dee5cd 100644 --- a/mlir/tools/mlir-tblgen/OmpOpGen.cpp +++ b/mlir/tools/mlir-tblgen/OmpOpGen.cpp @@ -199,7 +199,7 @@ static StringRef translateArgumentType(ArrayRef loc, StringInit *name, Init *init, int &nest, int &rank) { Record *def = cast(init)->getDef(); - llvm::StringSet superClasses; + llvm::StringSet<> superClasses; for (auto [sc, _] : def->getSuperClasses()) superClasses.insert(sc->getNameInitAsString()); From 0856f12bb0a9829a282bef7c26ad536ff3b1e0a5 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Wed, 11 Sep 2024 19:59:00 +0800 Subject: [PATCH 087/114] [mlir] Fix -Wunused-variable in OmpOpGen.cpp (NFC) /llvm-project/mlir/tools/mlir-tblgen/OmpOpGen.cpp:239:8: error: unused variable 'isAttr' [-Werror,-Wunused-variable] bool isAttr = superClasses.contains("Attr"); ^ --- mlir/tools/mlir-tblgen/OmpOpGen.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/tools/mlir-tblgen/OmpOpGen.cpp b/mlir/tools/mlir-tblgen/OmpOpGen.cpp index c9d25a5dee5cd..23368c56bee8c 100644 --- a/mlir/tools/mlir-tblgen/OmpOpGen.cpp +++ b/mlir/tools/mlir-tblgen/OmpOpGen.cpp @@ -236,7 +236,7 @@ static StringRef translateArgumentType(ArrayRef loc, StringInit *name, } // Handle simple attribute and value types. 
- bool isAttr = superClasses.contains("Attr"); + [[maybe_unused]] bool isAttr = superClasses.contains("Attr"); bool isValue = superClasses.contains("TypeConstraint"); if (superClasses.contains("Variadic")) ++nest;

From ed22029eea12b37c2a58f2c6b8d67f12009102a0 Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy Date: Wed, 11 Sep 2024 14:18:14 +0200 Subject: [PATCH 088/114] [SPIR-V] Address the case when optimization uses GEP operator and GenCode creates G_PTR_ADD to convey the semantics (#107880)

When running the SPIR-V Backend with optimization levels higher than 0, we observe GEP operators as a new factor, used extensively to convey the semantics of the original LLVM IR. Previously, an issue related to the GEP operator was reported and fixed on the consumer side of toolchains (see, for example, Khronos Translator issue https://github.com/KhronosGroup/SPIRV-LLVM-Translator/issues/2486 and PR https://github.com/KhronosGroup/SPIRV-LLVM-Translator/pull/2487). However, there is also a case where GenCode creates G_PTR_ADD to convey the original semantics at optimization levels higher than 0, and there it is the SPIR-V Backend itself that fails to translate the source LLVM IR correctly. Consider the following reproducer:

```
%struct = type { i32, [257 x i8], [257 x i8], [129 x i8], i32, i64, i64, i64, i64, i64, i64 }
@Mem = linkonce_odr dso_local addrspace(1) global %struct zeroinitializer, align 8

define weak dso_local spir_func void @__devicelib_assert_fail(ptr addrspace(4) noundef %expr, i32 noundef %line, i1 %fl) {
entry:
  %cmp = icmp eq i32 %line, 0
  br i1 %cmp, label %lbl, label %exit

lbl:
  store i32 %line, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @Mem, i64 648), align 8
  br i1 %fl, label %lbl, label %exit

exit:
  ret void
}
```

converted to the following machine instructions by the SPIR-V Backend:

```
%4:type(s64) = OpTypeInt 32, 0
%22:type(s64) = OpTypePointer 5, %4:type(s64)
%2:type(s64) = OpTypeInt 8, 0
%28:type(s64) = OpTypePointer 5, %2:type(s64)
%10:pid(p1) = G_GLOBAL_VALUE @Mem
%36:type(s64) = OpTypeStruct %4:type(s64), %32:type(s64), %32:type(s64), %34:type(s64), %4:type(s64), %35:type(s64), %35:type(s64), %35:type(s64), %35:type(s64), %35:type(s64), %35:type(s64)
%37:iid(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.spv.const.composite)
%8:iid(s32) = ASSIGN_TYPE %37:iid(s32), %36:type(s64)
G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.spv.init.global), %10:pid(p1), %8:iid(s32)
%29:pid(p1) = nuw G_PTR_ADD %10:pid, %16:iid(s64)
%15:pid(p1) = nuw ASSIGN_TYPE %29:pid(p1), %28:type(s64)
%27:pid(p2) = G_BITCAST %15:pid(p1)
%17:pid(p2) = ASSIGN_TYPE %27:pid(p2), %22:type(s64)
G_STORE %1:iid(s32), %17:pid(p2) :: (store (s32) into %ir.3, align 8, addrspace 1)
```

At the next stage, instruction selection would interpret this `G_PTR_ADD`-related pattern as the initialization of a global variable and convert it to an invalid constant GEP pattern that would, in turn, fail LLVM verification during back-translation from SPIR-V to LLVM IR. This PR fixes the problem by adding one more case of `G_PTR_ADD` translation, which uses a non-constant GEP to convey the meaning. The reproducer is attached as a new test case.
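To make the intended lowering concrete, here is a sketch of the SPIR-V sequence the new non-constant-GEP path is expected to produce for the store above (result names are illustrative; the authoritative pattern is in the CHECK lines of the attached test):

```
%casted = OpBitcast %ptr_char %Mem                        ; retype the base pointer to match the result
%addr = OpInBoundsPtrAccessChain %ptr_char %casted %c648  ; non-constant GEP with byte offset 648
%dst = OpBitcast %ptr_int %addr                           ; cast to the pointer type expected by the store
OpStore %dst %line
```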
--- .../Target/SPIRV/SPIRVDuplicatesTracker.cpp | 14 ++++ .../Target/SPIRV/SPIRVInstructionSelector.cpp | 67 +++++++++++++++++-- .../lib/Target/SPIRV/SPIRVSymbolicOperands.td | 1 + .../CodeGen/SPIRV/opt-gepoperator-of-gvar.ll | 64 ++++++++++++++++++ 4 files changed, 141 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp index 7c32bb1968ef5..832ca0ba5a82d 100644 --- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp @@ -13,6 +13,8 @@ #include "SPIRVDuplicatesTracker.h" +#define DEBUG_TYPE "build-dep-graph" + using namespace llvm; template @@ -63,6 +65,18 @@ void SPIRVGeneralDuplicatesTracker::buildDepsGraph( if (MI->getOpcode() == SPIRV::OpConstantFunctionPointerINTEL && i == 2) continue; MachineOperand *RegOp = &VRegDef->getOperand(0); + LLVM_DEBUG({ + if (Reg2Entry.count(RegOp) == 0 && + (MI->getOpcode() != SPIRV::OpVariable || i != 3)) { + dbgs() << "Unexpected pattern while building a dependency " + "graph.\nInstruction: "; + MI->print(dbgs()); + dbgs() << "Operand: "; + Op.print(dbgs()); + dbgs() << "\nOperand definition: "; + VRegDef->print(dbgs()); + } + }); assert((MI->getOpcode() == SPIRV::OpVariable && i == 3) || Reg2Entry.count(RegOp)); if (Reg2Entry.count(RegOp)) diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 1e861da35aaac..831d7f76ac14c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -607,10 +607,7 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, case TargetOpcode::G_ADDRSPACE_CAST: return selectAddrSpaceCast(ResVReg, ResType, I); case TargetOpcode::G_PTR_ADD: { - // Currently, we get G_PTR_ADD only as a result of translating - // global variables, initialized with constant expressions like GV + Const - // (see test opencl/basic/progvar_prog_scope_init.ll). - // TODO: extend the handler once we have other cases. + // Currently, we get G_PTR_ADD only applied to global variables. assert(I.getOperand(1).isReg() && I.getOperand(2).isReg()); Register GV = I.getOperand(1).getReg(); MachineRegisterInfo::def_instr_iterator II = MRI->def_instr_begin(GV); @@ -619,8 +616,68 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, (*II).getOpcode() == TargetOpcode::COPY || (*II).getOpcode() == SPIRV::OpVariable) && isImm(I.getOperand(2), MRI)); - Register Idx = buildZerosVal(GR.getOrCreateSPIRVIntegerType(32, I, TII), I); + // It may be the initialization of a global variable. + bool IsGVInit = false; + for (MachineRegisterInfo::use_instr_iterator + UseIt = MRI->use_instr_begin(I.getOperand(0).getReg()), + UseEnd = MRI->use_instr_end(); + UseIt != UseEnd; UseIt = std::next(UseIt)) { + if ((*UseIt).getOpcode() == TargetOpcode::G_GLOBAL_VALUE || + (*UseIt).getOpcode() == SPIRV::OpVariable) { + IsGVInit = true; + break; + } + } MachineBasicBlock &BB = *I.getParent(); + if (!IsGVInit) { + SPIRVType *GVType = GR.getSPIRVTypeForVReg(GV); + SPIRVType *GVPointeeType = GR.getPointeeType(GVType); + SPIRVType *ResPointeeType = GR.getPointeeType(ResType); + if (GVPointeeType && ResPointeeType && GVPointeeType != ResPointeeType) { + // Build a new virtual register that is associated with the required + // data type. 
+ Register NewVReg = MRI->createGenericVirtualRegister(MRI->getType(GV)); + MRI->setRegClass(NewVReg, MRI->getRegClass(GV)); + // Having a correctly typed base we are ready to build the actually + // required GEP. It may not be a constant though, because all Operands + // of OpSpecConstantOp is to originate from other const instructions, + // and only the AccessChain named opcodes accept a global OpVariable + // instruction. We can't use an AccessChain opcode because of the type + // mismatch between result and base types. + if (!GR.isBitcastCompatible(ResType, GVType)) + report_fatal_error( + "incompatible result and operand types in a bitcast"); + Register ResTypeReg = GR.getSPIRVTypeID(ResType); + MachineInstrBuilder MIB = + BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpBitcast)) + .addDef(NewVReg) + .addUse(ResTypeReg) + .addUse(GV); + return MIB.constrainAllUses(TII, TRI, RBI) && + BuildMI(BB, I, I.getDebugLoc(), + TII.get(STI.isVulkanEnv() + ? SPIRV::OpInBoundsAccessChain + : SPIRV::OpInBoundsPtrAccessChain)) + .addDef(ResVReg) + .addUse(ResTypeReg) + .addUse(NewVReg) + .addUse(I.getOperand(2).getReg()) + .constrainAllUses(TII, TRI, RBI); + } else { + return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSpecConstantOp)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addImm( + static_cast(SPIRV::Opcode::InBoundsPtrAccessChain)) + .addUse(GV) + .addUse(I.getOperand(2).getReg()) + .constrainAllUses(TII, TRI, RBI); + } + } + // It's possible to translate G_PTR_ADD to OpSpecConstantOp: either to + // initialize a global variable with a constant expression (e.g., the test + // case opencl/basic/progvar_prog_scope_init.ll), or for another use case + Register Idx = buildZerosVal(GR.getOrCreateSPIRVIntegerType(32, I, TII), I); auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSpecConstantOp)) .addDef(ResVReg) .addUse(GR.getSPIRVTypeID(ResType)) diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 96601dd8796c6..23cd32eff45d5 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -1628,6 +1628,7 @@ multiclass OpcodeOperand value> { defm : SymbolicOperandWithRequirements; } // TODO: implement other mnemonics. 
+defm InBoundsAccessChain : OpcodeOperand<66>; defm InBoundsPtrAccessChain : OpcodeOperand<70>; defm PtrCastToGeneric : OpcodeOperand<121>; defm Bitcast : OpcodeOperand<124>; diff --git a/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll b/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll new file mode 100644 index 0000000000000..5f9229f5a5bd6 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll @@ -0,0 +1,64 @@ +; RUN: llc -O2 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O2 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; RUN: llc -O2 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O2 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: %[[#Char:]] = OpTypeInt 8 0 +; CHECK-DAG: %[[#PtrChar:]] = OpTypePointer CrossWorkgroup %[[#Char]] +; CHECK-DAG: %[[#Int:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#PtrInt:]] = OpTypePointer CrossWorkgroup %[[#Int]] +; CHECK-DAG: %[[#C648:]] = OpConstant %[[#]] 648 +; CHECK-DAG: %[[#Struct:]] = OpTypeStruct %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] +; CHECK-DAG: %[[#VarInit:]] = OpConstantNull %[[#Struct]] +; CHECK-DAG: %[[#PtrStruct:]] = OpTypePointer CrossWorkgroup %[[#Struct]] +; CHECK-DAG: %[[#Var:]] = OpVariable %[[#PtrStruct]] CrossWorkgroup %[[#VarInit]] +; CHECK-DAG: %[[#Bytes:]] = OpVariable %[[#PtrChar]] CrossWorkgroup %[[#]] +; CHECK-DAG: %[[#BytesGEP:]] = OpSpecConstantOp %[[#PtrChar]] 70 %[[#Bytes]] %[[#C648]] + +; CHECK: OpFunction +; CHECK: %[[#]] = OpFunctionParameter %[[#]] +; CHECK: %[[#Line:]] = OpFunctionParameter %[[#Int]] +; CHECK: %[[#]] = OpFunctionParameter %[[#]] +; CHECK: %[[#Casted:]] = OpBitcast %[[#PtrChar]] %[[#Var]] +; CHECK: %[[#AddrChar:]] = OpInBoundsPtrAccessChain %[[#PtrChar]] %[[#Casted]] %[[#C648]] +; CHECK: %[[#AddrInt:]] = OpBitcast %[[#PtrInt]] %[[#AddrChar]] +; CHECK: OpStore %[[#AddrInt]] %[[#Line]] + +%struct = type { i32, [257 x i8], [257 x i8], [129 x i8], i32, i64, i64, i64, i64, i64, i64 } +@Mem = linkonce_odr dso_local addrspace(1) global %struct zeroinitializer, align 8 + +define weak dso_local spir_func void @foo(ptr addrspace(4) noundef %expr, i32 noundef %line, i1 %fl) { +entry: + %cmp = icmp eq i32 %line, 0 + br i1 %cmp, label %lbl, label %exit + +lbl: + store i32 %line, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @Mem, i64 648), align 8 + br i1 %fl, label %lbl, label %exit + +exit: + ret void +} + +; CHECK: OpFunction +; CHECK: %[[#]] = OpFunctionParameter %[[#]] +; CHECK: %[[#Line2:]] = OpFunctionParameter %[[#Int]] +; CHECK: %[[#]] = OpFunctionParameter %[[#]] +; CHECK: %[[#AddrInt2:]] = OpBitcast %[[#PtrInt]] %[[#BytesGEP]] +; CHECK: OpStore %[[#AddrInt2]] %[[#Line2]] + +@Bytes = linkonce_odr dso_local addrspace(1) global i8 zeroinitializer, align 8 + +define weak dso_local spir_func void @bar(ptr addrspace(4) noundef %expr, i32 noundef %line, i1 %fl) { +entry: + %cmp = icmp eq i32 %line, 0 + br i1 %cmp, label %lbl, label %exit + +lbl: + store i32 %line, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @Bytes, i64 648), align 8 + br i1 %fl, label %lbl, label %exit + +exit: + ret void +} From 1b0400eed8613108d9f293b9ddd3380e3241ac60 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 11 Sep 2024 13:07:47 +0100 Subject: [PATCH 089/114] [X86] combineSubABS - handle NEG(ABD()) expanded patterns combineSubABS already handles the "(sub Y, cmovns X, -X) -> (add Y, cmovns -X, X)" fold 
by flipping the cmov operands. We can do something similar for the negation of ABDS/U patterns which have been expanded to a CMOVL/CMOVB with a pair of commuted subtractions: "NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y))" --- llvm/lib/Target/X86/X86ISelLowering.cpp | 56 +++++++++++------- llvm/test/CodeGen/X86/abds-neg.ll | 75 +++++++++---------------- llvm/test/CodeGen/X86/abdu-neg.ll | 57 +++++++------------ 3 files changed, 80 insertions(+), 108 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index a1d466eee691c..d0794cb9bfde3 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -56096,34 +56096,50 @@ static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) { if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse()) return SDValue(); - X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2); - if (CC != X86::COND_S && CC != X86::COND_NS) - return SDValue(); - - // Condition should come from a negate operation. SDValue Cond = N1.getOperand(3); - if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0))) + if (Cond.getOpcode() != X86ISD::SUB) return SDValue(); assert(Cond.getResNo() == 1 && "Unexpected result number"); - // Get the X and -X from the negate. - SDValue NegX = Cond.getValue(0); - SDValue X = Cond.getOperand(1); - SDValue FalseOp = N1.getOperand(0); SDValue TrueOp = N1.getOperand(1); + X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2); + MVT VT = N->getSimpleValueType(0); + SDLoc DL(N); - // Cmov operands should be X and NegX. Order doesn't matter. - if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X)) - return SDValue(); + // ABS condition should come from a negate operation. + if ((CC == X86::COND_S || CC == X86::COND_NS) && + isNullConstant(Cond.getOperand(0))) { + // Get the X and -X from the negate. + SDValue NegX = Cond.getValue(0); + SDValue X = Cond.getOperand(1); - // Build a new CMOV with the operands swapped. - SDLoc DL(N); - MVT VT = N->getSimpleValueType(0); - SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, - N1.getOperand(2), Cond); - // Convert sub to add. - return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov); + // Cmov operands should be X and NegX. Order doesn't matter. + if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X)) + return SDValue(); + + // Build a new CMOV with the operands swapped. + SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, + N1.getOperand(2), Cond); + // Convert sub to add. + return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov); + } + + // Handle ABD special case: + // NEG(ABD(X,Y)) -> NEG(CMOV(SUB(X,Y),SUB(Y,X))) -> CMOV(SUB(Y,X),SUB(X,Y)). + // ABD condition should come from a pair of matching subtracts. + if ((CC == X86::COND_L || CC == X86::COND_B) && isNullConstant(N0) && + (FalseOp == Cond.getValue(0) || TrueOp == Cond.getValue(0)) && + (TrueOp.getOpcode() == ISD::SUB || TrueOp.getOpcode() == X86ISD::SUB) && + (FalseOp.getOpcode() == ISD::SUB || FalseOp.getOpcode() == X86ISD::SUB) && + (TrueOp.getOperand(0) == FalseOp.getOperand(1)) && + (TrueOp.getOperand(1) == FalseOp.getOperand(0))) { + // Build a new CMOV with the operands swapped. 
+ return DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, N1.getOperand(2), + Cond); + } + + return SDValue(); } static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) { diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll index b9b3436dd1ed9..6e22d855dc831 100644 --- a/llvm/test/CodeGen/X86/abds-neg.ll +++ b/llvm/test/CodeGen/X86/abds-neg.ll @@ -112,8 +112,7 @@ define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -144,8 +143,7 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -176,8 +174,7 @@ define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -208,8 +205,7 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32: @@ -217,8 +213,7 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovgel %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovll %esi, %eax ; X64-NEXT: retq %aext = sext i32 %a to i64 %bext = sext i32 %b to i64 @@ -237,8 +232,7 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32_i16: @@ -247,8 +241,7 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { ; X64-NEXT: movl %edi, %ecx ; X64-NEXT: subl %eax, %ecx ; X64-NEXT: subl %edi, %eax -; X64-NEXT: cmovll %ecx, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovgel %ecx, %eax ; X64-NEXT: retq %aext = sext i32 %a to i64 %bext = sext i16 %b to i64 @@ -267,8 +260,7 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32_undef: @@ -276,8 +268,7 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovgel %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovll %esi, %eax ; X64-NEXT: retq %aext = sext i32 %a to i64 %bext = sext i32 %b to i64 @@ -319,8 +310,7 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovgeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovlq %rsi, %rax ; X64-NEXT: retq %aext = sext i64 %a to i128 %bext = sext i64 %b to i128 @@ -362,8 +352,7 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind { ; X64-NEXT: 
movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovgeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovlq %rsi, %rax ; X64-NEXT: retq %aext = sext i64 %a to i128 %bext = sext i64 %b to i128 @@ -558,8 +547,7 @@ define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -587,8 +575,7 @@ define i32 @abd_minmax_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_minmax_i32: @@ -596,8 +583,7 @@ define i32 @abd_minmax_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovgel %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovll %esi, %eax ; X64-NEXT: retq %min = call i32 @llvm.smin.i32(i32 %a, i32 %b) %max = call i32 @llvm.smax.i32(i32 %a, i32 %b) @@ -641,8 +627,7 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovgeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovlq %rsi, %rax ; X64-NEXT: retq %min = call i64 @llvm.smin.i64(i64 %a, i64 %b) %max = call i64 @llvm.smax.i64(i64 %a, i64 %b) @@ -776,8 +761,7 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -806,8 +790,7 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_cmp_i32: @@ -815,8 +798,7 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovgel %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovll %esi, %eax ; X64-NEXT: retq %cmp = icmp sge i32 %a, %b %ab = sub i32 %a, %b @@ -853,8 +835,7 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovgeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovlq %rsi, %rax ; X64-NEXT: retq %cmp = icmp slt i64 %a, %b %ab = sub i64 %a, %b @@ -1031,8 +1012,7 @@ define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_subnsw_i32: @@ -1040,8 +1020,7 @@ define i32 @abd_subnsw_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovgel %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovll %esi, %eax ; X64-NEXT: retq %sub = sub nsw i32 %a, %b %abs = call i32 @llvm.abs.i32(i32 %sub, i1 false) @@ -1057,8 +1036,7 @@ define i32 @abd_subnsw_i32_undef(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax 
-; X86-NEXT: cmovll %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovgel %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_subnsw_i32_undef: @@ -1066,8 +1044,7 @@ define i32 @abd_subnsw_i32_undef(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovgel %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovll %esi, %eax ; X64-NEXT: retq %sub = sub nsw i32 %a, %b %abs = call i32 @llvm.abs.i32(i32 %sub, i1 true) @@ -1098,8 +1075,7 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovgeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovlq %rsi, %rax ; X64-NEXT: retq %sub = sub nsw i64 %a, %b %abs = call i64 @llvm.abs.i64(i64 %sub, i1 false) @@ -1130,8 +1106,7 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovgeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovlq %rsi, %rax ; X64-NEXT: retq %sub = sub nsw i64 %a, %b %abs = call i64 @llvm.abs.i64(i64 %sub, i1 true) diff --git a/llvm/test/CodeGen/X86/abdu-neg.ll b/llvm/test/CodeGen/X86/abdu-neg.ll index 1ded7e79e2510..6bda99c89a37e 100644 --- a/llvm/test/CodeGen/X86/abdu-neg.ll +++ b/llvm/test/CodeGen/X86/abdu-neg.ll @@ -112,8 +112,7 @@ define i16 @abd_ext_i16(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -144,8 +143,7 @@ define i16 @abd_ext_i16_i32(i16 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -176,8 +174,7 @@ define i16 @abd_ext_i16_undef(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -208,8 +205,7 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32: @@ -217,8 +213,7 @@ define i32 @abd_ext_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovael %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovbl %esi, %eax ; X64-NEXT: retq %aext = zext i32 %a to i64 %bext = zext i32 %b to i64 @@ -237,8 +232,7 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32_i16: @@ -247,8 +241,7 @@ define i32 @abd_ext_i32_i16(i32 %a, i16 %b) nounwind { ; X64-NEXT: movl %edi, %ecx ; X64-NEXT: subl %eax, %ecx ; X64-NEXT: subl %edi, %eax -; X64-NEXT: cmovbl %ecx, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovael %ecx, %eax ; X64-NEXT: retq %aext = zext i32 %a to i64 %bext = zext i16 %b to i64 @@ -267,8 +260,7 @@ define i32 
@abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_ext_i32_undef: @@ -276,8 +268,7 @@ define i32 @abd_ext_i32_undef(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovael %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovbl %esi, %eax ; X64-NEXT: retq %aext = zext i32 %a to i64 %bext = zext i32 %b to i64 @@ -313,8 +304,7 @@ define i64 @abd_ext_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovaeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovbq %rsi, %rax ; X64-NEXT: retq %aext = zext i64 %a to i128 %bext = zext i64 %b to i128 @@ -350,8 +340,7 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovaeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovbq %rsi, %rax ; X64-NEXT: retq %aext = zext i64 %a to i128 %bext = zext i64 %b to i128 @@ -540,8 +529,7 @@ define i16 @abd_minmax_i16(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -569,8 +557,7 @@ define i32 @abd_minmax_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_minmax_i32: @@ -578,8 +565,7 @@ define i32 @abd_minmax_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovael %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovbl %esi, %eax ; X64-NEXT: retq %min = call i32 @llvm.umin.i32(i32 %a, i32 %b) %max = call i32 @llvm.umax.i32(i32 %a, i32 %b) @@ -623,8 +609,7 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq %rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovaeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovbq %rsi, %rax ; X64-NEXT: retq %min = call i64 @llvm.umin.i64(i64 %a, i64 %b) %max = call i64 @llvm.umax.i64(i64 %a, i64 %b) @@ -758,8 +743,7 @@ define i16 @abd_cmp_i16(i16 %a, i16 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -788,8 +772,7 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: subl %eax, %edx ; X86-NEXT: subl %ecx, %eax -; X86-NEXT: cmovbl %edx, %eax -; X86-NEXT: negl %eax +; X86-NEXT: cmovael %edx, %eax ; X86-NEXT: retl ; ; X64-LABEL: abd_cmp_i32: @@ -797,8 +780,7 @@ define i32 @abd_cmp_i32(i32 %a, i32 %b) nounwind { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: subl %esi, %eax ; X64-NEXT: subl %edi, %esi -; X64-NEXT: cmovael %esi, %eax -; X64-NEXT: negl %eax +; X64-NEXT: cmovbl %esi, %eax ; X64-NEXT: retq %cmp = icmp uge i32 %a, %b %ab = sub i32 %a, %b @@ -832,8 +814,7 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: subq 
%rsi, %rax ; X64-NEXT: subq %rdi, %rsi -; X64-NEXT: cmovaeq %rsi, %rax -; X64-NEXT: negq %rax +; X64-NEXT: cmovbq %rsi, %rax ; X64-NEXT: retq %cmp = icmp ult i64 %a, %b %ab = sub i64 %a, %b From b9c2e2e3e910f8283f52c574fd8b6a7981d6cb0d Mon Sep 17 00:00:00 2001 From: Mikhail Goncharov Date: Wed, 11 Sep 2024 14:18:54 +0200 Subject: [PATCH 090/114] [bazel] port 2f3d061918ece414d6db544a34b2e44a9950bc23 --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index b43bdb7b5f471..c931898ed98e3 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -10516,6 +10516,10 @@ gentbl_cc_library( ["-gen-op-doc"], "g3doc/Dialects/OpenMP/OpenMPOps.md", ), + ( + ["-gen-openmp-clause-ops"], + "include/mlir/Dialect/OpenMP/OpenMPClauseOps.h.inc", + ), ], tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/OpenMP/OpenMPOps.td", From 80fcab8c26129a98f01ce4f8d9cc90f3653bf693 Mon Sep 17 00:00:00 2001 From: MichelleCDjunaidi Date: Wed, 11 Sep 2024 22:32:01 +1000 Subject: [PATCH 091/114] [Docs][clang-query] disclose Windows linetab bug on clang-query tab auto-complete (#107956) As per https://github.com/llvm/llvm-project/pull/106672/#issuecomment-2325577815 and https://github.com/llvm/llvm-project/issues/107377, the documentation should be updated to note that the current bug on Windows involving ``LineEditor`` causing Tab key related features to not work. Fixes #107377 --- .../docs/clang-tidy/Contributing.rst | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/clang-tools-extra/docs/clang-tidy/Contributing.rst b/clang-tools-extra/docs/clang-tidy/Contributing.rst index d5303418b859b..ff8b05ff263c1 100644 --- a/clang-tools-extra/docs/clang-tidy/Contributing.rst +++ b/clang-tools-extra/docs/clang-tidy/Contributing.rst @@ -344,18 +344,20 @@ matching expressions to simplify your matcher. clang-query> let c1 cxxRecordDecl() clang-query> match c1 -Alternatively, pressing the tab key after a previous matcher's open parentheses would also -show which matchers can be chained with the previous matcher, though some matchers that work -may not be listed. - -Just like breaking up a huge function into smaller chunks with intention-revealing names -can help you understand a complex algorithm, breaking up a matcher into smaller matchers -with intention-revealing names can help you understand a complicated matcher. - -Once you have a working clang-query matcher, the C++ API matchers will be the same or similar -to your interactively constructed matcher (there can be cases where they differ slightly). -You can use local variables to preserve your intention-revealing names that you applied -to nested matchers. +Alternatively, pressing the tab key after a previous matcher's open parentheses +would also show which matchers can be chained with the previous matcher, +though some matchers that work may not be listed. Note that tab completion +does not currently work on Windows. + +Just like breaking up a huge function into smaller chunks with +intention-revealing names can help you understand a complex algorithm, breaking +up a matcher into smaller matchers with intention-revealing names can help +you understand a complicated matcher. 
+ +Once you have a working :program:`clang-query` matcher, the C++ API matchers +will be the same or similar to your interactively constructed matcher (there +can be cases where they differ slightly). You can use local variables to preserve +your intention-revealing names that you applied to nested matchers. Creating private matchers ^^^^^^^^^^^^^^^^^^^^^^^^^ From 49b57df16144450331b4f90332d6918168b2a306 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 11 Sep 2024 14:38:56 +0200 Subject: [PATCH 092/114] DXIL: Use correct type ID when writing ValueAsMetadata. (#94337) When emitting references to functions as part of `ValueAsMetadata`, we currently emit the incorrect (typed) pointer, resulting in crashes during deserialization. Avoid this by correctly mapping the type during serialization. --- .../Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp | 2 +- llvm/test/tools/dxil-dis/metadata.ll | 10 +++++++++- llvm/tools/dxil-dis/CMakeLists.txt | 4 +++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp index cd0d6d34e9a67..45aadac861946 100644 --- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp +++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp @@ -1345,7 +1345,7 @@ void DXILBitcodeWriter::writeValueAsMetadata( Ty = TypedPointerType::get(F->getFunctionType(), F->getAddressSpace()); else if (GlobalVariable *GV = dyn_cast(V)) Ty = TypedPointerType::get(GV->getValueType(), GV->getAddressSpace()); - Record.push_back(getTypeID(Ty)); + Record.push_back(getTypeID(Ty, V)); Record.push_back(VE.getValueID(V)); Stream.EmitRecord(bitc::METADATA_VALUE, Record, 0); Record.clear(); diff --git a/llvm/test/tools/dxil-dis/metadata.ll b/llvm/test/tools/dxil-dis/metadata.ll index 758860a2deb8f..18f2530ab8fc2 100644 --- a/llvm/test/tools/dxil-dis/metadata.ll +++ b/llvm/test/tools/dxil-dis/metadata.ll @@ -1,13 +1,21 @@ -; RUN: llc --filetype=obj %s -o - | dxil-dis +; RUN: llc --filetype=obj %s -o - | dxil-dis target triple = "dxil-unknown-shadermodel6.7-library" +define void @kernel(ptr addrspace(1)) { + ret void +} + !llvm.foo = !{!0} !llvm.bar = !{!1} +!llvm.baz = !{!2} !0 = !{i32 42} !1 = !{!"Some MDString"} +!2 = !{ptr @kernel} ; CHECK: !llvm.foo = !{!0} ; CHECK: !llvm.bar = !{!1} +; CHECK: !llvm.baz = !{!2} ; CHECK: !0 = !{i32 42} ; CHECK: !1 = !{!"Some MDString"} +; CHECK: !2 = !{void (i8 addrspace(1)*)* @kernel} diff --git a/llvm/tools/dxil-dis/CMakeLists.txt b/llvm/tools/dxil-dis/CMakeLists.txt index 9addf108a8614..d0541fcf802e9 100644 --- a/llvm/tools/dxil-dis/CMakeLists.txt +++ b/llvm/tools/dxil-dis/CMakeLists.txt @@ -25,7 +25,9 @@ include(ExternalProject) set(SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/DXC-src) set(BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/DXC-bins) -set(GIT_SETTINGS GIT_REPOSITORY https://github.com/microsoft/DirectXShaderCompiler.git) +set(GIT_SETTINGS + GIT_REPOSITORY https://github.com/microsoft/DirectXShaderCompiler.git + GIT_TAG main) if (DXC_SOURCE_DIR) set(SOURCE_DIR ${DXC_SOURCE_DIR}) From 99a235499337aff27d087c31916d6785d2e9a263 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Wed, 11 Sep 2024 14:46:40 +0200 Subject: [PATCH 093/114] [LLD][COFF] Add support for ARM64EC import call thunks. (#107931) These thunks can be accessed using `__impchk_*` symbols, though they are typically not called directly. Instead, they are used to populate the auxiliary IAT. 
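For orientation, each `__impchk_*` thunk is a five-instruction sequence built from the `importThunkARM64EC` template added in this patch. Schematically (symbol names are illustrative; the linker fills in the actual addresses), it looks like this:

```
__impchk_func:
    adrp x11, __imp_func                 // page address of the IAT entry
    ldr  x11, [x11, :lo12:__imp_func]    // load the imported function pointer
    adrp x10, func_exit_thunk            // exit thunk address (0 if there is no exit thunk)
    add  x10, x10, :lo12:func_exit_thunk
    b    __icall_helper_arm64ec          // dispatch through the EC indirect-call helper
```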
When the imported function is x86_64 (or an ARM64EC function with a patched export thunk), the thunk is used to call it. Otherwise, the OS may replace the thunk at runtime with a direct pointer to the ARM64EC function to avoid the overhead. --- lld/COFF/Chunks.cpp | 22 ++++++ lld/COFF/Chunks.h | 24 +++++++ lld/COFF/Config.h | 1 + lld/COFF/Driver.cpp | 7 +- lld/COFF/Driver.h | 2 + lld/COFF/InputFiles.cpp | 7 ++ lld/COFF/InputFiles.h | 2 + lld/COFF/MarkLive.cpp | 22 ++++-- lld/COFF/SymbolTable.cpp | 16 ++++- lld/COFF/SymbolTable.h | 4 +- lld/COFF/Writer.cpp | 2 + lld/test/COFF/Inputs/loadconfig-arm64ec.s | 2 + lld/test/COFF/arm64ec-import.test | 84 ++++++++++++++++++++--- 13 files changed, 176 insertions(+), 19 deletions(-) diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp index 060eb6c32004d..0f33885f7df37 100644 --- a/lld/COFF/Chunks.cpp +++ b/lld/COFF/Chunks.cpp @@ -1093,4 +1093,26 @@ void CHPERedirectionChunk::writeTo(uint8_t *buf) const { } } +ImportThunkChunkARM64EC::ImportThunkChunkARM64EC(ImportFile *file) + : ImportThunkChunk(file->ctx, file->impSym), file(file) {} + +void ImportThunkChunkARM64EC::writeTo(uint8_t *buf) const { + memcpy(buf, importThunkARM64EC, sizeof(importThunkARM64EC)); + applyArm64Addr(buf, file->impSym->getRVA(), rva, 12); + applyArm64Ldr(buf + 4, file->impSym->getRVA() & 0xfff); + + // The exit thunk may be missing. This can happen if the application only + // references a function by its address (in which case the thunk is never + // actually used, but is still required to fill the auxiliary IAT), or in + // cases of hand-written assembly calling an imported ARM64EC function (where + // the exit thunk is ignored by __icall_helper_arm64ec). In such cases, MSVC + // link.exe uses 0 as the RVA. + uint32_t exitThunkRVA = exitThunk ? exitThunk->getRVA() : 0; + applyArm64Addr(buf + 8, exitThunkRVA, rva + 8, 12); + applyArm64Imm(buf + 12, exitThunkRVA & 0xfff, 0); + + Defined *helper = cast(file->ctx.config.arm64ECIcallHelper); + applyArm64Branch26(buf + 16, helper->getRVA() - rva - 16); +} + } // namespace lld::coff diff --git a/lld/COFF/Chunks.h b/lld/COFF/Chunks.h index 30e5b538c352e..28e0fd68ac515 100644 --- a/lld/COFF/Chunks.h +++ b/lld/COFF/Chunks.h @@ -544,6 +544,14 @@ static const uint8_t importThunkARM64[] = { 0x00, 0x02, 0x1f, 0xd6, // br x16 }; +static const uint32_t importThunkARM64EC[] = { + 0x9000000b, // adrp x11, 0x0 + 0xf940016b, // ldr x11, [x11] + 0x9000000a, // adrp x10, 0x0 + 0x9100014a, // add x10, x10, #0x0 + 0x14000000 // b 0x0 +}; + // Windows-specific. // A chunk for DLL import jump table entry. In a final output, its // contents will be a JMP instruction to some __imp_ symbol. @@ -599,6 +607,22 @@ class ImportThunkChunkARM64 : public ImportThunkChunk { MachineTypes getMachine() const override { return ARM64; } }; +// ARM64EC __impchk_* thunk implementation. +// Performs an indirect call to an imported function pointer +// using the __icall_helper_arm64ec helper function. 
+class ImportThunkChunkARM64EC : public ImportThunkChunk { +public: + explicit ImportThunkChunkARM64EC(ImportFile *file); + size_t getSize() const override { return sizeof(importThunkARM64EC); }; + MachineTypes getMachine() const override { return ARM64EC; } + void writeTo(uint8_t *buf) const override; + + Defined *exitThunk; + +private: + ImportFile *file; +}; + class RangeExtensionThunkARM : public NonSectionCodeChunk { public: explicit RangeExtensionThunkARM(COFFLinkerContext &ctx, Defined *t) diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h index 947f3fead54e0..738776a971ea3 100644 --- a/lld/COFF/Config.h +++ b/lld/COFF/Config.h @@ -164,6 +164,7 @@ struct Configuration { std::set delayLoads; std::map dllOrder; Symbol *delayLoadHelper = nullptr; + Symbol *arm64ECIcallHelper = nullptr; bool saveTemps = false; diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 3ef9fa3f65c6a..a1fe6444991a3 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -1383,6 +1383,11 @@ void LinkerDriver::createECExportThunks() { } } +void LinkerDriver::pullArm64ECIcallHelper() { + if (!ctx.config.arm64ECIcallHelper) + ctx.config.arm64ECIcallHelper = addUndefined("__icall_helper_arm64ec"); +} + // In MinGW, if no symbols are chosen to be exported, then all symbols are // automatically exported by default. This behavior can be forced by the // -export-all-symbols option, so that it happens even when exports are @@ -2685,7 +2690,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { if (auto *arg = args.getLastArg(OPT_print_symbol_order)) config->printSymbolOrder = arg->getValue(); - ctx.symtab.initializeEntryThunks(); + ctx.symtab.initializeECThunks(); // Identify unreferenced COMDAT sections. if (config->doGC) { diff --git a/lld/COFF/Driver.h b/lld/COFF/Driver.h index b5cf8e2f18fd4..0c195a7cc3148 100644 --- a/lld/COFF/Driver.h +++ b/lld/COFF/Driver.h @@ -101,6 +101,8 @@ class LinkerDriver { std::unique_ptr tar; // for /linkrepro + void pullArm64ECIcallHelper(); + private: // Searches a file from search paths. std::optional findFileIfNew(StringRef filename); diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index c7956baf73cf4..3dbdf8fe3920d 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -190,6 +190,8 @@ void ObjFile::initializeECThunks() { ctx.symtab.addEntryThunk(getSymbol(entry->src), getSymbol(entry->dst)); break; case Arm64ECThunkType::Exit: + ctx.symtab.addExitThunk(getSymbol(entry->src), getSymbol(entry->dst)); + break; case Arm64ECThunkType::GuestExit: break; default: @@ -1088,6 +1090,11 @@ void ImportFile::parse() { thunkSym = ctx.symtab.addImportThunk( name, impSym, make(ctx, impSym)); // FIXME: Add aux IAT symbols. 
+ + StringRef impChkName = saver().save("__impchk_" + name); + impchkThunk = make(this); + ctx.symtab.addImportThunk(impChkName, impSym, impchkThunk); + ctx.driver.pullArm64ECIcallHelper(); } } } diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h index 1d55b4f34f754..3b837017e1c21 100644 --- a/lld/COFF/InputFiles.h +++ b/lld/COFF/InputFiles.h @@ -56,6 +56,7 @@ class DefinedImportData; class DefinedImportThunk; class DefinedRegular; class ImportThunkChunk; +class ImportThunkChunkARM64EC; class SectionChunk; class Symbol; class Undefined; @@ -349,6 +350,7 @@ class ImportFile : public InputFile { DefinedImportData *impSym = nullptr; Symbol *thunkSym = nullptr; + ImportThunkChunkARM64EC *impchkThunk = nullptr; std::string dllName; private: diff --git a/lld/COFF/MarkLive.cpp b/lld/COFF/MarkLive.cpp index 06079a98f2d00..8af58780e1358 100644 --- a/lld/COFF/MarkLive.cpp +++ b/lld/COFF/MarkLive.cpp @@ -43,13 +43,23 @@ void markLive(COFFLinkerContext &ctx) { worklist.push_back(c); }; - auto addSym = [&](Symbol *b) { - if (auto *sym = dyn_cast(b)) + std::function addSym; + + auto addImportFile = [&](ImportFile *file) { + file->live = true; + if (file->impchkThunk && file->impchkThunk->exitThunk) + addSym(file->impchkThunk->exitThunk); + }; + + addSym = [&](Symbol *b) { + if (auto *sym = dyn_cast(b)) { enqueue(sym->getChunk()); - else if (auto *sym = dyn_cast(b)) - sym->file->live = true; - else if (auto *sym = dyn_cast(b)) - sym->wrappedSym->file->live = sym->wrappedSym->file->thunkLive = true; + } else if (auto *sym = dyn_cast(b)) { + addImportFile(sym->file); + } else if (auto *sym = dyn_cast(b)) { + addImportFile(sym->wrappedSym->file); + sym->wrappedSym->file->thunkLive = true; + } }; // Add GC root chunks. diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index c9b3d78e3de17..a6575ecac3bb4 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -557,7 +557,11 @@ void SymbolTable::addEntryThunk(Symbol *from, Symbol *to) { entryThunks.push_back({from, to}); } -void SymbolTable::initializeEntryThunks() { +void SymbolTable::addExitThunk(Symbol *from, Symbol *to) { + exitThunks[from] = to; +} + +void SymbolTable::initializeECThunks() { for (auto it : entryThunks) { auto *to = dyn_cast(it.second); if (!to) @@ -573,6 +577,16 @@ void SymbolTable::initializeEntryThunks() { } from->getChunk()->setEntryThunk(to); } + + for (ImportFile *file : ctx.importFileInstances) { + if (!file->impchkThunk) + continue; + + Symbol *sym = exitThunks.lookup(file->thunkSym); + if (!sym) + sym = exitThunks.lookup(file->impSym); + file->impchkThunk->exitThunk = dyn_cast_or_null(sym); + } } Symbol *SymbolTable::addUndefined(StringRef name, InputFile *f, diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h index 3a277fc700e86..13e151e3a8c50 100644 --- a/lld/COFF/SymbolTable.h +++ b/lld/COFF/SymbolTable.h @@ -108,7 +108,8 @@ class SymbolTable { ImportThunkChunk *chunk); void addLibcall(StringRef name); void addEntryThunk(Symbol *from, Symbol *to); - void initializeEntryThunks(); + void addExitThunk(Symbol *from, Symbol *to); + void initializeECThunks(); void reportDuplicate(Symbol *existing, InputFile *newFile, SectionChunk *newSc = nullptr, @@ -141,6 +142,7 @@ class SymbolTable { std::unique_ptr lto; bool ltoCompilationDone = false; std::vector> entryThunks; + llvm::DenseMap exitThunks; COFFLinkerContext &ctx; }; diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index 3cb9b3b512ead..b589a16bca32a 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -1248,6 +1248,8 @@ 
void Writer::appendImportThunks() { DefinedImportThunk *thunk = cast(file->thunkSym); if (file->thunkLive) textSec->addChunk(thunk->getChunk()); + if (file->impchkThunk) + textSec->addChunk(file->impchkThunk); } if (!delayIdata.empty()) { diff --git a/lld/test/COFF/Inputs/loadconfig-arm64ec.s b/lld/test/COFF/Inputs/loadconfig-arm64ec.s index 78e7fba43a0a4..75dc6105301d0 100644 --- a/lld/test/COFF/Inputs/loadconfig-arm64ec.s +++ b/lld/test/COFF/Inputs/loadconfig-arm64ec.s @@ -30,6 +30,8 @@ __os_arm64x_dispatch_ret: .xword 0 __os_arm64x_check_call: .xword 0 + .globl __os_arm64x_dispatch_icall +__os_arm64x_dispatch_icall: __os_arm64x_check_icall: .xword 0 __os_arm64x_get_x64_information: diff --git a/lld/test/COFF/arm64ec-import.test b/lld/test/COFF/arm64ec-import.test index b1c47d785e445..44a84c09e11a3 100644 --- a/lld/test/COFF/arm64ec-import.test +++ b/lld/test/COFF/arm64ec-import.test @@ -2,17 +2,19 @@ REQUIRES: aarch64, x86 RUN: split-file %s %t.dir && cd %t.dir RUN: llvm-mc -filetype=obj -triple=arm64ec-windows test.s -o test.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows icall.s -o icall.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows hybmp.s -o hybmp.obj RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj RUN: llvm-lib -machine:arm64ec -def:test.def -out:test-arm64ec.lib RUN: llvm-lib -machine:arm64ec -def:test2.def -out:test2-arm64ec.lib RUN: llvm-lib -machine:x64 -def:test.def -out:test-x86_64.lib Link using ARM64EC import library: -RUN: lld-link -machine:arm64ec -dll -noentry -out:out.dll loadconfig-arm64ec.obj \ +RUN: lld-link -machine:arm64ec -dll -noentry -out:out.dll loadconfig-arm64ec.obj icall.obj hybmp.obj \ RUN: test.obj test-arm64ec.lib test2-arm64ec.lib Link using x86_64 import library: -RUN: lld-link -machine:arm64ec -dll -noentry -out:out2.dll loadconfig-arm64ec.obj \ +RUN: lld-link -machine:arm64ec -dll -noentry -out:out2.dll loadconfig-arm64ec.obj icall.obj hybmp.obj \ RUN: test.obj test-x86_64.lib test2-arm64ec.lib RUN: llvm-readobj --coff-imports out.dll | FileCheck --check-prefix=IMPORTS %s @@ -20,7 +22,7 @@ RUN: llvm-readobj --coff-imports out2.dll | FileCheck --check-prefix=IMPORTS %s IMPORTS: Import { IMPORTS-NEXT: Name: test.dll IMPORTS-NEXT: ImportLookupTableRVA: -IMPORTS-NEXT: ImportAddressTableRVA: 0x2000 +IMPORTS-NEXT: ImportAddressTableRVA: 0x3000 IMPORTS-NEXT: Symbol: data (0) IMPORTS-NEXT: Symbol: func (0) IMPORTS-NEXT: Symbol: func2 (0) @@ -28,24 +30,45 @@ IMPORTS-NEXT: } IMPORTS-NEXT: Import { IMPORTS-NEXT: Name: test2.dll IMPORTS-NEXT: ImportLookupTableRVA: -IMPORTS-NEXT: ImportAddressTableRVA: 0x2020 +IMPORTS-NEXT: ImportAddressTableRVA: 0x3020 IMPORTS-NEXT: Symbol: t2func (0) IMPORTS-NEXT: } RUN: llvm-objdump -d out.dll | FileCheck --check-prefix=DISASM %s RUN: llvm-objdump -d out2.dll | FileCheck --check-prefix=DISASM %s -DISASM: 0000000180001000 <.text>: -DISASM-NEXT: 180001000: ff 25 02 10 00 00 jmpq *0x1002(%rip) # 0x180002008 +DISASM: 180001000: 52800000 mov w0, #0x0 // =0 +DISASM-NEXT: 180001004: d65f03c0 ret +DISASM-NEXT: 180001008: d000000b adrp x11, 0x180003000 +DISASM-NEXT: 18000100c: f940056b ldr x11, [x11, #0x8] +DISASM-NEXT: 180001010: 9000000a adrp x10, 0x180001000 <.text> +DISASM-NEXT: 180001014: 9101114a add x10, x10, #0x44 +DISASM-NEXT: 180001018: 17fffffa b 0x180001000 <.text> +DISASM-NEXT: 18000101c: d000000b adrp x11, 0x180003000 +DISASM-NEXT: 180001020: f940096b ldr x11, [x11, #0x10] +DISASM-NEXT: 180001024: f0ffffea adrp x10, 0x180000000 +DISASM-NEXT: 
180001028: 9100014a add x10, x10, #0x0 +DISASM-NEXT: 18000102c: 17fffff5 b 0x180001000 <.text> +DISASM-NEXT: 180001030: d000000b adrp x11, 0x180003000 +DISASM-NEXT: 180001034: f940116b ldr x11, [x11, #0x20] +DISASM-NEXT: 180001038: 9000000a adrp x10, 0x180001000 <.text> +DISASM-NEXT: 18000103c: 9101314a add x10, x10, #0x4c +DISASM-NEXT: 180001040: 17fffff0 b 0x180001000 <.text> +DISASM-NEXT: 180001044: 52800020 mov w0, #0x1 // =1 +DISASM-NEXT: 180001048: d65f03c0 ret +DISASM-NEXT: 18000104c: 52800040 mov w0, #0x2 // =2 +DISASM-NEXT: 180001050: d65f03c0 ret +DISASM-NEXT: ... +DISASM-NEXT: 180002000: ff 25 02 10 00 00 jmpq *0x1002(%rip) # 0x180003008 RUN: llvm-readobj --hex-dump=.test out.dll | FileCheck --check-prefix=TESTSEC %s RUN: llvm-readobj --hex-dump=.test out2.dll | FileCheck --check-prefix=TESTSEC %s -TESTSEC: 0x180005000 08200000 00200000 10200000 20200000 -TESTSEC-NEXT: 0x180005010 00100000 +TESTSEC: 0x180006000 08300000 00300000 10300000 20300000 +TESTSEC-NEXT: 0x180006010 08100000 1c100000 00200000 RUN: llvm-readobj --headers out.dll | FileCheck -check-prefix=HEADERS %s -HEADERS: LoadConfigTableRVA: 0x3008 -HEADERS: IATRVA: 0x2000 +HEADERS: LoadConfigTableRVA: 0x4010 +HEADERS: IATRVA: 0x3000 HEADERS: IATSize: 0x1000 #--- test.s @@ -57,8 +80,49 @@ arm64ec_data_sym: .rva __imp_data .rva __imp_func2 .rva __imp_t2func + .rva __impchk_func + .rva __impchk_func2 .rva func +#--- icall.s + .text + .globl __icall_helper_arm64ec + .p2align 2, 0x0 +__icall_helper_arm64ec: + mov w0, #0 + ret + +#--- hybmp.s + .section .hybmp$x, "yi" + // __imp_func exit thunk is ignored when func is defined as well + .symidx __imp_func + .symidx dead_exit_thunk + .word 4 + .symidx func + .symidx func_exit_thunk + .word 4 + .symidx __imp_t2func + .symidx t2func_exit_thunk + .word 4 + + .section .wowthk$aa,"xr",discard,func_exit_thunk + .globl func_exit_thunk +func_exit_thunk: + mov w0, #1 + ret + + .section .wowthk$aa,"xr",discard,t2func_exit_thunk + .globl t2func_exit_thunk +t2func_exit_thunk: + mov w0, #2 + ret + + .section .wowthk$aa,"xr",discard,dead_exit_thunk + .globl dead_exit_thunk +dead_exit_thunk: + mov w0, #0xdead + ret + #--- test.def NAME test.dll EXPORTS From 5904448ceb67d6a7bd752aa4b54d9acb64bcc533 Mon Sep 17 00:00:00 2001 From: Tulio Magno Quites Machado Filho Date: Wed, 11 Sep 2024 09:57:22 -0300 Subject: [PATCH 094/114] Avoid exposing password and token from git repositories (#105220) Try to detect if the git remote URL has a password or a Github token and return an error teaching the user how to avoid leaking their password or token. --- llvm/cmake/modules/VersionFromVCS.cmake | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/llvm/cmake/modules/VersionFromVCS.cmake b/llvm/cmake/modules/VersionFromVCS.cmake index 18edbeabe3e4b..da42781d2ae39 100644 --- a/llvm/cmake/modules/VersionFromVCS.cmake +++ b/llvm/cmake/modules/VersionFromVCS.cmake @@ -39,6 +39,30 @@ function(get_source_info path revision repository) OUTPUT_VARIABLE git_output ERROR_QUIET) if(git_result EQUAL 0) + # Passwords or tokens should not be stored in the remote URL at the + # risk of being leaked. In case we find one, error out and teach the + # user the best practices. + string(REGEX MATCH "https?://[^/]*:[^/]*@.*" + http_password "${git_output}") + if(http_password) + message(SEND_ERROR "The git remote repository URL has an embedded \ +password. 
Remove the password from the URL or use \ +`-DLLVM_FORCE_VC_REPOSITORY=` in order to avoid \ +leaking your password (see https://git-scm.com/docs/gitcredentials for \ +alternatives).") + endif() + # GitHub token formats are described at: + # https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/about-authentication-to-github#githubs-token-formats + string(REGEX MATCH + "https?://(gh[pousr]|github_pat)_[^/]+@github.com.*" + github_token "${git_output}") + if(github_token) + message(SEND_ERROR "The git remote repository URL has an embedded \ +GitHub Token. Remove the token from the URL or use \ +`-DLLVM_FORCE_VC_REPOSITORY=` in order to avoid leaking \ +your token (see https://git-scm.com/docs/gitcredentials for alternatives).") + endif() + string(STRIP "${git_output}" git_output) set(${repository} ${git_output} PARENT_SCOPE) else()

From 5f25b89513954b00e045b9fdf1a16f3a34e04c52 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 11 Sep 2024 05:57:50 -0700 Subject: [PATCH 095/114] [TableGen] Migrate Option Emitters to const RecordKeeper (#107696)

Migrate Opt/OptRST Emitters to const RecordKeeper. This is part of an effort to improve const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089

--- llvm/utils/TableGen/Common/OptEmitter.cpp | 16 ++++++------ llvm/utils/TableGen/Common/OptEmitter.h | 4 +-- llvm/utils/TableGen/ExegesisEmitter.cpp | 17 +++++++------ llvm/utils/TableGen/OptParserEmitter.cpp | 8 +++--- llvm/utils/TableGen/OptRSTEmitter.cpp | 30 +++++++++-------------- 5 files changed, 34 insertions(+), 41 deletions(-) diff --git a/llvm/utils/TableGen/Common/OptEmitter.cpp b/llvm/utils/TableGen/Common/OptEmitter.cpp index 7fcf3074e0931..1c91ec5b3dbc4 100644 --- a/llvm/utils/TableGen/Common/OptEmitter.cpp +++ b/llvm/utils/TableGen/Common/OptEmitter.cpp @@ -1,4 +1,4 @@ -//===- OptEmitter.cpp - Helper for emitting options.----------- -----------===// +//===- OptEmitter.cpp - Helper for emitting options -------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -39,21 +39,19 @@ static int StrCmpOptionName(const char *A, const char *B) { return (a < b) ? -1 : 1; } -int CompareOptionRecords(Record *const *Av, Record *const *Bv) { - const Record *A = *Av; - const Record *B = *Bv; - +// Returns true if A is ordered before B. +bool CompareOptionRecords(const Record *A, const Record *B) { // Sentinel options precede all others and are only ordered by precedence. bool ASent = A->getValueAsDef("Kind")->getValueAsBit("Sentinel"); bool BSent = B->getValueAsDef("Kind")->getValueAsBit("Sentinel"); if (ASent != BSent) - return ASent ? -1 : 1; + return ASent; // Compare options by name, unless they are sentinels.
if (!ASent) if (int Cmp = StrCmpOptionName(A->getValueAsString("Name").str().c_str(), B->getValueAsString("Name").str().c_str())) - return Cmp; + return Cmp < 0; if (!ASent) { std::vector APrefixes = A->getValueAsListOfStrings("Prefixes"); @@ -65,7 +63,7 @@ int CompareOptionRecords(Record *const *Av, Record *const *Bv) { BEPre = BPrefixes.end(); APre != AEPre && BPre != BEPre; ++APre, ++BPre) { if (int Cmp = StrCmpOptionName(APre->str().c_str(), BPre->str().c_str())) - return Cmp; + return Cmp < 0; } } @@ -78,7 +76,7 @@ int CompareOptionRecords(Record *const *Av, Record *const *Bv) { PrintError(B->getLoc(), Twine("Other defined here")); PrintFatalError("Equivalent Options found."); } - return APrec < BPrec ? -1 : 1; + return APrec < BPrec; } } // namespace llvm diff --git a/llvm/utils/TableGen/Common/OptEmitter.h b/llvm/utils/TableGen/Common/OptEmitter.h index eaef966bbac66..5eecd61987337 100644 --- a/llvm/utils/TableGen/Common/OptEmitter.h +++ b/llvm/utils/TableGen/Common/OptEmitter.h @@ -1,4 +1,4 @@ -//===- OptEmitter.h - Helper for emitting options. --------------*- C++ -*-===// +//===- OptEmitter.h - Helper for emitting options ---------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -11,7 +11,7 @@ namespace llvm { class Record; -int CompareOptionRecords(Record *const *Av, Record *const *Bv); +bool CompareOptionRecords(const Record *A, const Record *B); } // namespace llvm #endif // LLVM_UTILS_TABLEGEN_COMMON_OPTEMITTER_H diff --git a/llvm/utils/TableGen/ExegesisEmitter.cpp b/llvm/utils/TableGen/ExegesisEmitter.cpp index 0de7cb4233748..a5dd2994b3753 100644 --- a/llvm/utils/TableGen/ExegesisEmitter.cpp +++ b/llvm/utils/TableGen/ExegesisEmitter.cpp @@ -30,7 +30,7 @@ namespace { class ExegesisEmitter { public: - ExegesisEmitter(RecordKeeper &RK); + ExegesisEmitter(const RecordKeeper &RK); void run(raw_ostream &OS) const; @@ -51,7 +51,7 @@ class ExegesisEmitter { void emitPfmCountersLookupTable(raw_ostream &OS) const; - RecordKeeper &Records; + const RecordKeeper &Records; std::string Target; // Table of counter name -> counter index. @@ -59,7 +59,7 @@ class ExegesisEmitter { }; static std::map -collectPfmCounters(RecordKeeper &Records) { +collectPfmCounters(const RecordKeeper &Records) { std::map PfmCounterNameTable; const auto AddPfmCounterName = [&PfmCounterNameTable]( const Record *PfmCounterDef) { @@ -67,7 +67,8 @@ collectPfmCounters(RecordKeeper &Records) { if (!Counter.empty()) PfmCounterNameTable.emplace(Counter, 0); }; - for (Record *Def : Records.getAllDerivedDefinitions("ProcPfmCounters")) { + for (const Record *Def : + Records.getAllDerivedDefinitions("ProcPfmCounters")) { // Check that ResourceNames are unique. 
llvm::SmallSet Seen; for (const Record *IssueCounter : @@ -95,9 +96,9 @@ collectPfmCounters(RecordKeeper &Records) { return PfmCounterNameTable; } -ExegesisEmitter::ExegesisEmitter(RecordKeeper &RK) +ExegesisEmitter::ExegesisEmitter(const RecordKeeper &RK) : Records(RK), PfmCounterNameTable(collectPfmCounters(RK)) { - std::vector Targets = Records.getAllDerivedDefinitions("Target"); + ArrayRef Targets = Records.getAllDerivedDefinitions("Target"); if (Targets.size() == 0) PrintFatalError("No 'Target' subclasses defined!"); if (Targets.size() != 1) @@ -223,7 +224,7 @@ void ExegesisEmitter::emitPfmCounters(raw_ostream &OS) const { } // namespace void ExegesisEmitter::emitPfmCountersLookupTable(raw_ostream &OS) const { - std::vector Bindings = + std::vector Bindings = Records.getAllDerivedDefinitions("PfmCountersBinding"); assert(!Bindings.empty() && "there must be at least one binding"); llvm::sort(Bindings, [](const Record *L, const Record *R) { @@ -232,7 +233,7 @@ void ExegesisEmitter::emitPfmCountersLookupTable(raw_ostream &OS) const { OS << "// Sorted (by CpuName) array of pfm counters.\n" << "static const CpuAndPfmCounters " << Target << "CpuPfmCounters[] = {\n"; - for (Record *Binding : Bindings) { + for (const Record *Binding : Bindings) { // Emit as { "cpu", procinit }, OS << " { \"" // << Binding->getValueAsString("CpuName") << "\"," // diff --git a/llvm/utils/TableGen/OptParserEmitter.cpp b/llvm/utils/TableGen/OptParserEmitter.cpp index 81195c8c106c2..a41c684f169e9 100644 --- a/llvm/utils/TableGen/OptParserEmitter.cpp +++ b/llvm/utils/TableGen/OptParserEmitter.cpp @@ -250,15 +250,15 @@ static void EmitHelpTextsForVariants( /// OptParserEmitter - This tablegen backend takes an input .td file /// describing a list of options and emits a data structure for parsing and /// working with those options when given an input command line. -static void EmitOptParser(RecordKeeper &Records, raw_ostream &OS) { +static void EmitOptParser(const RecordKeeper &Records, raw_ostream &OS) { // Get the option groups and options. - const std::vector &Groups = + ArrayRef Groups = Records.getAllDerivedDefinitions("OptionGroup"); - std::vector Opts = Records.getAllDerivedDefinitions("Option"); + std::vector Opts = Records.getAllDerivedDefinitions("Option"); emitSourceFileHeader("Option Parsing Definitions", OS); - array_pod_sort(Opts.begin(), Opts.end(), CompareOptionRecords); + llvm::sort(Opts, CompareOptionRecords); // Generate prefix groups. typedef SmallVector, 2> PrefixKeyT; typedef std::map PrefixesT; diff --git a/llvm/utils/TableGen/OptRSTEmitter.cpp b/llvm/utils/TableGen/OptRSTEmitter.cpp index 75b7cbdf29887..43b0f78c44d90 100644 --- a/llvm/utils/TableGen/OptRSTEmitter.cpp +++ b/llvm/utils/TableGen/OptRSTEmitter.cpp @@ -16,30 +16,24 @@ using namespace llvm; /// OptParserEmitter - This tablegen backend takes an input .td file /// describing a list of options and emits a RST man page. -static void EmitOptRST(RecordKeeper &Records, raw_ostream &OS) { - llvm::StringMap> OptionsByGroup; +static void EmitOptRST(const RecordKeeper &Records, raw_ostream &OS) { + llvm::StringMap> OptionsByGroup; std::vector OptionsWithoutGroup; // Get the options. - std::vector Opts = Records.getAllDerivedDefinitions("Option"); - array_pod_sort(Opts.begin(), Opts.end(), CompareOptionRecords); + std::vector Opts = Records.getAllDerivedDefinitions("Option"); + llvm::sort(Opts, CompareOptionRecords); // Get the option groups. 
- const std::vector &Groups = - Records.getAllDerivedDefinitions("OptionGroup"); - for (unsigned i = 0, e = Groups.size(); i != e; ++i) { - const Record &R = *Groups[i]; - OptionsByGroup.try_emplace(R.getValueAsString("Name")); - } + for (const Record *R : Records.getAllDerivedDefinitions("OptionGroup")) + OptionsByGroup.try_emplace(R->getValueAsString("Name")); // Map options to their group. - for (unsigned i = 0, e = Opts.size(); i != e; ++i) { - const Record &R = *Opts[i]; - if (const DefInit *DI = dyn_cast(R.getValueInit("Group"))) { - OptionsByGroup[DI->getDef()->getValueAsString("Name")].push_back(Opts[i]); - } else { - OptionsByGroup["options"].push_back(Opts[i]); - } + for (const Record *R : Opts) { + if (const DefInit *DI = dyn_cast(R->getValueInit("Group"))) + OptionsByGroup[DI->getDef()->getValueAsString("Name")].push_back(R); + else + OptionsByGroup["options"].push_back(R); } // Print options under their group. @@ -49,7 +43,7 @@ static void EmitOptRST(RecordKeeper &Records, raw_ostream &OS) { OS << std::string(GroupName.size(), '-') << '\n'; OS << '\n'; - for (Record *R : KV.getValue()) { + for (const Record *R : KV.getValue()) { OS << ".. option:: "; // Print the prefix. From 6043321127fa3c51481beaee683a34c2d2ca468d Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 11 Sep 2024 08:57:58 -0400 Subject: [PATCH 096/114] [gn] port bc152fbf4315 (llvm-debuginfod-find driver_exe) --- .../llvm/tools/llvm-debuginfod-find/BUILD.gn | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-debuginfod-find/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-debuginfod-find/BUILD.gn index 16b2c53438314..6b926bc777dca 100644 --- a/llvm/utils/gn/secondary/llvm/tools/llvm-debuginfod-find/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-debuginfod-find/BUILD.gn @@ -1,6 +1,13 @@ import("//llvm/tools/binutils_symlinks.gni") +import("//llvm/utils/TableGen/tablegen.gni") +import("//llvm/utils/gn/build/driver_executable.gni") import("//llvm/utils/gn/build/symlink_or_copy.gni") +tablegen("Opts") { + visibility = [ ":llvm-debuginfod-find" ] + args = [ "-gen-opt-parser-defs" ] +} + if (llvm_install_binutils_symlinks) { symlink_or_copy("debuginfod-find") { deps = [ ":llvm-debuginfod-find" ] @@ -18,9 +25,11 @@ group("symlinks") { } } -executable("llvm-debuginfod-find") { +driver_executable("llvm-debuginfod-find") { deps = [ + ":Opts", "//llvm/lib/Debuginfod", + "//llvm/lib/Option", "//llvm/lib/Support", ] sources = [ "llvm-debuginfod-find.cpp" ] From 135bd31975192654629c9bd453533ba705af1dba Mon Sep 17 00:00:00 2001 From: Alex Rice Date: Wed, 11 Sep 2024 14:02:44 +0100 Subject: [PATCH 097/114] [mlir] [tblgen-to-irdl] Refactor tblgen-to-irdl script and support more types (#105505) Refactors the tblgen-to-irdl script slightly and adds support for - Various integer types - Various Float types - Confined types - Complex types (with fixed element type) Also doesn't add the operand and result ops if they are empty. I could potentially split this into smaller PRs if that'd be helpful (refactor + integer/float/complex, confined type, optional operand/result). 
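
For illustration, an op whose only argument is an I32 operand now
round-trips to IRDL along these lines (a sketch based on the new tests in
this patch; the op name is made up):

  irdl.operation @example {
    %0 = irdl.is i32
    irdl.operands(%0)
  }

Since the op declares no results, no `irdl.results` op is emitted for it.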
@math-fehr --- mlir/include/mlir/IR/CommonTypeConstraints.td | 5 +- mlir/test/tblgen-to-irdl/CMathDialect.td | 1 - mlir/test/tblgen-to-irdl/TestDialect.td | 59 +++++- .../tools/tblgen-to-irdl/OpDefinitionsGen.cpp | 179 +++++++++++++++++- 4 files changed, 227 insertions(+), 17 deletions(-) diff --git a/mlir/include/mlir/IR/CommonTypeConstraints.td b/mlir/include/mlir/IR/CommonTypeConstraints.td index 09eab50f53a54..0a1521f8ddfb8 100644 --- a/mlir/include/mlir/IR/CommonTypeConstraints.td +++ b/mlir/include/mlir/IR/CommonTypeConstraints.td @@ -198,7 +198,10 @@ class AllOfType allowedTypeList, string summary = "", class ConfinedType predicates, string summary = "", string cppType = type.cppType> : Type< And, - summary, cppType>; + summary, cppType> { + Type baseType = type; + list predicateList = predicates; +} // Integer types. diff --git a/mlir/test/tblgen-to-irdl/CMathDialect.td b/mlir/test/tblgen-to-irdl/CMathDialect.td index 5b9e756727cb3..454543e074c48 100644 --- a/mlir/test/tblgen-to-irdl/CMathDialect.td +++ b/mlir/test/tblgen-to-irdl/CMathDialect.td @@ -25,7 +25,6 @@ def CMath_ComplexType : CMath_Type<"ComplexType", "complex"> { // CHECK: irdl.operation @identity { // CHECK-NEXT: %0 = irdl.base "!cmath.complex" -// CHECK-NEXT: irdl.operands() // CHECK-NEXT: irdl.results(%0) // CHECK-NEXT: } def CMath_IdentityOp : CMath_Op<"identity"> { diff --git a/mlir/test/tblgen-to-irdl/TestDialect.td b/mlir/test/tblgen-to-irdl/TestDialect.td index fc40da527db00..2622c81776076 100644 --- a/mlir/test/tblgen-to-irdl/TestDialect.td +++ b/mlir/test/tblgen-to-irdl/TestDialect.td @@ -28,9 +28,8 @@ def Test_AndOp : Test_Op<"and"> { // CHECK-LABEL: irdl.operation @and { // CHECK-NEXT: %[[v0:[^ ]*]] = irdl.base "!test.singleton_a" // CHECK-NEXT: %[[v1:[^ ]*]] = irdl.any -// CHECK-NEXT: %[[v2:[^ ]*]] = irdl.all_of(%[[v0]], %[[v1]]) +// CHECK-NEXT: %[[v2:[^ ]*]] = irdl.all_of(%[[v0]], %[[v1]]) // CHECK-NEXT: irdl.operands(%[[v2]]) -// CHECK-NEXT: irdl.results() // CHECK-NEXT: } @@ -41,9 +40,39 @@ def Test_AnyOp : Test_Op<"any"> { // CHECK-LABEL: irdl.operation @any { // CHECK-NEXT: %[[v0:[^ ]*]] = irdl.any // CHECK-NEXT: irdl.operands(%[[v0]]) -// CHECK-NEXT: irdl.results() // CHECK-NEXT: } +// Check confined types are converted correctly. +def Test_ConfinedOp : Test_Op<"confined"> { + let arguments = (ins ConfinedType($_self)">]>:$tensor, + ConfinedType($_self)"> + , CPred<"::llvm::cast<::mlir::VectorType>($_self).getRank() > 0">]>]>:$vector); +} +// CHECK-LABEL: irdl.operation @confined { +// CHECK-NEXT: %[[v0:[^ ]*]] = irdl.any +// CHECK-NEXT: %[[v1:[^ ]*]] = irdl.c_pred "(::llvm::isa<::mlir::TensorType>($_self))" +// CHECK-NEXT: %[[v2:[^ ]*]] = irdl.all_of(%[[v0]], %[[v1]]) +// CHECK-NEXT: %[[v3:[^ ]*]] = irdl.any +// CHECK-NEXT: %[[v4:[^ ]*]] = irdl.c_pred "(::llvm::isa<::mlir::VectorType>($_self))" +// CHECK-NEXT: %[[v5:[^ ]*]] = irdl.c_pred "(::llvm::cast<::mlir::VectorType>($_self).getRank() > 0)" +// CHECK-NEXT: %[[v6:[^ ]*]] = irdl.all_of(%[[v4]], %[[v5]]) +// CHECK-NEXT: %[[v7:[^ ]*]] = irdl.all_of(%[[v3]], %[[v6]]) +// CHECK-NEXT: irdl.operands(%[[v2]], %[[v7]]) +// CHECK-NEXT: } + +// Check generic integer types are converted correctly. 
+def Test_Integers : Test_Op<"integers"> { + let arguments = (ins AnyI8:$any_int, + AnyInteger:$any_integer); +} +// CHECK-LABEL: irdl.operation @integers { +// CHECK-NEXT: %[[v0:[^ ]*]] = irdl.is i8 +// CHECK-NEXT: %[[v1:[^ ]*]] = irdl.is si8 +// CHECK-NEXT: %[[v2:[^ ]*]] = irdl.is ui8 +// CHECK-NEXT: %[[v3:[^ ]*]] = irdl.any_of(%[[v0]], %[[v1]], %[[v2]]) +// CHECK-NEXT: %[[v4:[^ ]*]] = irdl.base "!builtin.integer" +// CHECK-NEXT: irdl.operands(%[[v3]], %[[v4]]) +// CHECK-NEXT: } // Check that AnyTypeOf is converted correctly. def Test_OrOp : Test_Op<"or"> { @@ -53,11 +82,30 @@ def Test_OrOp : Test_Op<"or"> { // CHECK-NEXT: %[[v0:[^ ]*]] = irdl.base "!test.singleton_a" // CHECK-NEXT: %[[v1:[^ ]*]] = irdl.base "!test.singleton_b" // CHECK-NEXT: %[[v2:[^ ]*]] = irdl.base "!test.singleton_c" -// CHECK-NEXT: %[[v3:[^ ]*]] = irdl.any_of(%[[v0]], %[[v1]], %[[v2]]) +// CHECK-NEXT: %[[v3:[^ ]*]] = irdl.any_of(%[[v0]], %[[v1]], %[[v2]]) // CHECK-NEXT: irdl.operands(%[[v3]]) -// CHECK-NEXT: irdl.results() // CHECK-NEXT: } +// Check that various types are converted correctly. +def Test_TypesOp : Test_Op<"types"> { + let arguments = (ins I32:$a, + SI64:$b, + UI8:$c, + Index:$d, + F32:$e, + NoneType:$f, + Complex); +} +// CHECK-LABEL: irdl.operation @types { +// CHECK-NEXT: %{{.*}} = irdl.is i32 +// CHECK-NEXT: %{{.*}} = irdl.is si64 +// CHECK-NEXT: %{{.*}} = irdl.is ui8 +// CHECK-NEXT: %{{.*}} = irdl.is index +// CHECK-NEXT: %{{.*}} = irdl.is f32 +// CHECK-NEXT: %{{.*}} = irdl.is none +// CHECK-NEXT: %{{.*}} = irdl.is complex +// CHECK-NEXT: irdl.operands({{.*}}) +// CHECK-NEXT: } // Check that variadics and optionals are converted correctly. def Test_VariadicityOp : Test_Op<"variadicity"> { @@ -70,5 +118,4 @@ def Test_VariadicityOp : Test_Op<"variadicity"> { // CHECK-NEXT: %[[v1:[^ ]*]] = irdl.base "!test.singleton_b" // CHECK-NEXT: %[[v2:[^ ]*]] = irdl.base "!test.singleton_c" // CHECK-NEXT: irdl.operands(variadic %[[v0]], optional %[[v1]], %[[v2]]) -// CHECK-NEXT: irdl.results() // CHECK-NEXT: } diff --git a/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp b/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp index 4a13a00335f65..dd0d98de496e8 100644 --- a/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp +++ b/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp @@ -39,6 +39,131 @@ llvm::cl::opt selectedDialect("dialect", llvm::cl::desc("The dialect to gen for"), llvm::cl::cat(dialectGenCat), llvm::cl::Required); +Value createPredicate(OpBuilder &builder, tblgen::Pred pred) { + MLIRContext *ctx = builder.getContext(); + + if (pred.isCombined()) { + auto combiner = pred.getDef().getValueAsDef("kind")->getName(); + if (combiner == "PredCombinerAnd" || combiner == "PredCombinerOr") { + std::vector constraints; + for (auto *child : pred.getDef().getValueAsListOfDefs("children")) { + constraints.push_back(createPredicate(builder, tblgen::Pred(child))); + } + if (combiner == "PredCombinerAnd") { + auto op = + builder.create(UnknownLoc::get(ctx), constraints); + return op.getOutput(); + } + auto op = + builder.create(UnknownLoc::get(ctx), constraints); + return op.getOutput(); + } + } + + std::string condition = pred.getCondition(); + // Build a CPredOp to match the C constraint built. 
+ irdl::CPredOp op = builder.create( + UnknownLoc::get(ctx), StringAttr::get(ctx, condition)); + return op; +} + +Value typeToConstraint(OpBuilder &builder, Type type) { + MLIRContext *ctx = builder.getContext(); + auto op = + builder.create(UnknownLoc::get(ctx), TypeAttr::get(type)); + return op.getOutput(); +} + +std::optional recordToType(MLIRContext *ctx, const Record &predRec) { + + if (predRec.isSubClassOf("I")) { + auto width = predRec.getValueAsInt("bitwidth"); + return IntegerType::get(ctx, width, IntegerType::Signless); + } + + if (predRec.isSubClassOf("SI")) { + auto width = predRec.getValueAsInt("bitwidth"); + return IntegerType::get(ctx, width, IntegerType::Signed); + } + + if (predRec.isSubClassOf("UI")) { + auto width = predRec.getValueAsInt("bitwidth"); + return IntegerType::get(ctx, width, IntegerType::Unsigned); + } + + // Index type + if (predRec.getName() == "Index") { + return IndexType::get(ctx); + } + + // Float types + if (predRec.isSubClassOf("F")) { + auto width = predRec.getValueAsInt("bitwidth"); + switch (width) { + case 16: + return FloatType::getF16(ctx); + case 32: + return FloatType::getF32(ctx); + case 64: + return FloatType::getF64(ctx); + case 80: + return FloatType::getF80(ctx); + case 128: + return FloatType::getF128(ctx); + } + } + + if (predRec.getName() == "NoneType") { + return NoneType::get(ctx); + } + + if (predRec.getName() == "BF16") { + return FloatType::getBF16(ctx); + } + + if (predRec.getName() == "TF32") { + return FloatType::getTF32(ctx); + } + + if (predRec.getName() == "F8E4M3FN") { + return FloatType::getFloat8E4M3FN(ctx); + } + + if (predRec.getName() == "F8E5M2") { + return FloatType::getFloat8E5M2(ctx); + } + + if (predRec.getName() == "F8E4M3") { + return FloatType::getFloat8E4M3(ctx); + } + + if (predRec.getName() == "F8E4M3FNUZ") { + return FloatType::getFloat8E4M3FNUZ(ctx); + } + + if (predRec.getName() == "F8E4M3B11FNUZ") { + return FloatType::getFloat8E4M3B11FNUZ(ctx); + } + + if (predRec.getName() == "F8E5M2FNUZ") { + return FloatType::getFloat8E5M2FNUZ(ctx); + } + + if (predRec.getName() == "F8E3M4") { + return FloatType::getFloat8E3M4(ctx); + } + + if (predRec.isSubClassOf("Complex")) { + const Record *elementRec = predRec.getValueAsDef("elementType"); + auto elementType = recordToType(ctx, *elementRec); + if (elementType.has_value()) { + return ComplexType::get(elementType.value()); + } + } + + return std::nullopt; +} + Value createConstraint(OpBuilder &builder, tblgen::Constraint constraint) { MLIRContext *ctx = builder.getContext(); const Record &predRec = constraint.getDef(); @@ -78,11 +203,45 @@ Value createConstraint(OpBuilder &builder, tblgen::Constraint constraint) { return op.getOutput(); } - std::string condition = constraint.getPredicate().getCondition(); - // Build a CPredOp to match the C constraint built. 
-  irdl::CPredOp op = builder.create<irdl::CPredOp>(
-      UnknownLoc::get(ctx), StringAttr::get(ctx, condition));
-  return op;
+  // Integer types
+  if (predRec.getName() == "AnyInteger") {
+    auto op = builder.create<irdl::BaseOp>(
+        UnknownLoc::get(ctx), StringAttr::get(ctx, "!builtin.integer"));
+    return op.getOutput();
+  }
+
+  if (predRec.isSubClassOf("AnyI")) {
+    auto width = predRec.getValueAsInt("bitwidth");
+    std::vector<Value> types = {
+        typeToConstraint(builder,
+                         IntegerType::get(ctx, width, IntegerType::Signless)),
+        typeToConstraint(builder,
+                         IntegerType::get(ctx, width, IntegerType::Signed)),
+        typeToConstraint(builder,
+                         IntegerType::get(ctx, width, IntegerType::Unsigned))};
+    auto op = builder.create<irdl::AnyOfOp>(UnknownLoc::get(ctx), types);
+    return op.getOutput();
+  }
+
+  auto type = recordToType(ctx, predRec);
+
+  if (type.has_value()) {
+    return typeToConstraint(builder, type.value());
+  }
+
+  // Confined type
+  if (predRec.isSubClassOf("ConfinedType")) {
+    std::vector<Value> constraints;
+    constraints.push_back(createConstraint(
+        builder, tblgen::Constraint(predRec.getValueAsDef("baseType"))));
+    for (Record *child : predRec.getValueAsListOfDefs("predicateList")) {
+      constraints.push_back(createPredicate(builder, tblgen::Pred(child)));
+    }
+    auto op = builder.create<irdl::AllOfOp>(UnknownLoc::get(ctx), constraints);
+    return op.getOutput();
+  }
+
+  return createPredicate(builder, constraint.getPredicate());
 }

 /// Returns the name of the operation without the dialect prefix.
@@ -131,10 +290,12 @@ irdl::OperationOp createIRDLOperation(OpBuilder &builder,
   auto [results, resultVariadicity] = getValues(tblgenOp.getResults());

   // Create the operands and results operations.
-  consBuilder.create<irdl::OperandsOp>(UnknownLoc::get(ctx), operands,
-                                       operandVariadicity);
-  consBuilder.create<irdl::ResultsOp>(UnknownLoc::get(ctx), results,
-                                      resultVariadicity);
+  if (!operands.empty())
+    consBuilder.create<irdl::OperandsOp>(UnknownLoc::get(ctx), operands,
+                                         operandVariadicity);
+  if (!results.empty())
+    consBuilder.create<irdl::ResultsOp>(UnknownLoc::get(ctx), results,
+                                        resultVariadicity);

   return op;
 }

From 2a130f1a140613445b8f387d3fa54328c1b94cde Mon Sep 17 00:00:00 2001
From: Paul Walker
Date: Wed, 11 Sep 2024 14:03:01 +0100
Subject: [PATCH 098/114] [NFC][Clang][SVE] Refactor AArch64SVEACLETypes.def to enable more uses. (#107599)

Some switch statements require all SVE builtin types to be manually
specified. This patch refactors the SVE_*_TYPE macros so that such code
can be generated during preprocessing.

I've tried to establish a minimal interface that covers all types where
no special information is required and then created a set of macros that
are dedicated to specific datatypes (e.g. int, float).

This patch is groundwork for changing the SVE tuple types to become
struct based, as well as for work to support the FP8 ACLE.
---
 .../clang/Basic/AArch64SVEACLETypes.def       | 178 +++++++++++-------
 clang/lib/AST/ASTContext.cpp                  | 162 +++++-----------
 clang/lib/AST/ItaniumMangle.cpp               |  16 +-
 clang/lib/CodeGen/CodeGenTypes.cpp            |  70 ++-----
 4 files changed, 178 insertions(+), 248 deletions(-)

diff --git a/clang/include/clang/Basic/AArch64SVEACLETypes.def b/clang/include/clang/Basic/AArch64SVEACLETypes.def
index fa9c1ac0491c4..56e6179a664e2 100644
--- a/clang/include/clang/Basic/AArch64SVEACLETypes.def
+++ b/clang/include/clang/Basic/AArch64SVEACLETypes.def
@@ -8,28 +8,48 @@
 //
 //  This file defines various SVE builtin types.  The macros are:
 //
-//    SVE_TYPE(Name, Id, SingletonId) - A builtin type that has not been
-//    covered by any other #define.  Defining this macro covers all
-//    the builtins.
+//    SVE_TYPE:
+//    - (Name, MangledName, Id, SingletonId)
+//    A builtin type that has not been covered by any other #define. Defining
+//    this macro covers all the builtin types.
 //
-//    SVE_VECTOR_TYPE(Name, Id, SingletonId, ElKind, ElBits, IsSigned, IsFP) -
-//    An SVE scalable vector.
+//    SVE_VECTOR_TYPE, SVE_PREDICATE_TYPE, SVE_OPAQUE_TYPE:
+//    - (Name, MangledName, Id, SingletonId)
+//    A builtin type that has not been covered by any other #define. Defining
+//    this macro covers the named subset of builtin types.
 //
-//    SVE_PREDICATE_TYPE(Name, Id, SingletonId, ElKind) - An SVE scalable
-//    predicate.
+//    SVE_VECTOR_TYPE_INT:
+//    - (Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, IsSigned)
+//    Defining the macro covers the integer vector types.
+//
+//    SVE_VECTOR_TYPE_FLOAT, SVE_VECTOR_TYPE_BFLOAT:
+//    - (Name, MangledName, Id, SingletonId, NumEls, ElBits, NF)
+//    Defining the macro covers the floating point vector types.
+//
+//    SVE_PREDICATE_TYPE_ALL:
+//    - (Name, MangledName, Id, SingletonId, NumEls, NF)
+//    Defining the macro covers the boolean vector types.
 //
 // where:
 //
 //  - Name is the name of the builtin type.
 //
+//  - MangledName is the mangled name of the builtin type.
+//
 //  - BuiltinType::Id is the enumerator defining the type.
 //
 //  - Context.SingletonId is the global singleton of this type.
 //
 //  - ElKind enumerates the type of the elements.
 //
+//  - NumEls enumerates the number of elements.
+//
 //  - ElBits is the size of one element in bits.
 //
+//  - NF enumerates the number of sub-vectors.
+//    TODO: Tuple types are represented as a concatenation of "NumEls x ElBits"
+//    vectors. This will be changed to become a struct containing NF vectors.
+//
 //  - IsSigned is true for vectors of signed integer elements and
 //    for vectors of floating-point elements.
// @@ -39,102 +59,134 @@ //===----------------------------------------------------------------------===// #ifndef SVE_VECTOR_TYPE -#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId, NumEls, ElBits, \ - IsSigned, IsFP, IsBF) \ +#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ SVE_TYPE(Name, Id, SingletonId) #endif +#ifndef SVE_VECTOR_TYPE_DETAILS +#define SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, IsSigned, IsFP, IsBF) \ + SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId) +#endif + +#ifndef SVE_VECTOR_TYPE_BFLOAT +#define SVE_VECTOR_TYPE_BFLOAT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF) \ + SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, false, true) +#endif + +#ifndef SVE_VECTOR_TYPE_FLOAT +#define SVE_VECTOR_TYPE_FLOAT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF) \ + SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, true, false) +#endif + +#ifndef SVE_VECTOR_TYPE_INT +#define SVE_VECTOR_TYPE_INT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, IsSigned) \ + SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, IsSigned, false, false) +#endif + #ifndef SVE_PREDICATE_TYPE -#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId, NumEls) \ +#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId) \ SVE_TYPE(Name, Id, SingletonId) #endif +#ifndef SVE_PREDICATE_TYPE_ALL +#define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \ + SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId) +#endif + #ifndef SVE_OPAQUE_TYPE -#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) \ +#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) \ SVE_TYPE(Name, Id, SingletonId) #endif //===- Vector point types -----------------------------------------------===// +SVE_VECTOR_TYPE_INT("__SVInt8_t", "__SVInt8_t", SveInt8, SveInt8Ty, 16, 8, 1, true) +SVE_VECTOR_TYPE_INT("__SVInt16_t", "__SVInt16_t", SveInt16, SveInt16Ty, 8, 16, 1, true) +SVE_VECTOR_TYPE_INT("__SVInt32_t", "__SVInt32_t", SveInt32, SveInt32Ty, 4, 32, 1, true) +SVE_VECTOR_TYPE_INT("__SVInt64_t", "__SVInt64_t", SveInt64, SveInt64Ty, 2, 64, 1, true) -SVE_VECTOR_TYPE("__SVInt8_t", "__SVInt8_t", SveInt8, SveInt8Ty, 16, 8, true, false, false) -SVE_VECTOR_TYPE("__SVInt16_t", "__SVInt16_t", SveInt16, SveInt16Ty, 8, 16, true, false, false) -SVE_VECTOR_TYPE("__SVInt32_t", "__SVInt32_t", SveInt32, SveInt32Ty, 4, 32, true, false, false) -SVE_VECTOR_TYPE("__SVInt64_t", "__SVInt64_t", SveInt64, SveInt64Ty, 2, 64, true, false, false) - -SVE_VECTOR_TYPE("__SVUint8_t", "__SVUint8_t", SveUint8, SveUint8Ty, 16, 8, false, false, false) -SVE_VECTOR_TYPE("__SVUint16_t", "__SVUint16_t", SveUint16, SveUint16Ty, 8, 16, false, false, false) -SVE_VECTOR_TYPE("__SVUint32_t", "__SVUint32_t", SveUint32, SveUint32Ty, 4, 32, false, false, false) -SVE_VECTOR_TYPE("__SVUint64_t", "__SVUint64_t", SveUint64, SveUint64Ty, 2, 64, false, false, false) +SVE_VECTOR_TYPE_INT("__SVUint8_t", "__SVUint8_t", SveUint8, SveUint8Ty, 16, 8, 1, false) +SVE_VECTOR_TYPE_INT("__SVUint16_t", "__SVUint16_t", SveUint16, SveUint16Ty, 8, 16, 1, false) +SVE_VECTOR_TYPE_INT("__SVUint32_t", "__SVUint32_t", SveUint32, SveUint32Ty, 4, 32, 1, false) +SVE_VECTOR_TYPE_INT("__SVUint64_t", "__SVUint64_t", SveUint64, SveUint64Ty, 2, 64, 1, false) -SVE_VECTOR_TYPE("__SVFloat16_t", "__SVFloat16_t", SveFloat16, SveFloat16Ty, 8, 16, true, true, false) -SVE_VECTOR_TYPE("__SVFloat32_t", 
"__SVFloat32_t", SveFloat32, SveFloat32Ty, 4, 32, true, true, false) -SVE_VECTOR_TYPE("__SVFloat64_t", "__SVFloat64_t", SveFloat64, SveFloat64Ty, 2, 64, true, true, false) +SVE_VECTOR_TYPE_FLOAT("__SVFloat16_t", "__SVFloat16_t", SveFloat16, SveFloat16Ty, 8, 16, 1) +SVE_VECTOR_TYPE_FLOAT("__SVFloat32_t", "__SVFloat32_t", SveFloat32, SveFloat32Ty, 4, 32, 1) +SVE_VECTOR_TYPE_FLOAT("__SVFloat64_t", "__SVFloat64_t", SveFloat64, SveFloat64Ty, 2, 64, 1) -SVE_VECTOR_TYPE("__SVBfloat16_t", "__SVBfloat16_t", SveBFloat16, SveBFloat16Ty, 8, 16, true, false, true) +SVE_VECTOR_TYPE_BFLOAT("__SVBfloat16_t", "__SVBfloat16_t", SveBFloat16, SveBFloat16Ty, 8, 16, 1) // // x2 // -SVE_VECTOR_TYPE("__clang_svint8x2_t", "svint8x2_t", SveInt8x2, SveInt8x2Ty, 32, 8, true, false, false) -SVE_VECTOR_TYPE("__clang_svint16x2_t", "svint16x2_t", SveInt16x2, SveInt16x2Ty, 16, 16, true, false, false) -SVE_VECTOR_TYPE("__clang_svint32x2_t", "svint32x2_t", SveInt32x2, SveInt32x2Ty, 8, 32, true, false, false) -SVE_VECTOR_TYPE("__clang_svint64x2_t", "svint64x2_t", SveInt64x2, SveInt64x2Ty, 4, 64, true, false, false) -SVE_VECTOR_TYPE("__clang_svuint8x2_t", "svuint8x2_t", SveUint8x2, SveUint8x2Ty, 32, 8, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint16x2_t", "svuint16x2_t", SveUint16x2, SveUint16x2Ty, 16, 16, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint32x2_t", "svuint32x2_t", SveUint32x2, SveUint32x2Ty, 8, 32, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint64x2_t", "svuint64x2_t", SveUint64x2, SveUint64x2Ty, 4, 64, false, false, false) +SVE_VECTOR_TYPE_INT("__clang_svint8x2_t", "svint8x2_t", SveInt8x2, SveInt8x2Ty, 16, 8, 2, true) +SVE_VECTOR_TYPE_INT("__clang_svint16x2_t", "svint16x2_t", SveInt16x2, SveInt16x2Ty, 8, 16, 2, true) +SVE_VECTOR_TYPE_INT("__clang_svint32x2_t", "svint32x2_t", SveInt32x2, SveInt32x2Ty, 4, 32, 2, true) +SVE_VECTOR_TYPE_INT("__clang_svint64x2_t", "svint64x2_t", SveInt64x2, SveInt64x2Ty, 2, 64, 2, true) -SVE_VECTOR_TYPE("__clang_svfloat16x2_t", "svfloat16x2_t", SveFloat16x2, SveFloat16x2Ty, 16, 16, true, true, false) -SVE_VECTOR_TYPE("__clang_svfloat32x2_t", "svfloat32x2_t", SveFloat32x2, SveFloat32x2Ty, 8, 32, true, true, false) -SVE_VECTOR_TYPE("__clang_svfloat64x2_t", "svfloat64x2_t", SveFloat64x2, SveFloat64x2Ty, 4, 64, true, true, false) +SVE_VECTOR_TYPE_INT("__clang_svuint8x2_t", "svuint8x2_t", SveUint8x2, SveUint8x2Ty, 16 , 8, 2, false) +SVE_VECTOR_TYPE_INT("__clang_svuint16x2_t", "svuint16x2_t", SveUint16x2, SveUint16x2Ty, 8, 16, 2, false) +SVE_VECTOR_TYPE_INT("__clang_svuint32x2_t", "svuint32x2_t", SveUint32x2, SveUint32x2Ty, 4, 32, 2, false) +SVE_VECTOR_TYPE_INT("__clang_svuint64x2_t", "svuint64x2_t", SveUint64x2, SveUint64x2Ty, 2, 64, 2, false) + +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat16x2_t", "svfloat16x2_t", SveFloat16x2, SveFloat16x2Ty, 8, 16, 2) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat32x2_t", "svfloat32x2_t", SveFloat32x2, SveFloat32x2Ty, 4, 32, 2) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x2_t", "svfloat64x2_t", SveFloat64x2, SveFloat64x2Ty, 2, 64, 2) + +SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x2_t", "svbfloat16x2_t", SveBFloat16x2, SveBFloat16x2Ty, 8, 16, 2) -SVE_VECTOR_TYPE("__clang_svbfloat16x2_t", "svbfloat16x2_t", SveBFloat16x2, SveBFloat16x2Ty, 16, 16, true, false, true) // // x3 // -SVE_VECTOR_TYPE("__clang_svint8x3_t", "svint8x3_t", SveInt8x3, SveInt8x3Ty, 48, 8, true, false, false) -SVE_VECTOR_TYPE("__clang_svint16x3_t", "svint16x3_t", SveInt16x3, SveInt16x3Ty, 24, 16, true, false, false) -SVE_VECTOR_TYPE("__clang_svint32x3_t", "svint32x3_t", SveInt32x3, 
SveInt32x3Ty, 12, 32, true, false, false) -SVE_VECTOR_TYPE("__clang_svint64x3_t", "svint64x3_t", SveInt64x3, SveInt64x3Ty, 6, 64, true, false, false) -SVE_VECTOR_TYPE("__clang_svuint8x3_t", "svuint8x3_t", SveUint8x3, SveUint8x3Ty, 48, 8, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint16x3_t", "svuint16x3_t", SveUint16x3, SveUint16x3Ty, 24, 16, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint32x3_t", "svuint32x3_t", SveUint32x3, SveUint32x3Ty, 12, 32, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint64x3_t", "svuint64x3_t", SveUint64x3, SveUint64x3Ty, 6, 64, false, false, false) +SVE_VECTOR_TYPE_INT("__clang_svint8x3_t", "svint8x3_t", SveInt8x3, SveInt8x3Ty, 16, 8, 3, true) +SVE_VECTOR_TYPE_INT("__clang_svint16x3_t", "svint16x3_t", SveInt16x3, SveInt16x3Ty, 8, 16, 3, true) +SVE_VECTOR_TYPE_INT("__clang_svint32x3_t", "svint32x3_t", SveInt32x3, SveInt32x3Ty, 4, 32, 3, true) +SVE_VECTOR_TYPE_INT("__clang_svint64x3_t", "svint64x3_t", SveInt64x3, SveInt64x3Ty, 2, 64, 3, true) + +SVE_VECTOR_TYPE_INT("__clang_svuint8x3_t", "svuint8x3_t", SveUint8x3, SveUint8x3Ty, 16, 8, 3, false) +SVE_VECTOR_TYPE_INT("__clang_svuint16x3_t", "svuint16x3_t", SveUint16x3, SveUint16x3Ty, 8, 16, 3, false) +SVE_VECTOR_TYPE_INT("__clang_svuint32x3_t", "svuint32x3_t", SveUint32x3, SveUint32x3Ty, 4, 32, 3, false) +SVE_VECTOR_TYPE_INT("__clang_svuint64x3_t", "svuint64x3_t", SveUint64x3, SveUint64x3Ty, 2, 64, 3, false) -SVE_VECTOR_TYPE("__clang_svfloat16x3_t", "svfloat16x3_t", SveFloat16x3, SveFloat16x3Ty, 24, 16, true, true, false) -SVE_VECTOR_TYPE("__clang_svfloat32x3_t", "svfloat32x3_t", SveFloat32x3, SveFloat32x3Ty, 12, 32, true, true, false) -SVE_VECTOR_TYPE("__clang_svfloat64x3_t", "svfloat64x3_t", SveFloat64x3, SveFloat64x3Ty, 6, 64, true, true, false) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat16x3_t", "svfloat16x3_t", SveFloat16x3, SveFloat16x3Ty, 8, 16, 3) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat32x3_t", "svfloat32x3_t", SveFloat32x3, SveFloat32x3Ty, 4, 32, 3) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x3_t", "svfloat64x3_t", SveFloat64x3, SveFloat64x3Ty, 2, 64, 3) + +SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x3_t", "svbfloat16x3_t", SveBFloat16x3, SveBFloat16x3Ty, 8, 16, 3) -SVE_VECTOR_TYPE("__clang_svbfloat16x3_t", "svbfloat16x3_t", SveBFloat16x3, SveBFloat16x3Ty, 24, 16, true, false, true) // // x4 // -SVE_VECTOR_TYPE("__clang_svint8x4_t", "svint8x4_t", SveInt8x4, SveInt8x4Ty, 64, 8, true, false, false) -SVE_VECTOR_TYPE("__clang_svint16x4_t", "svint16x4_t", SveInt16x4, SveInt16x4Ty, 32, 16, true, false, false) -SVE_VECTOR_TYPE("__clang_svint32x4_t", "svint32x4_t", SveInt32x4, SveInt32x4Ty, 16, 32, true, false, false) -SVE_VECTOR_TYPE("__clang_svint64x4_t", "svint64x4_t", SveInt64x4, SveInt64x4Ty, 8, 64, true, false, false) -SVE_VECTOR_TYPE("__clang_svuint8x4_t", "svuint8x4_t", SveUint8x4, SveUint8x4Ty, 64, 8, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint16x4_t", "svuint16x4_t", SveUint16x4, SveUint16x4Ty, 32, 16, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint32x4_t", "svuint32x4_t", SveUint32x4, SveUint32x4Ty, 16, 32, false, false, false) -SVE_VECTOR_TYPE("__clang_svuint64x4_t", "svuint64x4_t", SveUint64x4, SveUint64x4Ty, 8, 64, false, false, false) +SVE_VECTOR_TYPE_INT("__clang_svint8x4_t", "svint8x4_t", SveInt8x4, SveInt8x4Ty, 16, 8, 4, true) +SVE_VECTOR_TYPE_INT("__clang_svint16x4_t", "svint16x4_t", SveInt16x4, SveInt16x4Ty, 8, 16, 4, true) +SVE_VECTOR_TYPE_INT("__clang_svint32x4_t", "svint32x4_t", SveInt32x4, SveInt32x4Ty, 4, 32, 4, true) +SVE_VECTOR_TYPE_INT("__clang_svint64x4_t", 
"svint64x4_t", SveInt64x4, SveInt64x4Ty, 2, 64, 4, true) + +SVE_VECTOR_TYPE_INT("__clang_svuint8x4_t", "svuint8x4_t", SveUint8x4, SveUint8x4Ty, 16, 8, 4, false) +SVE_VECTOR_TYPE_INT("__clang_svuint16x4_t", "svuint16x4_t", SveUint16x4, SveUint16x4Ty, 8, 16, 4, false) +SVE_VECTOR_TYPE_INT("__clang_svuint32x4_t", "svuint32x4_t", SveUint32x4, SveUint32x4Ty, 4, 32, 4, false) +SVE_VECTOR_TYPE_INT("__clang_svuint64x4_t", "svuint64x4_t", SveUint64x4, SveUint64x4Ty, 2, 64, 4, false) -SVE_VECTOR_TYPE("__clang_svfloat16x4_t", "svfloat16x4_t", SveFloat16x4, SveFloat16x4Ty, 32, 16, true, true, false) -SVE_VECTOR_TYPE("__clang_svfloat32x4_t", "svfloat32x4_t", SveFloat32x4, SveFloat32x4Ty, 16, 32, true, true, false) -SVE_VECTOR_TYPE("__clang_svfloat64x4_t", "svfloat64x4_t", SveFloat64x4, SveFloat64x4Ty, 8, 64, true, true, false) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat16x4_t", "svfloat16x4_t", SveFloat16x4, SveFloat16x4Ty, 8, 16, 4) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat32x4_t", "svfloat32x4_t", SveFloat32x4, SveFloat32x4Ty, 4, 32, 4) +SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x4_t", "svfloat64x4_t", SveFloat64x4, SveFloat64x4Ty, 2, 64, 4) -SVE_VECTOR_TYPE("__clang_svbfloat16x4_t", "svbfloat16x4_t", SveBFloat16x4, SveBFloat16x4Ty, 32, 16, true, false, true) +SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x4_t", "svbfloat16x4_t", SveBFloat16x4, SveBFloat16x4Ty, 8, 16, 4) -SVE_PREDICATE_TYPE("__SVBool_t", "__SVBool_t", SveBool, SveBoolTy, 16) -SVE_PREDICATE_TYPE("__clang_svboolx2_t", "svboolx2_t", SveBoolx2, SveBoolx2Ty, 32) -SVE_PREDICATE_TYPE("__clang_svboolx4_t", "svboolx4_t", SveBoolx4, SveBoolx4Ty, 64) +SVE_PREDICATE_TYPE_ALL("__SVBool_t", "__SVBool_t", SveBool, SveBoolTy, 16, 1) +SVE_PREDICATE_TYPE_ALL("__clang_svboolx2_t", "svboolx2_t", SveBoolx2, SveBoolx2Ty, 16, 2) +SVE_PREDICATE_TYPE_ALL("__clang_svboolx4_t", "svboolx4_t", SveBoolx4, SveBoolx4Ty, 16, 4) SVE_OPAQUE_TYPE("__SVCount_t", "__SVCount_t", SveCount, SveCountTy) #undef SVE_VECTOR_TYPE +#undef SVE_VECTOR_TYPE_BFLOAT +#undef SVE_VECTOR_TYPE_FLOAT +#undef SVE_VECTOR_TYPE_INT #undef SVE_PREDICATE_TYPE +#undef SVE_PREDICATE_TYPE_ALL #undef SVE_OPAQUE_TYPE #undef SVE_TYPE diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index fa9cc38efc466..8ece39a383046 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -2203,13 +2203,12 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const { // Because the length is only known at runtime, we use a dummy value // of 0 for the static length. The alignment values are those defined // by the Procedure Call Standard for the Arm Architecture. 
-#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId, NumEls, ElBits, \ - IsSigned, IsFP, IsBF) \ +#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: \ Width = 0; \ Align = 128; \ break; -#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId, NumEls) \ +#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: \ Width = 0; \ Align = 16; \ @@ -4284,108 +4283,27 @@ ASTContext::getBuiltinVectorTypeInfo(const BuiltinType *Ty) const { switch (Ty->getKind()) { default: llvm_unreachable("Unsupported builtin vector type"); - case BuiltinType::SveInt8: - return SVE_INT_ELTTY(8, 16, true, 1); - case BuiltinType::SveUint8: - return SVE_INT_ELTTY(8, 16, false, 1); - case BuiltinType::SveInt8x2: - return SVE_INT_ELTTY(8, 16, true, 2); - case BuiltinType::SveUint8x2: - return SVE_INT_ELTTY(8, 16, false, 2); - case BuiltinType::SveInt8x3: - return SVE_INT_ELTTY(8, 16, true, 3); - case BuiltinType::SveUint8x3: - return SVE_INT_ELTTY(8, 16, false, 3); - case BuiltinType::SveInt8x4: - return SVE_INT_ELTTY(8, 16, true, 4); - case BuiltinType::SveUint8x4: - return SVE_INT_ELTTY(8, 16, false, 4); - case BuiltinType::SveInt16: - return SVE_INT_ELTTY(16, 8, true, 1); - case BuiltinType::SveUint16: - return SVE_INT_ELTTY(16, 8, false, 1); - case BuiltinType::SveInt16x2: - return SVE_INT_ELTTY(16, 8, true, 2); - case BuiltinType::SveUint16x2: - return SVE_INT_ELTTY(16, 8, false, 2); - case BuiltinType::SveInt16x3: - return SVE_INT_ELTTY(16, 8, true, 3); - case BuiltinType::SveUint16x3: - return SVE_INT_ELTTY(16, 8, false, 3); - case BuiltinType::SveInt16x4: - return SVE_INT_ELTTY(16, 8, true, 4); - case BuiltinType::SveUint16x4: - return SVE_INT_ELTTY(16, 8, false, 4); - case BuiltinType::SveInt32: - return SVE_INT_ELTTY(32, 4, true, 1); - case BuiltinType::SveUint32: - return SVE_INT_ELTTY(32, 4, false, 1); - case BuiltinType::SveInt32x2: - return SVE_INT_ELTTY(32, 4, true, 2); - case BuiltinType::SveUint32x2: - return SVE_INT_ELTTY(32, 4, false, 2); - case BuiltinType::SveInt32x3: - return SVE_INT_ELTTY(32, 4, true, 3); - case BuiltinType::SveUint32x3: - return SVE_INT_ELTTY(32, 4, false, 3); - case BuiltinType::SveInt32x4: - return SVE_INT_ELTTY(32, 4, true, 4); - case BuiltinType::SveUint32x4: - return SVE_INT_ELTTY(32, 4, false, 4); - case BuiltinType::SveInt64: - return SVE_INT_ELTTY(64, 2, true, 1); - case BuiltinType::SveUint64: - return SVE_INT_ELTTY(64, 2, false, 1); - case BuiltinType::SveInt64x2: - return SVE_INT_ELTTY(64, 2, true, 2); - case BuiltinType::SveUint64x2: - return SVE_INT_ELTTY(64, 2, false, 2); - case BuiltinType::SveInt64x3: - return SVE_INT_ELTTY(64, 2, true, 3); - case BuiltinType::SveUint64x3: - return SVE_INT_ELTTY(64, 2, false, 3); - case BuiltinType::SveInt64x4: - return SVE_INT_ELTTY(64, 2, true, 4); - case BuiltinType::SveUint64x4: - return SVE_INT_ELTTY(64, 2, false, 4); - case BuiltinType::SveBool: - return SVE_ELTTY(BoolTy, 16, 1); - case BuiltinType::SveBoolx2: - return SVE_ELTTY(BoolTy, 16, 2); - case BuiltinType::SveBoolx4: - return SVE_ELTTY(BoolTy, 16, 4); - case BuiltinType::SveFloat16: - return SVE_ELTTY(HalfTy, 8, 1); - case BuiltinType::SveFloat16x2: - return SVE_ELTTY(HalfTy, 8, 2); - case BuiltinType::SveFloat16x3: - return SVE_ELTTY(HalfTy, 8, 3); - case BuiltinType::SveFloat16x4: - return SVE_ELTTY(HalfTy, 8, 4); - case BuiltinType::SveFloat32: - return SVE_ELTTY(FloatTy, 4, 1); - case BuiltinType::SveFloat32x2: - return SVE_ELTTY(FloatTy, 4, 2); - case BuiltinType::SveFloat32x3: - return 
SVE_ELTTY(FloatTy, 4, 3); - case BuiltinType::SveFloat32x4: - return SVE_ELTTY(FloatTy, 4, 4); - case BuiltinType::SveFloat64: - return SVE_ELTTY(DoubleTy, 2, 1); - case BuiltinType::SveFloat64x2: - return SVE_ELTTY(DoubleTy, 2, 2); - case BuiltinType::SveFloat64x3: - return SVE_ELTTY(DoubleTy, 2, 3); - case BuiltinType::SveFloat64x4: - return SVE_ELTTY(DoubleTy, 2, 4); - case BuiltinType::SveBFloat16: - return SVE_ELTTY(BFloat16Ty, 8, 1); - case BuiltinType::SveBFloat16x2: - return SVE_ELTTY(BFloat16Ty, 8, 2); - case BuiltinType::SveBFloat16x3: - return SVE_ELTTY(BFloat16Ty, 8, 3); - case BuiltinType::SveBFloat16x4: - return SVE_ELTTY(BFloat16Ty, 8, 4); + +#define SVE_VECTOR_TYPE_INT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF, IsSigned) \ + case BuiltinType::Id: \ + return {getIntTypeForBitwidth(ElBits, IsSigned), \ + llvm::ElementCount::getScalable(NumEls), NF}; +#define SVE_VECTOR_TYPE_FLOAT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF) \ + case BuiltinType::Id: \ + return {ElBits == 16 ? HalfTy : (ElBits == 32 ? FloatTy : DoubleTy), \ + llvm::ElementCount::getScalable(NumEls), NF}; +#define SVE_VECTOR_TYPE_BFLOAT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF) \ + case BuiltinType::Id: \ + return {BFloat16Ty, llvm::ElementCount::getScalable(NumEls), NF}; +#define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \ + case BuiltinType::Id: \ + return {BoolTy, llvm::ElementCount::getScalable(NumEls), NF}; +#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) +#include "clang/Basic/AArch64SVEACLETypes.def" + #define RVV_VECTOR_TYPE_INT(Name, Id, SingletonId, NumEls, ElBits, NF, \ IsSigned) \ case BuiltinType::Id: \ @@ -4425,22 +4343,30 @@ QualType ASTContext::getScalableVectorType(QualType EltTy, unsigned NumElts, unsigned NumFields) const { if (Target->hasAArch64SVETypes()) { uint64_t EltTySize = getTypeSize(EltTy); -#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId, NumEls, ElBits, \ - IsSigned, IsFP, IsBF) \ - if (!EltTy->isBooleanType() && \ - ((EltTy->hasIntegerRepresentation() && \ - EltTy->hasSignedIntegerRepresentation() == IsSigned) || \ - (EltTy->hasFloatingRepresentation() && !EltTy->isBFloat16Type() && \ - IsFP && !IsBF) || \ - (EltTy->hasFloatingRepresentation() && EltTy->isBFloat16Type() && \ - IsBF && !IsFP)) && \ - EltTySize == ElBits && NumElts == NumEls) { \ + +#define SVE_VECTOR_TYPE_INT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF, IsSigned) \ + if (EltTy->hasIntegerRepresentation() && !EltTy->isBooleanType() && \ + EltTy->hasSignedIntegerRepresentation() == IsSigned && \ + EltTySize == ElBits && NumElts == (NumEls * NF) && NumFields == 1) { \ return SingletonId; \ } -#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId, NumEls) \ - if (EltTy->isBooleanType() && NumElts == NumEls) \ +#define SVE_VECTOR_TYPE_FLOAT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF) \ + if (EltTy->hasFloatingRepresentation() && !EltTy->isBFloat16Type() && \ + EltTySize == ElBits && NumElts == (NumEls * NF) && NumFields == 1) { \ + return SingletonId; \ + } +#define SVE_VECTOR_TYPE_BFLOAT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF) \ + if (EltTy->hasFloatingRepresentation() && EltTy->isBFloat16Type() && \ + EltTySize == ElBits && NumElts == (NumEls * NF) && NumFields == 1) { \ + return SingletonId; \ + } +#define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \ + if (EltTy->isBooleanType() && NumElts == (NumEls * NF) && NumFields == 1) \ 
return SingletonId; -#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingleTonId) +#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) #include "clang/Basic/AArch64SVEACLETypes.def" } else if (Target->hasRISCVVTypes()) { uint64_t EltTySize = getTypeSize(EltTy); diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 7d638befcbd3f..b6e1da0c3192d 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -3384,8 +3384,7 @@ void CXXNameMangler::mangleType(const BuiltinType *T) { // The SVE types are effectively target-specific. The mangling scheme // is defined in the appendices to the Procedure Call Standard for the // Arm Architecture. -#define SVE_VECTOR_TYPE(InternalName, MangledName, Id, SingletonId, NumEls, \ - ElBits, IsSigned, IsFP, IsBF) \ +#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: \ if (T->getKind() == BuiltinType::SveBFloat16 && \ isCompatibleWith(LangOptions::ClangABI::Ver17)) { \ @@ -3394,21 +3393,18 @@ void CXXNameMangler::mangleType(const BuiltinType *T) { Out << "u" << type_name.size() << type_name; \ } else { \ type_name = MangledName; \ - Out << (type_name == InternalName ? "u" : "") << type_name.size() \ - << type_name; \ + Out << (type_name == Name ? "u" : "") << type_name.size() << type_name; \ } \ break; -#define SVE_PREDICATE_TYPE(InternalName, MangledName, Id, SingletonId, NumEls) \ +#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: \ type_name = MangledName; \ - Out << (type_name == InternalName ? "u" : "") << type_name.size() \ - << type_name; \ + Out << (type_name == Name ? "u" : "") << type_name.size() << type_name; \ break; -#define SVE_OPAQUE_TYPE(InternalName, MangledName, Id, SingletonId) \ +#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: \ type_name = MangledName; \ - Out << (type_name == InternalName ? "u" : "") << type_name.size() \ - << type_name; \ + Out << (type_name == Name ? 
"u" : "") << type_name.size() << type_name; \ break; #include "clang/Basic/AArch64SVEACLETypes.def" #define PPC_VECTOR_TYPE(Name, Id, Size) \ diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index 11a577bbdd078..5eebd8ad2a065 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -500,63 +500,19 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { case BuiltinType::OCLReserveID: ResultType = CGM.getOpenCLRuntime().convertOpenCLSpecificType(Ty); break; - case BuiltinType::SveInt8: - case BuiltinType::SveUint8: - case BuiltinType::SveInt8x2: - case BuiltinType::SveUint8x2: - case BuiltinType::SveInt8x3: - case BuiltinType::SveUint8x3: - case BuiltinType::SveInt8x4: - case BuiltinType::SveUint8x4: - case BuiltinType::SveInt16: - case BuiltinType::SveUint16: - case BuiltinType::SveInt16x2: - case BuiltinType::SveUint16x2: - case BuiltinType::SveInt16x3: - case BuiltinType::SveUint16x3: - case BuiltinType::SveInt16x4: - case BuiltinType::SveUint16x4: - case BuiltinType::SveInt32: - case BuiltinType::SveUint32: - case BuiltinType::SveInt32x2: - case BuiltinType::SveUint32x2: - case BuiltinType::SveInt32x3: - case BuiltinType::SveUint32x3: - case BuiltinType::SveInt32x4: - case BuiltinType::SveUint32x4: - case BuiltinType::SveInt64: - case BuiltinType::SveUint64: - case BuiltinType::SveInt64x2: - case BuiltinType::SveUint64x2: - case BuiltinType::SveInt64x3: - case BuiltinType::SveUint64x3: - case BuiltinType::SveInt64x4: - case BuiltinType::SveUint64x4: - case BuiltinType::SveBool: - case BuiltinType::SveBoolx2: - case BuiltinType::SveBoolx4: - case BuiltinType::SveFloat16: - case BuiltinType::SveFloat16x2: - case BuiltinType::SveFloat16x3: - case BuiltinType::SveFloat16x4: - case BuiltinType::SveFloat32: - case BuiltinType::SveFloat32x2: - case BuiltinType::SveFloat32x3: - case BuiltinType::SveFloat32x4: - case BuiltinType::SveFloat64: - case BuiltinType::SveFloat64x2: - case BuiltinType::SveFloat64x3: - case BuiltinType::SveFloat64x4: - case BuiltinType::SveBFloat16: - case BuiltinType::SveBFloat16x2: - case BuiltinType::SveBFloat16x3: - case BuiltinType::SveBFloat16x4: { - ASTContext::BuiltinVectorTypeInfo Info = - Context.getBuiltinVectorTypeInfo(cast(Ty)); - return llvm::ScalableVectorType::get(ConvertType(Info.ElementType), - Info.EC.getKnownMinValue() * - Info.NumVectors); - } +#define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ + case BuiltinType::Id: +#define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId) \ + case BuiltinType::Id: +#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) +#include "clang/Basic/AArch64SVEACLETypes.def" + { + ASTContext::BuiltinVectorTypeInfo Info = + Context.getBuiltinVectorTypeInfo(cast(Ty)); + return llvm::ScalableVectorType::get(ConvertType(Info.ElementType), + Info.EC.getKnownMinValue() * + Info.NumVectors); + } case BuiltinType::SveCount: return llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount"); #define PPC_VECTOR_TYPE(Name, Id, Size) \ From 44fc987ed174e32544a577387ab0df6886495e82 Mon Sep 17 00:00:00 2001 From: Vladislav Dzhidzhoev Date: Wed, 11 Sep 2024 16:04:01 +0300 Subject: [PATCH 099/114] [lldb][test] Toolchain detection rewrite in Python (#102185) This fix is based on a problem with cxx_compiler and cxx_linker macros on Windows. There was an issue with compiler detection in paths containing "icc". In such case, Makefile.rules thought it was provided with icc compiler. To solve that, utilities detection has been rewritten in Python. 
The last element of the compiler's path is split off, taking the
platform's path delimiter into account, and the compiler type is
extracted from it, allowing for a possible cross-toolchain prefix.

---------

Co-authored-by: Pavel Labath
---
 .../Python/lldbsuite/test/builders/builder.py | 98 +++++++++++++++++--
 .../Python/lldbsuite/test/make/Makefile.rules | 92 ++++-------------
 .../breakpoint/breakpoint_ids/Makefile        |  2 +-
 .../breakpoint/breakpoint_locations/Makefile  |  2 +-
 .../consecutive_breakpoints/Makefile          |  2 +-
 .../functionalities/breakpoint/cpp/Makefile   |  2 +-
 .../dummy_target_breakpoints/Makefile         |  2 +-
 .../require_hw_breakpoints/Makefile           |  2 +-
 .../breakpoint/step_over_breakpoint/Makefile  |  2 +-
 .../thread_plan_user_breakpoint/Makefile      |  2 +-
 .../ObjCDataFormatterTestCase.py              |  4 +-
 .../TestNSDictionarySynthetic.py              |  4 +-
 .../nssetsynth/TestNSSetSynthetic.py          |  4 +-
 .../poarray/TestPrintObjectArray.py           |  4 +-
 .../functionalities/inline-stepping/Makefile  |  2 +-
 .../postmortem/minidump-new/makefile.txt      |  1 +
 .../lang/objc/orderedset/TestOrderedSet.py    |  4 +-
 .../TestObjCSingleEntryDictionary.py          |  4 +-
 lldb/test/API/macosx/macCatalyst/Makefile     |  1 +
 .../macCatalystAppMacOSFramework/Makefile     |  1 +
 .../macosx/simulator/TestSimulatorPlatform.py |  4 +-
 .../API/python_api/frame/inlines/Makefile     |  2 +-
 .../lldb-server/TestAppleSimulatorOSType.py   |  4 +-
 23 files changed, 139 insertions(+), 106 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/builders/builder.py b/lldb/packages/Python/lldbsuite/test/builders/builder.py
index 4ea9a86c1d5fc..564918c58b6dd 100644
--- a/lldb/packages/Python/lldbsuite/test/builders/builder.py
+++ b/lldb/packages/Python/lldbsuite/test/builders/builder.py
@@ -1,10 +1,12 @@
 import os
+import pathlib
 import platform
 import subprocess
 import sys
 import itertools

 import lldbsuite.test.lldbtest as lldbtest
+import lldbsuite.test.lldbplatformutil as lldbplatformutil
 import lldbsuite.test.lldbutil as lldbutil
 from lldbsuite.test import configuration
 from lldbsuite.test_event import build_exception
@@ -96,17 +98,101 @@ def getArchSpec(self, architecture):
         """
         return ["ARCH=" + architecture] if architecture else []

-    def getCCSpec(self, compiler):
+    def getToolchainSpec(self, compiler):
         """
-        Helper function to return the key-value string to specify the compiler
+        Helper function to return the key-value strings to specify the toolchain
         used for the make system.
         """
         cc = compiler if compiler else None
         if not cc and configuration.compiler:
             cc = configuration.compiler
-        if cc:
-            return ['CC="%s"' % cc]
-        return []
+
+        if not cc:
+            return []
+
+        cc = cc.strip()
+        cc_path = pathlib.Path(cc)
+
+        # We can get CC compiler string in the following formats:
+        #  [<tool>] <compiler> - such as 'xcrun clang', 'xcrun /usr/bin/clang' etc.
+        #
+        # Where <compiler> could contain the following parts:
+        #   <simple-name>[.<extension>] - such as 'clang', 'clang.exe' ('clang-cl.exe'?)
+        #   <triple-prefix>-<simple-name>[.<extension>] - such as 'armv7-linux-gnueabi-gcc'
+        #   <path>/<simple-name>[.<extension>] - such as '/usr/bin/clang', 'c:\path\to\compiler\clang.exe'
+        #   <path>/<triple-prefix>-<simple-name>[.<extension>] - such as '/usr/bin/clang', 'c:\path\to\compiler\clang.exe'
+
+        cc_ext = cc_path.suffix
+        # Compiler name without extension
+        cc_name = cc_path.stem.split(" ")[-1]
+
+        # A kind of compiler (canonical name): clang, gcc, cc etc.
+        cc_type = cc_name
+        # A triple prefix of the compiler name: <triple-prefix>-gcc
+        cc_prefix = ""
+        if not "clang-cl" in cc_name and not "llvm-gcc" in cc_name:
+            cc_name_parts = cc_name.split("-")
+            cc_type = cc_name_parts[-1]
+            if len(cc_name_parts) > 1:
+                cc_prefix = "-".join(cc_name_parts[:-1]) + "-"
+
+        # A kind of C++ compiler.
+        cxx_types = {
+            "icc": "icpc",
+            "llvm-gcc": "llvm-g++",
+            "gcc": "g++",
+            "cc": "c++",
+            "clang": "clang++",
+        }
+        cxx_type = cxx_types.get(cc_type, cc_type)
+
+        cc_dir = cc_path.parent
+
+        def getToolchainUtil(util_name):
+            return cc_dir / (cc_prefix + util_name + cc_ext)
+
+        cxx = getToolchainUtil(cxx_type)
+
+        util_names = {
+            "OBJCOPY": "objcopy",
+            "STRIP": "strip",
+            "ARCHIVER": "ar",
+            "DWP": "dwp",
+        }
+        utils = []
+
+        if not lldbplatformutil.platformIsDarwin():
+            if cc_type in ["clang", "cc", "gcc"]:
+                util_paths = {}
+                # Assemble a toolchain-side tool command based on the passed CC.
+                for var, name in util_names.items():
+                    # Do not override an explicitly specified tool from the command line.
+                    if not os.getenv(var):
+                        util_paths[var] = getToolchainUtil(name)
+                    else:
+                        util_paths[var] = os.getenv(var)
+                utils.extend(["AR=%s" % util_paths["ARCHIVER"]])
+
+                # Look for llvm-dwp or GNU dwp.
+                if not lldbutil.which(util_paths["DWP"]):
+                    util_paths["DWP"] = getToolchainUtil("llvm-dwp")
+                if not lldbutil.which(util_paths["DWP"]):
+                    util_paths["DWP"] = lldbutil.which("llvm-dwp")
+                if not util_paths["DWP"]:
+                    util_paths["DWP"] = lldbutil.which("dwp")
+                if not util_paths["DWP"]:
+                    del util_paths["DWP"]
+
+                for var, path in util_paths.items():
+                    utils.append("%s=%s" % (var, path))
+            else:
+                utils.extend(["AR=%slibtool" % os.getenv("CROSS_COMPILE", "")])
+
+        return [
+            "CC=%s" % cc,
+            "CC_TYPE=%s" % cc_type,
+            "CXX=%s" % cxx,
+        ] + utils

     def getSDKRootSpec(self):
         """
@@ -178,7 +264,7 @@ def getBuildCommand(
             make_targets,
             self.getArchCFlags(architecture),
             self.getArchSpec(architecture),
-            self.getCCSpec(compiler),
+            self.getToolchainSpec(compiler),
             self.getExtraMakeArgs(),
             self.getSDKRootSpec(),
             self.getModuleCacheSpec(),
diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
index 1ba3f843e87cf..f81db9bc06d8a 100644
--- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
+++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
@@ -102,15 +102,22 @@ endif
 # If you change the defaults of CC, be sure to also change it in the file
 # test/builders/builder_base.py, which provides a Python way to return the
 # value of the make variable CC -- getCompiler().
-#
-# See also these functions:
-#   o cxx_compiler
-#   o cxx_linker
 #----------------------------------------------------------------------
 ifeq "$(CC)" ""
 $(error "C compiler is not specified. Please run tests through lldb-dotest or lit")
 endif

+# Always override the linker. Assign the already-normalized CC.
+override LD := $(CC)
+# The kind of linker. It is always derived from CC.
+override LDC := $(CC_TYPE)
+
+ifeq "$(HOST_OS)" "Windows_NT"
+  # This encloses the full path in platform-specific quotes. This is necessary to run the c++ executable
+  # properly under 'sh' on a Windows host (preventing path breakage due to Windows-style path separators).
+  override CXX := $(QUOTE)$(CXX)$(QUOTE)
+endif
+
 #----------------------------------------------------------------------
 # Handle SDKROOT for the cross platform builds.
#---------------------------------------------------------------------- @@ -147,10 +154,8 @@ ifeq "$(OS)" "Darwin" DS := $(DSYMUTIL) DSFLAGS := $(DSFLAGS_EXTRAS) DSYM = $(EXE).dSYM - AR := $(CROSS_COMPILE)libtool ARFLAGS := -static -o else - AR := $(CROSS_COMPILE)ar # On non-Apple platforms, -arch becomes -m ARCHFLAG := -m @@ -213,7 +218,7 @@ endif LIMIT_DEBUG_INFO_FLAGS = NO_LIMIT_DEBUG_INFO_FLAGS = MODULE_DEBUG_INFO_FLAGS = -ifneq (,$(findstring clang,$(CC))) +ifeq ($(CC_TYPE), clang) LIMIT_DEBUG_INFO_FLAGS += -flimit-debug-info NO_LIMIT_DEBUG_INFO_FLAGS += -fno-limit-debug-info MODULE_DEBUG_INFO_FLAGS += -gmodules @@ -279,7 +284,6 @@ endif CFLAGS += $(CFLAGS_EXTRAS) CXXFLAGS += -std=c++11 $(CFLAGS) $(ARCH_CXXFLAGS) -LD = $(CC) # Copy common options to the linker flags (dwarf, arch. & etc). # Note: we get some 'garbage' options for linker here (such as -I, --isystem & etc). LDFLAGS += $(CFLAGS) @@ -312,61 +316,6 @@ ifneq "$(DYLIB_NAME)" "" endif endif -# Function that returns the counterpart C++ compiler, given $(CC) as arg. -cxx_compiler_notdir = $(if $(findstring icc,$(1)), \ - $(subst icc,icpc,$(1)), \ - $(if $(findstring llvm-gcc,$(1)), \ - $(subst llvm-gcc,llvm-g++,$(1)), \ - $(if $(findstring gcc,$(1)), \ - $(subst gcc,g++,$(1)), \ - $(subst cc,c++,$(1))))) -cxx_compiler = $(if $(findstring /,$(1)),$(join $(dir $(1)), $(call cxx_compiler_notdir,$(notdir $(1)))),$(call cxx_compiler_notdir,$(1))) - -# Function that returns the C++ linker, given $(CC) as arg. -cxx_linker_notdir = $(if $(findstring icc,$(1)), \ - $(subst icc,icpc,$(1)), \ - $(if $(findstring llvm-gcc,$(1)), \ - $(subst llvm-gcc,llvm-g++,$(1)), \ - $(if $(findstring gcc,$(1)), \ - $(subst gcc,g++,$(1)), \ - $(subst cc,c++,$(1))))) -cxx_linker = $(if $(findstring /,$(1)),$(join $(dir $(1)), $(call cxx_linker_notdir,$(notdir $(1)))),$(call cxx_linker_notdir,$(1))) - -ifneq "$(OS)" "Darwin" - CLANG_OR_GCC := $(strip $(if $(findstring clang,$(CC)), \ - $(findstring clang,$(CC)), \ - $(if $(findstring gcc,$(CC)), \ - $(findstring gcc,$(CC)), \ - cc))) - - CC_LASTWORD := $(strip $(lastword $(subst -, ,$(CC)))) - - replace_with = $(strip $(if $(findstring $(3),$(CC_LASTWORD)), \ - $(subst $(3),$(1),$(2)), \ - $(subst $(3),$(1),$(subst -$(CC_LASTWORD),,$(2))))) - - ifeq "$(notdir $(CC))" "$(CC)" - replace_cc_with = $(call replace_with,$(1),$(CC),$(CLANG_OR_GCC)) - else - replace_cc_with = $(join $(dir $(CC)),$(call replace_with,$(1),$(notdir $(CC)),$(CLANG_OR_GCC))) - endif - - OBJCOPY ?= $(call replace_cc_with,objcopy) - ARCHIVER ?= $(call replace_cc_with,ar) - # Look for llvm-dwp or gnu dwp - DWP ?= $(call replace_cc_with,llvm-dwp) - ifeq ($(wildcard $(DWP)),) - DWP = $(call replace_cc_with,dwp) - ifeq ($(wildcard $(DWP)),) - DWP = $(shell command -v llvm-dwp 2> /dev/null) - ifeq ($(wildcard $(DWP)),) - DWP = $(shell command -v dwp 2> /dev/null) - endif - endif - endif - override AR = $(ARCHIVER) -endif - ifdef PIE LDFLAGS += -pie endif @@ -375,7 +324,7 @@ endif # Windows specific options #---------------------------------------------------------------------- ifeq "$(OS)" "Windows_NT" - ifneq (,$(findstring clang,$(CC))) + ifeq ($(CC_TYPE), clang) # Clang for Windows doesn't support C++ Exceptions CXXFLAGS += -fno-exceptions CXXFLAGS += -D_HAS_EXCEPTIONS=0 @@ -420,7 +369,7 @@ endif ifeq (1,$(USE_LIBSTDCPP)) # Clang requires an extra flag: -stdlib=libstdc++ - ifneq (,$(findstring clang,$(CC))) + ifeq ($(CC_TYPE), clang) # Force clang looking for the gcc's headers at specific rootfs folder. 
CXXFLAGS += -stdlib=libstdc++ $(GCC_TOOLCHAIN_FLAGS) LDFLAGS += -stdlib=libstdc++ $(GCC_TOOLCHAIN_FLAGS) @@ -458,7 +407,7 @@ ifeq (1, $(USE_SYSTEM_STDLIB)) CXXFLAGS += -nostdlib++ -nostdinc++ -cxx-isystem $(SDKROOT)/usr/include/c++/v1 LDFLAGS += -L$(SDKROOT)/usr/lib -Wl,-rpath,$(SDKROOT)/usr/lib -lc++ else - ifneq (,$(findstring clang,$(CC))) + ifeq ($(CC_TYPE),clang) # Force clang looking for the gcc's headers at specific rootfs folder. CXXFLAGS += $(GCC_TOOLCHAIN_FLAGS) LDFLAGS += $(GCC_TOOLCHAIN_FLAGS) @@ -485,8 +434,6 @@ DYLIB_OBJECTS +=$(strip $(DYLIB_C_SOURCES:.c=.o)) DYLIB_OBJECTS +=$(strip $(DYLIB_OBJC_SOURCES:.m=.o)) ifneq "$(strip $(DYLIB_CXX_SOURCES))" "" DYLIB_OBJECTS +=$(strip $(patsubst %.mm, %.o, $(DYLIB_CXX_SOURCES:.cpp=.o))) - CXX = $(call cxx_compiler,$(CC)) - LD = $(call cxx_linker,$(CC)) endif #---------------------------------------------------------------------- @@ -509,8 +456,6 @@ endif #---------------------------------------------------------------------- ifneq "$(strip $(CXX_SOURCES))" "" OBJECTS +=$(strip $(CXX_SOURCES:.cpp=.o)) - CXX = $(call cxx_compiler,$(CC)) - LD = $(call cxx_linker,$(CC)) endif #---------------------------------------------------------------------- @@ -526,19 +471,18 @@ endif #---------------------------------------------------------------------- ifneq "$(strip $(OBJCXX_SOURCES))" "" OBJECTS +=$(strip $(OBJCXX_SOURCES:.mm=.o)) - CXX = $(call cxx_compiler,$(CC)) - LD = $(call cxx_linker,$(CC)) ifeq "$(findstring lobjc,$(LDFLAGS))" "" LDFLAGS +=-lobjc endif endif -ifeq ($(findstring clang, $(CXX)), clang) +ifeq ($(CC_TYPE), clang) CXXFLAGS += --driver-mode=g++ endif ifneq "$(CXX)" "" - ifeq ($(findstring clang, $(LD)), clang) + # Specify the driver mode parameter if we use clang as the linker. + ifeq ($(LDC), clang) LDFLAGS += --driver-mode=g++ endif endif diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_ids/Makefile b/lldb/test/API/functionalities/breakpoint/breakpoint_ids/Makefile index 2c00681fa2280..778d3e58ab56f 100644 --- a/lldb/test/API/functionalities/breakpoint/breakpoint_ids/Makefile +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_ids/Makefile @@ -1,6 +1,6 @@ CXX_SOURCES := main.cpp -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CXXFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_locations/Makefile b/lldb/test/API/functionalities/breakpoint/breakpoint_locations/Makefile index 9645fd9cc8dfb..304633c2dca1f 100644 --- a/lldb/test/API/functionalities/breakpoint/breakpoint_locations/Makefile +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_locations/Makefile @@ -1,6 +1,6 @@ C_SOURCES := main.c -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/consecutive_breakpoints/Makefile b/lldb/test/API/functionalities/breakpoint/consecutive_breakpoints/Makefile index 2c00681fa2280..778d3e58ab56f 100644 --- a/lldb/test/API/functionalities/breakpoint/consecutive_breakpoints/Makefile +++ b/lldb/test/API/functionalities/breakpoint/consecutive_breakpoints/Makefile @@ -1,6 +1,6 @@ CXX_SOURCES := main.cpp -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CXXFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/cpp/Makefile b/lldb/test/API/functionalities/breakpoint/cpp/Makefile index 66108b79e7fe0..3b4be01d551f4 100644 --- a/lldb/test/API/functionalities/breakpoint/cpp/Makefile +++ 
b/lldb/test/API/functionalities/breakpoint/cpp/Makefile @@ -1,7 +1,7 @@ CXX_SOURCES := main.cpp CXXFLAGS_EXTRAS := -std=c++14 -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CXXFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/dummy_target_breakpoints/Makefile b/lldb/test/API/functionalities/breakpoint/dummy_target_breakpoints/Makefile index 9645fd9cc8dfb..304633c2dca1f 100644 --- a/lldb/test/API/functionalities/breakpoint/dummy_target_breakpoints/Makefile +++ b/lldb/test/API/functionalities/breakpoint/dummy_target_breakpoints/Makefile @@ -1,6 +1,6 @@ C_SOURCES := main.c -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/Makefile b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/Makefile index 9645fd9cc8dfb..304633c2dca1f 100644 --- a/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/Makefile +++ b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/require_hw_breakpoints/Makefile @@ -1,6 +1,6 @@ C_SOURCES := main.c -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/Makefile b/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/Makefile index 2c00681fa2280..778d3e58ab56f 100644 --- a/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/Makefile +++ b/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/Makefile @@ -1,6 +1,6 @@ CXX_SOURCES := main.cpp -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CXXFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/breakpoint/thread_plan_user_breakpoint/Makefile b/lldb/test/API/functionalities/breakpoint/thread_plan_user_breakpoint/Makefile index 2c00681fa2280..778d3e58ab56f 100644 --- a/lldb/test/API/functionalities/breakpoint/thread_plan_user_breakpoint/Makefile +++ b/lldb/test/API/functionalities/breakpoint/thread_plan_user_breakpoint/Makefile @@ -1,6 +1,6 @@ CXX_SOURCES := main.cpp -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CXXFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/ObjCDataFormatterTestCase.py b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/ObjCDataFormatterTestCase.py index a0d6802b3a506..c1cd9556c5ef3 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/ObjCDataFormatterTestCase.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/ObjCDataFormatterTestCase.py @@ -16,12 +16,12 @@ def appkit_tester_impl(self, commands, use_constant_classes): self.build() else: disable_constant_classes = { - "CC": "xcrun clang", # FIXME: Remove when flags are available upstream. "CFLAGS_EXTRAS": "-fno-constant-nsnumber-literals " + "-fno-constant-nsarray-literals " + "-fno-constant-nsdictionary-literals", } - self.build(dictionary=disable_constant_classes) + # FIXME: Remove compiler when flags are available upstream. 
+ self.build(dictionary=disable_constant_classes, compiler="xcrun clang") self.appkit_common_data_formatters_command() commands() diff --git a/lldb/test/API/functionalities/data-formatter/nsdictionarysynth/TestNSDictionarySynthetic.py b/lldb/test/API/functionalities/data-formatter/nsdictionarysynth/TestNSDictionarySynthetic.py index 9ac41d67eb9ab..e1d7e42bdd1a9 100644 --- a/lldb/test/API/functionalities/data-formatter/nsdictionarysynth/TestNSDictionarySynthetic.py +++ b/lldb/test/API/functionalities/data-formatter/nsdictionarysynth/TestNSDictionarySynthetic.py @@ -26,12 +26,12 @@ def test_rdar11988289_with_run_command(self): def test_rdar11988289_with_run_command_no_const(self): """Test that NSDictionary reports its synthetic children properly.""" disable_constant_classes = { - "CC": "xcrun clang", # FIXME: Remove when flags are available upstream. "CFLAGS_EXTRAS": "-fno-constant-nsnumber-literals " + "-fno-constant-nsarray-literals " + "-fno-constant-nsdictionary-literals", } - self.build(dictionary=disable_constant_classes) + # FIXME: Remove when flags are available upstream. + self.build(dictionary=disable_constant_classes, compiler="xcrun clang") self.run_tests() def run_tests(self): diff --git a/lldb/test/API/functionalities/data-formatter/nssetsynth/TestNSSetSynthetic.py b/lldb/test/API/functionalities/data-formatter/nssetsynth/TestNSSetSynthetic.py index 053ec0ee9757e..1037e75c17eb3 100644 --- a/lldb/test/API/functionalities/data-formatter/nssetsynth/TestNSSetSynthetic.py +++ b/lldb/test/API/functionalities/data-formatter/nssetsynth/TestNSSetSynthetic.py @@ -26,12 +26,12 @@ def test_rdar12529957_with_run_command(self): def test_rdar12529957_with_run_command_no_const(self): """Test that NSSet reports its synthetic children properly.""" disable_constant_classes = { - "CC": "xcrun clang", # FIXME: Remove when flags are available upstream. "CFLAGS_EXTRAS": "-fno-constant-nsnumber-literals " + "-fno-constant-nsarray-literals " + "-fno-constant-nsdictionary-literals", } - self.build(dictionary=disable_constant_classes) + # FIXME: Remove compiler when flags are available upstream. + self.build(dictionary=disable_constant_classes, compiler="xcrun clang") self.run_tests() def run_tests(self): diff --git a/lldb/test/API/functionalities/data-formatter/poarray/TestPrintObjectArray.py b/lldb/test/API/functionalities/data-formatter/poarray/TestPrintObjectArray.py index fff37829cd20d..db86f48f8ec1f 100644 --- a/lldb/test/API/functionalities/data-formatter/poarray/TestPrintObjectArray.py +++ b/lldb/test/API/functionalities/data-formatter/poarray/TestPrintObjectArray.py @@ -20,13 +20,13 @@ def test_print_array(self): def test_print_array_no_const(self): """Test that expr -O -Z works""" disable_constant_classes = { - "CC": "xcrun clang", # FIXME: Remove when flags are available upstream. "USE_SYSTEM_STDLIB": "1", # See above. "CFLAGS_EXTRAS": "-fno-constant-nsnumber-literals " + "-fno-constant-nsarray-literals " + "-fno-constant-nsdictionary-literals", } - self.build(dictionary=disable_constant_classes) + # FIXME: Remove compiler when flags are available upstream. 
+ self.build(dictionary=disable_constant_classes, compiler="xcrun clang") self.printarray_data_formatter_commands() def setUp(self): diff --git a/lldb/test/API/functionalities/inline-stepping/Makefile b/lldb/test/API/functionalities/inline-stepping/Makefile index 362b89d7f995b..bf646c7b7db33 100644 --- a/lldb/test/API/functionalities/inline-stepping/Makefile +++ b/lldb/test/API/functionalities/inline-stepping/Makefile @@ -1,6 +1,6 @@ CXX_SOURCES := calling.cpp -ifneq (,$(findstring icc,$(CC))) +ifeq ($(CC_TYPE), icc) CXXFLAGS_EXTRAS := -debug inline-debug-info endif diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/makefile.txt b/lldb/test/API/functionalities/postmortem/minidump-new/makefile.txt index 7096efabdcfe1..d594b585b2d5f 100644 --- a/lldb/test/API/functionalities/postmortem/minidump-new/makefile.txt +++ b/lldb/test/API/functionalities/postmortem/minidump-new/makefile.txt @@ -19,6 +19,7 @@ # to generate a Minidump when the binary crashes/requests such. # CC=g++ +CC_TYPE=gcc FLAGS=-g --std=c++11 INCLUDE=-I$HOME/breakpad/src/src/ LINK=-L. -lbreakpad -lpthread -nostdlib -lc -lstdc++ -lgcc_s -fno-exceptions diff --git a/lldb/test/API/lang/objc/orderedset/TestOrderedSet.py b/lldb/test/API/lang/objc/orderedset/TestOrderedSet.py index 14bfc322979b3..a7d6d9d155efc 100644 --- a/lldb/test/API/lang/objc/orderedset/TestOrderedSet.py +++ b/lldb/test/API/lang/objc/orderedset/TestOrderedSet.py @@ -12,12 +12,12 @@ def test_ordered_set(self): @skipUnlessDarwin def test_ordered_set_no_const(self): disable_constant_classes = { - "CC": "xcrun clang", # FIXME: Remove when flags are available upstream. "CFLAGS_EXTRAS": "-fno-constant-nsnumber-literals " + "-fno-constant-nsarray-literals " + "-fno-constant-nsdictionary-literals", } - self.build(dictionary=disable_constant_classes) + # FIXME: Remove when flags are available upstream. + self.build(dictionary=disable_constant_classes, compiler="xcrun clang") self.run_tests() def run_tests(self): diff --git a/lldb/test/API/lang/objc/single-entry-dictionary/TestObjCSingleEntryDictionary.py b/lldb/test/API/lang/objc/single-entry-dictionary/TestObjCSingleEntryDictionary.py index 68c0af76b8e3b..8debe731dfe1a 100644 --- a/lldb/test/API/lang/objc/single-entry-dictionary/TestObjCSingleEntryDictionary.py +++ b/lldb/test/API/lang/objc/single-entry-dictionary/TestObjCSingleEntryDictionary.py @@ -28,12 +28,12 @@ def test_single_entry_dict(self): ) # bug in NSDictionary formatting on watchos def test_single_entry_dict_no_const(self): disable_constant_classes = { - "CC": "xcrun clang", # FIXME: Remove when flags are available upstream. "CFLAGS_EXTRAS": "-fno-constant-nsnumber-literals " + "-fno-constant-nsarray-literals " + "-fno-constant-nsdictionary-literals", } - self.build(dictionary=disable_constant_classes) + # FIXME: Remove compiler when flags are available upstream. + self.build(dictionary=disable_constant_classes, compiler="xcrun clang") self.run_tests() def run_tests(self): diff --git a/lldb/test/API/macosx/macCatalyst/Makefile b/lldb/test/API/macosx/macCatalyst/Makefile index 3f084968a2d57..ef17d89d2372c 100644 --- a/lldb/test/API/macosx/macCatalyst/Makefile +++ b/lldb/test/API/macosx/macCatalyst/Makefile @@ -7,6 +7,7 @@ USE_SYSTEM_STDLIB := 1 # FIXME: rdar://problem/54986190 # There is a Clang driver change missing on llvm.org. 
+override CC_TYPE=clang
 override CC=xcrun clang
 
 include Makefile.rules
diff --git a/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile b/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile
index b24fe3f574ccf..c77a186724fda 100644
--- a/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile
+++ b/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile
@@ -5,6 +5,7 @@ override TRIPLE := $(ARCH)-apple-ios13.0-macabi
 CFLAGS_EXTRAS := -target $(TRIPLE)
 
 # FIXME: rdar://problem/54986190
+override CC_TYPE=clang
 override CC=xcrun clang
 
 all: libfoo.dylib a.out
diff --git a/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py b/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
index b712afdd7560a..3f5645a486bcb 100644
--- a/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
+++ b/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
@@ -59,10 +59,10 @@ def run_with(self, arch, os, vers, env, expected_load_command):
         self.build(
             dictionary={
                 "ARCH": arch,
-                "CC": clang,
                 "ARCH_CFLAGS": "-target {} {}".format(triple, version_min),
                 "SDKROOT": sdk_root,
-            }
+            },
+            compiler=clang,
         )
         self.check_load_commands(expected_load_command)
 
diff --git a/lldb/test/API/python_api/frame/inlines/Makefile b/lldb/test/API/python_api/frame/inlines/Makefile
index e6d9d8310a0fa..cf17569a5e351 100644
--- a/lldb/test/API/python_api/frame/inlines/Makefile
+++ b/lldb/test/API/python_api/frame/inlines/Makefile
@@ -1,6 +1,6 @@
 C_SOURCES := inlines.c
 
-ifneq (,$(findstring icc,$(CC)))
+ifeq ($(CC_TYPE), icc)
 	CFLAGS_EXTRAS := -debug inline-debug-info
 endif
 
diff --git a/lldb/test/API/tools/lldb-server/TestAppleSimulatorOSType.py b/lldb/test/API/tools/lldb-server/TestAppleSimulatorOSType.py
index 16297efe14372..ed47f94e9492b 100644
--- a/lldb/test/API/tools/lldb-server/TestAppleSimulatorOSType.py
+++ b/lldb/test/API/tools/lldb-server/TestAppleSimulatorOSType.py
@@ -71,12 +71,12 @@ def check_simulator_ostype(self, sdk, platform_name, arch=platform.machine()):
         self.build(
             dictionary={
                 "EXE": exe_name,
-                "CC": clang,
                 "SDKROOT": sdkroot.strip(),
                 "ARCH": arch,
                 "ARCH_CFLAGS": "-target {} {}".format(triple, version_min),
                 "USE_SYSTEM_STDLIB": 1,
-            }
+            },
+            compiler=clang,
         )
         exe_path = os.path.realpath(self.getBuildArtifact(exe_name))
         cmd = [

From ba4bcce5f5ffa9e7d4af72c20fe4f1baf97075fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thorsten=20Sch=C3=BCtt?=
Date: Wed, 11 Sep 2024 15:04:55 +0200
Subject: [PATCH 100/114] [GlobalIsel] Combine trunc of binop (#107721)

trunc (binop X, C) --> binop (trunc X, trunc C) --> binop (trunc X, C')

Try to narrow the width of math or bitwise logic instructions by
pulling a truncate ahead of binary operators.

Vx and Nx cores consider 32-bit and 64-bit basic arithmetic to be equal
in cost.
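
For illustration, here is a minimal sketch of the transform in MIR
(register names are illustrative; the shape mirrors the new AArch64
test added below):

  Before:
    %lhs:_(s64) = COPY $x0
    %rhs:_(s64) = G_CONSTANT i64 5
    %res:_(s64) = G_XOR %lhs, %rhs
    %small:_(s32) = G_TRUNC %res(s64)

  After:
    %lhs:_(s64) = COPY $x0
    %t:_(s32) = G_TRUNC %lhs(s64)
    %c:_(s32) = G_CONSTANT i32 5
    %small:_(s32) = G_XOR %t, %c

The combine fires only when the binop result has a single (non-debug)
use and the narrowed operation is legal for the target, or we are still
running before the legalizer.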
--- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 6 + .../include/llvm/Target/GlobalISel/Combine.td | 36 +- .../GlobalISel/CombinerHelperCasts.cpp | 46 + .../GlobalISel/combine-narrow-binop.mir | 136 + .../AArch64/GlobalISel/inline-memset.mir | 8 +- ...izer-combiner-divrem-insertpt-conflict.mir | 9 +- .../AMDGPU/GlobalISel/combine-itofp.mir | 16 +- .../AMDGPU/GlobalISel/combine-zext-trunc.mir | 26 +- ...-divergent-i1-phis-no-lane-mask-merging.ll | 7 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 3021 +++++++++-------- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 2940 ++++++++-------- .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 73 +- .../AMDGPU/GlobalISel/shl-ext-reduce.ll | 3 + .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 64 +- .../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 18 +- .../CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 14 +- llvm/test/CodeGen/AMDGPU/constrained-shift.ll | 2 - llvm/test/CodeGen/AMDGPU/ctlz.ll | 4 +- 18 files changed, 3396 insertions(+), 3033 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 9b62d6067be39..828532dcffb7d 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -831,6 +831,12 @@ class CombinerHelper { /// Combine ors. bool matchOr(MachineInstr &MI, BuildFnTy &MatchInfo); + /// trunc (binop X, C) --> binop (trunc X, trunc C). + bool matchNarrowBinop(const MachineInstr &TruncMI, + const MachineInstr &BinopMI, BuildFnTy &MatchInfo); + + bool matchCastOfInteger(const MachineInstr &CastMI, APInt &MatchInfo); + /// Combine addos. bool matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo); diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 525cc815e73ce..a595a51d7b01f 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1867,6 +1867,33 @@ class buildvector_of_opcode : GICombineRule < def buildvector_of_truncate : buildvector_of_opcode; +// narrow binop. +// trunc (binop X, C) --> binop (trunc X, trunc C) +class narrow_binop_opcode : GICombineRule < + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (G_CONSTANT $const, $imm), + (binopOpcode $binop, $x, $const):$Binop, + (G_TRUNC $root, $binop):$Trunc, + [{ return Helper.matchNarrowBinop(*${Trunc}, *${Binop}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${Trunc}, ${matchinfo}); }])>; + +def narrow_binop_add : narrow_binop_opcode; +def narrow_binop_sub : narrow_binop_opcode; +def narrow_binop_mul : narrow_binop_opcode; +def narrow_binop_and : narrow_binop_opcode; +def narrow_binop_or : narrow_binop_opcode; +def narrow_binop_xor : narrow_binop_opcode; + +// Cast of integer. 
+class integer_of_opcode : GICombineRule < + (defs root:$root, apint_matchinfo:$matchinfo), + (match (G_CONSTANT $int, $imm), + (castOpcode $root, $int):$Cast, + [{ return Helper.matchCastOfInteger(*${Cast}, ${matchinfo}); }]), + (apply [{ Helper.replaceInstWithConstant(*${Cast}, ${matchinfo}); }])>; + +def integer_of_truncate : integer_of_opcode; + def cast_combines: GICombineGroup<[ truncate_of_zext, truncate_of_sext, @@ -1881,7 +1908,14 @@ def cast_combines: GICombineGroup<[ anyext_of_anyext, anyext_of_zext, anyext_of_sext, - buildvector_of_truncate + buildvector_of_truncate, + narrow_binop_add, + narrow_binop_sub, + narrow_binop_mul, + narrow_binop_and, + narrow_binop_or, + narrow_binop_xor, + integer_of_truncate ]>; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp index 8714fdabf6549..30557e6a2304e 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCasts.cpp @@ -313,3 +313,49 @@ bool CombinerHelper::matchCastOfBuildVector(const MachineInstr &CastMI, return true; } + +bool CombinerHelper::matchNarrowBinop(const MachineInstr &TruncMI, + const MachineInstr &BinopMI, + BuildFnTy &MatchInfo) { + const GTrunc *Trunc = cast(&TruncMI); + const GBinOp *BinOp = cast(&BinopMI); + + if (!MRI.hasOneNonDBGUse(BinOp->getReg(0))) + return false; + + Register Dst = Trunc->getReg(0); + LLT DstTy = MRI.getType(Dst); + + // Is narrow binop legal? + if (!isLegalOrBeforeLegalizer({BinOp->getOpcode(), {DstTy}})) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + auto LHS = B.buildTrunc(DstTy, BinOp->getLHSReg()); + auto RHS = B.buildTrunc(DstTy, BinOp->getRHSReg()); + B.buildInstr(BinOp->getOpcode(), {Dst}, {LHS, RHS}); + }; + + return true; +} + +bool CombinerHelper::matchCastOfInteger(const MachineInstr &CastMI, + APInt &MatchInfo) { + const GExtOrTruncOp *Cast = cast(&CastMI); + + APInt Input = getIConstantFromReg(Cast->getSrcReg(), MRI); + + LLT DstTy = MRI.getType(Cast->getReg(0)); + + if (!isConstantLegalOrBeforeLegalizer(DstTy)) + return false; + + switch (Cast->getOpcode()) { + case TargetOpcode::G_TRUNC: { + MatchInfo = Input.trunc(DstTy.getScalarSizeInBits()); + return true; + } + default: + return false; + } +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir new file mode 100644 index 0000000000000..f207e9c149a47 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir @@ -0,0 +1,136 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s --check-prefixes=CHECK + +--- +name: test_combine_trunc_xor_i64 +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_xor_i64 + ; CHECK: %lhs:_(s64) = COPY $x0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %small:_(s32) = G_XOR [[TRUNC]], [[C]] + ; CHECK-NEXT: $w0 = COPY %small(s32) + %lhs:_(s64) = COPY $x0 + %rhs:_(s64) = G_CONSTANT i64 5 + %res:_(s64) = G_XOR %lhs, %rhs + %small:_(s32) = G_TRUNC %res(s64) + $w0 = COPY %small(s32) +... 
+--- +name: test_combine_trunc_add_i64 +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_add_i64 + ; CHECK: %lhs:_(s64) = COPY $x0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %small:_(s32) = G_ADD [[TRUNC]], [[C]] + ; CHECK-NEXT: $w0 = COPY %small(s32) + %lhs:_(s64) = COPY $x0 + %rhs:_(s64) = G_CONSTANT i64 5 + %res:_(s64) = G_ADD %lhs, %rhs + %small:_(s32) = G_TRUNC %res(s64) + $w0 = COPY %small(s32) +... +--- +name: test_combine_trunc_mul_i64 +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_mul_i64 + ; CHECK: %lhs:_(s64) = COPY $x0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %small:_(s32) = G_MUL [[TRUNC]], [[C]] + ; CHECK-NEXT: $w0 = COPY %small(s32) + %lhs:_(s64) = COPY $x0 + %rhs:_(s64) = G_CONSTANT i64 5 + %res:_(s64) = G_MUL %lhs, %rhs + %small:_(s32) = G_TRUNC %res(s64) + $w0 = COPY %small(s32) +... +--- +name: test_combine_trunc_and_i64 +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_and_i64 + ; CHECK: %lhs:_(s64) = COPY $x0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %small:_(s32) = G_AND [[TRUNC]], [[C]] + ; CHECK-NEXT: $w0 = COPY %small(s32) + %lhs:_(s64) = COPY $x0 + %rhs:_(s64) = G_CONSTANT i64 5 + %res:_(s64) = G_AND %lhs, %rhs + %small:_(s32) = G_TRUNC %res(s64) + $w0 = COPY %small(s32) +... +--- +name: test_combine_trunc_or_i64 +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_or_i64 + ; CHECK: %lhs:_(s64) = COPY $x0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %small:_(s32) = G_OR [[TRUNC]], [[C]] + ; CHECK-NEXT: $w0 = COPY %small(s32) + %lhs:_(s64) = COPY $x0 + %rhs:_(s64) = G_CONSTANT i64 5 + %res:_(s64) = G_OR %lhs, %rhs + %small:_(s32) = G_TRUNC %res(s64) + $w0 = COPY %small(s32) +... +--- +name: test_combine_trunc_sub_i128 +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_sub_i128 + ; CHECK: %lhs:_(s128) = COPY $q0 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s128) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %small:_(s32) = G_SUB [[TRUNC]], [[C]] + ; CHECK-NEXT: $w0 = COPY %small(s32) + %lhs:_(s128) = COPY $q0 + %rhs:_(s128) = G_CONSTANT i128 5 + %res:_(s128) = G_SUB %lhs, %rhs + %small:_(s32) = G_TRUNC %res(s128) + $w0 = COPY %small(s32) +... +--- +name: test_combine_trunc_sub_i128_multi_use +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_sub_i128_multi_use + ; CHECK: %lhs:_(s128) = COPY $q0 + ; CHECK-NEXT: %rhs:_(s128) = G_CONSTANT i128 5 + ; CHECK-NEXT: %res:_(s128) = G_SUB %lhs, %rhs + ; CHECK-NEXT: %small:_(s32) = G_TRUNC %res(s128) + ; CHECK-NEXT: $q0 = COPY %res(s128) + ; CHECK-NEXT: $w0 = COPY %small(s32) + %lhs:_(s128) = COPY $q0 + %rhs:_(s128) = G_CONSTANT i128 5 + %res:_(s128) = G_SUB %lhs, %rhs + %small:_(s32) = G_TRUNC %res(s128) + $q0 = COPY %res(s128) + $w0 = COPY %small(s32) +... 
+--- +name: test_combine_trunc_xor_vector_pattern_did_not_match +body: | + bb.1: + ; CHECK-LABEL: name: test_combine_trunc_xor_vector_pattern_did_not_match + ; CHECK: %arg1:_(s64) = COPY $x0 + ; CHECK-NEXT: %arg2:_(s64) = COPY $x0 + ; CHECK-NEXT: %lhs:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) + ; CHECK-NEXT: %rhs:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) + ; CHECK-NEXT: %res:_(<2 x s64>) = G_XOR %lhs, %rhs + ; CHECK-NEXT: %small:_(<2 x s16>) = G_TRUNC %res(<2 x s64>) + ; CHECK-NEXT: $w0 = COPY %small(<2 x s16>) + %arg1:_(s64) = COPY $x0 + %arg2:_(s64) = COPY $x0 + %lhs:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) + %rhs:_(<2 x s64>) = G_BUILD_VECTOR %arg1(s64), %arg2(s64) + %res:_(<2 x s64>) = G_XOR %lhs, %rhs + %small:_(<2 x s16>) = G_TRUNC %res(<2 x s64>) + $w0 = COPY %small(<2 x s16>) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir index fee5afd3ddbb2..9ed1e2d9eee3b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir @@ -224,10 +224,10 @@ body: | ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) ; CHECK-NEXT: G_STORE [[C]](s64), [[PTR_ADD]](p0) :: (store (s64) into %ir.dst + 8, align 1) - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64) - ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; CHECK-NEXT: G_STORE [[TRUNC]](s16), [[PTR_ADD1]](p0) :: (store (s16) into %ir.dst + 16, align 1) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 16448 + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) + ; CHECK-NEXT: G_STORE [[C2]](s16), [[PTR_ADD1]](p0) :: (store (s16) into %ir.dst + 16, align 1) ; CHECK-NEXT: RET_ReallyLR %0:_(p0) = COPY $x0 %1:_(s8) = G_CONSTANT i8 64 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir index e51d9bd13163b..a87ff305d1535 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-divrem-insertpt-conflict.mir @@ -8,9 +8,8 @@ tracksRegLiveness: true body: | bb.1: ; CHECK-LABEL: name: test - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[C]](s64) - ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: $w0 = COPY [[C]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(s16) = G_CONSTANT i16 0 %2:_(s1) = G_CONSTANT i1 true @@ -41,9 +40,7 @@ body: | bb.1: ; CHECK-LABEL: name: test_inverted_div_rem ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[C]](s32) - ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s8) - ; CHECK-NEXT: $w0 = COPY [[SEXT]](s32) + ; CHECK-NEXT: $w0 = COPY [[C]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 %0:_(s16) = G_CONSTANT i16 0 %2:_(s1) = G_CONSTANT i1 true diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir index e4f11dfa9e027..d6135d86022be 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir @@ -193,10 +193,10 @@ body: | ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) - ; CHECK-NEXT: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[TRUNC]] + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] + ; CHECK-NEXT: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_CONSTANT i64 255 @@ -216,10 +216,10 @@ body: | ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) - ; CHECK-NEXT: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[TRUNC]] + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] + ; CHECK-NEXT: [[AMDGPU_CVT_F32_UBYTE0_:%[0-9]+]]:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 [[AND]] ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CVT_F32_UBYTE0_]](s32) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s64) = G_CONSTANT i64 255 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir index 3b914df7f8f8a..3423af64162e5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-zext-trunc.mir @@ -12,9 +12,11 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: %var:_(s32) = COPY $vgpr0 - ; GCN-NEXT: %c3FFF:_(s32) = G_CONSTANT i32 16383 - ; GCN-NEXT: %low_bits:_(s32) = G_AND %var, %c3FFF - ; GCN-NEXT: $vgpr0 = COPY %low_bits(s32) + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s32) + ; GCN-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 16383 + ; GCN-NEXT: %trunc:_(s16) = G_AND [[TRUNC]], [[C]] + ; GCN-NEXT: %zext:_(s32) = G_ZEXT %trunc(s16) + ; GCN-NEXT: $vgpr0 = COPY %zext(s32) %var:_(s32) = COPY $vgpr0 %c3FFF:_(s32) = G_CONSTANT i32 16383 %low_bits:_(s32) = G_AND %var, %c3FFF @@ -34,10 +36,8 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: %var:_(s32) = COPY $vgpr0 - ; GCN-NEXT: %cFFFFF:_(s32) = G_CONSTANT i32 1048575 - ; GCN-NEXT: %low_bits:_(s32) = G_AND %var, %cFFFFF - ; GCN-NEXT: %trunc:_(s16) = G_TRUNC %low_bits(s32) - ; GCN-NEXT: %zext:_(s32) = G_ZEXT %trunc(s16) + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s32) + ; GCN-NEXT: %zext:_(s32) = G_ZEXT [[TRUNC]](s16) ; GCN-NEXT: $vgpr0 = COPY %zext(s32) %var:_(s32) = COPY $vgpr0 %cFFFFF:_(s32) = G_CONSTANT i32 1048575 @@ -58,9 +58,9 @@ body: | ; GCN: liveins: $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: %var:_(s64) = COPY $vgpr0_vgpr1 - ; GCN-NEXT: %c3FFF:_(s64) = G_CONSTANT i64 16383 - ; GCN-NEXT: %low_bits:_(s64) = G_AND %var, %c3FFF - ; GCN-NEXT: %trunc:_(s16) = G_TRUNC %low_bits(s64) + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s64) + ; GCN-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 16383 + ; GCN-NEXT: 
%trunc:_(s16) = G_AND [[TRUNC]], [[C]] ; GCN-NEXT: %zext:_(s32) = G_ZEXT %trunc(s16) ; GCN-NEXT: $vgpr0 = COPY %zext(s32) %var:_(s64) = COPY $vgpr0_vgpr1 @@ -82,9 +82,9 @@ body: | ; GCN: liveins: $vgpr0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: %var:_(s32) = COPY $vgpr0 - ; GCN-NEXT: %c3FFF:_(s32) = G_CONSTANT i32 16383 - ; GCN-NEXT: %low_bits:_(s32) = G_AND %var, %c3FFF - ; GCN-NEXT: %trunc:_(s16) = G_TRUNC %low_bits(s32) + ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC %var(s32) + ; GCN-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 16383 + ; GCN-NEXT: %trunc:_(s16) = G_AND [[TRUNC]], [[C]] ; GCN-NEXT: %zext:_(s64) = G_ZEXT %trunc(s16) ; GCN-NEXT: $vgpr0_vgpr1 = COPY %zext(s64) %var:_(s32) = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll index 966a481b6594d..bb7bc0447aea0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll @@ -238,13 +238,12 @@ define amdgpu_cs void @single_lane_execution_attribute(i32 inreg %.userdata0, <3 ; GFX10-NEXT: s_load_dwordx8 s[4:11], s[12:13], 0x0 ; GFX10-NEXT: v_mbcnt_hi_u32_b32 v1, -1, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v1 -; GFX10-NEXT: v_and_b32_e32 v3, 1, v1 -; GFX10-NEXT: v_xor_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_xor_b32_e32 v3, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 ; GFX10-NEXT: ; implicit-def: $vgpr3 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v2 ; GFX10-NEXT: s_cbranch_vccnz .LBB4_4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index afffebea451a0..3bd3486ec261d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -350,10 +350,12 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX8-LABEL: s_fshl_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_and_b32 s3, s2, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, s3 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 @@ -362,10 +364,12 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX9-LABEL: s_fshl_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_and_b32 s3, s2, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, s3 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 @@ -377,7 +381,9 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX10-NEXT: s_and_b32 s3, s2, 7 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; 
GFX10-NEXT: s_lshl_b32 s0, s0, s3 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -389,7 +395,9 @@ define amdgpu_ps i8 @s_fshl_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; GFX11-NEXT: s_and_b32 s3, s2, 7 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_lshl_b32 s0, s0, s3 ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -416,11 +424,11 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX8-NEXT: v_mov_b32_e32 v3, 1 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -429,11 +437,11 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 1 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -441,11 +449,11 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX10-LABEL: v_fshl_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_not_b32_e32 v3, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -454,12 +462,12 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX11-LABEL: v_fshl_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_not_b32_e32 v3, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX11-NEXT: v_lshrrev_b16 v1, v3, v1 @@ -692,22 +700,26 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s4, s1, 8 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s5, s2, 8 ; GFX8-NEXT: s_and_b32 s6, s2, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 
s3, s0, 8 ; GFX8-NEXT: s_lshl_b32 s0, s0, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s5, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s2, s4, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_and_b32 s3, s4, 0xff +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_andn2_b32 s3, 7, s5 +; GFX8-NEXT: s_lshr_b32 s2, s2, 1 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_andn2_b32 s2, 7, s5 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 -; GFX8-NEXT: s_lshr_b32 s2, s3, s2 +; GFX8-NEXT: s_lshr_b32 s2, s2, s3 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_and_b32 s0, s0, 0xff @@ -719,22 +731,26 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s4, s1, 8 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshr_b32 s5, s2, 8 ; GFX9-NEXT: s_and_b32 s6, s2, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshl_b32 s0, s0, s6 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s5, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s2, s4, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s3, s4, 0xff +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_andn2_b32 s3, 7, s5 +; GFX9-NEXT: s_lshr_b32 s2, s2, 1 ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX9-NEXT: s_andn2_b32 s2, 7, s5 -; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_lshr_b32 s2, s3, s2 +; GFX9-NEXT: s_lshr_b32 s2, s2, s3 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_and_b32 s0, s0, 0xff @@ -745,21 +761,25 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX10-LABEL: s_fshl_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s4, s1, 8 -; GFX10-NEXT: s_lshr_b32 s5, s2, 8 +; GFX10-NEXT: s_and_b32 s5, s2, 7 +; GFX10-NEXT: s_lshr_b32 s6, s2, 8 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff -; GFX10-NEXT: s_and_b32 s6, s2, 7 +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10-NEXT: s_and_b32 s5, s6, 7 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_andn2_b32 s6, 7, s6 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s0, s0, s6 -; GFX10-NEXT: s_and_b32 s6, s5, 7 -; GFX10-NEXT: s_andn2_b32 s5, 7, s5 -; GFX10-NEXT: s_lshr_b32 s4, s4, 1 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_lshr_b32 s4, s4, 1 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 -; GFX10-NEXT: s_lshr_b32 s4, s4, s5 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_lshl_b32 s3, s3, s5 +; GFX10-NEXT: s_lshr_b32 s4, s4, s6 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s2, s3, s4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -772,21 +792,25 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX11-LABEL: s_fshl_v2i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s4, s1, 8 -; GFX11-NEXT: s_lshr_b32 s5, s2, 8 +; GFX11-NEXT: s_and_b32 s5, s2, 7 +; GFX11-NEXT: s_lshr_b32 s6, s2, 8 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX11-NEXT: 
s_and_b32 s4, s4, 0xff -; GFX11-NEXT: s_and_b32 s6, s2, 7 +; GFX11-NEXT: s_lshr_b32 s3, s0, 8 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff +; GFX11-NEXT: s_lshl_b32 s0, s0, s5 +; GFX11-NEXT: s_and_b32 s5, s6, 7 ; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX11-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-NEXT: s_and_not1_b32 s6, 7, s6 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 s0, s0, s6 -; GFX11-NEXT: s_and_b32 s6, s5, 7 -; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 -; GFX11-NEXT: s_lshr_b32 s4, s4, 1 ; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: s_lshr_b32 s4, s4, 1 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 -; GFX11-NEXT: s_lshl_b32 s3, s3, s6 -; GFX11-NEXT: s_lshr_b32 s4, s4, s5 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_lshl_b32 s3, s3, s5 +; GFX11-NEXT: s_lshr_b32 s4, s4, s6 ; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_or_b32 s2, s3, s4 ; GFX11-NEXT: s_or_b32 s0, s0, s1 @@ -837,20 +861,20 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX8-NEXT: v_not_b32_e32 v2, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0 ; GFX8-NEXT: v_mov_b32_e32 v6, 1 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 7, v5 -; GFX8-NEXT: v_not_b32_e32 v2, v5 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, v1, v3 -; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshrrev_b16_e32 v2, v2, v3 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX8-NEXT: v_lshrrev_b16_e32 v2, v3, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -863,20 +887,20 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, v6, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, 1 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 7, v5 -; GFX9-NEXT: v_not_b32_e32 v2, v5 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, v1, v3 -; GFX9-NEXT: v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshrrev_b16_e32 v2, v2, v3 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v5 +; GFX9-NEXT: v_lshrrev_b16_sdwa v2, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX9-NEXT: v_lshrrev_b16_e32 v2, v3, v2 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 ; 
GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 @@ -886,24 +910,24 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX10-LABEL: v_fshl_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX10-NEXT: v_not_b32_e32 v7, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX10-NEXT: v_not_b32_e32 v6, v3 -; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v4 +; GFX10-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX10-NEXT: v_lshrrev_b16 v4, 1, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX10-NEXT: v_lshlrev_b16 v3, v3, v5 +; GFX10-NEXT: v_lshrrev_b16 v3, 1, v3 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX10-NEXT: v_lshlrev_b16 v4, v4, v5 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 -; GFX10-NEXT: v_lshrrev_b16 v4, v6, v4 +; GFX10-NEXT: v_lshrrev_b16 v3, v6, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX10-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -913,26 +937,26 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX11-LABEL: v_fshl_v2i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX11-NEXT: v_not_b32_e32 v7, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 -; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-NEXT: v_xor_b32_e32 v6, -1, v4 +; GFX11-NEXT: v_and_b32_e32 v4, 7, v4 ; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX11-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX11-NEXT: v_lshrrev_b16 v4, 1, v4 ; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX11-NEXT: v_lshlrev_b16 v3, v3, v5 +; GFX11-NEXT: v_lshrrev_b16 v3, 1, v3 +; GFX11-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX11-NEXT: v_lshlrev_b16 v4, v4, v5 ; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b16 v4, v6, v4 +; GFX11-NEXT: v_lshrrev_b16 v3, v6, v3 ; GFX11-NEXT: v_lshrrev_b16 v1, v7, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_or_b32_e32 v2, v3, v4 +; GFX11-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v2 @@ -1002,13 +1026,15 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshr_b32 s8, s1, 24 ; GFX8-NEXT: 
s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_lshr_b32 s9, s2, 8 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 ; GFX8-NEXT: s_lshr_b32 s11, s2, 24 ; GFX8-NEXT: s_and_b32 s12, s2, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 +; GFX8-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 @@ -1016,29 +1042,35 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s9, 7 +; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s2, s6, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_and_b32 s3, s6, 0xff +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_andn2_b32 s3, 7, s9 +; GFX8-NEXT: s_lshr_b32 s2, s2, 1 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_andn2_b32 s2, 7, s9 -; GFX8-NEXT: s_lshr_b32 s3, s3, 1 -; GFX8-NEXT: s_lshr_b32 s2, s3, s2 +; GFX8-NEXT: s_lshr_b32 s2, s2, s3 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s2, s10, 7 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_and_b32 s3, s7, 0xff ; GFX8-NEXT: s_lshl_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s4, s7, 0xff +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_andn2_b32 s4, 7, s10 +; GFX8-NEXT: s_lshr_b32 s3, s3, 1 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX8-NEXT: s_andn2_b32 s3, 7, s10 -; GFX8-NEXT: s_lshr_b32 s4, s4, 1 -; GFX8-NEXT: s_lshr_b32 s3, s4, s3 +; GFX8-NEXT: s_lshr_b32 s3, s3, s4 ; GFX8-NEXT: s_or_b32 s2, s2, s3 ; GFX8-NEXT: s_and_b32 s3, s11, 7 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_andn2_b32 s4, 7, s11 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s3, s5, s3 -; GFX8-NEXT: s_lshr_b32 s5, s8, 1 +; GFX8-NEXT: s_andn2_b32 s5, 7, s11 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_lshr_b32 s4, s8, 1 +; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_lshr_b32 s4, s5, s4 +; GFX8-NEXT: s_lshr_b32 s4, s4, s5 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NEXT: s_or_b32 s3, s3, s4 @@ -1055,13 +1087,15 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 24 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_lshr_b32 s9, s2, 8 ; GFX9-NEXT: s_lshr_b32 s10, s2, 16 ; GFX9-NEXT: s_lshr_b32 s11, s2, 24 ; GFX9-NEXT: s_and_b32 s12, s2, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 +; GFX9-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 @@ -1069,29 +1103,35 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s9, 7 +; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s2, s6, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s3, s6, 0xff +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_andn2_b32 s3, 7, s9 +; GFX9-NEXT: s_lshr_b32 s2, s2, 1 ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX9-NEXT: s_andn2_b32 s2, 7, s9 -; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; 
GFX9-NEXT: s_lshr_b32 s2, s3, s2 +; GFX9-NEXT: s_lshr_b32 s2, s2, s3 ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s2, s10, 7 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_and_b32 s3, s7, 0xff ; GFX9-NEXT: s_lshl_b32 s2, s4, s2 -; GFX9-NEXT: s_and_b32 s4, s7, 0xff +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX9-NEXT: s_andn2_b32 s4, 7, s10 +; GFX9-NEXT: s_lshr_b32 s3, s3, 1 ; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX9-NEXT: s_andn2_b32 s3, 7, s10 -; GFX9-NEXT: s_lshr_b32 s4, s4, 1 -; GFX9-NEXT: s_lshr_b32 s3, s4, s3 +; GFX9-NEXT: s_lshr_b32 s3, s3, s4 ; GFX9-NEXT: s_or_b32 s2, s2, s3 ; GFX9-NEXT: s_and_b32 s3, s11, 7 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_andn2_b32 s4, 7, s11 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_lshl_b32 s3, s5, s3 -; GFX9-NEXT: s_lshr_b32 s5, s8, 1 +; GFX9-NEXT: s_andn2_b32 s5, 7, s11 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_lshr_b32 s4, s8, 1 +; GFX9-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_lshr_b32 s4, s5, s4 +; GFX9-NEXT: s_lshr_b32 s4, s4, s5 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s2, 0xff ; GFX9-NEXT: s_or_b32 s3, s3, s4 @@ -1108,48 +1148,56 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX10-NEXT: s_lshr_b32 s7, s1, 16 ; GFX10-NEXT: s_lshr_b32 s8, s1, 24 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshr_b32 s9, s2, 8 +; GFX10-NEXT: s_and_b32 s11, s2, 7 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_lshr_b32 s11, s2, 24 -; GFX10-NEXT: s_and_b32 s12, s2, 7 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_andn2_b32 s12, 7, s2 +; GFX10-NEXT: s_and_b32 s11, 0xffff, s11 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 +; GFX10-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s1, s1, s2 -; GFX10-NEXT: s_and_b32 s2, s6, 0xff -; GFX10-NEXT: s_and_b32 s6, s9, 7 -; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX10-NEXT: s_andn2_b32 s9, 7, s9 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s12 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 -; GFX10-NEXT: s_lshr_b32 s2, s2, s9 +; GFX10-NEXT: s_lshr_b32 s9, s2, 8 +; GFX10-NEXT: s_lshl_b32 s0, s0, s11 +; GFX10-NEXT: s_lshr_b32 s1, s1, s12 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_or_b32 s1, s3, s2 -; GFX10-NEXT: s_and_b32 s2, s7, 0xff -; GFX10-NEXT: s_and_b32 s3, s10, 7 -; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX10-NEXT: s_andn2_b32 s6, 7, s10 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: s_and_b32 s1, s9, 7 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX10-NEXT: s_andn2_b32 s9, 7, s9 +; GFX10-NEXT: s_lshr_b32 s10, s2, 16 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_lshr_b32 s6, s6, 1 +; GFX10-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX10-NEXT: s_lshl_b32 s1, s3, s1 +; GFX10-NEXT: s_lshr_b32 s3, s6, s9 +; GFX10-NEXT: s_and_b32 s6, s10, 7 +; GFX10-NEXT: s_or_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s6 +; GFX10-NEXT: s_and_b32 s6, s7, 0xff +; GFX10-NEXT: s_lshr_b32 s2, s2, 24 ; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshr_b32 s2, s2, s6 -; GFX10-NEXT: s_and_b32 s4, s11, 7 -; GFX10-NEXT: s_andn2_b32 s6, 7, s11 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s6 +; GFX10-NEXT: s_andn2_b32 s6, 7, s10 +; GFX10-NEXT: s_lshr_b32 s4, s4, 1 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX10-NEXT: s_and_b32 s7, 
s2, 7 +; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_lshr_b32 s4, s4, s6 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s7 ; GFX10-NEXT: s_lshr_b32 s7, s8, 1 -; GFX10-NEXT: s_lshl_b32 s4, s5, s4 -; GFX10-NEXT: s_lshr_b32 s5, s7, s6 -; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_lshl_b32 s5, s5, s6 +; GFX10-NEXT: s_lshr_b32 s2, s7, s2 +; GFX10-NEXT: s_or_b32 s3, s3, s4 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_or_b32 s3, s4, s5 +; GFX10-NEXT: s_or_b32 s2, s5, s2 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: s_and_b32 s3, s3, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s3, 0xff +; GFX10-NEXT: s_lshl_b32 s1, s3, 16 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -1161,48 +1209,56 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX11-NEXT: s_lshr_b32 s7, s1, 16 ; GFX11-NEXT: s_lshr_b32 s8, s1, 24 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-NEXT: s_and_b32 s11, s2, 7 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshr_b32 s10, s2, 16 -; GFX11-NEXT: s_lshr_b32 s11, s2, 24 -; GFX11-NEXT: s_and_b32 s12, s2, 7 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_not1_b32 s12, 7, s2 +; GFX11-NEXT: s_and_b32 s11, 0xffff, s11 ; GFX11-NEXT: s_lshr_b32 s1, s1, 1 +; GFX11-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s1, s1, s2 -; GFX11-NEXT: s_and_b32 s2, s6, 0xff -; GFX11-NEXT: s_and_b32 s6, s9, 7 -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 ; GFX11-NEXT: s_lshr_b32 s4, s0, 16 ; GFX11-NEXT: s_lshr_b32 s5, s0, 24 -; GFX11-NEXT: s_lshl_b32 s0, s0, s12 -; GFX11-NEXT: s_lshl_b32 s3, s3, s6 -; GFX11-NEXT: s_lshr_b32 s2, s2, s9 +; GFX11-NEXT: s_lshr_b32 s9, s2, 8 +; GFX11-NEXT: s_lshl_b32 s0, s0, s11 +; GFX11-NEXT: s_lshr_b32 s1, s1, s12 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s3, s2 -; GFX11-NEXT: s_and_b32 s2, s7, 0xff -; GFX11-NEXT: s_and_b32 s3, s10, 7 -; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX11-NEXT: s_and_not1_b32 s6, 7, s10 -; GFX11-NEXT: s_lshr_b32 s2, s2, 1 +; GFX11-NEXT: s_and_b32 s1, s9, 7 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 +; GFX11-NEXT: s_lshr_b32 s10, s2, 16 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_lshr_b32 s6, s6, 1 +; GFX11-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX11-NEXT: s_lshl_b32 s1, s3, s1 +; GFX11-NEXT: s_lshr_b32 s3, s6, s9 +; GFX11-NEXT: s_and_b32 s6, s10, 7 +; GFX11-NEXT: s_or_b32 s1, s1, s3 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s6 +; GFX11-NEXT: s_and_b32 s6, s7, 0xff +; GFX11-NEXT: s_lshr_b32 s2, s2, 24 ; GFX11-NEXT: s_lshl_b32 s3, s4, s3 -; GFX11-NEXT: s_lshr_b32 s2, s2, s6 -; GFX11-NEXT: s_and_b32 s4, s11, 7 -; GFX11-NEXT: s_and_not1_b32 s6, 7, s11 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s6 +; GFX11-NEXT: s_and_not1_b32 s6, 7, s10 +; GFX11-NEXT: s_lshr_b32 s4, s4, 1 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX11-NEXT: s_and_b32 s7, s2, 7 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_lshr_b32 s4, s4, s6 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s7 ; GFX11-NEXT: s_lshr_b32 s7, s8, 1 -; GFX11-NEXT: s_lshl_b32 s4, s5, s4 -; GFX11-NEXT: s_lshr_b32 s5, s7, s6 -; 
GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_lshl_b32 s5, s5, s6 +; GFX11-NEXT: s_lshr_b32 s2, s7, s2 +; GFX11-NEXT: s_or_b32 s3, s3, s4 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_or_b32 s3, s4, s5 +; GFX11-NEXT: s_or_b32 s2, s5, s2 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff ; GFX11-NEXT: s_lshl_b32 s1, s1, 8 -; GFX11-NEXT: s_and_b32 s2, s2, 0xff +; GFX11-NEXT: s_and_b32 s3, s3, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_lshl_b32 s1, s2, 16 -; GFX11-NEXT: s_and_b32 s2, s3, 0xff +; GFX11-NEXT: s_lshl_b32 s1, s3, 16 +; GFX11-NEXT: s_and_b32 s2, s2, 0xff ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_lshl_b32 s1, s2, 24 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -1271,37 +1327,38 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX8-LABEL: v_fshl_v4i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_not_b32_e32 v7, v2 -; GFX8-NEXT: v_mov_b32_e32 v9, 1 +; GFX8-NEXT: v_mov_b32_e32 v8, 1 +; GFX8-NEXT: v_xor_b32_e32 v10, -1, v2 ; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX8-NEXT: v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v6, v6, v0 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v10 +; GFX8-NEXT: v_lshrrev_b16_e32 v9, v10, v9 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v5 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX8-NEXT: v_and_b32_e32 v9, 7, v5 +; GFX8-NEXT: v_xor_b32_e32 v5, -1, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, v7, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, v9, v3 ; GFX8-NEXT: v_lshrrev_b16_e32 v4, v5, v4 -; GFX8-NEXT: v_mov_b32_e32 v8, 0xff +; GFX8-NEXT: v_mov_b32_e32 v7, 0xff ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v4, 7 +; GFX8-NEXT: v_mov_b32_e32 v9, -1 ; GFX8-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v7, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_xor_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX8-NEXT: v_lshrrev_b16_e32 v8, 1, v8 +; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b16_e32 v7, 1, v7 +; GFX8-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; 
GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_lshrrev_b16_e32 v7, v7, v8 +; GFX8-NEXT: v_lshrrev_b16_e32 v7, v10, v7 ; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX8-NEXT: v_lshrrev_b16_e32 v1, v2, v1 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 @@ -1320,46 +1377,47 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX9-LABEL: v_fshl_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_not_b32_e32 v7, v2 -; GFX9-NEXT: v_mov_b32_e32 v9, 1 +; GFX9-NEXT: v_mov_b32_e32 v8, 1 +; GFX9-NEXT: v_xor_b32_e32 v10, -1, v2 ; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX9-NEXT: v_lshrrev_b16_sdwa v10, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v6, v6, v0 -; GFX9-NEXT: v_lshrrev_b16_e32 v7, v7, v10 +; GFX9-NEXT: v_lshrrev_b16_e32 v9, v10, v9 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 -; GFX9-NEXT: v_and_b32_e32 v7, 7, v5 -; GFX9-NEXT: v_not_b32_e32 v5, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v9 +; GFX9-NEXT: v_and_b32_e32 v9, 7, v5 +; GFX9-NEXT: v_xor_b32_e32 v5, -1, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, v7, v3 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, v9, v3 ; GFX9-NEXT: v_lshrrev_b16_e32 v4, v5, v4 -; GFX9-NEXT: v_mov_b32_e32 v8, 0xff +; GFX9-NEXT: v_mov_b32_e32 v7, 0xff ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_mov_b32_e32 v4, 7 +; GFX9-NEXT: v_mov_b32_e32 v10, -1 ; GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_xor_b32_sdwa v11, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX9-NEXT: v_lshrrev_b16_e32 v10, 1, v10 +; GFX9-NEXT: v_xor_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_e32 v9, 1, v9 +; GFX9-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshrrev_b16_e32 v7, v7, v10 +; GFX9-NEXT: v_lshrrev_b16_e32 v9, v11, v9 ; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX9-NEXT: v_lshrrev_b16_e32 v1, v2, v1 -; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v9 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v5 ; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_and_or_b32 v1, v6, v8, v1 +; GFX9-NEXT: v_and_or_b32 v1, v6, v7, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 @@ -1368,41 +1426,42 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10-LABEL: v_fshl_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v2 -; GFX10-NEXT: v_and_b32_e32 v9, 7, v2 -; GFX10-NEXT: v_and_b32_e32 v11, 0xff, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v9, 0xff, v1 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v6 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX10-NEXT: v_not_b32_e32 v12, v7 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v9, v0 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX10-NEXT: v_lshrrev_b16 v9, 1, v11 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v12 -; GFX10-NEXT: v_mov_b32_e32 v12, 0xff -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX10-NEXT: v_lshlrev_b16 v3, v7, v3 -; GFX10-NEXT: v_mov_b32_e32 v7, 7 -; GFX10-NEXT: v_not_b32_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_not_b32_sdwa v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX10-NEXT: v_not_b32_e32 v8, v2 -; GFX10-NEXT: v_lshrrev_b16 v6, 1, v6 -; GFX10-NEXT: v_and_b32_sdwa v14, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v13, 7, v13 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 +; GFX10-NEXT: v_lshrrev_b16 v8, 1, v9 +; GFX10-NEXT: v_and_b32_e32 v9, 7, v10 +; GFX10-NEXT: v_lshlrev_b16 v3, v11, v3 +; GFX10-NEXT: v_mov_b32_e32 v10, 0xff +; GFX10-NEXT: v_mov_b32_e32 v11, -1 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 0xff, v7 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6 +; GFX10-NEXT: v_mov_b32_e32 v13, 7 +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_xor_b32_sdwa v10, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_xor_b32_sdwa v11, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b16 v7, 1, v7 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_sdwa v14, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 -; GFX10-NEXT: v_and_b32_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; 
GFX10-NEXT: v_and_b32_e32 v7, 7, v12 -; GFX10-NEXT: v_lshrrev_b16 v10, 1, v10 -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 -; GFX10-NEXT: v_lshrrev_b16 v6, v11, v6 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b16 v12, 1, v12 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7 ; GFX10-NEXT: v_lshlrev_b16 v4, v14, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v13, v1 +; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1 ; GFX10-NEXT: v_lshlrev_b16 v2, v2, v5 -; GFX10-NEXT: v_lshrrev_b16 v5, v7, v10 -; GFX10-NEXT: v_lshrrev_b16 v7, v8, v9 +; GFX10-NEXT: v_lshrrev_b16 v5, v11, v12 +; GFX10-NEXT: v_lshrrev_b16 v7, v9, v8 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, 8 ; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 @@ -1426,7 +1485,7 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-NEXT: v_not_b32_e32 v13, v9 +; GFX11-NEXT: v_xor_b32_e32 v13, -1, v9 ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 24, v2 ; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 24, v1 @@ -1434,22 +1493,22 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX11-NEXT: v_lshlrev_b16 v3, v9, v3 -; GFX11-NEXT: v_not_b32_e32 v9, v10 +; GFX11-NEXT: v_xor_b32_e32 v9, -1, v10 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: v_lshrrev_b16 v6, v13, v6 -; GFX11-NEXT: v_not_b32_e32 v13, v11 +; GFX11-NEXT: v_xor_b32_e32 v13, -1, v11 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX11-NEXT: v_and_b32_e32 v12, 7, v2 -; GFX11-NEXT: v_not_b32_e32 v2, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX11-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX11-NEXT: v_lshrrev_b16 v7, 1, v7 +; GFX11-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX11-NEXT: v_lshrrev_b16 v8, 1, v8 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX11-NEXT: v_lshrrev_b16 v1, 1, v1 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX11-NEXT: v_lshlrev_b16 v4, v10, v4 ; GFX11-NEXT: v_lshrrev_b16 v6, v9, v7 @@ -5087,23 +5146,48 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) } define amdgpu_ps i64 @s_fshl_i64(i64 inreg %lhs, i64 inreg %rhs, i64 inreg %amt) { -; GCN-LABEL: s_fshl_i64: -; GCN: ; %bb.0: -; GCN-NEXT: s_and_b64 s[6:7], s[4:5], 63 -; GCN-NEXT: s_andn2_b64 s[4:5], 63, s[4:5] -; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 -; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 -; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: ; return to shader part epilog +; GFX6-LABEL: s_fshl_i64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX6-NEXT: s_not_b32 s4, s4 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_fshl_i64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX8-NEXT: s_not_b32 s4, s4 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX8-NEXT: s_or_b64 
s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: s_fshl_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX9-NEXT: s_not_b32 s4, s4 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: s_fshl_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_not_b32 s5, s4 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 63 -; GFX11-NEXT: s_and_not1_b64 s[4:5], 63, s[4:5] ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 -; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX11-NEXT: s_not_b32 s5, s4 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX11-NEXT: ; return to shader part epilog @@ -5181,8 +5265,8 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX6-NEXT: v_not_b32_e32 v4, v4 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 +; GFX6-NEXT: v_not_b32_e32 v4, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v5 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v4 @@ -5194,8 +5278,8 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] @@ -5207,8 +5291,8 @@ define i64 @v_fshl_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX9-NEXT: v_not_b32_e32 v4, v4 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX9-NEXT: v_not_b32_e32 v4, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] @@ -5362,36 +5446,36 @@ define amdgpu_ps <2 x float> @v_fshl_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 ; GFX6: ; %bb.0: ; GFX6-NEXT: v_and_b32_e32 v1, 63, v0 ; GFX6-NEXT: v_not_b32_e32 v0, v0 -; GFX6-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v1 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v1 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 -; GFX6-NEXT: v_lshr_b64 v[2:3], s[0:1], v2 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX6-NEXT: v_lshr_b64 v[3:4], s[0:1], v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_i64_ssv: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_and_b32_e32 v1, 63, v0 ; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] -; GFX8-NEXT: v_or_b32_e32 v0, 
v0, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX8-NEXT: v_lshrrev_b64 v[3:4], v0, s[0:1] +; GFX8-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_i64_ssv: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_and_b32_e32 v1, 63, v0 ; GFX9-NEXT: v_not_b32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v1, s[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], 1 -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] -; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX9-NEXT: v_lshrrev_b64 v[3:4], v0, s[0:1] +; GFX9-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i64_ssv: @@ -5429,10 +5513,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; GFX6-LABEL: v_fshl_i64_svs: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1 -; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s2 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX6-NEXT: s_andn2_b32 s3, 63, s2 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s3 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -5440,10 +5523,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; GFX8-LABEL: v_fshl_i64_svs: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX8-NEXT: s_andn2_b32 s3, 63, s2 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX8-NEXT: ; return to shader part epilog @@ -5451,10 +5533,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; GFX9-LABEL: v_fshl_i64_svs: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1] -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 +; GFX9-NEXT: s_andn2_b32 s3, 63, s2 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX9-NEXT: ; return to shader part epilog @@ -5462,10 +5543,9 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; GFX10-LABEL: v_fshl_i64_svs: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: s_andn2_b64 s[4:5], 63, s[2:3] -; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], 63 +; GFX10-NEXT: s_andn2_b32 s3, 63, s2 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; GFX10-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX10-NEXT: ; return to shader part epilog @@ -5473,13 +5553,12 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; GFX11-LABEL: v_fshl_i64_svs: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; 
GFX11-NEXT: s_and_not1_b64 s[4:5], 63, s[2:3] -; GFX11-NEXT: s_and_b64 s[2:3], s[2:3], 63 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_and_not1_b32 s3, 63, s2 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] ; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshl.i64(i64 %lhs, i64 %rhs, i64 %amt) @@ -5490,10 +5569,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg %amt) { ; GFX6-LABEL: v_fshl_i64_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s4 +; GFX6-NEXT: s_and_b32 s3, s2, 63 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s3 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX6-NEXT: s_not_b32 s2, s2 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 @@ -5501,10 +5580,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; ; GFX8-LABEL: v_fshl_i64_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX8-NEXT: s_and_b32 s3, s2, 63 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX8-NEXT: s_not_b32 s2, s2 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 @@ -5512,10 +5591,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; ; GFX9-LABEL: v_fshl_i64_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX9-NEXT: s_and_b32 s3, s2, 63 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX9-NEXT: s_not_b32 s2, s2 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v1 @@ -5523,10 +5602,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; ; GFX10-LABEL: v_fshl_i64_vss: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX10-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX10-NEXT: s_and_b32 s3, s2, 63 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] +; GFX10-NEXT: s_not_b32 s2, s2 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 @@ -5534,10 +5613,10 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; ; GFX11-LABEL: v_fshl_i64_vss: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX11-NEXT: s_and_not1_b64 s[2:3], 63, s[2:3] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX11-NEXT: s_and_b32 s3, s2, 63 ; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] +; GFX11-NEXT: s_not_b32 s2, 
s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -5553,80 +5632,70 @@ define amdgpu_ps <2 x float> @v_fshl_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg define amdgpu_ps <2 x i64> @s_fshl_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs, <2 x i64> inreg %amt) { ; GFX6-LABEL: s_fshl_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 +; GFX6-NEXT: s_not_b32 s8, s8 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX6-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] -; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX6-NEXT: s_not_b32 s6, s10 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 +; GFX8-NEXT: s_not_b32 s8, s8 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] -; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX8-NEXT: s_not_b32 s6, s10 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 +; GFX9-NEXT: s_not_b32 s8, s8 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX9-NEXT: s_not_b32 s6, s10 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_v2i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX10-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] ; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; GFX10-NEXT: s_not_b32 s9, s8 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 -; GFX10-NEXT: s_and_b64 s[8:9], s[10:11], 63 -; GFX10-NEXT: s_andn2_b64 s[10:11], 63, s[10:11] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s10 +; GFX10-NEXT: s_not_b32 s8, s10 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s9 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; GFX10-NEXT: 
s_lshr_b64 s[6:7], s[6:7], s8 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_v2i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX11-NEXT: s_and_not1_b64 s[8:9], 63, s[8:9] ; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 +; GFX11-NEXT: s_not_b32 s9, s8 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 -; GFX11-NEXT: s_and_b64 s[8:9], s[10:11], 63 -; GFX11-NEXT: s_and_not1_b64 s[10:11], 63, s[10:11] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 -; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s10 +; GFX11-NEXT: s_not_b32 s8, s10 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s9 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s10 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s8 ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] ; GFX11-NEXT: ; return to shader part epilog @@ -5639,18 +5708,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], 1 +; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v8 -; GFX6-NEXT: v_not_b32_e32 v8, v10 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], 1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v8 +; GFX6-NEXT: v_not_b32_e32 v4, v10 +; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX6-NEXT: v_or_b32_e32 v3, v3, v7 @@ -5660,18 +5729,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] +; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] -; GFX8-NEXT: v_not_b32_e32 v8, v10 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[6:7], v8, v[6:7] +; GFX8-NEXT: v_not_b32_e32 v4, v10 +; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v7 @@ -5681,18 +5750,18 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] +; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] -; GFX9-NEXT: v_not_b32_e32 v8, v10 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX9-NEXT: 
v_or_b32_e32 v0, v0, v4 ; GFX9-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[6:7], v8, v[6:7] +; GFX9-NEXT: v_not_b32_e32 v4, v10 +; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v7 @@ -5750,231 +5819,236 @@ define <2 x i64> @v_fshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { define amdgpu_ps i128 @s_fshl_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: s_fshl_i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX6-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX6-NEXT: s_sub_i32 s9, s10, 64 -; GFX6-NEXT: s_sub_i32 s11, 64, s10 -; GFX6-NEXT: s_cmp_lt_u32 s10, 64 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s10, 0 +; GFX6-NEXT: s_and_b32 s9, s8, 0x7f +; GFX6-NEXT: s_sub_i32 s11, s9, 64 +; GFX6-NEXT: s_sub_i32 s14, 64, s9 +; GFX6-NEXT: s_cmp_lt_u32 s9, 64 ; GFX6-NEXT: s_cselect_b32 s18, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], s10 -; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s11 -; GFX6-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 -; GFX6-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX6-NEXT: s_cmp_lg_u32 s13, 0 -; GFX6-NEXT: s_cselect_b64 s[14:15], s[14:15], 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX6-NEXT: s_cmp_eq_u32 s9, 0 +; GFX6-NEXT: s_cselect_b32 s9, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[14:15], s[0:1], s14 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[2:3], s8 +; GFX6-NEXT: s_lshl_b64 s[12:13], s[0:1], s8 +; GFX6-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: s_mov_b32 s12, 0 +; GFX6-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s9, 0 +; GFX6-NEXT: s_mov_b32 s10, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX6-NEXT: s_lshl_b32 s13, s6, 31 -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX6-NEXT: s_lshl_b32 s11, s6, 31 ; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX6-NEXT: s_sub_i32 s12, s8, 64 -; GFX6-NEXT: s_sub_i32 s10, 64, s8 -; GFX6-NEXT: s_cmp_lt_u32 s8, 64 -; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_andn2_b32 s6, 0x7f, s8 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX6-NEXT: s_not_b32 s9, s8 +; GFX6-NEXT: s_sub_i32 s14, s6, 64 +; GFX6-NEXT: s_sub_i32 s10, 64, s6 +; GFX6-NEXT: s_cmp_lt_u32 s6, 64 +; GFX6-NEXT: s_cselect_b32 s15, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s6, 0 ; GFX6-NEXT: s_cselect_b32 s16, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 -; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[4:5], s9 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], s9 ; GFX6-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 -; GFX6-NEXT: s_cmp_lg_u32 s13, 0 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 +; GFX6-NEXT: s_cmp_lg_u32 s15, 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX6-NEXT: s_cmp_lg_u32 s16, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX6-NEXT: s_cmp_lg_u32 s13, 0 +; GFX6-NEXT: s_cmp_lg_u32 s15, 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 -; GFX6-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] +; GFX6-NEXT: s_or_b64 s[0:1], 
s[12:13], s[0:1] ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX8-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX8-NEXT: s_sub_i32 s9, s10, 64 -; GFX8-NEXT: s_sub_i32 s11, 64, s10 -; GFX8-NEXT: s_cmp_lt_u32 s10, 64 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s10, 0 +; GFX8-NEXT: s_and_b32 s9, s8, 0x7f +; GFX8-NEXT: s_sub_i32 s11, s9, 64 +; GFX8-NEXT: s_sub_i32 s14, 64, s9 +; GFX8-NEXT: s_cmp_lt_u32 s9, 64 ; GFX8-NEXT: s_cselect_b32 s18, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[14:15], s[0:1], s10 -; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s11 -; GFX8-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 -; GFX8-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX8-NEXT: s_cmp_lg_u32 s13, 0 -; GFX8-NEXT: s_cselect_b64 s[14:15], s[14:15], 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX8-NEXT: s_cmp_eq_u32 s9, 0 +; GFX8-NEXT: s_cselect_b32 s9, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[14:15], s[0:1], s14 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[2:3], s8 +; GFX8-NEXT: s_lshl_b64 s[12:13], s[0:1], s8 +; GFX8-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX8-NEXT: s_cmp_lg_u32 s18, 0 -; GFX8-NEXT: s_mov_b32 s12, 0 +; GFX8-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s9, 0 +; GFX8-NEXT: s_mov_b32 s10, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX8-NEXT: s_lshl_b32 s13, s6, 31 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX8-NEXT: s_lshl_b32 s11, s6, 31 ; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX8-NEXT: s_sub_i32 s12, s8, 64 -; GFX8-NEXT: s_sub_i32 s10, 64, s8 -; GFX8-NEXT: s_cmp_lt_u32 s8, 64 -; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_andn2_b32 s6, 0x7f, s8 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX8-NEXT: s_not_b32 s9, s8 +; GFX8-NEXT: s_sub_i32 s14, s6, 64 +; GFX8-NEXT: s_sub_i32 s10, 64, s6 +; GFX8-NEXT: s_cmp_lt_u32 s6, 64 +; GFX8-NEXT: s_cselect_b32 s15, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s6, 0 ; GFX8-NEXT: s_cselect_b32 s16, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 -; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[4:5], s9 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], s9 ; GFX8-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 -; GFX8-NEXT: s_cmp_lg_u32 s13, 0 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 +; GFX8-NEXT: s_cmp_lg_u32 s15, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX8-NEXT: s_cmp_lg_u32 s16, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_cmp_lg_u32 s13, 0 +; GFX8-NEXT: s_cmp_lg_u32 s15, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 -; GFX8-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] +; GFX8-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX9-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX9-NEXT: s_sub_i32 s9, s10, 64 -; GFX9-NEXT: s_sub_i32 s11, 64, s10 -; GFX9-NEXT: s_cmp_lt_u32 s10, 64 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, 0 +; GFX9-NEXT: s_and_b32 s9, s8, 0x7f +; GFX9-NEXT: s_sub_i32 s11, s9, 64 +; GFX9-NEXT: 
s_sub_i32 s14, 64, s9 +; GFX9-NEXT: s_cmp_lt_u32 s9, 64 ; GFX9-NEXT: s_cselect_b32 s18, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], s10 -; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s11 -; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 -; GFX9-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX9-NEXT: s_cmp_lg_u32 s13, 0 -; GFX9-NEXT: s_cselect_b64 s[14:15], s[14:15], 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX9-NEXT: s_cmp_eq_u32 s9, 0 +; GFX9-NEXT: s_cselect_b32 s9, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[0:1], s14 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[2:3], s8 +; GFX9-NEXT: s_lshl_b64 s[12:13], s[0:1], s8 +; GFX9-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX9-NEXT: s_cmp_lg_u32 s18, 0 -; GFX9-NEXT: s_mov_b32 s12, 0 +; GFX9-NEXT: s_cselect_b64 s[12:13], s[12:13], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s9, 0 +; GFX9-NEXT: s_mov_b32 s10, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX9-NEXT: s_lshl_b32 s13, s6, 31 -; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX9-NEXT: s_lshl_b32 s11, s6, 31 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX9-NEXT: s_sub_i32 s12, s8, 64 -; GFX9-NEXT: s_sub_i32 s10, 64, s8 -; GFX9-NEXT: s_cmp_lt_u32 s8, 64 -; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_andn2_b32 s6, 0x7f, s8 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX9-NEXT: s_not_b32 s9, s8 +; GFX9-NEXT: s_sub_i32 s14, s6, 64 +; GFX9-NEXT: s_sub_i32 s10, 64, s6 +; GFX9-NEXT: s_cmp_lt_u32 s6, 64 +; GFX9-NEXT: s_cselect_b32 s15, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s6, 0 ; GFX9-NEXT: s_cselect_b32 s16, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], s8 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[4:5], s9 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], s9 ; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 ; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 -; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_cmp_lg_u32 s15, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[6:7], 0 -; GFX9-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] +; GFX9-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX10-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX10-NEXT: s_sub_i32 s9, s10, 64 -; GFX10-NEXT: s_sub_i32 s11, 64, s10 -; GFX10-NEXT: s_cmp_lt_u32 s10, 64 -; GFX10-NEXT: s_mov_b32 s12, 0 -; GFX10-NEXT: s_cselect_b32 s13, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_and_b32 s9, s8, 0x7f +; GFX10-NEXT: s_mov_b32 s10, 0 +; GFX10-NEXT: s_sub_i32 s11, s9, 64 +; GFX10-NEXT: s_sub_i32 s12, 64, s9 +; GFX10-NEXT: s_cmp_lt_u32 s9, 64 ; GFX10-NEXT: s_cselect_b32 s18, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[14:15], s[0:1], s11 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[2:3], s10 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 -; GFX10-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX10-NEXT: 
s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s9, 0 +; GFX10-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[12:13], s[0:1], s12 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[0:1], s8 +; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX10-NEXT: s_cmp_lg_u32 s18, 0 +; GFX10-NEXT: s_cselect_b64 s[14:15], s[16:17], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX10-NEXT: s_lshl_b32 s13, s6, 31 +; GFX10-NEXT: s_lshl_b32 s11, s6, 31 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] -; GFX10-NEXT: s_sub_i32 s14, s8, 64 -; GFX10-NEXT: s_sub_i32 s9, 64, s8 -; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: s_cselect_b32 s15, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: s_andn2_b32 s6, 0x7f, s8 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX10-NEXT: s_not_b32 s10, s8 +; GFX10-NEXT: s_sub_i32 s12, s6, 64 +; GFX10-NEXT: s_sub_i32 s8, 64, s6 +; GFX10-NEXT: s_cmp_lt_u32 s6, 64 +; GFX10-NEXT: s_cselect_b32 s13, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 -; GFX10-NEXT: s_lshl_b64 s[12:13], s[4:5], s9 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] -; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 -; GFX10-NEXT: s_cmp_lg_u32 s15, 0 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s10 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 +; GFX10-NEXT: s_cmp_lg_u32 s13, 0 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] ; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX10-NEXT: s_cmp_lg_u32 s15, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 -; GFX10-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[10:11], 0 +; GFX10-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[8:9], 0x7f, s[8:9] -; GFX11-NEXT: s_sub_i32 s9, s10, 64 -; GFX11-NEXT: s_sub_i32 s11, 64, s10 -; GFX11-NEXT: s_cmp_lt_u32 s10, 64 -; GFX11-NEXT: s_mov_b32 s12, 0 -; GFX11-NEXT: s_cselect_b32 s13, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s10, 0 +; GFX11-NEXT: s_and_b32 s9, s8, 0x7f +; GFX11-NEXT: s_mov_b32 s10, 0 +; GFX11-NEXT: s_sub_i32 s11, s9, 64 +; GFX11-NEXT: s_sub_i32 s12, 64, s9 +; GFX11-NEXT: s_cmp_lt_u32 s9, 64 ; GFX11-NEXT: s_cselect_b32 s18, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[14:15], s[0:1], s11 -; GFX11-NEXT: s_lshl_b64 s[16:17], s[2:3], s10 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 -; GFX11-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 -; GFX11-NEXT: s_cmp_lg_u32 s13, 0 -; GFX11-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[14:15], s[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s9, 0 +; GFX11-NEXT: s_cselect_b32 s9, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[12:13], s[0:1], s12 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[0:1], s8 +; GFX11-NEXT: s_or_b64 s[12:13], s[12:13], 
s[14:15] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 ; GFX11-NEXT: s_cmp_lg_u32 s18, 0 +; GFX11-NEXT: s_cselect_b64 s[14:15], s[16:17], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 -; GFX11-NEXT: s_lshl_b32 s13, s6, 31 +; GFX11-NEXT: s_lshl_b32 s11, s6, 31 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[6:7], 1 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] -; GFX11-NEXT: s_sub_i32 s14, s8, 64 -; GFX11-NEXT: s_sub_i32 s9, 64, s8 -; GFX11-NEXT: s_cmp_lt_u32 s8, 64 -; GFX11-NEXT: s_cselect_b32 s15, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: s_and_not1_b32 s6, 0x7f, s8 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] +; GFX11-NEXT: s_not_b32 s10, s8 +; GFX11-NEXT: s_sub_i32 s12, s6, 64 +; GFX11-NEXT: s_sub_i32 s8, 64, s6 +; GFX11-NEXT: s_cmp_lt_u32 s6, 64 +; GFX11-NEXT: s_cselect_b32 s13, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 0 ; GFX11-NEXT: s_cselect_b32 s16, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s8 -; GFX11-NEXT: s_lshl_b64 s[12:13], s[4:5], s9 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] -; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s14 -; GFX11-NEXT: s_cmp_lg_u32 s15, 0 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s10 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 ; GFX11-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] ; GFX11-NEXT: s_cmp_lg_u32 s16, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] -; GFX11-NEXT: s_cmp_lg_u32 s15, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 -; GFX11-NEXT: s_or_b64 s[0:1], s[10:11], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[10:11], 0 +; GFX11-NEXT: s_or_b64 s[0:1], s[14:15], s[0:1] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) @@ -5985,143 +6059,143 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-LABEL: v_fshl_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v14 -; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v14 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v8 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v14 -; GFX6-NEXT: v_lshl_b64 v[12:13], v[0:1], v14 +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 64, v15 +; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v15 +; GFX6-NEXT: v_lshr_b64 v[9:10], v[0:1], v9 +; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v15 +; GFX6-NEXT: v_lshl_b64 v[13:14], v[0:1], v15 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v16 -; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX6-NEXT: v_or_b32_e32 v10, v10, v12 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX6-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; GFX6-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v10, 
vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GFX6-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc ; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], 1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v6 +; GFX6-NEXT: v_not_b32_e32 v4, v8 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[6:7], 1 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v15 -; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, 64, v15 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v15 +; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v4 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v14 +; GFX6-NEXT: v_subrev_i32_e32 v15, vcc, 64, v14 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v14 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], v6 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v15 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v14 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v14 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v15 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v10, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v11, v1 -; GFX6-NEXT: v_or_b32_e32 v2, v12, v2 +; GFX6-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX6-NEXT: v_or_b32_e32 v2, v10, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v14 -; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v14 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v14, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[12:13], v14, v[0:1] +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, 64, v15 +; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v15 +; GFX8-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1] -; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX8-NEXT: v_or_b32_e32 v10, v10, v12 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX8-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v13, v1, v3, vcc ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v6 +; GFX8-NEXT: v_not_b32_e32 v4, v8 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7] -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v15 -; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, 64, v15 
-; GFX8-NEXT: v_lshrrev_b64 v[4:5], v15, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v4 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v14 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, 64, v14 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v15, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v14, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v15, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v10, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v11, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v12, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v10, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX9-NEXT: v_sub_u32_e32 v8, 64, v14 -; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v14 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v14, v[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[12:13], v14, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v9, 64, v15 +; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v15 +; GFX9-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1] -; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc +; GFX9-NEXT: v_or_b32_e32 v10, v10, v12 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v8, v1, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v14, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v10, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v13, v8, v3, vcc +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v9, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7] +; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v4 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 31, v1 -; GFX9-NEXT: v_sub_u32_e32 v6, 64, v15 -; GFX9-NEXT: v_subrev_u32_e32 v14, 64, v15 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v15, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v6, 64, v14 +; GFX9-NEXT: v_subrev_u32_e32 v15, 64, v14 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v15, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v14, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v14, v[2:3] +; GFX9-NEXT: 
v_lshrrev_b64 v[2:3], v15, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v0, v10, v0 -; GFX9-NEXT: v_or_b32_e32 v1, v11, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v12, v2 +; GFX9-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX9-NEXT: v_or_b32_e32 v1, v12, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v10, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v13, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -6129,15 +6203,15 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v8 -; GFX10-NEXT: v_not_b32_e32 v8, v8 +; GFX10-NEXT: v_not_b32_e32 v10, v8 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX10-NEXT: v_lshrrev_b64 v[12:13], 1, v[6:7] -; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18 -; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX10-NEXT: v_sub_nc_u32_e32 v11, 64, v18 +; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v10 ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 31, v5 ; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 -; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[10:11], v11, v[0:1] ; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1] ; GFX10-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] @@ -6175,43 +6249,43 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX11-LABEL: v_fshl_i128: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX11-NEXT: v_and_b32_e32 v18, 0x7f, v8 -; GFX11-NEXT: v_not_b32_e32 v8, v8 +; GFX11-NEXT: v_not_b32_e32 v10, v8 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX11-NEXT: v_lshrrev_b64 v[12:13], 1, v[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18 -; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5 -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 -; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_sub_nc_u32_e32 v11, 64, v18 +; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v10 ; GFX11-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] -; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] +; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5 ; GFX11-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 -; GFX11-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v11, v[0:1] ; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19 +; GFX11-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1] ; GFX11-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 ; GFX11-NEXT: v_or_b32_e32 v10, v10, v8 ; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v19 ; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[12:13] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] ; GFX11-NEXT: v_or_b32_e32 v11, v11, v9 -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v19 +; 
GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo ; GFX11-NEXT: v_lshrrev_b64 v[8:9], v8, v[12:13] -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo ; GFX11-NEXT: v_or_b32_e32 v14, v14, v16 ; GFX11-NEXT: v_or_b32_e32 v15, v15, v17 -; GFX11-NEXT: v_dual_cndmask_b32 v10, v0, v10 :: v_dual_cndmask_b32 v11, v1, v11 ; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[12:13] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v14, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18 +; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v6 :: v_dual_cndmask_b32 v7, 0, v7 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v14, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v15, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v5, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v1, s0 @@ -6229,173 +6303,173 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) { ; GFX6-LABEL: v_fshl_i128_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX6-NEXT: v_not_b32_e32 v0, v0 ; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v0 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v6 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v0 -; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v6 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v6 -; GFX6-NEXT: v_lshl_b64 v[4:5], s[0:1], v6 -; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7 +; GFX6-NEXT: v_lshr_b64 v[1:2], s[0:1], v1 +; GFX6-NEXT: v_lshl_b64 v[3:4], s[2:3], v7 +; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v7 +; GFX6-NEXT: v_lshl_b64 v[5:6], s[0:1], v7 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v8 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v8 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX6-NEXT: v_not_b32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s8, 0 -; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s2 -; GFX6-NEXT: v_mov_b32_e32 v3, s3 -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_mov_b32_e32 v3, s2 +; GFX6-NEXT: v_mov_b32_e32 v4, s3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX6-NEXT: s_lshl_b32 s9, s6, 31 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc +; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; 
GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v7 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v7 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v10 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v10 ; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v2 -; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v7 +; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v10 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v11 -; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v7 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v10 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX6-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v9, v1 -; GFX6-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_i128_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v0 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v6 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v6, s[2:3] -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v6 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v6, s[0:1] -; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7 +; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3] +; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v7 +; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1] ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, s[0:1] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX8-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v8, s[0:1] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_mov_b32 s8, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX8-NEXT: s_lshl_b32 s9, s6, 31 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc +; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v7 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1] +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v10 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v10, s[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] -; GFX8-NEXT: 
v_subrev_u32_e32 v11, vcc, 64, v7 +; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v10 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, s[2:3] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v9, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_i128_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v6 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[2:3], v6, s[2:3] -; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v6 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, s[0:1] -; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7 +; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3] +; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v7 +; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1] ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, s[0:1] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX9-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v8, s[0:1] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX9-NEXT: v_not_b32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX9-NEXT: s_lshl_b32 s9, s6, 31 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; GFX9-NEXT: v_sub_u32_e32 v2, 64, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v7, s[0:1] +; GFX9-NEXT: v_sub_u32_e32 v2, 64, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v10, s[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] -; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v7 +; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v10 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v7, s[2:3] -; GFX9-NEXT: 
v_cmp_gt_u32_e32 vcc, 64, v7 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, s[2:3] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX9-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX9-NEXT: v_or_b32_e32 v1, v9, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX9-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i128_ssv: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0 -; GFX10-NEXT: v_not_b32_e32 v0, v0 +; GFX10-NEXT: v_not_b32_e32 v2, v0 ; GFX10-NEXT: s_mov_b32 s8, 0 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX10-NEXT: s_lshl_b32 s9, s6, 31 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 64, v12 +; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v2 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3] ; GFX10-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v3, s[0:1] ; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v13 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, s[8:9] @@ -6434,58 +6508,52 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX11-LABEL: v_fshl_i128_ssv: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_and_b32_e32 v12, 0x7f, v0 -; GFX11-NEXT: v_not_b32_e32 v0, v0 +; GFX11-NEXT: v_not_b32_e32 v2, v0 ; GFX11-NEXT: s_mov_b32 s8, 0 ; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX11-NEXT: s_lshl_b32 s9, s6, 31 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], v12, s[0:1] ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 +; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v2 ; GFX11-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] ; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo -; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo -; GFX11-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] -; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v4, 0, v4 :: v_dual_cndmask_b32 v5, 0, v5 +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 64, v12 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3] -; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_nc_u32_e32 v8, 64, v13 +; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 ; GFX11-NEXT: v_lshrrev_b64 v[6:7], v13, s[8:9] +; GFX11-NEXT: v_lshrrev_b64 v[2:3], v3, s[0:1] +; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] +; GFX11-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 64, v13 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v1 -; GFX11-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v6, v6, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo -; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 64, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v7, v7, v9 -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 +; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: v_cndmask_b32_e32 v10, v11, v3, vcc_lo ; GFX11-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v0, s[6:7] -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v7, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v8, s2, s4 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v10, s3, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s8, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s9, s1 -; GFX11-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v3, v7, v3 +; GFX11-NEXT: v_or_b32_e32 v2, v6, v2 ; GFX11-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX11-NEXT: v_or_b32_e32 v3, v7, v3 ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> @@ -6495,43 +6563,43 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshl_i128_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX6-NEXT: s_sub_i32 s5, s6, 64 -; GFX6-NEXT: s_sub_i32 s7, 64, s6 -; GFX6-NEXT: s_cmp_lt_u32 s6, 64 -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s6, 0 +; GFX6-NEXT: s_and_b32 s5, s4, 0x7f +; GFX6-NEXT: s_sub_i32 s12, s5, 64 +; GFX6-NEXT: s_sub_i32 s8, 64, s5 +; GFX6-NEXT: s_cmp_lt_u32 s5, 64 ; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], s6 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[0:1], s7 -; GFX6-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 -; GFX6-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX6-NEXT: s_cmp_eq_u32 s5, 0 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], s4 +; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX6-NEXT: s_cmp_lg_u32 s13, 0 -; GFX6-NEXT: v_lshr_b64 v[0:1], 
v[0:1], 1 +; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s5, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 1 +; GFX6-NEXT: s_andn2_b32 s0, 0x7f, s4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 31, v2 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: s_sub_i32 s0, s4, 64 -; GFX6-NEXT: s_sub_i32 s1, 64, s4 +; GFX6-NEXT: s_sub_i32 s1, s0, 64 +; GFX6-NEXT: s_sub_i32 s4, 64, s0 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX6-NEXT: s_cmp_lt_u32 s4, 64 +; GFX6-NEXT: s_cmp_lt_u32 s0, 64 ; GFX6-NEXT: s_cselect_b32 s5, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s4, 0 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s4 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s1 -; GFX6-NEXT: s_cselect_b32 s6, 1, 0 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s4 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s0 +; GFX6-NEXT: s_cmp_eq_u32 s0, 0 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s0 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4 +; GFX6-NEXT: s_cselect_b32 s8, 1, 0 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s0 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s1 ; GFX6-NEXT: s_and_b32 s0, 1, s5 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX6-NEXT: s_and_b32 s0, 1, s6 +; GFX6-NEXT: s_and_b32 s0, 1, s8 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6539,51 +6607,51 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v0, s8, v0 -; GFX6-NEXT: v_or_b32_e32 v1, s9, v1 +; GFX6-NEXT: v_or_b32_e32 v0, s6, v0 +; GFX6-NEXT: v_or_b32_e32 v1, s7, v1 ; GFX6-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX6-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_i128_svs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX8-NEXT: s_sub_i32 s5, s6, 64 -; GFX8-NEXT: s_sub_i32 s7, 64, s6 -; GFX8-NEXT: s_cmp_lt_u32 s6, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 +; GFX8-NEXT: s_and_b32 s5, s4, 0x7f +; GFX8-NEXT: s_sub_i32 s12, s5, 64 +; GFX8-NEXT: s_sub_i32 s8, 64, s5 +; GFX8-NEXT: s_cmp_lt_u32 s5, 64 ; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], s6 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[0:1], s7 -; GFX8-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 -; GFX8-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX8-NEXT: s_cmp_eq_u32 s5, 0 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], s4 +; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX8-NEXT: s_cmp_lg_u32 s13, 0 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] +; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s5, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] +; GFX8-NEXT: s_andn2_b32 s0, 0x7f, s4 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 31, v2 ; GFX8-NEXT: 
v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: s_sub_i32 s0, s4, 64 -; GFX8-NEXT: s_sub_i32 s1, 64, s4 +; GFX8-NEXT: s_sub_i32 s1, s0, 64 +; GFX8-NEXT: s_sub_i32 s4, 64, s0 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX8-NEXT: s_cmp_lt_u32 s4, 64 +; GFX8-NEXT: s_cmp_lt_u32 s0, 64 ; GFX8-NEXT: s_cselect_b32 s5, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s4, 0 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] -; GFX8-NEXT: s_cselect_b32 s6, 1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] +; GFX8-NEXT: s_cmp_eq_u32 s0, 0 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] +; GFX8-NEXT: s_cselect_b32 s8, 1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3] ; GFX8-NEXT: s_and_b32 s0, 1, s5 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_and_b32 s0, 1, s6 +; GFX8-NEXT: s_and_b32 s0, 1, s8 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6591,50 +6659,50 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v0, s8, v0 -; GFX8-NEXT: v_or_b32_e32 v1, s9, v1 +; GFX8-NEXT: v_or_b32_e32 v0, s6, v0 +; GFX8-NEXT: v_or_b32_e32 v1, s7, v1 ; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX8-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_i128_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX9-NEXT: s_sub_i32 s5, s6, 64 -; GFX9-NEXT: s_sub_i32 s7, 64, s6 -; GFX9-NEXT: s_cmp_lt_u32 s6, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 +; GFX9-NEXT: s_and_b32 s5, s4, 0x7f +; GFX9-NEXT: s_sub_i32 s12, s5, 64 +; GFX9-NEXT: s_sub_i32 s8, 64, s5 +; GFX9-NEXT: s_cmp_lt_u32 s5, 64 ; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], s6 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[0:1], s7 -; GFX9-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 -; GFX9-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX9-NEXT: s_cmp_eq_u32 s5, 0 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[0:1], s8 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 +; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], s4 +; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX9-NEXT: s_cmp_lg_u32 s13, 0 +; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s5, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] +; GFX9-NEXT: s_andn2_b32 s0, 0x7f, s4 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 31, v1 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: s_sub_i32 s0, s4, 64 -; GFX9-NEXT: s_sub_i32 s1, 64, s4 -; GFX9-NEXT: s_cmp_lt_u32 s4, 64 +; GFX9-NEXT: s_sub_i32 s1, s0, 64 +; GFX9-NEXT: s_sub_i32 s4, 64, s0 +; GFX9-NEXT: s_cmp_lt_u32 s0, 64 ; GFX9-NEXT: s_cselect_b32 s5, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 
s4, 0 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] -; GFX9-NEXT: s_cselect_b32 s6, 1, 0 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], s4, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] +; GFX9-NEXT: s_cselect_b32 s8, 1, 0 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3] ; GFX9-NEXT: s_and_b32 s0, 1, s5 ; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX9-NEXT: s_and_b32 s0, 1, s6 +; GFX9-NEXT: s_and_b32 s0, 1, s8 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 @@ -6642,50 +6710,50 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v0, s8, v0 -; GFX9-NEXT: v_or_b32_e32 v1, s9, v1 +; GFX9-NEXT: v_or_b32_e32 v0, s6, v0 +; GFX9-NEXT: v_or_b32_e32 v1, s7, v1 ; GFX9-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX9-NEXT: v_or_b32_e32 v3, s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshl_i128_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX10-NEXT: s_sub_i32 s5, s6, 64 -; GFX10-NEXT: s_sub_i32 s7, 64, s6 -; GFX10-NEXT: s_cmp_lt_u32 s6, 64 +; GFX10-NEXT: s_and_b32 s5, s4, 0x7f ; GFX10-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s6, 0 +; GFX10-NEXT: s_sub_i32 s12, s5, 64 +; GFX10-NEXT: s_sub_i32 s6, 64, s5 +; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: s_cselect_b32 s13, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], s7 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 -; GFX10-NEXT: s_lshl_b64 s[6:7], s[0:1], s6 -; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 +; GFX10-NEXT: s_cmp_eq_u32 s5, 0 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 31, v1 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s4 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], s4 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s0, 64, s4 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] -; GFX10-NEXT: s_sub_i32 s0, s4, 64 -; GFX10-NEXT: s_cmp_lt_u32 s4, 64 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_andn2_b32 s0, 0x7f, s4 +; GFX10-NEXT: s_sub_i32 s1, 64, s0 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX10-NEXT: s_sub_i32 s1, s0, 64 +; GFX10-NEXT: s_cmp_lt_u32 s0, 64 +; GFX10-NEXT: v_lshrrev_b64 v[8:9], s1, v[2:3] 
+; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX10-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10-NEXT: s_and_b32 s0, 1, s1 +; GFX10-NEXT: s_and_b32 s1, 1, s4 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] ; GFX10-NEXT: s_and_b32 s0, 1, s5 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3] ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo @@ -6695,62 +6763,62 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 -; GFX10-NEXT: v_or_b32_e32 v0, s6, v0 -; GFX10-NEXT: v_or_b32_e32 v1, s7, v1 +; GFX10-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX10-NEXT: v_or_b32_e32 v1, s9, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshl_i128_svs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5] -; GFX11-NEXT: s_sub_i32 s5, s6, 64 -; GFX11-NEXT: s_sub_i32 s7, 64, s6 -; GFX11-NEXT: s_cmp_lt_u32 s6, 64 +; GFX11-NEXT: s_and_b32 s5, s4, 0x7f ; GFX11-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: s_cselect_b32 s12, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s6, 0 +; GFX11-NEXT: s_sub_i32 s12, s5, 64 +; GFX11-NEXT: s_sub_i32 s6, 64, s5 +; GFX11-NEXT: s_cmp_lt_u32 s5, 64 ; GFX11-NEXT: s_cselect_b32 s13, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[0:1], s7 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 -; GFX11-NEXT: s_lshl_b64 s[6:7], s[0:1], s6 -; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 -; GFX11-NEXT: s_cmp_lg_u32 s12, 0 +; GFX11-NEXT: s_cmp_eq_u32 s5, 0 ; GFX11-NEXT: v_lshl_or_b32 v1, v2, 31, v1 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] -; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX11-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s4 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], s4 +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX11-NEXT: s_cmp_lg_u32 s13, 0 -; GFX11-NEXT: v_lshrrev_b64 v[4:5], s4, v[0:1] +; GFX11-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3] +; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s5, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX11-NEXT: s_sub_i32 s0, 64, s4 +; GFX11-NEXT: s_and_not1_b32 s0, 0x7f, s4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] -; GFX11-NEXT: s_sub_i32 s0, s4, 64 -; GFX11-NEXT: s_cmp_lt_u32 s4, 64 -; GFX11-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] -; GFX11-NEXT: s_cselect_b32 s1, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_sub_i32 s1, 64, s0 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX11-NEXT: s_sub_i32 s1, s0, 64 +; GFX11-NEXT: s_cmp_lt_u32 s0, 64 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], s1, v[2:3] +; GFX11-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX11-NEXT: s_cselect_b32 s5, 1, 0 -; GFX11-NEXT: s_and_b32 s0, 1, s1 +; GFX11-NEXT: s_and_b32 s1, 1, s4 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 -; 
GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] ; GFX11-NEXT: s_and_b32 s0, 1, s5 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], s4, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v5, v9, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_cndmask_b32 v2, 0, v2 :: v_dual_cndmask_b32 v3, 0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 -; GFX11-NEXT: v_or_b32_e32 v0, s6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v1, s7, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX11-NEXT: v_or_b32_e32 v1, s9, v1 ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshl.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> @@ -6760,25 +6828,26 @@ define amdgpu_ps <4 x float> @v_fshl_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshl_i128_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX6-NEXT: s_sub_i32 s5, s6, 64 -; GFX6-NEXT: s_sub_i32 s7, 64, s6 -; GFX6-NEXT: s_cmp_lt_u32 s6, 64 +; GFX6-NEXT: s_and_b32 s5, s4, 0x7f +; GFX6-NEXT: s_sub_i32 s7, s5, 64 +; GFX6-NEXT: s_sub_i32 s8, 64, s5 +; GFX6-NEXT: s_cmp_lt_u32 s5, 64 ; GFX6-NEXT: s_cselect_b32 s9, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s6, 0 -; GFX6-NEXT: s_mov_b32 s8, 0 +; GFX6-NEXT: s_cmp_eq_u32 s5, 0 +; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_cselect_b32 s10, 1, 0 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s7 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], s6 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s5 -; GFX6-NEXT: s_and_b32 s5, 1, s9 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s8 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], s5 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s7 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: s_lshl_b32 s9, s2, 31 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s6 +; GFX6-NEXT: s_lshl_b32 s7, s2, 31 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s5 +; GFX6-NEXT: s_and_b32 s5, 1, s9 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX6-NEXT: s_not_b32 s6, s4 +; GFX6-NEXT: s_andn2_b32 s4, 0x7f, s4 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX6-NEXT: s_and_b32 s5, 1, s10 -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX6-NEXT: s_sub_i32 s10, s4, 64 ; GFX6-NEXT: s_sub_i32 s8, 64, s4 @@ -6793,19 +6862,19 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX6-NEXT: 
s_lshr_b64 s[4:5], s[2:3], s6 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX6-NEXT: s_cmp_lg_u32 s11, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s11, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v6 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX6-NEXT: v_or_b32_e32 v2, s2, v2 @@ -6814,25 +6883,26 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX8-LABEL: v_fshl_i128_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX8-NEXT: s_sub_i32 s5, s6, 64 -; GFX8-NEXT: s_sub_i32 s7, 64, s6 -; GFX8-NEXT: s_cmp_lt_u32 s6, 64 +; GFX8-NEXT: s_and_b32 s5, s4, 0x7f +; GFX8-NEXT: s_sub_i32 s7, s5, 64 +; GFX8-NEXT: s_sub_i32 s8, 64, s5 +; GFX8-NEXT: s_cmp_lt_u32 s5, 64 ; GFX8-NEXT: s_cselect_b32 s9, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 -; GFX8-NEXT: s_mov_b32 s8, 0 +; GFX8-NEXT: s_cmp_eq_u32 s5, 0 +; GFX8-NEXT: s_mov_b32 s6, 0 ; GFX8-NEXT: s_cselect_b32 s10, 1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX8-NEXT: s_and_b32 s5, 1, s9 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s7, v[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: s_lshl_b32 s9, s2, 31 -; GFX8-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3] +; GFX8-NEXT: s_lshl_b32 s7, s2, 31 +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] +; GFX8-NEXT: s_and_b32 s5, 1, s9 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX8-NEXT: s_not_b32 s6, s4 +; GFX8-NEXT: s_andn2_b32 s4, 0x7f, s4 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX8-NEXT: s_and_b32 s5, 1, s10 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_sub_i32 s10, s4, 64 ; GFX8-NEXT: s_sub_i32 s8, 64, s4 @@ -6847,19 +6917,19 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v6 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 @@ -6868,25 +6938,26 @@ 
define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX9-LABEL: v_fshl_i128_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX9-NEXT: s_sub_i32 s5, s6, 64 -; GFX9-NEXT: s_sub_i32 s7, 64, s6 -; GFX9-NEXT: s_cmp_lt_u32 s6, 64 +; GFX9-NEXT: s_and_b32 s5, s4, 0x7f +; GFX9-NEXT: s_sub_i32 s7, s5, 64 +; GFX9-NEXT: s_sub_i32 s8, 64, s5 +; GFX9-NEXT: s_cmp_lt_u32 s5, 64 ; GFX9-NEXT: s_cselect_b32 s9, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 -; GFX9-NEXT: s_mov_b32 s8, 0 +; GFX9-NEXT: s_cmp_eq_u32 s5, 0 +; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: s_cselect_b32 s10, 1, 0 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX9-NEXT: s_and_b32 s5, 1, s9 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], s8, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], s7, v[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: s_lshl_b32 s9, s2, 31 -; GFX9-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3] +; GFX9-NEXT: s_lshl_b32 s7, s2, 31 +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] +; GFX9-NEXT: s_and_b32 s5, 1, s9 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GFX9-NEXT: s_not_b32 s6, s4 +; GFX9-NEXT: s_andn2_b32 s4, 0x7f, s4 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX9-NEXT: s_and_b32 s5, 1, s10 -; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_sub_i32 s10, s4, 64 ; GFX9-NEXT: s_sub_i32 s8, 64, s4 @@ -6901,19 +6972,19 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 ; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v6 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX9-NEXT: v_or_b32_e32 v2, s2, v2 @@ -6922,53 +6993,54 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX10-LABEL: v_fshl_i128_vss: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX10-NEXT: s_sub_i32 s5, s6, 64 -; GFX10-NEXT: s_sub_i32 s7, 64, s6 -; GFX10-NEXT: s_cmp_lt_u32 s6, 64 +; GFX10-NEXT: s_and_b32 s5, s4, 0x7f +; GFX10-NEXT: s_sub_i32 s6, s5, 64 +; GFX10-NEXT: s_sub_i32 s7, 64, s5 +; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s6, 0 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3] +; GFX10-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1] -; 
GFX10-NEXT: s_and_b32 s6, 1, s8 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 +; GFX10-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1] ; GFX10-NEXT: s_mov_b32 s6, 0 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX10-NEXT: s_lshl_b32 s7, s2, 31 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX10-NEXT: s_and_b32 s5, 1, s9 +; GFX10-NEXT: s_and_b32 s5, 1, s8 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_sub_i32 s10, s4, 64 -; GFX10-NEXT: s_sub_i32 s8, 64, s4 +; GFX10-NEXT: s_andn2_b32 s6, 0x7f, s4 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX10-NEXT: s_cmp_lt_u32 s4, 64 +; GFX10-NEXT: s_and_b32 s5, 1, s9 +; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX10-NEXT: s_not_b32 s8, s4 +; GFX10-NEXT: s_sub_i32 s10, s6, 64 +; GFX10-NEXT: s_sub_i32 s7, 64, s6 +; GFX10-NEXT: s_cmp_lt_u32 s6, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo ; GFX10-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10-NEXT: s_cselect_b32 s12, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[0:1], s8 +; GFX10-NEXT: s_lshl_b64 s[6:7], s[2:3], s7 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[2:3], s8 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo -; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], 0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 @@ -6976,50 +7048,52 @@ define amdgpu_ps <4 x float> @v_fshl_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX11-LABEL: v_fshl_i128_vss: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5] -; GFX11-NEXT: s_sub_i32 s5, s6, 64 -; GFX11-NEXT: s_sub_i32 s7, 64, s6 -; GFX11-NEXT: s_cmp_lt_u32 s6, 64 +; GFX11-NEXT: s_and_b32 s5, s4, 0x7f +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s6, s5, 64 +; GFX11-NEXT: s_sub_i32 s7, 64, s5 +; GFX11-NEXT: s_cmp_lt_u32 s5, 64 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] ; GFX11-NEXT: s_cselect_b32 s8, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s6, 0 -; GFX11-NEXT: v_lshlrev_b64 v[6:7], s6, v[2:3] +; GFX11-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX11-NEXT: s_cselect_b32 s9, 1, 0 -; GFX11-NEXT: v_lshlrev_b64 v[8:9], s6, v[0:1] -; GFX11-NEXT: s_and_b32 s6, 1, s8 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX11-NEXT: 
v_lshlrev_b64 v[0:1], s6, v[0:1] ; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX11-NEXT: s_lshl_b32 s7, s2, 31 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX11-NEXT: s_and_b32 s5, 1, s9 +; GFX11-NEXT: s_and_b32 s5, 1, s8 ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] -; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_sub_i32 s10, s4, 64 -; GFX11-NEXT: s_sub_i32 s8, 64, s4 +; GFX11-NEXT: s_and_not1_b32 s6, 0x7f, s4 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 ; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX11-NEXT: s_cmp_lt_u32 s4, 64 +; GFX11-NEXT: s_and_b32 s5, 1, s9 +; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1 +; GFX11-NEXT: s_not_b32 s8, s4 +; GFX11-NEXT: s_sub_i32 s10, s6, 64 +; GFX11-NEXT: s_sub_i32 s7, 64, s6 +; GFX11-NEXT: s_cmp_lt_u32 s6, 64 ; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9 ; GFX11-NEXT: s_cselect_b32 s11, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 -; GFX11-NEXT: s_cselect_b32 s12, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 -; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX11-NEXT: s_cselect_b32 s12, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], s8 +; GFX11-NEXT: s_lshl_b64 s[6:7], s[2:3], s7 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[2:3], s8 +; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX11-NEXT: s_cmp_lg_u32 s11, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3 -; GFX11-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX11-NEXT: s_cmp_lg_u32 s11, 0 ; GFX11-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[8:9], 0 ; GFX11-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 @@ -7152,40 +7226,41 @@ define i128 @v_fshl_i128_65(i128 %lhs, i128 %rhs) { define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) { ; GFX6-LABEL: s_fshl_v2i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX6-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] -; GFX6-NEXT: s_sub_i32 s17, s18, 64 -; GFX6-NEXT: s_sub_i32 s19, 64, s18 -; GFX6-NEXT: s_cmp_lt_u32 s18, 64 -; GFX6-NEXT: s_cselect_b32 s23, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s18, 0 +; GFX6-NEXT: s_and_b32 s17, s16, 0x7f +; GFX6-NEXT: s_sub_i32 s19, s17, 64 +; GFX6-NEXT: s_sub_i32 s21, 64, s17 +; GFX6-NEXT: s_cmp_lt_u32 s17, 64 ; GFX6-NEXT: s_cselect_b32 s28, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[24:25], s[0:1], s18 -; GFX6-NEXT: s_lshr_b64 s[26:27], s[0:1], s19 -; GFX6-NEXT: s_lshl_b64 s[18:19], s[2:3], s18 -; GFX6-NEXT: s_or_b64 s[18:19], s[26:27], s[18:19] -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX6-NEXT: s_cmp_lg_u32 s23, 0 -; GFX6-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 -; GFX6-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1] +; GFX6-NEXT: s_cmp_eq_u32 s17, 0 +; GFX6-NEXT: s_cselect_b32 s17, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[24:25], s[0:1], s21 +; GFX6-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 +; GFX6-NEXT: s_lshl_b64 s[22:23], s[0:1], s16 +; GFX6-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] 
+; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 ; GFX6-NEXT: s_cmp_lg_u32 s28, 0 -; GFX6-NEXT: s_mov_b32 s22, 0 +; GFX6-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 +; GFX6-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX6-NEXT: s_cmp_lg_u32 s17, 0 +; GFX6-NEXT: s_mov_b32 s18, 0 ; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX6-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX6-NEXT: s_lshl_b32 s23, s10, 31 -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX6-NEXT: s_lshl_b32 s19, s10, 31 ; GFX6-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX6-NEXT: s_sub_i32 s23, s16, 64 -; GFX6-NEXT: s_sub_i32 s18, 64, s16 -; GFX6-NEXT: s_cmp_lt_u32 s16, 64 +; GFX6-NEXT: s_andn2_b32 s10, 0x7f, s16 +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX6-NEXT: s_not_b32 s17, s16 +; GFX6-NEXT: s_sub_i32 s19, s10, 64 +; GFX6-NEXT: s_sub_i32 s21, 64, s10 +; GFX6-NEXT: s_cmp_lt_u32 s10, 64 ; GFX6-NEXT: s_cselect_b32 s26, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s16, 0 +; GFX6-NEXT: s_cmp_eq_u32 s10, 0 ; GFX6-NEXT: s_cselect_b32 s27, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 -; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 -; GFX6-NEXT: s_lshl_b64 s[18:19], s[8:9], s18 -; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s17 +; GFX6-NEXT: s_lshr_b64 s[16:17], s[0:1], s17 +; GFX6-NEXT: s_lshl_b64 s[24:25], s[8:9], s21 +; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s19 ; GFX6-NEXT: s_cmp_lg_u32 s26, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] ; GFX6-NEXT: s_cmp_lg_u32 s27, 0 @@ -7193,86 +7268,88 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX6-NEXT: s_cmp_lg_u32 s26, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX6-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX6-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] -; GFX6-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] -; GFX6-NEXT: s_sub_i32 s11, s8, 64 -; GFX6-NEXT: s_sub_i32 s9, 64, s8 +; GFX6-NEXT: s_and_b32 s8, s20, 0x7f +; GFX6-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1] +; GFX6-NEXT: s_sub_i32 s19, s8, 64 +; GFX6-NEXT: s_sub_i32 s10, 64, s8 ; GFX6-NEXT: s_cmp_lt_u32 s8, 64 -; GFX6-NEXT: s_cselect_b32 s20, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 ; GFX6-NEXT: s_cselect_b32 s21, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 -; GFX6-NEXT: s_lshr_b64 s[18:19], s[4:5], s9 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 -; GFX6-NEXT: s_or_b64 s[8:9], s[18:19], s[8:9] -; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_cselect_b32 s22, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[6:7], s20 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], s20 +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 ; GFX6-NEXT: s_cmp_lg_u32 s21, 0 +; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX6-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX6-NEXT: s_cmp_lg_u32 s22, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX6-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX6-NEXT: s_lshl_b32 s23, s14, 31 -; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] -; GFX6-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 -; GFX6-NEXT: s_sub_i32 s18, s10, 64 -; GFX6-NEXT: s_sub_i32 s14, 64, s10 -; GFX6-NEXT: s_cmp_lt_u32 s10, 64 +; GFX6-NEXT: s_lshl_b32 
s19, s14, 31 +; GFX6-NEXT: s_andn2_b32 s12, 0x7f, s20 +; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19] +; GFX6-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; GFX6-NEXT: s_not_b32 s14, s20 +; GFX6-NEXT: s_sub_i32 s18, s12, 64 +; GFX6-NEXT: s_sub_i32 s16, 64, s12 +; GFX6-NEXT: s_cmp_lt_u32 s12, 64 ; GFX6-NEXT: s_cselect_b32 s19, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s10, 0 +; GFX6-NEXT: s_cmp_eq_u32 s12, 0 ; GFX6-NEXT: s_cselect_b32 s20, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[12:13], s[8:9], s10 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 -; GFX6-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 -; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX6-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 +; GFX6-NEXT: s_lshr_b64 s[12:13], s[10:11], s14 +; GFX6-NEXT: s_lshr_b64 s[14:15], s[4:5], s14 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[10:11], s16 +; GFX6-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX6-NEXT: s_cselect_b64 s[10:11], s[14:15], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 -; GFX6-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] -; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], 0 +; GFX6-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_v2i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX8-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] -; GFX8-NEXT: s_sub_i32 s17, s18, 64 -; GFX8-NEXT: s_sub_i32 s19, 64, s18 -; GFX8-NEXT: s_cmp_lt_u32 s18, 64 -; GFX8-NEXT: s_cselect_b32 s23, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s18, 0 +; GFX8-NEXT: s_and_b32 s17, s16, 0x7f +; GFX8-NEXT: s_sub_i32 s19, s17, 64 +; GFX8-NEXT: s_sub_i32 s21, 64, s17 +; GFX8-NEXT: s_cmp_lt_u32 s17, 64 ; GFX8-NEXT: s_cselect_b32 s28, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[24:25], s[0:1], s18 -; GFX8-NEXT: s_lshr_b64 s[26:27], s[0:1], s19 -; GFX8-NEXT: s_lshl_b64 s[18:19], s[2:3], s18 -; GFX8-NEXT: s_or_b64 s[18:19], s[26:27], s[18:19] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX8-NEXT: s_cmp_lg_u32 s23, 0 -; GFX8-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 -; GFX8-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1] +; GFX8-NEXT: s_cmp_eq_u32 s17, 0 +; GFX8-NEXT: s_cselect_b32 s17, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[24:25], s[0:1], s21 +; GFX8-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 +; GFX8-NEXT: s_lshl_b64 s[22:23], s[0:1], s16 +; GFX8-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 ; GFX8-NEXT: s_cmp_lg_u32 s28, 0 -; GFX8-NEXT: s_mov_b32 s22, 0 +; GFX8-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 +; GFX8-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX8-NEXT: s_cmp_lg_u32 s17, 0 +; GFX8-NEXT: s_mov_b32 s18, 0 ; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX8-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX8-NEXT: s_lshl_b32 s23, s10, 31 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX8-NEXT: s_lshl_b32 s19, s10, 31 ; GFX8-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX8-NEXT: s_sub_i32 s23, s16, 64 -; GFX8-NEXT: s_sub_i32 s18, 64, s16 -; GFX8-NEXT: s_cmp_lt_u32 s16, 64 +; GFX8-NEXT: s_andn2_b32 s10, 0x7f, s16 +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX8-NEXT: s_not_b32 s17, s16 +; GFX8-NEXT: s_sub_i32 s19, s10, 64 +; GFX8-NEXT: s_sub_i32 s21, 64, 
s10 +; GFX8-NEXT: s_cmp_lt_u32 s10, 64 ; GFX8-NEXT: s_cselect_b32 s26, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s16, 0 +; GFX8-NEXT: s_cmp_eq_u32 s10, 0 ; GFX8-NEXT: s_cselect_b32 s27, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 -; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 -; GFX8-NEXT: s_lshl_b64 s[18:19], s[8:9], s18 -; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s17 +; GFX8-NEXT: s_lshr_b64 s[16:17], s[0:1], s17 +; GFX8-NEXT: s_lshl_b64 s[24:25], s[8:9], s21 +; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s19 ; GFX8-NEXT: s_cmp_lg_u32 s26, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] ; GFX8-NEXT: s_cmp_lg_u32 s27, 0 @@ -7280,86 +7357,88 @@ define amdgpu_ps <2 x i128> @s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX8-NEXT: s_cmp_lg_u32 s26, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX8-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX8-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] -; GFX8-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] -; GFX8-NEXT: s_sub_i32 s11, s8, 64 -; GFX8-NEXT: s_sub_i32 s9, 64, s8 +; GFX8-NEXT: s_and_b32 s8, s20, 0x7f +; GFX8-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1] +; GFX8-NEXT: s_sub_i32 s19, s8, 64 +; GFX8-NEXT: s_sub_i32 s10, 64, s8 ; GFX8-NEXT: s_cmp_lt_u32 s8, 64 -; GFX8-NEXT: s_cselect_b32 s20, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 ; GFX8-NEXT: s_cselect_b32 s21, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 -; GFX8-NEXT: s_lshr_b64 s[18:19], s[4:5], s9 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 -; GFX8-NEXT: s_or_b64 s[8:9], s[18:19], s[8:9] -; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX8-NEXT: s_cmp_lg_u32 s20, 0 -; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_cselect_b32 s22, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[6:7], s20 +; GFX8-NEXT: s_lshl_b64 s[8:9], s[4:5], s20 +; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 ; GFX8-NEXT: s_cmp_lg_u32 s21, 0 +; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX8-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX8-NEXT: s_cmp_lg_u32 s22, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX8-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX8-NEXT: s_lshl_b32 s23, s14, 31 -; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] -; GFX8-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 -; GFX8-NEXT: s_sub_i32 s18, s10, 64 -; GFX8-NEXT: s_sub_i32 s14, 64, s10 -; GFX8-NEXT: s_cmp_lt_u32 s10, 64 +; GFX8-NEXT: s_lshl_b32 s19, s14, 31 +; GFX8-NEXT: s_andn2_b32 s12, 0x7f, s20 +; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19] +; GFX8-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; GFX8-NEXT: s_not_b32 s14, s20 +; GFX8-NEXT: s_sub_i32 s18, s12, 64 +; GFX8-NEXT: s_sub_i32 s16, 64, s12 +; GFX8-NEXT: s_cmp_lt_u32 s12, 64 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s10, 0 +; GFX8-NEXT: s_cmp_eq_u32 s12, 0 ; GFX8-NEXT: s_cselect_b32 s20, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[12:13], s[8:9], s10 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 -; GFX8-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 -; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 +; GFX8-NEXT: s_lshr_b64 s[12:13], s[10:11], s14 +; GFX8-NEXT: s_lshr_b64 s[14:15], s[4:5], s14 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[10:11], s16 +; GFX8-NEXT: 
s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX8-NEXT: s_cselect_b64 s[10:11], s[14:15], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s20, 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 -; GFX8-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] -; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], 0 +; GFX8-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_v2i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX9-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] -; GFX9-NEXT: s_sub_i32 s17, s18, 64 -; GFX9-NEXT: s_sub_i32 s19, 64, s18 -; GFX9-NEXT: s_cmp_lt_u32 s18, 64 -; GFX9-NEXT: s_cselect_b32 s23, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s18, 0 +; GFX9-NEXT: s_and_b32 s17, s16, 0x7f +; GFX9-NEXT: s_sub_i32 s19, s17, 64 +; GFX9-NEXT: s_sub_i32 s21, 64, s17 +; GFX9-NEXT: s_cmp_lt_u32 s17, 64 ; GFX9-NEXT: s_cselect_b32 s28, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[24:25], s[0:1], s18 -; GFX9-NEXT: s_lshr_b64 s[26:27], s[0:1], s19 -; GFX9-NEXT: s_lshl_b64 s[18:19], s[2:3], s18 -; GFX9-NEXT: s_or_b64 s[18:19], s[26:27], s[18:19] -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX9-NEXT: s_cmp_lg_u32 s23, 0 -; GFX9-NEXT: s_cselect_b64 s[24:25], s[24:25], 0 -; GFX9-NEXT: s_cselect_b64 s[0:1], s[18:19], s[0:1] +; GFX9-NEXT: s_cmp_eq_u32 s17, 0 +; GFX9-NEXT: s_cselect_b32 s17, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[0:1], s21 +; GFX9-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 +; GFX9-NEXT: s_lshl_b64 s[22:23], s[0:1], s16 +; GFX9-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 ; GFX9-NEXT: s_cmp_lg_u32 s28, 0 -; GFX9-NEXT: s_mov_b32 s22, 0 +; GFX9-NEXT: s_cselect_b64 s[22:23], s[22:23], 0 +; GFX9-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: s_mov_b32 s18, 0 ; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX9-NEXT: s_lshl_b32 s23, s10, 31 -; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX9-NEXT: s_lshl_b32 s19, s10, 31 ; GFX9-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX9-NEXT: s_sub_i32 s23, s16, 64 -; GFX9-NEXT: s_sub_i32 s18, 64, s16 -; GFX9-NEXT: s_cmp_lt_u32 s16, 64 +; GFX9-NEXT: s_andn2_b32 s10, 0x7f, s16 +; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX9-NEXT: s_not_b32 s17, s16 +; GFX9-NEXT: s_sub_i32 s19, s10, 64 +; GFX9-NEXT: s_sub_i32 s21, 64, s10 +; GFX9-NEXT: s_cmp_lt_u32 s10, 64 ; GFX9-NEXT: s_cselect_b32 s26, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s16, 0 +; GFX9-NEXT: s_cmp_eq_u32 s10, 0 ; GFX9-NEXT: s_cselect_b32 s27, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s16 -; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 -; GFX9-NEXT: s_lshl_b64 s[18:19], s[8:9], s18 -; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s17 +; GFX9-NEXT: s_lshr_b64 s[16:17], s[0:1], s17 +; GFX9-NEXT: s_lshl_b64 s[24:25], s[8:9], s21 +; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s19 ; GFX9-NEXT: s_cmp_lg_u32 s26, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[16:17], s[8:9] ; GFX9-NEXT: s_cmp_lg_u32 s27, 0 @@ -7367,222 +7446,227 @@ define amdgpu_ps <2 x i128> 
@s_fshl_v2i128(<2 x i128> inreg %lhs, <2 x i128> inr ; GFX9-NEXT: s_cmp_lg_u32 s26, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX9-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX9-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] -; GFX9-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] -; GFX9-NEXT: s_sub_i32 s11, s8, 64 -; GFX9-NEXT: s_sub_i32 s9, 64, s8 +; GFX9-NEXT: s_and_b32 s8, s20, 0x7f +; GFX9-NEXT: s_or_b64 s[0:1], s[22:23], s[0:1] +; GFX9-NEXT: s_sub_i32 s19, s8, 64 +; GFX9-NEXT: s_sub_i32 s10, 64, s8 ; GFX9-NEXT: s_cmp_lt_u32 s8, 64 -; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], s8 -; GFX9-NEXT: s_lshr_b64 s[18:19], s[4:5], s9 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[6:7], s8 -; GFX9-NEXT: s_or_b64 s[8:9], s[18:19], s[8:9] -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_cselect_b32 s22, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[6:7], s20 +; GFX9-NEXT: s_lshl_b64 s[8:9], s[4:5], s20 +; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 ; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX9-NEXT: s_cselect_b64 s[4:5], s[10:11], s[4:5] +; GFX9-NEXT: s_cmp_lg_u32 s22, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX9-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX9-NEXT: s_lshl_b32 s23, s14, 31 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] -; GFX9-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 -; GFX9-NEXT: s_sub_i32 s18, s10, 64 -; GFX9-NEXT: s_sub_i32 s14, 64, s10 -; GFX9-NEXT: s_cmp_lt_u32 s10, 64 +; GFX9-NEXT: s_lshl_b32 s19, s14, 31 +; GFX9-NEXT: s_andn2_b32 s12, 0x7f, s20 +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19] +; GFX9-NEXT: s_lshr_b64 s[10:11], s[14:15], 1 +; GFX9-NEXT: s_not_b32 s14, s20 +; GFX9-NEXT: s_sub_i32 s18, s12, 64 +; GFX9-NEXT: s_sub_i32 s16, 64, s12 +; GFX9-NEXT: s_cmp_lt_u32 s12, 64 ; GFX9-NEXT: s_cselect_b32 s19, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, 0 +; GFX9-NEXT: s_cmp_eq_u32 s12, 0 ; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[12:13], s[8:9], s10 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 -; GFX9-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 -; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GFX9-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[10:11], s14 +; GFX9-NEXT: s_lshr_b64 s[14:15], s[4:5], s14 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[10:11], s16 +; GFX9-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] +; GFX9-NEXT: s_cselect_b64 s[10:11], s[14:15], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[12:13], 0 -; GFX9-NEXT: s_or_b64 s[4:5], s[16:17], s[4:5] -; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], 0 +; GFX9-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshl_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; 
GFX10-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] -; GFX10-NEXT: s_sub_i32 s17, s18, 64 -; GFX10-NEXT: s_sub_i32 s19, 64, s18 -; GFX10-NEXT: s_cmp_lt_u32 s18, 64 -; GFX10-NEXT: s_mov_b32 s22, 0 -; GFX10-NEXT: s_cselect_b32 s23, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s18, 0 +; GFX10-NEXT: s_and_b32 s17, s16, 0x7f +; GFX10-NEXT: s_mov_b32 s18, 0 +; GFX10-NEXT: s_sub_i32 s19, s17, 64 +; GFX10-NEXT: s_sub_i32 s21, 64, s17 +; GFX10-NEXT: s_cmp_lt_u32 s17, 64 ; GFX10-NEXT: s_cselect_b32 s28, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s19 -; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], s18 -; GFX10-NEXT: s_lshl_b64 s[18:19], s[0:1], s18 -; GFX10-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX10-NEXT: s_cmp_lg_u32 s23, 0 -; GFX10-NEXT: s_cselect_b64 s[18:19], s[18:19], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s17, 0 +; GFX10-NEXT: s_cselect_b32 s17, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[22:23], s[0:1], s21 +; GFX10-NEXT: s_lshl_b64 s[24:25], s[2:3], s16 +; GFX10-NEXT: s_lshl_b64 s[26:27], s[0:1], s16 +; GFX10-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 ; GFX10-NEXT: s_cmp_lg_u32 s28, 0 +; GFX10-NEXT: s_cselect_b64 s[24:25], s[26:27], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX10-NEXT: s_lshl_b32 s23, s10, 31 +; GFX10-NEXT: s_lshl_b32 s19, s10, 31 ; GFX10-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] -; GFX10-NEXT: s_sub_i32 s23, s16, 64 -; GFX10-NEXT: s_sub_i32 s17, 64, s16 -; GFX10-NEXT: s_cmp_lt_u32 s16, 64 +; GFX10-NEXT: s_andn2_b32 s10, 0x7f, s16 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX10-NEXT: s_not_b32 s19, s16 +; GFX10-NEXT: s_sub_i32 s21, s10, 64 +; GFX10-NEXT: s_sub_i32 s16, 64, s10 +; GFX10-NEXT: s_cmp_lt_u32 s10, 64 ; GFX10-NEXT: s_cselect_b32 s26, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s16, 0 +; GFX10-NEXT: s_cmp_eq_u32 s10, 0 ; GFX10-NEXT: s_cselect_b32 s27, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 -; GFX10-NEXT: s_lshl_b64 s[24:25], s[8:9], s17 -; GFX10-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 -; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25] -; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s19 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[8:9], s16 +; GFX10-NEXT: s_lshr_b64 s[22:23], s[8:9], s19 +; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s21 ; GFX10-NEXT: s_cmp_lg_u32 s26, 0 ; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s27, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s26, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 -; GFX10-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] +; GFX10-NEXT: s_cselect_b64 s[8:9], s[22:23], 0 +; GFX10-NEXT: s_and_b32 s10, s20, 0x7f +; GFX10-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX10-NEXT: s_or_b64 s[0:1], s[18:19], s[0:1] -; GFX10-NEXT: s_sub_i32 s11, s8, 64 -; GFX10-NEXT: s_sub_i32 s9, 64, s8 -; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: s_sub_i32 s19, s10, 64 +; GFX10-NEXT: s_sub_i32 s8, 64, s10 +; GFX10-NEXT: s_cmp_lt_u32 s10, 64 ; GFX10-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10-NEXT: s_lshr_b64 
s[16:17], s[4:5], s9 -; GFX10-NEXT: s_lshl_b64 s[18:19], s[6:7], s8 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 -; GFX10-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX10-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] +; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_cselect_b32 s22, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s20 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[4:5], s20 +; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s19 ; GFX10-NEXT: s_cmp_lg_u32 s21, 0 +; GFX10-NEXT: s_cselect_b64 s[10:11], s[16:17], 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX10-NEXT: s_cmp_lg_u32 s22, 0 ; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX10-NEXT: s_lshl_b32 s23, s14, 31 -; GFX10-NEXT: s_lshr_b64 s[12:13], s[14:15], 1 -; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] -; GFX10-NEXT: s_sub_i32 s18, s10, 64 -; GFX10-NEXT: s_sub_i32 s11, 64, s10 -; GFX10-NEXT: s_cmp_lt_u32 s10, 64 +; GFX10-NEXT: s_lshl_b32 s19, s14, 31 +; GFX10-NEXT: s_andn2_b32 s12, 0x7f, s20 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 +; GFX10-NEXT: s_not_b32 s16, s20 +; GFX10-NEXT: s_sub_i32 s18, s12, 64 +; GFX10-NEXT: s_sub_i32 s14, 64, s12 +; GFX10-NEXT: s_cmp_lt_u32 s12, 64 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_cmp_eq_u32 s12, 0 ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[14:15], s[4:5], s10 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[12:13], s11 -; GFX10-NEXT: s_lshr_b64 s[10:11], s[12:13], s10 -; GFX10-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GFX10-NEXT: s_lshr_b64 s[12:13], s[12:13], s18 +; GFX10-NEXT: s_lshr_b64 s[12:13], s[4:5], s16 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 +; GFX10-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[12:13], s[14:15], s[12:13] +; GFX10-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[12:13] +; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX10-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 +; GFX10-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshl_v2i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[16:17], 0x7f, s[16:17] -; GFX11-NEXT: s_sub_i32 s17, s18, 64 -; GFX11-NEXT: s_sub_i32 s19, 64, s18 -; GFX11-NEXT: s_cmp_lt_u32 s18, 64 -; GFX11-NEXT: s_mov_b32 s22, 0 -; GFX11-NEXT: s_cselect_b32 s23, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s18, 0 +; GFX11-NEXT: s_and_b32 s17, s16, 0x7f +; GFX11-NEXT: s_mov_b32 s18, 0 +; GFX11-NEXT: s_sub_i32 s19, s17, 64 +; GFX11-NEXT: s_sub_i32 s21, 64, s17 +; GFX11-NEXT: s_cmp_lt_u32 s17, 64 ; GFX11-NEXT: s_cselect_b32 s28, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[24:25], s[0:1], s19 -; GFX11-NEXT: s_lshl_b64 s[26:27], s[2:3], s18 -; GFX11-NEXT: s_lshl_b64 s[18:19], s[0:1], s18 -; GFX11-NEXT: 
s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s17 -; GFX11-NEXT: s_cmp_lg_u32 s23, 0 -; GFX11-NEXT: s_cselect_b64 s[18:19], s[18:19], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s17, 0 +; GFX11-NEXT: s_cselect_b32 s17, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[22:23], s[0:1], s21 +; GFX11-NEXT: s_lshl_b64 s[24:25], s[2:3], s16 +; GFX11-NEXT: s_lshl_b64 s[26:27], s[0:1], s16 +; GFX11-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 ; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_cselect_b64 s[24:25], s[26:27], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] ; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], 1 -; GFX11-NEXT: s_lshl_b32 s23, s10, 31 +; GFX11-NEXT: s_lshl_b32 s19, s10, 31 ; GFX11-NEXT: s_lshr_b64 s[8:9], s[10:11], 1 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] -; GFX11-NEXT: s_sub_i32 s23, s16, 64 -; GFX11-NEXT: s_sub_i32 s17, 64, s16 -; GFX11-NEXT: s_cmp_lt_u32 s16, 64 +; GFX11-NEXT: s_and_not1_b32 s10, 0x7f, s16 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[18:19] +; GFX11-NEXT: s_not_b32 s19, s16 +; GFX11-NEXT: s_sub_i32 s21, s10, 64 +; GFX11-NEXT: s_sub_i32 s16, 64, s10 +; GFX11-NEXT: s_cmp_lt_u32 s10, 64 ; GFX11-NEXT: s_cselect_b32 s26, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s16, 0 +; GFX11-NEXT: s_cmp_eq_u32 s10, 0 ; GFX11-NEXT: s_cselect_b32 s27, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s16 -; GFX11-NEXT: s_lshl_b64 s[24:25], s[8:9], s17 -; GFX11-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 -; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[24:25] -; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s23 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s19 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[8:9], s16 +; GFX11-NEXT: s_lshr_b64 s[22:23], s[8:9], s19 +; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s21 ; GFX11-NEXT: s_cmp_lg_u32 s26, 0 ; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX11-NEXT: s_cmp_lg_u32 s27, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[8:9] ; GFX11-NEXT: s_cmp_lg_u32 s26, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 -; GFX11-NEXT: s_and_not1_b64 s[10:11], 0x7f, s[20:21] +; GFX11-NEXT: s_cselect_b64 s[8:9], s[22:23], 0 +; GFX11-NEXT: s_and_b32 s10, s20, 0x7f +; GFX11-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX11-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX11-NEXT: s_or_b64 s[0:1], s[18:19], s[0:1] -; GFX11-NEXT: s_sub_i32 s11, s8, 64 -; GFX11-NEXT: s_sub_i32 s9, 64, s8 -; GFX11-NEXT: s_cmp_lt_u32 s8, 64 -; GFX11-NEXT: s_cselect_b32 s20, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: s_sub_i32 s19, s10, 64 +; GFX11-NEXT: s_sub_i32 s8, 64, s10 +; GFX11-NEXT: s_cmp_lt_u32 s10, 64 ; GFX11-NEXT: s_cselect_b32 s21, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[16:17], s[4:5], s9 -; GFX11-NEXT: s_lshl_b64 s[18:19], s[6:7], s8 -; GFX11-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 -; GFX11-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s11 -; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] +; GFX11-NEXT: s_cmp_eq_u32 s10, 0 +; GFX11-NEXT: s_cselect_b32 s22, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s20 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[4:5], s20 +; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX11-NEXT: s_lshl_b64 
s[4:5], s[4:5], s19 ; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_cselect_b64 s[10:11], s[16:17], 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX11-NEXT: s_cmp_lg_u32 s22, 0 ; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] ; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], 1 -; GFX11-NEXT: s_lshl_b32 s23, s14, 31 -; GFX11-NEXT: s_lshr_b64 s[12:13], s[14:15], 1 -; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[22:23] -; GFX11-NEXT: s_sub_i32 s18, s10, 64 -; GFX11-NEXT: s_sub_i32 s11, 64, s10 -; GFX11-NEXT: s_cmp_lt_u32 s10, 64 +; GFX11-NEXT: s_lshl_b32 s19, s14, 31 +; GFX11-NEXT: s_and_not1_b32 s12, 0x7f, s20 +; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[18:19] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], 1 +; GFX11-NEXT: s_not_b32 s16, s20 +; GFX11-NEXT: s_sub_i32 s18, s12, 64 +; GFX11-NEXT: s_sub_i32 s14, 64, s12 +; GFX11-NEXT: s_cmp_lt_u32 s12, 64 ; GFX11-NEXT: s_cselect_b32 s19, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s10, 0 +; GFX11-NEXT: s_cmp_eq_u32 s12, 0 ; GFX11-NEXT: s_cselect_b32 s20, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[14:15], s[4:5], s10 -; GFX11-NEXT: s_lshl_b64 s[16:17], s[12:13], s11 -; GFX11-NEXT: s_lshr_b64 s[10:11], s[12:13], s10 -; GFX11-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GFX11-NEXT: s_lshr_b64 s[12:13], s[12:13], s18 +; GFX11-NEXT: s_lshr_b64 s[12:13], s[4:5], s16 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[8:9], s14 +; GFX11-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX11-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s18 ; GFX11-NEXT: s_cmp_lg_u32 s19, 0 -; GFX11-NEXT: s_cselect_b64 s[12:13], s[14:15], s[12:13] +; GFX11-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] ; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[12:13] +; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] ; GFX11-NEXT: s_cmp_lg_u32 s19, 0 -; GFX11-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX11-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] +; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 +; GFX11-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %amt) ret <2 x i128> %result @@ -7592,56 +7676,54 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-LABEL: v_fshl_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX6-NEXT: v_not_b32_e32 v16, v16 -; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, 64, v23 -; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v23 -; GFX6-NEXT: v_lshr_b64 v[16:17], v[0:1], v16 -; GFX6-NEXT: v_lshl_b64 v[18:19], v[2:3], v23 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[0:1], v23 +; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v19 +; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v19 +; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19 +; GFX6-NEXT: v_lshl_b64 v[23:24], v[0:1], v19 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v25 -; GFX6-NEXT: v_or_b32_e32 v16, v16, v18 -; GFX6-NEXT: v_or_b32_e32 v17, v17, v19 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v21, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v19, 0, v22, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 -; GFX6-NEXT: 
v_cndmask_b32_e32 v21, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v22, v1, v3, vcc +; GFX6-NEXT: v_or_b32_e32 v17, v17, v21 +; GFX6-NEXT: v_or_b32_e32 v18, v18, v22 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX6-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc ; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], 1 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 31, v10 +; GFX6-NEXT: v_not_b32_e32 v8, v16 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], 1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v24 -; GFX6-NEXT: v_subrev_i32_e32 v23, vcc, 64, v24 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v24 +; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v8 +; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v23 +; GFX6-NEXT: v_subrev_i32_e32 v24, vcc, 64, v23 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v23 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v10 -; GFX6-NEXT: v_lshr_b64 v[16:17], v[2:3], v24 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v23 +; GFX6-NEXT: v_lshr_b64 v[16:17], v[2:3], v23 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v24 ; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX6-NEXT: v_or_b32_e32 v0, v18, v0 -; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX6-NEXT: v_not_b32_e32 v8, v20 ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX6-NEXT: v_or_b32_e32 v2, v18, v2 +; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 +; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX6-NEXT: v_or_b32_e32 v1, v19, v1 -; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v18 -; GFX6-NEXT: v_subrev_i32_e32 v20, vcc, 64, v18 +; GFX6-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX6-NEXT: v_subrev_i32_e32 v19, vcc, 64, v18 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v8 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18 ; GFX6-NEXT: v_lshl_b64 v[16:17], v[4:5], v18 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v20 +; GFX6-NEXT: v_lshl_b64 v[4:5], v[4:5], v19 ; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 @@ -7651,88 +7733,88 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; GFX6-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc ; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], 1 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 31, v14 +; GFX6-NEXT: v_not_b32_e32 v8, v20 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], 1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v19 -; GFX6-NEXT: v_subrev_i32_e32 v14, vcc, 64, v19 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v19 +; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX6-NEXT: v_sub_i32_e32 v10, 
vcc, 64, v14 +; GFX6-NEXT: v_subrev_i32_e32 v15, vcc, 64, v14 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v14 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v10 -; GFX6-NEXT: v_lshr_b64 v[12:13], v[6:7], v19 -; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v14 +; GFX6-NEXT: v_lshr_b64 v[12:13], v[6:7], v14 +; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v15 ; GFX6-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc -; GFX6-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v21, v0 +; GFX6-NEXT: v_or_b32_e32 v1, v22, v1 ; GFX6-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX6-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX6-NEXT: v_or_b32_e32 v6, v18, v6 -; GFX6-NEXT: v_or_b32_e32 v7, v20, v7 +; GFX6-NEXT: v_or_b32_e32 v7, v19, v7 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshl_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX8-NEXT: v_not_b32_e32 v16, v16 -; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, 64, v23 -; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v23 -; GFX8-NEXT: v_lshrrev_b64 v[16:17], v16, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[18:19], v23, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v23, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v19 +; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v19 +; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[23:24], v19, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1] -; GFX8-NEXT: v_or_b32_e32 v16, v16, v18 -; GFX8-NEXT: v_or_b32_e32 v17, v17, v19 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v21, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v19, 0, v22, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v21, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v22, v1, v3, vcc +; GFX8-NEXT: v_or_b32_e32 v17, v17, v21 +; GFX8-NEXT: v_or_b32_e32 v18, v18, v22 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v18, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc ; GFX8-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9] ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 31, v10 +; GFX8-NEXT: v_not_b32_e32 v8, v16 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11] -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v24 -; GFX8-NEXT: v_subrev_u32_e32 v23, vcc, 64, v24 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1] +; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v8 +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v23 +; GFX8-NEXT: v_subrev_u32_e32 v24, vcc, 64, v23 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v23, v[0:1] ; GFX8-NEXT: 
v_lshlrev_b64 v[10:11], v10, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[16:17], v24, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v23, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[16:17], v23, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v24, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v0, v18, v0 -; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX8-NEXT: v_not_b32_e32 v8, v20 ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX8-NEXT: v_or_b32_e32 v2, v18, v2 +; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX8-NEXT: v_or_b32_e32 v1, v19, v1 -; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v18 -; GFX8-NEXT: v_subrev_u32_e32 v20, vcc, 64, v18 +; GFX8-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, 64, v18 ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] ; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v20, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v19, v[4:5] ; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 @@ -7742,87 +7824,87 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; GFX8-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v20, v5, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v19, v5, v7, vcc ; GFX8-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 31, v14 +; GFX8-NEXT: v_not_b32_e32 v8, v20 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v19 -; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, 64, v19 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], v19, v[4:5] +; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v14 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, 64, v14 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[12:13], v19, v[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[6:7], v14, v[6:7] +; GFX8-NEXT: v_lshrrev_b64 v[12:13], v14, v[6:7] +; GFX8-NEXT: v_lshrrev_b64 v[6:7], v15, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc -; GFX8-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v21, v0 +; GFX8-NEXT: v_or_b32_e32 v1, 
v22, v1 ; GFX8-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX8-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX8-NEXT: v_or_b32_e32 v6, v18, v6 -; GFX8-NEXT: v_or_b32_e32 v7, v20, v7 +; GFX8-NEXT: v_or_b32_e32 v7, v19, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshl_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX9-NEXT: v_not_b32_e32 v16, v16 -; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX9-NEXT: v_sub_u32_e32 v16, 64, v23 -; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v23 -; GFX9-NEXT: v_lshrrev_b64 v[16:17], v16, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[18:19], v23, v[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[21:22], v23, v[0:1] +; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX9-NEXT: v_sub_u32_e32 v17, 64, v19 +; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v19 +; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1] -; GFX9-NEXT: v_or_b32_e32 v16, v16, v18 -; GFX9-NEXT: v_or_b32_e32 v17, v17, v19 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v21, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v19, 0, v22, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v16, v1, v17, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v0, v2, vcc +; GFX9-NEXT: v_or_b32_e32 v17, v17, v21 +; GFX9-NEXT: v_or_b32_e32 v18, v18, v22 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v21, 0, v23, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v22, 0, v24, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v17, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v17, v1, v18, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 1, v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v22, v16, v3, vcc +; GFX9-NEXT: v_not_b32_e32 v8, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v17, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11] +; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v8 ; GFX9-NEXT: v_lshl_or_b32 v1, v10, 31, v1 -; GFX9-NEXT: v_sub_u32_e32 v10, 64, v24 -; GFX9-NEXT: v_subrev_u32_e32 v23, 64, v24 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v24, v[0:1] +; GFX9-NEXT: v_sub_u32_e32 v10, 64, v23 +; GFX9-NEXT: v_subrev_u32_e32 v24, 64, v23 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v23, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[16:17], v24, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v23, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[16:17], v23, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v24, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 +; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] -; GFX9-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_or_b32_e32 v2, v18, v2 ; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX9-NEXT: v_not_b32_e32 v8, v20 -; GFX9-NEXT: v_or_b32_e32 v1, v19, v1 -; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; 
GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v18 -; GFX9-NEXT: v_subrev_u32_e32 v20, 64, v18 +; GFX9-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX9-NEXT: v_subrev_u32_e32 v19, 64, v18 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v20, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v19, v[4:5] ; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 @@ -7833,89 +7915,91 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v18, v4, v6, vcc ; GFX9-NEXT: v_lshrrev_b64 v[4:5], 1, v[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v20, v8, v7, vcc -; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] +; GFX9-NEXT: v_cndmask_b32_e32 v19, v8, v7, vcc +; GFX9-NEXT: v_not_b32_e32 v8, v20 ; GFX9-NEXT: v_lshl_or_b32 v5, v14, 31, v5 -; GFX9-NEXT: v_sub_u32_e32 v10, 64, v19 -; GFX9-NEXT: v_subrev_u32_e32 v14, 64, v19 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], v19, v[4:5] +; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] +; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX9-NEXT: v_sub_u32_e32 v10, 64, v14 +; GFX9-NEXT: v_subrev_u32_e32 v15, 64, v14 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], v14, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[12:13], v19, v[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[6:7], v14, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[12:13], v14, v[6:7] +; GFX9-NEXT: v_lshrrev_b64 v[6:7], v15, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v8, v8, v10 ; GFX9-NEXT: v_or_b32_e32 v9, v9, v11 -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v13, vcc -; GFX9-NEXT: v_or_b32_e32 v2, v21, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v22, v3 +; GFX9-NEXT: v_or_b32_e32 v0, v21, v0 +; GFX9-NEXT: v_or_b32_e32 v1, v22, v1 ; GFX9-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX9-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v18, v6 -; GFX9-NEXT: v_or_b32_e32 v7, v20, v7 +; GFX9-NEXT: v_or_b32_e32 v7, v19, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshl_v2i128: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v27, 0x7f, v16 -; GFX10-NEXT: v_not_b32_e32 v16, v16 +; GFX10-NEXT: v_not_b32_e32 v21, v16 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] ; GFX10-NEXT: v_sub_nc_u32_e32 v17, 64, v27 -; GFX10-NEXT: v_and_b32_e32 v28, 0x7f, v16 +; GFX10-NEXT: v_and_b32_e32 v28, 0x7f, v21 ; GFX10-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3] ; GFX10-NEXT: v_lshl_or_b32 v9, v10, 31, v9 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] ; GFX10-NEXT: v_lshrrev_b64 v[16:17], v17, v[0:1] -; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28 ; GFX10-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 -; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] +; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28 ; GFX10-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] ; 
GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 ; GFX10-NEXT: v_or_b32_e32 v18, v16, v18 ; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 ; GFX10-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v19, v17, v19 -; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v28 -; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo +; GFX10-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] +; GFX10-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v23, v23, v25 ; GFX10-NEXT: v_cndmask_b32_e32 v18, v0, v18, vcc_lo -; GFX10-NEXT: v_or_b32_e32 v0, v24, v26 -; GFX10-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v19, v1, v19, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v23, s4 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27 +; GFX10-NEXT: v_or_b32_e32 v24, v24, v26 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v28 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v27 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v28 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v17, v0, s4 ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] +; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v23, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, v17, v24, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v23, v19, v3, s4 ; GFX10-NEXT: v_and_b32_e32 v24, 0x7f, v20 -; GFX10-NEXT: v_cndmask_b32_e32 v23, v19, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v25, 0, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v16, v8, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v17, v9, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v0, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, v1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v9, s5 +; GFX10-NEXT: v_not_b32_e32 v16, v20 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX10-NEXT: v_not_b32_e32 v3, v20 ; GFX10-NEXT: v_or_b32_e32 v1, v22, v8 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] -; GFX10-NEXT: v_sub_nc_u32_e32 v11, 64, v24 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 64, v24 +; GFX10-NEXT: v_and_b32_e32 v22, 0x7f, v16 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX10-NEXT: v_and_b32_e32 v22, 0x7f, v3 ; GFX10-NEXT: v_lshlrev_b64 v[12:13], v24, v[6:7] -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 -; GFX10-NEXT: v_lshrrev_b64 v[10:11], v11, v[4:5] +; GFX10-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5] +; GFX10-NEXT: v_lshrrev_b64 v[10:11], v3, v[4:5] ; GFX10-NEXT: v_lshl_or_b32 v9, v14, 31, v9 ; GFX10-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] ; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v22 -; GFX10-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5] +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24 ; GFX10-NEXT: v_or_b32_e32 v12, v10, v12 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v22 @@ -7953,88 +8037,87 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v27, 0x7f, v16 -; GFX11-NEXT: v_not_b32_e32 v16, v16 +; GFX11-NEXT: v_not_b32_e32 v21, v16 ; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 -; GFX11-NEXT: v_and_b32_e32 v28, 0x7f, v16 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_and_b32_e32 v28, 0x7f, v21 +; GFX11-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshl_or_b32 v9, v10, 31, v9 ; GFX11-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] -; GFX11-NEXT: v_dual_cndmask_b32 v21, 0, v21 :: v_dual_cndmask_b32 v22, 0, v22 +; GFX11-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc_lo ; GFX11-NEXT: v_sub_nc_u32_e32 v17, 64, v27 ; GFX11-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3] -; GFX11-NEXT: v_sub_nc_u32_e32 v25, 64, v28 ; GFX11-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 -; GFX11-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] +; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshrrev_b64 v[16:17], v17, v[0:1] -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v28 -; GFX11-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] -; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v28 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_or_b32_e32 v19, v17, v19 ; GFX11-NEXT: v_or_b32_e32 v18, v16, v18 +; GFX11-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v19, v1, v19 :: v_dual_cndmask_b32 v18, v0, v18 +; GFX11-NEXT: v_sub_nc_u32_e32 v25, 64, v28 ; GFX11-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 -; GFX11-NEXT: v_or_b32_e32 v19, v17, v19 -; GFX11-NEXT: v_or_b32_e32 v23, v23, v25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v18, v0, v18, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v28 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] +; GFX11-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] ; GFX11-NEXT: v_lshrrev_b64 v[16:17], v16, v[10:11] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v19, v1, v19, vcc_lo -; GFX11-NEXT: v_or_b32_e32 v0, v24, v26 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27 +; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v28 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v23, v23, v25 +; GFX11-NEXT: v_or_b32_e32 v24, v24, v26 +; GFX11-NEXT: v_dual_cndmask_b32 v25, 0, v1 :: v_dual_cndmask_b32 v16, v16, v23 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v10, v17, v24, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v23, v19, v3, s0 ; GFX11-NEXT: v_and_b32_e32 v24, 0x7f, v20 -; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v23, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v0, s0 -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] -; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v23, v19, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v3, v16, v8, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v17, v9, s1 -; GFX11-NEXT: v_sub_nc_u32_e32 v11, 64, v24 -; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v25, 0, v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v8, v10, v9, s1 +; GFX11-NEXT: v_not_b32_e32 v16, v20 +; GFX11-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc_lo +; GFX11-NEXT: 
v_cmp_gt_u32_e32 vcc_lo, 64, v24 ; GFX11-NEXT: v_or_b32_e32 v0, v21, v3 -; GFX11-NEXT: v_not_b32_e32 v3, v20 ; GFX11-NEXT: v_or_b32_e32 v1, v22, v8 ; GFX11-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 64, v24 +; GFX11-NEXT: v_and_b32_e32 v22, 0x7f, v16 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX11-NEXT: v_lshrrev_b64 v[10:11], v11, v[4:5] ; GFX11-NEXT: v_lshlrev_b64 v[12:13], v24, v[6:7] ; GFX11-NEXT: v_lshlrev_b64 v[16:17], v24, v[4:5] -; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24 -; GFX11-NEXT: v_and_b32_e32 v22, 0x7f, v3 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v3, v[4:5] ; GFX11-NEXT: v_lshl_or_b32 v9, v14, 31, v9 ; GFX11-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] -; GFX11-NEXT: v_or_b32_e32 v12, v10, v12 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] -; GFX11-NEXT: v_or_b32_e32 v5, v11, v13 -; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo ; GFX11-NEXT: v_sub_nc_u32_e32 v20, 64, v22 +; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v22 +; GFX11-NEXT: v_or_b32_e32 v12, v10, v12 ; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v22 ; GFX11-NEXT: v_lshrrev_b64 v[18:19], v22, v[8:9] -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v22 -; GFX11-NEXT: v_cndmask_b32_e32 v12, v3, v12, vcc_lo ; GFX11-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] +; GFX11-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] +; GFX11-NEXT: v_or_b32_e32 v5, v11, v13 ; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[14:15] -; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc_lo -; GFX11-NEXT: v_lshrrev_b64 v[3:4], v22, v[14:15] +; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v22 -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v24 ; GFX11-NEXT: v_or_b32_e32 v16, v18, v20 ; GFX11-NEXT: v_or_b32_e32 v18, v19, v21 +; GFX11-NEXT: v_dual_cndmask_b32 v12, v3, v12 :: v_dual_cndmask_b32 v5, v4, v5 +; GFX11-NEXT: v_lshrrev_b64 v[3:4], v22, v[14:15] ; GFX11-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v16, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v24 ; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v18, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v5, v7, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v10, v8, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v8, v11, v9, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v3, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v4, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 8538dcabca924..58304d2072d7f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -347,49 +347,57 @@ define amdgpu_ps i8 @s_fshr_i8(i8 inreg %lhs, i8 inreg %rhs, i8 inreg %amt) { ; ; GFX8-LABEL: s_fshr_i8: ; GFX8: ; %bb.0: +; GFX8-NEXT: s_andn2_b32 s3, 7, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 7 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_and_b32 s3, s2, 7 -; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX8-NEXT: s_lshl_b32 s0, s0, 
s2 -; GFX8-NEXT: s_lshr_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s3 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i8: ; GFX9: ; %bb.0: +; GFX9-NEXT: s_andn2_b32 s3, 7, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 7 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_and_b32 s3, s2, 7 -; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s1, s1, s3 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i8: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_andn2_b32 s3, 7, s2 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_and_b32 s3, s2, 7 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_and_b32 s2, s2, 7 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i8: ; GFX11: ; %bb.0: +; GFX11-NEXT: s_and_not1_b32 s3, 7, s2 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_and_b32 s3, s2, 7 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_b32 s2, s2, 7 ; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_lshr_b32 s1, s1, s3 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_lshl_b32 s0, s0, s3 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: ; return to shader part epilog @@ -414,33 +422,33 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX8-LABEL: v_fshr_i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX8-NEXT: v_not_b32_e32 v2, v2 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v3, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 7, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v3, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshr_i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_not_b32_e32 v3, v2 -; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 @@ -451,9 +459,9 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) { ; GFX11-LABEL: v_fshr_i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_not_b32_e32 v3, v2 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 @@ -687,25 +695,29 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX8-LABEL: s_fshr_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshr_b32 s4, s1, 8 ; GFX8-NEXT: s_lshr_b32 s5, s2, 8 -; GFX8-NEXT: s_and_b32 s6, s2, 7 -; GFX8-NEXT: s_andn2_b32 s2, 7, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_andn2_b32 s6, 7, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 7 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshr_b32 s3, s0, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s0, s0, s6 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_andn2_b32 s2, 7, s5 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 -; GFX8-NEXT: s_lshr_b32 s1, s1, s6 -; GFX8-NEXT: s_lshl_b32 s2, s3, s2 -; GFX8-NEXT: s_and_b32 s3, s4, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s5, 7 +; GFX8-NEXT: s_lshl_b32 s1, s3, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, s5, 7 +; GFX8-NEXT: s_and_b32 s3, s4, 0xff ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_lshr_b32 s1, s3, s1 -; GFX8-NEXT: s_or_b32 s1, s2, s1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshr_b32 s2, s3, s2 +; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 @@ -714,25 +726,29 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX9-LABEL: s_fshr_v2i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshr_b32 s4, s1, 8 ; GFX9-NEXT: s_lshr_b32 s5, s2, 8 -; GFX9-NEXT: s_and_b32 s6, s2, 7 -; GFX9-NEXT: s_andn2_b32 s2, 7, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_andn2_b32 s6, 7, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 7 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s6 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_andn2_b32 s2, 7, s5 -; GFX9-NEXT: s_lshl_b32 s3, s3, 1 -; GFX9-NEXT: s_lshr_b32 s1, s1, s6 -; GFX9-NEXT: s_lshl_b32 s2, s3, s2 -; GFX9-NEXT: s_and_b32 s3, s4, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s5, 7 +; 
GFX9-NEXT: s_lshl_b32 s1, s3, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s5, 7 +; GFX9-NEXT: s_and_b32 s3, s4, 0xff ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX9-NEXT: s_lshr_b32 s1, s3, s1 -; GFX9-NEXT: s_or_b32 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshr_b32 s2, s3, s2 +; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 @@ -741,24 +757,28 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX10-LABEL: s_fshr_v2i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s4, s1, 8 +; GFX10-NEXT: s_andn2_b32 s5, 7, s2 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s5, s2, 8 -; GFX10-NEXT: s_and_b32 s6, s2, 7 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_lshr_b32 s4, s1, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_lshr_b32 s6, s2, 8 +; GFX10-NEXT: s_lshl_b32 s0, s0, s5 +; GFX10-NEXT: s_andn2_b32 s5, 7, s6 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_and_b32 s6, s6, 7 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_and_b32 s2, s5, 7 -; GFX10-NEXT: s_andn2_b32 s5, 7, s5 +; GFX10-NEXT: s_and_b32 s2, s2, 7 ; GFX10-NEXT: s_lshl_b32 s3, s3, 1 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_lshl_b32 s3, s3, s5 -; GFX10-NEXT: s_lshr_b32 s2, s4, s2 -; GFX10-NEXT: s_lshr_b32 s1, s1, s6 -; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_lshr_b32 s4, s4, s6 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 +; GFX10-NEXT: s_or_b32 s2, s3, s4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_and_b32 s1, s2, 0xff ; GFX10-NEXT: s_and_b32 s0, s0, 0xff @@ -768,24 +788,28 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; ; GFX11-LABEL: s_fshr_v2i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_lshr_b32 s4, s1, 8 +; GFX11-NEXT: s_and_not1_b32 s5, 7, s2 ; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s5, s2, 8 -; GFX11-NEXT: s_and_b32 s6, s2, 7 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_lshr_b32 s4, s1, 8 ; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: s_lshr_b32 s6, s2, 8 +; GFX11-NEXT: s_lshl_b32 s0, s0, s5 +; GFX11-NEXT: s_and_not1_b32 s5, 7, s6 ; GFX11-NEXT: s_and_b32 s4, s4, 0xff +; GFX11-NEXT: s_and_b32 s6, s6, 7 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_and_b32 s2, s5, 7 -; GFX11-NEXT: s_and_not1_b32 s5, 7, s5 +; GFX11-NEXT: s_and_b32 s2, s2, 7 ; GFX11-NEXT: s_lshl_b32 s3, s3, 1 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_lshl_b32 s3, s3, s5 -; GFX11-NEXT: s_lshr_b32 s2, s4, s2 -; GFX11-NEXT: s_lshr_b32 s1, s1, s6 -; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_lshr_b32 s4, s4, s6 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: s_or_b32 s2, s3, s4 ; GFX11-NEXT: s_or_b32 s0, s0, s1 ; GFX11-NEXT: s_and_b32 s1, s2, 0xff ; GFX11-NEXT: s_and_b32 s0, s0, 0xff @@ -832,23 +856,23 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX8-LABEL: v_fshr_v2i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX8-NEXT: v_not_b32_e32 v2, v2 +; GFX8-NEXT: v_xor_b32_e32 v6, -1, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 1, v0 +; GFX8-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_not_b32_e32 v2, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v6, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_xor_b32_e32 v2, -1, v5 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 7, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 1, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v2, v2, v3 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, v2, v1 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v5 +; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -857,23 +881,23 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX9-LABEL: v_fshr_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX9-NEXT: v_not_b32_e32 v2, v2 +; GFX9-NEXT: v_xor_b32_e32 v6, -1, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v0, 1, v0 +; GFX9-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_not_b32_e32 v2, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v6, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v5 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v1, 7, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, v2, v3 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, v2, v1 +; GFX9-NEXT: v_and_b32_e32 v2, 7, v5 +; GFX9-NEXT: v_lshrrev_b16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -885,20 +909,20 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX10-NEXT: 
v_lshrrev_b32_e32 v3, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v2 -; GFX10-NEXT: v_not_b32_e32 v2, v2 -; GFX10-NEXT: v_not_b32_e32 v7, v3 -; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX10-NEXT: v_lshrrev_b16 v3, v3, v5 -; GFX10-NEXT: v_lshlrev_b16 v4, v7, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v6, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX10-NEXT: v_lshlrev_b16 v4, v6, v4 +; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, v7, v0 ; GFX10-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, 0xff ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -912,22 +936,22 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX11-NEXT: v_and_b32_e32 v7, 7, v2 -; GFX11-NEXT: v_not_b32_e32 v2, v2 -; GFX11-NEXT: v_not_b32_e32 v6, v3 -; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_xor_b32_e32 v6, -1, v3 ; GFX11-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX11-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5 -; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_lshrrev_b16 v3, v3, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b16 v4, v6, v4 -; GFX11-NEXT: v_lshrrev_b16 v1, v7, v1 +; GFX11-NEXT: v_lshrrev_b16 v1, v2, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX11-NEXT: v_lshlrev_b16 v0, v7, v0 ; GFX11-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 @@ -997,50 +1021,58 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; ; GFX8-LABEL: s_fshr_v4i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_lshr_b32 s3, s0, 8 -; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_lshr_b32 s5, s0, 24 ; GFX8-NEXT: s_lshr_b32 s6, s1, 8 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshr_b32 s8, s1, 24 ; GFX8-NEXT: s_lshr_b32 s9, s2, 8 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 ; GFX8-NEXT: s_lshr_b32 s11, s2, 24 -; GFX8-NEXT: s_and_b32 s12, s2, 7 -; GFX8-NEXT: s_andn2_b32 s2, 7, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_andn2_b32 s12, 7, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 7 ; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 +; GFX8-NEXT: s_lshr_b32 s3, s0, 8 +; GFX8-NEXT: s_lshr_b32 s4, s0, 16 +; GFX8-NEXT: s_lshr_b32 s5, s0, 24 +; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; 
GFX8-NEXT: s_lshl_b32 s0, s0, s12 +; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_andn2_b32 s2, 7, s9 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 -; GFX8-NEXT: s_lshr_b32 s1, s1, s12 -; GFX8-NEXT: s_lshl_b32 s2, s3, s2 -; GFX8-NEXT: s_and_b32 s3, s6, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s9, 7 +; GFX8-NEXT: s_lshl_b32 s1, s3, 1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s2, s9, 7 +; GFX8-NEXT: s_and_b32 s3, s6, 0xff ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX8-NEXT: s_lshr_b32 s1, s3, s1 +; GFX8-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX8-NEXT: s_lshr_b32 s2, s3, s2 ; GFX8-NEXT: s_andn2_b32 s3, 7, s10 -; GFX8-NEXT: s_lshl_b32 s4, s4, 1 -; GFX8-NEXT: s_lshl_b32 s3, s4, s3 +; GFX8-NEXT: s_or_b32 s1, s1, s2 +; GFX8-NEXT: s_lshl_b32 s2, s4, 1 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_lshl_b32 s2, s2, s3 +; GFX8-NEXT: s_and_b32 s3, s10, 7 ; GFX8-NEXT: s_and_b32 s4, s7, 0xff -; GFX8-NEXT: s_or_b32 s1, s2, s1 -; GFX8-NEXT: s_and_b32 s2, s10, 7 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX8-NEXT: s_lshr_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s1, s1, 0xff -; GFX8-NEXT: s_or_b32 s2, s3, s2 -; GFX8-NEXT: s_and_b32 s3, s11, 7 +; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX8-NEXT: s_lshr_b32 s3, s4, s3 ; GFX8-NEXT: s_andn2_b32 s4, 7, s11 -; GFX8-NEXT: s_lshl_b32 s5, s5, 1 +; GFX8-NEXT: s_or_b32 s2, s2, s3 +; GFX8-NEXT: s_lshl_b32 s3, s5, 1 +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX8-NEXT: s_lshl_b32 s3, s3, s4 +; GFX8-NEXT: s_and_b32 s4, s11, 7 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_lshl_b32 s4, s5, s4 -; GFX8-NEXT: s_lshr_b32 s3, s8, s3 +; GFX8-NEXT: s_lshr_b32 s4, s8, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s2, 0xff -; GFX8-NEXT: s_or_b32 s3, s4, s3 +; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s3, 0xff @@ -1050,50 +1082,58 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; ; GFX9-LABEL: s_fshr_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 -; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_lshr_b32 s5, s0, 24 ; GFX9-NEXT: s_lshr_b32 s6, s1, 8 ; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 24 ; GFX9-NEXT: s_lshr_b32 s9, s2, 8 ; GFX9-NEXT: s_lshr_b32 s10, s2, 16 ; GFX9-NEXT: s_lshr_b32 s11, s2, 24 -; GFX9-NEXT: s_and_b32 s12, s2, 7 -; GFX9-NEXT: s_andn2_b32 s2, 7, s2 -; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_andn2_b32 s12, 7, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 7 ; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: s_lshr_b32 s5, s0, 24 +; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshl_b32 s0, s0, s12 +; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_andn2_b32 s2, 7, s9 -; GFX9-NEXT: s_lshl_b32 s3, s3, 1 -; GFX9-NEXT: s_lshr_b32 s1, s1, s12 -; GFX9-NEXT: s_lshl_b32 s2, s3, s2 -; GFX9-NEXT: s_and_b32 s3, s6, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s9, 7 +; GFX9-NEXT: s_lshl_b32 s1, s3, 1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshl_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s2, s9, 7 +; GFX9-NEXT: s_and_b32 s3, s6, 0xff ; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 
-; GFX9-NEXT: s_lshr_b32 s1, s3, s1 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_lshr_b32 s2, s3, s2 ; GFX9-NEXT: s_andn2_b32 s3, 7, s10 -; GFX9-NEXT: s_lshl_b32 s4, s4, 1 -; GFX9-NEXT: s_lshl_b32 s3, s4, s3 +; GFX9-NEXT: s_or_b32 s1, s1, s2 +; GFX9-NEXT: s_lshl_b32 s2, s4, 1 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX9-NEXT: s_lshl_b32 s2, s2, s3 +; GFX9-NEXT: s_and_b32 s3, s10, 7 ; GFX9-NEXT: s_and_b32 s4, s7, 0xff -; GFX9-NEXT: s_or_b32 s1, s2, s1 -; GFX9-NEXT: s_and_b32 s2, s10, 7 ; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX9-NEXT: s_lshr_b32 s2, s4, s2 -; GFX9-NEXT: s_and_b32 s1, s1, 0xff -; GFX9-NEXT: s_or_b32 s2, s3, s2 -; GFX9-NEXT: s_and_b32 s3, s11, 7 +; GFX9-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX9-NEXT: s_lshr_b32 s3, s4, s3 ; GFX9-NEXT: s_andn2_b32 s4, 7, s11 -; GFX9-NEXT: s_lshl_b32 s5, s5, 1 +; GFX9-NEXT: s_or_b32 s2, s2, s3 +; GFX9-NEXT: s_lshl_b32 s3, s5, 1 +; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: s_and_b32 s4, s11, 7 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff +; GFX9-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 -; GFX9-NEXT: s_lshl_b32 s4, s5, s4 -; GFX9-NEXT: s_lshr_b32 s3, s8, s3 +; GFX9-NEXT: s_lshr_b32 s4, s8, s4 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s2, 0xff -; GFX9-NEXT: s_or_b32 s3, s4, s3 +; GFX9-NEXT: s_or_b32 s3, s3, s4 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s3, 0xff @@ -1104,43 +1144,51 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX10-LABEL: s_fshr_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: s_lshr_b32 s3, s0, 8 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: s_lshr_b32 s5, s0, 24 ; GFX10-NEXT: s_lshr_b32 s7, s1, 16 ; GFX10-NEXT: s_lshr_b32 s8, s1, 24 ; GFX10-NEXT: s_lshr_b32 s9, s2, 8 ; GFX10-NEXT: s_lshr_b32 s10, s2, 16 ; GFX10-NEXT: s_lshr_b32 s11, s2, 24 -; GFX10-NEXT: s_and_b32 s12, s2, 7 -; GFX10-NEXT: s_andn2_b32 s2, 7, s2 +; GFX10-NEXT: s_andn2_b32 s12, 7, s2 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff -; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s6, s6, 0xff +; GFX10-NEXT: s_and_b32 s2, s2, 7 ; GFX10-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_and_b32 s2, s9, 7 -; GFX10-NEXT: s_andn2_b32 s9, 7, s9 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_lshr_b32 s3, s0, 8 +; GFX10-NEXT: s_lshr_b32 s1, s1, s2 +; GFX10-NEXT: s_andn2_b32 s2, 7, s9 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff +; GFX10-NEXT: s_and_b32 s9, s9, 7 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_lshl_b32 s0, s0, 1 +; GFX10-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX10-NEXT: s_lshl_b32 s3, s3, 1 +; GFX10-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX10-NEXT: s_lshr_b32 s1, s1, s12 -; GFX10-NEXT: s_lshl_b32 s3, s3, s9 -; GFX10-NEXT: s_lshr_b32 s2, s6, s2 -; GFX10-NEXT: s_and_b32 s6, s7, 0xff +; GFX10-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX10-NEXT: s_lshl_b32 s0, s0, s12 +; GFX10-NEXT: s_lshl_b32 s2, s3, s2 +; GFX10-NEXT: s_lshr_b32 s3, s6, s9 ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_or_b32 s1, s3, s2 -; GFX10-NEXT: s_and_b32 s2, s10, 7 -; GFX10-NEXT: s_andn2_b32 s3, 7, s10 -; GFX10-NEXT: s_lshl_b32 s4, s4, 1 +; GFX10-NEXT: s_or_b32 s1, s2, s3 +; GFX10-NEXT: s_andn2_b32 s2, 7, s10 +; GFX10-NEXT: s_lshl_b32 s3, s4, 1 +; GFX10-NEXT: s_and_b32 s4, s7, 0xff +; GFX10-NEXT: s_and_b32 s6, s10, 7 +; GFX10-NEXT: 
s_and_b32 s2, 0xffff, s2 +; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshr_b32 s2, s6, s2 -; GFX10-NEXT: s_andn2_b32 s4, 7, s11 -; GFX10-NEXT: s_lshl_b32 s5, s5, 1 +; GFX10-NEXT: s_lshl_b32 s2, s3, s2 +; GFX10-NEXT: s_lshr_b32 s3, s4, s6 +; GFX10-NEXT: s_lshl_b32 s4, s5, 1 +; GFX10-NEXT: s_andn2_b32 s5, 7, s11 ; GFX10-NEXT: s_and_b32 s6, s11, 7 -; GFX10-NEXT: s_lshl_b32 s4, s5, s4 +; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX10-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX10-NEXT: s_lshl_b32 s4, s4, s5 ; GFX10-NEXT: s_lshr_b32 s5, s8, s6 -; GFX10-NEXT: s_or_b32 s2, s3, s2 +; GFX10-NEXT: s_or_b32 s2, s2, s3 ; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_or_b32 s3, s4, s5 ; GFX10-NEXT: s_and_b32 s0, s0, 0xff @@ -1157,43 +1205,51 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX11-LABEL: s_fshr_v4i8: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_lshr_b32 s6, s1, 8 -; GFX11-NEXT: s_lshr_b32 s3, s0, 8 -; GFX11-NEXT: s_lshr_b32 s4, s0, 16 -; GFX11-NEXT: s_lshr_b32 s5, s0, 24 ; GFX11-NEXT: s_lshr_b32 s7, s1, 16 ; GFX11-NEXT: s_lshr_b32 s8, s1, 24 ; GFX11-NEXT: s_lshr_b32 s9, s2, 8 ; GFX11-NEXT: s_lshr_b32 s10, s2, 16 ; GFX11-NEXT: s_lshr_b32 s11, s2, 24 -; GFX11-NEXT: s_and_b32 s12, s2, 7 -; GFX11-NEXT: s_and_not1_b32 s2, 7, s2 +; GFX11-NEXT: s_and_not1_b32 s12, 7, s2 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff -; GFX11-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_and_b32 s2, s2, 7 ; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_lshl_b32 s0, s0, s2 -; GFX11-NEXT: s_and_b32 s2, s9, 7 -; GFX11-NEXT: s_and_not1_b32 s9, 7, s9 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_lshr_b32 s3, s0, 8 +; GFX11-NEXT: s_lshr_b32 s1, s1, s2 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s9 +; GFX11-NEXT: s_and_b32 s6, s6, 0xff +; GFX11-NEXT: s_and_b32 s9, s9, 7 +; GFX11-NEXT: s_lshr_b32 s4, s0, 16 +; GFX11-NEXT: s_lshr_b32 s5, s0, 24 +; GFX11-NEXT: s_lshl_b32 s0, s0, 1 +; GFX11-NEXT: s_and_b32 s12, 0xffff, s12 ; GFX11-NEXT: s_lshl_b32 s3, s3, 1 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX11-NEXT: s_lshr_b32 s1, s1, s12 -; GFX11-NEXT: s_lshl_b32 s3, s3, s9 -; GFX11-NEXT: s_lshr_b32 s2, s6, s2 -; GFX11-NEXT: s_and_b32 s6, s7, 0xff +; GFX11-NEXT: s_and_b32 s9, 0xffff, s9 +; GFX11-NEXT: s_lshl_b32 s0, s0, s12 +; GFX11-NEXT: s_lshl_b32 s2, s3, s2 +; GFX11-NEXT: s_lshr_b32 s3, s6, s9 ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s3, s2 -; GFX11-NEXT: s_and_b32 s2, s10, 7 -; GFX11-NEXT: s_and_not1_b32 s3, 7, s10 -; GFX11-NEXT: s_lshl_b32 s4, s4, 1 +; GFX11-NEXT: s_or_b32 s1, s2, s3 +; GFX11-NEXT: s_and_not1_b32 s2, 7, s10 +; GFX11-NEXT: s_lshl_b32 s3, s4, 1 +; GFX11-NEXT: s_and_b32 s4, s7, 0xff +; GFX11-NEXT: s_and_b32 s6, s10, 7 +; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 +; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX11-NEXT: s_lshl_b32 s3, s4, s3 -; GFX11-NEXT: s_lshr_b32 s2, s6, s2 -; GFX11-NEXT: s_and_not1_b32 s4, 7, s11 -; GFX11-NEXT: s_lshl_b32 s5, s5, 1 +; GFX11-NEXT: s_lshl_b32 s2, s3, s2 +; GFX11-NEXT: s_lshr_b32 s3, s4, s6 +; GFX11-NEXT: s_lshl_b32 s4, s5, 1 +; GFX11-NEXT: s_and_not1_b32 s5, 7, s11 ; GFX11-NEXT: s_and_b32 s6, s11, 7 -; GFX11-NEXT: s_lshl_b32 s4, s5, s4 +; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 +; GFX11-NEXT: s_and_b32 s6, 0xffff, s6 +; GFX11-NEXT: s_lshl_b32 s4, s4, s5 ; GFX11-NEXT: s_lshr_b32 s5, s8, s6 -; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: 
s_or_b32 s2, s2, s3 ; GFX11-NEXT: s_and_b32 s1, s1, 0xff ; GFX11-NEXT: s_or_b32 s3, s4, s5 ; GFX11-NEXT: s_and_b32 s0, s0, 0xff @@ -1272,40 +1328,41 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX8-LABEL: v_fshr_v4i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_not_b32_e32 v7, v2 -; GFX8-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX8-NEXT: v_lshlrev_b16_e32 v8, 1, v0 +; GFX8-NEXT: v_lshlrev_b16_e32 v6, v7, v6 +; GFX8-NEXT: v_and_b32_e32 v7, 7, v2 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v8 -; GFX8-NEXT: v_lshrrev_b16_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX8-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v5 -; GFX8-NEXT: v_not_b32_e32 v5, v5 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX8-NEXT: v_xor_b32_e32 v7, -1, v5 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 1, v3 -; GFX8-NEXT: v_lshlrev_b16_e32 v3, v5, v3 -; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX8-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX8-NEXT: v_lshlrev_b16_e32 v3, v7, v3 +; GFX8-NEXT: v_lshrrev_b16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX8-NEXT: v_mov_b32_e32 v7, -1 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, 7 +; GFX8-NEXT: v_mov_b32_e32 v4, 1 +; GFX8-NEXT: v_xor_b32_sdwa v9, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b16_sdwa v5, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX8-NEXT: v_mov_b32_e32 v8, 0xff -; GFX8-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX8-NEXT: v_mov_b32_e32 v9, 1 -; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX8-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX8-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX8-NEXT: v_lshlrev_b16_sdwa v10, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_lshlrev_b16_e32 v5, v9, v5 +; GFX8-NEXT: v_mov_b32_e32 v9, 7 +; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_xor_b32_sdwa v4, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX8-NEXT: v_lshlrev_b16_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_lshlrev_b16_e32 v7, v7, v10 -; GFX8-NEXT: v_lshrrev_b16_e32 v5, v5, v8 -; GFX8-NEXT: v_lshlrev_b16_e32 v0, v2, v0 -; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_3 -; GFX8-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX8-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX8-NEXT: v_and_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b16_e32 v8, v10, v8 +; GFX8-NEXT: v_lshlrev_b16_e32 v0, v4, v0 +; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX8-NEXT: v_or_b32_e32 v5, v5, v8 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1321,40 +1378,41 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX9-LABEL: v_fshr_v4i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_not_b32_e32 v7, v2 -; GFX9-NEXT: v_and_b32_e32 v6, 7, v2 +; GFX9-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v6, 1, v0 ; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX9-NEXT: v_lshlrev_b16_e32 v8, 1, v0 +; GFX9-NEXT: v_lshlrev_b16_e32 v6, v7, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 7, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v8 -; GFX9-NEXT: v_lshrrev_b16_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX9-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX9-NEXT: v_and_b32_e32 v7, 7, v5 -; GFX9-NEXT: v_not_b32_e32 v5, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX9-NEXT: v_xor_b32_e32 v7, -1, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 ; GFX9-NEXT: v_lshlrev_b16_e32 v3, 1, v3 -; GFX9-NEXT: v_lshlrev_b16_e32 v3, v5, v3 -; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 +; GFX9-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v3, v7, v3 +; GFX9-NEXT: v_lshrrev_b16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v7, -1 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, 7 -; GFX9-NEXT: v_not_b32_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-NEXT: v_mov_b32_e32 v9, 1 +; GFX9-NEXT: v_mov_b32_e32 v4, 1 +; GFX9-NEXT: v_xor_b32_sdwa v9, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_sdwa v5, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX9-NEXT: v_mov_b32_e32 v8, 0xff -; GFX9-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v7, 7, v7 -; GFX9-NEXT: v_lshlrev_b16_sdwa v10, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_not_b32_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-NEXT: v_lshlrev_b16_e32 v7, v7, v10 -; GFX9-NEXT: v_and_b32_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_lshrrev_b16_e32 v5, v5, v10 -; GFX9-NEXT: v_lshlrev_b16_e32 
v0, v2, v0 -; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 -; GFX9-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX9-NEXT: v_lshlrev_b16_e32 v5, v9, v5 +; GFX9-NEXT: v_mov_b32_e32 v9, 7 +; GFX9-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_xor_b32_sdwa v4, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v10, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v11, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v4, 7, v4 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b16_e32 v10, v10, v11 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v4, v0 +; GFX9-NEXT: v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_or_b32_e32 v5, v5, v10 ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 @@ -1371,45 +1429,46 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 -; GFX10-NEXT: v_not_b32_e32 v8, v2 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 +; GFX10-NEXT: v_mov_b32_e32 v3, -1 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX10-NEXT: v_not_b32_e32 v10, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX10-NEXT: v_mov_b32_e32 v3, 7 ; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 8, v1 +; GFX10-NEXT: v_mov_b32_e32 v14, 0xff +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 -; GFX10-NEXT: v_not_b32_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_not_b32_sdwa v14, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_and_b32_e32 v8, 0xff, v9 ; GFX10-NEXT: v_lshlrev_b16 v4, v10, v4 -; GFX10-NEXT: v_mov_b32_e32 v10, 0xff -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v1 +; GFX10-NEXT: v_xor_b32_sdwa v9, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_mov_b32_e32 v10, 7 +; GFX10-NEXT: v_xor_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_e32 v12, 7, v2 ; GFX10-NEXT: v_and_b32_e32 v13, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 -; GFX10-NEXT: v_and_b32_e32 v9, 0xff, v9 -; GFX10-NEXT: v_and_b32_sdwa v15, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 -; GFX10-NEXT: v_and_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_and_b32_e32 v10, 7, v14 +; GFX10-NEXT: v_and_b32_e32 v9, 7, v9 +; GFX10-NEXT: v_and_b32_sdwa v15, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-NEXT: v_and_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_lshlrev_b16 v7, 1, v7 -; GFX10-NEXT: 
v_and_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b16 v3, v5, v9 -; GFX10-NEXT: v_lshlrev_b16 v5, v8, v6 +; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 +; GFX10-NEXT: v_and_b32_sdwa v2, v2, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b16 v5, v5, v8 +; GFX10-NEXT: v_lshlrev_b16 v6, v9, v6 ; GFX10-NEXT: v_lshrrev_b16 v1, v15, v1 -; GFX10-NEXT: v_lshlrev_b16 v6, v10, v7 +; GFX10-NEXT: v_lshlrev_b16 v3, v3, v7 ; GFX10-NEXT: v_lshrrev_b16 v2, v2, v11 ; GFX10-NEXT: v_lshrrev_b16 v7, v12, v13 -; GFX10-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX10-NEXT: v_mov_b32_e32 v4, 8 -; GFX10-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX10-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX10-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, 8 +; GFX10-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX10-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v3 @@ -1427,29 +1486,29 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v13, 24, v2 ; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX11-NEXT: v_not_b32_e32 v12, v7 +; GFX11-NEXT: v_xor_b32_e32 v12, -1, v7 ; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX11-NEXT: v_and_b32_e32 v12, 7, v12 ; GFX11-NEXT: v_lshlrev_b16 v3, 1, v3 -; GFX11-NEXT: v_not_b32_e32 v14, v11 +; GFX11-NEXT: v_and_b32_e32 v12, 7, v12 +; GFX11-NEXT: v_xor_b32_e32 v14, -1, v11 ; GFX11-NEXT: v_lshrrev_b16 v6, v7, v6 -; GFX11-NEXT: v_not_b32_e32 v7, v13 +; GFX11-NEXT: v_xor_b32_e32 v7, -1, v13 ; GFX11-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; GFX11-NEXT: v_not_b32_e32 v10, v2 +; GFX11-NEXT: v_xor_b32_e32 v10, -1, v2 ; GFX11-NEXT: v_lshlrev_b16 v3, v12, v3 -; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX11-NEXT: v_and_b32_e32 v12, 7, v14 ; GFX11-NEXT: v_lshlrev_b16 v4, 1, v4 +; GFX11-NEXT: v_and_b32_e32 v12, 7, v14 +; GFX11-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8 -; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_lshlrev_b16 v5, 1, v5 +; GFX11-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX11-NEXT: v_and_b32_e32 v13, 7, v13 -; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX11-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX11-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX11-NEXT: v_lshlrev_b16 v4, v12, v4 @@ -5112,51 +5171,46 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) define amdgpu_ps i64 @s_fshr_i64(i64 inreg %lhs, i64 inreg %rhs, i64 inreg %amt) { ; GFX6-LABEL: s_fshr_i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 63 -; GFX6-NEXT: s_andn2_b64 s[4:5], 63, s[4:5] ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 -; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX6-NEXT: s_not_b32 s5, s4 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 +; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX6-NEXT: s_or_b64 s[0:1], 
s[0:1], s[2:3] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 63 -; GFX8-NEXT: s_andn2_b64 s[4:5], 63, s[4:5] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 -; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX8-NEXT: s_not_b32 s5, s4 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 +; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 63 -; GFX9-NEXT: s_andn2_b64 s[4:5], 63, s[4:5] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 -; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 +; GFX9-NEXT: s_not_b32 s5, s4 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 +; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_andn2_b64 s[6:7], 63, s[4:5] ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_and_b64 s[4:5], s[4:5], 63 -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 +; GFX10-NEXT: s_not_b32 s5, s4 ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_not1_b64 s[6:7], 63, s[4:5] ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_and_b64 s[4:5], s[4:5], 63 -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s6 +; GFX11-NEXT: s_not_b32 s5, s4 ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GFX11-NEXT: ; return to shader part epilog @@ -5233,12 +5287,12 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX6-LABEL: v_fshr_i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX6-NEXT: v_not_b32_e32 v4, v4 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; GFX6-NEXT: v_not_b32_e32 v5, v4 +; GFX6-NEXT: v_and_b32_e32 v5, 63, v5 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v4 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v5 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v5 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v4 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] @@ -5246,12 +5300,12 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX8-LABEL: v_fshr_i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX8-NEXT: v_not_b32_e32 v4, v4 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX8-NEXT: v_not_b32_e32 v5, v4 +; GFX8-NEXT: v_and_b32_e32 v5, 63, v5 ; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5259,12 +5313,12 @@ define i64 @v_fshr_i64(i64 %lhs, i64 %rhs, i64 %amt) { ; GFX9-LABEL: v_fshr_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v5, 63, v4 -; GFX9-NEXT: 
v_not_b32_e32 v4, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 63, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v5, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v5, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], v4, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5410,38 +5464,38 @@ define i64 @v_fshr_i64_48(i64 %lhs, i64 %rhs) { define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 %amt) { ; GFX6-LABEL: v_fshr_i64_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX6-NEXT: v_not_b32_e32 v0, v0 -; GFX6-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX6-NEXT: v_not_b32_e32 v1, v0 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: v_lshl_b64 v[0:1], s[0:1], v0 -; GFX6-NEXT: v_lshr_b64 v[2:3], s[2:3], v2 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 63, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v1 +; GFX6-NEXT: v_lshr_b64 v[3:4], s[2:3], v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_i64_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX8-NEXT: v_not_b32_e32 v1, v0 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3] -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_and_b32_e32 v1, 63, v1 +; GFX8-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[3:4], v0, s[2:3] +; GFX8-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_i64_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v2, 63, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX9-NEXT: v_not_b32_e32 v1, v0 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v0, s[0:1] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], v2, s[2:3] -; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX9-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-NEXT: v_and_b32_e32 v1, 63, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 63, v0 +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v1, s[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[3:4], v0, s[2:3] +; GFX9-NEXT: v_or_b32_e32 v0, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v1, v2, v4 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_i64_ssv: @@ -5478,43 +5532,43 @@ define amdgpu_ps <2 x float> @v_fshr_i64_ssv(i64 inreg %lhs, i64 inreg %rhs, i64 define amdgpu_ps <2 x float> @v_fshr_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg %amt) { ; GFX6-LABEL: v_fshr_i64_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] +; GFX6-NEXT: s_not_b32 s3, s2 +; GFX6-NEXT: s_and_b32 s2, s2, 63 ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s4 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s2 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_i64_svs: ; GFX8: ; %bb.0: -; GFX8-NEXT: 
s_and_b64 s[4:5], s[2:3], 63 -; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] +; GFX8-NEXT: s_not_b32 s3, s2 +; GFX8-NEXT: s_and_b32 s2, s2, 63 ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1] +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_i64_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] +; GFX9-NEXT: s_not_b32 s3, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 63 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], s2, v[0:1] +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s3 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_i64_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX10-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX10-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX10-NEXT: s_and_b32 s3, s2, 63 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] +; GFX10-NEXT: s_not_b32 s2, s2 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 @@ -5522,10 +5576,10 @@ define amdgpu_ps <2 x float> @v_fshr_i64_svs(i64 inreg %lhs, i64 %rhs, i64 inreg ; ; GFX11-LABEL: v_fshr_i64_svs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX11-NEXT: s_and_not1_b64 s[2:3], 63, s[2:3] -; GFX11-NEXT: v_lshrrev_b64 v[0:1], s4, v[0:1] +; GFX11-NEXT: s_and_b32 s3, s2, 63 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX11-NEXT: v_lshrrev_b64 v[0:1], s3, v[0:1] +; GFX11-NEXT: s_not_b32 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) @@ -5542,10 +5596,9 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; GFX6-LABEL: v_fshr_i64_vss: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 -; GFX6-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX6-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s2 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX6-NEXT: s_andn2_b32 s3, 63, s2 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], s3 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX6-NEXT: ; return to shader part epilog @@ -5553,10 +5606,9 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; GFX8-LABEL: v_fshr_i64_vss: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX8-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX8-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], s2, v[0:1] -; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX8-NEXT: s_andn2_b32 s3, 63, s2 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] +; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX8-NEXT: ; return to shader part epilog @@ -5564,10 +5616,9 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; GFX9-LABEL: v_fshr_i64_vss: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; 
GFX9-NEXT: s_and_b64 s[4:5], s[2:3], 63 -; GFX9-NEXT: s_andn2_b64 s[2:3], 63, s[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[0:1], s2, v[0:1] -; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 +; GFX9-NEXT: s_andn2_b32 s3, 63, s2 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] +; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX9-NEXT: ; return to shader part epilog @@ -5575,10 +5626,9 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; GFX10-LABEL: v_fshr_i64_vss: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: s_andn2_b64 s[4:5], 63, s[2:3] -; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], 63 +; GFX10-NEXT: s_andn2_b32 s3, 63, s2 ; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX10-NEXT: ; return to shader part epilog @@ -5586,13 +5636,12 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg ; GFX11-LABEL: v_fshr_i64_vss: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: s_and_not1_b64 s[4:5], 63, s[2:3] -; GFX11-NEXT: s_and_b64 s[2:3], s[2:3], 63 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_and_not1_b32 s3, 63, s2 ; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], s4, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], s3, v[0:1] ; GFX11-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.fshr.i64(i64 %lhs, i64 %rhs, i64 %amt) @@ -5603,63 +5652,55 @@ define amdgpu_ps <2 x float> @v_fshr_i64_vss(i64 %lhs, i64 inreg %rhs, i64 inreg define amdgpu_ps <2 x i64> @s_fshr_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs, <2 x i64> inreg %amt) { ; GFX6-LABEL: s_fshr_v2i64: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] ; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 +; GFX6-NEXT: s_not_b32 s9, s8 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX6-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX6-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 +; GFX6-NEXT: s_not_b32 s4, s10 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 ; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_v2i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX8-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] ; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 +; GFX8-NEXT: s_not_b32 s9, s8 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX8-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX8-NEXT: s_andn2_b64 
s[8:9], 63, s[10:11] ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 +; GFX8-NEXT: s_not_b32 s4, s10 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 ; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_v2i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[12:13], s[8:9], 63 -; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[8:9] ; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s12 +; GFX9-NEXT: s_not_b32 s9, s8 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; GFX9-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX9-NEXT: s_and_b64 s[4:5], s[10:11], 63 -; GFX9-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s4 +; GFX9-NEXT: s_not_b32 s4, s10 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_v2i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_andn2_b64 s[12:13], 63, s[8:9] -; GFX10-NEXT: s_and_b64 s[8:9], s[8:9], 63 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 -; GFX10-NEXT: s_andn2_b64 s[8:9], 63, s[10:11] +; GFX10-NEXT: s_not_b32 s9, s8 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_and_b64 s[10:11], s[10:11], 63 -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 -; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX10-NEXT: s_not_b32 s9, s10 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], s9 ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s10 ; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] @@ -5667,15 +5708,13 @@ define amdgpu_ps <2 x i64> @s_fshr_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg % ; ; GFX11-LABEL: s_fshr_v2i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_not1_b64 s[12:13], 63, s[8:9] -; GFX11-NEXT: s_and_b64 s[8:9], s[8:9], 63 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 -; GFX11-NEXT: s_and_not1_b64 s[8:9], 63, s[10:11] +; GFX11-NEXT: s_not_b32 s9, s8 ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_and_b64 s[10:11], s[10:11], 63 -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 -; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s9 +; GFX11-NEXT: s_not_b32 s9, s10 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s9 ; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s10 ; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] @@ -5688,18 +5727,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX6-LABEL: v_fshr_v2i64: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 1 +; GFX6-NEXT: v_not_b32_e32 v9, v8 +; GFX6-NEXT: v_and_b32_e32 v9, 63, v9 ; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v9 -; GFX6-NEXT: v_not_b32_e32 v8, v10 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v9 +; GFX6-NEXT: v_lshr_b64 
v[4:5], v[4:5], v8 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX6-NEXT: v_not_b32_e32 v4, v10 +; GFX6-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 ; GFX6-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX6-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], v8 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[6:7], v4 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v6 @@ -5709,18 +5748,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX8-LABEL: v_fshr_v2i64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX8-NEXT: v_not_b32_e32 v9, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 63, v9 ; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] -; GFX8-NEXT: v_not_b32_e32 v8, v10 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_not_b32_e32 v4, v10 +; GFX8-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX8-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] ; GFX8-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX8-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3] ; GFX8-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v6 @@ -5730,18 +5769,18 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { ; GFX9-LABEL: v_fshr_v2i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v9, 63, v8 -; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX9-NEXT: v_not_b32_e32 v9, v8 +; GFX9-NEXT: v_and_b32_e32 v9, 63, v9 ; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1] -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v9, v[4:5] -; GFX9-NEXT: v_not_b32_e32 v8, v10 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v8, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX9-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v10 +; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] ; GFX9-NEXT: v_and_b32_e32 v4, 63, v10 -; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 -; GFX9-NEXT: v_lshlrev_b64 v[2:3], v8, v[2:3] ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v4, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v6 @@ -5800,231 +5839,237 @@ define <2 x i64> @v_fshr_v2i64(<2 x i64> %lhs, <2 x i64> %rhs, <2 x i64> %amt) { define amdgpu_ps i128 @s_fshr_i128(i128 inreg %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: s_fshr_i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX6-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX6-NEXT: s_lshl_b64 s[12:13], s[0:1], 1 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX6-NEXT: s_lshr_b32 s0, s1, 31 ; GFX6-NEXT: s_mov_b32 s1, 0 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX6-NEXT: s_sub_i32 s11, s8, 64 -; GFX6-NEXT: s_sub_i32 s9, 64, s8 -; GFX6-NEXT: s_cmp_lt_u32 s8, 64 -; GFX6-NEXT: s_cselect_b32 s16, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 +; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s8 +; GFX6-NEXT: s_not_b32 s9, s8 +; GFX6-NEXT: s_sub_i32 s16, s2, 64 +; GFX6-NEXT: s_sub_i32 s12, 
64, s2 +; GFX6-NEXT: s_cmp_lt_u32 s2, 64 ; GFX6-NEXT: s_cselect_b32 s17, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[12:13], s8 -; GFX6-NEXT: s_lshr_b64 s[14:15], s[12:13], s9 -; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX6-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] -; GFX6-NEXT: s_lshl_b64 s[12:13], s[12:13], s11 -; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[12:13] +; GFX6-NEXT: s_cmp_eq_u32 s2, 0 +; GFX6-NEXT: s_cselect_b32 s18, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[12:13], s[10:11], s12 +; GFX6-NEXT: s_lshl_b64 s[14:15], s[0:1], s9 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[10:11], s9 +; GFX6-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX6-NEXT: s_lshl_b64 s[10:11], s[10:11], s16 ; GFX6-NEXT: s_cmp_lg_u32 s17, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9] -; GFX6-NEXT: s_sub_i32 s14, s10, 64 -; GFX6-NEXT: s_sub_i32 s12, 64, s10 -; GFX6-NEXT: s_cmp_lt_u32 s10, 64 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] +; GFX6-NEXT: s_cmp_lg_u32 s18, 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11] +; GFX6-NEXT: s_and_b32 s0, s8, 0x7f +; GFX6-NEXT: s_sub_i32 s14, s0, 64 +; GFX6-NEXT: s_sub_i32 s12, 64, s0 +; GFX6-NEXT: s_cmp_lt_u32 s0, 64 ; GFX6-NEXT: s_cselect_b32 s15, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s10, 0 +; GFX6-NEXT: s_cmp_eq_u32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s16, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s10 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[6:7], s8 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 ; GFX6-NEXT: s_lshl_b64 s[12:13], s[6:7], s12 -; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; GFX6-NEXT: s_cmp_lg_u32 s15, 0 -; GFX6-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7] +; GFX6-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] ; GFX6-NEXT: s_cmp_lg_u32 s16, 0 ; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: s_cmp_lg_u32 s15, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], 0 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] -; GFX6-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] +; GFX6-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX8-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX8-NEXT: s_lshl_b64 s[12:13], s[0:1], 1 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_lshr_b32 s0, s1, 31 ; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX8-NEXT: s_sub_i32 s11, s8, 64 -; GFX8-NEXT: s_sub_i32 s9, 64, s8 -; GFX8-NEXT: s_cmp_lt_u32 s8, 64 -; GFX8-NEXT: s_cselect_b32 s16, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 +; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s8 +; GFX8-NEXT: s_not_b32 s9, s8 +; GFX8-NEXT: s_sub_i32 s16, s2, 64 +; GFX8-NEXT: s_sub_i32 s12, 64, s2 +; GFX8-NEXT: s_cmp_lt_u32 s2, 64 ; GFX8-NEXT: s_cselect_b32 s17, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[12:13], s8 -; GFX8-NEXT: s_lshr_b64 s[14:15], s[12:13], s9 -; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX8-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] -; GFX8-NEXT: s_lshl_b64 s[12:13], s[12:13], s11 -; GFX8-NEXT: s_cmp_lg_u32 s16, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[12:13] +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: s_cselect_b32 s18, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[12:13], s[10:11], s12 +; GFX8-NEXT: 
s_lshl_b64 s[14:15], s[0:1], s9 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], s9 +; GFX8-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX8-NEXT: s_lshl_b64 s[10:11], s[10:11], s16 ; GFX8-NEXT: s_cmp_lg_u32 s17, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9] -; GFX8-NEXT: s_sub_i32 s14, s10, 64 -; GFX8-NEXT: s_sub_i32 s12, 64, s10 -; GFX8-NEXT: s_cmp_lt_u32 s10, 64 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] +; GFX8-NEXT: s_cmp_lg_u32 s18, 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11] +; GFX8-NEXT: s_and_b32 s0, s8, 0x7f +; GFX8-NEXT: s_sub_i32 s14, s0, 64 +; GFX8-NEXT: s_sub_i32 s12, 64, s0 +; GFX8-NEXT: s_cmp_lt_u32 s0, 64 ; GFX8-NEXT: s_cselect_b32 s15, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s10, 0 +; GFX8-NEXT: s_cmp_eq_u32 s0, 0 ; GFX8-NEXT: s_cselect_b32 s16, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], s10 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], s8 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 ; GFX8-NEXT: s_lshl_b64 s[12:13], s[6:7], s12 -; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; GFX8-NEXT: s_cmp_lg_u32 s15, 0 -; GFX8-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7] +; GFX8-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] ; GFX8-NEXT: s_cmp_lg_u32 s16, 0 ; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; GFX8-NEXT: s_cmp_lg_u32 s15, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], 0 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] -; GFX8-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] +; GFX8-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX9-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] -; GFX9-NEXT: s_lshl_b64 s[12:13], s[0:1], 1 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_lshr_b32 s0, s1, 31 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX9-NEXT: s_sub_i32 s11, s8, 64 -; GFX9-NEXT: s_sub_i32 s9, 64, s8 -; GFX9-NEXT: s_cmp_lt_u32 s8, 64 -; GFX9-NEXT: s_cselect_b32 s16, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 +; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s8 +; GFX9-NEXT: s_not_b32 s9, s8 +; GFX9-NEXT: s_sub_i32 s16, s2, 64 +; GFX9-NEXT: s_sub_i32 s12, 64, s2 +; GFX9-NEXT: s_cmp_lt_u32 s2, 64 ; GFX9-NEXT: s_cselect_b32 s17, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[12:13], s8 -; GFX9-NEXT: s_lshr_b64 s[14:15], s[12:13], s9 -; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX9-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] -; GFX9-NEXT: s_lshl_b64 s[12:13], s[12:13], s11 -; GFX9-NEXT: s_cmp_lg_u32 s16, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[12:13] +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_cselect_b32 s18, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[12:13], s[10:11], s12 +; GFX9-NEXT: s_lshl_b64 s[14:15], s[0:1], s9 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], s9 +; GFX9-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] +; GFX9-NEXT: s_lshl_b64 s[10:11], s[10:11], s16 ; GFX9-NEXT: s_cmp_lg_u32 s17, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[0:1], s[8:9] -; GFX9-NEXT: s_sub_i32 s14, s10, 64 -; GFX9-NEXT: s_sub_i32 s12, 64, s10 -; GFX9-NEXT: s_cmp_lt_u32 s10, 64 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], s[10:11] +; GFX9-NEXT: 
s_and_b32 s0, s8, 0x7f +; GFX9-NEXT: s_sub_i32 s14, s0, 64 +; GFX9-NEXT: s_sub_i32 s12, 64, s0 +; GFX9-NEXT: s_cmp_lt_u32 s0, 64 ; GFX9-NEXT: s_cselect_b32 s15, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, 0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: s_cselect_b32 s16, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s10 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[4:5], s10 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s8 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[4:5], s8 ; GFX9-NEXT: s_lshl_b64 s[12:13], s[6:7], s12 -; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; GFX9-NEXT: s_cmp_lg_u32 s15, 0 -; GFX9-NEXT: s_cselect_b64 s[6:7], s[10:11], s[6:7] +; GFX9-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7] ; GFX9-NEXT: s_cmp_lg_u32 s15, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], 0 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[4:5] -; GFX9-NEXT: s_or_b64 s[2:3], s[8:9], s[6:7] +; GFX9-NEXT: s_or_b64 s[2:3], s[10:11], s[6:7] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX10-NEXT: s_andn2_b64 s[8:9], 0x7f, s[8:9] ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_lshr_b32 s12, s1, 31 -; GFX10-NEXT: s_mov_b32 s13, 0 +; GFX10-NEXT: s_lshr_b32 s10, s1, 31 +; GFX10-NEXT: s_mov_b32 s11, 0 +; GFX10-NEXT: s_andn2_b32 s9, 0x7f, s8 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13] -; GFX10-NEXT: s_sub_i32 s11, s8, 64 -; GFX10-NEXT: s_sub_i32 s9, 64, s8 -; GFX10-NEXT: s_cmp_lt_u32 s8, 64 -; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s8, 0 +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11] +; GFX10-NEXT: s_not_b32 s14, s8 +; GFX10-NEXT: s_sub_i32 s16, s9, 64 +; GFX10-NEXT: s_sub_i32 s10, 64, s9 +; GFX10-NEXT: s_cmp_lt_u32 s9, 64 ; GFX10-NEXT: s_cselect_b32 s17, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[12:13], s[0:1], s9 -; GFX10-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX10-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 -; GFX10-NEXT: s_cmp_lg_u32 s16, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s9, 0 +; GFX10-NEXT: s_cselect_b32 s9, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[10:11], s[0:1], s10 +; GFX10-NEXT: s_lshl_b64 s[12:13], s[2:3], s14 +; GFX10-NEXT: s_lshl_b64 s[14:15], s[0:1], s14 +; GFX10-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s16 ; GFX10-NEXT: s_cmp_lg_u32 s17, 0 +; GFX10-NEXT: s_cselect_b64 s[12:13], s[14:15], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s9, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s14, s10, 64 -; GFX10-NEXT: s_sub_i32 s11, 64, s10 -; GFX10-NEXT: s_cmp_lt_u32 s10, 64 +; GFX10-NEXT: s_and_b32 s0, s8, 0x7f +; GFX10-NEXT: s_sub_i32 s14, s0, 64 +; GFX10-NEXT: s_sub_i32 s9, 64, s0 +; GFX10-NEXT: s_cmp_lt_u32 s0, 64 ; GFX10-NEXT: s_cselect_b32 s15, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s10 -; GFX10-NEXT: s_lshl_b64 s[12:13], s[6:7], s11 -; GFX10-NEXT: s_lshr_b64 s[10:11], s[6:7], s10 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], s8 +; 
GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s9 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[6:7], s8 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; GFX10-NEXT: s_cmp_lg_u32 s15, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] ; GFX10-NEXT: s_cmp_lg_u32 s16, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s15, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[10:11], 0 -; GFX10-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 +; GFX10-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[10:11], s[8:9], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[8:9], 0x7f, s[8:9] ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_lshr_b32 s12, s1, 31 -; GFX11-NEXT: s_mov_b32 s13, 0 +; GFX11-NEXT: s_lshr_b32 s10, s1, 31 +; GFX11-NEXT: s_mov_b32 s11, 0 +; GFX11-NEXT: s_and_not1_b32 s9, 0x7f, s8 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[12:13] -; GFX11-NEXT: s_sub_i32 s11, s8, 64 -; GFX11-NEXT: s_sub_i32 s9, 64, s8 -; GFX11-NEXT: s_cmp_lt_u32 s8, 64 -; GFX11-NEXT: s_cselect_b32 s16, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s8, 0 +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[10:11] +; GFX11-NEXT: s_not_b32 s14, s8 +; GFX11-NEXT: s_sub_i32 s16, s9, 64 +; GFX11-NEXT: s_sub_i32 s10, 64, s9 +; GFX11-NEXT: s_cmp_lt_u32 s9, 64 ; GFX11-NEXT: s_cselect_b32 s17, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[12:13], s[0:1], s9 -; GFX11-NEXT: s_lshl_b64 s[14:15], s[2:3], s8 -; GFX11-NEXT: s_lshl_b64 s[8:9], s[0:1], s8 -; GFX11-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s11 -; GFX11-NEXT: s_cmp_lg_u32 s16, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[12:13], s[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s9, 0 +; GFX11-NEXT: s_cselect_b32 s9, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[10:11], s[0:1], s10 +; GFX11-NEXT: s_lshl_b64 s[12:13], s[2:3], s14 +; GFX11-NEXT: s_lshl_b64 s[14:15], s[0:1], s14 +; GFX11-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s16 ; GFX11-NEXT: s_cmp_lg_u32 s17, 0 +; GFX11-NEXT: s_cselect_b64 s[12:13], s[14:15], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[10:11], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX11-NEXT: s_sub_i32 s14, s10, 64 -; GFX11-NEXT: s_sub_i32 s11, 64, s10 -; GFX11-NEXT: s_cmp_lt_u32 s10, 64 +; GFX11-NEXT: s_and_b32 s0, s8, 0x7f +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s14, s0, 64 +; GFX11-NEXT: s_sub_i32 s9, 64, s0 +; GFX11-NEXT: s_cmp_lt_u32 s0, 64 ; GFX11-NEXT: s_cselect_b32 s15, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s10, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: s_cselect_b32 s16, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], s10 -; GFX11-NEXT: s_lshl_b64 s[12:13], s[6:7], s11 -; GFX11-NEXT: s_lshr_b64 s[10:11], s[6:7], s10 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[12:13] +; GFX11-NEXT: s_lshr_b64 s[0:1], s[4:5], s8 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s9 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[6:7], s8 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[10:11] ; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s14 ; GFX11-NEXT: s_cmp_lg_u32 s15, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7] ; GFX11-NEXT: s_cmp_lg_u32 s16, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[4:5], s[0:1] ; GFX11-NEXT: s_cmp_lg_u32 s15, 0 -; GFX11-NEXT: 
s_cselect_b64 s[4:5], s[10:11], 0 -; GFX11-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], 0 +; GFX11-NEXT: s_or_b64 s[0:1], s[12:13], s[0:1] ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt) @@ -6035,29 +6080,29 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-LABEL: v_fshr_i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX6-NEXT: v_not_b32_e32 v8, v8 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[0:1], 1 +; GFX6-NEXT: v_lshl_b64 v[9:10], v[0:1], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX6-NEXT: v_not_b32_e32 v0, v8 +; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v15 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v0 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v15 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[9:10], v0 +; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v15 ; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v15 -; GFX6-NEXT: v_lshl_b64 v[12:13], v[8:9], v15 -; GFX6-NEXT: v_or_b32_e32 v10, v0, v10 -; GFX6-NEXT: v_or_b32_e32 v11, v1, v11 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[8:9], v16 +; GFX6-NEXT: v_lshl_b64 v[13:14], v[9:10], v15 +; GFX6-NEXT: v_or_b32_e32 v11, v0, v11 +; GFX6-NEXT: v_or_b32_e32 v12, v1, v12 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[9:10], v16 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 -; GFX6-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v13, 0, v14, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 -; GFX6-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v11, v1, v3, vcc +; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX6-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v12, v1, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v14 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], v14 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[6:7], v2 @@ -6074,38 +6119,38 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v10, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v13, v1 -; GFX6-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX6-NEXT: v_or_b32_e32 v3, v12, v3 ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fshr_i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX8-NEXT: v_not_b32_e32 v8, v8 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v8 +; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v15 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 
v[10:11], v15, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10] +; GFX8-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] ; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v15 -; GFX8-NEXT: v_lshlrev_b64 v[12:13], v15, v[8:9] -; GFX8-NEXT: v_or_b32_e32 v10, v0, v10 -; GFX8-NEXT: v_or_b32_e32 v11, v1, v11 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[13:14], v15, v[9:10] +; GFX8-NEXT: v_or_b32_e32 v11, v0, v11 +; GFX8-NEXT: v_or_b32_e32 v12, v1, v12 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[9:10] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v13, 0, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, v1, v3, vcc +; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v12, v1, v3, vcc ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v14 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v14, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] @@ -6122,39 +6167,39 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v10, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v13, v1 -; GFX8-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX8-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v12, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 -; GFX9-NEXT: v_not_b32_e32 v8, v8 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8 -; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v8 +; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, 64, v15 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[8:9] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v15, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10] +; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] ; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v15 -; GFX9-NEXT: v_lshlrev_b64 v[12:13], v15, v[8:9] -; GFX9-NEXT: v_or_b32_e32 v10, v0, v10 -; GFX9-NEXT: v_or_b32_e32 v11, v1, v11 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[13:14], v15, v[9:10] +; GFX9-NEXT: v_or_b32_e32 v11, v0, v11 +; GFX9-NEXT: v_or_b32_e32 v12, v1, v12 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v16, v[9:10] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v13, 0, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v13, 0, v14, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v12, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 -; GFX9-NEXT: 
v_cndmask_b32_e32 v10, v0, v2, vcc +; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v0, v2, vcc ; GFX9-NEXT: v_sub_u32_e32 v2, 64, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v12, v1, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v14, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] ; GFX9-NEXT: v_subrev_u32_e32 v15, 64, v14 @@ -6170,10 +6215,10 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v10, v0 ; GFX9-NEXT: v_or_b32_e32 v1, v13, v1 -; GFX9-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v11, v3 +; GFX9-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX9-NEXT: v_or_b32_e32 v3, v12, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_fshr_i128: @@ -6282,158 +6327,158 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, i128 %amt) { ; GFX6-LABEL: v_fshr_i128_ssv: ; GFX6: ; %bb.0: -; GFX6-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX6-NEXT: v_not_b32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s9, 0 -; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v0 +; GFX6-NEXT: v_not_b32_e32 v1, v0 +; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX6-NEXT: s_lshr_b32 s8, s1, 31 -; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 -; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v7 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[10:11], v0 -; GFX6-NEXT: v_lshl_b64 v[2:3], s[0:1], v7 +; GFX6-NEXT: s_lshr_b32 s0, s1, 31 +; GFX6-NEXT: s_mov_b32 s1, 0 +; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v1 +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7 +; GFX6-NEXT: v_lshr_b64 v[1:2], s[8:9], v1 +; GFX6-NEXT: v_lshl_b64 v[3:4], s[0:1], v7 ; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v7 -; GFX6-NEXT: v_lshl_b64 v[4:5], s[10:11], v7 -; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX6-NEXT: v_lshl_b64 v[5:6], s[8:9], v7 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_lshl_b64 v[0:1], s[10:11], v8 +; GFX6-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[8:9], v8 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_mov_b32_e32 v3, s0 +; GFX6-NEXT: v_mov_b32_e32 v4, s1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v6 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[4:5], v6 +; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v10 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[4:5], v10 ; GFX6-NEXT: v_lshl_b64 v[2:3], s[6:7], v2 -; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v6 +; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v10 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_lshr_b64 v[0:1], 
s[6:7], v11 -; GFX6-NEXT: v_lshr_b64 v[4:5], s[6:7], v6 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX6-NEXT: v_lshr_b64 v[4:5], s[6:7], v10 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX6-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_i128_ssv: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX8-NEXT: v_not_b32_e32 v0, v0 -; GFX8-NEXT: s_mov_b32 s9, 0 -; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v0 +; GFX8-NEXT: v_not_b32_e32 v1, v0 +; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX8-NEXT: s_lshr_b32 s8, s1, 31 -; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v7 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[10:11] -; GFX8-NEXT: v_lshlrev_b64 v[2:3], v7, s[0:1] +; GFX8-NEXT: s_lshr_b32 s0, s1, 31 +; GFX8-NEXT: s_mov_b32 s1, 0 +; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v1 +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7 +; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1] ; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v7 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v7, s[10:11] -; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9] ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, s[10:11] +; GFX8-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v8, s[8:9] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v6 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v6, s[4:5] +; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v10 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v10, s[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[6:7] -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v6 +; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v10 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v6, s[6:7] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, s[6:7] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; 
GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v6, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_i128_ssv: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_and_b32_e32 v6, 0x7f, v0 -; GFX9-NEXT: v_not_b32_e32 v0, v0 -; GFX9-NEXT: s_mov_b32 s9, 0 -; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v0 +; GFX9-NEXT: v_not_b32_e32 v1, v0 +; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX9-NEXT: s_lshr_b32 s8, s1, 31 -; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], 1 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v7 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[10:11] -; GFX9-NEXT: v_lshlrev_b64 v[2:3], v7, s[0:1] +; GFX9-NEXT: s_lshr_b32 s0, s1, 31 +; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: v_and_b32_e32 v7, 0x7f, v1 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7 +; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1] ; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v7 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v7, s[10:11] -; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9] ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, s[10:11] +; GFX9-NEXT: v_or_b32_e32 v4, v2, v4 +; GFX9-NEXT: v_lshlrev_b64 v[1:2], v8, s[8:9] ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 64, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v1, v3, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v6, s[4:5] +; GFX9-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 64, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v10, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[6:7] -; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v6 +; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v10 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[6:7] -; GFX9-NEXT: v_lshrrev_b64 v[4:5], v6, s[6:7] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], v10, s[6:7] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 +; GFX9-NEXT: 
v_cmp_eq_u32_e64 s[0:1], 0, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc ; GFX9-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX9-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v6, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX9-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_i128_ssv: @@ -6543,40 +6588,41 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshr_i128_svs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX6-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[0:1], 1 ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX6-NEXT: s_lshr_b32 s0, s1, 31 ; GFX6-NEXT: s_mov_b32 s1, 0 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX6-NEXT: s_sub_i32 s7, s4, 64 -; GFX6-NEXT: s_sub_i32 s5, 64, s4 -; GFX6-NEXT: s_cmp_lt_u32 s4, 64 -; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s4, 0 +; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s4 +; GFX6-NEXT: s_not_b32 s5, s4 +; GFX6-NEXT: s_sub_i32 s12, s2, 64 +; GFX6-NEXT: s_sub_i32 s8, 64, s2 +; GFX6-NEXT: s_cmp_lt_u32 s2, 64 ; GFX6-NEXT: s_cselect_b32 s13, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[8:9], s4 -; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s5 -; GFX6-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX6-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] -; GFX6-NEXT: s_lshl_b64 s[8:9], s[8:9], s7 -; GFX6-NEXT: s_cmp_lg_u32 s12, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX6-NEXT: s_cmp_eq_u32 s2, 0 +; GFX6-NEXT: s_cselect_b32 s14, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[6:7], s8 +; GFX6-NEXT: s_lshl_b64 s[10:11], s[0:1], s5 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[6:7], s5 +; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], s12 ; GFX6-NEXT: s_cmp_lg_u32 s13, 0 -; GFX6-NEXT: s_cselect_b64 s[4:5], s[0:1], s[4:5] -; GFX6-NEXT: s_sub_i32 s0, s6, 64 -; GFX6-NEXT: s_sub_i32 s1, 64, s6 -; GFX6-NEXT: s_cmp_lt_u32 s6, 64 -; GFX6-NEXT: s_cselect_b32 s7, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s6, 0 -; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s6 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s1 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX6-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] +; GFX6-NEXT: s_cmp_lg_u32 s14, 0 +; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] +; GFX6-NEXT: s_and_b32 s0, s4, 0x7f +; GFX6-NEXT: s_sub_i32 s1, s0, 64 +; GFX6-NEXT: s_sub_i32 s4, 64, s0 +; GFX6-NEXT: s_cmp_lt_u32 s0, 64 +; GFX6-NEXT: s_cselect_b32 s5, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s0, 0 +; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], s0 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4 ; GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s6 -; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s0 -; GFX6-NEXT: s_and_b32 s0, 1, s7 +; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], s0 +; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], s1 +; GFX6-NEXT: s_and_b32 s0, 1, s5 ; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6590,46 +6636,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc ; GFX6-NEXT: v_or_b32_e32 v0, s2, v0 
; GFX6-NEXT: v_or_b32_e32 v1, s3, v1 -; GFX6-NEXT: v_or_b32_e32 v2, s4, v2 -; GFX6-NEXT: v_or_b32_e32 v3, s5, v3 +; GFX6-NEXT: v_or_b32_e32 v2, s6, v2 +; GFX6-NEXT: v_or_b32_e32 v3, s7, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_i128_svs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX8-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[0:1], 1 ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX8-NEXT: s_lshr_b32 s0, s1, 31 ; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX8-NEXT: s_sub_i32 s7, s4, 64 -; GFX8-NEXT: s_sub_i32 s5, 64, s4 -; GFX8-NEXT: s_cmp_lt_u32 s4, 64 -; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s4 +; GFX8-NEXT: s_not_b32 s5, s4 +; GFX8-NEXT: s_sub_i32 s12, s2, 64 +; GFX8-NEXT: s_sub_i32 s8, 64, s2 +; GFX8-NEXT: s_cmp_lt_u32 s2, 64 ; GFX8-NEXT: s_cselect_b32 s13, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[8:9], s4 -; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s5 -; GFX8-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX8-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] -; GFX8-NEXT: s_lshl_b64 s[8:9], s[8:9], s7 -; GFX8-NEXT: s_cmp_lg_u32 s12, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: s_cselect_b32 s14, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[8:9], s[6:7], s8 +; GFX8-NEXT: s_lshl_b64 s[10:11], s[0:1], s5 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s5 +; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], s12 ; GFX8-NEXT: s_cmp_lg_u32 s13, 0 -; GFX8-NEXT: s_cselect_b64 s[4:5], s[0:1], s[4:5] -; GFX8-NEXT: s_sub_i32 s0, s6, 64 -; GFX8-NEXT: s_sub_i32 s1, 64, s6 -; GFX8-NEXT: s_cmp_lt_u32 s6, 64 -; GFX8-NEXT: s_cselect_b32 s7, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 -; GFX8-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX8-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] +; GFX8-NEXT: s_cmp_lg_u32 s14, 0 +; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] +; GFX8-NEXT: s_and_b32 s0, s4, 0x7f +; GFX8-NEXT: s_sub_i32 s1, s0, 64 +; GFX8-NEXT: s_sub_i32 s4, 64, s0 +; GFX8-NEXT: s_cmp_lt_u32 s0, 64 +; GFX8-NEXT: s_cselect_b32 s5, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s0, 0 +; GFX8-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] ; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[8:9], s6, v[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] -; GFX8-NEXT: s_and_b32 s0, 1, s7 +; GFX8-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3] +; GFX8-NEXT: s_and_b32 s0, 1, s5 ; GFX8-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6643,46 +6690,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc ; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX8-NEXT: v_or_b32_e32 v1, s3, v1 -; GFX8-NEXT: v_or_b32_e32 v2, s4, v2 -; GFX8-NEXT: v_or_b32_e32 v3, s5, v3 +; GFX8-NEXT: v_or_b32_e32 v2, s6, v2 +; GFX8-NEXT: v_or_b32_e32 v3, s7, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_i128_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX9-NEXT: s_lshl_b64 s[8:9], s[0:1], 1 +; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], 1 ; 
GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 ; GFX9-NEXT: s_lshr_b32 s0, s1, 31 ; GFX9-NEXT: s_mov_b32 s1, 0 ; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX9-NEXT: s_sub_i32 s7, s4, 64 -; GFX9-NEXT: s_sub_i32 s5, 64, s4 -; GFX9-NEXT: s_cmp_lt_u32 s4, 64 -; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s4 +; GFX9-NEXT: s_not_b32 s5, s4 +; GFX9-NEXT: s_sub_i32 s12, s2, 64 +; GFX9-NEXT: s_sub_i32 s8, 64, s2 +; GFX9-NEXT: s_cmp_lt_u32 s2, 64 ; GFX9-NEXT: s_cselect_b32 s13, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[8:9], s4 -; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s5 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX9-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] -; GFX9-NEXT: s_lshl_b64 s[8:9], s[8:9], s7 -; GFX9-NEXT: s_cmp_lg_u32 s12, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_cselect_b32 s14, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[6:7], s8 +; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s5 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[6:7], s5 +; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], s12 ; GFX9-NEXT: s_cmp_lg_u32 s13, 0 -; GFX9-NEXT: s_cselect_b64 s[4:5], s[0:1], s[4:5] -; GFX9-NEXT: s_sub_i32 s0, s6, 64 -; GFX9-NEXT: s_sub_i32 s1, 64, s6 -; GFX9-NEXT: s_cmp_lt_u32 s6, 64 -; GFX9-NEXT: s_cselect_b32 s7, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 -; GFX9-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] -; GFX9-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX9-NEXT: s_cselect_b64 s[6:7], s[8:9], s[6:7] +; GFX9-NEXT: s_cmp_lg_u32 s14, 0 +; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7] +; GFX9-NEXT: s_and_b32 s0, s4, 0x7f +; GFX9-NEXT: s_sub_i32 s1, s0, 64 +; GFX9-NEXT: s_sub_i32 s4, 64, s0 +; GFX9-NEXT: s_cmp_lt_u32 s0, 64 +; GFX9-NEXT: s_cselect_b32 s5, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 +; GFX9-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] ; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: v_lshrrev_b64 v[8:9], s6, v[2:3] -; GFX9-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] -; GFX9-NEXT: s_and_b32 s0, 1, s7 +; GFX9-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] +; GFX9-NEXT: v_lshrrev_b64 v[2:3], s1, v[2:3] +; GFX9-NEXT: s_and_b32 s0, 1, s5 ; GFX9-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v5, v5, v7 ; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 @@ -6696,50 +6744,51 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc ; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX9-NEXT: v_or_b32_e32 v1, s3, v1 -; GFX9-NEXT: v_or_b32_e32 v2, s4, v2 -; GFX9-NEXT: v_or_b32_e32 v3, s5, v3 +; GFX9-NEXT: v_or_b32_e32 v2, s6, v2 +; GFX9-NEXT: v_or_b32_e32 v3, s7, v3 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_i128_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_lshr_b32 s8, s1, 31 -; GFX10-NEXT: s_mov_b32 s9, 0 +; GFX10-NEXT: s_lshr_b32 s6, s1, 31 +; GFX10-NEXT: s_mov_b32 s7, 0 +; GFX10-NEXT: s_andn2_b32 s5, 0x7f, s4 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10-NEXT: s_sub_i32 s7, s4, 64 -; GFX10-NEXT: s_sub_i32 s5, 64, s4 -; GFX10-NEXT: s_cmp_lt_u32 s4, 64 -; GFX10-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] -; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; 
GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; GFX10-NEXT: s_not_b32 s10, s4 +; GFX10-NEXT: s_sub_i32 s12, s5, 64 +; GFX10-NEXT: s_sub_i32 s6, 64, s5 +; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: s_cselect_b32 s13, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[0:1], s5 -; GFX10-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 -; GFX10-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s7 -; GFX10-NEXT: s_cmp_lg_u32 s12, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s10 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX10-NEXT: s_cmp_lg_u32 s13, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s5, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s0, 64, s6 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] -; GFX10-NEXT: s_sub_i32 s0, s6, 64 -; GFX10-NEXT: s_cmp_lt_u32 s6, 64 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] -; GFX10-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s6, 0 +; GFX10-NEXT: s_and_b32 s0, s4, 0x7f +; GFX10-NEXT: s_sub_i32 s1, 64, s0 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX10-NEXT: s_sub_i32 s1, s0, 64 +; GFX10-NEXT: s_cmp_lt_u32 s0, 64 +; GFX10-NEXT: v_lshrrev_b64 v[8:9], s1, v[2:3] +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX10-NEXT: s_cselect_b32 s7, 1, 0 -; GFX10-NEXT: s_and_b32 s0, 1, s1 +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: s_and_b32 s1, 1, s4 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10-NEXT: s_and_b32 s0, 1, s7 -; GFX10-NEXT: v_lshrrev_b64 v[2:3], s6, v[2:3] +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX10-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] +; GFX10-NEXT: s_and_b32 s0, 1, s5 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc_lo @@ -6749,64 +6798,65 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i ; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 -; GFX10-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_or_b32_e32 v1, s5, v1 +; GFX10-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX10-NEXT: v_or_b32_e32 v1, s9, v1 ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: v_fshr_i128_svs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5] ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_lshr_b32 s8, s1, 31 -; GFX11-NEXT: s_mov_b32 s9, 0 +; GFX11-NEXT: s_lshr_b32 s6, s1, 31 +; GFX11-NEXT: s_mov_b32 s7, 0 +; GFX11-NEXT: s_and_not1_b32 s5, 0x7f, s4 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX11-NEXT: s_sub_i32 s7, s4, 64 -; GFX11-NEXT: s_sub_i32 s5, 64, s4 -; GFX11-NEXT: s_cmp_lt_u32 s4, 64 -; GFX11-NEXT: v_lshrrev_b64 v[4:5], s6, v[0:1] -; GFX11-NEXT: s_cselect_b32 s12, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; GFX11-NEXT: 
s_not_b32 s10, s4 +; GFX11-NEXT: s_sub_i32 s12, s5, 64 +; GFX11-NEXT: s_sub_i32 s6, 64, s5 +; GFX11-NEXT: s_cmp_lt_u32 s5, 64 ; GFX11-NEXT: s_cselect_b32 s13, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[0:1], s5 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[2:3], s4 -; GFX11-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 -; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] -; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s7 -; GFX11-NEXT: s_cmp_lg_u32 s12, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s10 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s12 ; GFX11-NEXT: s_cmp_lg_u32 s13, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[10:11], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[6:7], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s5, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX11-NEXT: s_sub_i32 s0, 64, s6 +; GFX11-NEXT: s_and_b32 s0, s4, 0x7f ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_lshlrev_b64 v[6:7], s0, v[2:3] -; GFX11-NEXT: s_sub_i32 s0, s6, 64 -; GFX11-NEXT: s_cmp_lt_u32 s6, 64 -; GFX11-NEXT: v_lshrrev_b64 v[8:9], s0, v[2:3] -; GFX11-NEXT: s_cselect_b32 s1, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s6, 0 +; GFX11-NEXT: s_sub_i32 s1, 64, s0 +; GFX11-NEXT: v_lshrrev_b64 v[4:5], s0, v[0:1] +; GFX11-NEXT: v_lshlrev_b64 v[6:7], s1, v[2:3] +; GFX11-NEXT: s_sub_i32 s1, s0, 64 +; GFX11-NEXT: s_cmp_lt_u32 s0, 64 +; GFX11-NEXT: v_lshrrev_b64 v[8:9], s1, v[2:3] +; GFX11-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 -; GFX11-NEXT: s_cselect_b32 s7, 1, 0 -; GFX11-NEXT: s_and_b32 s0, 1, s1 +; GFX11-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-NEXT: s_and_b32 s1, 1, s4 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX11-NEXT: s_and_b32 s0, 1, s7 -; GFX11-NEXT: v_lshrrev_b64 v[2:3], s6, v[2:3] +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX11-NEXT: v_lshrrev_b64 v[2:3], s0, v[2:3] +; GFX11-NEXT: s_and_b32 s0, 1, s5 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 ; GFX11-NEXT: v_dual_cndmask_b32 v4, v8, v4 :: v_dual_cndmask_b32 v5, v9, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_cndmask_b32 v2, 0, v2 :: v_dual_cndmask_b32 v3, 0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 -; GFX11-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_or_b32_e32 v1, s5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_or_b32_e32 v0, s8, v0 +; GFX11-NEXT: v_or_b32_e32 v1, s9, v1 ; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.fshr.i128(i128 %lhs, i128 %rhs, i128 %amt) %cast.result = bitcast i128 %result to <4 x float> 
@@ -6816,51 +6866,51 @@ define amdgpu_ps <4 x float> @v_fshr_i128_svs(i128 inreg %lhs, i128 %rhs, i128 i define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 inreg %amt) { ; GFX6-LABEL: v_fshr_i128_vss: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX6-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: s_sub_i32 s5, s4, 64 -; GFX6-NEXT: s_sub_i32 s7, 64, s4 ; GFX6-NEXT: v_lshl_b64 v[4:5], v[0:1], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 -; GFX6-NEXT: s_cmp_lt_u32 s4, 64 +; GFX6-NEXT: s_andn2_b32 s5, 0x7f, s4 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX6-NEXT: s_sub_i32 s6, s5, 64 +; GFX6-NEXT: s_sub_i32 s7, 64, s5 +; GFX6-NEXT: s_cmp_lt_u32 s5, 64 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], s7 +; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s5 ; GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s4, 0 +; GFX6-NEXT: s_cmp_eq_u32 s5, 0 ; GFX6-NEXT: s_cselect_b32 s9, 1, 0 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], s7 -; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], s4 -; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], s4 -; GFX6-NEXT: s_and_b32 s4, 1, s8 -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX6-NEXT: s_and_b32 s4, 1, s9 -; GFX6-NEXT: s_sub_i32 s10, s6, 64 -; GFX6-NEXT: s_sub_i32 s8, 64, s6 +; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], s5 ; GFX6-NEXT: v_or_b32_e32 v6, v0, v6 ; GFX6-NEXT: v_or_b32_e32 v7, v1, v7 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[4:5], s5 -; GFX6-NEXT: s_cmp_lt_u32 s6, 64 -; GFX6-NEXT: s_cselect_b32 s11, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s6, 0 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[4:5], s6 +; GFX6-NEXT: s_and_b32 s5, 1, s8 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX6-NEXT: s_and_b32 s5, 1, s9 ; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX6-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX6-NEXT: s_and_b32 s5, s4, 0x7f +; GFX6-NEXT: s_sub_i32 s10, s5, 64 +; GFX6-NEXT: s_sub_i32 s8, 64, s5 +; GFX6-NEXT: s_cmp_lt_u32 s5, 64 +; GFX6-NEXT: s_cselect_b32 s11, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s5, 0 ; GFX6-NEXT: s_cselect_b32 s12, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 -; GFX6-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 +; GFX6-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 ; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX6-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX6-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX6-NEXT: s_cmp_lg_u32 s11, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s12, 0 ; GFX6-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX6-NEXT: s_cmp_lg_u32 s11, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX6-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v4 ; GFX6-NEXT: v_or_b32_e32 v1, s1, v5 ; GFX6-NEXT: v_or_b32_e32 v2, s2, v2 @@ -6869,51 +6919,51 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX8-LABEL: v_fshr_i128_vss: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX8-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: s_sub_i32 s5, s4, 64 -; GFX8-NEXT: s_sub_i32 s7, 64, s4 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1] ; GFX8-NEXT: 
v_lshrrev_b32_e32 v0, 31, v1 -; GFX8-NEXT: s_cmp_lt_u32 s4, 64 +; GFX8-NEXT: s_andn2_b32 s5, 0x7f, s4 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX8-NEXT: s_sub_i32 s6, s5, 64 +; GFX8-NEXT: s_sub_i32 s7, 64, s5 +; GFX8-NEXT: s_cmp_lt_u32 s5, 64 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5] +; GFX8-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX8-NEXT: s_cselect_b32 s8, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_cmp_eq_u32 s5, 0 ; GFX8-NEXT: s_cselect_b32 s9, 1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5] -; GFX8-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] -; GFX8-NEXT: v_lshlrev_b64 v[8:9], s4, v[4:5] -; GFX8-NEXT: s_and_b32 s4, 1, s8 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX8-NEXT: s_and_b32 s4, 1, s9 -; GFX8-NEXT: s_sub_i32 s10, s6, 64 -; GFX8-NEXT: s_sub_i32 s8, 64, s6 +; GFX8-NEXT: v_lshlrev_b64 v[8:9], s5, v[4:5] ; GFX8-NEXT: v_or_b32_e32 v6, v0, v6 ; GFX8-NEXT: v_or_b32_e32 v7, v1, v7 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], s5, v[4:5] -; GFX8-NEXT: s_cmp_lt_u32 s6, 64 -; GFX8-NEXT: s_cselect_b32 s11, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s6, 0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], s6, v[4:5] +; GFX8-NEXT: s_and_b32 s5, 1, s8 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX8-NEXT: s_and_b32 s5, 1, s9 ; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX8-NEXT: s_and_b32 s5, s4, 0x7f +; GFX8-NEXT: s_sub_i32 s10, s5, 64 +; GFX8-NEXT: s_sub_i32 s8, 64, s5 +; GFX8-NEXT: s_cmp_lt_u32 s5, 64 +; GFX8-NEXT: s_cselect_b32 s11, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s5, 0 ; GFX8-NEXT: s_cselect_b32 s12, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 -; GFX8-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX8-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 +; GFX8-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 ; GFX8-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX8-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s12, 0 ; GFX8-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX8-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v4 ; GFX8-NEXT: v_or_b32_e32 v1, s1, v5 ; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 @@ -6922,51 +6972,51 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; ; GFX9-LABEL: v_fshr_i128_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX9-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: s_sub_i32 s5, s4, 64 -; GFX9-NEXT: s_sub_i32 s7, 64, s4 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 -; GFX9-NEXT: s_cmp_lt_u32 s4, 64 +; GFX9-NEXT: s_andn2_b32 s5, 0x7f, s4 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX9-NEXT: s_sub_i32 s6, s5, 64 +; GFX9-NEXT: s_sub_i32 s7, 64, s5 +; GFX9-NEXT: s_cmp_lt_u32 s5, 64 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5] +; GFX9-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX9-NEXT: s_cselect_b32 s8, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cmp_eq_u32 s5, 0 ; GFX9-NEXT: s_cselect_b32 s9, 1, 0 -; 
GFX9-NEXT: v_lshrrev_b64 v[0:1], s7, v[4:5] -; GFX9-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] -; GFX9-NEXT: v_lshlrev_b64 v[8:9], s4, v[4:5] -; GFX9-NEXT: s_and_b32 s4, 1, s8 -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 -; GFX9-NEXT: s_and_b32 s4, 1, s9 -; GFX9-NEXT: s_sub_i32 s10, s6, 64 -; GFX9-NEXT: s_sub_i32 s8, 64, s6 +; GFX9-NEXT: v_lshlrev_b64 v[8:9], s5, v[4:5] ; GFX9-NEXT: v_or_b32_e32 v6, v0, v6 ; GFX9-NEXT: v_or_b32_e32 v7, v1, v7 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], s5, v[4:5] -; GFX9-NEXT: s_cmp_lt_u32 s6, 64 -; GFX9-NEXT: s_cselect_b32 s11, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s6, 0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], s6, v[4:5] +; GFX9-NEXT: s_and_b32 s5, 1, s8 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX9-NEXT: s_and_b32 s5, 1, s9 ; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4 +; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s5 +; GFX9-NEXT: s_and_b32 s5, s4, 0x7f +; GFX9-NEXT: s_sub_i32 s10, s5, 64 +; GFX9-NEXT: s_sub_i32 s8, 64, s5 +; GFX9-NEXT: s_cmp_lt_u32 s5, 64 +; GFX9-NEXT: s_cselect_b32 s11, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s5, 0 ; GFX9-NEXT: s_cselect_b32 s12, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[2:3], s6 -; GFX9-NEXT: s_lshr_b64 s[6:7], s[0:1], s6 +; GFX9-NEXT: s_lshr_b64 s[6:7], s[2:3], s4 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[0:1], s4 ; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 -; GFX9-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] +; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s12, 0 ; GFX9-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX9-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v4 ; GFX9-NEXT: v_or_b32_e32 v1, s1, v5 ; GFX9-NEXT: v_or_b32_e32 v2, s2, v2 @@ -6978,49 +7028,49 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 31, v1 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX10-NEXT: s_andn2_b64 s[4:5], 0x7f, s[4:5] -; GFX10-NEXT: s_sub_i32 s7, 64, s4 +; GFX10-NEXT: s_andn2_b32 s5, 0x7f, s4 +; GFX10-NEXT: s_sub_i32 s6, s5, 64 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX10-NEXT: s_sub_i32 s5, s4, 64 -; GFX10-NEXT: s_cmp_lt_u32 s4, 64 +; GFX10-NEXT: s_sub_i32 s7, 64, s5 +; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] ; GFX10-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], s4, v[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s5, 0 +; GFX10-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX10-NEXT: s_cselect_b32 s9, 1, 0 -; GFX10-NEXT: s_and_b32 s4, 1, s8 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX10-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX10-NEXT: s_and_b32 s5, 1, s8 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1] +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 +; GFX10-NEXT: s_and_b32 s5, s4, 0x7f ; GFX10-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v7 -; 
GFX10-NEXT: s_and_b32 s4, 1, s9 -; GFX10-NEXT: s_sub_i32 s10, s6, 64 -; GFX10-NEXT: s_sub_i32 s7, 64, s6 -; GFX10-NEXT: s_cmp_lt_u32 s6, 64 +; GFX10-NEXT: s_and_b32 s6, 1, s9 +; GFX10-NEXT: s_sub_i32 s10, s5, 64 +; GFX10-NEXT: s_sub_i32 s8, 64, s5 +; GFX10-NEXT: s_cmp_lt_u32 s5, 64 ; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc_lo ; GFX10-NEXT: s_cselect_b32 s11, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s6, 0 +; GFX10-NEXT: s_cmp_eq_u32 s5, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 ; GFX10-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[0:1], s6 -; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s7 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[2:3], s6 -; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc_lo -; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s12, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX10-NEXT: s_cmp_lg_u32 s11, 0 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX10-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 ; GFX10-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX10-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX10-NEXT: v_or_b32_e32 v3, s3, v3 @@ -7031,47 +7081,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_vss(i128 %lhs, i128 inreg %rhs, i128 i ; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 31, v1 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: s_and_b64 s[6:7], s[4:5], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[4:5], 0x7f, s[4:5] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s7, 64, s4 +; GFX11-NEXT: s_and_not1_b32 s5, 0x7f, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_sub_i32 s6, s5, 64 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX11-NEXT: s_sub_i32 s5, s4, 64 -; GFX11-NEXT: s_cmp_lt_u32 s4, 64 +; GFX11-NEXT: s_sub_i32 s7, 64, s5 +; GFX11-NEXT: s_cmp_lt_u32 s5, 64 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], s7, v[0:1] ; GFX11-NEXT: s_cselect_b32 s8, 1, 0 -; GFX11-NEXT: v_lshlrev_b64 v[6:7], s4, v[2:3] -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 -; GFX11-NEXT: v_lshlrev_b64 v[8:9], s4, v[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s5, 0 +; GFX11-NEXT: v_lshlrev_b64 v[6:7], s5, v[2:3] ; GFX11-NEXT: s_cselect_b32 s9, 1, 0 -; GFX11-NEXT: s_and_b32 s4, 1, s8 -; GFX11-NEXT: v_lshlrev_b64 v[0:1], s5, v[0:1] -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX11-NEXT: v_lshlrev_b64 v[8:9], s5, v[0:1] +; GFX11-NEXT: s_and_b32 s5, 1, s8 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], s6, v[0:1] +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 +; GFX11-NEXT: s_and_b32 s5, s4, 0x7f ; GFX11-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v7 -; GFX11-NEXT: s_and_b32 s4, 1, s9 -; GFX11-NEXT: s_sub_i32 s10, s6, 64 -; GFX11-NEXT: s_sub_i32 s7, 64, s6 -; GFX11-NEXT: s_cmp_lt_u32 s6, 64 +; GFX11-NEXT: s_and_b32 s6, 1, s9 +; GFX11-NEXT: s_sub_i32 s10, s5, 64 +; GFX11-NEXT: s_sub_i32 s8, 64, s5 +; GFX11-NEXT: s_cmp_lt_u32 s5, 64 ; 
GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v8 :: v_dual_cndmask_b32 v7, 0, v9 ; GFX11-NEXT: s_cselect_b32 s11, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s6, 0 +; GFX11-NEXT: s_cmp_eq_u32 s5, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 -; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 ; GFX11-NEXT: s_cselect_b32 s12, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[0:1], s6 -; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s7 -; GFX11-NEXT: s_lshr_b64 s[6:7], s[2:3], s6 -; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX11-NEXT: s_lshr_b64 s[6:7], s[0:1], s4 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[2:3], s4 +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s10 ; GFX11-NEXT: s_cmp_lg_u32 s11, 0 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v0, v2 :: v_dual_cndmask_b32 v3, v1, v3 -; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3] +; GFX11-NEXT: s_cselect_b64 s[2:3], s[6:7], s[2:3] ; GFX11-NEXT: s_cmp_lg_u32 s12, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; GFX11-NEXT: s_cmp_lg_u32 s11, 0 ; GFX11-NEXT: v_or_b32_e32 v0, s0, v6 -; GFX11-NEXT: s_cselect_b64 s[2:3], s[6:7], 0 +; GFX11-NEXT: s_cselect_b64 s[2:3], s[4:5], 0 ; GFX11-NEXT: v_or_b32_e32 v1, s1, v7 ; GFX11-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX11-NEXT: v_or_b32_e32 v3, s3, v3 @@ -7209,435 +7259,447 @@ define i128 @v_fshr_i128_65(i128 %lhs, i128 %rhs) { define amdgpu_ps <2 x i128> @s_fshr_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs, <2 x i128> inreg %amt) { ; GFX6-LABEL: s_fshr_v2i128: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX6-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] ; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX6-NEXT: s_lshr_b32 s24, s1, 31 -; GFX6-NEXT: s_mov_b32 s25, 0 -; GFX6-NEXT: s_lshl_b64 s[22:23], s[0:1], 1 -; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[24:25] -; GFX6-NEXT: s_sub_i32 s19, s16, 64 -; GFX6-NEXT: s_sub_i32 s17, 64, s16 -; GFX6-NEXT: s_cmp_lt_u32 s16, 64 -; GFX6-NEXT: s_cselect_b32 s24, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s16, 0 +; GFX6-NEXT: s_lshr_b32 s22, s1, 31 +; GFX6-NEXT: s_mov_b32 s23, 0 +; GFX6-NEXT: s_lshl_b64 s[18:19], s[0:1], 1 +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23] +; GFX6-NEXT: s_andn2_b32 s2, 0x7f, s16 +; GFX6-NEXT: s_not_b32 s17, s16 +; GFX6-NEXT: s_sub_i32 s21, s2, 64 +; GFX6-NEXT: s_sub_i32 s22, 64, s2 +; GFX6-NEXT: s_cmp_lt_u32 s2, 64 ; GFX6-NEXT: s_cselect_b32 s28, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[2:3], s[22:23], s16 -; GFX6-NEXT: s_lshr_b64 s[26:27], s[22:23], s17 -; GFX6-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX6-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] -; GFX6-NEXT: s_lshl_b64 s[22:23], s[22:23], s19 -; GFX6-NEXT: s_cmp_lg_u32 s24, 0 -; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX6-NEXT: s_cselect_b64 s[16:17], s[16:17], s[22:23] +; GFX6-NEXT: s_cmp_eq_u32 s2, 0 +; GFX6-NEXT: s_cselect_b32 s29, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[24:25], s[18:19], s22 +; GFX6-NEXT: s_lshl_b64 s[26:27], s[0:1], s17 +; GFX6-NEXT: s_lshl_b64 s[2:3], s[18:19], s17 +; GFX6-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX6-NEXT: s_lshl_b64 s[18:19], s[18:19], s21 ; GFX6-NEXT: s_cmp_lg_u32 s28, 0 -; GFX6-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] -; GFX6-NEXT: s_sub_i32 s24, s18, 64 -; GFX6-NEXT: s_sub_i32 s22, 64, s18 -; GFX6-NEXT: s_cmp_lt_u32 s18, 64 +; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX6-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19] +; GFX6-NEXT: s_cmp_lg_u32 s29, 0 +; GFX6-NEXT: s_cselect_b64 s[18:19], s[0:1], 
s[18:19] +; GFX6-NEXT: s_and_b32 s0, s16, 0x7f +; GFX6-NEXT: s_sub_i32 s21, s0, 64 +; GFX6-NEXT: s_sub_i32 s22, 64, s0 +; GFX6-NEXT: s_cmp_lt_u32 s0, 64 ; GFX6-NEXT: s_cselect_b32 s26, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s18, 0 +; GFX6-NEXT: s_cmp_eq_u32 s0, 0 ; GFX6-NEXT: s_cselect_b32 s27, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], s18 -; GFX6-NEXT: s_lshr_b64 s[18:19], s[8:9], s18 -; GFX6-NEXT: s_lshl_b64 s[22:23], s[10:11], s22 -; GFX6-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] -; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s24 +; GFX6-NEXT: s_lshr_b64 s[0:1], s[10:11], s16 +; GFX6-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX6-NEXT: s_lshl_b64 s[24:25], s[10:11], s22 +; GFX6-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX6-NEXT: s_lshr_b64 s[10:11], s[10:11], s21 ; GFX6-NEXT: s_cmp_lg_u32 s26, 0 -; GFX6-NEXT: s_cselect_b64 s[10:11], s[18:19], s[10:11] +; GFX6-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s27, 0 ; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s26, 0 ; GFX6-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 -; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX6-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] -; GFX6-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX6-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] ; GFX6-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX6-NEXT: s_lshr_b32 s24, s5, 31 -; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 -; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[24:25] -; GFX6-NEXT: s_sub_i32 s9, s10, 64 -; GFX6-NEXT: s_sub_i32 s11, 64, s10 -; GFX6-NEXT: s_cmp_lt_u32 s10, 64 -; GFX6-NEXT: s_cselect_b32 s20, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s10, 0 +; GFX6-NEXT: s_lshr_b32 s22, s5, 31 +; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], 1 +; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23] +; GFX6-NEXT: s_andn2_b32 s6, 0x7f, s20 +; GFX6-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11] +; GFX6-NEXT: s_not_b32 s16, s20 +; GFX6-NEXT: s_sub_i32 s18, s6, 64 +; GFX6-NEXT: s_sub_i32 s10, 64, s6 +; GFX6-NEXT: s_cmp_lt_u32 s6, 64 +; GFX6-NEXT: s_cselect_b32 s19, 1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s6, 0 ; GFX6-NEXT: s_cselect_b32 s21, 1, 0 -; GFX6-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 -; GFX6-NEXT: s_lshr_b64 s[18:19], s[16:17], s11 -; GFX6-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX6-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11] -; GFX6-NEXT: s_lshl_b64 s[16:17], s[16:17], s9 -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_lshl_b64 s[6:7], s[8:9], s16 +; GFX6-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 +; GFX6-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX6-NEXT: s_lshl_b64 s[8:9], s[8:9], s18 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 ; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX6-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17] +; GFX6-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX6-NEXT: s_cmp_lg_u32 s21, 0 -; GFX6-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11] -; GFX6-NEXT: s_sub_i32 s18, s8, 64 -; GFX6-NEXT: s_sub_i32 s16, 64, s8 -; GFX6-NEXT: s_cmp_lt_u32 s8, 64 +; GFX6-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9] +; GFX6-NEXT: s_and_b32 s4, s20, 0x7f +; GFX6-NEXT: s_sub_i32 s18, s4, 64 +; GFX6-NEXT: s_sub_i32 s16, 64, s4 +; GFX6-NEXT: s_cmp_lt_u32 s4, 64 ; GFX6-NEXT: s_cselect_b32 s19, 1, 0 -; GFX6-NEXT: s_cmp_eq_u32 s8, 0 -; GFX6-NEXT: s_cselect_b32 s20, 1, 0 -; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 -; GFX6-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX6-NEXT: s_cmp_eq_u32 s4, 0 +; GFX6-NEXT: s_cselect_b32 s21, 1, 0 +; GFX6-NEXT: s_lshr_b64 s[10:11], 
s[12:13], s20 ; GFX6-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 -; GFX6-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] +; GFX6-NEXT: s_lshr_b64 s[4:5], s[14:15], s20 +; GFX6-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX6-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15] -; GFX6-NEXT: s_cmp_lg_u32 s20, 0 -; GFX6-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] +; GFX6-NEXT: s_cselect_b64 s[10:11], s[10:11], s[14:15] +; GFX6-NEXT: s_cmp_lg_u32 s21, 0 +; GFX6-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 ; GFX6-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 -; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] -; GFX6-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13] +; GFX6-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11] +; GFX6-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13] ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_v2i128: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX8-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] ; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX8-NEXT: s_lshr_b32 s24, s1, 31 -; GFX8-NEXT: s_mov_b32 s25, 0 -; GFX8-NEXT: s_lshl_b64 s[22:23], s[0:1], 1 -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[24:25] -; GFX8-NEXT: s_sub_i32 s19, s16, 64 -; GFX8-NEXT: s_sub_i32 s17, 64, s16 -; GFX8-NEXT: s_cmp_lt_u32 s16, 64 -; GFX8-NEXT: s_cselect_b32 s24, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s16, 0 +; GFX8-NEXT: s_lshr_b32 s22, s1, 31 +; GFX8-NEXT: s_mov_b32 s23, 0 +; GFX8-NEXT: s_lshl_b64 s[18:19], s[0:1], 1 +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23] +; GFX8-NEXT: s_andn2_b32 s2, 0x7f, s16 +; GFX8-NEXT: s_not_b32 s17, s16 +; GFX8-NEXT: s_sub_i32 s21, s2, 64 +; GFX8-NEXT: s_sub_i32 s22, 64, s2 +; GFX8-NEXT: s_cmp_lt_u32 s2, 64 ; GFX8-NEXT: s_cselect_b32 s28, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[2:3], s[22:23], s16 -; GFX8-NEXT: s_lshr_b64 s[26:27], s[22:23], s17 -; GFX8-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX8-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] -; GFX8-NEXT: s_lshl_b64 s[22:23], s[22:23], s19 -; GFX8-NEXT: s_cmp_lg_u32 s24, 0 -; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX8-NEXT: s_cselect_b64 s[16:17], s[16:17], s[22:23] +; GFX8-NEXT: s_cmp_eq_u32 s2, 0 +; GFX8-NEXT: s_cselect_b32 s29, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[24:25], s[18:19], s22 +; GFX8-NEXT: s_lshl_b64 s[26:27], s[0:1], s17 +; GFX8-NEXT: s_lshl_b64 s[2:3], s[18:19], s17 +; GFX8-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX8-NEXT: s_lshl_b64 s[18:19], s[18:19], s21 ; GFX8-NEXT: s_cmp_lg_u32 s28, 0 -; GFX8-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] -; GFX8-NEXT: s_sub_i32 s24, s18, 64 -; GFX8-NEXT: s_sub_i32 s22, 64, s18 -; GFX8-NEXT: s_cmp_lt_u32 s18, 64 +; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX8-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19] +; GFX8-NEXT: s_cmp_lg_u32 s29, 0 +; GFX8-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19] +; GFX8-NEXT: s_and_b32 s0, s16, 0x7f +; GFX8-NEXT: s_sub_i32 s21, s0, 64 +; GFX8-NEXT: s_sub_i32 s22, 64, s0 +; GFX8-NEXT: s_cmp_lt_u32 s0, 64 ; GFX8-NEXT: s_cselect_b32 s26, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s18, 0 +; GFX8-NEXT: s_cmp_eq_u32 s0, 0 ; GFX8-NEXT: s_cselect_b32 s27, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], s18 -; GFX8-NEXT: s_lshr_b64 s[18:19], s[8:9], s18 -; GFX8-NEXT: s_lshl_b64 s[22:23], s[10:11], s22 -; GFX8-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] -; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s24 +; GFX8-NEXT: s_lshr_b64 s[0:1], s[10:11], s16 +; GFX8-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX8-NEXT: s_lshl_b64 s[24:25], 
s[10:11], s22 +; GFX8-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s21 ; GFX8-NEXT: s_cmp_lg_u32 s26, 0 -; GFX8-NEXT: s_cselect_b64 s[10:11], s[18:19], s[10:11] +; GFX8-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s27, 0 ; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s26, 0 ; GFX8-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX8-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] -; GFX8-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX8-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] ; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX8-NEXT: s_lshr_b32 s24, s5, 31 -; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 -; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[24:25] -; GFX8-NEXT: s_sub_i32 s9, s10, 64 -; GFX8-NEXT: s_sub_i32 s11, 64, s10 -; GFX8-NEXT: s_cmp_lt_u32 s10, 64 -; GFX8-NEXT: s_cselect_b32 s20, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s10, 0 +; GFX8-NEXT: s_lshr_b32 s22, s5, 31 +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX8-NEXT: s_lshl_b64 s[8:9], s[4:5], 1 +; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23] +; GFX8-NEXT: s_andn2_b32 s6, 0x7f, s20 +; GFX8-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11] +; GFX8-NEXT: s_not_b32 s16, s20 +; GFX8-NEXT: s_sub_i32 s18, s6, 64 +; GFX8-NEXT: s_sub_i32 s10, 64, s6 +; GFX8-NEXT: s_cmp_lt_u32 s6, 64 +; GFX8-NEXT: s_cselect_b32 s19, 1, 0 +; GFX8-NEXT: s_cmp_eq_u32 s6, 0 ; GFX8-NEXT: s_cselect_b32 s21, 1, 0 -; GFX8-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 -; GFX8-NEXT: s_lshr_b64 s[18:19], s[16:17], s11 -; GFX8-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX8-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11] -; GFX8-NEXT: s_lshl_b64 s[16:17], s[16:17], s9 -; GFX8-NEXT: s_cmp_lg_u32 s20, 0 +; GFX8-NEXT: s_lshl_b64 s[6:7], s[8:9], s16 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 +; GFX8-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX8-NEXT: s_lshl_b64 s[8:9], s[8:9], s18 +; GFX8-NEXT: s_cmp_lg_u32 s19, 0 ; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX8-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17] +; GFX8-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX8-NEXT: s_cmp_lg_u32 s21, 0 -; GFX8-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11] -; GFX8-NEXT: s_sub_i32 s18, s8, 64 -; GFX8-NEXT: s_sub_i32 s16, 64, s8 -; GFX8-NEXT: s_cmp_lt_u32 s8, 64 +; GFX8-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9] +; GFX8-NEXT: s_and_b32 s4, s20, 0x7f +; GFX8-NEXT: s_sub_i32 s18, s4, 64 +; GFX8-NEXT: s_sub_i32 s16, 64, s4 +; GFX8-NEXT: s_cmp_lt_u32 s4, 64 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 -; GFX8-NEXT: s_cmp_eq_u32 s8, 0 -; GFX8-NEXT: s_cselect_b32 s20, 1, 0 -; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 -; GFX8-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; GFX8-NEXT: s_cselect_b32 s21, 1, 0 +; GFX8-NEXT: s_lshr_b64 s[10:11], s[12:13], s20 ; GFX8-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 -; GFX8-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] +; GFX8-NEXT: s_lshr_b64 s[4:5], s[14:15], s20 +; GFX8-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15] -; GFX8-NEXT: s_cmp_lg_u32 s20, 0 -; GFX8-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] +; GFX8-NEXT: s_cselect_b64 s[10:11], s[10:11], s[14:15] +; GFX8-NEXT: s_cmp_lg_u32 s21, 0 +; GFX8-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 ; GFX8-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 -; GFX8-NEXT: 
s_or_b64 s[4:5], s[6:7], s[8:9] -; GFX8-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13] +; GFX8-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11] +; GFX8-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13] ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_v2i128: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX9-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX9-NEXT: s_lshr_b32 s24, s1, 31 -; GFX9-NEXT: s_mov_b32 s25, 0 -; GFX9-NEXT: s_lshl_b64 s[22:23], s[0:1], 1 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[24:25] -; GFX9-NEXT: s_sub_i32 s19, s16, 64 -; GFX9-NEXT: s_sub_i32 s17, 64, s16 -; GFX9-NEXT: s_cmp_lt_u32 s16, 64 -; GFX9-NEXT: s_cselect_b32 s24, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s16, 0 +; GFX9-NEXT: s_lshr_b32 s22, s1, 31 +; GFX9-NEXT: s_mov_b32 s23, 0 +; GFX9-NEXT: s_lshl_b64 s[18:19], s[0:1], 1 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[22:23] +; GFX9-NEXT: s_andn2_b32 s2, 0x7f, s16 +; GFX9-NEXT: s_not_b32 s17, s16 +; GFX9-NEXT: s_sub_i32 s21, s2, 64 +; GFX9-NEXT: s_sub_i32 s22, 64, s2 +; GFX9-NEXT: s_cmp_lt_u32 s2, 64 ; GFX9-NEXT: s_cselect_b32 s28, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[2:3], s[22:23], s16 -; GFX9-NEXT: s_lshr_b64 s[26:27], s[22:23], s17 -; GFX9-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX9-NEXT: s_or_b64 s[16:17], s[26:27], s[16:17] -; GFX9-NEXT: s_lshl_b64 s[22:23], s[22:23], s19 -; GFX9-NEXT: s_cmp_lg_u32 s24, 0 -; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 -; GFX9-NEXT: s_cselect_b64 s[16:17], s[16:17], s[22:23] +; GFX9-NEXT: s_cmp_eq_u32 s2, 0 +; GFX9-NEXT: s_cselect_b32 s29, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[24:25], s[18:19], s22 +; GFX9-NEXT: s_lshl_b64 s[26:27], s[0:1], s17 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[18:19], s17 +; GFX9-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] +; GFX9-NEXT: s_lshl_b64 s[18:19], s[18:19], s21 ; GFX9-NEXT: s_cmp_lg_u32 s28, 0 -; GFX9-NEXT: s_cselect_b64 s[16:17], s[0:1], s[16:17] -; GFX9-NEXT: s_sub_i32 s24, s18, 64 -; GFX9-NEXT: s_sub_i32 s22, 64, s18 -; GFX9-NEXT: s_cmp_lt_u32 s18, 64 +; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], 0 +; GFX9-NEXT: s_cselect_b64 s[18:19], s[24:25], s[18:19] +; GFX9-NEXT: s_cmp_lg_u32 s29, 0 +; GFX9-NEXT: s_cselect_b64 s[18:19], s[0:1], s[18:19] +; GFX9-NEXT: s_and_b32 s0, s16, 0x7f +; GFX9-NEXT: s_sub_i32 s21, s0, 64 +; GFX9-NEXT: s_sub_i32 s22, 64, s0 +; GFX9-NEXT: s_cmp_lt_u32 s0, 64 ; GFX9-NEXT: s_cselect_b32 s26, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s18, 0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 ; GFX9-NEXT: s_cselect_b32 s27, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], s18 -; GFX9-NEXT: s_lshr_b64 s[18:19], s[8:9], s18 -; GFX9-NEXT: s_lshl_b64 s[22:23], s[10:11], s22 -; GFX9-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] -; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s24 +; GFX9-NEXT: s_lshr_b64 s[0:1], s[10:11], s16 +; GFX9-NEXT: s_lshr_b64 s[16:17], s[8:9], s16 +; GFX9-NEXT: s_lshl_b64 s[24:25], s[10:11], s22 +; GFX9-NEXT: s_or_b64 s[16:17], s[16:17], s[24:25] +; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s21 ; GFX9-NEXT: s_cmp_lg_u32 s26, 0 -; GFX9-NEXT: s_cselect_b64 s[10:11], s[18:19], s[10:11] +; GFX9-NEXT: s_cselect_b64 s[10:11], s[16:17], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s27, 0 ; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s26, 0 ; GFX9-NEXT: s_cselect_b64 s[10:11], s[0:1], 0 -; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] -; GFX9-NEXT: s_or_b64 s[2:3], s[16:17], s[10:11] -; GFX9-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX9-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] ; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX9-NEXT: 
s_lshr_b32 s24, s5, 31 -; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], 1 -; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[24:25] -; GFX9-NEXT: s_sub_i32 s9, s10, 64 -; GFX9-NEXT: s_sub_i32 s11, 64, s10 -; GFX9-NEXT: s_cmp_lt_u32 s10, 64 -; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s10, 0 +; GFX9-NEXT: s_lshr_b32 s22, s5, 31 +; GFX9-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9] +; GFX9-NEXT: s_lshl_b64 s[8:9], s[4:5], 1 +; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[22:23] +; GFX9-NEXT: s_andn2_b32 s6, 0x7f, s20 +; GFX9-NEXT: s_or_b64 s[2:3], s[18:19], s[10:11] +; GFX9-NEXT: s_not_b32 s16, s20 +; GFX9-NEXT: s_sub_i32 s18, s6, 64 +; GFX9-NEXT: s_sub_i32 s10, 64, s6 +; GFX9-NEXT: s_cmp_lt_u32 s6, 64 +; GFX9-NEXT: s_cselect_b32 s19, 1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s6, 0 ; GFX9-NEXT: s_cselect_b32 s21, 1, 0 -; GFX9-NEXT: s_lshl_b64 s[6:7], s[16:17], s10 -; GFX9-NEXT: s_lshr_b64 s[18:19], s[16:17], s11 -; GFX9-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX9-NEXT: s_or_b64 s[10:11], s[18:19], s[10:11] -; GFX9-NEXT: s_lshl_b64 s[16:17], s[16:17], s9 -; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_lshl_b64 s[6:7], s[8:9], s16 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[8:9], s10 +; GFX9-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] +; GFX9-NEXT: s_lshl_b64 s[8:9], s[8:9], s18 +; GFX9-NEXT: s_cmp_lg_u32 s19, 0 ; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], 0 -; GFX9-NEXT: s_cselect_b64 s[10:11], s[10:11], s[16:17] +; GFX9-NEXT: s_cselect_b64 s[8:9], s[10:11], s[8:9] ; GFX9-NEXT: s_cmp_lg_u32 s21, 0 -; GFX9-NEXT: s_cselect_b64 s[10:11], s[4:5], s[10:11] -; GFX9-NEXT: s_sub_i32 s18, s8, 64 -; GFX9-NEXT: s_sub_i32 s16, 64, s8 -; GFX9-NEXT: s_cmp_lt_u32 s8, 64 +; GFX9-NEXT: s_cselect_b64 s[8:9], s[4:5], s[8:9] +; GFX9-NEXT: s_and_b32 s4, s20, 0x7f +; GFX9-NEXT: s_sub_i32 s18, s4, 64 +; GFX9-NEXT: s_sub_i32 s16, 64, s4 +; GFX9-NEXT: s_cmp_lt_u32 s4, 64 ; GFX9-NEXT: s_cselect_b32 s19, 1, 0 -; GFX9-NEXT: s_cmp_eq_u32 s8, 0 -; GFX9-NEXT: s_cselect_b32 s20, 1, 0 -; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], s8 -; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], s8 +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cselect_b32 s21, 1, 0 +; GFX9-NEXT: s_lshr_b64 s[10:11], s[12:13], s20 ; GFX9-NEXT: s_lshl_b64 s[16:17], s[14:15], s16 -; GFX9-NEXT: s_or_b64 s[8:9], s[8:9], s[16:17] +; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], s20 +; GFX9-NEXT: s_or_b64 s[10:11], s[10:11], s[16:17] ; GFX9-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[8:9], s[14:15] -; GFX9-NEXT: s_cmp_lg_u32 s20, 0 -; GFX9-NEXT: s_cselect_b64 s[8:9], s[12:13], s[8:9] +; GFX9-NEXT: s_cselect_b64 s[10:11], s[10:11], s[14:15] +; GFX9-NEXT: s_cmp_lg_u32 s21, 0 +; GFX9-NEXT: s_cselect_b64 s[10:11], s[12:13], s[10:11] ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 ; GFX9-NEXT: s_cselect_b64 s[12:13], s[4:5], 0 -; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[8:9] -; GFX9-NEXT: s_or_b64 s[6:7], s[10:11], s[12:13] +; GFX9-NEXT: s_or_b64 s[4:5], s[6:7], s[10:11] +; GFX9-NEXT: s_or_b64 s[6:7], s[8:9], s[12:13] ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_v2i128: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX10-NEXT: s_andn2_b64 s[16:17], 0x7f, s[16:17] ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: s_lshr_b32 s22, s1, 31 -; GFX10-NEXT: s_mov_b32 s23, 0 +; GFX10-NEXT: s_lshr_b32 s18, s1, 31 +; GFX10-NEXT: s_mov_b32 s19, 0 +; GFX10-NEXT: s_andn2_b32 s17, 0x7f, s16 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX10-NEXT: s_or_b64 s[2:3], 
s[2:3], s[22:23] -; GFX10-NEXT: s_sub_i32 s19, s16, 64 -; GFX10-NEXT: s_sub_i32 s17, 64, s16 -; GFX10-NEXT: s_cmp_lt_u32 s16, 64 -; GFX10-NEXT: s_cselect_b32 s22, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s16, 0 +; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[18:19] +; GFX10-NEXT: s_not_b32 s18, s16 +; GFX10-NEXT: s_sub_i32 s21, s17, 64 +; GFX10-NEXT: s_sub_i32 s22, 64, s17 +; GFX10-NEXT: s_cmp_lt_u32 s17, 64 ; GFX10-NEXT: s_cselect_b32 s28, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[24:25], s[0:1], s17 -; GFX10-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX10-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s19 -; GFX10-NEXT: s_cmp_lg_u32 s22, 0 -; GFX10-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s17, 0 +; GFX10-NEXT: s_cselect_b32 s17, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[22:23], s[0:1], s22 +; GFX10-NEXT: s_lshl_b64 s[24:25], s[2:3], s18 +; GFX10-NEXT: s_lshl_b64 s[26:27], s[0:1], s18 +; GFX10-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], s21 ; GFX10-NEXT: s_cmp_lg_u32 s28, 0 +; GFX10-NEXT: s_cselect_b64 s[24:25], s[26:27], 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s17, 0 ; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX10-NEXT: s_sub_i32 s22, s18, 64 -; GFX10-NEXT: s_sub_i32 s19, 64, s18 -; GFX10-NEXT: s_cmp_lt_u32 s18, 64 +; GFX10-NEXT: s_and_b32 s0, s16, 0x7f +; GFX10-NEXT: s_sub_i32 s18, s0, 64 +; GFX10-NEXT: s_sub_i32 s17, 64, s0 +; GFX10-NEXT: s_cmp_lt_u32 s0, 64 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 ; GFX10-NEXT: s_cselect_b32 s26, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s18, 0 -; GFX10-NEXT: s_cselect_b32 s27, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s18 -; GFX10-NEXT: s_lshl_b64 s[24:25], s[10:11], s19 -; GFX10-NEXT: s_lshr_b64 s[18:19], s[10:11], s18 -; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[24:25] -; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s22 -; GFX10-NEXT: s_cmp_lg_u32 s26, 0 +; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s16 +; GFX10-NEXT: s_lshl_b64 s[22:23], s[10:11], s17 +; GFX10-NEXT: s_lshr_b64 s[16:17], s[10:11], s16 +; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 +; GFX10-NEXT: s_cmp_lg_u32 s21, 0 ; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX10-NEXT: s_cmp_lg_u32 s27, 0 -; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX10-NEXT: s_cmp_lg_u32 s26, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[18:19], 0 -; GFX10-NEXT: s_andn2_b64 s[10:11], 0x7f, s[20:21] +; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX10-NEXT: s_cmp_lg_u32 s21, 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX10-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX10-NEXT: s_lshr_b32 s22, s5, 31 ; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX10-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX10-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1] +; GFX10-NEXT: s_lshr_b32 s18, s5, 31 +; GFX10-NEXT: s_andn2_b32 s8, 0x7f, s20 +; GFX10-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] ; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 -; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[22:23] -; GFX10-NEXT: s_sub_i32 s9, s10, 64 -; GFX10-NEXT: s_sub_i32 s11, 64, s10 -; GFX10-NEXT: s_cmp_lt_u32 s10, 64 -; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 0 -; GFX10-NEXT: s_cselect_b32 s21, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[16:17], s[4:5], s11 -; GFX10-NEXT: s_lshl_b64 s[18:19], s[6:7], s10 -; 
GFX10-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX10-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 -; GFX10-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] -; GFX10-NEXT: s_cmp_lg_u32 s21, 0 -; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19] +; GFX10-NEXT: s_not_b32 s16, s20 ; GFX10-NEXT: s_sub_i32 s18, s8, 64 ; GFX10-NEXT: s_sub_i32 s9, 64, s8 ; GFX10-NEXT: s_cmp_lt_u32 s8, 64 ; GFX10-NEXT: s_cselect_b32 s19, 1, 0 ; GFX10-NEXT: s_cmp_eq_u32 s8, 0 -; GFX10-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], s8 -; GFX10-NEXT: s_lshl_b64 s[16:17], s[14:15], s9 -; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], s8 -; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[16:17] -; GFX10-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[8:9], s[4:5], s9 +; GFX10-NEXT: s_lshl_b64 s[10:11], s[6:7], s16 +; GFX10-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX10-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX10-NEXT: s_lshl_b64 s[4:5], s[4:5], s18 +; GFX10-NEXT: s_cmp_lg_u32 s19, 0 +; GFX10-NEXT: s_cselect_b64 s[10:11], s[16:17], 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX10-NEXT: s_cmp_lg_u32 s21, 0 +; GFX10-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX10-NEXT: s_and_b32 s4, s20, 0x7f +; GFX10-NEXT: s_sub_i32 s18, s4, 64 +; GFX10-NEXT: s_sub_i32 s8, 64, s4 +; GFX10-NEXT: s_cmp_lt_u32 s4, 64 +; GFX10-NEXT: s_cselect_b32 s19, 1, 0 +; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s21, 1, 0 +; GFX10-NEXT: s_lshr_b64 s[4:5], s[12:13], s20 +; GFX10-NEXT: s_lshl_b64 s[8:9], s[14:15], s8 +; GFX10-NEXT: s_lshr_b64 s[16:17], s[14:15], s20 +; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], s18 ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[14:15] -; GFX10-NEXT: s_cmp_lg_u32 s20, 0 +; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX10-NEXT: s_cmp_lg_u32 s21, 0 ; GFX10-NEXT: s_cselect_b64 s[4:5], s[12:13], s[4:5] ; GFX10-NEXT: s_cmp_lg_u32 s19, 0 -; GFX10-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX10-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX10-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] ; GFX10-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX10-NEXT: ; return to shader part epilog ; ; GFX11-LABEL: s_fshr_v2i128: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_and_b64 s[18:19], s[16:17], 0x7f -; GFX11-NEXT: s_and_not1_b64 s[16:17], 0x7f, s[16:17] ; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX11-NEXT: s_lshr_b32 s22, s1, 31 -; GFX11-NEXT: s_mov_b32 s23, 0 +; GFX11-NEXT: s_lshr_b32 s18, s1, 31 +; GFX11-NEXT: s_mov_b32 s19, 0 +; GFX11-NEXT: s_and_not1_b32 s17, 0x7f, s16 ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 -; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[22:23] -; GFX11-NEXT: s_sub_i32 s19, s16, 64 -; GFX11-NEXT: s_sub_i32 s17, 64, s16 -; GFX11-NEXT: s_cmp_lt_u32 s16, 64 -; GFX11-NEXT: s_cselect_b32 s22, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s16, 0 +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[18:19] +; GFX11-NEXT: s_not_b32 s18, s16 +; GFX11-NEXT: s_sub_i32 s21, s17, 64 +; GFX11-NEXT: s_sub_i32 s22, 64, s17 +; GFX11-NEXT: s_cmp_lt_u32 s17, 64 ; GFX11-NEXT: s_cselect_b32 s28, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[24:25], s[0:1], s17 -; GFX11-NEXT: s_lshl_b64 s[26:27], s[2:3], s16 -; GFX11-NEXT: s_lshl_b64 s[16:17], s[0:1], s16 -; GFX11-NEXT: s_or_b64 s[24:25], s[24:25], s[26:27] -; GFX11-NEXT: 
s_lshl_b64 s[0:1], s[0:1], s19 -; GFX11-NEXT: s_cmp_lg_u32 s22, 0 -; GFX11-NEXT: s_cselect_b64 s[16:17], s[16:17], 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[24:25], s[0:1] +; GFX11-NEXT: s_cmp_eq_u32 s17, 0 +; GFX11-NEXT: s_cselect_b32 s17, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[22:23], s[0:1], s22 +; GFX11-NEXT: s_lshl_b64 s[24:25], s[2:3], s18 +; GFX11-NEXT: s_lshl_b64 s[26:27], s[0:1], s18 +; GFX11-NEXT: s_or_b64 s[22:23], s[22:23], s[24:25] +; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], s21 ; GFX11-NEXT: s_cmp_lg_u32 s28, 0 +; GFX11-NEXT: s_cselect_b64 s[24:25], s[26:27], 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], s[22:23], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s17, 0 ; GFX11-NEXT: s_cselect_b64 s[2:3], s[2:3], s[0:1] -; GFX11-NEXT: s_sub_i32 s22, s18, 64 -; GFX11-NEXT: s_sub_i32 s19, 64, s18 -; GFX11-NEXT: s_cmp_lt_u32 s18, 64 +; GFX11-NEXT: s_and_b32 s0, s16, 0x7f +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s18, s0, 64 +; GFX11-NEXT: s_sub_i32 s17, 64, s0 +; GFX11-NEXT: s_cmp_lt_u32 s0, 64 +; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: s_cselect_b32 s26, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s18, 0 -; GFX11-NEXT: s_cselect_b32 s27, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s18 -; GFX11-NEXT: s_lshl_b64 s[24:25], s[10:11], s19 -; GFX11-NEXT: s_lshr_b64 s[18:19], s[10:11], s18 -; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[24:25] -; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s22 -; GFX11-NEXT: s_cmp_lg_u32 s26, 0 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], s16 +; GFX11-NEXT: s_lshl_b64 s[22:23], s[10:11], s17 +; GFX11-NEXT: s_lshr_b64 s[16:17], s[10:11], s16 +; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[22:23] +; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s18 +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 ; GFX11-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11] -; GFX11-NEXT: s_cmp_lg_u32 s27, 0 -; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] ; GFX11-NEXT: s_cmp_lg_u32 s26, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[18:19], 0 -; GFX11-NEXT: s_and_not1_b64 s[10:11], 0x7f, s[20:21] +; GFX11-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1] +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX11-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GFX11-NEXT: s_lshr_b32 s22, s5, 31 ; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] -; GFX11-NEXT: s_and_b64 s[8:9], s[20:21], 0x7f -; GFX11-NEXT: s_or_b64 s[0:1], s[16:17], s[0:1] +; GFX11-NEXT: s_lshr_b32 s18, s5, 31 +; GFX11-NEXT: s_and_not1_b32 s8, 0x7f, s20 +; GFX11-NEXT: s_or_b64 s[0:1], s[24:25], s[0:1] ; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], 1 -; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[22:23] -; GFX11-NEXT: s_sub_i32 s9, s10, 64 -; GFX11-NEXT: s_sub_i32 s11, 64, s10 -; GFX11-NEXT: s_cmp_lt_u32 s10, 64 -; GFX11-NEXT: s_cselect_b32 s20, 1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s10, 0 -; GFX11-NEXT: s_cselect_b32 s21, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[16:17], s[4:5], s11 -; GFX11-NEXT: s_lshl_b64 s[18:19], s[6:7], s10 -; GFX11-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GFX11-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s9 -; GFX11-NEXT: s_cmp_lg_u32 s20, 0 -; GFX11-NEXT: s_cselect_b64 s[10:11], s[10:11], 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[16:17], s[4:5] -; GFX11-NEXT: s_cmp_lg_u32 s21, 0 -; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[18:19] +; GFX11-NEXT: s_not_b32 s16, s20 ; GFX11-NEXT: s_sub_i32 s18, s8, 64 ; GFX11-NEXT: s_sub_i32 s9, 64, s8 ; GFX11-NEXT: s_cmp_lt_u32 s8, 64 ; GFX11-NEXT: s_cselect_b32 s19, 1, 0 ; 
GFX11-NEXT: s_cmp_eq_u32 s8, 0 -; GFX11-NEXT: s_cselect_b32 s20, 1, 0 -; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], s8 -; GFX11-NEXT: s_lshl_b64 s[16:17], s[14:15], s9 -; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], s8 -; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[16:17] -; GFX11-NEXT: s_lshr_b64 s[14:15], s[14:15], s18 +; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[8:9], s[4:5], s9 +; GFX11-NEXT: s_lshl_b64 s[10:11], s[6:7], s16 +; GFX11-NEXT: s_lshl_b64 s[16:17], s[4:5], s16 +; GFX11-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s18 +; GFX11-NEXT: s_cmp_lg_u32 s19, 0 +; GFX11-NEXT: s_cselect_b64 s[10:11], s[16:17], 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[8:9], s[4:5] +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 +; GFX11-NEXT: s_cselect_b64 s[6:7], s[6:7], s[4:5] +; GFX11-NEXT: s_and_b32 s4, s20, 0x7f +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s18, s4, 64 +; GFX11-NEXT: s_sub_i32 s8, 64, s4 +; GFX11-NEXT: s_cmp_lt_u32 s4, 64 +; GFX11-NEXT: s_cselect_b32 s19, 1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cselect_b32 s21, 1, 0 +; GFX11-NEXT: s_lshr_b64 s[4:5], s[12:13], s20 +; GFX11-NEXT: s_lshl_b64 s[8:9], s[14:15], s8 +; GFX11-NEXT: s_lshr_b64 s[16:17], s[14:15], s20 +; GFX11-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], s18 ; GFX11-NEXT: s_cmp_lg_u32 s19, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[14:15] -; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: s_cselect_b64 s[4:5], s[4:5], s[8:9] +; GFX11-NEXT: s_cmp_lg_u32 s21, 0 ; GFX11-NEXT: s_cselect_b64 s[4:5], s[12:13], s[4:5] ; GFX11-NEXT: s_cmp_lg_u32 s19, 0 -; GFX11-NEXT: s_cselect_b64 s[8:9], s[8:9], 0 +; GFX11-NEXT: s_cselect_b64 s[8:9], s[16:17], 0 ; GFX11-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] ; GFX11-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] ; GFX11-NEXT: ; return to shader part epilog @@ -7649,68 +7711,68 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-LABEL: v_fshr_v2i128: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX6-NEXT: v_not_b32_e32 v16, v16 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GFX6-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[0:1], 1 +; GFX6-NEXT: v_lshl_b64 v[17:18], v[0:1], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v24 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[16:17], v0 -; GFX6-NEXT: v_lshl_b64 v[18:19], v[2:3], v24 -; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v24 -; GFX6-NEXT: v_lshl_b64 v[21:22], v[16:17], v24 -; GFX6-NEXT: v_or_b32_e32 v18, v0, v18 -; GFX6-NEXT: v_or_b32_e32 v19, v1, v19 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[16:17], v25 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX6-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v18, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX6-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v23 -; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v23 +; GFX6-NEXT: v_not_b32_e32 v0, v16 +; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v0 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v19 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[17:18], v0 +; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19 +; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v19 +; GFX6-NEXT: v_lshl_b64 
v[23:24], v[17:18], v19 +; GFX6-NEXT: v_or_b32_e32 v21, v0, v21 +; GFX6-NEXT: v_or_b32_e32 v22, v1, v22 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[17:18], v25 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX6-NEXT: v_and_b32_e32 v22, 0x7f, v16 +; GFX6-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v22 +; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v22 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[10:11], v2 -; GFX6-NEXT: v_subrev_i32_e32 v24, vcc, 64, v23 +; GFX6-NEXT: v_subrev_i32_e32 v24, vcc, 64, v22 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24 -; GFX6-NEXT: v_lshr_b64 v[16:17], v[10:11], v23 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX6-NEXT: v_not_b32_e32 v8, v20 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] -; GFX6-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], 1 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX6-NEXT: v_lshr_b64 v[16:17], v[10:11], v22 ; GFX6-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v19 +; GFX6-NEXT: v_not_b32_e32 v4, v20 +; GFX6-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v18 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], v4 -; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v19 -; GFX6-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX6-NEXT: v_subrev_i32_e32 v20, vcc, 64, v19 -; GFX6-NEXT: v_lshl_b64 v[16:17], v[8:9], v19 +; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18 +; GFX6-NEXT: v_or_b32_e32 v2, v19, v2 +; GFX6-NEXT: v_subrev_i32_e32 v19, vcc, 64, v18 +; GFX6-NEXT: v_lshl_b64 v[16:17], v[8:9], v18 ; GFX6-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX6-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v20 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX6-NEXT: v_lshl_b64 v[4:5], v[8:9], v19 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 ; GFX6-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; GFX6-NEXT: v_and_b32_e32 v18, 0x7f, v20 ; GFX6-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v18 @@ -7729,8 +7791,8 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] ; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; 
GFX6-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX6-NEXT: v_or_b32_e32 v1, v22, v1 +; GFX6-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX6-NEXT: v_or_b32_e32 v3, v21, v3 ; GFX6-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX6-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX6-NEXT: v_or_b32_e32 v6, v10, v6 @@ -7740,68 +7802,68 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-LABEL: v_fshr_v2i128: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX8-NEXT: v_not_b32_e32 v16, v16 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX8-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], 1, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1] ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v24 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[16:17] -; GFX8-NEXT: v_lshlrev_b64 v[18:19], v24, v[2:3] -; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v24 -; GFX8-NEXT: v_lshlrev_b64 v[21:22], v24, v[16:17] -; GFX8-NEXT: v_or_b32_e32 v18, v0, v18 -; GFX8-NEXT: v_or_b32_e32 v19, v1, v19 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[16:17] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v18, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v23 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v23, v[8:9] +; GFX8-NEXT: v_not_b32_e32 v0, v16 +; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v19 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] +; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] +; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v19 +; GFX8-NEXT: v_lshlrev_b64 v[23:24], v19, v[17:18] +; GFX8-NEXT: v_or_b32_e32 v21, v0, v21 +; GFX8-NEXT: v_or_b32_e32 v22, v1, v22 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[17:18] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX8-NEXT: v_and_b32_e32 v22, 0x7f, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v22 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v22, v[8:9] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] -; GFX8-NEXT: v_subrev_u32_e32 v24, vcc, 64, v23 +; GFX8-NEXT: v_subrev_u32_e32 v24, vcc, 64, v22 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX8-NEXT: v_lshrrev_b64 v[16:17], v23, v[10:11] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX8-NEXT: v_not_b32_e32 v8, v20 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 ; GFX8-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] -; GFX8-NEXT: v_or_b32_e32 v3, v19, v3 -; 
GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX8-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX8-NEXT: v_lshrrev_b64 v[16:17], v22, v[10:11] ; GFX8-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v19 +; GFX8-NEXT: v_not_b32_e32 v4, v20 +; GFX8-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v18 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] -; GFX8-NEXT: v_lshlrev_b64 v[10:11], v19, v[6:7] -; GFX8-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX8-NEXT: v_subrev_u32_e32 v20, vcc, 64, v19 -; GFX8-NEXT: v_lshlrev_b64 v[16:17], v19, v[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] +; GFX8-NEXT: v_or_b32_e32 v2, v19, v2 +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, 64, v18 +; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9] ; GFX8-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX8-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX8-NEXT: v_lshlrev_b64 v[4:5], v20, v[8:9] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX8-NEXT: v_lshlrev_b64 v[4:5], v19, v[8:9] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 ; GFX8-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; GFX8-NEXT: v_and_b32_e32 v18, 0x7f, v20 ; GFX8-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v18 @@ -7820,8 +7882,8 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v22, v1 +; GFX8-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX8-NEXT: v_or_b32_e32 v3, v21, v3 ; GFX8-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX8-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX8-NEXT: v_or_b32_e32 v6, v10, v6 @@ -7831,68 +7893,68 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-LABEL: v_fshr_v2i128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v16 -; GFX9-NEXT: v_not_b32_e32 v16, v16 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX9-NEXT: v_and_b32_e32 v24, 0x7f, v16 -; GFX9-NEXT: v_lshlrev_b64 v[16:17], 1, v[0:1] +; GFX9-NEXT: v_lshlrev_b64 v[17:18], 1, v[0:1] ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 31, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v24 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[16:17] -; GFX9-NEXT: v_lshlrev_b64 v[18:19], v24, v[2:3] -; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v24 -; GFX9-NEXT: v_lshlrev_b64 v[21:22], v24, v[16:17] -; GFX9-NEXT: v_or_b32_e32 v18, v0, v18 -; GFX9-NEXT: v_or_b32_e32 v19, v1, v19 -; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[16:17] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v18, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v19, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v0, v2, vcc -; GFX9-NEXT: v_sub_u32_e32 v2, 64, v23 
-; GFX9-NEXT: v_cndmask_b32_e32 v19, v1, v3, vcc -; GFX9-NEXT: v_lshrrev_b64 v[0:1], v23, v[8:9] +; GFX9-NEXT: v_not_b32_e32 v0, v16 +; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, 64, v19 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] +; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] +; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v19 +; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[17:18] +; GFX9-NEXT: v_or_b32_e32 v21, v0, v21 +; GFX9-NEXT: v_or_b32_e32 v22, v1, v22 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v25, v[17:18] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v21, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v22, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_and_b32_e32 v22, 0x7f, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v19, v0, v2, vcc +; GFX9-NEXT: v_sub_u32_e32 v2, 64, v22 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b64 v[0:1], v22, v[8:9] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] -; GFX9-NEXT: v_subrev_u32_e32 v24, 64, v23 +; GFX9-NEXT: v_subrev_u32_e32 v24, 64, v22 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] -; GFX9-NEXT: v_lshrrev_b64 v[16:17], v23, v[10:11] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v23 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v22 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v23 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc -; GFX9-NEXT: v_not_b32_e32 v8, v20 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v22 ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] -; GFX9-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX9-NEXT: v_lshlrev_b64 v[8:9], 1, v[4:5] ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX9-NEXT: v_sub_u32_e32 v4, 64, v19 +; GFX9-NEXT: v_not_b32_e32 v4, v20 +; GFX9-NEXT: v_lshrrev_b64 v[16:17], v22, v[10:11] +; GFX9-NEXT: v_or_b32_e32 v0, v18, v0 +; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v4 +; GFX9-NEXT: v_sub_u32_e32 v4, 64, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] -; GFX9-NEXT: v_lshlrev_b64 v[10:11], v19, v[6:7] -; GFX9-NEXT: v_or_b32_e32 v2, v18, v2 -; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 -; GFX9-NEXT: v_subrev_u32_e32 v20, 64, v19 -; GFX9-NEXT: v_lshlrev_b64 v[16:17], v19, v[8:9] +; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] +; GFX9-NEXT: v_or_b32_e32 v2, v19, v2 +; GFX9-NEXT: v_subrev_u32_e32 v19, 64, v18 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc +; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9] ; GFX9-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX9-NEXT: v_or_b32_e32 v11, v5, v11 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], v20, v[8:9] -; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 +; GFX9-NEXT: v_lshlrev_b64 v[4:5], v19, v[8:9] +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v18 +; GFX9-NEXT: v_and_b32_e32 v18, 0x7f, v20 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v4, v6, vcc ; 
GFX9-NEXT: v_sub_u32_e32 v6, 64, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc @@ -7911,8 +7973,8 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc -; GFX9-NEXT: v_or_b32_e32 v0, v21, v0 -; GFX9-NEXT: v_or_b32_e32 v1, v22, v1 +; GFX9-NEXT: v_or_b32_e32 v1, v23, v1 +; GFX9-NEXT: v_or_b32_e32 v3, v21, v3 ; GFX9-NEXT: v_or_b32_e32 v4, v16, v4 ; GFX9-NEXT: v_or_b32_e32 v5, v17, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v10, v6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 404e726246f4d..81abe91b283f9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -2787,52 +2787,51 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-LABEL: v_sdiv_v2i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v1, v5 +; CGP-NEXT: v_and_b32_e32 v6, 0xffffff, v6 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v5 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 -; CGP-NEXT: v_and_b32_e32 v7, 0xffffff, v0 +; CGP-NEXT: v_rcp_f32_e32 v7, v3 ; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v6, v5 -; CGP-NEXT: v_rcp_f32_e32 v8, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 -; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_mov_b32_e32 v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; CGP-NEXT: v_cvt_u32_f32_e32 v1, v1 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v1 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v4, 0 +; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v0 +; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v0 +; CGP-NEXT: v_mov_b32_e32 v0, v4 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v0, 0 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v6 +; CGP-NEXT: v_mul_lo_u32 v4, v1, v5 +; CGP-NEXT: v_mul_lo_u32 v0, v0, v7 +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v1 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e32 v4, v1, v8, vcc ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mov_b32_e32 v5, v1 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v6 -; CGP-NEXT: v_mul_lo_u32 v1, v5, v3 -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v5 -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v3, v5 ; CGP-NEXT: v_mov_b32_e32 v0, v1 -; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v6, v0 +; CGP-NEXT: v_add_i32_e64 v0, s[4:5], v7, v0 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 -; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v7, v3 -; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v4 ; CGP-NEXT: v_mov_b32_e32 v7, v1 -; CGP-NEXT: 
v_mul_lo_u32 v8, v7, v4 -; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v5 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; CGP-NEXT: v_mul_lo_u32 v8, v7, v6 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 +; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v7 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v2, v4 -; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v3 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 +; CGP-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; CGP-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll index 3729f1cc2b12d..183f2edbf9035 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -563,18 +563,21 @@ define amdgpu_ps i32 @s_shl_i32_zext_i16(i16 inreg %x) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_and_b32 s0, s0, 0x3fff ; GFX8-NEXT: s_lshl_b32 s0, s0, 2 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_shl_i32_zext_i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_and_b32 s0, s0, 0x3fff ; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10PLUS-LABEL: s_shl_i32_zext_i16: ; GFX10PLUS: ; %bb.0: ; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0x3fff ; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX10PLUS-NEXT: ; return to shader part epilog %and = and i16 %x, 16383 %ext = zext i16 %and to i32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 5b94e71ecf52e..cfac0c2fa56aa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -3286,45 +3286,45 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-LABEL: v_srem_v2i64_24bit: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v4 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6 -; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v3 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v4 +; CGP-NEXT: v_cvt_f32_u32_e32 v1, v5 +; CGP-NEXT: v_and_b32_e32 v6, 0xffffff, v6 +; CGP-NEXT: v_cvt_f32_u32_e32 v3, v6 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v5 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 -; CGP-NEXT: v_and_b32_e32 v7, 0xffffff, v0 +; CGP-NEXT: v_rcp_f32_e32 v7, v3 ; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v1 -; CGP-NEXT: v_cvt_f32_u32_e32 v1, v4 -; CGP-NEXT: v_mul_lo_u32 v6, v6, v5 -; CGP-NEXT: v_rcp_f32_e32 v8, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 -; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v8 -; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_mov_b32_e32 v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v0 +; CGP-NEXT: 
v_cvt_u32_f32_e32 v1, v1 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v1 +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v4, 0 +; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v0 +; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 +; CGP-NEXT: v_cvt_u32_f32_e32 v7, v0 +; CGP-NEXT: v_mov_b32_e32 v0, v4 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v0, 0 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v6 +; CGP-NEXT: v_mul_lo_u32 v0, v0, v7 +; CGP-NEXT: v_mul_lo_u32 v4, v1, v5 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v6 -; CGP-NEXT: v_mul_lo_u32 v5, v1, v3 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 ; CGP-NEXT: v_mov_b32_e32 v0, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v7, v0 ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v5, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v0, v3 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v4 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v3, v5 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v1, v6 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v5 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 +; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v4 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v2, v6 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll index e31d8e95bd608..1ee521b3dedac 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll @@ -2147,26 +2147,26 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2 -; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v4 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v4 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v6 ; CGP-NEXT: v_cvt_f32_u32_e32 v0, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v2, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v2, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v3 -; CGP-NEXT: v_rcp_f32_e32 v4, v2 +; CGP-NEXT: v_rcp_f32_e32 v4, v1 ; CGP-NEXT: v_rcp_f32_e32 v5, v3 ; CGP-NEXT: v_mul_f32_e32 v4, v0, v4 -; CGP-NEXT: v_mul_f32_e32 v5, v1, v5 +; CGP-NEXT: v_mul_f32_e32 v5, v2, v5 ; CGP-NEXT: v_trunc_f32_e32 v4, v4 ; CGP-NEXT: v_trunc_f32_e32 v5, v5 -; CGP-NEXT: v_mad_f32 v0, -v4, v2, v0 +; CGP-NEXT: v_mad_f32 v0, -v4, v1, v0 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4 -; CGP-NEXT: v_mad_f32 v1, -v5, v3, v1 +; CGP-NEXT: v_mad_f32 v2, -v5, v3, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v5, 
v5 -; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v2 +; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v1 ; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, v3 +; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, v3 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index f30b278b3e611..a7e5ce3d21619 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -2561,12 +2561,12 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2 -; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v4 +; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v4 +; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v6 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 -; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2 -; CGP-NEXT: v_cvt_f32_u32_e32 v6, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v5, v1 +; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3 ; CGP-NEXT: v_rcp_f32_e32 v8, v5 ; CGP-NEXT: v_rcp_f32_e32 v9, v7 @@ -2584,10 +2584,10 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mul_lo_u32 v2, v4, v2 +; CGP-NEXT: v_mul_lo_u32 v1, v4, v1 ; CGP-NEXT: v_mul_lo_u32 v3, v5, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v2, v3 ; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v1 ; CGP-NEXT: v_mov_b32_e32 v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll index 9ea9fa91e4f92..1b35a89ad7f93 100644 --- a/llvm/test/CodeGen/AMDGPU/constrained-shift.ll +++ b/llvm/test/CodeGen/AMDGPU/constrained-shift.ll @@ -278,7 +278,6 @@ define amdgpu_ps i64 @s_csh_64_0(i64 inreg %a, i64 inreg %b) { ; ; GISEL-LABEL: s_csh_64_0: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_and_b64 s[2:3], s[2:3], 63 ; GISEL-NEXT: s_lshl_b64 s[4:5], s[0:1], s2 ; GISEL-NEXT: s_lshr_b64 s[6:7], s[0:1], s2 ; GISEL-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 @@ -310,7 +309,6 @@ define amdgpu_ps i64 @s_csh_64_1(i64 inreg %a, i64 inreg %b) { ; ; GISEL-LABEL: s_csh_64_1: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_and_b64 s[2:3], s[2:3], 0xff ; GISEL-NEXT: s_lshl_b64 s[4:5], s[0:1], s2 ; GISEL-NEXT: s_lshr_b64 s[6:7], s[0:1], s2 ; GISEL-NEXT: s_ashr_i64 s[0:1], s[0:1], s2 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index a0b549711f339..93e14a205f05d 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -1592,7 +1592,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 +; GFX10-GISEL-NEXT: v_sub_nc_u16 v1, v1, 24 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] @@ -1837,7 +1837,7 @@ 
define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out,
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
-; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 25, v1
+; GFX10-GISEL-NEXT: v_sub_nc_u16 v1, v1, 25
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0

From b88aced1abd8280e305d176c3cc5d85ae720ae50 Mon Sep 17 00:00:00 2001
From: jeanPerier
Date: Wed, 11 Sep 2024 15:30:23 +0200
Subject: [PATCH 101/114] [flang][lowering] handle procedure pointers with generic name (#108043)

Handle procedure pointers with the same name as generics in lowering to
avoid crashes after #107928.
---
 flang/lib/Lower/PFTBuilder.cpp | 9 +++-
 .../HLFIR/procedure-pointer-in-generics.f90 | 46 +++++++++++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)
 create mode 100644 flang/test/Lower/HLFIR/procedure-pointer-in-generics.f90

diff --git a/flang/lib/Lower/PFTBuilder.cpp b/flang/lib/Lower/PFTBuilder.cpp
index 5b3d5471925bf..793e291a168ad 100644
--- a/flang/lib/Lower/PFTBuilder.cpp
+++ b/flang/lib/Lower/PFTBuilder.cpp
@@ -1566,6 +1566,14 @@ struct SymbolDependenceAnalysis {
       return 0;
     LLVM_DEBUG(llvm::dbgs() << "analyze symbol " << &sym << " in <"
                             << &sym.owner() << ">: " << sym << '\n');
+    const semantics::Symbol &ultimate = sym.GetUltimate();
+    if (const auto *details = ultimate.detailsIf<semantics::GenericDetails>()) {
+      // Procedure pointers may be "hidden" behind the generic symbol if they
+      // have the same name.
+      if (const semantics::Symbol *specific = details->specific())
+        analyze(*specific);
+      return 0;
+    }
     const bool isProcedurePointerOrDummy =
         semantics::IsProcedurePointer(sym) ||
         (semantics::IsProcedure(sym) && IsDummy(sym));
@@ -1582,7 +1590,6 @@ struct SymbolDependenceAnalysis {
     if (sym.owner().IsDerivedType())
       return 0;
 
-    semantics::Symbol ultimate = sym.GetUltimate();
     if (const auto *details = ultimate.detailsIf<semantics::NamelistDetails>()) {
       // handle namelist group symbols
diff --git a/flang/test/Lower/HLFIR/procedure-pointer-in-generics.f90 b/flang/test/Lower/HLFIR/procedure-pointer-in-generics.f90
new file mode 100644
index 0000000000000..ff447d31b1af1
--- /dev/null
+++ b/flang/test/Lower/HLFIR/procedure-pointer-in-generics.f90
@@ -0,0 +1,46 @@
+! Test procedure pointers with the same name as generics.
+! 
RUN: bbc -emit-hlfir -o - %s | FileCheck %s
+
+module m_gen
+  procedure(func), pointer :: foo
+  interface foo
+    procedure :: foo
+  end interface
+  interface
+    real function func(x)
+      real :: x
+    end function
+  end interface
+end
+!CHECK-LABEL: fir.global @_QMm_genEfoo : !fir.boxproc<(!fir.ref<f32>) -> f32> {
+!CHECK: %[[VAL_0:.*]] = fir.zero_bits (!fir.ref<f32>) -> f32
+!CHECK: %[[VAL_1:.*]] = fir.emboxproc %[[VAL_0]] : ((!fir.ref<f32>) -> f32) -> !fir.boxproc<(!fir.ref<f32>) -> f32>
+!CHECK: fir.has_value %[[VAL_1]] : !fir.boxproc<(!fir.ref<f32>) -> f32>
+
+subroutine test1()
+  use m_gen
+  foo => func
+end subroutine
+!CHECK-LABEL: func.func @_QPtest1() {
+!CHECK: %[[VAL_0:.*]] = fir.address_of(@_QMm_genEfoo) : !fir.ref<!fir.boxproc<(!fir.ref<f32>) -> f32>>
+!CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {{.*}}"_QMm_genEfoo"{{.*}} : (!fir.ref<!fir.boxproc<(!fir.ref<f32>) -> f32>>) -> (!fir.ref<!fir.boxproc<(!fir.ref<f32>) -> f32>>, !fir.ref<!fir.boxproc<(!fir.ref<f32>) -> f32>>)
+!CHECK: %[[VAL_2:.*]] = fir.address_of(@_QPfunc) : (!fir.ref<f32>) -> f32
+!CHECK: %[[VAL_3:.*]] = fir.emboxproc %[[VAL_2]] : ((!fir.ref<f32>) -> f32) -> !fir.boxproc<() -> ()>
+!CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> !fir.boxproc<(!fir.ref<f32>) -> f32>
+!CHECK: fir.store %[[VAL_4]] to %[[VAL_1]]#0 : !fir.ref<!fir.boxproc<(!fir.ref<f32>) -> f32>>
+
+subroutine test_local()
+  use m_gen, only : func
+  procedure(func), pointer :: foo
+  interface foo
+    procedure :: foo
+  end interface
+  foo => func
+end subroutine
+!CHECK-LABEL: func.func @_QPtest_local() {
+!CHECK: %[[VAL_0:.*]] = fir.alloca !fir.boxproc<(!fir.ref<f32>) -> f32> {bindc_name = "foo", uniq_name = "_QFtest_localEfoo"}
+!CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] {{.*}}"_QFtest_localEfoo"{{.*}} : (!fir.ref<!fir.boxproc<(!fir.ref<f32>) -> f32>>) -> (!fir.ref<!fir.boxproc<(!fir.ref<f32>) -> f32>>, !fir.ref<!fir.boxproc<(!fir.ref<f32>) -> f32>>)
+!CHECK: %[[VAL_4:.*]] = fir.address_of(@_QPfunc) : (!fir.ref<f32>) -> f32
+!CHECK: %[[VAL_5:.*]] = fir.emboxproc %[[VAL_4]] : ((!fir.ref<f32>) -> f32) -> !fir.boxproc<() -> ()>
+!CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (!fir.boxproc<() -> ()>) -> !fir.boxproc<(!fir.ref<f32>) -> f32>
+!CHECK: fir.store %[[VAL_6]] to %[[VAL_3]]#0 : !fir.ref<!fir.boxproc<(!fir.ref<f32>) -> f32>>

From 7be6ea124430c3461fb85588d6eb1af70930cfe7 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Wed, 11 Sep 2024 06:39:30 -0700
Subject: [PATCH 102/114] [Dialect] Avoid repeated hash lookups (NFC) (#108137)

---
 mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
index d51d63f243ea0..85604eef2f283 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
@@ -465,9 +465,8 @@ bool AnalysisState::isValueRead(Value value) const {
   while (!workingSet.empty()) {
     OpOperand *uMaybeReading = workingSet.pop_back_val();
-    if (visited.contains(uMaybeReading))
+    if (!visited.insert(uMaybeReading).second)
       continue;
-    visited.insert(uMaybeReading);
 
     // Skip over all ops that neither read nor write (but create an alias).
if (bufferizesToAliasOnly(*uMaybeReading)) From 7dfaedf86127620821da7d31bf08fe1403f19ffe Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Sep 2024 06:39:58 -0700 Subject: [PATCH 103/114] [TableGen] Avoid repeated hash lookups (NFC) (#108138) --- llvm/utils/TableGen/DAGISelMatcherEmitter.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp index 1b93e3d5e3b70..a14cc3d6b844c 100644 --- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp @@ -71,9 +71,9 @@ class MatcherTableEmitter { MapVector> VecPatterns; unsigned getPatternIdxFromTable(std::string &&P, std::string &&include_loc) { - const auto It = VecPatterns.find(P); - if (It == VecPatterns.end()) { - VecPatterns.insert(std::pair(std::move(P), VecPatterns.size())); + const auto [It, Inserted] = + VecPatterns.try_emplace(std::move(P), VecPatterns.size()); + if (Inserted) { VecIncludeStrings.push_back(std::move(include_loc)); return VecIncludeStrings.size() - 1; } From 4b1b450ae4b8fb9185f337c15315f4f473c3b429 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Sep 2024 06:40:17 -0700 Subject: [PATCH 104/114] [Transforms] Avoid repeated hash lookups (NFC) (#108139) --- mlir/lib/Transforms/Utils/DialectConversion.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 450e66f0db4e7..f288c7fc2cb77 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -2486,9 +2486,8 @@ static void eraseDeadUnrealizedCasts( // Do not visit ops multiple times. If we find a circle, no live user was // found on the current path. - if (visited.contains(op)) + if (!visited.insert(op).second) return false; - visited.insert(op); // Visit all users. for (Operation *user : op->getUsers()) { From 6ffa7cd8b04ab4b771925d329d7ee8788a3b00ca Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Sep 2024 06:40:37 -0700 Subject: [PATCH 105/114] [Interfaces] Avoid repeated hash lookups (NFC) (#108140) --- mlir/lib/Interfaces/ValueBoundsOpInterface.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp index 6420c192b257d..505e84e3ca0cf 100644 --- a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp +++ b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp @@ -605,9 +605,8 @@ LogicalResult ValueBoundsConstraintSet::computeIndependentBound( worklist.push_back(v); while (!worklist.empty()) { Value next = worklist.pop_back_val(); - if (visited.contains(next)) + if (!visited.insert(next).second) continue; - visited.insert(next); if (llvm::is_contained(independencies, next)) return false; // TODO: DominanceInfo could be used to stop the traversal early. From 01967e265889a9e242122087c41a97139e84bdc0 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 11 Sep 2024 14:55:14 +0100 Subject: [PATCH 106/114] [AMDGPU] Shrink a live interval instead of recomputing it. NFCI. 
(#108171) --- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index f9d7ead4ff3ec..38ebda6cde1e5 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -1527,13 +1527,18 @@ bool SIWholeQuadMode::lowerCopyInstrs() { for (MachineInstr *MI : LowerToCopyInstrs) { LLVM_DEBUG(dbgs() << "simplify: " << *MI); - Register RecomputeReg = 0; if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 || MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) { assert(MI->getNumExplicitOperands() == 3); + + LiveInterval *RecomputeLI = nullptr; if (MI->getOperand(2).isReg()) - RecomputeReg = MI->getOperand(2).getReg(); + RecomputeLI = &LIS->getInterval(MI->getOperand(2).getReg()); + MI->removeOperand(2); + + if (RecomputeLI) + LIS->shrinkToUses(RecomputeLI); } else { assert(MI->getNumExplicitOperands() == 2); } @@ -1550,11 +1555,6 @@ bool SIWholeQuadMode::lowerCopyInstrs() { MI->setDesc(TII->get(CopyOp)); LLVM_DEBUG(dbgs() << " -> " << *MI); - - if (RecomputeReg) { - LIS->removeInterval(RecomputeReg); - LIS->createAndComputeVirtRegInterval(RecomputeReg); - } } return !LowerToCopyInstrs.empty() || !LowerToMovInstrs.empty(); } From 7a30b9c0f0c9523e29b449d80c695e6d8df0f176 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 11 Sep 2024 14:55:53 +0100 Subject: [PATCH 107/114] [AMDGPU] Make more use of getWaveMaskRegClass. NFC. (#108186) --- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 4 ++-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 +++--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 14 ++++++-------- llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 2 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 6 ++---- 5 files changed, 14 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 44872761760db..434336ef137ff 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -1116,8 +1116,8 @@ void SIFixSGPRCopies::fixSCCCopies(MachineFunction &MF) { Register SrcReg = MI.getOperand(1).getReg(); Register DstReg = MI.getOperand(0).getReg(); if (SrcReg == AMDGPU::SCC) { - Register SCCCopy = MRI->createVirtualRegister( - TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID)); + Register SCCCopy = + MRI->createVirtualRegister(TRI->getWaveMaskRegClass()); I = BuildMI(*MI.getParent(), std::next(MachineBasicBlock::iterator(MI)), MI.getDebugLoc(), TII->get(IsWave32 ? 
AMDGPU::S_CSELECT_B32 diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 736f714ac1a77..bbb1d0c5eba14 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4562,7 +4562,7 @@ loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator I(&MI); - const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const auto *BoolXExecRC = TRI->getWaveMaskRegClass(); Register DstReg = MI.getOperand(0).getReg(); Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); Register TmpExec = MRI.createVirtualRegister(BoolXExecRC); @@ -5064,7 +5064,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( return BB; } - const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const auto *CarryRC = TRI->getWaveMaskRegClass(); Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -5296,7 +5296,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const auto *CondRC = TRI->getWaveMaskRegClass(); Register SrcCondCopy = MRI.createVirtualRegister(CondRC); const TargetRegisterClass *Src0RC = Src0.isReg() diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index c6f28af1e5e73..87b213767b4fc 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1231,8 +1231,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, Register TrueReg, Register FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - const TargetRegisterClass *BoolXExecRC = - RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const TargetRegisterClass *BoolXExecRC = RI.getWaveMaskRegClass(); assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && "Not a VGPR32 reg"); @@ -6417,7 +6416,7 @@ static void emitLoadScalarOpsFromVGPRLoop( ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term; unsigned AndOpc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64; - const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const auto *BoolXExecRC = TRI->getWaveMaskRegClass(); MachineBasicBlock::iterator I = LoopBB.begin(); @@ -6565,7 +6564,7 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, const DebugLoc &DL = MI.getDebugLoc(); unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; - const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + const auto *BoolXExecRC = TRI->getWaveMaskRegClass(); // Save SCC. Waterfall Loop may overwrite SCC. 
Register SaveSCCReg;
@@ -6958,7 +6957,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
   Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
   Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
 
-    const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+    const auto *BoolXExecRC = RI.getWaveMaskRegClass();
     Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
     Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
 
@@ -7336,7 +7335,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
     unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                        ? AMDGPU::V_ADDC_U32_e64
                        : AMDGPU::V_SUBB_U32_e64;
-    const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+    const auto *CarryRC = RI.getWaveMaskRegClass();
 
     Register CarryInReg = Inst.getOperand(4).getReg();
     if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
@@ -7711,8 +7710,7 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
 
   Register NewCondReg = CondReg;
   if (IsSCC) {
-    const TargetRegisterClass *TC =
-        RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+    const TargetRegisterClass *TC = RI.getWaveMaskRegClass();
     NewCondReg = MRI.createVirtualRegister(TC);
 
     // Now look for the closest SCC def if it is a copy
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 1b52a48d068eb..23d04fae42015 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -2014,7 +2014,7 @@ Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
   MachineOperand OffsetHi =
       createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
 
-  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+  const auto *CarryRC = TRI->getWaveMaskRegClass();
   Register CarryReg = MRI->createVirtualRegister(CarryRC);
   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
 
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 4c571a36e4896..2d1cd1bda3afe 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3428,8 +3428,7 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
         std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
   case AMDGPU::VCCRegBankID:
     assert(Size == 1);
-    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
-                    : &AMDGPU::SReg_64_XEXECRegClass;
+    return getWaveMaskRegClass();
   case AMDGPU::SGPRRegBankID:
     return getSGPRClassForBitWidth(std::max(32u, Size));
   case AMDGPU::AGPRRegBankID:
@@ -3472,8 +3471,7 @@ SIRegisterInfo::getRegClass(unsigned RCID) const {
   case AMDGPU::SReg_1RegClassID:
     return getBoolRC();
   case AMDGPU::SReg_1_XEXECRegClassID:
-    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
-                    : &AMDGPU::SReg_64_XEXECRegClass;
+    return getWaveMaskRegClass();
   case -1:
     return nullptr;
   default:

From ccc4fa18423f097a9f04d3198cacf094445ffd71 Mon Sep 17 00:00:00 2001
From: Rahul Joshi
Date: Wed, 11 Sep 2024 06:57:31 -0700
Subject: [PATCH 108/114] [TableGen] Fix MacOS failure in Option Emitter.
 (#108225)

Handle the case where the same pointer is used for both inputs to
`CompareOptionRecords`, to avoid emitting errors for equivalent options.

Follow-up to #107696.
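Background for the fix: std::sort and llvm::sort require a strict weak
ordering, which must be irreflexive, i.e. Cmp(X, X) == false. Some standard
library implementations (notably libc++, the default on macOS) can invoke the
comparator with the same element on both sides, which previously drove
CompareOptionRecords into its "equivalent options" error path. A minimal
standalone sketch of the invariant, using made-up Opt/CompareOpts names rather
than the actual TableGen record types:

  #include <algorithm>
  #include <cassert>
  #include <vector>

  struct Opt {
    int Precedence;
  };

  // A strict-weak-ordering comparator must return false when both
  // arguments denote the same element, so bail out before any
  // "equivalent options" diagnostic could fire.
  static bool CompareOpts(const Opt *A, const Opt *B) {
    if (A == B)
      return false;
    return A->Precedence < B->Precedence;
  }

  int main() {
    Opt X{1}, Y{0};
    std::vector<Opt *> Opts = {&X, &Y};
    assert(!CompareOpts(&X, &X) && "comparator must be irreflexive");
    std::sort(Opts.begin(), Opts.end(), CompareOpts);
    return Opts.front() == &Y ? 0 : 1;
  }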
---
 llvm/utils/TableGen/Common/OptEmitter.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/utils/TableGen/Common/OptEmitter.cpp b/llvm/utils/TableGen/Common/OptEmitter.cpp
index 1c91ec5b3dbc4..75e32c36d4f72 100644
--- a/llvm/utils/TableGen/Common/OptEmitter.cpp
+++ b/llvm/utils/TableGen/Common/OptEmitter.cpp
@@ -41,6 +41,8 @@ static int StrCmpOptionName(const char *A, const char *B) {
 
 // Returns true if A is ordered before B.
 bool CompareOptionRecords(const Record *A, const Record *B) {
+  if (A == B)
+    return false;
   // Sentinel options precede all others and are only ordered by precedence.
   bool ASent = A->getValueAsDef("Kind")->getValueAsBit("Sentinel");
   bool BSent = B->getValueAsDef("Kind")->getValueAsBit("Sentinel");

From 35f7cfb22420a7c94b48e54fa28195ada9863d1a Mon Sep 17 00:00:00 2001
From: Timm Baeder
Date: Wed, 11 Sep 2024 16:14:41 +0200
Subject: [PATCH 109/114] [clang][bytecode] Check for Pointer dereference in
 EvaluationResult (#108207)

We will deref<>() it later, so this is the right check.
---
 clang/lib/AST/ByteCode/EvaluationResult.cpp  |  4 ++--
 clang/test/AST/ByteCode/initializer_list.cpp | 20 ++++++++++++++++++--
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/clang/lib/AST/ByteCode/EvaluationResult.cpp b/clang/lib/AST/ByteCode/EvaluationResult.cpp
index bdebd19af9f94..627d4b2f65be9 100644
--- a/clang/lib/AST/ByteCode/EvaluationResult.cpp
+++ b/clang/lib/AST/ByteCode/EvaluationResult.cpp
@@ -178,8 +178,8 @@ bool EvaluationResult::checkFullyInitialized(InterpState &S,
 static void collectBlocks(const Pointer &Ptr,
                           llvm::SetVector &Blocks) {
   auto isUsefulPtr = [](const Pointer &P) -> bool {
-    return P.isLive() && !P.isZero() && !P.isDummy() &&
-           !P.isUnknownSizeArray() && !P.isOnePastEnd() && P.isBlockPointer();
+    return P.isLive() && !P.isZero() && !P.isDummy() && P.isDereferencable() &&
+           !P.isUnknownSizeArray() && !P.isOnePastEnd();
   };
 
   if (!isUsefulPtr(Ptr))
diff --git a/clang/test/AST/ByteCode/initializer_list.cpp b/clang/test/AST/ByteCode/initializer_list.cpp
index 4e3b8dc912016..f882e4ff1b124 100644
--- a/clang/test/AST/ByteCode/initializer_list.cpp
+++ b/clang/test/AST/ByteCode/initializer_list.cpp
@@ -1,8 +1,6 @@
 // RUN: %clang_cc1 -fexperimental-new-constant-interpreter -fms-extensions -std=c++20 -verify=expected,both %s
 // RUN: %clang_cc1 -std=c++20 -fms-extensions -verify=ref,both %s
 
-// both-no-diagnostics
-
 namespace std {
   typedef decltype(sizeof(int)) size_t;
   template
@@ -53,3 +51,21 @@ constexpr int foo() {
 }
 
 static_assert(foo() == 0);
+
+
+namespace rdar13395022 {
+  struct MoveOnly { // both-note {{candidate}}
+    MoveOnly(MoveOnly&&); // both-note 2{{copy constructor is implicitly deleted because}} both-note {{candidate}}
+  };
+
+  void test(MoveOnly mo) {
+    auto &&list1 = {mo}; // both-error {{call to implicitly-deleted copy constructor}} both-note {{in initialization of temporary of type 'std::initializer_list}}
+    MoveOnly (&&list2)[1] = {mo}; // both-error {{call to implicitly-deleted copy constructor}} both-note {{in initialization of temporary of type 'MoveOnly[1]'}}
+    std::initializer_list<MoveOnly> &&list3 = {};
+    MoveOnly (&&list4)[1] = {}; // both-error {{no matching constructor}}
+    // both-note@-1 {{in implicit initialization of array element 0 with omitted initializer}}
+    // both-note@-2 {{in initialization of temporary of type 'MoveOnly[1]' created to list-initialize this reference}}
+  }
+}
+
+

From 43da8a7a10237e8cb89e6d776bec81d97b5326d1 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 11 Sep 2024 15:18:16 +0100
Subject: [PATCH
110/114] [DAG] Add test coverage for ABD "sub of selects" patterns based off #53045 Add tests for "sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abd(a,b)" patterns that still fail to match to abd nodes This will hopefully be helped by #108218 --- llvm/test/CodeGen/AArch64/abds.ll | 84 +++++++ llvm/test/CodeGen/AArch64/abdu.ll | 85 +++++++ llvm/test/CodeGen/RISCV/abds.ll | 404 ++++++++++++++++++++++++++++++ llvm/test/CodeGen/RISCV/abdu.ll | 393 ++++++++++++++++++++++++++++- llvm/test/CodeGen/X86/abds.ll | 205 +++++++++++++++ llvm/test/CodeGen/X86/abdu.ll | 206 +++++++++++++++ 6 files changed, 1376 insertions(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AArch64/abds.ll b/llvm/test/CodeGen/AArch64/abds.ll index 0e35f8240848b..e5cc04f9be1a1 100644 --- a/llvm/test/CodeGen/AArch64/abds.ll +++ b/llvm/test/CodeGen/AArch64/abds.ll @@ -539,6 +539,90 @@ define i64 @vector_legalized(i16 %a, i16 %b) { ret i64 %z } +; +; sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abds(a,b) +; + +define i8 @abd_select_i8(i8 %a, i8 %b) nounwind { +; CHECK-LABEL: abd_select_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: cmp w8, w1, sxtb +; CHECK-NEXT: csel w8, w0, w1, lt +; CHECK-NEXT: csel w9, w1, w0, lt +; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ret + %cmp = icmp slt i8 %a, %b + %ab = select i1 %cmp, i8 %a, i8 %b + %ba = select i1 %cmp, i8 %b, i8 %a + %sub = sub i8 %ba, %ab + ret i8 %sub +} + +define i16 @abd_select_i16(i16 %a, i16 %b) nounwind { +; CHECK-LABEL: abd_select_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sxth w8, w0 +; CHECK-NEXT: cmp w8, w1, sxth +; CHECK-NEXT: csel w8, w0, w1, le +; CHECK-NEXT: csel w9, w1, w0, le +; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ret + %cmp = icmp sle i16 %a, %b + %ab = select i1 %cmp, i16 %a, i16 %b + %ba = select i1 %cmp, i16 %b, i16 %a + %sub = sub i16 %ba, %ab + ret i16 %sub +} + +define i32 @abd_select_i32(i32 %a, i32 %b) nounwind { +; CHECK-LABEL: abd_select_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: csel w8, w0, w1, gt +; CHECK-NEXT: csel w9, w1, w0, gt +; CHECK-NEXT: sub w0, w8, w9 +; CHECK-NEXT: ret + %cmp = icmp sgt i32 %a, %b + %ab = select i1 %cmp, i32 %a, i32 %b + %ba = select i1 %cmp, i32 %b, i32 %a + %sub = sub i32 %ab, %ba + ret i32 %sub +} + +define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: abd_select_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp x0, x1 +; CHECK-NEXT: csel x8, x0, x1, ge +; CHECK-NEXT: csel x9, x1, x0, ge +; CHECK-NEXT: sub x0, x8, x9 +; CHECK-NEXT: ret + %cmp = icmp sge i64 %a, %b + %ab = select i1 %cmp, i64 %a, i64 %b + %ba = select i1 %cmp, i64 %b, i64 %a + %sub = sub i64 %ab, %ba + ret i64 %sub +} + +define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { +; CHECK-LABEL: abd_select_i128: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp x0, x2 +; CHECK-NEXT: sbcs xzr, x1, x3 +; CHECK-NEXT: csel x8, x0, x2, lt +; CHECK-NEXT: csel x9, x2, x0, lt +; CHECK-NEXT: csel x10, x1, x3, lt +; CHECK-NEXT: csel x11, x3, x1, lt +; CHECK-NEXT: subs x0, x9, x8 +; CHECK-NEXT: sbc x1, x11, x10 +; CHECK-NEXT: ret + %cmp = icmp slt i128 %a, %b + %ab = select i1 %cmp, i128 %a, i128 %b + %ba = select i1 %cmp, i128 %b, i128 %a + %sub = sub i128 %ba, %ab + ret i128 %sub +} declare i8 @llvm.abs.i8(i8, i1) declare i16 @llvm.abs.i16(i16, i1) diff --git a/llvm/test/CodeGen/AArch64/abdu.ll b/llvm/test/CodeGen/AArch64/abdu.ll index eb866e6a78a9b..0a44ae1688458 100644 --- a/llvm/test/CodeGen/AArch64/abdu.ll +++ b/llvm/test/CodeGen/AArch64/abdu.ll @@ -400,6 +400,91 @@ define i64 @vector_legalized(i16 %a, 
i16 %b) { ret i64 %z } +; +; sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abdu(a,b) +; + +define i8 @abd_select_i8(i8 %a, i8 %b) nounwind { +; CHECK-LABEL: abd_select_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: cmp w8, w1, uxtb +; CHECK-NEXT: csel w8, w0, w1, lo +; CHECK-NEXT: csel w9, w1, w0, lo +; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ret + %cmp = icmp ult i8 %a, %b + %ab = select i1 %cmp, i8 %a, i8 %b + %ba = select i1 %cmp, i8 %b, i8 %a + %sub = sub i8 %ba, %ab + ret i8 %sub +} + +define i16 @abd_select_i16(i16 %a, i16 %b) nounwind { +; CHECK-LABEL: abd_select_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: and w8, w0, #0xffff +; CHECK-NEXT: cmp w8, w1, uxth +; CHECK-NEXT: csel w8, w0, w1, ls +; CHECK-NEXT: csel w9, w1, w0, ls +; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: ret + %cmp = icmp ule i16 %a, %b + %ab = select i1 %cmp, i16 %a, i16 %b + %ba = select i1 %cmp, i16 %b, i16 %a + %sub = sub i16 %ba, %ab + ret i16 %sub +} + +define i32 @abd_select_i32(i32 %a, i32 %b) nounwind { +; CHECK-LABEL: abd_select_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp w0, w1 +; CHECK-NEXT: csel w8, w0, w1, hi +; CHECK-NEXT: csel w9, w1, w0, hi +; CHECK-NEXT: sub w0, w8, w9 +; CHECK-NEXT: ret + %cmp = icmp ugt i32 %a, %b + %ab = select i1 %cmp, i32 %a, i32 %b + %ba = select i1 %cmp, i32 %b, i32 %a + %sub = sub i32 %ab, %ba + ret i32 %sub +} + +define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { +; CHECK-LABEL: abd_select_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp x0, x1 +; CHECK-NEXT: csel x8, x0, x1, hs +; CHECK-NEXT: csel x9, x1, x0, hs +; CHECK-NEXT: sub x0, x8, x9 +; CHECK-NEXT: ret + %cmp = icmp uge i64 %a, %b + %ab = select i1 %cmp, i64 %a, i64 %b + %ba = select i1 %cmp, i64 %b, i64 %a + %sub = sub i64 %ab, %ba + ret i64 %sub +} + +define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { +; CHECK-LABEL: abd_select_i128: +; CHECK: // %bb.0: +; CHECK-NEXT: cmp x0, x2 +; CHECK-NEXT: sbcs xzr, x1, x3 +; CHECK-NEXT: csel x8, x0, x2, lo +; CHECK-NEXT: csel x9, x2, x0, lo +; CHECK-NEXT: csel x10, x1, x3, lo +; CHECK-NEXT: csel x11, x3, x1, lo +; CHECK-NEXT: subs x0, x9, x8 +; CHECK-NEXT: sbc x1, x11, x10 +; CHECK-NEXT: ret + %cmp = icmp ult i128 %a, %b + %ab = select i1 %cmp, i128 %a, i128 %b + %ba = select i1 %cmp, i128 %b, i128 %a + %sub = sub i128 %ba, %ab + ret i128 %sub +} + declare i8 @llvm.abs.i8(i8, i1) declare i16 @llvm.abs.i16(i16, i1) declare i32 @llvm.abs.i32(i32, i1) diff --git a/llvm/test/CodeGen/RISCV/abds.ll b/llvm/test/CodeGen/RISCV/abds.ll index 86b36d8f69e95..919214b0e9a8d 100644 --- a/llvm/test/CodeGen/RISCV/abds.ll +++ b/llvm/test/CodeGen/RISCV/abds.ll @@ -2341,6 +2341,410 @@ define i32 @abd_sub_i32(i32 %a, i32 %b) nounwind { ret i32 %abs } +; +; sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abds(a,b) +; + +define i8 @abd_select_i8(i8 %a, i8 %b) nounwind { +; RV32I-LABEL: abd_select_i8: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a1, 24 +; RV32I-NEXT: srai a2, a2, 24 +; RV32I-NEXT: slli a3, a0, 24 +; RV32I-NEXT: srai a3, a3, 24 +; RV32I-NEXT: blt a3, a2, .LBB34_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB34_2: +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i8: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a2, a1, 56 +; RV64I-NEXT: srai a2, a2, 56 +; RV64I-NEXT: slli a3, a0, 56 +; RV64I-NEXT: srai a3, a3, 56 +; RV64I-NEXT: blt a3, a2, .LBB34_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB34_2: +; RV64I-NEXT: sub a0, a1, a0 +; 
RV64I-NEXT: ret +; +; ZBB-LABEL: abd_select_i8: +; ZBB: # %bb.0: +; ZBB-NEXT: sext.b a1, a1 +; ZBB-NEXT: sext.b a0, a0 +; ZBB-NEXT: min a2, a0, a1 +; ZBB-NEXT: max a0, a0, a1 +; ZBB-NEXT: sub a0, a0, a2 +; ZBB-NEXT: ret + %cmp = icmp slt i8 %a, %b + %ab = select i1 %cmp, i8 %a, i8 %b + %ba = select i1 %cmp, i8 %b, i8 %a + %sub = sub i8 %ba, %ab + ret i8 %sub +} + +define i16 @abd_select_i16(i16 %a, i16 %b) nounwind { +; RV32I-LABEL: abd_select_i16: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a2, a0, 16 +; RV32I-NEXT: srai a2, a2, 16 +; RV32I-NEXT: slli a3, a1, 16 +; RV32I-NEXT: srai a3, a3, 16 +; RV32I-NEXT: bge a3, a2, .LBB35_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB35_2: +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i16: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a2, a0, 48 +; RV64I-NEXT: srai a2, a2, 48 +; RV64I-NEXT: slli a3, a1, 48 +; RV64I-NEXT: srai a3, a3, 48 +; RV64I-NEXT: bge a3, a2, .LBB35_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB35_2: +; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: ret +; +; ZBB-LABEL: abd_select_i16: +; ZBB: # %bb.0: +; ZBB-NEXT: sext.h a1, a1 +; ZBB-NEXT: sext.h a0, a0 +; ZBB-NEXT: min a2, a0, a1 +; ZBB-NEXT: max a0, a0, a1 +; ZBB-NEXT: sub a0, a0, a2 +; ZBB-NEXT: ret + %cmp = icmp sle i16 %a, %b + %ab = select i1 %cmp, i16 %a, i16 %b + %ba = select i1 %cmp, i16 %b, i16 %a + %sub = sub i16 %ba, %ab + ret i16 %sub +} + +define i32 @abd_select_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: abd_select_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: blt a1, a0, .LBB36_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB36_2: +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a2, a0 +; RV64I-NEXT: sext.w a3, a1 +; RV64I-NEXT: blt a3, a2, .LBB36_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: subw a0, a1, a0 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB36_2: +; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: abd_select_i32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: min a2, a0, a1 +; RV32ZBB-NEXT: max a0, a0, a1 +; RV32ZBB-NEXT: sub a0, a0, a2 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: abd_select_i32: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sext.w a1, a1 +; RV64ZBB-NEXT: sext.w a0, a0 +; RV64ZBB-NEXT: min a2, a0, a1 +; RV64ZBB-NEXT: max a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: ret + %cmp = icmp sgt i32 %a, %b + %ab = select i1 %cmp, i32 %a, i32 %b + %ba = select i1 %cmp, i32 %b, i32 %a + %sub = sub i32 %ab, %ba + ret i32 %sub +} + +define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: abd_select_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: beq a1, a3, .LBB37_3 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: slt a4, a1, a3 +; RV32I-NEXT: bnez a4, .LBB37_4 +; RV32I-NEXT: .LBB37_2: +; RV32I-NEXT: mv a4, a1 +; RV32I-NEXT: mv a5, a0 +; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: j .LBB37_5 +; RV32I-NEXT: .LBB37_3: +; RV32I-NEXT: sltu a4, a0, a2 +; RV32I-NEXT: beqz a4, .LBB37_2 +; RV32I-NEXT: .LBB37_4: +; RV32I-NEXT: mv a4, a3 +; RV32I-NEXT: mv a5, a2 +; RV32I-NEXT: .LBB37_5: +; RV32I-NEXT: sltu a2, a5, a0 +; RV32I-NEXT: sub a1, a4, a1 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sub a0, a5, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: bge a0, a1, .LBB37_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB37_2: +; 
RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: abd_select_i64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: sltu a4, a2, a0 +; RV32ZBB-NEXT: mv a5, a4 +; RV32ZBB-NEXT: beq a1, a3, .LBB37_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: slt a5, a3, a1 +; RV32ZBB-NEXT: .LBB37_2: +; RV32ZBB-NEXT: bnez a5, .LBB37_4 +; RV32ZBB-NEXT: # %bb.3: +; RV32ZBB-NEXT: sub a1, a3, a1 +; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: sub a0, a2, a0 +; RV32ZBB-NEXT: ret +; RV32ZBB-NEXT: .LBB37_4: +; RV32ZBB-NEXT: sltu a4, a0, a2 +; RV32ZBB-NEXT: sub a1, a1, a3 +; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: sub a0, a0, a2 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: abd_select_i64: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: min a2, a0, a1 +; RV64ZBB-NEXT: max a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: ret + %cmp = icmp sge i64 %a, %b + %ab = select i1 %cmp, i64 %a, i64 %b + %ba = select i1 %cmp, i64 %b, i64 %a + %sub = sub i64 %ab, %ba + ret i64 %sub +} + +define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { +; RV32I-LABEL: abd_select_i128: +; RV32I: # %bb.0: +; RV32I-NEXT: lw a7, 4(a2) +; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw t0, 12(a2) +; RV32I-NEXT: lw a5, 12(a1) +; RV32I-NEXT: lw a4, 8(a1) +; RV32I-NEXT: beq a5, t0, .LBB38_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: slt t1, a5, t0 +; RV32I-NEXT: j .LBB38_3 +; RV32I-NEXT: .LBB38_2: +; RV32I-NEXT: sltu t1, a4, a6 +; RV32I-NEXT: .LBB38_3: +; RV32I-NEXT: lw t3, 0(a2) +; RV32I-NEXT: lw a1, 0(a1) +; RV32I-NEXT: beq a3, a7, .LBB38_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: sltu a2, a3, a7 +; RV32I-NEXT: j .LBB38_6 +; RV32I-NEXT: .LBB38_5: +; RV32I-NEXT: sltu a2, a1, t3 +; RV32I-NEXT: .LBB38_6: +; RV32I-NEXT: xor t2, a5, t0 +; RV32I-NEXT: xor t4, a4, a6 +; RV32I-NEXT: or t2, t4, t2 +; RV32I-NEXT: beqz t2, .LBB38_8 +; RV32I-NEXT: # %bb.7: +; RV32I-NEXT: mv a2, t1 +; RV32I-NEXT: .LBB38_8: +; RV32I-NEXT: bnez a2, .LBB38_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv a2, t3 +; RV32I-NEXT: mv t1, a7 +; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: mv t2, a6 +; RV32I-NEXT: j .LBB38_11 +; RV32I-NEXT: .LBB38_10: +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: mv t1, a3 +; RV32I-NEXT: mv t4, a5 +; RV32I-NEXT: mv t2, a4 +; RV32I-NEXT: mv a1, t3 +; RV32I-NEXT: mv a3, a7 +; RV32I-NEXT: mv a5, t0 +; RV32I-NEXT: mv a4, a6 +; RV32I-NEXT: .LBB38_11: +; RV32I-NEXT: sltu a6, a4, t2 +; RV32I-NEXT: sub a7, a5, t4 +; RV32I-NEXT: sltu a5, a1, a2 +; RV32I-NEXT: sub a6, a7, a6 +; RV32I-NEXT: mv a7, a5 +; RV32I-NEXT: beq a3, t1, .LBB38_13 +; RV32I-NEXT: # %bb.12: +; RV32I-NEXT: sltu a7, a3, t1 +; RV32I-NEXT: .LBB38_13: +; RV32I-NEXT: sub a4, a4, t2 +; RV32I-NEXT: sltu t0, a4, a7 +; RV32I-NEXT: sub a6, a6, t0 +; RV32I-NEXT: sub a4, a4, a7 +; RV32I-NEXT: sub a3, a3, t1 +; RV32I-NEXT: sub a3, a3, a5 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a3, 4(a0) +; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sw a6, 12(a0) +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i128: +; RV64I: # %bb.0: +; RV64I-NEXT: beq a1, a3, .LBB38_3 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: slt a4, a1, a3 +; RV64I-NEXT: beqz a4, .LBB38_4 +; RV64I-NEXT: .LBB38_2: +; RV64I-NEXT: mv a4, a1 +; RV64I-NEXT: mv a5, a0 +; RV64I-NEXT: mv a1, a3 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: j .LBB38_5 +; RV64I-NEXT: .LBB38_3: +; RV64I-NEXT: sltu a4, a0, a2 +; RV64I-NEXT: bnez a4, .LBB38_2 +; RV64I-NEXT: .LBB38_4: +; RV64I-NEXT: mv a4, a3 +; RV64I-NEXT: mv a5, a2 +; RV64I-NEXT: .LBB38_5: +; RV64I-NEXT: sltu a2, a0, a5 +; RV64I-NEXT: sub a1, a1, a4 +; RV64I-NEXT: sub a1, a1, a2 
+; RV64I-NEXT: sub a0, a0, a5 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: abd_select_i128: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: lw a3, 0(a1) +; RV32ZBB-NEXT: lw a5, 0(a2) +; RV32ZBB-NEXT: lw a4, 4(a1) +; RV32ZBB-NEXT: lw a6, 8(a1) +; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw t0, 12(a1) +; RV32ZBB-NEXT: lw t1, 12(a2) +; RV32ZBB-NEXT: lw a1, 4(a2) +; RV32ZBB-NEXT: sltu a2, a7, a6 +; RV32ZBB-NEXT: mv t4, a2 +; RV32ZBB-NEXT: beq t0, t1, .LBB38_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: slt t4, t1, t0 +; RV32ZBB-NEXT: .LBB38_2: +; RV32ZBB-NEXT: sltu t2, a5, a3 +; RV32ZBB-NEXT: sltu t5, a1, a4 +; RV32ZBB-NEXT: mv t3, t2 +; RV32ZBB-NEXT: beq a4, a1, .LBB38_4 +; RV32ZBB-NEXT: # %bb.3: +; RV32ZBB-NEXT: mv t3, t5 +; RV32ZBB-NEXT: .LBB38_4: +; RV32ZBB-NEXT: addi sp, sp, -16 +; RV32ZBB-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZBB-NEXT: xor t6, t0, t1 +; RV32ZBB-NEXT: xor s0, a6, a7 +; RV32ZBB-NEXT: or t6, s0, t6 +; RV32ZBB-NEXT: beqz t6, .LBB38_6 +; RV32ZBB-NEXT: # %bb.5: +; RV32ZBB-NEXT: mv t3, t4 +; RV32ZBB-NEXT: .LBB38_6: +; RV32ZBB-NEXT: mv t4, t2 +; RV32ZBB-NEXT: beq a1, a4, .LBB38_8 +; RV32ZBB-NEXT: # %bb.7: +; RV32ZBB-NEXT: mv t4, t5 +; RV32ZBB-NEXT: .LBB38_8: +; RV32ZBB-NEXT: sltu t5, a3, a5 +; RV32ZBB-NEXT: mv t6, t5 +; RV32ZBB-NEXT: beq a4, a1, .LBB38_10 +; RV32ZBB-NEXT: # %bb.9: +; RV32ZBB-NEXT: sltu t6, a4, a1 +; RV32ZBB-NEXT: .LBB38_10: +; RV32ZBB-NEXT: bnez t3, .LBB38_12 +; RV32ZBB-NEXT: # %bb.11: +; RV32ZBB-NEXT: sub t0, t1, t0 +; RV32ZBB-NEXT: sub a6, a7, a6 +; RV32ZBB-NEXT: sub a2, t0, a2 +; RV32ZBB-NEXT: sltu a7, a6, t4 +; RV32ZBB-NEXT: sub a2, a2, a7 +; RV32ZBB-NEXT: sub a3, a5, a3 +; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: sub a1, a1, t2 +; RV32ZBB-NEXT: sub a4, a6, t4 +; RV32ZBB-NEXT: j .LBB38_13 +; RV32ZBB-NEXT: .LBB38_12: +; RV32ZBB-NEXT: sltu a2, a6, a7 +; RV32ZBB-NEXT: sub t0, t0, t1 +; RV32ZBB-NEXT: sub a2, t0, a2 +; RV32ZBB-NEXT: sub a6, a6, a7 +; RV32ZBB-NEXT: sltu a7, a6, t6 +; RV32ZBB-NEXT: sub a2, a2, a7 +; RV32ZBB-NEXT: sub a3, a3, a5 +; RV32ZBB-NEXT: sub a4, a4, a1 +; RV32ZBB-NEXT: sub a1, a4, t5 +; RV32ZBB-NEXT: sub a4, a6, t6 +; RV32ZBB-NEXT: .LBB38_13: +; RV32ZBB-NEXT: sw a4, 8(a0) +; RV32ZBB-NEXT: sw a1, 4(a0) +; RV32ZBB-NEXT: sw a3, 0(a0) +; RV32ZBB-NEXT: sw a2, 12(a0) +; RV32ZBB-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZBB-NEXT: addi sp, sp, 16 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: abd_select_i128: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sltu a4, a2, a0 +; RV64ZBB-NEXT: mv a5, a4 +; RV64ZBB-NEXT: beq a1, a3, .LBB38_2 +; RV64ZBB-NEXT: # %bb.1: +; RV64ZBB-NEXT: slt a5, a3, a1 +; RV64ZBB-NEXT: .LBB38_2: +; RV64ZBB-NEXT: bnez a5, .LBB38_4 +; RV64ZBB-NEXT: # %bb.3: +; RV64ZBB-NEXT: sub a1, a3, a1 +; RV64ZBB-NEXT: sub a1, a1, a4 +; RV64ZBB-NEXT: sub a0, a2, a0 +; RV64ZBB-NEXT: ret +; RV64ZBB-NEXT: .LBB38_4: +; RV64ZBB-NEXT: sltu a4, a0, a2 +; RV64ZBB-NEXT: sub a1, a1, a3 +; RV64ZBB-NEXT: sub a1, a1, a4 +; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: ret + %cmp = icmp slt i128 %a, %b + %ab = select i1 %cmp, i128 %a, i128 %b + %ba = select i1 %cmp, i128 %b, i128 %a + %sub = sub i128 %ba, %ab + ret i128 %sub +} declare i8 @llvm.abs.i8(i8, i1) declare i16 @llvm.abs.i16(i16, i1) diff --git a/llvm/test/CodeGen/RISCV/abdu.ll b/llvm/test/CodeGen/RISCV/abdu.ll index 14f45895754df..a9f933243f679 100644 --- a/llvm/test/CodeGen/RISCV/abdu.ll +++ b/llvm/test/CodeGen/RISCV/abdu.ll @@ -1720,6 +1720,398 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ret i128 %sel } +; +; sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abdu(a,b) +; + +define 
i8 @abd_select_i8(i8 %a, i8 %b) nounwind { +; NOZBB-LABEL: abd_select_i8: +; NOZBB: # %bb.0: +; NOZBB-NEXT: andi a2, a1, 255 +; NOZBB-NEXT: andi a3, a0, 255 +; NOZBB-NEXT: bltu a3, a2, .LBB23_2 +; NOZBB-NEXT: # %bb.1: +; NOZBB-NEXT: sub a0, a0, a1 +; NOZBB-NEXT: ret +; NOZBB-NEXT: .LBB23_2: +; NOZBB-NEXT: sub a0, a1, a0 +; NOZBB-NEXT: ret +; +; ZBB-LABEL: abd_select_i8: +; ZBB: # %bb.0: +; ZBB-NEXT: andi a1, a1, 255 +; ZBB-NEXT: andi a0, a0, 255 +; ZBB-NEXT: minu a2, a0, a1 +; ZBB-NEXT: maxu a0, a0, a1 +; ZBB-NEXT: sub a0, a0, a2 +; ZBB-NEXT: ret + %cmp = icmp ult i8 %a, %b + %ab = select i1 %cmp, i8 %a, i8 %b + %ba = select i1 %cmp, i8 %b, i8 %a + %sub = sub i8 %ba, %ab + ret i8 %sub +} + +define i16 @abd_select_i16(i16 %a, i16 %b) nounwind { +; RV32I-LABEL: abd_select_i16: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a2, 16 +; RV32I-NEXT: addi a2, a2, -1 +; RV32I-NEXT: and a3, a0, a2 +; RV32I-NEXT: and a2, a1, a2 +; RV32I-NEXT: bgeu a2, a3, .LBB24_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB24_2: +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i16: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a2, 16 +; RV64I-NEXT: addiw a2, a2, -1 +; RV64I-NEXT: and a3, a0, a2 +; RV64I-NEXT: and a2, a1, a2 +; RV64I-NEXT: bgeu a2, a3, .LBB24_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB24_2: +; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: ret +; +; ZBB-LABEL: abd_select_i16: +; ZBB: # %bb.0: +; ZBB-NEXT: zext.h a1, a1 +; ZBB-NEXT: zext.h a0, a0 +; ZBB-NEXT: minu a2, a0, a1 +; ZBB-NEXT: maxu a0, a0, a1 +; ZBB-NEXT: sub a0, a0, a2 +; ZBB-NEXT: ret + %cmp = icmp ule i16 %a, %b + %ab = select i1 %cmp, i16 %a, i16 %b + %ba = select i1 %cmp, i16 %b, i16 %a + %sub = sub i16 %ba, %ab + ret i16 %sub +} + +define i32 @abd_select_i32(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: abd_select_i32: +; RV32I: # %bb.0: +; RV32I-NEXT: bltu a1, a0, .LBB25_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; RV32I-NEXT: .LBB25_2: +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i32: +; RV64I: # %bb.0: +; RV64I-NEXT: sext.w a2, a0 +; RV64I-NEXT: sext.w a3, a1 +; RV64I-NEXT: bltu a3, a2, .LBB25_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: subw a0, a1, a0 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB25_2: +; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: abd_select_i32: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: minu a2, a0, a1 +; RV32ZBB-NEXT: maxu a0, a0, a1 +; RV32ZBB-NEXT: sub a0, a0, a2 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: abd_select_i32: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: slli a1, a1, 32 +; RV64ZBB-NEXT: srli a1, a1, 32 +; RV64ZBB-NEXT: slli a0, a0, 32 +; RV64ZBB-NEXT: srli a0, a0, 32 +; RV64ZBB-NEXT: minu a2, a0, a1 +; RV64ZBB-NEXT: maxu a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: ret + %cmp = icmp ugt i32 %a, %b + %ab = select i1 %cmp, i32 %a, i32 %b + %ba = select i1 %cmp, i32 %b, i32 %a + %sub = sub i32 %ab, %ba + ret i32 %sub +} + +define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { +; RV32I-LABEL: abd_select_i64: +; RV32I: # %bb.0: +; RV32I-NEXT: beq a1, a3, .LBB26_3 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu a4, a1, a3 +; RV32I-NEXT: bnez a4, .LBB26_4 +; RV32I-NEXT: .LBB26_2: +; RV32I-NEXT: mv a4, a1 +; RV32I-NEXT: mv a5, a0 +; RV32I-NEXT: mv a1, a3 +; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: j .LBB26_5 +; RV32I-NEXT: .LBB26_3: +; RV32I-NEXT: sltu a4, a0, a2 +; RV32I-NEXT: beqz a4, .LBB26_2 +; RV32I-NEXT: .LBB26_4: +; 
RV32I-NEXT: mv a4, a3 +; RV32I-NEXT: mv a5, a2 +; RV32I-NEXT: .LBB26_5: +; RV32I-NEXT: sltu a2, a5, a0 +; RV32I-NEXT: sub a1, a4, a1 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sub a0, a5, a0 +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i64: +; RV64I: # %bb.0: +; RV64I-NEXT: bgeu a0, a1, .LBB26_2 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sub a0, a1, a0 +; RV64I-NEXT: ret +; RV64I-NEXT: .LBB26_2: +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: abd_select_i64: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: sltu a4, a0, a2 +; RV32ZBB-NEXT: sub a3, a1, a3 +; RV32ZBB-NEXT: sub a3, a3, a4 +; RV32ZBB-NEXT: sub a2, a0, a2 +; RV32ZBB-NEXT: beq a3, a1, .LBB26_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: sltu a0, a1, a3 +; RV32ZBB-NEXT: j .LBB26_3 +; RV32ZBB-NEXT: .LBB26_2: +; RV32ZBB-NEXT: sltu a0, a0, a2 +; RV32ZBB-NEXT: .LBB26_3: +; RV32ZBB-NEXT: neg a1, a0 +; RV32ZBB-NEXT: xor a2, a2, a1 +; RV32ZBB-NEXT: sltu a4, a2, a1 +; RV32ZBB-NEXT: xor a1, a3, a1 +; RV32ZBB-NEXT: add a1, a1, a0 +; RV32ZBB-NEXT: sub a1, a1, a4 +; RV32ZBB-NEXT: add a0, a2, a0 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: abd_select_i64: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: minu a2, a0, a1 +; RV64ZBB-NEXT: maxu a0, a0, a1 +; RV64ZBB-NEXT: sub a0, a0, a2 +; RV64ZBB-NEXT: ret + %cmp = icmp uge i64 %a, %b + %ab = select i1 %cmp, i64 %a, i64 %b + %ba = select i1 %cmp, i64 %b, i64 %a + %sub = sub i64 %ab, %ba + ret i64 %sub +} + +define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { +; RV32I-LABEL: abd_select_i128: +; RV32I: # %bb.0: +; RV32I-NEXT: lw a7, 4(a2) +; RV32I-NEXT: lw a3, 4(a1) +; RV32I-NEXT: lw a6, 8(a2) +; RV32I-NEXT: lw t0, 12(a2) +; RV32I-NEXT: lw a5, 12(a1) +; RV32I-NEXT: lw a4, 8(a1) +; RV32I-NEXT: beq a5, t0, .LBB27_2 +; RV32I-NEXT: # %bb.1: +; RV32I-NEXT: sltu t1, a5, t0 +; RV32I-NEXT: j .LBB27_3 +; RV32I-NEXT: .LBB27_2: +; RV32I-NEXT: sltu t1, a4, a6 +; RV32I-NEXT: .LBB27_3: +; RV32I-NEXT: lw t3, 0(a2) +; RV32I-NEXT: lw a1, 0(a1) +; RV32I-NEXT: beq a3, a7, .LBB27_5 +; RV32I-NEXT: # %bb.4: +; RV32I-NEXT: sltu a2, a3, a7 +; RV32I-NEXT: j .LBB27_6 +; RV32I-NEXT: .LBB27_5: +; RV32I-NEXT: sltu a2, a1, t3 +; RV32I-NEXT: .LBB27_6: +; RV32I-NEXT: xor t2, a5, t0 +; RV32I-NEXT: xor t4, a4, a6 +; RV32I-NEXT: or t2, t4, t2 +; RV32I-NEXT: beqz t2, .LBB27_8 +; RV32I-NEXT: # %bb.7: +; RV32I-NEXT: mv a2, t1 +; RV32I-NEXT: .LBB27_8: +; RV32I-NEXT: bnez a2, .LBB27_10 +; RV32I-NEXT: # %bb.9: +; RV32I-NEXT: mv a2, t3 +; RV32I-NEXT: mv t1, a7 +; RV32I-NEXT: mv t4, t0 +; RV32I-NEXT: mv t2, a6 +; RV32I-NEXT: j .LBB27_11 +; RV32I-NEXT: .LBB27_10: +; RV32I-NEXT: mv a2, a1 +; RV32I-NEXT: mv t1, a3 +; RV32I-NEXT: mv t4, a5 +; RV32I-NEXT: mv t2, a4 +; RV32I-NEXT: mv a1, t3 +; RV32I-NEXT: mv a3, a7 +; RV32I-NEXT: mv a5, t0 +; RV32I-NEXT: mv a4, a6 +; RV32I-NEXT: .LBB27_11: +; RV32I-NEXT: sltu a6, a4, t2 +; RV32I-NEXT: sub a7, a5, t4 +; RV32I-NEXT: sltu a5, a1, a2 +; RV32I-NEXT: sub a6, a7, a6 +; RV32I-NEXT: mv a7, a5 +; RV32I-NEXT: beq a3, t1, .LBB27_13 +; RV32I-NEXT: # %bb.12: +; RV32I-NEXT: sltu a7, a3, t1 +; RV32I-NEXT: .LBB27_13: +; RV32I-NEXT: sub a4, a4, t2 +; RV32I-NEXT: sltu t0, a4, a7 +; RV32I-NEXT: sub a6, a6, t0 +; RV32I-NEXT: sub a4, a4, a7 +; RV32I-NEXT: sub a3, a3, t1 +; RV32I-NEXT: sub a3, a3, a5 +; RV32I-NEXT: sub a1, a1, a2 +; RV32I-NEXT: sw a1, 0(a0) +; RV32I-NEXT: sw a3, 4(a0) +; RV32I-NEXT: sw a4, 8(a0) +; RV32I-NEXT: sw a6, 12(a0) +; RV32I-NEXT: ret +; +; RV64I-LABEL: abd_select_i128: +; RV64I: # %bb.0: +; RV64I-NEXT: beq a1, a3, .LBB27_3 +; RV64I-NEXT: # %bb.1: +; RV64I-NEXT: sltu a4, a1, a3 +; 
RV64I-NEXT: beqz a4, .LBB27_4 +; RV64I-NEXT: .LBB27_2: +; RV64I-NEXT: mv a4, a1 +; RV64I-NEXT: mv a5, a0 +; RV64I-NEXT: mv a1, a3 +; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: j .LBB27_5 +; RV64I-NEXT: .LBB27_3: +; RV64I-NEXT: sltu a4, a0, a2 +; RV64I-NEXT: bnez a4, .LBB27_2 +; RV64I-NEXT: .LBB27_4: +; RV64I-NEXT: mv a4, a3 +; RV64I-NEXT: mv a5, a2 +; RV64I-NEXT: .LBB27_5: +; RV64I-NEXT: sltu a2, a0, a5 +; RV64I-NEXT: sub a1, a1, a4 +; RV64I-NEXT: sub a1, a1, a2 +; RV64I-NEXT: sub a0, a0, a5 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: abd_select_i128: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: lw a5, 0(a2) +; RV32ZBB-NEXT: lw a3, 0(a1) +; RV32ZBB-NEXT: lw t1, 12(a2) +; RV32ZBB-NEXT: lw a7, 8(a2) +; RV32ZBB-NEXT: lw a4, 8(a1) +; RV32ZBB-NEXT: lw a6, 12(a1) +; RV32ZBB-NEXT: lw t0, 4(a2) +; RV32ZBB-NEXT: lw a1, 4(a1) +; RV32ZBB-NEXT: sltu a2, a4, a7 +; RV32ZBB-NEXT: sub t1, a6, t1 +; RV32ZBB-NEXT: sltu t2, a3, a5 +; RV32ZBB-NEXT: sub a2, t1, a2 +; RV32ZBB-NEXT: mv t1, t2 +; RV32ZBB-NEXT: beq a1, t0, .LBB27_2 +; RV32ZBB-NEXT: # %bb.1: +; RV32ZBB-NEXT: sltu t1, a1, t0 +; RV32ZBB-NEXT: .LBB27_2: +; RV32ZBB-NEXT: sub a7, a4, a7 +; RV32ZBB-NEXT: sltu t3, a7, t1 +; RV32ZBB-NEXT: sub a2, a2, t3 +; RV32ZBB-NEXT: sub a7, a7, t1 +; RV32ZBB-NEXT: beq a2, a6, .LBB27_4 +; RV32ZBB-NEXT: # %bb.3: +; RV32ZBB-NEXT: sltu t1, a6, a2 +; RV32ZBB-NEXT: j .LBB27_5 +; RV32ZBB-NEXT: .LBB27_4: +; RV32ZBB-NEXT: sltu t1, a4, a7 +; RV32ZBB-NEXT: .LBB27_5: +; RV32ZBB-NEXT: sub t0, a1, t0 +; RV32ZBB-NEXT: sub t0, t0, t2 +; RV32ZBB-NEXT: sub a5, a3, a5 +; RV32ZBB-NEXT: beq t0, a1, .LBB27_7 +; RV32ZBB-NEXT: # %bb.6: +; RV32ZBB-NEXT: sltu a1, a1, t0 +; RV32ZBB-NEXT: j .LBB27_8 +; RV32ZBB-NEXT: .LBB27_7: +; RV32ZBB-NEXT: sltu a1, a3, a5 +; RV32ZBB-NEXT: .LBB27_8: +; RV32ZBB-NEXT: xor a3, a2, a6 +; RV32ZBB-NEXT: xor a4, a7, a4 +; RV32ZBB-NEXT: or a3, a4, a3 +; RV32ZBB-NEXT: beqz a3, .LBB27_10 +; RV32ZBB-NEXT: # %bb.9: +; RV32ZBB-NEXT: mv a1, t1 +; RV32ZBB-NEXT: .LBB27_10: +; RV32ZBB-NEXT: neg a6, a1 +; RV32ZBB-NEXT: xor a3, a7, a6 +; RV32ZBB-NEXT: sltu a4, a3, a6 +; RV32ZBB-NEXT: xor a2, a2, a6 +; RV32ZBB-NEXT: add a2, a2, a1 +; RV32ZBB-NEXT: sub a4, a2, a4 +; RV32ZBB-NEXT: xor a2, a5, a6 +; RV32ZBB-NEXT: sltu a5, a2, a6 +; RV32ZBB-NEXT: xor a7, t0, a6 +; RV32ZBB-NEXT: mv t1, a5 +; RV32ZBB-NEXT: beqz t0, .LBB27_12 +; RV32ZBB-NEXT: # %bb.11: +; RV32ZBB-NEXT: sltu t1, a7, a6 +; RV32ZBB-NEXT: .LBB27_12: +; RV32ZBB-NEXT: add a3, a3, a1 +; RV32ZBB-NEXT: sltu a6, a3, t1 +; RV32ZBB-NEXT: sub a4, a4, a6 +; RV32ZBB-NEXT: sub a3, a3, t1 +; RV32ZBB-NEXT: add a7, a7, a1 +; RV32ZBB-NEXT: sub a5, a7, a5 +; RV32ZBB-NEXT: add a1, a2, a1 +; RV32ZBB-NEXT: sw a1, 0(a0) +; RV32ZBB-NEXT: sw a5, 4(a0) +; RV32ZBB-NEXT: sw a3, 8(a0) +; RV32ZBB-NEXT: sw a4, 12(a0) +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: abd_select_i128: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sltu a4, a0, a2 +; RV64ZBB-NEXT: sub a3, a1, a3 +; RV64ZBB-NEXT: sub a3, a3, a4 +; RV64ZBB-NEXT: sub a2, a0, a2 +; RV64ZBB-NEXT: beq a3, a1, .LBB27_2 +; RV64ZBB-NEXT: # %bb.1: +; RV64ZBB-NEXT: sltu a0, a1, a3 +; RV64ZBB-NEXT: j .LBB27_3 +; RV64ZBB-NEXT: .LBB27_2: +; RV64ZBB-NEXT: sltu a0, a0, a2 +; RV64ZBB-NEXT: .LBB27_3: +; RV64ZBB-NEXT: neg a1, a0 +; RV64ZBB-NEXT: xor a2, a2, a1 +; RV64ZBB-NEXT: sltu a4, a2, a1 +; RV64ZBB-NEXT: xor a1, a3, a1 +; RV64ZBB-NEXT: add a1, a1, a0 +; RV64ZBB-NEXT: sub a1, a1, a4 +; RV64ZBB-NEXT: add a0, a2, a0 +; RV64ZBB-NEXT: ret + %cmp = icmp ult i128 %a, %b + %ab = select i1 %cmp, i128 %a, i128 %b + %ba = select i1 %cmp, i128 %b, i128 %a + %sub = sub i128 %ba, %ab + ret i128 %sub 
+} + declare i8 @llvm.abs.i8(i8, i1) declare i16 @llvm.abs.i16(i16, i1) declare i32 @llvm.abs.i32(i32, i1) @@ -1737,4 +2129,3 @@ declare i32 @llvm.umin.i32(i32, i32) declare i64 @llvm.umin.i64(i64, i64) ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK: {{.*}} -; NOZBB: {{.*}} diff --git a/llvm/test/CodeGen/X86/abds.ll b/llvm/test/CodeGen/X86/abds.ll index 9c4c059a3b9bf..4c524c28b160a 100644 --- a/llvm/test/CodeGen/X86/abds.ll +++ b/llvm/test/CodeGen/X86/abds.ll @@ -1154,6 +1154,211 @@ define i32 @abd_sub_i32(i32 %a, i32 %b) nounwind { ret i32 %abs } +; +; sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abds(a,b) +; + +define i8 @abd_select_i8(i8 %a, i8 %b) nounwind { +; X86-LABEL: abd_select_i8: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: subb %dl, %al +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i8: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: cmpb %sil, %al +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: cmovll %edi, %ecx +; X64-NEXT: cmovll %esi, %eax +; X64-NEXT: subb %cl, %al +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq + %cmp = icmp slt i8 %a, %b + %ab = select i1 %cmp, i8 %a, i8 %b + %ba = select i1 %cmp, i8 %b, i8 %a + %sub = sub i8 %ba, %ab + ret i8 %sub +} + +define i16 @abd_select_i16(i16 %a, i16 %b) nounwind { +; X86-LABEL: abd_select_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: cmovlel %eax, %edx +; X86-NEXT: cmovlel %ecx, %eax +; X86-NEXT: subl %edx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i16: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: cmpw %si, %ax +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: cmovlel %edi, %ecx +; X64-NEXT: cmovlel %esi, %eax +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %cmp = icmp sle i16 %a, %b + %ab = select i1 %cmp, i16 %a, i16 %b + %ba = select i1 %cmp, i16 %b, i16 %a + %sub = sub i16 %ba, %ab + ret i16 %sub +} + +define i32 @abd_select_i32(i32 %a, i32 %b) nounwind { +; X86-LABEL: abd_select_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %ecx, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cmovgl %edx, %eax +; X86-NEXT: cmovgl %ecx, %edx +; X86-NEXT: subl %edx, %eax +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i32: +; X64: # %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmovgl %edi, %eax +; X64-NEXT: cmovgl %esi, %edi +; X64-NEXT: subl %edi, %eax +; X64-NEXT: retq + %cmp = icmp sgt i32 %a, %b + %ab = select i1 %cmp, i32 %a, i32 %b + %ba = select i1 %cmp, i32 %b, i32 %a + %sub = sub i32 %ab, %ba + ret i32 %sub +} + +define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { +; X86-LABEL: abd_select_i64: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %esi, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: movl %edi, %edx +; 
X86-NEXT: cmovgel %ecx, %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: cmovgel %ebx, %eax +; X86-NEXT: cmovgel %edi, %ecx +; X86-NEXT: cmovgel %esi, %ebx +; X86-NEXT: subl %ebx, %eax +; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i64: +; X64: # %bb.0: +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: cmovgeq %rdi, %rax +; X64-NEXT: cmovgeq %rsi, %rdi +; X64-NEXT: subq %rdi, %rax +; X64-NEXT: retq + %cmp = icmp sge i64 %a, %b + %ab = select i1 %cmp, i64 %a, i64 %b + %ba = select i1 %cmp, i64 %b, i64 %a + %sub = sub i64 %ab, %ba + ret i64 %sub +} + +define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { +; X86-LABEL: abd_select_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sbbl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: cmovll %edi, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: cmovll %ebx, %edi +; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: cmovll %ecx, %ebx +; X86-NEXT: cmovll %ebp, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: cmovll %esi, %ebp +; X86-NEXT: cmovll %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmovll %edx, %eax +; X86-NEXT: cmovll {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: sbbl %ebp, %esi +; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: sbbl (%esp), %edi # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: addl $4, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; +; X64-LABEL: abd_select_i128: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: cmpq %rdx, %rdi +; X64-NEXT: movq %rsi, %rdi +; X64-NEXT: sbbq %rcx, %rdi +; X64-NEXT: movq %rcx, %rdi +; X64-NEXT: cmovlq %rsi, %rdi +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: cmovlq %rax, %r8 +; X64-NEXT: cmovlq %rcx, %rsi +; X64-NEXT: cmovlq %rdx, %rax +; X64-NEXT: subq %r8, %rax +; X64-NEXT: sbbq %rdi, %rsi +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: retq + %cmp = icmp slt i128 %a, %b + %ab = select i1 %cmp, i128 %a, i128 %b + %ba = select i1 %cmp, i128 %b, i128 %a + %sub = sub i128 %ba, %ab + ret i128 %sub +} declare i8 @llvm.abs.i8(i8, i1) declare i16 @llvm.abs.i16(i16, i1) diff --git a/llvm/test/CodeGen/X86/abdu.ll b/llvm/test/CodeGen/X86/abdu.ll index 335fa8c156f8e..fe9006a8aec23 100644 --- a/llvm/test/CodeGen/X86/abdu.ll +++ b/llvm/test/CodeGen/X86/abdu.ll @@ -768,6 +768,212 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ret i128 %sel } +; +; sub(select(icmp(a,b),a,b),select(icmp(a,b),b,a)) -> abdu(a,b) +; + +define i8 @abd_select_i8(i8 %a, i8 %b) nounwind { +; X86-LABEL: abd_select_i8: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: movl 
%ecx, %edx +; X86-NEXT: cmovbl %eax, %edx +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: subb %dl, %al +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i8: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: cmpb %sil, %al +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: cmovbl %edi, %ecx +; X64-NEXT: cmovbl %esi, %eax +; X64-NEXT: subb %cl, %al +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq + %cmp = icmp ult i8 %a, %b + %ab = select i1 %cmp, i8 %a, i8 %b + %ba = select i1 %cmp, i8 %b, i8 %a + %sub = sub i8 %ba, %ab + ret i8 %sub +} + +define i16 @abd_select_i16(i16 %a, i16 %b) nounwind { +; X86-LABEL: abd_select_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: cmovbel %eax, %edx +; X86-NEXT: cmovbel %ecx, %eax +; X86-NEXT: subl %edx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i16: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: cmpw %si, %ax +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: cmovbel %edi, %ecx +; X64-NEXT: cmovbel %esi, %eax +; X64-NEXT: subl %ecx, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %cmp = icmp ule i16 %a, %b + %ab = select i1 %cmp, i16 %a, i16 %b + %ba = select i1 %cmp, i16 %b, i16 %a + %sub = sub i16 %ba, %ab + ret i16 %sub +} + +define i32 @abd_select_i32(i32 %a, i32 %b) nounwind { +; X86-LABEL: abd_select_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %ecx, %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: cmoval %edx, %eax +; X86-NEXT: cmoval %ecx, %edx +; X86-NEXT: subl %edx, %eax +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i32: +; X64: # %bb.0: +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmoval %edi, %eax +; X64-NEXT: cmoval %esi, %edi +; X64-NEXT: subl %edi, %eax +; X64-NEXT: retq + %cmp = icmp ugt i32 %a, %b + %ab = select i1 %cmp, i32 %a, i32 %b + %ba = select i1 %cmp, i32 %b, i32 %a + %sub = sub i32 %ab, %ba + ret i32 %sub +} + +define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { +; X86-LABEL: abd_select_i64: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %esi, %ebx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: movl %edi, %edx +; X86-NEXT: cmovael %ecx, %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: cmovael %ebx, %eax +; X86-NEXT: cmovael %edi, %ecx +; X86-NEXT: cmovael %esi, %ebx +; X86-NEXT: subl %ebx, %eax +; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl +; +; X64-LABEL: abd_select_i64: +; X64: # %bb.0: +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: cmovaeq %rdi, %rax +; X64-NEXT: cmovaeq %rsi, %rdi +; X64-NEXT: subq %rdi, %rax +; X64-NEXT: retq + %cmp = icmp uge i64 %a, %b + %ab = select i1 %cmp, i64 %a, i64 %b + %ba = select i1 %cmp, i64 %b, i64 %a + %sub = sub i64 %ab, %ba + ret i64 %sub +} + +define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { +; X86-LABEL: abd_select_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax +; X86-NEXT: 
movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sbbl %ebp, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: cmovbl %edi, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: cmovbl %ebx, %edi +; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: cmovbl %ecx, %ebx +; X86-NEXT: cmovbl %ebp, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: cmovbl %esi, %ebp +; X86-NEXT: cmovbl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmovbl %edx, %eax +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: subl %eax, %edx +; X86-NEXT: sbbl %ebp, %esi +; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: sbbl (%esp), %edi # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: addl $4, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; +; X64-LABEL: abd_select_i128: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: cmpq %rdx, %rdi +; X64-NEXT: movq %rsi, %rdi +; X64-NEXT: sbbq %rcx, %rdi +; X64-NEXT: movq %rcx, %rdi +; X64-NEXT: cmovbq %rsi, %rdi +; X64-NEXT: movq %rdx, %r8 +; X64-NEXT: cmovbq %rax, %r8 +; X64-NEXT: cmovbq %rcx, %rsi +; X64-NEXT: cmovbq %rdx, %rax +; X64-NEXT: subq %r8, %rax +; X64-NEXT: sbbq %rdi, %rsi +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: retq + %cmp = icmp ult i128 %a, %b + %ab = select i1 %cmp, i128 %a, i128 %b + %ba = select i1 %cmp, i128 %b, i128 %a + %sub = sub i128 %ba, %ab + ret i128 %sub +} + declare i8 @llvm.abs.i8(i8, i1) declare i16 @llvm.abs.i16(i16, i1) declare i32 @llvm.abs.i32(i32, i1) From ee61a4db3c6d21fcbdccd74606e3c050052b8e7c Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 11 Sep 2024 17:19:11 +0400 Subject: [PATCH 111/114] AMDGPU: Add tests for minimumnum/maximumnum intrinsics Vector cases are broken, so leave those for later. 
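
For reference, every scalar test in the new files follows the shape
below (a minimal sketch, not one of the checked functions; the actual
FileCheck lines are generated by update_llc_test_checks.py):

  define float @sketch_maximumnum_f32(float %x, float %y) {
    ; maximumnum returns the larger operand; if exactly one operand is
    ; a NaN, the other operand is returned. Without the nnan flag the
    ; backend first canonicalizes each input (a self-max such as
    ; v_max_f32 v0, v0, v0 on gfx9+, a multiply by 1.0 on gfx8) to
    ; quiet signaling NaNs, after which a single max of the two
    ; canonicalized values suffices.
    %result = call float @llvm.maximumnum.f32(float %x, float %y)
    ret float %result
  }
  declare float @llvm.maximumnum.f32(float, float)

The same shape is repeated for f16, bf16, f64 and for minimumnum, with
and without the nnan flag, and with fneg/fabs and inreg (SGPR operand)
variants.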
--- llvm/test/CodeGen/AMDGPU/maximumnum.ll | 1736 ++++++++++++++++++++++++ llvm/test/CodeGen/AMDGPU/minimumnum.ll | 1690 +++++++++++++++++++++++ 2 files changed, 3426 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/maximumnum.ll create mode 100644 llvm/test/CodeGen/AMDGPU/minimumnum.ll diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll new file mode 100644 index 0000000000000..506f40516c9e6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -0,0 +1,1736 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s + +define half @v_maximumnum_f16(half %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.maximumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_maximumnum_f16_nnan(half %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; 
GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan half @llvm.maximumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_maximumnum_f16_1.0(half %x) { +; GFX8-LABEL: v_maximumnum_f16_1.0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, 1.0, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_1.0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e32 v0, 1.0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_1.0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, 1.0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_1.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_1.0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, 1.0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.maximumnum.f16(half %x, half 1.0) + ret half %result +} + +define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { +; GFX8-LABEL: v_maximumnum_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_bf16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y) + ret bfloat %result +} + +define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) { +; GFX8-LABEL: v_maximumnum_bf16_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: 
v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_bf16_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v3, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_bf16_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_bf16_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_bf16_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX12-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan bfloat @llvm.maximumnum.bf16(bfloat %x, bfloat %y) + ret bfloat %result +} + +define float @v_maximumnum_f32(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.maximumnum.f32(float %x, float %y) + ret float %result +} + +define float @v_maximumnum_f32_nnan(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan float @llvm.maximumnum.f32(float %x, float %y) + ret float %result +} + +define double @v_maximumnum_f64(double %x, double %y) { +; GFX8-LABEL: v_maximumnum_f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: 
v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.maximumnum.f64(double %x, double %y) + ret double %result +} + +define double @v_maximumnum_f64_nnan(double %x, double %y) { +; GFX8-LABEL: v_maximumnum_f64_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f64_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan double @llvm.maximumnum.f64(double %x, double %y) + ret double %result +} + +define float @v_maximumnum_f32_1.0(float %x) { +; GFX8-LABEL: v_maximumnum_f32_1.0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_1.0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_1.0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, 1.0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_1.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_1.0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, 1.0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.maximumnum.f32(float %x, float 1.0) + ret float %result +} + +define float @v_maximumnum_f32_rhs_not_snan(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_rhs_not_snan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_rhs_not_snan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_rhs_not_snan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_rhs_not_snan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_rhs_not_snan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %canon.y = call float @llvm.canonicalize.f32(float %y) + %result = call float @llvm.maximumnum.f32(float %x, float %canon.y) + ret float %result +} + +define float @v_maximumnum_f32_lhs_not_snan(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_lhs_not_snan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_lhs_not_snan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_lhs_not_snan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_lhs_not_snan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_lhs_not_snan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 
v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %canon.x = call float @llvm.canonicalize.f32(float %x) + %result = call float @llvm.maximumnum.f32(float %canon.x, float %y) + ret float %result +} + +define float @v_maximumnum_f32_both_operands_not_snan(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_both_operands_not_snan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_both_operands_not_snan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_both_operands_not_snan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_both_operands_not_snan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_both_operands_not_snan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %canon.x = call float @llvm.canonicalize.f32(float %x) + %canon.y = call float @llvm.canonicalize.f32(float %y) + %result = call float @llvm.maximumnum.f32(float %canon.x, float %canon.y) + ret float %result +} + +define double @v_maximumnum_f64_1.0(double %x) { +; GFX8-LABEL: v_maximumnum_f64_1.0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], 1.0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64_1.0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64_1.0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], 1.0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f64_1.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64_1.0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], 1.0, v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.maximumnum.f64(double %x, double 1.0) + ret double %result +} + +define half @v_maximumnum_f16_s_v(half inreg %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX8-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: v_max_f16_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_s_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: v_max_num_f16_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.maximumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_maximumnum_f16_v_s(half %x, half inreg %y) { +; GFX8-LABEL: v_maximumnum_f16_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, s0, s0 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_v_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, s0, s0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: 
v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.maximumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_maximumnum_f16_s_s(half inreg %x, half inreg %y) { +; GFX8-LABEL: v_maximumnum_f16_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v0, s7, s7 +; GFX8-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX8-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v0, s7, s7 +; GFX9-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v0, s7, s7 +; GFX10-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v0, s1, s1 +; GFX11-NEXT: v_max_f16_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_s_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v0, s1, s1 +; GFX12-NEXT: v_max_num_f16_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.maximumnum.f16(half %x, half %y) + ret half %result +} + +define float @v_maximumnum_f32_s_v(float inreg %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: v_max_f32_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_s_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.maximumnum.f32(float %x, float 
%y) + ret float %result +} + +define float @v_maximumnum_f32_v_s(float %x, float inreg %y) { +; GFX8-LABEL: v_maximumnum_f32_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, s0, s0 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_v_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.maximumnum.f32(float %x, float %y) + ret float %result +} + +define float @v_maximumnum_f32_s_s(float inreg %x, float inreg %y) { +; GFX8-LABEL: v_maximumnum_f32_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, s7 +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v0, s7, s7 +; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v0, s7, s7 +; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v0, s1, s1 +; GFX11-NEXT: v_max_f32_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_s_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v0, s1, s1 +; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.maximumnum.f32(float %x, float %y) + ret float %result +} + +define double @v_maximumnum_f64_s_v(double inreg %x, double %y) { +; GFX8-LABEL: 
v_maximumnum_f64_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f64_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], s[0:1], s[0:1] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64_s_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], s[0:1], s[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.maximumnum.f64(double %x, double %y) + ret double %result +} + +define double @v_maximumnum_f64_v_s(double %x, double inreg %y) { +; GFX8-LABEL: v_maximumnum_f64_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f64_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], s[0:1], s[0:1] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64_v_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], s[0:1], s[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call 
double @llvm.maximumnum.f64(double %x, double %y) + ret double %result +} + +define double @v_maximumnum_f64_s_s(double inreg %x, double inreg %y) { +; GFX8-LABEL: v_maximumnum_f64_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] +; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f64_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], s[0:1], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64_s_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[0:1], s[2:3], s[2:3] +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], s[0:1], s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.maximumnum.f64(double %x, double %y) + ret double %result +} + +define float @v_maximumnum_f32_fabs_rhs(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, |v1| +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: 
v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call float @llvm.fabs.f32(float %y) + %result = call float @llvm.maximumnum.f32(float %x, float %fabs.y) + ret float %result +} + +define float @v_maximumnum_f32_fneg_fabs_rhs(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_fneg_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, -1.0, |v1| +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_fneg_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_fneg_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_fneg_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_fneg_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, -|v1|, -|v1| +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call float @llvm.fabs.f32(float %y) + %fneg.fabs.y = fneg float %fabs.y + %result = call float @llvm.maximumnum.f32(float %x, float %fneg.fabs.y) + ret float %result +} + +define float @v_maximumnum_f32_fabs(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_fabs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, |v1| +; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, |v0| +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_fabs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f32_e64 v0, |v0|, |v0| +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_fabs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f32_e64 v0, |v0|, |v0| +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_fabs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f32_e64 v0, |v0|, |v0| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_fabs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; 
GFX12-NEXT: v_max_num_f32_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f32_e64 v0, |v0|, |v0| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.x = call float @llvm.fabs.f32(float %x) + %fabs.y = call float @llvm.fabs.f32(float %y) + %result = call float @llvm.maximumnum.f32(float %fabs.x, float %fabs.y) + ret float %result +} + +define float @v_maximumnum_f32_fneg(float %x, float %y) { +; GFX8-LABEL: v_maximumnum_f32_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v1, -1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, -1.0, v0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f32_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX9-NEXT: v_max_f32_e64 v0, -v0, -v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f32_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX10-NEXT: v_max_f32_e64 v0, -v0, -v0 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f32_fneg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX11-NEXT: v_max_f32_e64 v0, -v0, -v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f32_fneg: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, -v1, -v1 +; GFX12-NEXT: v_max_num_f32_e64 v0, -v0, -v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fneg.x = fneg float %x + %fneg.y = fneg float %y + %result = call float @llvm.maximumnum.f32(float %fneg.x, float %fneg.y) + ret float %result +} + +define half @v_maximumnum_f16_fabs_rhs(half %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: 
s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call half @llvm.fabs.f16(half %y) + %result = call half @llvm.maximumnum.f16(half %x, half %fabs.y) + ret half %result +} + +define half @v_maximumnum_f16_fneg_fabs_rhs(half %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16_fneg_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_fneg_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_fneg_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_fneg_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_fneg_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, -|v1|, -|v1| +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call half @llvm.fabs.f16(half %y) + %fneg.fabs.y = fneg half %fabs.y + %result = call half @llvm.maximumnum.f16(half %x, half %fneg.fabs.y) + ret half %result +} + +define half @v_maximumnum_f16_fabs(half %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16_fabs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX8-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_fabs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_fabs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_fabs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 
s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_fabs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f16_e64 v0, |v0|, |v0| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.x = call half @llvm.fabs.f16(half %x) + %fabs.y = call half @llvm.fabs.f16(half %y) + %result = call half @llvm.maximumnum.f16(half %fabs.x, half %fabs.y) + ret half %result +} + +define half @v_maximumnum_f16_fneg(half %x, half %y) { +; GFX8-LABEL: v_maximumnum_f16_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX8-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f16_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX9-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f16_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX10-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f16_fneg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f16_fneg: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, -v1, -v1 +; GFX12-NEXT: v_max_num_f16_e64 v0, -v0, -v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fneg.x = fneg half %x + %fneg.y = fneg half %y + %result = call half @llvm.maximumnum.f16(half %fneg.x, half %fneg.y) + ret half %result +} + +define double @v_maximumnum_f64_fneg(double %x, double %y) { +; GFX8-LABEL: v_maximumnum_f64_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_maximumnum_f64_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_maximumnum_f64_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX10-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_maximumnum_f64_fneg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] 
+; GFX11-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_maximumnum_f64_fneg: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], -v[2:3], -v[2:3] +; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -v[0:1], -v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fneg.x = fneg double %x + %fneg.y = fneg double %y + %result = call double @llvm.maximumnum.f64(double %fneg.x, double %fneg.y) + ret double %result +} diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll new file mode 100644 index 0000000000000..a2ba770067d16 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -0,0 +1,1690 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s + +define half @v_minimumnum_f16(half %x, half %y) { +; GFX8-LABEL: v_minimumnum_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e32 v1, v1, v1 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.minimumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_minimumnum_f16_nnan(half %x, half %y) { +; GFX8-LABEL: v_minimumnum_f16_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: 
v_minimumnum_f16_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan half @llvm.minimumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_minimumnum_f16_1.0(half %x) { +; GFX8-LABEL: v_minimumnum_f16_1.0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_e32 v0, 1.0, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_1.0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f16_e32 v0, 1.0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_1.0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, 1.0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_1.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_1.0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, 1.0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.minimumnum.f16(half %x, half 1.0) + ret half %result +} + +define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { +; GFX8-LABEL: v_minimumnum_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX8-NEXT: s_movk_i32 s4, 0x8000 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_cmp_eq_u16_e32 
vcc, s4, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_bf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc +; GFX9-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_bf16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-NEXT: 
v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_bf16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; GFX12-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX12-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call bfloat 
@llvm.minimumnum.bf16(bfloat %x, bfloat %y) + ret bfloat %result +} + +define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) { +; GFX8-LABEL: v_minimumnum_bf16_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2 +; GFX8-NEXT: s_movk_i32 s4, 0x8000 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_bf16_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v3, v2 +; GFX9-NEXT: s_movk_i32 s4, 0x8000 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_bf16_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_bf16_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc_lo +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_bf16_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 +; GFX12-NEXT: v_cndmask_b32_e32 
v2, v1, v0, vcc_lo +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX12-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_lshlrev_b32 v3, 16, v2 +; GFX12-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan bfloat @llvm.minimumnum.bf16(bfloat %x, bfloat %y) + ret bfloat %result +} + +define float @v_minimumnum_f32(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.minimumnum.f32(float %x, float %y) + ret float %result +} + +define float @v_minimumnum_f32_nnan(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan float @llvm.minimumnum.f32(float %x, float %y) + ret float %result +} + +define double @v_minimumnum_f64(double %x, 
double %y) { +; GFX8-LABEL: v_minimumnum_f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.minimumnum.f64(double %x, double %y) + ret double %result +} + +define double @v_minimumnum_f64_nnan(double %x, double %y) { +; GFX8-LABEL: v_minimumnum_f64_nnan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f64_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f64_nnan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64_nnan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64_nnan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call nnan double @llvm.minimumnum.f64(double %x, double %y) + ret double %result +} + +define float @v_minimumnum_f32_1.0(float %x) { +; GFX8-LABEL: v_minimumnum_f32_1.0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_1.0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] 
+; +; GFX10-LABEL: v_minimumnum_f32_1.0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_1.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, 1.0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_1.0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, 1.0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.minimumnum.f32(float %x, float 1.0) + ret float %result +} + +define float @v_minimumnum_f32_rhs_not_snan(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_rhs_not_snan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_rhs_not_snan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_rhs_not_snan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_rhs_not_snan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_rhs_not_snan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %canon.y = call float @llvm.canonicalize.f32(float %y) + %result = call float @llvm.minimumnum.f32(float %x, float %canon.y) + ret float %result +} + +define float @v_minimumnum_f32_lhs_not_snan(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_lhs_not_snan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_lhs_not_snan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_lhs_not_snan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_lhs_not_snan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_lhs_not_snan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %canon.x = call float @llvm.canonicalize.f32(float %x) + %result = call float @llvm.minimumnum.f32(float %canon.x, float %y) + ret float %result +} + +define float @v_minimumnum_f32_both_operands_not_snan(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_both_operands_not_snan: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_both_operands_not_snan: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_both_operands_not_snan: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_both_operands_not_snan: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_both_operands_not_snan: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %canon.x = call float @llvm.canonicalize.f32(float %x) + %canon.y = call float @llvm.canonicalize.f32(float %y) + %result = call float @llvm.minimumnum.f32(float %canon.x, float %canon.y) + ret float %result +} + +define double @v_minimumnum_f64_1.0(double %x) { +; GFX8-LABEL: v_minimumnum_f64_1.0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], 1.0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f64_1.0: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: 
v_minimumnum_f64_1.0: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], 1.0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64_1.0: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64_1.0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], 1.0, v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.minimumnum.f64(double %x, double 1.0) + ret double %result +} + +define half @v_minimumnum_f16_v_s(half %x, half inreg %y) { +; GFX8-LABEL: v_minimumnum_f16_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, s0, s0 +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_v_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, s0, s0 +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.minimumnum.f16(half %x, half %y) + ret half %result +} + +define half @v_minimumnum_f16_s_s(half inreg %x, half inreg %y) { +; GFX8-LABEL: v_minimumnum_f16_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v0, s7, s7 +; GFX8-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX8-NEXT: v_min_f16_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v0, s7, s7 +; GFX9-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX9-NEXT: v_min_f16_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v0, s7, s7 +; GFX10-NEXT: v_max_f16_e64 v1, s6, s6 +; GFX10-NEXT: 
v_min_f16_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v0, s1, s1 +; GFX11-NEXT: v_max_f16_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_s_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v0, s1, s1 +; GFX12-NEXT: v_max_num_f16_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.minimumnum.f16(half %x, half %y) + ret half %result +} + +define float @v_minimumnum_f32_s_v(float inreg %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: v_max_f32_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_s_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.minimumnum.f32(float %x, float %y) + ret float %result +} + +define float @v_minimumnum_f32_v_s(float %x, float inreg %y) { +; GFX8-LABEL: v_minimumnum_f32_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_v_s: +; GFX11: ; %bb.0: 
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, s0, s0 +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_v_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.minimumnum.f32(float %x, float %y) + ret float %result +} + +define float @v_minimumnum_f32_s_s(float inreg %x, float inreg %y) { +; GFX8-LABEL: v_minimumnum_f32_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, s7 +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, s6 +; GFX8-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v0, s7, s7 +; GFX9-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX9-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v0, s7, s7 +; GFX10-NEXT: v_max_f32_e64 v1, s6, s6 +; GFX10-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v0, s1, s1 +; GFX11-NEXT: v_max_f32_e64 v1, s0, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v1, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_s_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v0, s1, s1 +; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v1, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call float @llvm.minimumnum.f32(float %x, float %y) + ret float %result +} + +define double @v_minimumnum_f64_s_v(double inreg %x, double %y) { +; GFX8-LABEL: v_minimumnum_f64_s_v: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f64_s_v: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f64_s_v: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64_s_v: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], s[0:1], s[0:1] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64_s_v: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], s[0:1], s[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.minimumnum.f64(double %x, double %y) + ret double %result +} + +define double @v_minimumnum_f64_v_s(double %x, double inreg %y) { +; GFX8-LABEL: v_minimumnum_f64_v_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f64_v_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f64_v_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64_v_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], s[0:1], s[0:1] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64_v_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], s[0:1], s[0:1] +; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.minimumnum.f64(double %x, double %y) + ret double %result +} + +define double @v_minimumnum_f64_s_s(double inreg %x, double inreg %y) { +; GFX8-LABEL: v_minimumnum_f64_s_s: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX8-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f64_s_s: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f64_s_s: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[0:1], s[16:17], s[16:17] +; GFX10-NEXT: v_max_f64 v[2:3], s[6:7], s[6:7] +; 
GFX10-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64_s_s: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3] +; GFX11-NEXT: v_max_f64 v[2:3], s[0:1], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64_s_s: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[0:1], s[2:3], s[2:3] +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], s[0:1], s[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %result = call double @llvm.minimumnum.f64(double %x, double %y) + ret double %result +} + +define float @v_minimumnum_f32_fabs_rhs(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, |v1| +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call float @llvm.fabs.f32(float %y) + %result = call float @llvm.minimumnum.f32(float %x, float %fabs.y) + ret float %result +} + +define float @v_minimumnum_f32_fneg_fabs_rhs(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_fneg_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, -1.0, |v1| +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_fneg_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_fneg_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX10-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_fneg_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| +; GFX11-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_fneg_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, -|v1|, -|v1| +; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call float @llvm.fabs.f32(float %y) + %fneg.fabs.y = fneg float %fabs.y + %result = call float @llvm.minimumnum.f32(float %x, float %fneg.fabs.y) + ret float %result +} + +define float @v_minimumnum_f32_fabs(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_fabs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e64 v1, 1.0, |v1| +; GFX8-NEXT: v_mul_f32_e64 v0, 1.0, |v0| +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_fabs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f32_e64 v0, |v0|, |v0| +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_fabs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f32_e64 v0, |v0|, |v0| +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_fabs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f32_e64 v0, |v0|, |v0| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_fabs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f32_e64 v0, |v0|, |v0| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.x = call float @llvm.fabs.f32(float %x) + %fabs.y = call float @llvm.fabs.f32(float %y) + %result = call float @llvm.minimumnum.f32(float %fabs.x, float %fabs.y) + ret float %result +} + +define float @v_minimumnum_f32_fneg(float %x, float %y) { +; GFX8-LABEL: v_minimumnum_f32_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mul_f32_e32 v1, -1.0, v1 +; GFX8-NEXT: v_mul_f32_e32 v0, -1.0, v0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f32_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX9-NEXT: v_max_f32_e64 v0, -v0, -v0 +; GFX9-NEXT: 
v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f32_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX10-NEXT: v_max_f32_e64 v0, -v0, -v0 +; GFX10-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f32_fneg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX11-NEXT: v_max_f32_e64 v0, -v0, -v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f32_fneg: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f32_e64 v1, -v1, -v1 +; GFX12-NEXT: v_max_num_f32_e64 v0, -v0, -v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fneg.x = fneg float %x + %fneg.y = fneg float %y + %result = call float @llvm.minimumnum.f32(float %fneg.x, float %fneg.y) + ret float %result +} + +define half @v_minimumnum_f16_fabs_rhs(half %x, half %y) { +; GFX8-LABEL: v_minimumnum_f16_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call half @llvm.fabs.f16(half %y) + %result = call half @llvm.minimumnum.f16(half %x, half %fabs.y) + ret half %result +} + +define half @v_minimumnum_f16_fneg_fabs_rhs(half %x, half %y) { +; GFX8-LABEL: v_minimumnum_f16_fneg_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_fneg_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
v_max_f16_e64 v1, -|v1|, -|v1| +; GFX9-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_fneg_fabs_rhs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX10-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_fneg_fabs_rhs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_fneg_fabs_rhs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, -|v1|, -|v1| +; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call half @llvm.fabs.f16(half %y) + %fneg.fabs.y = fneg half %fabs.y + %result = call half @llvm.minimumnum.f16(half %x, half %fneg.fabs.y) + ret half %result +} + +define half @v_minimumnum_f16_fabs(half %x, half %y) { +; GFX8-LABEL: v_minimumnum_f16_fabs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX8-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_fabs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX9-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_fabs: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX10-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_fabs: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX11-NEXT: v_max_f16_e64 v0, |v0|, |v0| +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_fabs: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, |v1|, |v1| +; GFX12-NEXT: v_max_num_f16_e64 v0, |v0|, |v0| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fabs.x = call half @llvm.fabs.f16(half %x) + %fabs.y = call half @llvm.fabs.f16(half %y) + %result = call half @llvm.minimumnum.f16(half %fabs.x, half %fabs.y) + ret half %result +} + +define half @v_minimumnum_f16_fneg(half %x, half %y) { +; GFX8-LABEL: v_minimumnum_f16_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX8-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; 
GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f16_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX9-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f16_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX10-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f16_fneg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f16_fneg: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f16_e64 v1, -v1, -v1 +; GFX12-NEXT: v_max_num_f16_e64 v0, -v0, -v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f16_e32 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fneg.x = fneg half %x + %fneg.y = fneg half %y + %result = call half @llvm.minimumnum.f16(half %fneg.x, half %fneg.y) + ret half %result +} + +define double @v_minimumnum_f64_fneg(double %x, double %y) { +; GFX8-LABEL: v_minimumnum_f64_fneg: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX8-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_minimumnum_f64_fneg: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX9-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_minimumnum_f64_fneg: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX10-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_minimumnum_f64_fneg: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_max_f64 v[2:3], -v[2:3], -v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: v_minimumnum_f64_fneg: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_max_num_f64_e64 v[2:3], -v[2:3], -v[2:3] +; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -v[0:1], -v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fneg.x = fneg double %x + %fneg.y = fneg double %y + %result = call double @llvm.minimumnum.f64(double %fneg.x, double %fneg.y) + ret double %result +} From 1741b9c3d75ee34550210fadb9c6156419c3c892 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 11 Sep 2024 
15:21:32 +0100 Subject: [PATCH 112/114] [LV] Generalize check lines for interleave group costs. Check the cost of all instructions in an interleave group, to prepare for follow-up changes. --- .../X86/interleaved-load-f32-stride-5.ll | 107 +++++++- .../X86/interleaved-load-f32-stride-7.ll | 150 +++++++++- .../X86/interleaved-load-f32-stride-8.ll | 175 +++++++++++- .../X86/interleaved-load-f64-stride-2.ll | 34 ++- .../X86/interleaved-load-f64-stride-3.ll | 56 +++- .../X86/interleaved-load-f64-stride-4.ll | 78 +++++- .../X86/interleaved-load-f64-stride-5.ll | 90 +++++- .../X86/interleaved-load-f64-stride-6.ll | 111 +++++++- .../X86/interleaved-load-f64-stride-7.ll | 132 ++++++++- .../X86/interleaved-load-f64-stride-8.ll | 146 +++++++++- .../X86/interleaved-load-i16-stride-5.ll | 151 +++++++++- .../X86/interleaved-load-i16-stride-7.ll | 223 ++++++++++++++- .../X86/interleaved-load-i16-stride-8.ll | 259 +++++++++++++++++- ...erleaved-load-i32-stride-4-indices-01uu.ll | 34 ++- .../X86/interleaved-load-i32-stride-5.ll | 107 +++++++- .../X86/interleaved-load-i32-stride-7.ll | 150 +++++++++- .../X86/interleaved-load-i32-stride-8.ll | 175 +++++++++++- .../X86/interleaved-load-i64-stride-2.ll | 34 ++- .../X86/interleaved-load-i64-stride-3.ll | 56 +++- .../X86/interleaved-load-i64-stride-4.ll | 78 +++++- .../X86/interleaved-load-i64-stride-5.ll | 90 +++++- .../X86/interleaved-load-i64-stride-6.ll | 111 +++++++- .../X86/interleaved-load-i64-stride-7.ll | 132 ++++++++- .../X86/interleaved-load-i64-stride-8.ll | 146 +++++++++- .../X86/interleaved-load-i8-stride-5.ll | 151 +++++++++- .../X86/interleaved-load-i8-stride-6.ll | 187 ++++++++++++- .../X86/interleaved-load-i8-stride-7.ll | 223 ++++++++++++++- .../X86/interleaved-load-i8-stride-8.ll | 259 +++++++++++++++++- .../X86/interleaved-store-f32-stride-8.ll | 4 + .../X86/interleaved-store-f64-stride-8.ll | 147 +++++++++- .../X86/interleaved-store-i64-stride-8.ll | 147 +++++++++- .../X86/masked-interleaved-store-i16.ll | 10 + 32 files changed, 3923 insertions(+), 30 deletions(-) diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll index de178cdf19308..29dce5f21173a 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load float, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load float, ptr %in." 
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -14,31 +14,136 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 ; SSE2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 ; SSE2: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 ; SSE2: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 
For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX1: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX1: LV: Found an estimated cost of 75 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX1: LV: Found an estimated cost of 150 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 1 
for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX2: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX2: LV: Found an estimated cost of 75 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX2: LV: Found an estimated cost of 150 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; 
AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX512: LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX512: LV: Found an estimated cost of 18 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX512: LV: Found an estimated cost of 35 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX512: LV: Found an estimated cost of 145 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 400 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load float, ptr %in4, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll index 1f54b7485aa8f..0e7b1c58e587c 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py 
UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load float, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load float, ptr %in." ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -14,31 +14,179 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 ; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 ; SSE2: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For 
instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 ; SSE2: LV: Found an estimated cost of 98 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX1: LV: Found an estimated cost of 23 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX1: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 
For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX1: LV: Found an estimated cost of 105 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX1: LV: Found an estimated cost of 210 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost 
of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 105 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 210 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; 
AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 35 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 70 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 16 
For instruction: %v6 = load float, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll index d53dca05155b7..8830aff579c32 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load float, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load float, ptr %in." ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -14,30 +14,203 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 +; SSE2: LV: Found an estimated cost of 1 for VF 1 
For instruction: %v5 = load float, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4
 ; SSE2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load float, ptr %in7, align 4
 ; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4
 ; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load float, ptr %in7, align 4
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX1: LV: Found an estimated cost of 120 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX1: LV: Found an estimated cost of 240 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX2: LV: Found an estimated cost of 240 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX512: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX512: LV: Found an estimated cost of 14 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX512: LV: Found an estimated cost of 40 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load float, ptr %in7, align 4
 ; AVX512: LV: Found an estimated cost of 92 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4
+; AVX512: LV: Found an estimated cost of 320 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load float, ptr %in7, align 4
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
index 1575f92465d52..cfd3d7841caa2 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,35 +14,67 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
 ; SSE2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
 ; SSE2: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
 ; SSE2: LV: Found an estimated cost of 24 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
 ; SSE2: LV: Found an estimated cost of 48 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX1: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX1: LV: Found an estimated cost of 14 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX1: LV: Found an estimated cost of 28 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX1: LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX1: LV: Found an estimated cost of 112 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX2: LV: Found an estimated cost of 48 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512: LV: Found an estimated cost of 5 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512: LV: Found an estimated cost of 80 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
 ; AVX512: LV: Found an estimated cost of 160 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
index 89175a65990f6..5ec5b51731385 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,32 +14,86 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
 ; SSE2: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
 ; SSE2: LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
 ; SSE2: LV: Found an estimated cost of 36 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX1: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX1: LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX1: LV: Found an estimated cost of 42 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX1: LV: Found an estimated cost of 84 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512: LV: Found an estimated cost of 120 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
 ; AVX512: LV: Found an estimated cost of 240 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load double, ptr %in2, align 8
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
index 8db9fd364133e..450743df72325 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,31 +14,107 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
 ; SSE2: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
 ; SSE2: LV: Found an estimated cost of 24 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
 ; SSE2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 28 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 56 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 112 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 28 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 22 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 80 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 160 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll
index 25c49e3b8a811..5e5c718dba97d 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,28 +14,116 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
 ; SSE2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
 ; SSE2: LV: Found an estimated cost of 30 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX1: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX1: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX2: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX2: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 35 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 100 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 200 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
index 42c980b6d3985..62541fa2368c6 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,28 +14,137 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
 ; SSE2: LV: Found an estimated cost of 18 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
 ; SSE2: LV: Found an estimated cost of 36 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX1: LV: Found an estimated cost of 18 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX1: LV: Found an estimated cost of 42 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX1: LV: Found an estimated cost of 84 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX2: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX2: LV: Found an estimated cost of 24 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 51 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 240 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll
index 68afa6d17f02f..cfed8554b978b 100644
---
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,28 +14,158 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
 ; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
 ; SSE2: LV: Found an estimated cost of 42 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX1: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX1: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX1: LV: Found an estimated cost of 98 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX2: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX2: LV: Found an estimated cost of 98 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX512: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX512: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX512: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8
 ; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load double, ptr %in6, align 8
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll
index 7894912c88fab..07939b914d022 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load double, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load double, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,27 +14,171 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
 ; SSE2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8
 ; SSE2: LV: Found an estimated cost of 48 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
 ; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8
 ; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8
 ; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
 ; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8
 ; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8
 ; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8
 ; AVX512: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8
 ; AVX512: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8
 ; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8
 ; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load double, ptr %in7, align 8
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
index d8eaa0aad61d5..964a9b660942e 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i16, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -15,44 +15,193 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
 ; SSE2: LV: Found an estimated cost of 22 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
 ; SSE2: LV: Found an estimated cost of 43 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
 ; SSE2: LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
 ; SSE2: LV: Found an estimated cost of 170 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX1: LV: Found an estimated cost of 25 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX1: LV: Found an estimated cost of 43 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX1: LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX1: LV: Found an estimated cost of 175 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX1: LV: Found an estimated cost of 350 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX2: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX2: LV: Found an estimated cost of 165 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX2: LV: Found an estimated cost of 330 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
 ;
 ; AVX512DQ-LABEL: 'test'
 ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512DQ: LV: Found an estimated cost of 25 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512DQ: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512DQ: LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512DQ: LV: Found an estimated cost of 175 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512DQ: LV: Found an estimated cost of 355 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512DQ: LV: Found an estimated cost of 710 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i16, ptr %in4, align 2
 ;
 ; AVX512BW-LABEL: 'test'
 ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512BW: LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512BW: LV: Found an estimated cost of 11 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512BW: LV: Found an estimated cost of 14 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512BW: LV: Found an estimated cost of 28 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512BW: LV: Found an estimated cost of 55 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
 ; AVX512BW: LV: Found an estimated cost of 235 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i16, ptr %in4, align 2
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll
index 9c0d102a70d1e..6653198397dd2 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i16, ptr %in."
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -15,44 +15,265 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 ; SSE2: LV: Found an estimated cost of 35 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 ; SSE2: LV: Found an estimated cost of 60 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 ; SSE2: LV: Found an estimated cost of 119 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an 
estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 ; SSE2: LV: Found an estimated cost of 238 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX1: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX1: LV: Found an estimated cost of 62 for VF 4 For instruction: %v0 = load 
i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX1: LV: Found an estimated cost of 119 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX1: LV: Found an estimated cost of 245 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX1: LV: Found an estimated cost of 490 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an 
estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX2: LV: Found an estimated cost of 231 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 
2 ; AVX2: LV: Found an estimated cost of 462 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512DQ: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512DQ: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 
for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512DQ: LV: Found an estimated cost of 121 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512DQ: LV: Found an estimated cost of 245 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512DQ: LV: Found an estimated cost of 497 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512DQ: LV: Found an estimated cost of 994 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i16, ptr %in6, align 2 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for 
VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512BW: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512BW: LV: Found an estimated cost of 15 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512BW: LV: Found an estimated cost of 19 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512BW: LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 
2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512BW: LV: Found an estimated cost of 112 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX512BW: LV: Found an estimated cost of 469 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i16, ptr %in6, align 2 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll index 7654185635d3e..b3a5cbeccc09c 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i16, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i16, ptr %in." 
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -15,44 +15,301 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2 ; SSE2: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2 ; SSE2: LV: Found an estimated cost of 68 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an 
estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2 ; SSE2: LV: Found an estimated cost of 136 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2 ; SSE2: LV: Found an estimated cost of 272 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = 
load i16, ptr %in7, align 2 ; AVX1: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX1: LV: Found an estimated cost of 68 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX1: LV: Found an estimated cost of 136 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX1: LV: Found an estimated cost of 280 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX1: LV: Found an estimated cost of 560 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX1: LV: Found an estimated cost of 0 for 
VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; 
AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX2: LV: Found an estimated cost of 128 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX2: LV: Found an estimated cost of 264 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX2: LV: Found an estimated cost of 528 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX512DQ: LV: 
Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX512DQ: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX512DQ: LV: Found an estimated cost of 68 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX512DQ: LV: Found an estimated cost of 136 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2 ; AVX512DQ: LV: Found an estimated cost of 280 for VF 16 For 
instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2
 ; AVX512DQ: LV: Found an estimated cost of 568 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2
 ; AVX512DQ: LV: Found an estimated cost of 1136 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v7 = load i16, ptr %in7, align 2
 ;
 ; AVX512BW-LABEL: 'test'
 ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2
 ; AVX512BW: LV: Found an estimated cost of 17 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2
 ; AVX512BW: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2
 ; AVX512BW: LV: Found an estimated cost of 22 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2
 ; AVX512BW: LV: Found an estimated cost of 64 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2
 ; AVX512BW: LV: Found an estimated cost of 148 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2
 ; AVX512BW: LV: Found an estimated cost of 616 for VF 64 For instruction: %v0 = load i16, ptr %in0, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i16, ptr %in1, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i16, ptr %in2, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i16, ptr %in3, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i16, ptr %in4, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i16, ptr %in5, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i16, ptr %in6, align 2
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v7 = load i16, ptr %in7, align 2
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
index 86590c0fa6a9c..c0ea210385dfd 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i32, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,35 +14,67 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
 ; SSE2: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
 ; SSE2: LV: Found an estimated cost of 30 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
 ; SSE2: LV: Found an estimated cost of 60 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
 ; SSE2: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX1: LV: Found an estimated cost of 10 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX1: LV: Found an estimated cost of 20 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX1: LV: Found an estimated cost of 42 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX1: LV: Found an estimated cost of 84 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX1: LV: Found an estimated cost of 168 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX2: LV: Found an estimated cost of 50 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512: LV: Found an estimated cost of 5 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512: LV: Found an estimated cost of 13 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512: LV: Found an estimated cost of 50 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
 ; AVX512: LV: Found an estimated cost of 160 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll
index 63901617bb9dd..2a261ca4de4fa 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i32, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,31 +14,136 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
 ; SSE2: LV: Found an estimated cost of 35 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
 ; SSE2: LV: Found an estimated cost of 75 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
 ; SSE2: LV: Found an estimated cost of 150 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX1: LV: Found an estimated cost of 23 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX1: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX1: LV: Found an estimated cost of 95 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX1: LV: Found an estimated cost of 190 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX2: LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX2: LV: Found an estimated cost of 170 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX512: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX512: LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX512: LV: Found an estimated cost of 18 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX512: LV: Found an estimated cost of 35 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
 ; AVX512: LV: Found an estimated cost of 145 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 400 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i32, ptr %in4, align 4
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll
index 1eabac4e0b9c3..8bf3071d29fbe 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i32, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,31 +14,179 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
 ; SSE2: LV: Found an estimated cost of 49 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
 ; SSE2: LV: Found an estimated cost of 105 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
 ; SSE2: LV: Found an estimated cost of 210 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX1: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX1: LV: Found an estimated cost of 63 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX1: LV: Found an estimated cost of 133 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX1: LV: Found an estimated cost of 266 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX2: LV: Found an estimated cost of 119 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX2: LV: Found an estimated cost of 238 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX512: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX512: LV: Found an estimated cost of 35 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX512: LV: Found an estimated cost of 70 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
 ; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll
index a1bb2efd73963..3182de2df058a 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i32, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i32, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,30 +14,203 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4
 ; SSE2: LV: Found an estimated cost of 56 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i32, ptr %in7, align 4
 ; SSE2: LV: Found an estimated cost of 120 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4
 ; SSE2: LV: Found an estimated cost of 240 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i32, ptr %in7, align 4
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX1: LV: Found an estimated cost of 36 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX1: LV: Found an estimated cost of 72 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX1: LV: Found an estimated cost of 152 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX1: LV: Found an estimated cost of 304 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX2: LV: Found an estimated cost of 272 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX512: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX512: LV: Found an estimated cost of 14 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX512: LV: Found an estimated cost of 40 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i32, ptr %in7, align 4
 ; AVX512: LV: Found an estimated cost of 92 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4
+; AVX512: LV: Found an estimated cost of 320 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i32, ptr %in7, align 4
 ;
 entry:
 br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
index bd230166ebe78..27e2ee0392615 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,35 +14,67 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
 ; SSE2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
 ; SSE2: LV: Found an estimated cost of 28 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
 ; SSE2: LV: Found an estimated cost of 56 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
 ; SSE2: LV: Found an estimated cost of 112 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX1: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX1: LV: Found an estimated cost of 22 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
 ; AVX1: LV: Found an
estimated cost of 44 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX1: LV: Found an estimated cost of 88 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX1: LV: Found an estimated cost of 176 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 24 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX2: LV: Found an estimated cost of 48 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 3 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 5 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 80 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an 
estimated cost of 160 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll index e03d3c2f8b3a4..c37723257c1f7 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in." ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -14,32 +14,86 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 ; SSE2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 ; SSE2: LV: Found an estimated cost of 42 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 ; SSE2: LV: Found an estimated cost of 84 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For 
instruction: %v1 = load i64, ptr %in1, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX1: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX1: LV: Found an estimated cost of 33 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX1: LV: Found an estimated cost of 66 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX1: LV: Found an estimated cost of 132 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, 
ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 6 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 12 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 120 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 240 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i64, ptr %in2, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll index f7249666918dd..2eb7c5e93078f 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in." 
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,31 +14,107 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
 ; SSE2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
 ; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
 ; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 44 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 88 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX1: LV: Found an estimated cost of 176 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 12 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 28 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX2: LV: Found an estimated cost of 56 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 5 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 22 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 80 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
 ; AVX512: LV: Found an estimated cost of 160 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll
index 96946bd58dea1..c11da4309737d 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,28 +14,116 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
 ; SSE2: LV: Found an estimated cost of 35 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
 ; SSE2: LV: Found an estimated cost of 70 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX1: LV: Found an estimated cost of 25 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX1: LV: Found an estimated cost of 55 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX1: LV: Found an estimated cost of 110 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX2: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX2: LV: Found an estimated cost of 90 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 18 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 35 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 100 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
 ; AVX512: LV: Found an estimated cost of 200 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
index 2355c6e8b57a1..de57af6ebe398 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,28 +14,137 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
 ; SSE2: LV: Found an estimated cost of 42 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
 ; SSE2: LV: Found an estimated cost of 84 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX1: LV: Found an estimated cost of 30 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX1: LV: Found an estimated cost of 66 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX1: LV: Found an estimated cost of 132 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
 ;
 ; AVX2-LABEL: 'test'
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX2: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX2: LV: Found an estimated cost of 24 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 51 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
 ; AVX512: LV: Found an estimated cost of 240 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll
index 646003a41dcf5..949c1af1fdad3 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in."
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,28 +14,158 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @test() {
 ; SSE2-LABEL: 'test'
 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
 ; SSE2: LV: Found an estimated cost of 49 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
 ; SSE2: LV: Found an estimated cost of 98 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
 ;
 ; AVX1-LABEL: 'test'
 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
 ; AVX1: LV: Found an estimated cost of 35 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
 ; AVX1: LV: Found an estimated cost of 77 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
 ; AVX1: LV: Found an estimated cost of 154 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV:
Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8 ; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8 ; AVX2: LV: Found an estimated cost of 63 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8 ; AVX2: LV: Found an estimated cost of 126 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = 
load i64, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8 ; AVX512: LV: Found an estimated cost of 12 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8 ; AVX512: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8 ; AVX512: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: 
%v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8 ; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8 ; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i64, ptr %in6, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll index 568ab74068f94..4388ccfbdcfc4 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i64, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i64, ptr %in." 
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -14,27 +14,171 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test() {
; SSE2-LABEL: 'test'
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
; SSE2: LV: Found an estimated cost of 56 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8
; SSE2: LV: Found an estimated cost of 112 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
;
; AVX1-LABEL: 'test'
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
; AVX1: LV: Found an estimated cost of 40 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8
; AVX1: LV: Found an estimated cost of 88 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
; AVX1: LV: Found an estimated cost of 176 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
;
; AVX2-LABEL: 'test'
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8
; AVX2: LV: Found an estimated cost of 72 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
; AVX2: LV: Found an estimated cost of 144 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8
; AVX512: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8
; AVX512: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8
; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8
; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i64, ptr %in7, align 8
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll
index 6c1dd916311ab..6078fb440f9d1 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i8, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i8, ptr %in."
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -15,44 +15,193 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test() {
; SSE2-LABEL: 'test'
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
; SSE2: LV: Found an estimated cost of 38 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
; SSE2: LV: Found an estimated cost of 75 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
; SSE2: LV: Found an estimated cost of 155 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
; SSE2: LV: Found an estimated cost of 315 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
;
; AVX1-LABEL: 'test'
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX1: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX1: LV: Found an estimated cost of 83 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX1: LV: Found an estimated cost of 165 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX1: LV: Found an estimated cost of 335 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
;
; AVX2-LABEL: 'test'
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX2: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX2: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX2: LV: Found an estimated cost of 325 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512DQ: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512DQ: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512DQ: LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512DQ: LV: Found an estimated cost of 165 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512DQ: LV: Found an estimated cost of 335 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512DQ: LV: Found an estimated cost of 675 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1
;
; AVX512BW-LABEL: 'test'
; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512BW: LV: Found an estimated cost of 6 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512BW: LV: Found an estimated cost of 21 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512BW: LV: Found an estimated cost of 41 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512BW: LV: Found an estimated cost of 99 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512BW: LV: Found an estimated cost of 198 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
; AVX512BW: LV: Found an estimated cost of 395 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1
;
entry:
br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll
index 1ff3bc57a50d9..ed8bc84e771f8 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-6.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i8, ptr %in0"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i8, ptr %in."
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -15,44 +15,229 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test() {
; SSE2-LABEL: 'test'
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
; SSE2: LV: Found an estimated cost of 47 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
; SSE2: LV: Found an estimated cost of 90 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
; SSE2: LV: Found an estimated cost of 186 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
; SSE2: LV: Found an estimated cost of 378 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
;
; AVX1-LABEL: 'test'
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
; AVX1: LV: Found an estimated cost of 27 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
; AVX1: LV: Found an estimated cost of 52 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
; AVX1: LV: Found an estimated cost of 99 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for
VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX1: LV: Found an estimated cost of 198 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX1: LV: Found an estimated cost of 402 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX2: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX2: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an 
estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX2: LV: Found an estimated cost of 20 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX2: LV: Found an estimated cost of 46 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX2: LV: Found an estimated cost of 88 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512DQ: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 
2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512DQ: LV: Found an estimated cost of 17 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512DQ: LV: Found an estimated cost of 21 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512DQ: LV: Found an estimated cost of 45 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512DQ: LV: Found an estimated cost of 85 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512DQ: LV: Found an estimated cost of 810 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: 
Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i8, ptr %in5, align 1 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512BW: LV: Found an estimated cost of 7 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512BW: LV: Found an estimated cost of 25 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512BW: LV: Found an estimated cost of 49 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512BW: LV: Found an estimated cost of 119 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 
= load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512BW: LV: Found an estimated cost of 237 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX512BW: LV: Found an estimated cost of 591 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i8, ptr %in5, align 1 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll index d77bca6b7aa5a..778a4e7dfd7d9 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i8, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i8, ptr %in." 
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -15,44 +15,265 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 ; SSE2: LV: Found an estimated cost of 57 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1 ; SSE2: LV: Found an estimated cost of 110 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1 ; SSE2: LV: Found an estimated cost of 217 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For 
instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1 ; SSE2: LV: Found an estimated cost of 441 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1 ; ; AVX1-LABEL: 'test' ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX1: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX1: LV: Found an estimated cost of 62 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 
for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX1: LV: Found an estimated cost of 118 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX1: LV: Found an estimated cost of 231 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX1: LV: Found an estimated cost of 469 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1 ; ; AVX2-LABEL: 'test' ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an 
estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX2: LV: Found an estimated cost of 224 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX2: LV: Found an estimated cost of 455 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an 
estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512DQ: LV: Found an estimated cost of 34 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512DQ: LV: Found an estimated cost of 62 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 
1 ; AVX512DQ: LV: Found an estimated cost of 120 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512DQ: LV: Found an estimated cost of 233 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512DQ: LV: Found an estimated cost of 469 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512DQ: LV: Found an estimated cost of 945 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i8, ptr %in6, align 1 ; ; AVX512BW-LABEL: 'test' ; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated 
cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512BW: LV: Found an estimated cost of 8 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512BW: LV: Found an estimated cost of 29 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512BW: LV: Found an estimated cost of 57 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512BW: LV: Found an estimated cost of 138 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: 
Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512BW: LV: Found an estimated cost of 413 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX512BW: LV: Found an estimated cost of 826 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i8, ptr %in6, align 1 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll index 00ad2f68814b8..a230b5a0b1f2b 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v0 = load i8, ptr %in0" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*%v. = load i8, ptr %in." 
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -15,44 +15,301 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1 ; SSE2: LV: Found an estimated cost of 56 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1 ; SSE2: LV: Found an estimated cost of 120 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For 
instruction: %v5 = load i8, ptr %in5, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
; SSE2: LV: Found an estimated cost of 248 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
; SSE2: LV: Found an estimated cost of 504 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; SSE2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
;
; AVX1-LABEL: 'test'
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX1: LV: Found an estimated cost of 33 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX1: LV: Found an estimated cost of 66 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX1: LV: Found an estimated cost of 132 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX1: LV: Found an estimated cost of 264 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX1: LV: Found an estimated cost of 536 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX1: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i8, ptr %in7, align 1
;
; AVX2-LABEL: 'test'
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX2: LV: Found an estimated cost of 128 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX2: LV: Found an estimated cost of 256 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX2: LV: Found an estimated cost of 520 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i8, ptr %in7, align 1
;
; AVX512DQ-LABEL: 'test'
; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512DQ: LV: Found an estimated cost of 33 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512DQ: LV: Found an estimated cost of 66 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512DQ: LV: Found an estimated cost of 132 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512DQ: LV: Found an estimated cost of 264 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512DQ: LV: Found an estimated cost of 536 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512DQ: LV: Found an estimated cost of 1080 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512DQ: LV: Found an estimated cost of 0 for VF 64 For instruction: %v7 = load i8, ptr %in7, align 1
;
; AVX512BW-LABEL: 'test'
; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512BW: LV: Found an estimated cost of 9 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512BW: LV: Found an estimated cost of 33 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512BW: LV: Found an estimated cost of 65 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512BW: LV: Found an estimated cost of 158 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512BW: LV: Found an estimated cost of 472 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i8, ptr %in7, align 1
; AVX512BW: LV: Found an estimated cost of 1100 for VF 64 For instruction: %v0 = load i8, ptr %in0, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i8, ptr %in1, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i8, ptr %in2, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i8, ptr %in3, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i8, ptr %in4, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v5 = load i8, ptr %in5, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v6 = load i8, ptr %in6, align 1
+; AVX512BW: LV: Found an estimated cost of 0 for VF 64 For instruction: %v7 = load i8, ptr %in7, align 1
;
entry:
  br label %for.body
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-8.ll
index 678bb8917bd0d..2ad37bee35bed 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f32-stride-8.ll
@@ -14,12 +14,14 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test() {
; SSE2-LABEL: 'test'
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
; SSE2: LV: Found an estimated cost of 24 for VF 2 For instruction: store float %v7, ptr %out7, align 4
; SSE2: LV: Found an estimated cost of 56 for VF 4 For instruction: store float %v7, ptr %out7, align 4
; SSE2: LV: Found an estimated cost of 112 for VF 8 For instruction: store float %v7, ptr %out7, align 4
;
; AVX1-LABEL: 'test'
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: store float %v7, ptr %out7, align 4
; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction: store float %v7, ptr %out7, align 4
; AVX1: LV: Found an estimated cost of 120 for VF 8 For instruction: store float %v7, ptr %out7, align 4
@@ -27,6 +29,7 @@ define void @test() {
;
; AVX2-LABEL: 'test'
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: store float %v7, ptr %out7, align 4
; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: store float %v7, ptr %out7, align 4
; AVX2: LV: Found an estimated cost of 120 for VF 8 For instruction: store float %v7, ptr %out7, align 4
@@ -34,6 +37,7 @@ define void @test() {
;
; AVX512-LABEL: 'test'
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store float %v7, ptr %out7, align 4
; AVX512: LV: Found an estimated cost of 11 for VF 2 For instruction: store float %v7, ptr %out7, align 4
; AVX512: LV: Found an estimated cost of 23 for VF 4 For instruction: store float %v7, ptr %out7, align 4
; AVX512: LV: Found an estimated cost of 46 for VF 8 For instruction: store float %v7, ptr %out7, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
index 394d1d4de00f5..c1a66c1a41d74 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store double %v7, ptr %out7"
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store double %v., ptr %out."
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2
@@ -13,27 +13,172 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test() {
; SSE2-LABEL: 'test'
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8
; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8
+; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v0, ptr %out0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v1, ptr %out1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v2, ptr %out2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v3, ptr %out3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v4, ptr %out4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v5, ptr %out5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v6, ptr %out6, align 8
; SSE2: LV: Found an estimated cost of 24 for VF 2 For instruction: store double %v7, ptr %out7, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v0, ptr %out0, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v1, ptr %out1, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v2, ptr %out2, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v3, ptr %out3, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v4, ptr %out4, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v5, ptr %out5, align 8
+; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v6, ptr %out6, align 8
; SSE2: LV: Found an estimated cost of 48 for VF 4 For instruction: store double %v7, ptr %out7, align 8
;
; AVX1-LABEL: 'test'
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8
+; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8
; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v0, ptr %out0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v1, ptr %out1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v2, ptr %out2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v3, ptr %out3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v4, ptr %out4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v5, ptr %out5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v6, ptr %out6, align 8
; AVX1: LV: Found an estimated cost of 24 for VF 2 For instruction: store double %v7, ptr %out7, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v0, ptr %out0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v1, ptr %out1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v2, ptr %out2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v3, ptr %out3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v4, ptr %out4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v5, ptr %out5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v6, ptr %out6, align 8
; AVX1: LV: Found an estimated cost of 56 for VF 4 For instruction: store double %v7, ptr %out7, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v0, ptr %out0, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v1, ptr %out1, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v2, ptr %out2, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v3, ptr %out3, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v4, ptr %out4, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v5, ptr %out5, align 8
+; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v6, ptr %out6, align 8
; AVX1: LV: Found an estimated cost of 112 for VF 8 For instruction: store double %v7, ptr %out7, align 8
;
; AVX2-LABEL: 'test'
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8
; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8
+; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v0, ptr %out0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v1, ptr %out1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v2, ptr %out2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v3, ptr %out3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v4, ptr %out4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v5, ptr %out5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v6, ptr %out6, align 8
; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: store double %v7, ptr %out7, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v0, ptr %out0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v1, ptr %out1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v2, ptr %out2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v3, ptr %out3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v4, ptr %out4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v5, ptr %out5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v6, ptr %out6, align 8
; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: store double %v7, ptr %out7, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v0, ptr %out0, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v1, ptr %out1, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v2, ptr %out2, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v3, ptr %out3, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v4, ptr %out4, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v5, ptr %out5, align 8
+; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v6, ptr %out6, align 8
; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: store double %v7, ptr %out7, align 8
;
; AVX512-LABEL: 'test'
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v0, ptr %out0, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v1, ptr %out1, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v2, ptr %out2, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v3, ptr %out3, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v4, ptr %out4, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v5, ptr %out5, align 8
+; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v6, ptr %out6, align 8
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store double %v7, ptr %out7, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v0, ptr %out0, align 8
+; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store double %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 23 for VF 2 For instruction: store double %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 46 for VF 4 For instruction: store double %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: store double %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: store double %v7, ptr %out7, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll index 4d9aad54b0c8f..7be9577960efe 100644 --- 
a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i64 %v7, ptr %out7" +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "LV: Found an estimated cost of [0-9]+ for VF [0-9]+ For instruction:\s*store i64 %v., ptr %out." ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+sse2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=SSE2 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX1 ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -mattr=+avx2 --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=AVX2 @@ -13,27 +13,172 @@ target triple = "x86_64-unknown-linux-gnu" define void @test() { ; SSE2-LABEL: 'test' +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 ; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 +; SSE2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v0, ptr %out0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v4, ptr %out4, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v5, ptr %out5, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v6, ptr %out6, align 8 ; SSE2: LV: Found an estimated cost of 56 for VF 2 For instruction: store i64 %v7, 
ptr %out7, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v0, ptr %out0, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v1, ptr %out1, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v2, ptr %out2, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v3, ptr %out3, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v4, ptr %out4, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8 +; SSE2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8 ; SSE2: LV: Found an estimated cost of 112 for VF 4 For instruction: store i64 %v7, ptr %out7, align 8 ; ; AVX1-LABEL: 'test' +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX1: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX1: LV: Found an estimated cost of 40 for VF 2 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 0 
for VF 4 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX1: LV: Found an estimated cost of 88 for VF 4 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX1: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX1: LV: Found an estimated cost of 176 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8 ; ; AVX2-LABEL: 'test' +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v4, ptr %out4, align 8 +; 
AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX2: LV: Found an estimated cost of 40 for VF 2 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX2: LV: Found an estimated cost of 88 for VF 4 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX2: LV: Found an estimated cost of 176 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8 ; ; AVX512-LABEL: 'test' +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated 
cost of 1 for VF 1 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 2 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 23 for VF 2 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 46 for VF 4 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: store i64 %v7, ptr %out7, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll 
diff --git a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
index 741dd0746b744..13a844230f89d 100644
--- a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
+++ b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll
@@ -22,6 +22,8 @@ define void @test1(ptr noalias nocapture %points, ptr noalias nocapture readonly
 ; DISABLED_MASKED_STRIDED-LABEL: 'test1'
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 13 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2
@@ -34,6 +36,8 @@ define void @test1(ptr noalias nocapture %points, ptr noalias nocapture readonly
 ; ENABLED_MASKED_STRIDED-LABEL: 'test1'
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 12 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2
@@ -79,6 +83,8 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no
 ; DISABLED_MASKED_STRIDED-LABEL: 'test2'
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 10 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2
@@ -91,6 +97,8 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no
 ; ENABLED_MASKED_STRIDED-LABEL: 'test2'
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 10 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2
@@ -145,6 +153,7 @@ for.end:
 define void @test(ptr noalias nocapture %points, ptr noalias nocapture readonly %x, ptr noalias nocapture readnone %y) {
 ; DISABLED_MASKED_STRIDED-LABEL: 'test'
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx6, align 2
+; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %0, ptr %arrayidx6, align 2
@@ -152,6 +161,7 @@ define void @test(ptr noalias nocapture %points, ptr noalias nocapture readonly
 ;
 ; ENABLED_MASKED_STRIDED-LABEL: 'test'
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx6, align 2
+; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 2 for VF 2 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 4 for VF 4 For instruction: store i16 %0, ptr %arrayidx6, align 2
 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 8 For instruction: store i16 %0, ptr %arrayidx6, align 2

From 9a9f155df1ed13fdc690d713242b13508f6d725e Mon Sep 17 00:00:00 2001
From: Tyler Nowicki
Date: Wed, 11 Sep 2024 10:29:06 -0400
Subject: [PATCH 113/114] [Coroutines] Split buildCoroutineFrame into normalization and frame building (#108076)

* Split buildCoroutineFrame into code related to normalization and code
  related to actually building the coroutine frame.
* This will enable future specialization of buildCoroutineFrame for
  different ABIs while the normalization can be done by splitCoroutine
  prior to calling buildCoroutineFrame.
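In practice this means splitCoroutine drives the two phases back to back. A
minimal sketch of the resulting call sequence, using only names introduced in
this patch (the wrapper function below is hypothetical, and it assumes the
internal CoroInternal.h declarations; it mirrors the CoroSplit.cpp call site
shown in the diff rather than replacing it):

    // Sketch only: Shape and TTI are set up by the coroutine splitting pass.
    static void lowerCoroutineFrame(llvm::Function &F, llvm::coro::Shape &Shape,
                                    llvm::TargetTransformInfo &TTI) {
      // Phase 1: ABI-independent rewriting (swifterror elimination, splitting
      // around suspends, PHI rewriting).
      llvm::coro::normalizeCoroutine(F, Shape, TTI);
      // Phase 2: frame building proper; rematerialization decisions come from
      // the materializable-instruction callback.
      llvm::coro::buildCoroutineFrame(F, Shape, llvm::coro::defaultMaterializable);
    }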
See RFC for more info:
https://discourse.llvm.org/t/rfc-abi-objects-for-coroutines/81057
---
 llvm/lib/Transforms/Coroutines/CoroFrame.cpp  | 14 ++++++++------
 llvm/lib/Transforms/Coroutines/CoroInternal.h |  4 +++-
 llvm/lib/Transforms/Coroutines/CoroSplit.cpp  |  3 ++-
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 4b76fc7936100..8ee4bfa3b888d 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -1754,7 +1754,8 @@ static bool willLeaveFunctionImmediatelyAfter(BasicBlock *BB,
   if (depth == 0)
     return false;
   // If this is a suspend block, we're about to exit the resumption function.
-  if (isSuspendBlock(BB)) return true;
+  if (isSuspendBlock(BB))
+    return true;
 
   // Recurse into the successors.
   for (auto *Succ : successors(BB)) {
@@ -2288,9 +2289,8 @@ static void doRematerializations(
   rewriteMaterializableInstructions(AllRemats);
 }
 
-void coro::buildCoroutineFrame(
-    Function &F, Shape &Shape, TargetTransformInfo &TTI,
-    const std::function<bool(Instruction &)> &MaterializableCallback) {
+void coro::normalizeCoroutine(Function &F, coro::Shape &Shape,
+                              TargetTransformInfo &TTI) {
   // Don't eliminate swifterror in async functions that won't be split.
   if (Shape.ABI != coro::ABI::Async || !Shape.CoroSuspends.empty())
     eliminateSwiftError(F, Shape);
@@ -2337,10 +2337,12 @@ void coro::buildCoroutineFrame(
   // Transforms multi-edge PHI Nodes, so that any value feeding into a PHI will
   // never have its definition separated from the PHI by the suspend point.
   rewritePHIs(F);
+}
 
-  // Build suspend crossing info.
+void coro::buildCoroutineFrame(
+    Function &F, Shape &Shape,
+    const std::function<bool(Instruction &)> &MaterializableCallback) {
   SuspendCrossingInfo Checker(F, Shape.CoroSuspends, Shape.CoroEnds);
-
   doRematerializations(F, Checker, MaterializableCallback);
 
   const DominatorTree DT(F);
diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h
index be86f96525b67..698c21a797420 100644
--- a/llvm/lib/Transforms/Coroutines/CoroInternal.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -281,8 +281,10 @@ struct LLVM_LIBRARY_VISIBILITY Shape {
 };
 
 bool defaultMaterializable(Instruction &V);
+void normalizeCoroutine(Function &F, coro::Shape &Shape,
+                        TargetTransformInfo &TTI);
 void buildCoroutineFrame(
-    Function &F, Shape &Shape, TargetTransformInfo &TTI,
+    Function &F, Shape &Shape,
     const std::function<bool(Instruction &)> &MaterializableCallback);
 CallInst *createMustTailCall(DebugLoc Loc, Function *MustTailCallFn,
                              TargetTransformInfo &TTI,
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 494c4d632de95..dc3829d7f28eb 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -2030,7 +2030,8 @@ splitCoroutine(Function &F, SmallVectorImpl<Function *> &Clones,
   lowerAwaitSuspends(F, Shape);
 
   simplifySuspendPoints(Shape);
-  buildCoroutineFrame(F, Shape, TTI, MaterializableCallback);
+  normalizeCoroutine(F, Shape, TTI);
+  buildCoroutineFrame(F, Shape, MaterializableCallback);
   replaceFrameSizeAndAlignment(Shape);
 
   bool isNoSuspendCoroutine = Shape.CoroSuspends.empty();

From 193e81e9612746fd58c43e53438257ecdc2cb415 Mon Sep 17 00:00:00 2001
From: tnowicki
Date: Fri, 23 Aug 2024 18:24:54 -0400
Subject: [PATCH 114/114] [Coroutines] Move materialization code into its own utils

* Move materialization out of CoroFrame to MaterializationUtils.h
* Move spill related utilities that were used by materialization to
  SpillUtils
* Move isSuspendBlock (needed by materialization) to CoroInternal
---
 llvm/lib/Transforms/Coroutines/CMakeLists.txt |   1 +
 llvm/lib/Transforms/Coroutines/CoroFrame.cpp  | 296 +----------------
 llvm/lib/Transforms/Coroutines/CoroInternal.h |   1 +
 llvm/lib/Transforms/Coroutines/Coroutines.cpp |   4 +
 .../Coroutines/MaterializationUtils.cpp       | 308 ++++++++++++++++++
 .../Coroutines/MaterializationUtils.h         |  30 ++
 llvm/lib/Transforms/Coroutines/SpillUtils.cpp |  24 +-
 llvm/lib/Transforms/Coroutines/SpillUtils.h   |   2 -
 8 files changed, 360 insertions(+), 306 deletions(-)
 create mode 100644 llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
 create mode 100644 llvm/lib/Transforms/Coroutines/MaterializationUtils.h

diff --git a/llvm/lib/Transforms/Coroutines/CMakeLists.txt b/llvm/lib/Transforms/Coroutines/CMakeLists.txt
index c6508174a7f10..46ef5cd4e8cfe 100644
--- a/llvm/lib/Transforms/Coroutines/CMakeLists.txt
+++ b/llvm/lib/Transforms/Coroutines/CMakeLists.txt
@@ -9,6 +9,7 @@ add_llvm_component_library(LLVMCoroutines
   CoroSplit.cpp
   SuspendCrossingInfo.cpp
   SpillUtils.cpp
+  MaterializationUtils.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/Coroutines
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 8ee4bfa3b888d..b74c9f01cd239 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -16,10 +16,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "CoroInternal.h"
+#include "MaterializationUtils.h"
 #include "SpillUtils.h"
 #include "SuspendCrossingInfo.h"
 #include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Analysis/StackLifetime.h"
@@ -36,135 +36,12 @@
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include <algorithm>
-#include <deque>
 #include <optional>
 
 using namespace llvm;
 
 extern cl::opt<bool> UseNewDbgInfoFormat;
 
-// The "coro-suspend-crossing" flag is very noisy. There is another debug type,
-// "coro-frame", which results in leaner debug spew.
-#define DEBUG_TYPE "coro-suspend-crossing"
-
-namespace {
-
-// RematGraph is used to construct a DAG for rematerializable instructions
-// When the constructor is invoked with a candidate instruction (which is
-// materializable) it builds a DAG of materializable instructions from that
-// point.
-// Typically, for each instruction identified as re-materializable across a
-// suspend point, a RematGraph will be created.
-struct RematGraph {
-  // Each RematNode in the graph contains the edges to instructions providing
-  // operands in the current node.
-  struct RematNode {
-    Instruction *Node;
-    SmallVector<RematNode *> Operands;
-    RematNode() = default;
-    RematNode(Instruction *V) : Node(V) {}
-  };
-
-  RematNode *EntryNode;
-  using RematNodeMap =
-      SmallMapVector<Instruction *, std::unique_ptr<RematNode>, 8>;
-  RematNodeMap Remats;
-  const std::function<bool(Instruction &)> &MaterializableCallback;
-  SuspendCrossingInfo &Checker;
-
-  RematGraph(const std::function<bool(Instruction &)> &MaterializableCallback,
-             Instruction *I, SuspendCrossingInfo &Checker)
-      : MaterializableCallback(MaterializableCallback), Checker(Checker) {
-    std::unique_ptr<RematNode> FirstNode = std::make_unique<RematNode>(I);
-    EntryNode = FirstNode.get();
-    std::deque<std::unique_ptr<RematNode>> WorkList;
-    addNode(std::move(FirstNode), WorkList, cast<User>(I));
-    while (WorkList.size()) {
-      std::unique_ptr<RematNode> N = std::move(WorkList.front());
-      WorkList.pop_front();
-      addNode(std::move(N), WorkList, cast<User>(I));
-    }
-  }
-
-  void addNode(std::unique_ptr<RematNode> NUPtr,
-               std::deque<std::unique_ptr<RematNode>> &WorkList,
-               User *FirstUse) {
-    RematNode *N = NUPtr.get();
-    if (Remats.count(N->Node))
-      return;
-
-    // We haven't see this node yet - add to the list
-    Remats[N->Node] = std::move(NUPtr);
-    for (auto &Def : N->Node->operands()) {
-      Instruction *D = dyn_cast<Instruction>(Def.get());
-      if (!D || !MaterializableCallback(*D) ||
-          !Checker.isDefinitionAcrossSuspend(*D, FirstUse))
-        continue;
-
-      if (Remats.count(D)) {
-        // Already have this in the graph
-        N->Operands.push_back(Remats[D].get());
-        continue;
-      }
-
-      bool NoMatch = true;
-      for (auto &I : WorkList) {
-        if (I->Node == D) {
-          NoMatch = false;
-          N->Operands.push_back(I.get());
-          break;
-        }
-      }
-      if (NoMatch) {
-        // Create a new node
-        std::unique_ptr<RematNode> ChildNode = std::make_unique<RematNode>(D);
-        N->Operands.push_back(ChildNode.get());
-        WorkList.push_back(std::move(ChildNode));
-      }
-    }
-  }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  static std::string getBasicBlockLabel(const BasicBlock *BB) {
-    if (BB->hasName())
-      return BB->getName().str();
-
-    std::string S;
-    raw_string_ostream OS(S);
-    BB->printAsOperand(OS, false);
-    return OS.str().substr(1);
-  }
-
-  void dump() const {
-    dbgs() << "Entry (";
-    dbgs() << getBasicBlockLabel(EntryNode->Node->getParent());
-    dbgs() << ") : " << *EntryNode->Node << "\n";
-    for (auto &E : Remats) {
-      dbgs() << *(E.first) << "\n";
-      for (RematNode *U : E.second->Operands)
-        dbgs() << "  " << *U->Node << "\n";
-    }
-  }
-#endif
-};
-} // end anonymous namespace
-
-namespace llvm {
-
-template <> struct GraphTraits<RematGraph *> {
-  using NodeRef = RematGraph::RematNode *;
-  using ChildIteratorType = RematGraph::RematNode **;
-
-  static NodeRef getEntryNode(RematGraph *G) { return G->EntryNode; }
-  static ChildIteratorType child_begin(NodeRef N) {
-    return N->Operands.begin();
-  }
-  static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); }
-};
-
-} // end namespace llvm
-
-#undef DEBUG_TYPE // "coro-suspend-crossing"
 #define DEBUG_TYPE "coro-frame"
 
 namespace {
@@ -268,15 +145,6 @@ static void dumpSpills(StringRef Title, const coro::SpillInfo &Spills) {
       I->dump();
   }
 }
-static void dumpRemats(
-    StringRef Title,
-    const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> &RM) {
-  dbgs() << "------------- " << Title << "--------------\n";
-  for (const auto &E : RM) {
-    E.second->dump();
-    dbgs() << "--\n";
-  }
-}
 
 static void dumpAllocas(const SmallVectorImpl<AllocaInfo> &Allocas) {
   dbgs() << "------------- Allocas --------------\n";
@@ -1634,93 +1502,6 @@ static void rewritePHIs(Function &F) {
     rewritePHIs(*BB);
 }
 
-/// Default materializable callback
-// Check for instructions that we can recreate on resume as opposed to spill
-// the result into a coroutine frame.
-bool coro::defaultMaterializable(Instruction &V) {
-  return (isa<CastInst>(&V) || isa<GetElementPtrInst>(&V) ||
-          isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<SelectInst>(&V));
-}
-
-// For each instruction identified as materializable across the suspend point,
-// and its associated DAG of other rematerializable instructions,
-// recreate the DAG of instructions after the suspend point.
-static void rewriteMaterializableInstructions(
-    const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8>
-        &AllRemats) {
-  // This has to be done in 2 phases
-  // Do the remats and record the required defs to be replaced in the
-  // original use instructions
-  // Once all the remats are complete, replace the uses in the final
-  // instructions with the new defs
-  typedef struct {
-    Instruction *Use;
-    Instruction *Def;
-    Instruction *Remat;
-  } ProcessNode;
-
-  SmallVector<ProcessNode> FinalInstructionsToProcess;
-
-  for (const auto &E : AllRemats) {
-    Instruction *Use = E.first;
-    Instruction *CurrentMaterialization = nullptr;
-    RematGraph *RG = E.second.get();
-    ReversePostOrderTraversal<RematGraph *> RPOT(RG);
-    SmallVector<Instruction *> InstructionsToProcess;
-
-    // If the target use is actually a suspend instruction then we have to
-    // insert the remats into the end of the predecessor (there should only be
-    // one). This is so that suspend blocks always have the suspend instruction
-    // as the first instruction.
-    auto InsertPoint = &*Use->getParent()->getFirstInsertionPt();
-    if (isa<CoroSuspendInst>(Use)) {
-      BasicBlock *SuspendPredecessorBlock =
-          Use->getParent()->getSinglePredecessor();
-      assert(SuspendPredecessorBlock && "malformed coro suspend instruction");
-      InsertPoint = SuspendPredecessorBlock->getTerminator();
-    }
-
-    // Note: skip the first instruction as this is the actual use that we're
-    // rematerializing everything for.
-    auto I = RPOT.begin();
-    ++I;
-    for (; I != RPOT.end(); ++I) {
-      Instruction *D = (*I)->Node;
-      CurrentMaterialization = D->clone();
-      CurrentMaterialization->setName(D->getName());
-      CurrentMaterialization->insertBefore(InsertPoint);
-      InsertPoint = CurrentMaterialization;
-
-      // Replace all uses of Def in the instructions being added as part of this
-      // rematerialization group
-      for (auto &I : InstructionsToProcess)
-        I->replaceUsesOfWith(D, CurrentMaterialization);
-
-      // Don't replace the final use at this point as this can cause problems
-      // for other materializations. Instead, for any final use that uses a
-      // define that's being rematerialized, record the replace values
-      for (unsigned i = 0, E = Use->getNumOperands(); i != E; ++i)
-        if (Use->getOperand(i) == D) // Is this operand pointing to oldval?
-          FinalInstructionsToProcess.push_back(
-              {Use, D, CurrentMaterialization});
-
-      InstructionsToProcess.push_back(CurrentMaterialization);
-    }
-  }
-
-  // Finally, replace the uses with the defines that we've just rematerialized
-  for (auto &R : FinalInstructionsToProcess) {
-    if (auto *PN = dyn_cast<PHINode>(R.Use)) {
-      assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming "
-                                                "values in the PHINode");
-      PN->replaceAllUsesWith(R.Remat);
-      PN->eraseFromParent();
-      continue;
-    }
-    R.Use->replaceUsesOfWith(R.Def, R.Remat);
-  }
-}
-
 // Splits the block at a particular instruction unless it is the first
 // instruction in the block with a single predecessor.
 static BasicBlock *splitBlockIfNotFirst(Instruction *I, const Twine &Name) {
@@ -1741,10 +1522,6 @@ static void splitAround(Instruction *I, const Twine &Name) {
   splitBlockIfNotFirst(I->getNextNode(), "After" + Name);
 }
 
-static bool isSuspendBlock(BasicBlock *BB) {
-  return isa<AnyCoroSuspendInst>(BB->front());
-}
-
 /// After we split the coroutine, will the given basic block be along
 /// an obvious exit path for the resumption function?
 static bool willLeaveFunctionImmediatelyAfter(BasicBlock *BB,
@@ -1754,7 +1531,7 @@ static bool willLeaveFunctionImmediatelyAfter(BasicBlock *BB,
   if (depth == 0)
     return false;
   // If this is a suspend block, we're about to exit the resumption function.
-  if (isSuspendBlock(BB))
+  if (coro::isSuspendBlock(BB))
     return true;
 
   // Recurse into the successors.
@@ -1995,7 +1772,8 @@ static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape,
   DomSet.insert(&F.getEntryBlock());
   for (auto *CSI : Shape.CoroSuspends) {
     BasicBlock *SuspendBlock = CSI->getParent();
-    assert(isSuspendBlock(SuspendBlock) && SuspendBlock->getSingleSuccessor() &&
+    assert(coro::isSuspendBlock(SuspendBlock) &&
+           SuspendBlock->getSingleSuccessor() &&
            "should have split coro.suspend into its own block");
     DomSet.insert(SuspendBlock->getSingleSuccessor());
   }
@@ -2227,68 +2005,6 @@ void coro::salvageDebugInfo(
   }
 }
 
-static void doRematerializations(
-    Function &F, SuspendCrossingInfo &Checker,
-    const std::function<bool(Instruction &)> &MaterializableCallback) {
-  if (F.hasOptNone())
-    return;
-
-  coro::SpillInfo Spills;
-
-  // See if there are materializable instructions across suspend points
-  // We record these as the starting point to also identify materializable
-  // defs of uses in these operations
-  for (Instruction &I : instructions(F)) {
-    if (!MaterializableCallback(I))
-      continue;
-    for (User *U : I.users())
-      if (Checker.isDefinitionAcrossSuspend(I, U))
-        Spills[&I].push_back(cast<Instruction>(U));
-  }
-
-  // Process each of the identified rematerializable instructions
-  // and add predecessor instructions that can also be rematerialized.
-  // This is actually a graph of instructions since we could potentially
-  // have multiple uses of a def in the set of predecessor instructions.
-  // The approach here is to maintain a graph of instructions for each bottom
-  // level instruction - where we have a unique set of instructions (nodes)
-  // and edges between them. We then walk the graph in reverse post-dominator
-  // order to insert them past the suspend point, but ensure that ordering is
-  // correct. We also rely on CSE removing duplicate defs for remats of
-  // different instructions with a def in common (rather than maintaining more
-  // complex graphs for each suspend point)
-
-  // We can do this by adding new nodes to the list for each suspend
-  // point. Then using standard GraphTraits to give a reverse post-order
-  // traversal when we insert the nodes after the suspend
-  SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> AllRemats;
-  for (auto &E : Spills) {
-    for (Instruction *U : E.second) {
-      // Don't process a user twice (this can happen if the instruction uses
-      // more than one rematerializable def)
-      if (AllRemats.count(U))
-        continue;
-
-      // Constructor creates the whole RematGraph for the given Use
-      auto RematUPtr =
-          std::make_unique<RematGraph>(MaterializableCallback, U, Checker);
-
-      LLVM_DEBUG(dbgs() << "***** Next remat group *****\n";
-                 ReversePostOrderTraversal<RematGraph *> RPOT(RematUPtr.get());
-                 for (auto I = RPOT.begin(); I != RPOT.end();
-                      ++I) { (*I)->Node->dump(); } dbgs()
-                 << "\n";);
-
-      AllRemats[U] = std::move(RematUPtr);
-    }
-  }
-
-  // Rewrite materializable instructions to be materialized at the use
-  // point.
-  LLVM_DEBUG(dumpRemats("Materializations", AllRemats));
-  rewriteMaterializableInstructions(AllRemats);
-}
-
 void coro::normalizeCoroutine(Function &F, coro::Shape &Shape,
                               TargetTransformInfo &TTI) {
   // Don't eliminate swifterror in async functions that won't be split.
@@ -2324,8 +2040,8 @@ void coro::normalizeCoroutine(Function &F, coro::Shape &Shape,
       IRBuilder<> Builder(AsyncEnd);
       SmallVector<Value *, 8> Args(AsyncEnd->args());
       auto Arguments = ArrayRef(Args).drop_front(3);
-      auto *Call = createMustTailCall(AsyncEnd->getDebugLoc(), MustTailCallFn,
-                                      TTI, Arguments, Builder);
+      auto *Call = coro::createMustTailCall(
+          AsyncEnd->getDebugLoc(), MustTailCallFn, TTI, Arguments, Builder);
       splitAround(Call, "MustTailCall.Before.CoroEnd");
     }
   }
diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h
index 698c21a797420..891798f53b2d0 100644
--- a/llvm/lib/Transforms/Coroutines/CoroInternal.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -21,6 +21,7 @@ class CallGraph;
 
 namespace coro {
 
+bool isSuspendBlock(BasicBlock *BB);
 bool declaresAnyIntrinsic(const Module &M);
 bool declaresIntrinsics(const Module &M,
                         const std::initializer_list<StringRef>);
diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
index be257339e0ac4..cdc442bc819c3 100644
--- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -100,6 +100,10 @@ static bool isCoroutineIntrinsicName(StringRef Name) {
 }
 #endif
 
+bool coro::isSuspendBlock(BasicBlock *BB) {
+  return isa<AnyCoroSuspendInst>(BB->front());
+}
+
 bool coro::declaresAnyIntrinsic(const Module &M) {
   for (StringRef Name : CoroIntrinsics) {
     assert(isCoroutineIntrinsicName(Name) && "not a coroutine intrinsic");
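The materializable-instruction callback stays the extension point through this
refactor: whatever the callback rejects is spilled to the coroutine frame,
whatever it accepts is recomputed after the suspend. As a rough illustration
(the function name below is hypothetical; coro::defaultMaterializable is the
internal hook shown in this series), a caller could widen the default set:

    // Sketch only, assuming LLVM's internal CoroInternal.h is available.
    #include "CoroInternal.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Also recompute freeze instructions after a suspend, on top of the
    // defaults (casts, GEPs, binary ops, compares, selects).
    static bool widerMaterializable(Instruction &I) {
      return coro::defaultMaterializable(I) || isa<FreezeInst>(I);
    }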
diff --git a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
new file mode 100644
index 0000000000000..708e8734175f9
--- /dev/null
+++ b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
@@ -0,0 +1,308 @@
+//===- MaterializationUtils.cpp - Materialize insts after suspend points -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file contains classes used to materialize insts after suspend points.
+//===----------------------------------------------------------------------===//
+
+#include "MaterializationUtils.h"
+#include "SpillUtils.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include <deque>
+
+using namespace llvm;
+
+using namespace coro;
+
+// The "coro-suspend-crossing" flag is very noisy. There is another debug type,
+// "coro-frame", which results in leaner debug spew.
+#define DEBUG_TYPE "coro-suspend-crossing"
+
+namespace {
+
+// RematGraph is used to construct a DAG for rematerializable instructions
+// When the constructor is invoked with a candidate instruction (which is
+// materializable) it builds a DAG of materializable instructions from that
+// point.
+// Typically, for each instruction identified as re-materializable across a
+// suspend point, a RematGraph will be created.
+struct RematGraph {
+  // Each RematNode in the graph contains the edges to instructions providing
+  // operands in the current node.
+  struct RematNode {
+    Instruction *Node;
+    SmallVector<RematNode *> Operands;
+    RematNode() = default;
+    RematNode(Instruction *V) : Node(V) {}
+  };
+
+  RematNode *EntryNode;
+  using RematNodeMap =
+      SmallMapVector<Instruction *, std::unique_ptr<RematNode>, 8>;
+  RematNodeMap Remats;
+  const std::function<bool(Instruction &)> &MaterializableCallback;
+  SuspendCrossingInfo &Checker;
+
+  RematGraph(const std::function<bool(Instruction &)> &MaterializableCallback,
+             Instruction *I, SuspendCrossingInfo &Checker)
+      : MaterializableCallback(MaterializableCallback), Checker(Checker) {
+    std::unique_ptr<RematNode> FirstNode = std::make_unique<RematNode>(I);
+    EntryNode = FirstNode.get();
+    std::deque<std::unique_ptr<RematNode>> WorkList;
+    addNode(std::move(FirstNode), WorkList, cast<User>(I));
+    while (WorkList.size()) {
+      std::unique_ptr<RematNode> N = std::move(WorkList.front());
+      WorkList.pop_front();
+      addNode(std::move(N), WorkList, cast<User>(I));
+    }
+  }
+
+  void addNode(std::unique_ptr<RematNode> NUPtr,
+               std::deque<std::unique_ptr<RematNode>> &WorkList,
+               User *FirstUse) {
+    RematNode *N = NUPtr.get();
+    if (Remats.count(N->Node))
+      return;
+
+    // We haven't seen this node yet - add to the list
+    Remats[N->Node] = std::move(NUPtr);
+    for (auto &Def : N->Node->operands()) {
+      Instruction *D = dyn_cast<Instruction>(Def.get());
+      if (!D || !MaterializableCallback(*D) ||
+          !Checker.isDefinitionAcrossSuspend(*D, FirstUse))
+        continue;
+
+      if (Remats.count(D)) {
+        // Already have this in the graph
+        N->Operands.push_back(Remats[D].get());
+        continue;
+      }
+
+      bool NoMatch = true;
+      for (auto &I : WorkList) {
+        if (I->Node == D) {
+          NoMatch = false;
+          N->Operands.push_back(I.get());
+          break;
+        }
+      }
+      if (NoMatch) {
+        // Create a new node
+        std::unique_ptr<RematNode> ChildNode = std::make_unique<RematNode>(D);
+        N->Operands.push_back(ChildNode.get());
+        WorkList.push_back(std::move(ChildNode));
+      }
+    }
+  }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  static std::string getBasicBlockLabel(const BasicBlock *BB) {
+    if (BB->hasName())
+      return BB->getName().str();
+
+    std::string S;
+    raw_string_ostream OS(S);
+    BB->printAsOperand(OS, false);
+    return OS.str().substr(1);
+  }
+
+  void dump() const {
+    dbgs() << "Entry (";
+    dbgs() << getBasicBlockLabel(EntryNode->Node->getParent());
+    dbgs() << ") : " << *EntryNode->Node << "\n";
+    for (auto &E : Remats) {
+      dbgs() << *(E.first) << "\n";
+      for (RematNode *U : E.second->Operands)
+        dbgs() << "  " << *U->Node << "\n";
+    }
+  }
+#endif
+};
+
+} // namespace
+
+namespace llvm {
+template <> struct GraphTraits<RematGraph *> {
+  using NodeRef = RematGraph::RematNode *;
+  using ChildIteratorType = RematGraph::RematNode **;
+
+  static NodeRef getEntryNode(RematGraph *G) { return G->EntryNode; }
+  static ChildIteratorType child_begin(NodeRef N) {
+    return N->Operands.begin();
+  }
+  static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); }
+};
+
+} // end namespace llvm
+
+// For each instruction identified as materializable across the suspend point,
+// and its associated DAG of other rematerializable instructions,
+// recreate the DAG of instructions after the suspend point.
+static void rewriteMaterializableInstructions(
+    const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8>
+        &AllRemats) {
+  // This has to be done in 2 phases
+  // Do the remats and record the required defs to be replaced in the
+  // original use instructions
+  // Once all the remats are complete, replace the uses in the final
+  // instructions with the new defs
+  typedef struct {
+    Instruction *Use;
+    Instruction *Def;
+    Instruction *Remat;
+  } ProcessNode;
+
+  SmallVector<ProcessNode> FinalInstructionsToProcess;
+
+  for (const auto &E : AllRemats) {
+    Instruction *Use = E.first;
+    Instruction *CurrentMaterialization = nullptr;
+    RematGraph *RG = E.second.get();
+    ReversePostOrderTraversal<RematGraph *> RPOT(RG);
+    SmallVector<Instruction *> InstructionsToProcess;
+
+    // If the target use is actually a suspend instruction then we have to
+    // insert the remats into the end of the predecessor (there should only be
+    // one). This is so that suspend blocks always have the suspend instruction
+    // as the first instruction.
+    auto InsertPoint = &*Use->getParent()->getFirstInsertionPt();
+    if (isa<CoroSuspendInst>(Use)) {
+      BasicBlock *SuspendPredecessorBlock =
+          Use->getParent()->getSinglePredecessor();
+      assert(SuspendPredecessorBlock && "malformed coro suspend instruction");
+      InsertPoint = SuspendPredecessorBlock->getTerminator();
+    }
+
+    // Note: skip the first instruction as this is the actual use that we're
+    // rematerializing everything for.
+    auto I = RPOT.begin();
+    ++I;
+    for (; I != RPOT.end(); ++I) {
+      Instruction *D = (*I)->Node;
+      CurrentMaterialization = D->clone();
+      CurrentMaterialization->setName(D->getName());
+      CurrentMaterialization->insertBefore(InsertPoint);
+      InsertPoint = CurrentMaterialization;
+
+      // Replace all uses of Def in the instructions being added as part of this
+      // rematerialization group
+      for (auto &I : InstructionsToProcess)
+        I->replaceUsesOfWith(D, CurrentMaterialization);
+
+      // Don't replace the final use at this point as this can cause problems
+      // for other materializations. Instead, for any final use that uses a
+      // define that's being rematerialized, record the replace values
+      for (unsigned i = 0, E = Use->getNumOperands(); i != E; ++i)
+        if (Use->getOperand(i) == D) // Is this operand pointing to oldval?
+          FinalInstructionsToProcess.push_back(
+              {Use, D, CurrentMaterialization});
+
+      InstructionsToProcess.push_back(CurrentMaterialization);
+    }
+  }
+
+  // Finally, replace the uses with the defines that we've just rematerialized
+  for (auto &R : FinalInstructionsToProcess) {
+    if (auto *PN = dyn_cast<PHINode>(R.Use)) {
+      assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming "
+                                                "values in the PHINode");
+      PN->replaceAllUsesWith(R.Remat);
+      PN->eraseFromParent();
+      continue;
+    }
+    R.Use->replaceUsesOfWith(R.Def, R.Remat);
+  }
+}
+
+/// Default materializable callback
+// Check for instructions that we can recreate on resume as opposed to spill
+// the result into a coroutine frame.
+bool llvm::coro::defaultMaterializable(Instruction &V) {
+  return (isa<CastInst>(&V) || isa<GetElementPtrInst>(&V) ||
+          isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<SelectInst>(&V));
+}
+
+bool llvm::coro::isTriviallyMaterializable(Instruction &V) {
+  return defaultMaterializable(V);
+}
+
+#ifndef NDEBUG
+static void dumpRemats(
+    StringRef Title,
+    const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> &RM) {
+  dbgs() << "------------- " << Title << "--------------\n";
+  for (const auto &E : RM) {
+    E.second->dump();
+    dbgs() << "--\n";
+  }
+}
+#endif
+
+void coro::doRematerializations(
+    Function &F, SuspendCrossingInfo &Checker,
+    std::function<bool(Instruction &)> IsMaterializable) {
+  if (F.hasOptNone())
+    return;
+
+  coro::SpillInfo Spills;
+
+  // See if there are materializable instructions across suspend points
+  // We record these as the starting point to also identify materializable
+  // defs of uses in these operations
+  for (Instruction &I : instructions(F)) {
+    if (!IsMaterializable(I))
+      continue;
+    for (User *U : I.users())
+      if (Checker.isDefinitionAcrossSuspend(I, U))
+        Spills[&I].push_back(cast<Instruction>(U));
+  }
+
+  // Process each of the identified rematerializable instructions
+  // and add predecessor instructions that can also be rematerialized.
+  // This is actually a graph of instructions since we could potentially
+  // have multiple uses of a def in the set of predecessor instructions.
+  // The approach here is to maintain a graph of instructions for each bottom
+  // level instruction - where we have a unique set of instructions (nodes)
+  // and edges between them. We then walk the graph in reverse post-dominator
+  // order to insert them past the suspend point, but ensure that ordering is
+  // correct. We also rely on CSE removing duplicate defs for remats of
+  // different instructions with a def in common (rather than maintaining more
+  // complex graphs for each suspend point)
+
+  // We can do this by adding new nodes to the list for each suspend
+  // point. Then using standard GraphTraits to give a reverse post-order
+  // traversal when we insert the nodes after the suspend
+  SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> AllRemats;
+  for (auto &E : Spills) {
+    for (Instruction *U : E.second) {
+      // Don't process a user twice (this can happen if the instruction uses
+      // more than one rematerializable def)
+      if (AllRemats.count(U))
+        continue;
+
+      // Constructor creates the whole RematGraph for the given Use
+      auto RematUPtr =
+          std::make_unique<RematGraph>(IsMaterializable, U, Checker);
+
+      LLVM_DEBUG(dbgs() << "***** Next remat group *****\n";
+                 ReversePostOrderTraversal<RematGraph *> RPOT(RematUPtr.get());
+                 for (auto I = RPOT.begin(); I != RPOT.end();
+                      ++I) { (*I)->Node->dump(); } dbgs()
+                 << "\n";);
+
+      AllRemats[U] = std::move(RematUPtr);
+    }
+  }
+
+  // Rewrite materializable instructions to be materialized at the use
+  // point.
+  LLVM_DEBUG(dumpRemats("Materializations", AllRemats));
+  rewriteMaterializableInstructions(AllRemats);
+}
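Taken together, the new entry points are meant to be driven the way
buildCoroutineFrame drives them after the previous patch. A sketch of that
call pattern (the wrapper below is hypothetical and mirrors the CoroFrame.cpp
call site; F and Shape are assumed to come from the splitting pass):

    // Sketch only, assuming the internal headers from this patch.
    #include "CoroInternal.h"
    #include "MaterializationUtils.h"
    using namespace llvm;

    static void rematerializeAcrossSuspends(Function &F, coro::Shape &Shape) {
      // Compute which definitions cross a suspend, then recreate the cheap
      // ones after each suspend point instead of spilling them to the frame.
      SuspendCrossingInfo Checker(F, Shape.CoroSuspends, Shape.CoroEnds);
      coro::doRematerializations(F, Checker, coro::isTriviallyMaterializable);
    }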
diff --git a/llvm/lib/Transforms/Coroutines/MaterializationUtils.h b/llvm/lib/Transforms/Coroutines/MaterializationUtils.h
new file mode 100644
index 0000000000000..f391851c97b3b
--- /dev/null
+++ b/llvm/lib/Transforms/Coroutines/MaterializationUtils.h
@@ -0,0 +1,30 @@
+//===- MaterializationUtils.h - Utilities for doing materialization ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SuspendCrossingInfo.h"
+#include "llvm/IR/Instruction.h"
+
+#ifndef LIB_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
+#define LIB_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
+
+namespace llvm {
+
+namespace coro {
+
+// True if I is trivially rematerializable, e.g. InsertElementInst
+bool isTriviallyMaterializable(Instruction &I);
+
+// Performs rematerialization, invoked from buildCoroutineFrame.
+void doRematerializations(Function &F, SuspendCrossingInfo &Checker,
+                          std::function<bool(Instruction &)> IsMaterializable);
+
+} // namespace coro
+
+} // namespace llvm
+
+#endif // LIB_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
index d71b0a336d471..4c12e66f288db 100644
--- a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
+++ b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
@@ -23,17 +23,6 @@ namespace {
 
 typedef SmallPtrSet<BasicBlock *, 8> VisitedBlocksSet;
 
-static bool isSuspendBlock(BasicBlock *BB) {
-  return isa<AnyCoroSuspendInst>(BB->front());
-}
-
-// Check for structural coroutine intrinsics that should not be spilled into
-// the coroutine frame.
-static bool isCoroutineStructureIntrinsic(Instruction &I) {
-  return isa<CoroIdInst>(&I) || isa<CoroSaveInst>(&I) ||
-         isa<CoroSuspendInst>(&I);
-}
-
 /// Does control flow starting at the given block ever reach a suspend
 /// instruction before reaching a block in VisitedOrFreeBBs?
 static bool isSuspendReachableFrom(BasicBlock *From,
@@ -45,7 +34,7 @@ static bool isSuspendReachableFrom(BasicBlock *From,
     return false;
 
   // We assume that we'll already have split suspends into their own blocks.
-  if (isSuspendBlock(From))
+  if (coro::isSuspendBlock(From))
    return true;
 
   // Recurse on the successors.
@@ -448,6 +437,13 @@ static void collectFrameAlloca(AllocaInst *AI, const coro::Shape &Shape,
 
 } // namespace
 
+// Check for structural coroutine intrinsics that should not be spilled into
+// the coroutine frame.
+bool isCoroutineStructureIntrinsic(Instruction &I) {
+  return isa<CoroIdInst>(&I) || isa<CoroSaveInst>(&I) ||
+         isa<CoroSuspendInst>(&I);
+}
+
 void collectSpillsFromArgs(SpillInfo &Spills, Function &F,
                            const SuspendCrossingInfo &Checker) {
   // Collect the spills for arguments and other not-materializable values.
@@ -626,6 +622,6 @@ BasicBlock::iterator getSpillInsertionPt(const coro::Shape &Shape, Value *Def,
   return InsertPt;
 }
 
-} // End namespace coro.
+} // namespace coro
 
-} // End namespace llvm.
+} // namespace llvm
diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.h b/llvm/lib/Transforms/Coroutines/SpillUtils.h
index de0ff0bcd3a4f..8843b611e0842 100644
--- a/llvm/lib/Transforms/Coroutines/SpillUtils.h
+++ b/llvm/lib/Transforms/Coroutines/SpillUtils.h
@@ -29,8 +29,6 @@ struct AllocaInfo {
       MayWriteBeforeCoroBegin(MayWriteBeforeCoroBegin) {}
 };
 
-bool isSuspendBlock(BasicBlock *BB);
-
 void collectSpillsFromArgs(SpillInfo &Spills, Function &F,
                            const SuspendCrossingInfo &Checker);
 void collectSpillsAndAllocasFromInsts(